diff options
Diffstat (limited to 'arch')
977 files changed, 28039 insertions, 15518 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 21d0089117fe..2520ca5b42eb 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -931,6 +931,18 @@ config STRICT_MODULE_RWX config ARCH_WANT_RELAX_ORDER bool +config ARCH_HAS_REFCOUNT + bool + help + An architecture selects this when it has implemented refcount_t + using open coded assembly primitives that provide an optimized + refcount_t implementation, possibly at the expense of some full + refcount state checks of CONFIG_REFCOUNT_FULL=y. + + The refcount overflow check behavior, however, must be retained. + Catching overflows is the primary security concern for protecting + against bugs in reference counts. + config REFCOUNT_FULL bool "Perform full reference count validation at the expense of speed" help diff --git a/arch/alpha/defconfig b/arch/alpha/defconfig index 539e8b5a6cbd..f4ec420d7f2d 100644 --- a/arch/alpha/defconfig +++ b/arch/alpha/defconfig @@ -19,7 +19,6 @@ CONFIG_INET_AH=m CONFIG_INET_ESP=m # CONFIG_IPV6 is not set CONFIG_NETFILTER=y -CONFIG_IP_NF_QUEUE=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_FILTER=m CONFIG_VLAN_8021Q=m @@ -57,7 +56,6 @@ CONFIG_SERIAL_8250_CONSOLE=y CONFIG_RTC=y CONFIG_EXT2_FS=y CONFIG_REISERFS_FS=m -CONFIG_AUTOFS_FS=m CONFIG_ISO9660_FS=y CONFIG_MSDOS_FS=y CONFIG_VFAT_FS=y diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index d103db5af5ff..5b974ab8425c 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -3,6 +3,7 @@ generic-y += clkdev.h generic-y += exec.h generic-y += export.h +generic-y += fb.h generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h diff --git a/arch/alpha/include/asm/asm-prototypes.h b/arch/alpha/include/asm/asm-prototypes.h new file mode 100644 index 000000000000..d12c68ea340b --- /dev/null +++ b/arch/alpha/include/asm/asm-prototypes.h @@ -0,0 +1,18 @@ +#include <linux/spinlock.h> + +#include <asm/checksum.h> +#include <asm/console.h> +#include <asm/page.h> +#include <asm/string.h> +#include <asm/uaccess.h> + +#include <asm-generic/asm-prototypes.h> + +extern void __divl(void); +extern void __reml(void); +extern void __divq(void); +extern void __remq(void); +extern void __divlu(void); +extern void __remlu(void); +extern void __divqu(void); +extern void __remqu(void); diff --git a/arch/alpha/include/asm/core_marvel.h b/arch/alpha/include/asm/core_marvel.h index dad300fa14ce..8dcf9dbda618 100644 --- a/arch/alpha/include/asm/core_marvel.h +++ b/arch/alpha/include/asm/core_marvel.h @@ -312,7 +312,7 @@ struct io7 { io7_port7_csrs *csrs; struct io7_port ports[IO7_NUM_PORTS]; - spinlock_t irq_lock; + raw_spinlock_t irq_lock; }; #ifndef __EXTERN_INLINE diff --git a/arch/alpha/include/asm/fb.h b/arch/alpha/include/asm/fb.h deleted file mode 100644 index fa9bbb96b2b3..000000000000 --- a/arch/alpha/include/asm/fb.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _ASM_FB_H_ -#define _ASM_FB_H_ -#include <linux/device.h> - -/* Caching is off in the I/O space quadrant by design. */ -#define fb_pgprotect(...) do {} while (0) - -static inline int fb_is_primary_device(struct fb_info *info) -{ - return 0; -} - -#endif /* _ASM_FB_H_ */ diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h index fb01dfb760c2..05a70edd57b6 100644 --- a/arch/alpha/include/asm/futex.h +++ b/arch/alpha/include/asm/futex.h @@ -25,18 +25,10 @@ : "r" (uaddr), "r"(oparg) \ : "memory") -static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -62,17 +54,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/alpha/include/asm/spinlock.h b/arch/alpha/include/asm/spinlock.h index a40b9fc0c6c3..718ac0b64adf 100644 --- a/arch/alpha/include/asm/spinlock.h +++ b/arch/alpha/include/asm/spinlock.h @@ -16,11 +16,6 @@ #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) #define arch_spin_is_locked(x) ((x)->lock != 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - static inline int arch_spin_value_unlocked(arch_spinlock_t lock) { return lock.lock == 0; diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 02760f6e6ca4..3b26cc62dadb 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -64,20 +64,12 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 -/* - * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. - * This gives us 6 bits, which is enough until someone invents 128 bit address - * spaces. - * - * Assume these are all power of twos. - * When 0 use the default page size. - */ -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f - #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 7b285dd4fe05..c6133a045352 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -109,4 +109,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/alpha/include/uapi/asm/unistd.h b/arch/alpha/include/uapi/asm/unistd.h index a2945fea6c86..53de540e39a7 100644 --- a/arch/alpha/include/uapi/asm/unistd.h +++ b/arch/alpha/include/uapi/asm/unistd.h @@ -366,11 +366,6 @@ #define __NR_epoll_create 407 #define __NR_epoll_ctl 408 #define __NR_epoll_wait 409 -/* Feb 2007: These three sys_epoll defines shouldn't be here but culling - * them would break userspace apps ... we'll kill them off in 2010 :) */ -#define __NR_sys_epoll_create __NR_epoll_create -#define __NR_sys_epoll_ctl __NR_epoll_ctl -#define __NR_sys_epoll_wait __NR_epoll_wait #define __NR_remap_file_pages 410 #define __NR_set_tid_address 411 #define __NR_restart_syscall 412 diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c index 03ff832b1cb4..b10c316475dd 100644 --- a/arch/alpha/kernel/core_marvel.c +++ b/arch/alpha/kernel/core_marvel.c @@ -118,7 +118,7 @@ alloc_io7(unsigned int pe) io7 = alloc_bootmem(sizeof(*io7)); io7->pe = pe; - spin_lock_init(&io7->irq_lock); + raw_spin_lock_init(&io7->irq_lock); for (h = 0; h < 4; h++) { io7->ports[h].io7 = io7; diff --git a/arch/alpha/kernel/pci-noop.c b/arch/alpha/kernel/pci-noop.c index ffbdb3fb672f..676bab6e3123 100644 --- a/arch/alpha/kernel/pci-noop.c +++ b/arch/alpha/kernel/pci-noop.c @@ -42,11 +42,7 @@ alloc_pci_controller(void) struct resource * __init alloc_resource(void) { - struct resource *res; - - res = alloc_bootmem(sizeof(*res)); - - return res; + return alloc_bootmem(sizeof(struct resource)); } asmlinkage long diff --git a/arch/alpha/kernel/pci-sysfs.c b/arch/alpha/kernel/pci-sysfs.c index 92c0d460815b..cbecd527c696 100644 --- a/arch/alpha/kernel/pci-sysfs.c +++ b/arch/alpha/kernel/pci-sysfs.c @@ -38,7 +38,7 @@ static int __pci_mmap_fits(struct pci_dev *pdev, int num, unsigned long nr, start, size; int shift = sparse ? 5 : 0; - nr = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + nr = vma_pages(vma); start = vma->vm_pgoff; size = ((pci_resource_len(pdev, num) - 1) >> (PAGE_SHIFT - shift)) + 1; @@ -64,8 +64,7 @@ static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, struct vm_area_struct *vma, int sparse) { - struct pci_dev *pdev = to_pci_dev(container_of(kobj, - struct device, kobj)); + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); struct resource *res = attr->private; enum pci_mmap_state mmap_type; struct pci_bus_region bar; @@ -255,7 +254,7 @@ static int __legacy_mmap_fits(struct pci_controller *hose, { unsigned long nr, start, size; - nr = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + nr = vma_pages(vma); start = vma->vm_pgoff; size = ((res_size - 1) >> PAGE_SHIFT) + 1; diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c index 5f387ee5b5c5..8322df174bbf 100644 --- a/arch/alpha/kernel/pci.c +++ b/arch/alpha/kernel/pci.c @@ -379,11 +379,7 @@ alloc_pci_controller(void) struct resource * __init alloc_resource(void) { - struct resource *res; - - res = alloc_bootmem(sizeof(*res)); - - return res; + return alloc_bootmem(sizeof(struct resource)); } diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 491e6a604e82..249229ab4942 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -1094,8 +1094,9 @@ get_sysnames(unsigned long type, unsigned long variation, unsigned long cpu, default: /* default to variation "0" for now */ break; case ST_DEC_EB164: - if (member < ARRAY_SIZE(eb164_indices)) - *variation_name = eb164_names[eb164_indices[member]]; + if (member >= ARRAY_SIZE(eb164_indices)) + break; + *variation_name = eb164_names[eb164_indices[member]]; /* PC164 may show as EB164 variation, but with EV56 CPU, so, since no true EB164 had anything but EV5... */ if (eb164_indices[member] == 0 && cpu == EV56_CPU) diff --git a/arch/alpha/kernel/smc37c669.c b/arch/alpha/kernel/smc37c669.c index c803fc76ae4f..4dbd4e415041 100644 --- a/arch/alpha/kernel/smc37c669.c +++ b/arch/alpha/kernel/smc37c669.c @@ -2007,11 +2007,8 @@ static void __init SMC37c669_config_mode( static unsigned char __init SMC37c669_read_config( unsigned char index ) { - unsigned char data; - - wb( &SMC37c669->index_port, index ); - data = rb( &SMC37c669->data_port ); - return data; + wb(&SMC37c669->index_port, index); + return rb(&SMC37c669->data_port); } /* diff --git a/arch/alpha/kernel/sys_marvel.c b/arch/alpha/kernel/sys_marvel.c index 24e41bd7d3c9..3e533920371f 100644 --- a/arch/alpha/kernel/sys_marvel.c +++ b/arch/alpha/kernel/sys_marvel.c @@ -115,11 +115,11 @@ io7_enable_irq(struct irq_data *d) return; } - spin_lock(&io7->irq_lock); + raw_spin_lock(&io7->irq_lock); *ctl |= 1UL << 24; mb(); *ctl; - spin_unlock(&io7->irq_lock); + raw_spin_unlock(&io7->irq_lock); } static void @@ -136,11 +136,11 @@ io7_disable_irq(struct irq_data *d) return; } - spin_lock(&io7->irq_lock); + raw_spin_lock(&io7->irq_lock); *ctl &= ~(1UL << 24); mb(); *ctl; - spin_unlock(&io7->irq_lock); + raw_spin_unlock(&io7->irq_lock); } static void @@ -263,7 +263,7 @@ init_io7_irqs(struct io7 *io7, */ printk(" Interrupts reported to CPU at PE %u\n", boot_cpuid); - spin_lock(&io7->irq_lock); + raw_spin_lock(&io7->irq_lock); /* set up the error irqs */ io7_redirect_irq(io7, &io7->csrs->HLT_CTL.csr, boot_cpuid); @@ -295,7 +295,7 @@ init_io7_irqs(struct io7 *io7, for (i = 0; i < 16; ++i) init_one_io7_msi(io7, i, boot_cpuid); - spin_unlock(&io7->irq_lock); + raw_spin_unlock(&io7->irq_lock); } static void __init diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index 65bb102d985b..ddb89a18cf26 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -193,8 +193,10 @@ die_if_kernel(char * str, struct pt_regs *regs, long err, unsigned long *r9_15) static long dummy_emul(void) { return 0; } long (*alpha_fp_emul_imprecise)(struct pt_regs *regs, unsigned long writemask) = (void *)dummy_emul; +EXPORT_SYMBOL_GPL(alpha_fp_emul_imprecise); long (*alpha_fp_emul) (unsigned long pc) = (void *)dummy_emul; +EXPORT_SYMBOL_GPL(alpha_fp_emul); #else long alpha_fp_emul_imprecise(struct pt_regs *regs, unsigned long writemask); long alpha_fp_emul (unsigned long pc); diff --git a/arch/alpha/math-emu/math.c b/arch/alpha/math-emu/math.c index d17d705f6545..1c2d456da7f2 100644 --- a/arch/alpha/math-emu/math.c +++ b/arch/alpha/math-emu/math.c @@ -53,6 +53,7 @@ extern void alpha_write_fp_reg_s (unsigned long reg, unsigned long val); #ifdef MODULE MODULE_DESCRIPTION("FP Software completion module"); +MODULE_LICENSE("GPL v2"); extern long (*alpha_fp_emul_imprecise)(struct pt_regs *, unsigned long); extern long (*alpha_fp_emul) (unsigned long pc); diff --git a/arch/arc/boot/dts/axs10x_mb.dtsi b/arch/arc/boot/dts/axs10x_mb.dtsi index 0ff7e07edcd4..2367a67c5f10 100644 --- a/arch/arc/boot/dts/axs10x_mb.dtsi +++ b/arch/arc/boot/dts/axs10x_mb.dtsi @@ -101,7 +101,6 @@ mmc@0x15000 { compatible = "altr,socfpga-dw-mshc"; reg = < 0x15000 0x400 >; - num-slots = < 1 >; fifo-depth = < 16 >; card-detect-delay = < 200 >; clocks = <&apbclk>, <&mmcclk>; diff --git a/arch/arc/boot/dts/vdk_axs10x_mb.dtsi b/arch/arc/boot/dts/vdk_axs10x_mb.dtsi index 459fc656b759..48bb4b4cd234 100644 --- a/arch/arc/boot/dts/vdk_axs10x_mb.dtsi +++ b/arch/arc/boot/dts/vdk_axs10x_mb.dtsi @@ -104,7 +104,6 @@ mmc@0x15000 { compatible = "snps,dw-mshc"; reg = <0x15000 0x400>; - num-slots = <1>; fifo-depth = <1024>; card-detect-delay = <200>; clocks = <&apbclk>, <&mmcclk>; diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index 54b54da6384c..11859287c52a 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -123,6 +123,8 @@ static inline void atomic_set(atomic_t *v, int i) atomic_ops_unlock(flags); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + #endif /* diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index 11e1b1f3acda..eb887dd13e74 100644 --- a/arch/arc/include/asm/futex.h +++ b/arch/arc/include/asm/futex.h @@ -73,20 +73,11 @@ #endif -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - #ifndef CONFIG_ARC_HAS_LLSC preempt_disable(); /* to guarantee atomic r-m-w of futex op */ #endif @@ -118,30 +109,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) preempt_enable(); #endif - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/arc/include/asm/spinlock.h b/arch/arc/include/asm/spinlock.h index 233d5ffe6ec7..a325e6a36523 100644 --- a/arch/arc/include/asm/spinlock.h +++ b/arch/arc/include/asm/spinlock.h @@ -16,11 +16,6 @@ #define arch_spin_is_locked(x) ((x)->slock != __ARCH_SPIN_LOCK_UNLOCKED__) #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, !VAL); -} - #ifdef CONFIG_ARC_HAS_LLSC static inline void arch_spin_lock(arch_spinlock_t *lock) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 61a0cb15067e..f1b3f1d575d4 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -50,7 +50,7 @@ config ARM select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT) select HAVE_ARCH_TRACEHOOK select HAVE_ARM_SMCCC if CPU_V7 - select HAVE_CBPF_JIT + select HAVE_EBPF_JIT if !CPU_ENDIAN_BE32 select HAVE_CC_STACKPROTECTOR select HAVE_CONTEXT_TRACKING select HAVE_C_RECORDMCOUNT diff --git a/arch/arm/boot/compressed/efi-header.S b/arch/arm/boot/compressed/efi-header.S index a17ca8d78656..c94a88ae834d 100644 --- a/arch/arm/boot/compressed/efi-header.S +++ b/arch/arm/boot/compressed/efi-header.S @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013-2015 Linaro Ltd + * Copyright (C) 2013-2017 Linaro Ltd * Authors: Roy Franz <roy.franz@linaro.org> * Ard Biesheuvel <ard.biesheuvel@linaro.org> * @@ -8,6 +8,9 @@ * published by the Free Software Foundation. */ +#include <linux/pe.h> +#include <linux/sizes.h> + .macro __nop #ifdef CONFIG_EFI_STUB @ This is almost but not quite a NOP, since it does clobber the @@ -15,7 +18,7 @@ @ PE/COFF expects the magic string "MZ" at offset 0, while the @ ARM/Linux boot protocol expects an executable instruction @ there. - .inst 'M' | ('Z' << 8) | (0x1310 << 16) @ tstne r0, #0x4d000 + .inst MZ_MAGIC | (0x1310 << 16) @ tstne r0, #0x4d000 #else AR_CLASS( mov r0, r0 ) M_CLASS( nop.w ) @@ -34,96 +37,97 @@ @ The only 2 fields of the MSDOS header that are used are this @ PE/COFF offset, and the "MZ" bytes at offset 0x0. @ - .long pe_header - start @ Offset to the PE header. + .long pe_header - start @ Offset to the PE header. pe_header: - .ascii "PE\0\0" + .long PE_MAGIC coff_header: - .short 0x01c2 @ ARM or Thumb - .short 2 @ nr_sections - .long 0 @ TimeDateStamp - .long 0 @ PointerToSymbolTable - .long 1 @ NumberOfSymbols - .short section_table - optional_header - @ SizeOfOptionalHeader - .short 0x306 @ Characteristics. - @ IMAGE_FILE_32BIT_MACHINE | - @ IMAGE_FILE_DEBUG_STRIPPED | - @ IMAGE_FILE_EXECUTABLE_IMAGE | - @ IMAGE_FILE_LINE_NUMS_STRIPPED + .short IMAGE_FILE_MACHINE_THUMB @ Machine + .short section_count @ NumberOfSections + .long 0 @ TimeDateStamp + .long 0 @ PointerToSymbolTable + .long 0 @ NumberOfSymbols + .short section_table - optional_header @ SizeOfOptionalHeader + .short IMAGE_FILE_32BIT_MACHINE | \ + IMAGE_FILE_DEBUG_STRIPPED | \ + IMAGE_FILE_EXECUTABLE_IMAGE | \ + IMAGE_FILE_LINE_NUMS_STRIPPED @ Characteristics + +#define __pecoff_code_size (__pecoff_data_start - __efi_start) optional_header: - .short 0x10b @ PE32 format - .byte 0x02 @ MajorLinkerVersion - .byte 0x14 @ MinorLinkerVersion - .long _end - __efi_start @ SizeOfCode - .long 0 @ SizeOfInitializedData - .long 0 @ SizeOfUninitializedData - .long efi_stub_entry - start @ AddressOfEntryPoint - .long start_offset @ BaseOfCode - .long 0 @ data + .short PE_OPT_MAGIC_PE32 @ PE32 format + .byte 0x02 @ MajorLinkerVersion + .byte 0x14 @ MinorLinkerVersion + .long __pecoff_code_size @ SizeOfCode + .long __pecoff_data_size @ SizeOfInitializedData + .long 0 @ SizeOfUninitializedData + .long efi_stub_entry - start @ AddressOfEntryPoint + .long start_offset @ BaseOfCode + .long __pecoff_data_start - start @ BaseOfData extra_header_fields: - .long 0 @ ImageBase - .long 0x200 @ SectionAlignment - .long 0x200 @ FileAlignment - .short 0 @ MajorOperatingSystemVersion - .short 0 @ MinorOperatingSystemVersion - .short 0 @ MajorImageVersion - .short 0 @ MinorImageVersion - .short 0 @ MajorSubsystemVersion - .short 0 @ MinorSubsystemVersion - .long 0 @ Win32VersionValue + .long 0 @ ImageBase + .long SZ_4K @ SectionAlignment + .long SZ_512 @ FileAlignment + .short 0 @ MajorOsVersion + .short 0 @ MinorOsVersion + .short 0 @ MajorImageVersion + .short 0 @ MinorImageVersion + .short 0 @ MajorSubsystemVersion + .short 0 @ MinorSubsystemVersion + .long 0 @ Win32VersionValue - .long _end - start @ SizeOfImage - .long start_offset @ SizeOfHeaders - .long 0 @ CheckSum - .short 0xa @ Subsystem (EFI application) - .short 0 @ DllCharacteristics - .long 0 @ SizeOfStackReserve - .long 0 @ SizeOfStackCommit - .long 0 @ SizeOfHeapReserve - .long 0 @ SizeOfHeapCommit - .long 0 @ LoaderFlags - .long 0x6 @ NumberOfRvaAndSizes + .long __pecoff_end - start @ SizeOfImage + .long start_offset @ SizeOfHeaders + .long 0 @ CheckSum + .short IMAGE_SUBSYSTEM_EFI_APPLICATION @ Subsystem + .short 0 @ DllCharacteristics + .long 0 @ SizeOfStackReserve + .long 0 @ SizeOfStackCommit + .long 0 @ SizeOfHeapReserve + .long 0 @ SizeOfHeapCommit + .long 0 @ LoaderFlags + .long (section_table - .) / 8 @ NumberOfRvaAndSizes - .quad 0 @ ExportTable - .quad 0 @ ImportTable - .quad 0 @ ResourceTable - .quad 0 @ ExceptionTable - .quad 0 @ CertificationTable - .quad 0 @ BaseRelocationTable + .quad 0 @ ExportTable + .quad 0 @ ImportTable + .quad 0 @ ResourceTable + .quad 0 @ ExceptionTable + .quad 0 @ CertificationTable + .quad 0 @ BaseRelocationTable section_table: - @ - @ The EFI application loader requires a relocation section - @ because EFI applications must be relocatable. This is a - @ dummy section as far as we are concerned. - @ - .ascii ".reloc\0\0" - .long 0 @ VirtualSize - .long 0 @ VirtualAddress - .long 0 @ SizeOfRawData - .long 0 @ PointerToRawData - .long 0 @ PointerToRelocations - .long 0 @ PointerToLineNumbers - .short 0 @ NumberOfRelocations - .short 0 @ NumberOfLineNumbers - .long 0x42100040 @ Characteristics - .ascii ".text\0\0\0" - .long _end - __efi_start @ VirtualSize - .long __efi_start @ VirtualAddress - .long _edata - __efi_start @ SizeOfRawData - .long __efi_start @ PointerToRawData - .long 0 @ PointerToRelocations - .long 0 @ PointerToLineNumbers - .short 0 @ NumberOfRelocations - .short 0 @ NumberOfLineNumbers - .long 0xe0500020 @ Characteristics + .long __pecoff_code_size @ VirtualSize + .long __efi_start @ VirtualAddress + .long __pecoff_code_size @ SizeOfRawData + .long __efi_start @ PointerToRawData + .long 0 @ PointerToRelocations + .long 0 @ PointerToLineNumbers + .short 0 @ NumberOfRelocations + .short 0 @ NumberOfLineNumbers + .long IMAGE_SCN_CNT_CODE | \ + IMAGE_SCN_MEM_READ | \ + IMAGE_SCN_MEM_EXECUTE @ Characteristics + + .ascii ".data\0\0\0" + .long __pecoff_data_size @ VirtualSize + .long __pecoff_data_start - start @ VirtualAddress + .long __pecoff_data_rawsize @ SizeOfRawData + .long __pecoff_data_start - start @ PointerToRawData + .long 0 @ PointerToRelocations + .long 0 @ PointerToLineNumbers + .short 0 @ NumberOfRelocations + .short 0 @ NumberOfLineNumbers + .long IMAGE_SCN_CNT_INITIALIZED_DATA | \ + IMAGE_SCN_MEM_READ | \ + IMAGE_SCN_MEM_WRITE @ Characteristics + + .set section_count, (. - section_table) / 40 - .align 9 + .align 12 __efi_start: #endif .endm diff --git a/arch/arm/boot/compressed/vmlinux.lds.S b/arch/arm/boot/compressed/vmlinux.lds.S index 81c493156ce8..7a4c59154361 100644 --- a/arch/arm/boot/compressed/vmlinux.lds.S +++ b/arch/arm/boot/compressed/vmlinux.lds.S @@ -48,13 +48,6 @@ SECTIONS *(.rodata) *(.rodata.*) } - .data : { - /* - * The EFI stub always executes from RAM, and runs strictly before the - * decompressor, so we can make an exception for its r/w data, and keep it - */ - *(.data.efistub) - } .piggydata : { *(.piggydata) } @@ -70,6 +63,26 @@ SECTIONS /* ensure the zImage file size is always a multiple of 64 bits */ /* (without a dummy byte, ld just ignores the empty section) */ .pad : { BYTE(0); . = ALIGN(8); } + +#ifdef CONFIG_EFI_STUB + .data : ALIGN(4096) { + __pecoff_data_start = .; + /* + * The EFI stub always executes from RAM, and runs strictly before the + * decompressor, so we can make an exception for its r/w data, and keep it + */ + *(.data.efistub) + __pecoff_data_end = .; + + /* + * PE/COFF mandates a file size which is a multiple of 512 bytes if the + * section size equals or exceeds 4 KB + */ + . = ALIGN(512); + } + __pecoff_data_rawsize = . - ADDR(.data); +#endif + _edata = .; _magic_sig = ZIMAGE_MAGIC(0x016f2818); @@ -84,6 +97,9 @@ SECTIONS . = ALIGN(8); /* the stack must be 64-bit aligned */ .stack : { *(.stack) } + PROVIDE(__pecoff_data_size = ALIGN(512) - ADDR(.data)); + PROVIDE(__pecoff_end = ALIGN(512)); + .stab 0 : { *(.stab) } .stabstr 0 : { *(.stabstr) } .stab.excl 0 : { *(.stab.excl) } diff --git a/arch/arm/boot/dts/imx6q-evi.dts b/arch/arm/boot/dts/imx6q-evi.dts index 1f0f950dc11e..e0aea782c666 100644 --- a/arch/arm/boot/dts/imx6q-evi.dts +++ b/arch/arm/boot/dts/imx6q-evi.dts @@ -94,6 +94,15 @@ pinctrl-names = "default"; pinctrl-0 = <&pinctrl_ecspi1 &pinctrl_ecspi1cs>; status = "okay"; + + fpga: fpga@0 { + compatible = "altr,fpga-passive-serial"; + spi-max-frequency = <20000000>; + reg = <0>; + pinctrl-0 = <&pinctrl_fpgaspi>; + nconfig-gpios = <&gpio4 9 GPIO_ACTIVE_LOW>; + nstat-gpios = <&gpio4 11 GPIO_ACTIVE_LOW>; + }; }; &ecspi3 { @@ -319,6 +328,13 @@ >; }; + pinctrl_fpgaspi: fpgaspigrp { + fsl,pins = < + MX6QDL_PAD_KEY_ROW1__GPIO4_IO09 0x1b0b0 + MX6QDL_PAD_KEY_ROW2__GPIO4_IO11 0x1b0b0 + >; + }; + pinctrl_gpminand: gpminandgrp { fsl,pins = < MX6QDL_PAD_NANDF_CLE__NAND_CLE 0xb0b1 diff --git a/arch/arm/boot/dts/imx7ulp-pinfunc.h b/arch/arm/boot/dts/imx7ulp-pinfunc.h new file mode 100644 index 000000000000..fe511775b518 --- /dev/null +++ b/arch/arm/boot/dts/imx7ulp-pinfunc.h @@ -0,0 +1,468 @@ +/* + * Copyright 2016 Freescale Semiconductor, Inc. + * Copyright 2017 NXP + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#ifndef __DTS_IMX7ULP_PINFUNC_H +#define __DTS_IMX7ULP_PINFUNC_H + +/* + * The pin function ID is a tuple of + * <mux_conf_reg input_reg mux_mode input_val> + */ + +#define IMX7ULP_PAD_PTC0__PTC0 0x0000 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC0__TRACE_D15 0x0000 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC0__LPUART4_CTS_B 0x0000 0x0244 0x4 0x1 +#define IMX7ULP_PAD_PTC0__LPI2C4_SCL 0x0000 0x0278 0x5 0x1 +#define IMX7ULP_PAD_PTC0__TPM4_CLKIN 0x0000 0x0298 0x6 0x1 +#define IMX7ULP_PAD_PTC0__FB_AD0 0x0000 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC1__PTC1 0x0004 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC1__TRACE_D14 0x0004 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC1__LPUART4_RTS_B 0x0004 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTC1__LPI2C4_SDA 0x0004 0x027c 0x5 0x1 +#define IMX7ULP_PAD_PTC1__TPM4_CH0 0x0004 0x0280 0x6 0x1 +#define IMX7ULP_PAD_PTC1__FB_AD1 0x0004 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC2__PTC2 0x0008 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC2__TRACE_D13 0x0008 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC2__LPUART4_TX 0x0008 0x024c 0x4 0x1 +#define IMX7ULP_PAD_PTC2__LPI2C4_HREQ 0x0008 0x0274 0x5 0x1 +#define IMX7ULP_PAD_PTC2__TPM4_CH1 0x0008 0x0284 0x6 0x1 +#define IMX7ULP_PAD_PTC2__FB_AD2 0x0008 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC3__PTC3 0x000c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC3__TRACE_D12 0x000c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC3__LPUART4_RX 0x000c 0x0248 0x4 0x1 +#define IMX7ULP_PAD_PTC3__TPM4_CH2 0x000c 0x0288 0x6 0x1 +#define IMX7ULP_PAD_PTC3__FB_AD3 0x000c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC4__PTC4 0x0010 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC4__TRACE_D11 0x0010 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC4__FXIO1_D0 0x0010 0x0204 0x2 0x1 +#define IMX7ULP_PAD_PTC4__LPSPI2_PCS1 0x0010 0x02a0 0x3 0x1 +#define IMX7ULP_PAD_PTC4__LPUART5_CTS_B 0x0010 0x0250 0x4 0x1 +#define IMX7ULP_PAD_PTC4__LPI2C5_SCL 0x0010 0x02bc 0x5 0x1 +#define IMX7ULP_PAD_PTC4__TPM4_CH3 0x0010 0x028c 0x6 0x1 +#define IMX7ULP_PAD_PTC4__FB_AD4 0x0010 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC5__PTC5 0x0014 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC5__TRACE_D10 0x0014 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC5__FXIO1_D1 0x0014 0x0208 0x2 0x1 +#define IMX7ULP_PAD_PTC5__LPSPI2_PCS2 0x0014 0x02a4 0x3 0x1 +#define IMX7ULP_PAD_PTC5__LPUART5_RTS_B 0x0014 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTC5__LPI2C5_SDA 0x0014 0x02c0 0x5 0x1 +#define IMX7ULP_PAD_PTC5__TPM4_CH4 0x0014 0x0290 0x6 0x1 +#define IMX7ULP_PAD_PTC5__FB_AD5 0x0014 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC6__PTC6 0x0018 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC6__TRACE_D9 0x0018 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC6__FXIO1_D2 0x0018 0x020c 0x2 0x1 +#define IMX7ULP_PAD_PTC6__LPSPI2_PCS3 0x0018 0x02a8 0x3 0x1 +#define IMX7ULP_PAD_PTC6__LPUART5_TX 0x0018 0x0258 0x4 0x1 +#define IMX7ULP_PAD_PTC6__LPI2C5_HREQ 0x0018 0x02b8 0x5 0x1 +#define IMX7ULP_PAD_PTC6__TPM4_CH5 0x0018 0x0294 0x6 0x1 +#define IMX7ULP_PAD_PTC6__FB_AD6 0x0018 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC7__PTC7 0x001c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC7__TRACE_D8 0x001c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC7__FXIO1_D3 0x001c 0x0210 0x2 0x1 +#define IMX7ULP_PAD_PTC7__LPUART5_RX 0x001c 0x0254 0x4 0x1 +#define IMX7ULP_PAD_PTC7__TPM5_CH1 0x001c 0x02c8 0x6 0x1 +#define IMX7ULP_PAD_PTC7__FB_AD7 0x001c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC8__PTC8 0x0020 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC8__TRACE_D7 0x0020 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC8__FXIO1_D4 0x0020 0x0214 0x2 0x1 +#define IMX7ULP_PAD_PTC8__LPSPI2_SIN 0x0020 0x02b0 0x3 0x1 +#define IMX7ULP_PAD_PTC8__LPUART6_CTS_B 0x0020 0x025c 0x4 0x1 +#define IMX7ULP_PAD_PTC8__LPI2C6_SCL 0x0020 0x02fc 0x5 0x1 +#define IMX7ULP_PAD_PTC8__TPM5_CLKIN 0x0020 0x02cc 0x6 0x1 +#define IMX7ULP_PAD_PTC8__FB_AD8 0x0020 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC9__PTC9 0x0024 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC9__TRACE_D6 0x0024 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC9__FXIO1_D5 0x0024 0x0218 0x2 0x1 +#define IMX7ULP_PAD_PTC9__LPSPI2_SOUT 0x0024 0x02b4 0x3 0x1 +#define IMX7ULP_PAD_PTC9__LPUART6_RTS_B 0x0024 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTC9__LPI2C6_SDA 0x0024 0x0300 0x5 0x1 +#define IMX7ULP_PAD_PTC9__TPM5_CH0 0x0024 0x02c4 0x6 0x1 +#define IMX7ULP_PAD_PTC9__FB_AD9 0x0024 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC10__PTC10 0x0028 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC10__TRACE_D5 0x0028 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC10__FXIO1_D6 0x0028 0x021c 0x2 0x1 +#define IMX7ULP_PAD_PTC10__LPSPI2_SCK 0x0028 0x02ac 0x3 0x1 +#define IMX7ULP_PAD_PTC10__LPUART6_TX 0x0028 0x0264 0x4 0x1 +#define IMX7ULP_PAD_PTC10__LPI2C6_HREQ 0x0028 0x02f8 0x5 0x1 +#define IMX7ULP_PAD_PTC10__TPM7_CH3 0x0028 0x02e8 0x6 0x1 +#define IMX7ULP_PAD_PTC10__FB_AD10 0x0028 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC11__PTC11 0x002c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC11__TRACE_D4 0x002c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC11__FXIO1_D7 0x002c 0x0220 0x2 0x1 +#define IMX7ULP_PAD_PTC11__LPSPI2_PCS0 0x002c 0x029c 0x3 0x1 +#define IMX7ULP_PAD_PTC11__LPUART6_RX 0x002c 0x0260 0x4 0x1 +#define IMX7ULP_PAD_PTC11__TPM7_CH4 0x002c 0x02ec 0x6 0x1 +#define IMX7ULP_PAD_PTC11__FB_AD11 0x002c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC12__PTC12 0x0030 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC12__TRACE_D3 0x0030 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC12__FXIO1_D8 0x0030 0x0224 0x2 0x1 +#define IMX7ULP_PAD_PTC12__LPSPI3_PCS1 0x0030 0x0314 0x3 0x1 +#define IMX7ULP_PAD_PTC12__LPUART7_CTS_B 0x0030 0x0268 0x4 0x1 +#define IMX7ULP_PAD_PTC12__LPI2C7_SCL 0x0030 0x0308 0x5 0x1 +#define IMX7ULP_PAD_PTC12__TPM7_CH5 0x0030 0x02f0 0x6 0x1 +#define IMX7ULP_PAD_PTC12__FB_AD12 0x0030 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC13__PTC13 0x0034 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC13__TRACE_D2 0x0034 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC13__FXIO1_D9 0x0034 0x0228 0x2 0x1 +#define IMX7ULP_PAD_PTC13__LPSPI3_PCS2 0x0034 0x0318 0x3 0x1 +#define IMX7ULP_PAD_PTC13__LPUART7_RTS_B 0x0034 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTC13__LPI2C7_SDA 0x0034 0x030c 0x5 0x1 +#define IMX7ULP_PAD_PTC13__TPM7_CLKIN 0x0034 0x02f4 0x6 0x1 +#define IMX7ULP_PAD_PTC13__FB_AD13 0x0034 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC14__PTC14 0x0038 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC14__TRACE_D1 0x0038 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC14__FXIO1_D10 0x0038 0x022c 0x2 0x1 +#define IMX7ULP_PAD_PTC14__LPSPI3_PCS3 0x0038 0x031c 0x3 0x1 +#define IMX7ULP_PAD_PTC14__LPUART7_TX 0x0038 0x0270 0x4 0x1 +#define IMX7ULP_PAD_PTC14__LPI2C7_HREQ 0x0038 0x0304 0x5 0x1 +#define IMX7ULP_PAD_PTC14__TPM7_CH0 0x0038 0x02dc 0x6 0x1 +#define IMX7ULP_PAD_PTC14__FB_AD14 0x0038 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC15__PTC15 0x003c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC15__TRACE_D0 0x003c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC15__FXIO1_D11 0x003c 0x0230 0x2 0x1 +#define IMX7ULP_PAD_PTC15__LPUART7_RX 0x003c 0x026c 0x4 0x1 +#define IMX7ULP_PAD_PTC15__TPM7_CH1 0x003c 0x02e0 0x6 0x1 +#define IMX7ULP_PAD_PTC15__FB_AD15 0x003c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC16__PTC16 0x0040 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC16__TRACE_CLKOUT 0x0040 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC16__FXIO1_D12 0x0040 0x0234 0x2 0x1 +#define IMX7ULP_PAD_PTC16__LPSPI3_SIN 0x0040 0x0324 0x3 0x1 +#define IMX7ULP_PAD_PTC16__TPM7_CH2 0x0040 0x02e4 0x6 0x1 +#define IMX7ULP_PAD_PTC16__FB_ALE_FB_CS1_B_FB_TS_B 0x0040 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC17__PTC17 0x0044 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC17__FXIO1_D13 0x0044 0x0238 0x2 0x1 +#define IMX7ULP_PAD_PTC17__LPSPI3_SOUT 0x0044 0x0328 0x3 0x1 +#define IMX7ULP_PAD_PTC17__TPM6_CLKIN 0x0044 0x02d8 0x6 0x1 +#define IMX7ULP_PAD_PTC17__FB_CS0_B 0x0044 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC18__PTC18 0x0048 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC18__FXIO1_D14 0x0048 0x023c 0x2 0x1 +#define IMX7ULP_PAD_PTC18__LPSPI3_SCK 0x0048 0x0320 0x3 0x1 +#define IMX7ULP_PAD_PTC18__TPM6_CH0 0x0048 0x02d0 0x6 0x1 +#define IMX7ULP_PAD_PTC18__FB_OE_B 0x0048 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC19__PTC19 0x004c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC19__FXIO1_D15 0x004c 0x0240 0x2 0x1 +#define IMX7ULP_PAD_PTC19__LPSPI3_PCS0 0x004c 0x0310 0x3 0x1 +#define IMX7ULP_PAD_PTC19__TPM6_CH1 0x004c 0x02d4 0x6 0x1 +#define IMX7ULP_PAD_PTC19__FB_A16 0x004c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTD0__PTD0 0x0080 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD0__SDHC0_RESET_B 0x0080 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD1__PTD1 0x0084 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD1__SDHC0_CMD 0x0084 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD2__PTD2 0x0088 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD2__SDHC0_CLK 0x0088 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD3__PTD3 0x008c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD3__SDHC0_D7 0x008c 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD4__PTD4 0x0090 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD4__SDHC0_D6 0x0090 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD5__PTD5 0x0094 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD5__SDHC0_D5 0x0094 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD6__PTD6 0x0098 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD6__SDHC0_D4 0x0098 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD7__PTD7 0x009c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD7__SDHC0_D3 0x009c 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD8__PTD8 0x00a0 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD8__TPM4_CLKIN 0x00a0 0x0298 0x6 0x2 +#define IMX7ULP_PAD_PTD8__SDHC0_D2 0x00a0 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD9__PTD9 0x00a4 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD9__TPM4_CH0 0x00a4 0x0280 0x6 0x2 +#define IMX7ULP_PAD_PTD9__SDHC0_D1 0x00a4 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD10__PTD10 0x00a8 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD10__TPM4_CH1 0x00a8 0x0284 0x6 0x2 +#define IMX7ULP_PAD_PTD10__SDHC0_D0 0x00a8 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD11__PTD11 0x00ac 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD11__TPM4_CH2 0x00ac 0x0288 0x6 0x2 +#define IMX7ULP_PAD_PTD11__SDHC0_DQS 0x00ac 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE0__PTE0 0x0100 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE0__FXIO1_D31 0x0100 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE0__LPSPI2_PCS1 0x0100 0x02a0 0x3 0x2 +#define IMX7ULP_PAD_PTE0__LPUART4_CTS_B 0x0100 0x0244 0x4 0x2 +#define IMX7ULP_PAD_PTE0__LPI2C4_SCL 0x0100 0x0278 0x5 0x2 +#define IMX7ULP_PAD_PTE0__SDHC1_D1 0x0100 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE0__FB_A25 0x0100 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE1__PTE1 0x0104 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE1__FXIO1_D30 0x0104 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE1__LPSPI2_PCS2 0x0104 0x02a4 0x3 0x2 +#define IMX7ULP_PAD_PTE1__LPUART4_RTS_B 0x0104 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTE1__LPI2C4_SDA 0x0104 0x027c 0x5 0x2 +#define IMX7ULP_PAD_PTE1__SDHC1_D0 0x0104 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE1__FB_A26 0x0104 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE2__PTE2 0x0108 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE2__FXIO1_D29 0x0108 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE2__LPSPI2_PCS3 0x0108 0x02a8 0x3 0x2 +#define IMX7ULP_PAD_PTE2__LPUART4_TX 0x0108 0x024c 0x4 0x2 +#define IMX7ULP_PAD_PTE2__LPI2C4_HREQ 0x0108 0x0274 0x5 0x2 +#define IMX7ULP_PAD_PTE2__SDHC1_CLK 0x0108 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE3__PTE3 0x010c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE3__FXIO1_D28 0x010c 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE3__LPUART4_RX 0x010c 0x0248 0x4 0x2 +#define IMX7ULP_PAD_PTE3__TPM5_CH1 0x010c 0x02c8 0x6 0x2 +#define IMX7ULP_PAD_PTE3__SDHC1_CMD 0x010c 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE4__PTE4 0x0110 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE4__FXIO1_D27 0x0110 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE4__LPSPI2_SIN 0x0110 0x02b0 0x3 0x2 +#define IMX7ULP_PAD_PTE4__LPUART5_CTS_B 0x0110 0x0250 0x4 0x2 +#define IMX7ULP_PAD_PTE4__LPI2C5_SCL 0x0110 0x02bc 0x5 0x2 +#define IMX7ULP_PAD_PTE4__TPM5_CLKIN 0x0110 0x02cc 0x6 0x2 +#define IMX7ULP_PAD_PTE4__SDHC1_D3 0x0110 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE5__PTE5 0x0114 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE5__FXIO1_D26 0x0114 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE5__LPSPI2_SOUT 0x0114 0x02b4 0x3 0x2 +#define IMX7ULP_PAD_PTE5__LPUART5_RTS_B 0x0114 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTE5__LPI2C5_SDA 0x0114 0x02c0 0x5 0x2 +#define IMX7ULP_PAD_PTE5__TPM5_CH0 0x0114 0x02c4 0x6 0x2 +#define IMX7ULP_PAD_PTE5__SDHC1_D2 0x0114 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE6__PTE6 0x0118 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE6__FXIO1_D25 0x0118 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE6__LPSPI2_SCK 0x0118 0x02ac 0x3 0x2 +#define IMX7ULP_PAD_PTE6__LPUART5_TX 0x0118 0x0258 0x4 0x2 +#define IMX7ULP_PAD_PTE6__LPI2C5_HREQ 0x0118 0x02b8 0x5 0x2 +#define IMX7ULP_PAD_PTE6__TPM7_CH3 0x0118 0x02e8 0x6 0x2 +#define IMX7ULP_PAD_PTE6__SDHC1_D4 0x0118 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE6__FB_A17 0x0118 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE7__PTE7 0x011c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE7__TRACE_D7 0x011c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE7__VIU_FID 0x011c 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE7__FXIO1_D24 0x011c 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE7__LPSPI2_PCS0 0x011c 0x029c 0x3 0x2 +#define IMX7ULP_PAD_PTE7__LPUART5_RX 0x011c 0x0254 0x4 0x2 +#define IMX7ULP_PAD_PTE7__TPM7_CH4 0x011c 0x02ec 0x6 0x2 +#define IMX7ULP_PAD_PTE7__SDHC1_D5 0x011c 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE7__FB_A18 0x011c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE8__PTE8 0x0120 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE8__TRACE_D6 0x0120 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE8__VIU_D16 0x0120 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE8__FXIO1_D23 0x0120 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE8__LPSPI3_PCS1 0x0120 0x0314 0x3 0x2 +#define IMX7ULP_PAD_PTE8__LPUART6_CTS_B 0x0120 0x025c 0x4 0x2 +#define IMX7ULP_PAD_PTE8__LPI2C6_SCL 0x0120 0x02fc 0x5 0x2 +#define IMX7ULP_PAD_PTE8__TPM7_CH5 0x0120 0x02f0 0x6 0x2 +#define IMX7ULP_PAD_PTE8__SDHC1_WP 0x0120 0x0200 0x7 0x1 +#define IMX7ULP_PAD_PTE8__SDHC1_D6 0x0120 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE8__FB_CS3_B_FB_BE7_0_BLS31_24_B 0x0120 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE9__PTE9 0x0124 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE9__TRACE_D5 0x0124 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE9__VIU_D17 0x0124 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE9__FXIO1_D22 0x0124 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE9__LPSPI3_PCS2 0x0124 0x0318 0x3 0x2 +#define IMX7ULP_PAD_PTE9__LPUART6_RTS_B 0x0124 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTE9__LPI2C6_SDA 0x0124 0x0300 0x5 0x2 +#define IMX7ULP_PAD_PTE9__TPM7_CLKIN 0x0124 0x02f4 0x6 0x2 +#define IMX7ULP_PAD_PTE9__SDHC1_CD 0x0124 0x032c 0x7 0x1 +#define IMX7ULP_PAD_PTE9__SDHC1_D7 0x0124 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE9__FB_TBST_B_FB_CS2_B_FB_BE15_8_BLS23_16_B 0x0124 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE10__PTE10 0x0128 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE10__TRACE_D4 0x0128 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE10__VIU_D18 0x0128 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE10__FXIO1_D21 0x0128 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE10__LPSPI3_PCS3 0x0128 0x031c 0x3 0x2 +#define IMX7ULP_PAD_PTE10__LPUART6_TX 0x0128 0x0264 0x4 0x2 +#define IMX7ULP_PAD_PTE10__LPI2C6_HREQ 0x0128 0x02f8 0x5 0x2 +#define IMX7ULP_PAD_PTE10__TPM7_CH0 0x0128 0x02dc 0x6 0x2 +#define IMX7ULP_PAD_PTE10__SDHC1_VS 0x0128 0x0000 0x7 0x0 +#define IMX7ULP_PAD_PTE10__SDHC1_DQS 0x0128 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE10__FB_A19 0x0128 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE11__PTE11 0x012c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE11__TRACE_D3 0x012c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE11__VIU_D19 0x012c 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE11__FXIO1_D20 0x012c 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE11__LPUART6_RX 0x012c 0x0260 0x4 0x2 +#define IMX7ULP_PAD_PTE11__TPM7_CH1 0x012c 0x02e0 0x6 0x2 +#define IMX7ULP_PAD_PTE11__SDHC1_RESET_B 0x012c 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE11__FB_A20 0x012c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE12__PTE12 0x0130 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE12__TRACE_D2 0x0130 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE12__VIU_D20 0x0130 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE12__FXIO1_D19 0x0130 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE12__LPSPI3_SIN 0x0130 0x0324 0x3 0x2 +#define IMX7ULP_PAD_PTE12__LPUART7_CTS_B 0x0130 0x0268 0x4 0x2 +#define IMX7ULP_PAD_PTE12__LPI2C7_SCL 0x0130 0x0308 0x5 0x2 +#define IMX7ULP_PAD_PTE12__TPM7_CH2 0x0130 0x02e4 0x6 0x2 +#define IMX7ULP_PAD_PTE12__SDHC1_WP 0x0130 0x0200 0x8 0x2 +#define IMX7ULP_PAD_PTE12__FB_A21 0x0130 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE13__PTE13 0x0134 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE13__TRACE_D1 0x0134 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE13__VIU_D21 0x0134 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE13__FXIO1_D18 0x0134 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE13__LPSPI3_SOUT 0x0134 0x0328 0x3 0x2 +#define IMX7ULP_PAD_PTE13__LPUART7_RTS_B 0x0134 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTE13__LPI2C7_SDA 0x0134 0x030c 0x5 0x2 +#define IMX7ULP_PAD_PTE13__TPM6_CLKIN 0x0134 0x02d8 0x6 0x2 +#define IMX7ULP_PAD_PTE13__SDHC1_CD 0x0134 0x032c 0x8 0x2 +#define IMX7ULP_PAD_PTE13__FB_A22 0x0134 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE14__PTE14 0x0138 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE14__TRACE_D0 0x0138 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE14__VIU_D22 0x0138 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE14__FXIO1_D17 0x0138 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE14__LPSPI3_SCK 0x0138 0x0320 0x3 0x2 +#define IMX7ULP_PAD_PTE14__LPUART7_TX 0x0138 0x0270 0x4 0x2 +#define IMX7ULP_PAD_PTE14__LPI2C7_HREQ 0x0138 0x0304 0x5 0x2 +#define IMX7ULP_PAD_PTE14__TPM6_CH0 0x0138 0x02d0 0x6 0x2 +#define IMX7ULP_PAD_PTE14__SDHC1_VS 0x0138 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE14__FB_A23 0x0138 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE15__PTE15 0x013c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE15__TRACE_CLKOUT 0x013c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE15__VIU_D23 0x013c 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE15__FXIO1_D16 0x013c 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE15__LPSPI3_PCS0 0x013c 0x0310 0x3 0x2 +#define IMX7ULP_PAD_PTE15__LPUART7_RX 0x013c 0x026c 0x4 0x2 +#define IMX7ULP_PAD_PTE15__TPM6_CH1 0x013c 0x02d4 0x6 0x2 +#define IMX7ULP_PAD_PTE15__FB_A24 0x013c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF0__PTF0 0x0180 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF0__VIU_DE 0x0180 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF0__LPUART4_CTS_B 0x0180 0x0244 0x4 0x3 +#define IMX7ULP_PAD_PTF0__LPI2C4_SCL 0x0180 0x0278 0x5 0x3 +#define IMX7ULP_PAD_PTF0__TPM4_CLKIN 0x0180 0x0298 0x6 0x3 +#define IMX7ULP_PAD_PTF0__FB_RW_B 0x0180 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF1__PTF1 0x0184 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF1__VIU_HSYNC 0x0184 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF1__LPUART4_RTS_B 0x0184 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTF1__LPI2C4_SDA 0x0184 0x027c 0x5 0x3 +#define IMX7ULP_PAD_PTF1__TPM4_CH0 0x0184 0x0280 0x6 0x3 +#define IMX7ULP_PAD_PTF1__CLKOUT 0x0184 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF2__PTF2 0x0188 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF2__VIU_VSYNC 0x0188 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF2__LPUART4_TX 0x0188 0x024c 0x4 0x3 +#define IMX7ULP_PAD_PTF2__LPI2C4_HREQ 0x0188 0x0274 0x5 0x3 +#define IMX7ULP_PAD_PTF2__TPM4_CH1 0x0188 0x0284 0x6 0x3 +#define IMX7ULP_PAD_PTF2__FB_TSIZ1_FB_CS5_B_FB_BE23_16_BLS15_8_B 0x0188 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF3__PTF3 0x018c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF3__VIU_PCLK 0x018c 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF3__LPUART4_RX 0x018c 0x0248 0x4 0x3 +#define IMX7ULP_PAD_PTF3__TPM4_CH2 0x018c 0x0288 0x6 0x3 +#define IMX7ULP_PAD_PTF3__FB_AD16 0x018c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF4__PTF4 0x0190 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF4__VIU_D0 0x0190 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF4__FXIO1_D0 0x0190 0x0204 0x2 0x2 +#define IMX7ULP_PAD_PTF4__LPSPI2_PCS1 0x0190 0x02a0 0x3 0x3 +#define IMX7ULP_PAD_PTF4__LPUART5_CTS_B 0x0190 0x0250 0x4 0x3 +#define IMX7ULP_PAD_PTF4__LPI2C5_SCL 0x0190 0x02bc 0x5 0x3 +#define IMX7ULP_PAD_PTF4__TPM4_CH3 0x0190 0x028c 0x6 0x2 +#define IMX7ULP_PAD_PTF4__FB_AD17 0x0190 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF5__PTF5 0x0194 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF5__VIU_D1 0x0194 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF5__FXIO1_D1 0x0194 0x0208 0x2 0x2 +#define IMX7ULP_PAD_PTF5__LPSPI2_PCS2 0x0194 0x02a4 0x3 0x3 +#define IMX7ULP_PAD_PTF5__LPUART5_RTS_B 0x0194 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTF5__LPI2C5_SDA 0x0194 0x02c0 0x5 0x3 +#define IMX7ULP_PAD_PTF5__TPM4_CH4 0x0194 0x0290 0x6 0x2 +#define IMX7ULP_PAD_PTF5__FB_AD18 0x0194 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF6__PTF6 0x0198 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF6__VIU_D2 0x0198 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF6__FXIO1_D2 0x0198 0x020c 0x2 0x2 +#define IMX7ULP_PAD_PTF6__LPSPI2_PCS3 0x0198 0x02a8 0x3 0x3 +#define IMX7ULP_PAD_PTF6__LPUART5_TX 0x0198 0x0258 0x4 0x3 +#define IMX7ULP_PAD_PTF6__LPI2C5_HREQ 0x0198 0x02b8 0x5 0x3 +#define IMX7ULP_PAD_PTF6__TPM4_CH5 0x0198 0x0294 0x6 0x2 +#define IMX7ULP_PAD_PTF6__FB_AD19 0x0198 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF7__PTF7 0x019c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF7__VIU_D3 0x019c 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF7__FXIO1_D3 0x019c 0x0210 0x2 0x2 +#define IMX7ULP_PAD_PTF7__LPUART5_RX 0x019c 0x0254 0x4 0x3 +#define IMX7ULP_PAD_PTF7__TPM5_CH1 0x019c 0x02c8 0x6 0x3 +#define IMX7ULP_PAD_PTF7__FB_AD20 0x019c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF8__PTF8 0x01a0 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF8__USB1_ULPI_CLK 0x01a0 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF8__VIU_D4 0x01a0 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF8__FXIO1_D4 0x01a0 0x0214 0x2 0x2 +#define IMX7ULP_PAD_PTF8__LPSPI2_SIN 0x01a0 0x02b0 0x3 0x3 +#define IMX7ULP_PAD_PTF8__LPUART6_CTS_B 0x01a0 0x025c 0x4 0x3 +#define IMX7ULP_PAD_PTF8__LPI2C6_SCL 0x01a0 0x02fc 0x5 0x3 +#define IMX7ULP_PAD_PTF8__TPM5_CLKIN 0x01a0 0x02cc 0x6 0x3 +#define IMX7ULP_PAD_PTF8__FB_AD21 0x01a0 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF9__PTF9 0x01a4 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF9__USB1_ULPI_NXT 0x01a4 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF9__VIU_D5 0x01a4 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF9__FXIO1_D5 0x01a4 0x0218 0x2 0x2 +#define IMX7ULP_PAD_PTF9__LPSPI2_SOUT 0x01a4 0x02b4 0x3 0x3 +#define IMX7ULP_PAD_PTF9__LPUART6_RTS_B 0x01a4 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTF9__LPI2C6_SDA 0x01a4 0x0300 0x5 0x3 +#define IMX7ULP_PAD_PTF9__TPM5_CH0 0x01a4 0x02c4 0x6 0x3 +#define IMX7ULP_PAD_PTF9__FB_AD22 0x01a4 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF10__PTF10 0x01a8 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF10__USB1_ULPI_STP 0x01a8 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF10__VIU_D6 0x01a8 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF10__FXIO1_D6 0x01a8 0x021c 0x2 0x2 +#define IMX7ULP_PAD_PTF10__LPSPI2_SCK 0x01a8 0x02ac 0x3 0x3 +#define IMX7ULP_PAD_PTF10__LPUART6_TX 0x01a8 0x0264 0x4 0x3 +#define IMX7ULP_PAD_PTF10__LPI2C6_HREQ 0x01a8 0x02f8 0x5 0x3 +#define IMX7ULP_PAD_PTF10__TPM7_CH3 0x01a8 0x02e8 0x6 0x3 +#define IMX7ULP_PAD_PTF10__FB_AD23 0x01a8 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF11__PTF11 0x01ac 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF11__USB1_ULPI_DIR 0x01ac 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF11__VIU_D7 0x01ac 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF11__FXIO1_D7 0x01ac 0x0220 0x2 0x2 +#define IMX7ULP_PAD_PTF11__LPSPI2_PCS0 0x01ac 0x029c 0x3 0x3 +#define IMX7ULP_PAD_PTF11__LPUART6_RX 0x01ac 0x0260 0x4 0x3 +#define IMX7ULP_PAD_PTF11__TPM7_CH4 0x01ac 0x02ec 0x6 0x3 +#define IMX7ULP_PAD_PTF11__FB_CS4_B_FB_TSIZ0_FB_BE31_24_BLS7_0_B 0x01ac 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF12__PTF12 0x01b0 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF12__USB1_ULPI_DATA0 0x01b0 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF12__VIU_D8 0x01b0 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF12__FXIO1_D8 0x01b0 0x0224 0x2 0x2 +#define IMX7ULP_PAD_PTF12__LPSPI3_PCS1 0x01b0 0x0314 0x3 0x3 +#define IMX7ULP_PAD_PTF12__LPUART7_CTS_B 0x01b0 0x0268 0x4 0x3 +#define IMX7ULP_PAD_PTF12__LPI2C7_SCL 0x01b0 0x0308 0x5 0x3 +#define IMX7ULP_PAD_PTF12__TPM7_CH5 0x01b0 0x02f0 0x6 0x3 +#define IMX7ULP_PAD_PTF12__FB_AD24 0x01b0 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF13__PTF13 0x01b4 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF13__USB1_ULPI_DATA1 0x01b4 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF13__VIU_D9 0x01b4 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF13__FXIO1_D9 0x01b4 0x0228 0x2 0x2 +#define IMX7ULP_PAD_PTF13__LPSPI3_PCS2 0x01b4 0x0318 0x3 0x3 +#define IMX7ULP_PAD_PTF13__LPUART7_RTS_B 0x01b4 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTF13__LPI2C7_SDA 0x01b4 0x030c 0x5 0x3 +#define IMX7ULP_PAD_PTF13__TPM7_CLKIN 0x01b4 0x02f4 0x6 0x3 +#define IMX7ULP_PAD_PTF13__FB_AD25 0x01b4 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF14__PTF14 0x01b8 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF14__USB1_ULPI_DATA2 0x01b8 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF14__VIU_D10 0x01b8 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF14__FXIO1_D10 0x01b8 0x022c 0x2 0x2 +#define IMX7ULP_PAD_PTF14__LPSPI3_PCS3 0x01b8 0x031c 0x3 0x3 +#define IMX7ULP_PAD_PTF14__LPUART7_TX 0x01b8 0x0270 0x4 0x3 +#define IMX7ULP_PAD_PTF14__LPI2C7_HREQ 0x01b8 0x0304 0x5 0x3 +#define IMX7ULP_PAD_PTF14__TPM7_CH0 0x01b8 0x02dc 0x6 0x3 +#define IMX7ULP_PAD_PTF14__FB_AD26 0x01b8 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF15__PTF15 0x01bc 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF15__USB1_ULPI_DATA3 0x01bc 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF15__VIU_D11 0x01bc 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF15__FXIO1_D11 0x01bc 0x0230 0x2 0x2 +#define IMX7ULP_PAD_PTF15__LPUART7_RX 0x01bc 0x026c 0x4 0x3 +#define IMX7ULP_PAD_PTF15__TPM7_CH1 0x01bc 0x02e0 0x6 0x3 +#define IMX7ULP_PAD_PTF15__FB_AD27 0x01bc 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF16__PTF16 0x01c0 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF16__USB1_ULPI_DATA4 0x01c0 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF16__VIU_D12 0x01c0 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF16__FXIO1_D12 0x01c0 0x0234 0x2 0x2 +#define IMX7ULP_PAD_PTF16__LPSPI3_SIN 0x01c0 0x0324 0x3 0x3 +#define IMX7ULP_PAD_PTF16__TPM7_CH2 0x01c0 0x02e4 0x6 0x3 +#define IMX7ULP_PAD_PTF16__FB_AD28 0x01c0 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF17__PTF17 0x01c4 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF17__USB1_ULPI_DATA5 0x01c4 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF17__VIU_D13 0x01c4 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF17__FXIO1_D13 0x01c4 0x0238 0x2 0x2 +#define IMX7ULP_PAD_PTF17__LPSPI3_SOUT 0x01c4 0x0328 0x3 0x3 +#define IMX7ULP_PAD_PTF17__TPM6_CLKIN 0x01c4 0x02d8 0x6 0x3 +#define IMX7ULP_PAD_PTF17__FB_AD29 0x01c4 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF18__PTF18 0x01c8 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF18__USB1_ULPI_DATA6 0x01c8 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF18__VIU_D14 0x01c8 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF18__FXIO1_D14 0x01c8 0x023c 0x2 0x2 +#define IMX7ULP_PAD_PTF18__LPSPI3_SCK 0x01c8 0x0320 0x3 0x3 +#define IMX7ULP_PAD_PTF18__TPM6_CH0 0x01c8 0x02d0 0x6 0x3 +#define IMX7ULP_PAD_PTF18__FB_AD30 0x01c8 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF19__PTF19 0x01cc 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF19__USB1_ULPI_DATA7 0x01cc 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF19__VIU_D15 0x01cc 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF19__FXIO1_D15 0x01cc 0x0240 0x2 0x2 +#define IMX7ULP_PAD_PTF19__LPSPI3_PCS0 0x01cc 0x0310 0x3 0x3 +#define IMX7ULP_PAD_PTF19__TPM6_CH1 0x01cc 0x02d4 0x6 0x3 +#define IMX7ULP_PAD_PTF19__FB_AD31 0x01cc 0x0000 0x9 0x0 + +#endif /* __DTS_IMX7ULP_PINFUNC_H */ diff --git a/arch/arm/boot/dts/ls1021a.dtsi b/arch/arm/boot/dts/ls1021a.dtsi index 7bb9df2c1460..9319e1f0f1d8 100644 --- a/arch/arm/boot/dts/ls1021a.dtsi +++ b/arch/arm/boot/dts/ls1021a.dtsi @@ -129,14 +129,14 @@ }; msi1: msi-controller@1570e00 { - compatible = "fsl,1s1021a-msi"; + compatible = "fsl,ls1021a-msi"; reg = <0x0 0x1570e00 0x0 0x8>; msi-controller; interrupts = <GIC_SPI 179 IRQ_TYPE_LEVEL_HIGH>; }; msi2: msi-controller@1570e08 { - compatible = "fsl,1s1021a-msi"; + compatible = "fsl,ls1021a-msi"; reg = <0x0 0x1570e08 0x0 0x8>; msi-controller; interrupts = <GIC_SPI 180 IRQ_TYPE_LEVEL_HIGH>; @@ -699,7 +699,7 @@ bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */ 0x82000000 0x0 0x40000000 0x40 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */ - msi-parent = <&msi1>; + msi-parent = <&msi1>, <&msi2>; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0000 0 0 1 &gic GIC_SPI 91 IRQ_TYPE_LEVEL_HIGH>, @@ -722,7 +722,7 @@ bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000 /* downstream I/O */ 0x82000000 0x0 0x40000000 0x48 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */ - msi-parent = <&msi2>; + msi-parent = <&msi1>, <&msi2>; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0000 0 0 1 &gic GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH>, diff --git a/arch/arm/boot/dts/omap3-n950-n9.dtsi b/arch/arm/boot/dts/omap3-n950-n9.dtsi index df3366fa5409..cb47ae79a5f9 100644 --- a/arch/arm/boot/dts/omap3-n950-n9.dtsi +++ b/arch/arm/boot/dts/omap3-n950-n9.dtsi @@ -265,6 +265,20 @@ &i2c2 { clock-frequency = <400000>; + + as3645a@30 { + reg = <0x30>; + compatible = "ams,as3645a"; + flash { + flash-timeout-us = <150000>; + flash-max-microamp = <320000>; + led-max-microamp = <60000>; + peak-current-limit = <1750000>; + }; + indicator { + led-max-microamp = <10000>; + }; + }; }; &i2c3 { diff --git a/arch/arm/boot/dts/rk3228-evb.dts b/arch/arm/boot/dts/rk3228-evb.dts index 58834330a5ba..1be9daacc4f9 100644 --- a/arch/arm/boot/dts/rk3228-evb.dts +++ b/arch/arm/boot/dts/rk3228-evb.dts @@ -50,6 +50,16 @@ device_type = "memory"; reg = <0x60000000 0x40000000>; }; + + vcc_phy: vcc-phy-regulator { + compatible = "regulator-fixed"; + enable-active-high; + regulator-name = "vcc_phy"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + regulator-always-on; + regulator-boot-on; + }; }; &emmc { @@ -60,6 +70,30 @@ status = "okay"; }; +&gmac { + assigned-clocks = <&cru SCLK_MAC_SRC>; + assigned-clock-rates = <50000000>; + clock_in_out = "output"; + phy-supply = <&vcc_phy>; + phy-mode = "rmii"; + phy-handle = <&phy>; + status = "okay"; + + mdio { + compatible = "snps,dwmac-mdio"; + #address-cells = <1>; + #size-cells = <0>; + + phy: phy@0 { + compatible = "ethernet-phy-id1234.d400", "ethernet-phy-ieee802.3-c22"; + reg = <0>; + clocks = <&cru SCLK_MAC_PHY>; + resets = <&cru SRST_MACPHY>; + phy-is-integrated; + }; + }; +}; + &tsadc { status = "okay"; diff --git a/arch/arm/boot/dts/tango4-smp8758.dtsi b/arch/arm/boot/dts/tango4-smp8758.dtsi index d2e65c46bcc7..eca33d568690 100644 --- a/arch/arm/boot/dts/tango4-smp8758.dtsi +++ b/arch/arm/boot/dts/tango4-smp8758.dtsi @@ -13,7 +13,6 @@ reg = <0>; clocks = <&clkgen CPU_CLK>; clock-latency = <1>; - operating-points = <1215000 0 607500 0 405000 0 243000 0 135000 0>; }; cpu1: cpu@1 { diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig index e74de69caeab..1736813bdea7 100644 --- a/arch/arm/configs/imx_v6_v7_defconfig +++ b/arch/arm/configs/imx_v6_v7_defconfig @@ -226,7 +226,7 @@ CONFIG_REGULATOR_MC13892=y CONFIG_REGULATOR_PFUZE100=y CONFIG_MEDIA_SUPPORT=y CONFIG_MEDIA_CAMERA_SUPPORT=y -CONFIG_MEDIA_RC_SUPPORT=y +CONFIG_RC_CORE=y CONFIG_RC_DEVICES=y CONFIG_IR_GPIO_CIR=y CONFIG_MEDIA_USB_SUPPORT=y diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index 4d19c1b4b8e7..94d7e71c69c4 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -270,6 +270,7 @@ CONFIG_ICPLUS_PHY=y CONFIG_REALTEK_PHY=y CONFIG_MICREL_PHY=y CONFIG_FIXED_PHY=y +CONFIG_ROCKCHIP_PHY=y CONFIG_USB_PEGASUS=y CONFIG_USB_RTL8152=m CONFIG_USB_USBNET=y diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index a120ae816260..0414acf731ce 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -304,7 +304,7 @@ CONFIG_REGULATOR_TPS65910=y CONFIG_REGULATOR_TWL4030=y CONFIG_MEDIA_SUPPORT=m CONFIG_MEDIA_CAMERA_SUPPORT=y -CONFIG_MEDIA_RC_SUPPORT=y +CONFIG_RC_CORE=m CONFIG_MEDIA_CONTROLLER=y CONFIG_VIDEO_V4L2_SUBDEV_API=y CONFIG_LIRC=m diff --git a/arch/arm/configs/sunxi_defconfig b/arch/arm/configs/sunxi_defconfig index 0ec1d1ec130f..22cd559531a9 100644 --- a/arch/arm/configs/sunxi_defconfig +++ b/arch/arm/configs/sunxi_defconfig @@ -95,7 +95,7 @@ CONFIG_REGULATOR_FIXED_VOLTAGE=y CONFIG_REGULATOR_AXP20X=y CONFIG_REGULATOR_GPIO=y CONFIG_MEDIA_SUPPORT=y -CONFIG_MEDIA_RC_SUPPORT=y +CONFIG_RC_CORE=y CONFIG_RC_DEVICES=y CONFIG_IR_SUNXI=y CONFIG_DRM=y diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index b9adedcc5b2e..ec72752d5668 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -94,14 +94,15 @@ config CRYPTO_AES_ARM_CE ARMv8 Crypto Extensions config CRYPTO_GHASH_ARM_CE - tristate "PMULL-accelerated GHASH using ARMv8 Crypto Extensions" + tristate "PMULL-accelerated GHASH using NEON/ARMv8 Crypto Extensions" depends on KERNEL_MODE_NEON select CRYPTO_HASH select CRYPTO_CRYPTD help Use an implementation of GHASH (used by the GCM AEAD chaining mode) that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64) - that is part of the ARMv8 Crypto Extensions + that is part of the ARMv8 Crypto Extensions, or a slower variant that + uses the vmull.p8 instruction that is part of the basic NEON ISA. config CRYPTO_CRCT10DIF_ARM_CE tristate "CRCT10DIF digest algorithm using PMULL instructions" diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c index 0f966a8ca1ce..d0a9cec73707 100644 --- a/arch/arm/crypto/aes-ce-glue.c +++ b/arch/arm/crypto/aes-ce-glue.c @@ -285,9 +285,7 @@ static int ctr_encrypt(struct skcipher_request *req) ce_aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, num_rounds(ctx), blocks, walk.iv); - if (tdst != tsrc) - memcpy(tdst, tsrc, nbytes); - crypto_xor(tdst, tail, nbytes); + crypto_xor_cpy(tdst, tsrc, tail, nbytes); err = skcipher_walk_done(&walk, 0); } kernel_neon_end(); diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S index c817a86c4ca8..54b384084637 100644 --- a/arch/arm/crypto/aes-cipher-core.S +++ b/arch/arm/crypto/aes-cipher-core.S @@ -10,6 +10,7 @@ */ #include <linux/linkage.h> +#include <asm/cache.h> .text .align 5 @@ -32,19 +33,19 @@ .endif .endm - .macro __load, out, in, idx + .macro __load, out, in, idx, sz, op .if __LINUX_ARM_ARCH__ < 7 && \idx > 0 - ldr \out, [ttab, \in, lsr #(8 * \idx) - 2] + ldr\op \out, [ttab, \in, lsr #(8 * \idx) - \sz] .else - ldr \out, [ttab, \in, lsl #2] + ldr\op \out, [ttab, \in, lsl #\sz] .endif .endm - .macro __hround, out0, out1, in0, in1, in2, in3, t3, t4, enc + .macro __hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op __select \out0, \in0, 0 __select t0, \in1, 1 - __load \out0, \out0, 0 - __load t0, t0, 1 + __load \out0, \out0, 0, \sz, \op + __load t0, t0, 1, \sz, \op .if \enc __select \out1, \in1, 0 @@ -53,10 +54,10 @@ __select \out1, \in3, 0 __select t1, \in0, 1 .endif - __load \out1, \out1, 0 + __load \out1, \out1, 0, \sz, \op __select t2, \in2, 2 - __load t1, t1, 1 - __load t2, t2, 2 + __load t1, t1, 1, \sz, \op + __load t2, t2, 2, \sz, \op eor \out0, \out0, t0, ror #24 @@ -68,9 +69,9 @@ __select \t3, \in1, 2 __select \t4, \in2, 3 .endif - __load \t3, \t3, 2 - __load t0, t0, 3 - __load \t4, \t4, 3 + __load \t3, \t3, 2, \sz, \op + __load t0, t0, 3, \sz, \op + __load \t4, \t4, 3, \sz, \op eor \out1, \out1, t1, ror #24 eor \out0, \out0, t2, ror #16 @@ -82,14 +83,14 @@ eor \out1, \out1, t2 .endm - .macro fround, out0, out1, out2, out3, in0, in1, in2, in3 - __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1 - __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1 + .macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op + __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op + __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op .endm - .macro iround, out0, out1, out2, out3, in0, in1, in2, in3 - __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0 - __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0 + .macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op + __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op + __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op .endm .macro __rev, out, in @@ -114,7 +115,7 @@ .endif .endm - .macro do_crypt, round, ttab, ltab + .macro do_crypt, round, ttab, ltab, bsz push {r3-r11, lr} ldr r4, [in] @@ -146,9 +147,12 @@ 1: subs rounds, rounds, #4 \round r8, r9, r10, r11, r4, r5, r6, r7 - __adrl ttab, \ltab, ls + bls 2f \round r4, r5, r6, r7, r8, r9, r10, r11 - bhi 0b + b 0b + +2: __adrl ttab, \ltab + \round r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b #ifdef CONFIG_CPU_BIG_ENDIAN __rev r4, r4 @@ -170,10 +174,48 @@ .ltorg .endm + .align L1_CACHE_SHIFT + .type __aes_arm_inverse_sbox, %object +__aes_arm_inverse_sbox: + .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 + .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb + .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 + .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb + .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d + .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e + .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 + .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 + .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 + .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 + .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda + .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 + .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a + .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 + .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 + .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b + .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea + .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 + .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 + .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e + .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 + .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b + .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 + .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 + .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 + .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f + .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d + .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef + .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 + .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 + .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 + .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + .size __aes_arm_inverse_sbox, . - __aes_arm_inverse_sbox + ENTRY(__aes_arm_encrypt) - do_crypt fround, crypto_ft_tab, crypto_fl_tab + do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2 ENDPROC(__aes_arm_encrypt) + .align 5 ENTRY(__aes_arm_decrypt) - do_crypt iround, crypto_it_tab, crypto_il_tab + do_crypt iround, crypto_it_tab, __aes_arm_inverse_sbox, 0 ENDPROC(__aes_arm_decrypt) diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c index c76377961444..18768f330449 100644 --- a/arch/arm/crypto/aes-neonbs-glue.c +++ b/arch/arm/crypto/aes-neonbs-glue.c @@ -221,9 +221,8 @@ static int ctr_encrypt(struct skcipher_request *req) u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; - if (dst != src) - memcpy(dst, src, walk.total % AES_BLOCK_SIZE); - crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE); + crypto_xor_cpy(dst, src, final, + walk.total % AES_BLOCK_SIZE); err = skcipher_walk_done(&walk, 0); break; diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S index f6ab8bcc9efe..2f78c10b1881 100644 --- a/arch/arm/crypto/ghash-ce-core.S +++ b/arch/arm/crypto/ghash-ce-core.S @@ -1,7 +1,7 @@ /* - * Accelerated GHASH implementation with ARMv8 vmull.p64 instructions. + * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions. * - * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org> + * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -12,40 +12,162 @@ #include <asm/assembler.h> SHASH .req q0 - SHASH2 .req q1 - T1 .req q2 - T2 .req q3 - MASK .req q4 - XL .req q5 - XM .req q6 - XH .req q7 - IN1 .req q7 + T1 .req q1 + XL .req q2 + XM .req q3 + XH .req q4 + IN1 .req q4 SHASH_L .req d0 SHASH_H .req d1 - SHASH2_L .req d2 - T1_L .req d4 - MASK_L .req d8 - XL_L .req d10 - XL_H .req d11 - XM_L .req d12 - XM_H .req d13 - XH_L .req d14 + T1_L .req d2 + T1_H .req d3 + XL_L .req d4 + XL_H .req d5 + XM_L .req d6 + XM_H .req d7 + XH_L .req d8 + + t0l .req d10 + t0h .req d11 + t1l .req d12 + t1h .req d13 + t2l .req d14 + t2h .req d15 + t3l .req d16 + t3h .req d17 + t4l .req d18 + t4h .req d19 + + t0q .req q5 + t1q .req q6 + t2q .req q7 + t3q .req q8 + t4q .req q9 + T2 .req q9 + + s1l .req d20 + s1h .req d21 + s2l .req d22 + s2h .req d23 + s3l .req d24 + s3h .req d25 + s4l .req d26 + s4h .req d27 + + MASK .req d28 + SHASH2_p8 .req d28 + + k16 .req d29 + k32 .req d30 + k48 .req d31 + SHASH2_p64 .req d31 .text .fpu crypto-neon-fp-armv8 + .macro __pmull_p64, rd, rn, rm, b1, b2, b3, b4 + vmull.p64 \rd, \rn, \rm + .endm + /* - * void pmull_ghash_update(int blocks, u64 dg[], const char *src, - * struct ghash_key const *k, const char *head) + * This implementation of 64x64 -> 128 bit polynomial multiplication + * using vmull.p8 instructions (8x8 -> 16) is taken from the paper + * "Fast Software Polynomial Multiplication on ARM Processors Using + * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and + * Ricardo Dahab (https://hal.inria.fr/hal-01506572) + * + * It has been slightly tweaked for in-order performance, and to allow + * 'rq' to overlap with 'ad' or 'bd'. */ -ENTRY(pmull_ghash_update) - vld1.64 {SHASH}, [r3] + .macro __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l + vext.8 t0l, \ad, \ad, #1 @ A1 + .ifc \b1, t4l + vext.8 t4l, \bd, \bd, #1 @ B1 + .endif + vmull.p8 t0q, t0l, \bd @ F = A1*B + vext.8 t1l, \ad, \ad, #2 @ A2 + vmull.p8 t4q, \ad, \b1 @ E = A*B1 + .ifc \b2, t3l + vext.8 t3l, \bd, \bd, #2 @ B2 + .endif + vmull.p8 t1q, t1l, \bd @ H = A2*B + vext.8 t2l, \ad, \ad, #3 @ A3 + vmull.p8 t3q, \ad, \b2 @ G = A*B2 + veor t0q, t0q, t4q @ L = E + F + .ifc \b3, t4l + vext.8 t4l, \bd, \bd, #3 @ B3 + .endif + vmull.p8 t2q, t2l, \bd @ J = A3*B + veor t0l, t0l, t0h @ t0 = (L) (P0 + P1) << 8 + veor t1q, t1q, t3q @ M = G + H + .ifc \b4, t3l + vext.8 t3l, \bd, \bd, #4 @ B4 + .endif + vmull.p8 t4q, \ad, \b3 @ I = A*B3 + veor t1l, t1l, t1h @ t1 = (M) (P2 + P3) << 16 + vmull.p8 t3q, \ad, \b4 @ K = A*B4 + vand t0h, t0h, k48 + vand t1h, t1h, k32 + veor t2q, t2q, t4q @ N = I + J + veor t0l, t0l, t0h + veor t1l, t1l, t1h + veor t2l, t2l, t2h @ t2 = (N) (P4 + P5) << 24 + vand t2h, t2h, k16 + veor t3l, t3l, t3h @ t3 = (K) (P6 + P7) << 32 + vmov.i64 t3h, #0 + vext.8 t0q, t0q, t0q, #15 + veor t2l, t2l, t2h + vext.8 t1q, t1q, t1q, #14 + vmull.p8 \rq, \ad, \bd @ D = A*B + vext.8 t2q, t2q, t2q, #13 + vext.8 t3q, t3q, t3q, #12 + veor t0q, t0q, t1q + veor t2q, t2q, t3q + veor \rq, \rq, t0q + veor \rq, \rq, t2q + .endm + + // + // PMULL (64x64->128) based reduction for CPUs that can do + // it in a single instruction. + // + .macro __pmull_reduce_p64 + vmull.p64 T1, XL_L, MASK + + veor XH_L, XH_L, XM_H + vext.8 T1, T1, T1, #8 + veor XL_H, XL_H, XM_L + veor T1, T1, XL + + vmull.p64 XL, T1_H, MASK + .endm + + // + // Alternative reduction for CPUs that lack support for the + // 64x64->128 PMULL instruction + // + .macro __pmull_reduce_p8 + veor XL_H, XL_H, XM_L + veor XH_L, XH_L, XM_H + + vshl.i64 T1, XL, #57 + vshl.i64 T2, XL, #62 + veor T1, T1, T2 + vshl.i64 T2, XL, #63 + veor T1, T1, T2 + veor XL_H, XL_H, T1_L + veor XH_L, XH_L, T1_H + + vshr.u64 T1, XL, #1 + veor XH, XH, XL + veor XL, XL, T1 + vshr.u64 T1, T1, #6 + vshr.u64 XL, XL, #1 + .endm + + .macro ghash_update, pn vld1.64 {XL}, [r1] - vmov.i8 MASK, #0xe1 - vext.8 SHASH2, SHASH, SHASH, #8 - vshl.u64 MASK, MASK, #57 - veor SHASH2, SHASH2, SHASH /* do the head block first, if supplied */ ldr ip, [sp] @@ -62,33 +184,59 @@ ENTRY(pmull_ghash_update) #ifndef CONFIG_CPU_BIG_ENDIAN vrev64.8 T1, T1 #endif - vext.8 T2, XL, XL, #8 vext.8 IN1, T1, T1, #8 - veor T1, T1, T2 + veor T1_L, T1_L, XL_H veor XL, XL, IN1 - vmull.p64 XH, SHASH_H, XL_H @ a1 * b1 + __pmull_\pn XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h @ a1 * b1 veor T1, T1, XL - vmull.p64 XL, SHASH_L, XL_L @ a0 * b0 - vmull.p64 XM, SHASH2_L, T1_L @ (a1 + a0)(b1 + b0) + __pmull_\pn XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0 + __pmull_\pn XM, T1_L, SHASH2_\pn @ (a1+a0)(b1+b0) - vext.8 T1, XL, XH, #8 - veor T2, XL, XH + veor T1, XL, XH veor XM, XM, T1 - veor XM, XM, T2 - vmull.p64 T2, XL_L, MASK_L - vmov XH_L, XM_H - vmov XM_H, XL_L + __pmull_reduce_\pn - veor XL, XM, T2 - vext.8 T2, XL, XL, #8 - vmull.p64 XL, XL_L, MASK_L - veor T2, T2, XH - veor XL, XL, T2 + veor T1, T1, XH + veor XL, XL, T1 bne 0b vst1.64 {XL}, [r1] bx lr -ENDPROC(pmull_ghash_update) + .endm + + /* + * void pmull_ghash_update(int blocks, u64 dg[], const char *src, + * struct ghash_key const *k, const char *head) + */ +ENTRY(pmull_ghash_update_p64) + vld1.64 {SHASH}, [r3] + veor SHASH2_p64, SHASH_L, SHASH_H + + vmov.i8 MASK, #0xe1 + vshl.u64 MASK, MASK, #57 + + ghash_update p64 +ENDPROC(pmull_ghash_update_p64) + +ENTRY(pmull_ghash_update_p8) + vld1.64 {SHASH}, [r3] + veor SHASH2_p8, SHASH_L, SHASH_H + + vext.8 s1l, SHASH_L, SHASH_L, #1 + vext.8 s2l, SHASH_L, SHASH_L, #2 + vext.8 s3l, SHASH_L, SHASH_L, #3 + vext.8 s4l, SHASH_L, SHASH_L, #4 + vext.8 s1h, SHASH_H, SHASH_H, #1 + vext.8 s2h, SHASH_H, SHASH_H, #2 + vext.8 s3h, SHASH_H, SHASH_H, #3 + vext.8 s4h, SHASH_H, SHASH_H, #4 + + vmov.i64 k16, #0xffff + vmov.i64 k32, #0xffffffff + vmov.i64 k48, #0xffffffffffff + + ghash_update p8 +ENDPROC(pmull_ghash_update_p8) diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c index 6bac8bea9f1e..d9bb52cae2ac 100644 --- a/arch/arm/crypto/ghash-ce-glue.c +++ b/arch/arm/crypto/ghash-ce-glue.c @@ -22,6 +22,7 @@ MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("ghash"); #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 @@ -41,8 +42,17 @@ struct ghash_async_ctx { struct cryptd_ahash *cryptd_tfm; }; -asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src, - struct ghash_key const *k, const char *head); +asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, + const char *head); + +asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, + const char *head); + +static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, + const char *head); static int ghash_init(struct shash_desc *desc) { @@ -312,6 +322,14 @@ static int __init ghash_ce_mod_init(void) { int err; + if (!(elf_hwcap & HWCAP_NEON)) + return -ENODEV; + + if (elf_hwcap2 & HWCAP2_PMULL) + pmull_ghash_update = pmull_ghash_update_p64; + else + pmull_ghash_update = pmull_ghash_update_p8; + err = crypto_register_shash(&ghash_alg); if (err) return err; @@ -332,5 +350,5 @@ static void __exit ghash_ce_mod_exit(void) crypto_unregister_shash(&ghash_alg); } -module_cpu_feature_match(PMULL, ghash_ce_mod_init); +module_init(ghash_ce_mod_init); module_exit(ghash_ce_mod_exit); diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index 27475904e096..eee269321923 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -276,6 +276,12 @@ static inline u64 __gic_readq_nonatomic(const volatile void __iomem *addr) #define gicr_write_pendbaser(v, c) __gic_writeq_nonatomic(v, c) /* + * GICR_xLPIR - only the lower bits are significant + */ +#define gic_read_lpir(c) readl_relaxed(c) +#define gic_write_lpir(v, c) writel_relaxed(lower_32_bits(v), c) + +/* * GITS_TYPER is an ID register and doesn't need atomicity. */ #define gits_read_typer(c) __gic_readq_nonatomic(c) @@ -291,5 +297,33 @@ static inline u64 __gic_readq_nonatomic(const volatile void __iomem *addr) */ #define gits_write_cwriter(v, c) __gic_writeq_nonatomic(v, c) +/* + * GITS_VPROPBASER - hi and lo bits may be accessed independently. + */ +#define gits_write_vpropbaser(v, c) __gic_writeq_nonatomic(v, c) + +/* + * GITS_VPENDBASER - the Valid bit must be cleared before changing + * anything else. + */ +static inline void gits_write_vpendbaser(u64 val, void * __iomem addr) +{ + u32 tmp; + + tmp = readl_relaxed(addr + 4); + if (tmp & (GICR_VPENDBASER_Valid >> 32)) { + tmp &= ~(GICR_VPENDBASER_Valid >> 32); + writel_relaxed(tmp, addr + 4); + } + + /* + * Use the fact that __gic_writeq_nonatomic writes the second + * half of the 64bit quantity after the first. + */ + __gic_writeq_nonatomic(val, addr); +} + +#define gits_read_vpendbaser(c) __gic_readq_nonatomic(c) + #endif /* !__ASSEMBLY__ */ #endif /* !__ASM_ARCH_GICV3_H */ diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h index 6795368ad023..cc414382dab4 100644 --- a/arch/arm/include/asm/futex.h +++ b/arch/arm/include/asm/futex.h @@ -128,20 +128,10 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, #endif /* !SMP */ static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - #ifndef CONFIG_SMP preempt_disable(); #endif @@ -172,17 +162,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) preempt_enable(); #endif - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index 4bec45442072..c030143c18c6 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -52,22 +52,6 @@ static inline void dsb_sev(void) * memory. */ -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - u16 owner = READ_ONCE(lock->tickets.owner); - - for (;;) { - arch_spinlock_t tmp = READ_ONCE(*lock); - - if (tmp.tickets.owner == tmp.tickets.next || - tmp.tickets.owner != owner) - break; - - wfe(); - } - smp_acquire__after_ctrl_dep(); -} - #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) static inline void arch_spin_lock(arch_spinlock_t *lock) diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 776757d1604a..1d468b527b7b 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -139,10 +139,11 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define TIF_NEED_RESCHED 1 /* rescheduling necessary */ #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_UPROBE 3 /* breakpointed or singlestepping */ -#define TIF_SYSCALL_TRACE 4 /* syscall trace active */ -#define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */ -#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ -#define TIF_SECCOMP 7 /* seccomp syscall filtering active */ +#define TIF_FSCHECK 4 /* Check FS is USER_DS on return */ +#define TIF_SYSCALL_TRACE 5 /* syscall trace active */ +#define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */ +#define TIF_SYSCALL_TRACEPOINT 7 /* syscall tracepoint instrumentation */ +#define TIF_SECCOMP 8 /* seccomp syscall filtering active */ #define TIF_NOHZ 12 /* in adaptive nohz mode */ #define TIF_USING_IWMMXT 17 @@ -153,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_UPROBE (1 << TIF_UPROBE) +#define _TIF_FSCHECK (1 << TIF_FSCHECK) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) @@ -166,8 +168,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, /* * Change these and you break ASM code in entry-common.S */ -#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME | _TIF_UPROBE) +#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_FSCHECK) #endif /* __KERNEL__ */ #endif /* __ASM_ARM_THREAD_INFO_H */ diff --git a/arch/arm/include/asm/traps.h b/arch/arm/include/asm/traps.h index f555bb3664dc..683d9230984a 100644 --- a/arch/arm/include/asm/traps.h +++ b/arch/arm/include/asm/traps.h @@ -18,7 +18,6 @@ struct undef_hook { void register_undef_hook(struct undef_hook *hook); void unregister_undef_hook(struct undef_hook *hook); -#ifdef CONFIG_FUNCTION_GRAPH_TRACER static inline int __in_irqentry_text(unsigned long ptr) { extern char __irqentry_text_start[]; @@ -27,12 +26,6 @@ static inline int __in_irqentry_text(unsigned long ptr) return ptr >= (unsigned long)&__irqentry_text_start && ptr < (unsigned long)&__irqentry_text_end; } -#else -static inline int __in_irqentry_text(unsigned long ptr) -{ - return 0; -} -#endif static inline int in_exception_text(unsigned long ptr) { diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 0bf2347495f1..87936dd5d151 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -70,6 +70,8 @@ static inline void set_fs(mm_segment_t fs) { current_thread_info()->addr_limit = fs; modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER); + /* On user-mode return, check fs is correct */ + set_thread_flag(TIF_FSCHECK); } #define segment_eq(a, b) ((a) == (b)) diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index eb5cd77bf1d8..e33c32d56193 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -41,7 +41,9 @@ ret_fast_syscall: UNWIND(.cantunwind ) disable_irq_notrace @ disable interrupts ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK + tst r1, #_TIF_SYSCALL_WORK + bne fast_work_pending + tst r1, #_TIF_WORK_MASK bne fast_work_pending /* perform architecture specific actions before user return */ @@ -67,12 +69,15 @@ ret_fast_syscall: str r0, [sp, #S_R0 + S_OFF]! @ save returned r0 disable_irq_notrace @ disable interrupts ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK + tst r1, #_TIF_SYSCALL_WORK + bne fast_work_pending + tst r1, #_TIF_WORK_MASK beq no_work_pending UNWIND(.fnend ) ENDPROC(ret_fast_syscall) /* Slower path - fall through to work_pending */ +fast_work_pending: #endif tst r1, #_TIF_SYSCALL_WORK diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 5814298ef0b7..e2de50bf8742 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -14,6 +14,7 @@ #include <linux/uaccess.h> #include <linux/tracehook.h> #include <linux/uprobes.h> +#include <linux/syscalls.h> #include <asm/elf.h> #include <asm/cacheflush.h> @@ -613,6 +614,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) * Update the trace code with the current status. */ trace_hardirqs_off(); + + /* Check valid user FS if needed */ + addr_limit_user_check(); + do { if (likely(thread_flags & _TIF_NEED_RESCHED)) { schedule(); diff --git a/arch/arm/mach-hisi/Kconfig b/arch/arm/mach-hisi/Kconfig index a3b091a4d344..65a048fa08ec 100644 --- a/arch/arm/mach-hisi/Kconfig +++ b/arch/arm/mach-hisi/Kconfig @@ -39,6 +39,7 @@ config ARCH_HIP04 select HAVE_ARM_ARCH_TIMER select MCPM if SMP select MCPM_QUAD_CLUSTER if SMP + select GENERIC_IRQ_EFFECTIVE_AFF_MASK help Support for Hisilicon HiP04 SoC family diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile index 779fb1f680b3..b3b3b3a19183 100644 --- a/arch/arm/mach-omap2/Makefile +++ b/arch/arm/mach-omap2/Makefile @@ -8,7 +8,7 @@ ccflags-y := -I$(srctree)/$(src)/include \ # Common support obj-y := id.o io.o control.o devices.o fb.o timer.o pm.o \ common.o dma.o wd_timer.o display.o i2c.o hdq1w.o omap_hwmod.o \ - omap_device.o omap-headsmp.o sram.o drm.o + omap_device.o omap-headsmp.o sram.o hwmod-common = omap_hwmod.o omap_hwmod_reset.o \ omap_hwmod_common_data.o diff --git a/arch/arm/mach-omap2/board-generic.c b/arch/arm/mach-omap2/board-generic.c index b1e661bb5521..583fc39d84cd 100644 --- a/arch/arm/mach-omap2/board-generic.c +++ b/arch/arm/mach-omap2/board-generic.c @@ -33,6 +33,7 @@ static void __init __maybe_unused omap_generic_init(void) pdata_quirks_init(omap_dt_match_table); omapdss_init_of(); + omap_soc_device_init(); } #ifdef CONFIG_SOC_OMAP2420 diff --git a/arch/arm/mach-omap2/display.c b/arch/arm/mach-omap2/display.c index 8fa01c0ecdb2..b3f6eb5d04a2 100644 --- a/arch/arm/mach-omap2/display.c +++ b/arch/arm/mach-omap2/display.c @@ -66,6 +66,7 @@ */ #define FRAMEDONE_IRQ_TIMEOUT 100 +#if defined(CONFIG_FB_OMAP2) static struct platform_device omap_display_device = { .name = "omapdss", .id = -1, @@ -163,6 +164,65 @@ static enum omapdss_version __init omap_display_get_version(void) return OMAPDSS_VER_UNKNOWN; } +static int __init omapdss_init_fbdev(void) +{ + static struct omap_dss_board_info board_data = { + .dsi_enable_pads = omap_dsi_enable_pads, + .dsi_disable_pads = omap_dsi_disable_pads, + .set_min_bus_tput = omap_dss_set_min_bus_tput, + }; + struct device_node *node; + int r; + + board_data.version = omap_display_get_version(); + if (board_data.version == OMAPDSS_VER_UNKNOWN) { + pr_err("DSS not supported on this SoC\n"); + return -ENODEV; + } + + omap_display_device.dev.platform_data = &board_data; + + r = platform_device_register(&omap_display_device); + if (r < 0) { + pr_err("Unable to register omapdss device\n"); + return r; + } + + /* create vrfb device */ + r = omap_init_vrfb(); + if (r < 0) { + pr_err("Unable to register omapvrfb device\n"); + return r; + } + + /* create FB device */ + r = omap_init_fb(); + if (r < 0) { + pr_err("Unable to register omapfb device\n"); + return r; + } + + /* create V4L2 display device */ + r = omap_init_vout(); + if (r < 0) { + pr_err("Unable to register omap_vout device\n"); + return r; + } + + /* add DSI info for omap4 */ + node = of_find_node_by_name(NULL, "omap4_padconf_global"); + if (node) + omap4_dsi_mux_syscon = syscon_node_to_regmap(node); + + return 0; +} +#else +static inline int omapdss_init_fbdev(void) +{ + return 0; +} +#endif /* CONFIG_FB_OMAP2 */ + static void dispc_disable_outputs(void) { u32 v, irq_mask = 0; @@ -335,16 +395,9 @@ static struct device_node * __init omapdss_find_dss_of_node(void) int __init omapdss_init_of(void) { int r; - enum omapdss_version ver; struct device_node *node; struct platform_device *pdev; - static struct omap_dss_board_info board_data = { - .dsi_enable_pads = omap_dsi_enable_pads, - .dsi_disable_pads = omap_dsi_disable_pads, - .set_min_bus_tput = omap_dss_set_min_bus_tput, - }; - /* only create dss helper devices if dss is enabled in the .dts */ node = omapdss_find_dss_of_node(); @@ -354,13 +407,6 @@ int __init omapdss_init_of(void) if (!of_device_is_available(node)) return 0; - ver = omap_display_get_version(); - - if (ver == OMAPDSS_VER_UNKNOWN) { - pr_err("DSS not supported on this SoC\n"); - return -ENODEV; - } - pdev = of_find_device_by_node(node); if (!pdev) { @@ -374,48 +420,5 @@ int __init omapdss_init_of(void) return r; } - board_data.version = ver; - - omap_display_device.dev.platform_data = &board_data; - - r = platform_device_register(&omap_display_device); - if (r < 0) { - pr_err("Unable to register omapdss device\n"); - return r; - } - - /* create DRM device */ - r = omap_init_drm(); - if (r < 0) { - pr_err("Unable to register omapdrm device\n"); - return r; - } - - /* create vrfb device */ - r = omap_init_vrfb(); - if (r < 0) { - pr_err("Unable to register omapvrfb device\n"); - return r; - } - - /* create FB device */ - r = omap_init_fb(); - if (r < 0) { - pr_err("Unable to register omapfb device\n"); - return r; - } - - /* create V4L2 display device */ - r = omap_init_vout(); - if (r < 0) { - pr_err("Unable to register omap_vout device\n"); - return r; - } - - /* add DSI info for omap4 */ - node = of_find_node_by_name(NULL, "omap4_padconf_global"); - if (node) - omap4_dsi_mux_syscon = syscon_node_to_regmap(node); - - return 0; + return omapdss_init_fbdev(); } diff --git a/arch/arm/mach-omap2/display.h b/arch/arm/mach-omap2/display.h index 9a39646d4316..42ec2e99a2f4 100644 --- a/arch/arm/mach-omap2/display.h +++ b/arch/arm/mach-omap2/display.h @@ -26,7 +26,6 @@ struct omap_dss_dispc_dev_attr { bool has_framedonetv_irq; }; -int omap_init_drm(void); int omap_init_vrfb(void); int omap_init_fb(void); int omap_init_vout(void); diff --git a/arch/arm/mach-omap2/drm.c b/arch/arm/mach-omap2/drm.c deleted file mode 100644 index 44fef961bb70..000000000000 --- a/arch/arm/mach-omap2/drm.c +++ /dev/null @@ -1,53 +0,0 @@ -/* - * DRM/KMS device registration for TI OMAP platforms - * - * Copyright (C) 2012 Texas Instruments - * Author: Rob Clark <rob.clark@linaro.org> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published by - * the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/platform_device.h> -#include <linux/dma-mapping.h> -#include <linux/platform_data/omap_drm.h> - -#include "soc.h" -#include "display.h" - -#if IS_ENABLED(CONFIG_DRM_OMAP) - -static struct omap_drm_platform_data platform_data; - -static struct platform_device omap_drm_device = { - .dev = { - .coherent_dma_mask = DMA_BIT_MASK(32), - .platform_data = &platform_data, - }, - .name = "omapdrm", - .id = 0, -}; - -int __init omap_init_drm(void) -{ - platform_data.omaprev = GET_OMAP_TYPE; - - return platform_device_register(&omap_drm_device); - -} -#else -int __init omap_init_drm(void) { return 0; } -#endif diff --git a/arch/arm/mach-omap2/io.c b/arch/arm/mach-omap2/io.c index 1cd20e4d56b0..cb5d7314cf99 100644 --- a/arch/arm/mach-omap2/io.c +++ b/arch/arm/mach-omap2/io.c @@ -428,7 +428,6 @@ static void __init __maybe_unused omap_hwmod_init_postsetup(void) static void __init __maybe_unused omap_common_late_init(void) { omap2_common_pm_late_init(); - omap_soc_device_init(); } #ifdef CONFIG_SOC_OMAP2420 diff --git a/arch/arm/mach-pxa/raumfeld.c b/arch/arm/mach-pxa/raumfeld.c index e2c97728b3c6..9d662fed03ec 100644 --- a/arch/arm/mach-pxa/raumfeld.c +++ b/arch/arm/mach-pxa/raumfeld.c @@ -377,7 +377,7 @@ static struct gpiod_lookup_table raumfeld_rotary_gpios_table = { }, }; -static struct property_entry raumfeld_rotary_properties[] = { +static const struct property_entry raumfeld_rotary_properties[] __initconst = { PROPERTY_ENTRY_INTEGER("rotary-encoder,steps-per-period", u32, 24), PROPERTY_ENTRY_INTEGER("linux,axis", u32, REL_X), PROPERTY_ENTRY_INTEGER("rotary-encoder,relative_axis", u32, 1), diff --git a/arch/arm/mach-tegra/cpuidle-tegra114.c b/arch/arm/mach-tegra/cpuidle-tegra114.c index d3aa9be16621..e3fbcfedf845 100644 --- a/arch/arm/mach-tegra/cpuidle-tegra114.c +++ b/arch/arm/mach-tegra/cpuidle-tegra114.c @@ -60,7 +60,7 @@ static int tegra114_idle_power_down(struct cpuidle_device *dev, return index; } -static void tegra114_idle_enter_freeze(struct cpuidle_device *dev, +static void tegra114_idle_enter_s2idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { @@ -77,7 +77,7 @@ static struct cpuidle_driver tegra_idle_driver = { #ifdef CONFIG_PM_SLEEP [1] = { .enter = tegra114_idle_power_down, - .enter_freeze = tegra114_idle_enter_freeze, + .enter_s2idle = tegra114_idle_enter_s2idle, .exit_latency = 500, .target_residency = 1000, .flags = CPUIDLE_FLAG_TIMER_STOP, diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index d5b9fa19b684..c199990e12b6 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1,6 +1,7 @@ /* - * Just-In-Time compiler for BPF filters on 32bit ARM + * Just-In-Time compiler for eBPF filters on 32bit ARM * + * Copyright (c) 2017 Shubham Bansal <illusionist.neo@gmail.com> * Copyright (c) 2011 Mircea Gherzan <mgherzan@gmail.com> * * This program is free software; you can redistribute it and/or modify it @@ -8,6 +9,7 @@ * Free Software Foundation; version 2 of the License. */ +#include <linux/bpf.h> #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/errno.h> @@ -18,54 +20,101 @@ #include <linux/if_vlan.h> #include <asm/cacheflush.h> -#include <asm/set_memory.h> #include <asm/hwcap.h> #include <asm/opcodes.h> #include "bpf_jit_32.h" +int bpf_jit_enable __read_mostly; + +#define STACK_OFFSET(k) (k) +#define TMP_REG_1 (MAX_BPF_JIT_REG + 0) /* TEMP Register 1 */ +#define TMP_REG_2 (MAX_BPF_JIT_REG + 1) /* TEMP Register 2 */ +#define TCALL_CNT (MAX_BPF_JIT_REG + 2) /* Tail Call Count */ + +/* Flags used for JIT optimization */ +#define SEEN_CALL (1 << 0) + +#define FLAG_IMM_OVERFLOW (1 << 0) + /* - * ABI: + * Map eBPF registers to ARM 32bit registers or stack scratch space. + * + * 1. First argument is passed using the arm 32bit registers and rest of the + * arguments are passed on stack scratch space. + * 2. First callee-saved arugument is mapped to arm 32 bit registers and rest + * arguments are mapped to scratch space on stack. + * 3. We need two 64 bit temp registers to do complex operations on eBPF + * registers. + * + * As the eBPF registers are all 64 bit registers and arm has only 32 bit + * registers, we have to map each eBPF registers with two arm 32 bit regs or + * scratch memory space and we have to build eBPF 64 bit register from those. * - * r0 scratch register - * r4 BPF register A - * r5 BPF register X - * r6 pointer to the skb - * r7 skb->data - * r8 skb_headlen(skb) */ +static const u8 bpf2a32[][2] = { + /* return value from in-kernel function, and exit value from eBPF */ + [BPF_REG_0] = {ARM_R1, ARM_R0}, + /* arguments from eBPF program to in-kernel function */ + [BPF_REG_1] = {ARM_R3, ARM_R2}, + /* Stored on stack scratch space */ + [BPF_REG_2] = {STACK_OFFSET(0), STACK_OFFSET(4)}, + [BPF_REG_3] = {STACK_OFFSET(8), STACK_OFFSET(12)}, + [BPF_REG_4] = {STACK_OFFSET(16), STACK_OFFSET(20)}, + [BPF_REG_5] = {STACK_OFFSET(24), STACK_OFFSET(28)}, + /* callee saved registers that in-kernel function will preserve */ + [BPF_REG_6] = {ARM_R5, ARM_R4}, + /* Stored on stack scratch space */ + [BPF_REG_7] = {STACK_OFFSET(32), STACK_OFFSET(36)}, + [BPF_REG_8] = {STACK_OFFSET(40), STACK_OFFSET(44)}, + [BPF_REG_9] = {STACK_OFFSET(48), STACK_OFFSET(52)}, + /* Read only Frame Pointer to access Stack */ + [BPF_REG_FP] = {STACK_OFFSET(56), STACK_OFFSET(60)}, + /* Temporary Register for internal BPF JIT, can be used + * for constant blindings and others. + */ + [TMP_REG_1] = {ARM_R7, ARM_R6}, + [TMP_REG_2] = {ARM_R10, ARM_R8}, + /* Tail call count. Stored on stack scratch space. */ + [TCALL_CNT] = {STACK_OFFSET(64), STACK_OFFSET(68)}, + /* temporary register for blinding constants. + * Stored on stack scratch space. + */ + [BPF_REG_AX] = {STACK_OFFSET(72), STACK_OFFSET(76)}, +}; -#define r_scratch ARM_R0 -/* r1-r3 are (also) used for the unaligned loads on the non-ARMv7 slowpath */ -#define r_off ARM_R1 -#define r_A ARM_R4 -#define r_X ARM_R5 -#define r_skb ARM_R6 -#define r_skb_data ARM_R7 -#define r_skb_hl ARM_R8 - -#define SCRATCH_SP_OFFSET 0 -#define SCRATCH_OFF(k) (SCRATCH_SP_OFFSET + 4 * (k)) - -#define SEEN_MEM ((1 << BPF_MEMWORDS) - 1) -#define SEEN_MEM_WORD(k) (1 << (k)) -#define SEEN_X (1 << BPF_MEMWORDS) -#define SEEN_CALL (1 << (BPF_MEMWORDS + 1)) -#define SEEN_SKB (1 << (BPF_MEMWORDS + 2)) -#define SEEN_DATA (1 << (BPF_MEMWORDS + 3)) +#define dst_lo dst[1] +#define dst_hi dst[0] +#define src_lo src[1] +#define src_hi src[0] -#define FLAG_NEED_X_RESET (1 << 0) -#define FLAG_IMM_OVERFLOW (1 << 1) +/* + * JIT Context: + * + * prog : bpf_prog + * idx : index of current last JITed instruction. + * prologue_bytes : bytes used in prologue. + * epilogue_offset : offset of epilogue starting. + * seen : bit mask used for JIT optimization. + * offsets : array of eBPF instruction offsets in + * JITed code. + * target : final JITed code. + * epilogue_bytes : no of bytes used in epilogue. + * imm_count : no of immediate counts used for global + * variables. + * imms : array of global variable addresses. + */ struct jit_ctx { - const struct bpf_prog *skf; - unsigned idx; - unsigned prologue_bytes; - int ret0_fp_idx; + const struct bpf_prog *prog; + unsigned int idx; + unsigned int prologue_bytes; + unsigned int epilogue_offset; u32 seen; u32 flags; u32 *offsets; u32 *target; + u32 stack_size; #if __LINUX_ARM_ARCH__ < 7 u16 epilogue_bytes; u16 imm_count; @@ -73,68 +122,16 @@ struct jit_ctx { #endif }; -int bpf_jit_enable __read_mostly; - -static inline int call_neg_helper(struct sk_buff *skb, int offset, void *ret, - unsigned int size) -{ - void *ptr = bpf_internal_load_pointer_neg_helper(skb, offset, size); - - if (!ptr) - return -EFAULT; - memcpy(ret, ptr, size); - return 0; -} - -static u64 jit_get_skb_b(struct sk_buff *skb, int offset) -{ - u8 ret; - int err; - - if (offset < 0) - err = call_neg_helper(skb, offset, &ret, 1); - else - err = skb_copy_bits(skb, offset, &ret, 1); - - return (u64)err << 32 | ret; -} - -static u64 jit_get_skb_h(struct sk_buff *skb, int offset) -{ - u16 ret; - int err; - - if (offset < 0) - err = call_neg_helper(skb, offset, &ret, 2); - else - err = skb_copy_bits(skb, offset, &ret, 2); - - return (u64)err << 32 | ntohs(ret); -} - -static u64 jit_get_skb_w(struct sk_buff *skb, int offset) -{ - u32 ret; - int err; - - if (offset < 0) - err = call_neg_helper(skb, offset, &ret, 4); - else - err = skb_copy_bits(skb, offset, &ret, 4); - - return (u64)err << 32 | ntohl(ret); -} - /* * Wrappers which handle both OABI and EABI and assures Thumb2 interworking * (where the assembly routines like __aeabi_uidiv could cause problems). */ -static u32 jit_udiv(u32 dividend, u32 divisor) +static u32 jit_udiv32(u32 dividend, u32 divisor) { return dividend / divisor; } -static u32 jit_mod(u32 dividend, u32 divisor) +static u32 jit_mod32(u32 dividend, u32 divisor) { return dividend % divisor; } @@ -158,36 +155,22 @@ static inline void emit(u32 inst, struct jit_ctx *ctx) _emit(ARM_COND_AL, inst, ctx); } -static u16 saved_regs(struct jit_ctx *ctx) +/* + * Checks if immediate value can be converted to imm12(12 bits) value. + */ +static int16_t imm8m(u32 x) { - u16 ret = 0; - - if ((ctx->skf->len > 1) || - (ctx->skf->insns[0].code == (BPF_RET | BPF_A))) - ret |= 1 << r_A; - -#ifdef CONFIG_FRAME_POINTER - ret |= (1 << ARM_FP) | (1 << ARM_IP) | (1 << ARM_LR) | (1 << ARM_PC); -#else - if (ctx->seen & SEEN_CALL) - ret |= 1 << ARM_LR; -#endif - if (ctx->seen & (SEEN_DATA | SEEN_SKB)) - ret |= 1 << r_skb; - if (ctx->seen & SEEN_DATA) - ret |= (1 << r_skb_data) | (1 << r_skb_hl); - if (ctx->seen & SEEN_X) - ret |= 1 << r_X; - - return ret; -} + u32 rot; -static inline int mem_words_used(struct jit_ctx *ctx) -{ - /* yes, we do waste some stack space IF there are "holes" in the set" */ - return fls(ctx->seen & SEEN_MEM); + for (rot = 0; rot < 16; rot++) + if ((x & ~ror32(0xff, 2 * rot)) == 0) + return rol32(x, 2 * rot) | (rot << 8); + return -1; } +/* + * Initializes the JIT space with undefined instructions. + */ static void jit_fill_hole(void *area, unsigned int size) { u32 *ptr; @@ -196,88 +179,34 @@ static void jit_fill_hole(void *area, unsigned int size) *ptr++ = __opcode_to_mem_arm(ARM_INST_UDF); } -static void build_prologue(struct jit_ctx *ctx) -{ - u16 reg_set = saved_regs(ctx); - u16 off; - -#ifdef CONFIG_FRAME_POINTER - emit(ARM_MOV_R(ARM_IP, ARM_SP), ctx); - emit(ARM_PUSH(reg_set), ctx); - emit(ARM_SUB_I(ARM_FP, ARM_IP, 4), ctx); -#else - if (reg_set) - emit(ARM_PUSH(reg_set), ctx); -#endif - - if (ctx->seen & (SEEN_DATA | SEEN_SKB)) - emit(ARM_MOV_R(r_skb, ARM_R0), ctx); - - if (ctx->seen & SEEN_DATA) { - off = offsetof(struct sk_buff, data); - emit(ARM_LDR_I(r_skb_data, r_skb, off), ctx); - /* headlen = len - data_len */ - off = offsetof(struct sk_buff, len); - emit(ARM_LDR_I(r_skb_hl, r_skb, off), ctx); - off = offsetof(struct sk_buff, data_len); - emit(ARM_LDR_I(r_scratch, r_skb, off), ctx); - emit(ARM_SUB_R(r_skb_hl, r_skb_hl, r_scratch), ctx); - } - - if (ctx->flags & FLAG_NEED_X_RESET) - emit(ARM_MOV_I(r_X, 0), ctx); - - /* do not leak kernel data to userspace */ - if (bpf_needs_clear_a(&ctx->skf->insns[0])) - emit(ARM_MOV_I(r_A, 0), ctx); - - /* stack space for the BPF_MEM words */ - if (ctx->seen & SEEN_MEM) - emit(ARM_SUB_I(ARM_SP, ARM_SP, mem_words_used(ctx) * 4), ctx); -} - -static void build_epilogue(struct jit_ctx *ctx) -{ - u16 reg_set = saved_regs(ctx); - - if (ctx->seen & SEEN_MEM) - emit(ARM_ADD_I(ARM_SP, ARM_SP, mem_words_used(ctx) * 4), ctx); - - reg_set &= ~(1 << ARM_LR); +/* Stack must be multiples of 16 Bytes */ +#define STACK_ALIGN(sz) (((sz) + 3) & ~3) -#ifdef CONFIG_FRAME_POINTER - /* the first instruction of the prologue was: mov ip, sp */ - reg_set &= ~(1 << ARM_IP); - reg_set |= (1 << ARM_SP); - emit(ARM_LDM(ARM_SP, reg_set), ctx); -#else - if (reg_set) { - if (ctx->seen & SEEN_CALL) - reg_set |= 1 << ARM_PC; - emit(ARM_POP(reg_set), ctx); - } +/* Stack space for BPF_REG_2, BPF_REG_3, BPF_REG_4, + * BPF_REG_5, BPF_REG_7, BPF_REG_8, BPF_REG_9, + * BPF_REG_FP and Tail call counts. + */ +#define SCRATCH_SIZE 80 - if (!(ctx->seen & SEEN_CALL)) - emit(ARM_BX(ARM_LR), ctx); -#endif -} +/* total stack size used in JITed code */ +#define _STACK_SIZE \ + (ctx->prog->aux->stack_depth + \ + + SCRATCH_SIZE + \ + + 4 /* extra for skb_copy_bits buffer */) -static int16_t imm8m(u32 x) -{ - u32 rot; +#define STACK_SIZE STACK_ALIGN(_STACK_SIZE) - for (rot = 0; rot < 16; rot++) - if ((x & ~ror32(0xff, 2 * rot)) == 0) - return rol32(x, 2 * rot) | (rot << 8); +/* Get the offset of eBPF REGISTERs stored on scratch space. */ +#define STACK_VAR(off) (STACK_SIZE-off-4) - return -1; -} +/* Offset of skb_copy_bits buffer */ +#define SKB_BUFFER STACK_VAR(SCRATCH_SIZE) #if __LINUX_ARM_ARCH__ < 7 static u16 imm_offset(u32 k, struct jit_ctx *ctx) { - unsigned i = 0, offset; + unsigned int i = 0, offset; u16 imm; /* on the "fake" run we just count them (duplicates included) */ @@ -296,7 +225,7 @@ static u16 imm_offset(u32 k, struct jit_ctx *ctx) ctx->imms[i] = k; /* constants go just after the epilogue */ - offset = ctx->offsets[ctx->skf->len]; + offset = ctx->offsets[ctx->prog->len - 1] * 4; offset += ctx->prologue_bytes; offset += ctx->epilogue_bytes; offset += i * 4; @@ -320,10 +249,22 @@ static u16 imm_offset(u32 k, struct jit_ctx *ctx) #endif /* __LINUX_ARM_ARCH__ */ +static inline int bpf2a32_offset(int bpf_to, int bpf_from, + const struct jit_ctx *ctx) { + int to, from; + + if (ctx->target == NULL) + return 0; + to = ctx->offsets[bpf_to]; + from = ctx->offsets[bpf_from]; + + return to - from - 1; +} + /* * Move an immediate that's not an imm8m to a core register. */ -static inline void emit_mov_i_no8m(int rd, u32 val, struct jit_ctx *ctx) +static inline void emit_mov_i_no8m(const u8 rd, u32 val, struct jit_ctx *ctx) { #if __LINUX_ARM_ARCH__ < 7 emit(ARM_LDR_I(rd, ARM_PC, imm_offset(val, ctx)), ctx); @@ -334,7 +275,7 @@ static inline void emit_mov_i_no8m(int rd, u32 val, struct jit_ctx *ctx) #endif } -static inline void emit_mov_i(int rd, u32 val, struct jit_ctx *ctx) +static inline void emit_mov_i(const u8 rd, u32 val, struct jit_ctx *ctx) { int imm12 = imm8m(val); @@ -344,676 +285,1594 @@ static inline void emit_mov_i(int rd, u32 val, struct jit_ctx *ctx) emit_mov_i_no8m(rd, val, ctx); } -#if __LINUX_ARM_ARCH__ < 6 - -static void emit_load_be32(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx) +static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx) { - _emit(cond, ARM_LDRB_I(ARM_R3, r_addr, 1), ctx); - _emit(cond, ARM_LDRB_I(ARM_R1, r_addr, 0), ctx); - _emit(cond, ARM_LDRB_I(ARM_R2, r_addr, 3), ctx); - _emit(cond, ARM_LSL_I(ARM_R3, ARM_R3, 16), ctx); - _emit(cond, ARM_LDRB_I(ARM_R0, r_addr, 2), ctx); - _emit(cond, ARM_ORR_S(ARM_R3, ARM_R3, ARM_R1, SRTYPE_LSL, 24), ctx); - _emit(cond, ARM_ORR_R(ARM_R3, ARM_R3, ARM_R2), ctx); - _emit(cond, ARM_ORR_S(r_res, ARM_R3, ARM_R0, SRTYPE_LSL, 8), ctx); + ctx->seen |= SEEN_CALL; +#if __LINUX_ARM_ARCH__ < 5 + emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx); + + if (elf_hwcap & HWCAP_THUMB) + emit(ARM_BX(tgt_reg), ctx); + else + emit(ARM_MOV_R(ARM_PC, tgt_reg), ctx); +#else + emit(ARM_BLX_R(tgt_reg), ctx); +#endif } -static void emit_load_be16(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx) +static inline int epilogue_offset(const struct jit_ctx *ctx) { - _emit(cond, ARM_LDRB_I(ARM_R1, r_addr, 0), ctx); - _emit(cond, ARM_LDRB_I(ARM_R2, r_addr, 1), ctx); - _emit(cond, ARM_ORR_S(r_res, ARM_R2, ARM_R1, SRTYPE_LSL, 8), ctx); + int to, from; + /* No need for 1st dummy run */ + if (ctx->target == NULL) + return 0; + to = ctx->epilogue_offset; + from = ctx->idx; + + return to - from - 2; } -static inline void emit_swap16(u8 r_dst, u8 r_src, struct jit_ctx *ctx) +static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) { - /* r_dst = (r_src << 8) | (r_src >> 8) */ - emit(ARM_LSL_I(ARM_R1, r_src, 8), ctx); - emit(ARM_ORR_S(r_dst, ARM_R1, r_src, SRTYPE_LSR, 8), ctx); + const u8 *tmp = bpf2a32[TMP_REG_1]; + s32 jmp_offset; + + /* checks if divisor is zero or not. If it is, then + * exit directly. + */ + emit(ARM_CMP_I(rn, 0), ctx); + _emit(ARM_COND_EQ, ARM_MOV_I(ARM_R0, 0), ctx); + jmp_offset = epilogue_offset(ctx); + _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx); +#if __LINUX_ARM_ARCH__ == 7 + if (elf_hwcap & HWCAP_IDIVA) { + if (op == BPF_DIV) + emit(ARM_UDIV(rd, rm, rn), ctx); + else { + emit(ARM_UDIV(ARM_IP, rm, rn), ctx); + emit(ARM_MLS(rd, rn, ARM_IP, rm), ctx); + } + return; + } +#endif /* - * we need to mask out the bits set in r_dst[23:16] due to - * the first shift instruction. - * - * note that 0x8ff is the encoded immediate 0x00ff0000. + * For BPF_ALU | BPF_DIV | BPF_K instructions + * As ARM_R1 and ARM_R0 contains 1st argument of bpf + * function, we need to save it on caller side to save + * it from getting destroyed within callee. + * After the return from the callee, we restore ARM_R0 + * ARM_R1. */ - emit(ARM_BIC_I(r_dst, r_dst, 0x8ff), ctx); -} + if (rn != ARM_R1) { + emit(ARM_MOV_R(tmp[0], ARM_R1), ctx); + emit(ARM_MOV_R(ARM_R1, rn), ctx); + } + if (rm != ARM_R0) { + emit(ARM_MOV_R(tmp[1], ARM_R0), ctx); + emit(ARM_MOV_R(ARM_R0, rm), ctx); + } -#else /* ARMv6+ */ + /* Call appropriate function */ + ctx->seen |= SEEN_CALL; + emit_mov_i(ARM_IP, op == BPF_DIV ? + (u32)jit_udiv32 : (u32)jit_mod32, ctx); + emit_blx_r(ARM_IP, ctx); -static void emit_load_be32(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx) -{ - _emit(cond, ARM_LDR_I(r_res, r_addr, 0), ctx); -#ifdef __LITTLE_ENDIAN - _emit(cond, ARM_REV(r_res, r_res), ctx); -#endif + /* Save return value */ + if (rd != ARM_R0) + emit(ARM_MOV_R(rd, ARM_R0), ctx); + + /* Restore ARM_R0 and ARM_R1 */ + if (rn != ARM_R1) + emit(ARM_MOV_R(ARM_R1, tmp[0]), ctx); + if (rm != ARM_R0) + emit(ARM_MOV_R(ARM_R0, tmp[1]), ctx); } -static void emit_load_be16(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx) +/* Checks whether BPF register is on scratch stack space or not. */ +static inline bool is_on_stack(u8 bpf_reg) { - _emit(cond, ARM_LDRH_I(r_res, r_addr, 0), ctx); -#ifdef __LITTLE_ENDIAN - _emit(cond, ARM_REV16(r_res, r_res), ctx); -#endif + static u8 stack_regs[] = {BPF_REG_AX, BPF_REG_3, BPF_REG_4, BPF_REG_5, + BPF_REG_7, BPF_REG_8, BPF_REG_9, TCALL_CNT, + BPF_REG_2, BPF_REG_FP}; + int i, reg_len = sizeof(stack_regs); + + for (i = 0 ; i < reg_len ; i++) { + if (bpf_reg == stack_regs[i]) + return true; + } + return false; } -static inline void emit_swap16(u8 r_dst __maybe_unused, - u8 r_src __maybe_unused, - struct jit_ctx *ctx __maybe_unused) +static inline void emit_a32_mov_i(const u8 dst, const u32 val, + bool dstk, struct jit_ctx *ctx) { -#ifdef __LITTLE_ENDIAN - emit(ARM_REV16(r_dst, r_src), ctx); -#endif + const u8 *tmp = bpf2a32[TMP_REG_1]; + + if (dstk) { + emit_mov_i(tmp[1], val, ctx); + emit(ARM_STR_I(tmp[1], ARM_SP, STACK_VAR(dst)), ctx); + } else { + emit_mov_i(dst, val, ctx); + } } -#endif /* __LINUX_ARM_ARCH__ < 6 */ +/* Sign extended move */ +static inline void emit_a32_mov_i64(const bool is64, const u8 dst[], + const u32 val, bool dstk, + struct jit_ctx *ctx) { + u32 hi = 0; + if (is64 && (val & (1<<31))) + hi = (u32)~0; + emit_a32_mov_i(dst_lo, val, dstk, ctx); + emit_a32_mov_i(dst_hi, hi, dstk, ctx); +} -/* Compute the immediate value for a PC-relative branch. */ -static inline u32 b_imm(unsigned tgt, struct jit_ctx *ctx) -{ - u32 imm; +static inline void emit_a32_add_r(const u8 dst, const u8 src, + const bool is64, const bool hi, + struct jit_ctx *ctx) { + /* 64 bit : + * adds dst_lo, dst_lo, src_lo + * adc dst_hi, dst_hi, src_hi + * 32 bit : + * add dst_lo, dst_lo, src_lo + */ + if (!hi && is64) + emit(ARM_ADDS_R(dst, dst, src), ctx); + else if (hi && is64) + emit(ARM_ADC_R(dst, dst, src), ctx); + else + emit(ARM_ADD_R(dst, dst, src), ctx); +} - if (ctx->target == NULL) - return 0; - /* - * BPF allows only forward jumps and the offset of the target is - * still the one computed during the first pass. +static inline void emit_a32_sub_r(const u8 dst, const u8 src, + const bool is64, const bool hi, + struct jit_ctx *ctx) { + /* 64 bit : + * subs dst_lo, dst_lo, src_lo + * sbc dst_hi, dst_hi, src_hi + * 32 bit : + * sub dst_lo, dst_lo, src_lo */ - imm = ctx->offsets[tgt] + ctx->prologue_bytes - (ctx->idx * 4 + 8); + if (!hi && is64) + emit(ARM_SUBS_R(dst, dst, src), ctx); + else if (hi && is64) + emit(ARM_SBC_R(dst, dst, src), ctx); + else + emit(ARM_SUB_R(dst, dst, src), ctx); +} - return imm >> 2; +static inline void emit_alu_r(const u8 dst, const u8 src, const bool is64, + const bool hi, const u8 op, struct jit_ctx *ctx){ + switch (BPF_OP(op)) { + /* dst = dst + src */ + case BPF_ADD: + emit_a32_add_r(dst, src, is64, hi, ctx); + break; + /* dst = dst - src */ + case BPF_SUB: + emit_a32_sub_r(dst, src, is64, hi, ctx); + break; + /* dst = dst | src */ + case BPF_OR: + emit(ARM_ORR_R(dst, dst, src), ctx); + break; + /* dst = dst & src */ + case BPF_AND: + emit(ARM_AND_R(dst, dst, src), ctx); + break; + /* dst = dst ^ src */ + case BPF_XOR: + emit(ARM_EOR_R(dst, dst, src), ctx); + break; + /* dst = dst * src */ + case BPF_MUL: + emit(ARM_MUL(dst, dst, src), ctx); + break; + /* dst = dst << src */ + case BPF_LSH: + emit(ARM_LSL_R(dst, dst, src), ctx); + break; + /* dst = dst >> src */ + case BPF_RSH: + emit(ARM_LSR_R(dst, dst, src), ctx); + break; + /* dst = dst >> src (signed)*/ + case BPF_ARSH: + emit(ARM_MOV_SR(dst, dst, SRTYPE_ASR, src), ctx); + break; + } } -#define OP_IMM3(op, r1, r2, imm_val, ctx) \ - do { \ - imm12 = imm8m(imm_val); \ - if (imm12 < 0) { \ - emit_mov_i_no8m(r_scratch, imm_val, ctx); \ - emit(op ## _R((r1), (r2), r_scratch), ctx); \ - } else { \ - emit(op ## _I((r1), (r2), imm12), ctx); \ - } \ - } while (0) - -static inline void emit_err_ret(u8 cond, struct jit_ctx *ctx) -{ - if (ctx->ret0_fp_idx >= 0) { - _emit(cond, ARM_B(b_imm(ctx->ret0_fp_idx, ctx)), ctx); - /* NOP to keep the size constant between passes */ - emit(ARM_MOV_R(ARM_R0, ARM_R0), ctx); +/* ALU operation (32 bit) + * dst = dst (op) src + */ +static inline void emit_a32_alu_r(const u8 dst, const u8 src, + bool dstk, bool sstk, + struct jit_ctx *ctx, const bool is64, + const bool hi, const u8 op) { + const u8 *tmp = bpf2a32[TMP_REG_1]; + u8 rn = sstk ? tmp[1] : src; + + if (sstk) + emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src)), ctx); + + /* ALU operation */ + if (dstk) { + emit(ARM_LDR_I(tmp[0], ARM_SP, STACK_VAR(dst)), ctx); + emit_alu_r(tmp[0], rn, is64, hi, op, ctx); + emit(ARM_STR_I(tmp[0], ARM_SP, STACK_VAR(dst)), ctx); } else { - _emit(cond, ARM_MOV_I(ARM_R0, 0), ctx); - _emit(cond, ARM_B(b_imm(ctx->skf->len, ctx)), ctx); + emit_alu_r(dst, rn, is64, hi, op, ctx); } } -static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx) -{ -#if __LINUX_ARM_ARCH__ < 5 - emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx); +/* ALU operation (64 bit) */ +static inline void emit_a32_alu_r64(const bool is64, const u8 dst[], + const u8 src[], bool dstk, + bool sstk, struct jit_ctx *ctx, + const u8 op) { + emit_a32_alu_r(dst_lo, src_lo, dstk, sstk, ctx, is64, false, op); + if (is64) + emit_a32_alu_r(dst_hi, src_hi, dstk, sstk, ctx, is64, true, op); + else + emit_a32_mov_i(dst_hi, 0, dstk, ctx); +} - if (elf_hwcap & HWCAP_THUMB) - emit(ARM_BX(tgt_reg), ctx); +/* dst = imm (4 bytes)*/ +static inline void emit_a32_mov_r(const u8 dst, const u8 src, + bool dstk, bool sstk, + struct jit_ctx *ctx) { + const u8 *tmp = bpf2a32[TMP_REG_1]; + u8 rt = sstk ? tmp[0] : src; + + if (sstk) + emit(ARM_LDR_I(tmp[0], ARM_SP, STACK_VAR(src)), ctx); + if (dstk) + emit(ARM_STR_I(rt, ARM_SP, STACK_VAR(dst)), ctx); else - emit(ARM_MOV_R(ARM_PC, tgt_reg), ctx); -#else - emit(ARM_BLX_R(tgt_reg), ctx); -#endif + emit(ARM_MOV_R(dst, rt), ctx); } -static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, - int bpf_op) -{ -#if __LINUX_ARM_ARCH__ == 7 - if (elf_hwcap & HWCAP_IDIVA) { - if (bpf_op == BPF_DIV) - emit(ARM_UDIV(rd, rm, rn), ctx); - else { - emit(ARM_UDIV(ARM_R3, rm, rn), ctx); - emit(ARM_MLS(rd, rn, ARM_R3, rm), ctx); - } - return; +/* dst = src */ +static inline void emit_a32_mov_r64(const bool is64, const u8 dst[], + const u8 src[], bool dstk, + bool sstk, struct jit_ctx *ctx) { + emit_a32_mov_r(dst_lo, src_lo, dstk, sstk, ctx); + if (is64) { + /* complete 8 byte move */ + emit_a32_mov_r(dst_hi, src_hi, dstk, sstk, ctx); + } else { + /* Zero out high 4 bytes */ + emit_a32_mov_i(dst_hi, 0, dstk, ctx); } -#endif +} - /* - * For BPF_ALU | BPF_DIV | BPF_K instructions, rm is ARM_R4 - * (r_A) and rn is ARM_R0 (r_scratch) so load rn first into - * ARM_R1 to avoid accidentally overwriting ARM_R0 with rm - * before using it as a source for ARM_R1. - * - * For BPF_ALU | BPF_DIV | BPF_X rm is ARM_R4 (r_A) and rn is - * ARM_R5 (r_X) so there is no particular register overlap - * issues. - */ - if (rn != ARM_R1) - emit(ARM_MOV_R(ARM_R1, rn), ctx); - if (rm != ARM_R0) - emit(ARM_MOV_R(ARM_R0, rm), ctx); +/* Shift operations */ +static inline void emit_a32_alu_i(const u8 dst, const u32 val, bool dstk, + struct jit_ctx *ctx, const u8 op) { + const u8 *tmp = bpf2a32[TMP_REG_1]; + u8 rd = dstk ? tmp[0] : dst; + + if (dstk) + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst)), ctx); + + /* Do shift operation */ + switch (op) { + case BPF_LSH: + emit(ARM_LSL_I(rd, rd, val), ctx); + break; + case BPF_RSH: + emit(ARM_LSR_I(rd, rd, val), ctx); + break; + case BPF_NEG: + emit(ARM_RSB_I(rd, rd, val), ctx); + break; + } + if (dstk) + emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst)), ctx); +} + +/* dst = ~dst (64 bit) */ +static inline void emit_a32_neg64(const u8 dst[], bool dstk, + struct jit_ctx *ctx){ + const u8 *tmp = bpf2a32[TMP_REG_1]; + u8 rd = dstk ? tmp[1] : dst[1]; + u8 rm = dstk ? tmp[0] : dst[0]; + + /* Setup Operand */ + if (dstk) { + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + + /* Do Negate Operation */ + emit(ARM_RSBS_I(rd, rd, 0), ctx); + emit(ARM_RSC_I(rm, rm, 0), ctx); + + if (dstk) { + emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_STR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } +} + +/* dst = dst << src */ +static inline void emit_a32_lsh_r64(const u8 dst[], const u8 src[], bool dstk, + bool sstk, struct jit_ctx *ctx) { + const u8 *tmp = bpf2a32[TMP_REG_1]; + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + + /* Setup Operands */ + u8 rt = sstk ? tmp2[1] : src_lo; + u8 rd = dstk ? tmp[1] : dst_lo; + u8 rm = dstk ? tmp[0] : dst_hi; + + if (sstk) + emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(src_lo)), ctx); + if (dstk) { + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + + /* Do LSH operation */ + emit(ARM_SUB_I(ARM_IP, rt, 32), ctx); + emit(ARM_RSB_I(tmp2[0], rt, 32), ctx); + /* As we are using ARM_LR */ ctx->seen |= SEEN_CALL; - emit_mov_i(ARM_R3, bpf_op == BPF_DIV ? (u32)jit_udiv : (u32)jit_mod, - ctx); - emit_blx_r(ARM_R3, ctx); + emit(ARM_MOV_SR(ARM_LR, rm, SRTYPE_ASL, rt), ctx); + emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd, SRTYPE_ASL, ARM_IP), ctx); + emit(ARM_ORR_SR(ARM_IP, ARM_LR, rd, SRTYPE_LSR, tmp2[0]), ctx); + emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_ASL, rt), ctx); + + if (dstk) { + emit(ARM_STR_I(ARM_LR, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_STR_I(ARM_IP, ARM_SP, STACK_VAR(dst_hi)), ctx); + } else { + emit(ARM_MOV_R(rd, ARM_LR), ctx); + emit(ARM_MOV_R(rm, ARM_IP), ctx); + } +} - if (rd != ARM_R0) - emit(ARM_MOV_R(rd, ARM_R0), ctx); +/* dst = dst >> src (signed)*/ +static inline void emit_a32_arsh_r64(const u8 dst[], const u8 src[], bool dstk, + bool sstk, struct jit_ctx *ctx) { + const u8 *tmp = bpf2a32[TMP_REG_1]; + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + /* Setup Operands */ + u8 rt = sstk ? tmp2[1] : src_lo; + u8 rd = dstk ? tmp[1] : dst_lo; + u8 rm = dstk ? tmp[0] : dst_hi; + + if (sstk) + emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(src_lo)), ctx); + if (dstk) { + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + + /* Do the ARSH operation */ + emit(ARM_RSB_I(ARM_IP, rt, 32), ctx); + emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx); + /* As we are using ARM_LR */ + ctx->seen |= SEEN_CALL; + emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_LSR, rt), ctx); + emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_ASL, ARM_IP), ctx); + _emit(ARM_COND_MI, ARM_B(0), ctx); + emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_ASR, tmp2[0]), ctx); + emit(ARM_MOV_SR(ARM_IP, rm, SRTYPE_ASR, rt), ctx); + if (dstk) { + emit(ARM_STR_I(ARM_LR, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_STR_I(ARM_IP, ARM_SP, STACK_VAR(dst_hi)), ctx); + } else { + emit(ARM_MOV_R(rd, ARM_LR), ctx); + emit(ARM_MOV_R(rm, ARM_IP), ctx); + } +} + +/* dst = dst >> src */ +static inline void emit_a32_lsr_r64(const u8 dst[], const u8 src[], bool dstk, + bool sstk, struct jit_ctx *ctx) { + const u8 *tmp = bpf2a32[TMP_REG_1]; + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + /* Setup Operands */ + u8 rt = sstk ? tmp2[1] : src_lo; + u8 rd = dstk ? tmp[1] : dst_lo; + u8 rm = dstk ? tmp[0] : dst_hi; + + if (sstk) + emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(src_lo)), ctx); + if (dstk) { + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + + /* Do LSH operation */ + emit(ARM_RSB_I(ARM_IP, rt, 32), ctx); + emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx); + /* As we are using ARM_LR */ + ctx->seen |= SEEN_CALL; + emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_LSR, rt), ctx); + emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_ASL, ARM_IP), ctx); + emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_LSR, tmp2[0]), ctx); + emit(ARM_MOV_SR(ARM_IP, rm, SRTYPE_LSR, rt), ctx); + if (dstk) { + emit(ARM_STR_I(ARM_LR, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_STR_I(ARM_IP, ARM_SP, STACK_VAR(dst_hi)), ctx); + } else { + emit(ARM_MOV_R(rd, ARM_LR), ctx); + emit(ARM_MOV_R(rm, ARM_IP), ctx); + } } -static inline void update_on_xread(struct jit_ctx *ctx) +/* dst = dst << val */ +static inline void emit_a32_lsh_i64(const u8 dst[], bool dstk, + const u32 val, struct jit_ctx *ctx){ + const u8 *tmp = bpf2a32[TMP_REG_1]; + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + /* Setup operands */ + u8 rd = dstk ? tmp[1] : dst_lo; + u8 rm = dstk ? tmp[0] : dst_hi; + + if (dstk) { + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + + /* Do LSH operation */ + if (val < 32) { + emit(ARM_MOV_SI(tmp2[0], rm, SRTYPE_ASL, val), ctx); + emit(ARM_ORR_SI(rm, tmp2[0], rd, SRTYPE_LSR, 32 - val), ctx); + emit(ARM_MOV_SI(rd, rd, SRTYPE_ASL, val), ctx); + } else { + if (val == 32) + emit(ARM_MOV_R(rm, rd), ctx); + else + emit(ARM_MOV_SI(rm, rd, SRTYPE_ASL, val - 32), ctx); + emit(ARM_EOR_R(rd, rd, rd), ctx); + } + + if (dstk) { + emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_STR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } +} + +/* dst = dst >> val */ +static inline void emit_a32_lsr_i64(const u8 dst[], bool dstk, + const u32 val, struct jit_ctx *ctx) { + const u8 *tmp = bpf2a32[TMP_REG_1]; + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + /* Setup operands */ + u8 rd = dstk ? tmp[1] : dst_lo; + u8 rm = dstk ? tmp[0] : dst_hi; + + if (dstk) { + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + + /* Do LSR operation */ + if (val < 32) { + emit(ARM_MOV_SI(tmp2[1], rd, SRTYPE_LSR, val), ctx); + emit(ARM_ORR_SI(rd, tmp2[1], rm, SRTYPE_ASL, 32 - val), ctx); + emit(ARM_MOV_SI(rm, rm, SRTYPE_LSR, val), ctx); + } else if (val == 32) { + emit(ARM_MOV_R(rd, rm), ctx); + emit(ARM_MOV_I(rm, 0), ctx); + } else { + emit(ARM_MOV_SI(rd, rm, SRTYPE_LSR, val - 32), ctx); + emit(ARM_MOV_I(rm, 0), ctx); + } + + if (dstk) { + emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_STR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } +} + +/* dst = dst >> val (signed) */ +static inline void emit_a32_arsh_i64(const u8 dst[], bool dstk, + const u32 val, struct jit_ctx *ctx){ + const u8 *tmp = bpf2a32[TMP_REG_1]; + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + /* Setup operands */ + u8 rd = dstk ? tmp[1] : dst_lo; + u8 rm = dstk ? tmp[0] : dst_hi; + + if (dstk) { + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + + /* Do ARSH operation */ + if (val < 32) { + emit(ARM_MOV_SI(tmp2[1], rd, SRTYPE_LSR, val), ctx); + emit(ARM_ORR_SI(rd, tmp2[1], rm, SRTYPE_ASL, 32 - val), ctx); + emit(ARM_MOV_SI(rm, rm, SRTYPE_ASR, val), ctx); + } else if (val == 32) { + emit(ARM_MOV_R(rd, rm), ctx); + emit(ARM_MOV_SI(rm, rm, SRTYPE_ASR, 31), ctx); + } else { + emit(ARM_MOV_SI(rd, rm, SRTYPE_ASR, val - 32), ctx); + emit(ARM_MOV_SI(rm, rm, SRTYPE_ASR, 31), ctx); + } + + if (dstk) { + emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_STR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } +} + +static inline void emit_a32_mul_r64(const u8 dst[], const u8 src[], bool dstk, + bool sstk, struct jit_ctx *ctx) { + const u8 *tmp = bpf2a32[TMP_REG_1]; + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + /* Setup operands for multiplication */ + u8 rd = dstk ? tmp[1] : dst_lo; + u8 rm = dstk ? tmp[0] : dst_hi; + u8 rt = sstk ? tmp2[1] : src_lo; + u8 rn = sstk ? tmp2[0] : src_hi; + + if (dstk) { + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + if (sstk) { + emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(src_lo)), ctx); + emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_hi)), ctx); + } + + /* Do Multiplication */ + emit(ARM_MUL(ARM_IP, rd, rn), ctx); + emit(ARM_MUL(ARM_LR, rm, rt), ctx); + /* As we are using ARM_LR */ + ctx->seen |= SEEN_CALL; + emit(ARM_ADD_R(ARM_LR, ARM_IP, ARM_LR), ctx); + + emit(ARM_UMULL(ARM_IP, rm, rd, rt), ctx); + emit(ARM_ADD_R(rm, ARM_LR, rm), ctx); + if (dstk) { + emit(ARM_STR_I(ARM_IP, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_STR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } else { + emit(ARM_MOV_R(rd, ARM_IP), ctx); + } +} + +/* *(size *)(dst + off) = src */ +static inline void emit_str_r(const u8 dst, const u8 src, bool dstk, + const s32 off, struct jit_ctx *ctx, const u8 sz){ + const u8 *tmp = bpf2a32[TMP_REG_1]; + u8 rd = dstk ? tmp[1] : dst; + + if (dstk) + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst)), ctx); + if (off) { + emit_a32_mov_i(tmp[0], off, false, ctx); + emit(ARM_ADD_R(tmp[0], rd, tmp[0]), ctx); + rd = tmp[0]; + } + switch (sz) { + case BPF_W: + /* Store a Word */ + emit(ARM_STR_I(src, rd, 0), ctx); + break; + case BPF_H: + /* Store a HalfWord */ + emit(ARM_STRH_I(src, rd, 0), ctx); + break; + case BPF_B: + /* Store a Byte */ + emit(ARM_STRB_I(src, rd, 0), ctx); + break; + } +} + +/* dst = *(size*)(src + off) */ +static inline void emit_ldx_r(const u8 dst, const u8 src, bool dstk, + const s32 off, struct jit_ctx *ctx, const u8 sz){ + const u8 *tmp = bpf2a32[TMP_REG_1]; + u8 rd = dstk ? tmp[1] : dst; + u8 rm = src; + + if (off) { + emit_a32_mov_i(tmp[0], off, false, ctx); + emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx); + rm = tmp[0]; + } + switch (sz) { + case BPF_W: + /* Load a Word */ + emit(ARM_LDR_I(rd, rm, 0), ctx); + break; + case BPF_H: + /* Load a HalfWord */ + emit(ARM_LDRH_I(rd, rm, 0), ctx); + break; + case BPF_B: + /* Load a Byte */ + emit(ARM_LDRB_I(rd, rm, 0), ctx); + break; + } + if (dstk) + emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst)), ctx); +} + +/* Arithmatic Operation */ +static inline void emit_ar_r(const u8 rd, const u8 rt, const u8 rm, + const u8 rn, struct jit_ctx *ctx, u8 op) { + switch (op) { + case BPF_JSET: + ctx->seen |= SEEN_CALL; + emit(ARM_AND_R(ARM_IP, rt, rn), ctx); + emit(ARM_AND_R(ARM_LR, rd, rm), ctx); + emit(ARM_ORRS_R(ARM_IP, ARM_LR, ARM_IP), ctx); + break; + case BPF_JEQ: + case BPF_JNE: + case BPF_JGT: + case BPF_JGE: + case BPF_JLE: + case BPF_JLT: + emit(ARM_CMP_R(rd, rm), ctx); + _emit(ARM_COND_EQ, ARM_CMP_R(rt, rn), ctx); + break; + case BPF_JSLE: + case BPF_JSGT: + emit(ARM_CMP_R(rn, rt), ctx); + emit(ARM_SBCS_R(ARM_IP, rm, rd), ctx); + break; + case BPF_JSLT: + case BPF_JSGE: + emit(ARM_CMP_R(rt, rn), ctx); + emit(ARM_SBCS_R(ARM_IP, rd, rm), ctx); + break; + } +} + +static int out_offset = -1; /* initialized on the first pass of build_body() */ +static int emit_bpf_tail_call(struct jit_ctx *ctx) { - if (!(ctx->seen & SEEN_X)) - ctx->flags |= FLAG_NEED_X_RESET; - ctx->seen |= SEEN_X; + /* bpf_tail_call(void *prog_ctx, struct bpf_array *array, u64 index) */ + const u8 *r2 = bpf2a32[BPF_REG_2]; + const u8 *r3 = bpf2a32[BPF_REG_3]; + const u8 *tmp = bpf2a32[TMP_REG_1]; + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + const u8 *tcc = bpf2a32[TCALL_CNT]; + const int idx0 = ctx->idx; +#define cur_offset (ctx->idx - idx0) +#define jmp_offset (out_offset - (cur_offset)) + u32 off, lo, hi; + + /* if (index >= array->map.max_entries) + * goto out; + */ + off = offsetof(struct bpf_array, map.max_entries); + /* array->map.max_entries */ + emit_a32_mov_i(tmp[1], off, false, ctx); + emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r2[1])), ctx); + emit(ARM_LDR_R(tmp[1], tmp2[1], tmp[1]), ctx); + /* index (64 bit) */ + emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r3[1])), ctx); + /* index >= array->map.max_entries */ + emit(ARM_CMP_R(tmp2[1], tmp[1]), ctx); + _emit(ARM_COND_CS, ARM_B(jmp_offset), ctx); + + /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; + * tail_call_cnt++; + */ + lo = (u32)MAX_TAIL_CALL_CNT; + hi = (u32)((u64)MAX_TAIL_CALL_CNT >> 32); + emit(ARM_LDR_I(tmp[1], ARM_SP, STACK_VAR(tcc[1])), ctx); + emit(ARM_LDR_I(tmp[0], ARM_SP, STACK_VAR(tcc[0])), ctx); + emit(ARM_CMP_I(tmp[0], hi), ctx); + _emit(ARM_COND_EQ, ARM_CMP_I(tmp[1], lo), ctx); + _emit(ARM_COND_HI, ARM_B(jmp_offset), ctx); + emit(ARM_ADDS_I(tmp[1], tmp[1], 1), ctx); + emit(ARM_ADC_I(tmp[0], tmp[0], 0), ctx); + emit(ARM_STR_I(tmp[1], ARM_SP, STACK_VAR(tcc[1])), ctx); + emit(ARM_STR_I(tmp[0], ARM_SP, STACK_VAR(tcc[0])), ctx); + + /* prog = array->ptrs[index] + * if (prog == NULL) + * goto out; + */ + off = offsetof(struct bpf_array, ptrs); + emit_a32_mov_i(tmp[1], off, false, ctx); + emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r2[1])), ctx); + emit(ARM_ADD_R(tmp[1], tmp2[1], tmp[1]), ctx); + emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r3[1])), ctx); + emit(ARM_MOV_SI(tmp[0], tmp2[1], SRTYPE_ASL, 2), ctx); + emit(ARM_LDR_R(tmp[1], tmp[1], tmp[0]), ctx); + emit(ARM_CMP_I(tmp[1], 0), ctx); + _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx); + + /* goto *(prog->bpf_func + prologue_size); */ + off = offsetof(struct bpf_prog, bpf_func); + emit_a32_mov_i(tmp2[1], off, false, ctx); + emit(ARM_LDR_R(tmp[1], tmp[1], tmp2[1]), ctx); + emit(ARM_ADD_I(tmp[1], tmp[1], ctx->prologue_bytes), ctx); + emit(ARM_BX(tmp[1]), ctx); + + /* out: */ + if (out_offset == -1) + out_offset = cur_offset; + if (cur_offset != out_offset) { + pr_err_once("tail_call out_offset = %d, expected %d!\n", + cur_offset, out_offset); + return -1; + } + return 0; +#undef cur_offset +#undef jmp_offset } -static int build_body(struct jit_ctx *ctx) +/* 0xabcd => 0xcdab */ +static inline void emit_rev16(const u8 rd, const u8 rn, struct jit_ctx *ctx) { - void *load_func[] = {jit_get_skb_b, jit_get_skb_h, jit_get_skb_w}; - const struct bpf_prog *prog = ctx->skf; - const struct sock_filter *inst; - unsigned i, load_order, off, condt; - int imm12; - u32 k; +#if __LINUX_ARM_ARCH__ < 6 + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + + emit(ARM_AND_I(tmp2[1], rn, 0xff), ctx); + emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 8), ctx); + emit(ARM_AND_I(tmp2[0], tmp2[0], 0xff), ctx); + emit(ARM_ORR_SI(rd, tmp2[0], tmp2[1], SRTYPE_LSL, 8), ctx); +#else /* ARMv6+ */ + emit(ARM_REV16(rd, rn), ctx); +#endif +} - for (i = 0; i < prog->len; i++) { - u16 code; +/* 0xabcdefgh => 0xghefcdab */ +static inline void emit_rev32(const u8 rd, const u8 rn, struct jit_ctx *ctx) +{ +#if __LINUX_ARM_ARCH__ < 6 + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + + emit(ARM_AND_I(tmp2[1], rn, 0xff), ctx); + emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 24), ctx); + emit(ARM_ORR_SI(ARM_IP, tmp2[0], tmp2[1], SRTYPE_LSL, 24), ctx); + + emit(ARM_MOV_SI(tmp2[1], rn, SRTYPE_LSR, 8), ctx); + emit(ARM_AND_I(tmp2[1], tmp2[1], 0xff), ctx); + emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 16), ctx); + emit(ARM_AND_I(tmp2[0], tmp2[0], 0xff), ctx); + emit(ARM_MOV_SI(tmp2[0], tmp2[0], SRTYPE_LSL, 8), ctx); + emit(ARM_ORR_SI(tmp2[0], tmp2[0], tmp2[1], SRTYPE_LSL, 16), ctx); + emit(ARM_ORR_R(rd, ARM_IP, tmp2[0]), ctx); + +#else /* ARMv6+ */ + emit(ARM_REV(rd, rn), ctx); +#endif +} - inst = &(prog->insns[i]); - /* K as an immediate value operand */ - k = inst->k; - code = bpf_anc_helper(inst); +// push the scratch stack register on top of the stack +static inline void emit_push_r64(const u8 src[], const u8 shift, + struct jit_ctx *ctx) +{ + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + u16 reg_set = 0; - /* compute offsets only in the fake pass */ - if (ctx->target == NULL) - ctx->offsets[i] = ctx->idx * 4; + emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(src[1]+shift)), ctx); + emit(ARM_LDR_I(tmp2[0], ARM_SP, STACK_VAR(src[0]+shift)), ctx); + + reg_set = (1 << tmp2[1]) | (1 << tmp2[0]); + emit(ARM_PUSH(reg_set), ctx); +} + +static void build_prologue(struct jit_ctx *ctx) +{ + const u8 r0 = bpf2a32[BPF_REG_0][1]; + const u8 r2 = bpf2a32[BPF_REG_1][1]; + const u8 r3 = bpf2a32[BPF_REG_1][0]; + const u8 r4 = bpf2a32[BPF_REG_6][1]; + const u8 r5 = bpf2a32[BPF_REG_6][0]; + const u8 r6 = bpf2a32[TMP_REG_1][1]; + const u8 r7 = bpf2a32[TMP_REG_1][0]; + const u8 r8 = bpf2a32[TMP_REG_2][1]; + const u8 r10 = bpf2a32[TMP_REG_2][0]; + const u8 fplo = bpf2a32[BPF_REG_FP][1]; + const u8 fphi = bpf2a32[BPF_REG_FP][0]; + const u8 sp = ARM_SP; + const u8 *tcc = bpf2a32[TCALL_CNT]; + + u16 reg_set = 0; + + /* + * eBPF prog stack layout + * + * high + * original ARM_SP => +-----+ eBPF prologue + * |FP/LR| + * current ARM_FP => +-----+ + * | ... | callee saved registers + * eBPF fp register => +-----+ <= (BPF_FP) + * | ... | eBPF JIT scratch space + * | | eBPF prog stack + * +-----+ + * |RSVD | JIT scratchpad + * current A64_SP => +-----+ <= (BPF_FP - STACK_SIZE) + * | | + * | ... | Function call stack + * | | + * +-----+ + * low + */ + + /* Save callee saved registers. */ + reg_set |= (1<<r4) | (1<<r5) | (1<<r6) | (1<<r7) | (1<<r8) | (1<<r10); +#ifdef CONFIG_FRAME_POINTER + reg_set |= (1<<ARM_FP) | (1<<ARM_IP) | (1<<ARM_LR) | (1<<ARM_PC); + emit(ARM_MOV_R(ARM_IP, sp), ctx); + emit(ARM_PUSH(reg_set), ctx); + emit(ARM_SUB_I(ARM_FP, ARM_IP, 4), ctx); +#else + /* Check if call instruction exists in BPF body */ + if (ctx->seen & SEEN_CALL) + reg_set |= (1<<ARM_LR); + emit(ARM_PUSH(reg_set), ctx); +#endif + /* Save frame pointer for later */ + emit(ARM_SUB_I(ARM_IP, sp, SCRATCH_SIZE), ctx); + + ctx->stack_size = imm8m(STACK_SIZE); + + /* Set up function call stack */ + emit(ARM_SUB_I(ARM_SP, ARM_SP, ctx->stack_size), ctx); - switch (code) { - case BPF_LD | BPF_IMM: - emit_mov_i(r_A, k, ctx); + /* Set up BPF prog stack base register */ + emit_a32_mov_r(fplo, ARM_IP, true, false, ctx); + emit_a32_mov_i(fphi, 0, true, ctx); + + /* mov r4, 0 */ + emit(ARM_MOV_I(r4, 0), ctx); + + /* Move BPF_CTX to BPF_R1 */ + emit(ARM_MOV_R(r3, r4), ctx); + emit(ARM_MOV_R(r2, r0), ctx); + /* Initialize Tail Count */ + emit(ARM_STR_I(r4, ARM_SP, STACK_VAR(tcc[0])), ctx); + emit(ARM_STR_I(r4, ARM_SP, STACK_VAR(tcc[1])), ctx); + /* end of prologue */ +} + +static void build_epilogue(struct jit_ctx *ctx) +{ + const u8 r4 = bpf2a32[BPF_REG_6][1]; + const u8 r5 = bpf2a32[BPF_REG_6][0]; + const u8 r6 = bpf2a32[TMP_REG_1][1]; + const u8 r7 = bpf2a32[TMP_REG_1][0]; + const u8 r8 = bpf2a32[TMP_REG_2][1]; + const u8 r10 = bpf2a32[TMP_REG_2][0]; + u16 reg_set = 0; + + /* unwind function call stack */ + emit(ARM_ADD_I(ARM_SP, ARM_SP, ctx->stack_size), ctx); + + /* restore callee saved registers. */ + reg_set |= (1<<r4) | (1<<r5) | (1<<r6) | (1<<r7) | (1<<r8) | (1<<r10); +#ifdef CONFIG_FRAME_POINTER + /* the first instruction of the prologue was: mov ip, sp */ + reg_set |= (1<<ARM_FP) | (1<<ARM_SP) | (1<<ARM_PC); + emit(ARM_LDM(ARM_SP, reg_set), ctx); +#else + if (ctx->seen & SEEN_CALL) + reg_set |= (1<<ARM_PC); + /* Restore callee saved registers. */ + emit(ARM_POP(reg_set), ctx); + /* Return back to the callee function */ + if (!(ctx->seen & SEEN_CALL)) + emit(ARM_BX(ARM_LR), ctx); +#endif +} + +/* + * Convert an eBPF instruction to native instruction, i.e + * JITs an eBPF instruction. + * Returns : + * 0 - Successfully JITed an 8-byte eBPF instruction + * >0 - Successfully JITed a 16-byte eBPF instruction + * <0 - Failed to JIT. + */ +static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) +{ + const u8 code = insn->code; + const u8 *dst = bpf2a32[insn->dst_reg]; + const u8 *src = bpf2a32[insn->src_reg]; + const u8 *tmp = bpf2a32[TMP_REG_1]; + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + const s16 off = insn->off; + const s32 imm = insn->imm; + const int i = insn - ctx->prog->insnsi; + const bool is64 = BPF_CLASS(code) == BPF_ALU64; + const bool dstk = is_on_stack(insn->dst_reg); + const bool sstk = is_on_stack(insn->src_reg); + u8 rd, rt, rm, rn; + s32 jmp_offset; + +#define check_imm(bits, imm) do { \ + if ((((imm) > 0) && ((imm) >> (bits))) || \ + (((imm) < 0) && (~(imm) >> (bits)))) { \ + pr_info("[%2d] imm=%d(0x%x) out of range\n", \ + i, imm, imm); \ + return -EINVAL; \ + } \ +} while (0) +#define check_imm24(imm) check_imm(24, imm) + + switch (code) { + /* ALU operations */ + + /* dst = src */ + case BPF_ALU | BPF_MOV | BPF_K: + case BPF_ALU | BPF_MOV | BPF_X: + case BPF_ALU64 | BPF_MOV | BPF_K: + case BPF_ALU64 | BPF_MOV | BPF_X: + switch (BPF_SRC(code)) { + case BPF_X: + emit_a32_mov_r64(is64, dst, src, dstk, sstk, ctx); break; - case BPF_LD | BPF_W | BPF_LEN: - ctx->seen |= SEEN_SKB; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); - emit(ARM_LDR_I(r_A, r_skb, - offsetof(struct sk_buff, len)), ctx); + case BPF_K: + /* Sign-extend immediate value to destination reg */ + emit_a32_mov_i64(is64, dst, imm, dstk, ctx); break; - case BPF_LD | BPF_MEM: - /* A = scratch[k] */ - ctx->seen |= SEEN_MEM_WORD(k); - emit(ARM_LDR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx); + } + break; + /* dst = dst + src/imm */ + /* dst = dst - src/imm */ + /* dst = dst | src/imm */ + /* dst = dst & src/imm */ + /* dst = dst ^ src/imm */ + /* dst = dst * src/imm */ + /* dst = dst << src */ + /* dst = dst >> src */ + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_ADD | BPF_X: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_SUB | BPF_X: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_OR | BPF_X: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_AND | BPF_X: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_XOR | BPF_X: + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_MUL | BPF_X: + case BPF_ALU | BPF_LSH | BPF_X: + case BPF_ALU | BPF_RSH | BPF_X: + case BPF_ALU | BPF_ARSH | BPF_K: + case BPF_ALU | BPF_ARSH | BPF_X: + case BPF_ALU64 | BPF_ADD | BPF_K: + case BPF_ALU64 | BPF_ADD | BPF_X: + case BPF_ALU64 | BPF_SUB | BPF_K: + case BPF_ALU64 | BPF_SUB | BPF_X: + case BPF_ALU64 | BPF_OR | BPF_K: + case BPF_ALU64 | BPF_OR | BPF_X: + case BPF_ALU64 | BPF_AND | BPF_K: + case BPF_ALU64 | BPF_AND | BPF_X: + case BPF_ALU64 | BPF_XOR | BPF_K: + case BPF_ALU64 | BPF_XOR | BPF_X: + switch (BPF_SRC(code)) { + case BPF_X: + emit_a32_alu_r64(is64, dst, src, dstk, sstk, + ctx, BPF_OP(code)); break; - case BPF_LD | BPF_W | BPF_ABS: - load_order = 2; - goto load; - case BPF_LD | BPF_H | BPF_ABS: - load_order = 1; - goto load; - case BPF_LD | BPF_B | BPF_ABS: - load_order = 0; -load: - emit_mov_i(r_off, k, ctx); -load_common: - ctx->seen |= SEEN_DATA | SEEN_CALL; - - if (load_order > 0) { - emit(ARM_SUB_I(r_scratch, r_skb_hl, - 1 << load_order), ctx); - emit(ARM_CMP_R(r_scratch, r_off), ctx); - condt = ARM_COND_GE; - } else { - emit(ARM_CMP_R(r_skb_hl, r_off), ctx); - condt = ARM_COND_HI; - } - - /* - * test for negative offset, only if we are - * currently scheduled to take the fast - * path. this will update the flags so that - * the slowpath instruction are ignored if the - * offset is negative. - * - * for loard_order == 0 the HI condition will - * make loads at offset 0 take the slow path too. + case BPF_K: + /* Move immediate value to the temporary register + * and then do the ALU operation on the temporary + * register as this will sign-extend the immediate + * value into temporary reg and then it would be + * safe to do the operation on it. */ - _emit(condt, ARM_CMP_I(r_off, 0), ctx); - - _emit(condt, ARM_ADD_R(r_scratch, r_off, r_skb_data), - ctx); - - if (load_order == 0) - _emit(condt, ARM_LDRB_I(r_A, r_scratch, 0), - ctx); - else if (load_order == 1) - emit_load_be16(condt, r_A, r_scratch, ctx); - else if (load_order == 2) - emit_load_be32(condt, r_A, r_scratch, ctx); - - _emit(condt, ARM_B(b_imm(i + 1, ctx)), ctx); - - /* the slowpath */ - emit_mov_i(ARM_R3, (u32)load_func[load_order], ctx); - emit(ARM_MOV_R(ARM_R0, r_skb), ctx); - /* the offset is already in R1 */ - emit_blx_r(ARM_R3, ctx); - /* check the result of skb_copy_bits */ - emit(ARM_CMP_I(ARM_R1, 0), ctx); - emit_err_ret(ARM_COND_NE, ctx); - emit(ARM_MOV_R(r_A, ARM_R0), ctx); + emit_a32_mov_i64(is64, tmp2, imm, false, ctx); + emit_a32_alu_r64(is64, dst, tmp2, dstk, false, + ctx, BPF_OP(code)); break; - case BPF_LD | BPF_W | BPF_IND: - load_order = 2; - goto load_ind; - case BPF_LD | BPF_H | BPF_IND: - load_order = 1; - goto load_ind; - case BPF_LD | BPF_B | BPF_IND: - load_order = 0; -load_ind: - update_on_xread(ctx); - OP_IMM3(ARM_ADD, r_off, r_X, k, ctx); - goto load_common; - case BPF_LDX | BPF_IMM: - ctx->seen |= SEEN_X; - emit_mov_i(r_X, k, ctx); + } + break; + /* dst = dst / src(imm) */ + /* dst = dst % src(imm) */ + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_MOD | BPF_K: + case BPF_ALU | BPF_MOD | BPF_X: + rt = src_lo; + rd = dstk ? tmp2[1] : dst_lo; + if (dstk) + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + switch (BPF_SRC(code)) { + case BPF_X: + rt = sstk ? tmp2[0] : rt; + if (sstk) + emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(src_lo)), + ctx); break; - case BPF_LDX | BPF_W | BPF_LEN: - ctx->seen |= SEEN_X | SEEN_SKB; - emit(ARM_LDR_I(r_X, r_skb, - offsetof(struct sk_buff, len)), ctx); + case BPF_K: + rt = tmp2[0]; + emit_a32_mov_i(rt, imm, false, ctx); break; - case BPF_LDX | BPF_MEM: - ctx->seen |= SEEN_X | SEEN_MEM_WORD(k); - emit(ARM_LDR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx); + } + emit_udivmod(rd, rd, rt, ctx, BPF_OP(code)); + if (dstk) + emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit_a32_mov_i(dst_hi, 0, dstk, ctx); + break; + case BPF_ALU64 | BPF_DIV | BPF_K: + case BPF_ALU64 | BPF_DIV | BPF_X: + case BPF_ALU64 | BPF_MOD | BPF_K: + case BPF_ALU64 | BPF_MOD | BPF_X: + goto notyet; + /* dst = dst >> imm */ + /* dst = dst << imm */ + case BPF_ALU | BPF_RSH | BPF_K: + case BPF_ALU | BPF_LSH | BPF_K: + if (unlikely(imm > 31)) + return -EINVAL; + if (imm) + emit_a32_alu_i(dst_lo, imm, dstk, ctx, BPF_OP(code)); + emit_a32_mov_i(dst_hi, 0, dstk, ctx); + break; + /* dst = dst << imm */ + case BPF_ALU64 | BPF_LSH | BPF_K: + if (unlikely(imm > 63)) + return -EINVAL; + emit_a32_lsh_i64(dst, dstk, imm, ctx); + break; + /* dst = dst >> imm */ + case BPF_ALU64 | BPF_RSH | BPF_K: + if (unlikely(imm > 63)) + return -EINVAL; + emit_a32_lsr_i64(dst, dstk, imm, ctx); + break; + /* dst = dst << src */ + case BPF_ALU64 | BPF_LSH | BPF_X: + emit_a32_lsh_r64(dst, src, dstk, sstk, ctx); + break; + /* dst = dst >> src */ + case BPF_ALU64 | BPF_RSH | BPF_X: + emit_a32_lsr_r64(dst, src, dstk, sstk, ctx); + break; + /* dst = dst >> src (signed) */ + case BPF_ALU64 | BPF_ARSH | BPF_X: + emit_a32_arsh_r64(dst, src, dstk, sstk, ctx); + break; + /* dst = dst >> imm (signed) */ + case BPF_ALU64 | BPF_ARSH | BPF_K: + if (unlikely(imm > 63)) + return -EINVAL; + emit_a32_arsh_i64(dst, dstk, imm, ctx); + break; + /* dst = ~dst */ + case BPF_ALU | BPF_NEG: + emit_a32_alu_i(dst_lo, 0, dstk, ctx, BPF_OP(code)); + emit_a32_mov_i(dst_hi, 0, dstk, ctx); + break; + /* dst = ~dst (64 bit) */ + case BPF_ALU64 | BPF_NEG: + emit_a32_neg64(dst, dstk, ctx); + break; + /* dst = dst * src/imm */ + case BPF_ALU64 | BPF_MUL | BPF_X: + case BPF_ALU64 | BPF_MUL | BPF_K: + switch (BPF_SRC(code)) { + case BPF_X: + emit_a32_mul_r64(dst, src, dstk, sstk, ctx); break; - case BPF_LDX | BPF_B | BPF_MSH: - /* x = ((*(frame + k)) & 0xf) << 2; */ - ctx->seen |= SEEN_X | SEEN_DATA | SEEN_CALL; - /* the interpreter should deal with the negative K */ - if ((int)k < 0) - return -1; - /* offset in r1: we might have to take the slow path */ - emit_mov_i(r_off, k, ctx); - emit(ARM_CMP_R(r_skb_hl, r_off), ctx); - - /* load in r0: common with the slowpath */ - _emit(ARM_COND_HI, ARM_LDRB_R(ARM_R0, r_skb_data, - ARM_R1), ctx); - /* - * emit_mov_i() might generate one or two instructions, - * the same holds for emit_blx_r() + case BPF_K: + /* Move immediate value to the temporary register + * and then do the multiplication on it as this + * will sign-extend the immediate value into temp + * reg then it would be safe to do the operation + * on it. */ - _emit(ARM_COND_HI, ARM_B(b_imm(i + 1, ctx) - 2), ctx); - - emit(ARM_MOV_R(ARM_R0, r_skb), ctx); - /* r_off is r1 */ - emit_mov_i(ARM_R3, (u32)jit_get_skb_b, ctx); - emit_blx_r(ARM_R3, ctx); - /* check the return value of skb_copy_bits */ - emit(ARM_CMP_I(ARM_R1, 0), ctx); - emit_err_ret(ARM_COND_NE, ctx); - - emit(ARM_AND_I(r_X, ARM_R0, 0x00f), ctx); - emit(ARM_LSL_I(r_X, r_X, 2), ctx); - break; - case BPF_ST: - ctx->seen |= SEEN_MEM_WORD(k); - emit(ARM_STR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx); - break; - case BPF_STX: - update_on_xread(ctx); - ctx->seen |= SEEN_MEM_WORD(k); - emit(ARM_STR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx); - break; - case BPF_ALU | BPF_ADD | BPF_K: - /* A += K */ - OP_IMM3(ARM_ADD, r_A, r_A, k, ctx); - break; - case BPF_ALU | BPF_ADD | BPF_X: - update_on_xread(ctx); - emit(ARM_ADD_R(r_A, r_A, r_X), ctx); - break; - case BPF_ALU | BPF_SUB | BPF_K: - /* A -= K */ - OP_IMM3(ARM_SUB, r_A, r_A, k, ctx); - break; - case BPF_ALU | BPF_SUB | BPF_X: - update_on_xread(ctx); - emit(ARM_SUB_R(r_A, r_A, r_X), ctx); - break; - case BPF_ALU | BPF_MUL | BPF_K: - /* A *= K */ - emit_mov_i(r_scratch, k, ctx); - emit(ARM_MUL(r_A, r_A, r_scratch), ctx); - break; - case BPF_ALU | BPF_MUL | BPF_X: - update_on_xread(ctx); - emit(ARM_MUL(r_A, r_A, r_X), ctx); + emit_a32_mov_i64(is64, tmp2, imm, false, ctx); + emit_a32_mul_r64(dst, tmp2, dstk, false, ctx); break; - case BPF_ALU | BPF_DIV | BPF_K: - if (k == 1) - break; - emit_mov_i(r_scratch, k, ctx); - emit_udivmod(r_A, r_A, r_scratch, ctx, BPF_DIV); - break; - case BPF_ALU | BPF_DIV | BPF_X: - update_on_xread(ctx); - emit(ARM_CMP_I(r_X, 0), ctx); - emit_err_ret(ARM_COND_EQ, ctx); - emit_udivmod(r_A, r_A, r_X, ctx, BPF_DIV); + } + break; + /* dst = htole(dst) */ + /* dst = htobe(dst) */ + case BPF_ALU | BPF_END | BPF_FROM_LE: + case BPF_ALU | BPF_END | BPF_FROM_BE: + rd = dstk ? tmp[0] : dst_hi; + rt = dstk ? tmp[1] : dst_lo; + if (dstk) { + emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + if (BPF_SRC(code) == BPF_FROM_LE) + goto emit_bswap_uxt; + switch (imm) { + case 16: + emit_rev16(rt, rt, ctx); + goto emit_bswap_uxt; + case 32: + emit_rev32(rt, rt, ctx); + goto emit_bswap_uxt; + case 64: + /* Because of the usage of ARM_LR */ + ctx->seen |= SEEN_CALL; + emit_rev32(ARM_LR, rt, ctx); + emit_rev32(rt, rd, ctx); + emit(ARM_MOV_R(rd, ARM_LR), ctx); break; - case BPF_ALU | BPF_MOD | BPF_K: - if (k == 1) { - emit_mov_i(r_A, 0, ctx); - break; - } - emit_mov_i(r_scratch, k, ctx); - emit_udivmod(r_A, r_A, r_scratch, ctx, BPF_MOD); + } + goto exit; +emit_bswap_uxt: + switch (imm) { + case 16: + /* zero-extend 16 bits into 64 bits */ +#if __LINUX_ARM_ARCH__ < 6 + emit_a32_mov_i(tmp2[1], 0xffff, false, ctx); + emit(ARM_AND_R(rt, rt, tmp2[1]), ctx); +#else /* ARMv6+ */ + emit(ARM_UXTH(rt, rt), ctx); +#endif + emit(ARM_EOR_R(rd, rd, rd), ctx); break; - case BPF_ALU | BPF_MOD | BPF_X: - update_on_xread(ctx); - emit(ARM_CMP_I(r_X, 0), ctx); - emit_err_ret(ARM_COND_EQ, ctx); - emit_udivmod(r_A, r_A, r_X, ctx, BPF_MOD); + case 32: + /* zero-extend 32 bits into 64 bits */ + emit(ARM_EOR_R(rd, rd, rd), ctx); break; - case BPF_ALU | BPF_OR | BPF_K: - /* A |= K */ - OP_IMM3(ARM_ORR, r_A, r_A, k, ctx); + case 64: + /* nop */ break; - case BPF_ALU | BPF_OR | BPF_X: - update_on_xread(ctx); - emit(ARM_ORR_R(r_A, r_A, r_X), ctx); + } +exit: + if (dstk) { + emit(ARM_STR_I(rt, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + break; + /* dst = imm64 */ + case BPF_LD | BPF_IMM | BPF_DW: + { + const struct bpf_insn insn1 = insn[1]; + u32 hi, lo = imm; + + hi = insn1.imm; + emit_a32_mov_i(dst_lo, lo, dstk, ctx); + emit_a32_mov_i(dst_hi, hi, dstk, ctx); + + return 1; + } + /* LDX: dst = *(size *)(src + off) */ + case BPF_LDX | BPF_MEM | BPF_W: + case BPF_LDX | BPF_MEM | BPF_H: + case BPF_LDX | BPF_MEM | BPF_B: + case BPF_LDX | BPF_MEM | BPF_DW: + rn = sstk ? tmp2[1] : src_lo; + if (sstk) + emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_lo)), ctx); + switch (BPF_SIZE(code)) { + case BPF_W: + /* Load a Word */ + case BPF_H: + /* Load a Half-Word */ + case BPF_B: + /* Load a Byte */ + emit_ldx_r(dst_lo, rn, dstk, off, ctx, BPF_SIZE(code)); + emit_a32_mov_i(dst_hi, 0, dstk, ctx); break; - case BPF_ALU | BPF_XOR | BPF_K: - /* A ^= K; */ - OP_IMM3(ARM_EOR, r_A, r_A, k, ctx); + case BPF_DW: + /* Load a double word */ + emit_ldx_r(dst_lo, rn, dstk, off, ctx, BPF_W); + emit_ldx_r(dst_hi, rn, dstk, off+4, ctx, BPF_W); break; - case BPF_ANC | SKF_AD_ALU_XOR_X: - case BPF_ALU | BPF_XOR | BPF_X: - /* A ^= X */ - update_on_xread(ctx); - emit(ARM_EOR_R(r_A, r_A, r_X), ctx); + } + break; + /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + imm)) */ + case BPF_LD | BPF_ABS | BPF_W: + case BPF_LD | BPF_ABS | BPF_H: + case BPF_LD | BPF_ABS | BPF_B: + /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + src + imm)) */ + case BPF_LD | BPF_IND | BPF_W: + case BPF_LD | BPF_IND | BPF_H: + case BPF_LD | BPF_IND | BPF_B: + { + const u8 r4 = bpf2a32[BPF_REG_6][1]; /* r4 = ptr to sk_buff */ + const u8 r0 = bpf2a32[BPF_REG_0][1]; /*r0: struct sk_buff *skb*/ + /* rtn value */ + const u8 r1 = bpf2a32[BPF_REG_0][0]; /* r1: int k */ + const u8 r2 = bpf2a32[BPF_REG_1][1]; /* r2: unsigned int size */ + const u8 r3 = bpf2a32[BPF_REG_1][0]; /* r3: void *buffer */ + const u8 r6 = bpf2a32[TMP_REG_1][1]; /* r6: void *(*func)(..) */ + int size; + + /* Setting up first argument */ + emit(ARM_MOV_R(r0, r4), ctx); + + /* Setting up second argument */ + emit_a32_mov_i(r1, imm, false, ctx); + if (BPF_MODE(code) == BPF_IND) + emit_a32_alu_r(r1, src_lo, false, sstk, ctx, + false, false, BPF_ADD); + + /* Setting up third argument */ + switch (BPF_SIZE(code)) { + case BPF_W: + size = 4; break; - case BPF_ALU | BPF_AND | BPF_K: - /* A &= K */ - OP_IMM3(ARM_AND, r_A, r_A, k, ctx); + case BPF_H: + size = 2; break; - case BPF_ALU | BPF_AND | BPF_X: - update_on_xread(ctx); - emit(ARM_AND_R(r_A, r_A, r_X), ctx); + case BPF_B: + size = 1; break; - case BPF_ALU | BPF_LSH | BPF_K: - if (unlikely(k > 31)) - return -1; - emit(ARM_LSL_I(r_A, r_A, k), ctx); + default: + return -EINVAL; + } + emit_a32_mov_i(r2, size, false, ctx); + + /* Setting up fourth argument */ + emit(ARM_ADD_I(r3, ARM_SP, imm8m(SKB_BUFFER)), ctx); + + /* Setting up function pointer to call */ + emit_a32_mov_i(r6, (unsigned int)bpf_load_pointer, false, ctx); + emit_blx_r(r6, ctx); + + emit(ARM_EOR_R(r1, r1, r1), ctx); + /* Check if return address is NULL or not. + * if NULL then jump to epilogue + * else continue to load the value from retn address + */ + emit(ARM_CMP_I(r0, 0), ctx); + jmp_offset = epilogue_offset(ctx); + check_imm24(jmp_offset); + _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx); + + /* Load value from the address */ + switch (BPF_SIZE(code)) { + case BPF_W: + emit(ARM_LDR_I(r0, r0, 0), ctx); + emit_rev32(r0, r0, ctx); break; - case BPF_ALU | BPF_LSH | BPF_X: - update_on_xread(ctx); - emit(ARM_LSL_R(r_A, r_A, r_X), ctx); + case BPF_H: + emit(ARM_LDRH_I(r0, r0, 0), ctx); + emit_rev16(r0, r0, ctx); break; - case BPF_ALU | BPF_RSH | BPF_K: - if (unlikely(k > 31)) - return -1; - if (k) - emit(ARM_LSR_I(r_A, r_A, k), ctx); + case BPF_B: + emit(ARM_LDRB_I(r0, r0, 0), ctx); + /* No need to reverse */ break; - case BPF_ALU | BPF_RSH | BPF_X: - update_on_xread(ctx); - emit(ARM_LSR_R(r_A, r_A, r_X), ctx); + } + break; + } + /* ST: *(size *)(dst + off) = imm */ + case BPF_ST | BPF_MEM | BPF_W: + case BPF_ST | BPF_MEM | BPF_H: + case BPF_ST | BPF_MEM | BPF_B: + case BPF_ST | BPF_MEM | BPF_DW: + switch (BPF_SIZE(code)) { + case BPF_DW: + /* Sign-extend immediate value into temp reg */ + emit_a32_mov_i64(true, tmp2, imm, false, ctx); + emit_str_r(dst_lo, tmp2[1], dstk, off, ctx, BPF_W); + emit_str_r(dst_lo, tmp2[0], dstk, off+4, ctx, BPF_W); break; - case BPF_ALU | BPF_NEG: - /* A = -A */ - emit(ARM_RSB_I(r_A, r_A, 0), ctx); + case BPF_W: + case BPF_H: + case BPF_B: + emit_a32_mov_i(tmp2[1], imm, false, ctx); + emit_str_r(dst_lo, tmp2[1], dstk, off, ctx, + BPF_SIZE(code)); break; - case BPF_JMP | BPF_JA: - /* pc += K */ - emit(ARM_B(b_imm(i + k + 1, ctx)), ctx); + } + break; + /* STX XADD: lock *(u32 *)(dst + off) += src */ + case BPF_STX | BPF_XADD | BPF_W: + /* STX XADD: lock *(u64 *)(dst + off) += src */ + case BPF_STX | BPF_XADD | BPF_DW: + goto notyet; + /* STX: *(size *)(dst + off) = src */ + case BPF_STX | BPF_MEM | BPF_W: + case BPF_STX | BPF_MEM | BPF_H: + case BPF_STX | BPF_MEM | BPF_B: + case BPF_STX | BPF_MEM | BPF_DW: + { + u8 sz = BPF_SIZE(code); + + rn = sstk ? tmp2[1] : src_lo; + rm = sstk ? tmp2[0] : src_hi; + if (sstk) { + emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(src_hi)), ctx); + } + + /* Store the value */ + if (BPF_SIZE(code) == BPF_DW) { + emit_str_r(dst_lo, rn, dstk, off, ctx, BPF_W); + emit_str_r(dst_lo, rm, dstk, off+4, ctx, BPF_W); + } else { + emit_str_r(dst_lo, rn, dstk, off, ctx, sz); + } + break; + } + /* PC += off if dst == src */ + /* PC += off if dst > src */ + /* PC += off if dst >= src */ + /* PC += off if dst < src */ + /* PC += off if dst <= src */ + /* PC += off if dst != src */ + /* PC += off if dst > src (signed) */ + /* PC += off if dst >= src (signed) */ + /* PC += off if dst < src (signed) */ + /* PC += off if dst <= src (signed) */ + /* PC += off if dst & src */ + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JNE | BPF_X: + case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JSET | BPF_X: + case BPF_JMP | BPF_JLE | BPF_X: + case BPF_JMP | BPF_JLT | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_X: + /* Setup source registers */ + rm = sstk ? tmp2[0] : src_hi; + rn = sstk ? tmp2[1] : src_lo; + if (sstk) { + emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(src_hi)), ctx); + } + goto go_jmp; + /* PC += off if dst == imm */ + /* PC += off if dst > imm */ + /* PC += off if dst >= imm */ + /* PC += off if dst < imm */ + /* PC += off if dst <= imm */ + /* PC += off if dst != imm */ + /* PC += off if dst > imm (signed) */ + /* PC += off if dst >= imm (signed) */ + /* PC += off if dst < imm (signed) */ + /* PC += off if dst <= imm (signed) */ + /* PC += off if dst & imm */ + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JNE | BPF_K: + case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: + if (off == 0) break; - case BPF_JMP | BPF_JEQ | BPF_K: - /* pc += (A == K) ? pc->jt : pc->jf */ - condt = ARM_COND_EQ; - goto cmp_imm; - case BPF_JMP | BPF_JGT | BPF_K: - /* pc += (A > K) ? pc->jt : pc->jf */ - condt = ARM_COND_HI; - goto cmp_imm; - case BPF_JMP | BPF_JGE | BPF_K: - /* pc += (A >= K) ? pc->jt : pc->jf */ - condt = ARM_COND_HS; -cmp_imm: - imm12 = imm8m(k); - if (imm12 < 0) { - emit_mov_i_no8m(r_scratch, k, ctx); - emit(ARM_CMP_R(r_A, r_scratch), ctx); - } else { - emit(ARM_CMP_I(r_A, imm12), ctx); - } -cond_jump: - if (inst->jt) - _emit(condt, ARM_B(b_imm(i + inst->jt + 1, - ctx)), ctx); - if (inst->jf) - _emit(condt ^ 1, ARM_B(b_imm(i + inst->jf + 1, - ctx)), ctx); + rm = tmp2[0]; + rn = tmp2[1]; + /* Sign-extend immediate value */ + emit_a32_mov_i64(true, tmp2, imm, false, ctx); +go_jmp: + /* Setup destination register */ + rd = dstk ? tmp[0] : dst_hi; + rt = dstk ? tmp[1] : dst_lo; + if (dstk) { + emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + + /* Check for the condition */ + emit_ar_r(rd, rt, rm, rn, ctx, BPF_OP(code)); + + /* Setup JUMP instruction */ + jmp_offset = bpf2a32_offset(i+off, i, ctx); + switch (BPF_OP(code)) { + case BPF_JNE: + case BPF_JSET: + _emit(ARM_COND_NE, ARM_B(jmp_offset), ctx); break; - case BPF_JMP | BPF_JEQ | BPF_X: - /* pc += (A == X) ? pc->jt : pc->jf */ - condt = ARM_COND_EQ; - goto cmp_x; - case BPF_JMP | BPF_JGT | BPF_X: - /* pc += (A > X) ? pc->jt : pc->jf */ - condt = ARM_COND_HI; - goto cmp_x; - case BPF_JMP | BPF_JGE | BPF_X: - /* pc += (A >= X) ? pc->jt : pc->jf */ - condt = ARM_COND_CS; -cmp_x: - update_on_xread(ctx); - emit(ARM_CMP_R(r_A, r_X), ctx); - goto cond_jump; - case BPF_JMP | BPF_JSET | BPF_K: - /* pc += (A & K) ? pc->jt : pc->jf */ - condt = ARM_COND_NE; - /* not set iff all zeroes iff Z==1 iff EQ */ - - imm12 = imm8m(k); - if (imm12 < 0) { - emit_mov_i_no8m(r_scratch, k, ctx); - emit(ARM_TST_R(r_A, r_scratch), ctx); - } else { - emit(ARM_TST_I(r_A, imm12), ctx); - } - goto cond_jump; - case BPF_JMP | BPF_JSET | BPF_X: - /* pc += (A & X) ? pc->jt : pc->jf */ - update_on_xread(ctx); - condt = ARM_COND_NE; - emit(ARM_TST_R(r_A, r_X), ctx); - goto cond_jump; - case BPF_RET | BPF_A: - emit(ARM_MOV_R(ARM_R0, r_A), ctx); - goto b_epilogue; - case BPF_RET | BPF_K: - if ((k == 0) && (ctx->ret0_fp_idx < 0)) - ctx->ret0_fp_idx = i; - emit_mov_i(ARM_R0, k, ctx); -b_epilogue: - if (i != ctx->skf->len - 1) - emit(ARM_B(b_imm(prog->len, ctx)), ctx); + case BPF_JEQ: + _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx); break; - case BPF_MISC | BPF_TAX: - /* X = A */ - ctx->seen |= SEEN_X; - emit(ARM_MOV_R(r_X, r_A), ctx); + case BPF_JGT: + _emit(ARM_COND_HI, ARM_B(jmp_offset), ctx); break; - case BPF_MISC | BPF_TXA: - /* A = X */ - update_on_xread(ctx); - emit(ARM_MOV_R(r_A, r_X), ctx); + case BPF_JGE: + _emit(ARM_COND_CS, ARM_B(jmp_offset), ctx); break; - case BPF_ANC | SKF_AD_PROTOCOL: - /* A = ntohs(skb->protocol) */ - ctx->seen |= SEEN_SKB; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, - protocol) != 2); - off = offsetof(struct sk_buff, protocol); - emit(ARM_LDRH_I(r_scratch, r_skb, off), ctx); - emit_swap16(r_A, r_scratch, ctx); + case BPF_JSGT: + _emit(ARM_COND_LT, ARM_B(jmp_offset), ctx); break; - case BPF_ANC | SKF_AD_CPU: - /* r_scratch = current_thread_info() */ - OP_IMM3(ARM_BIC, r_scratch, ARM_SP, THREAD_SIZE - 1, ctx); - /* A = current_thread_info()->cpu */ - BUILD_BUG_ON(FIELD_SIZEOF(struct thread_info, cpu) != 4); - off = offsetof(struct thread_info, cpu); - emit(ARM_LDR_I(r_A, r_scratch, off), ctx); + case BPF_JSGE: + _emit(ARM_COND_GE, ARM_B(jmp_offset), ctx); break; - case BPF_ANC | SKF_AD_IFINDEX: - case BPF_ANC | SKF_AD_HATYPE: - /* A = skb->dev->ifindex */ - /* A = skb->dev->type */ - ctx->seen |= SEEN_SKB; - off = offsetof(struct sk_buff, dev); - emit(ARM_LDR_I(r_scratch, r_skb, off), ctx); - - emit(ARM_CMP_I(r_scratch, 0), ctx); - emit_err_ret(ARM_COND_EQ, ctx); - - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, - ifindex) != 4); - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, - type) != 2); - - if (code == (BPF_ANC | SKF_AD_IFINDEX)) { - off = offsetof(struct net_device, ifindex); - emit(ARM_LDR_I(r_A, r_scratch, off), ctx); - } else { - /* - * offset of field "type" in "struct - * net_device" is above what can be - * used in the ldrh rd, [rn, #imm] - * instruction, so load the offset in - * a register and use ldrh rd, [rn, rm] - */ - off = offsetof(struct net_device, type); - emit_mov_i(ARM_R3, off, ctx); - emit(ARM_LDRH_R(r_A, r_scratch, ARM_R3), ctx); - } + case BPF_JLE: + _emit(ARM_COND_LS, ARM_B(jmp_offset), ctx); break; - case BPF_ANC | SKF_AD_MARK: - ctx->seen |= SEEN_SKB; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); - off = offsetof(struct sk_buff, mark); - emit(ARM_LDR_I(r_A, r_skb, off), ctx); + case BPF_JLT: + _emit(ARM_COND_CC, ARM_B(jmp_offset), ctx); break; - case BPF_ANC | SKF_AD_RXHASH: - ctx->seen |= SEEN_SKB; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); - off = offsetof(struct sk_buff, hash); - emit(ARM_LDR_I(r_A, r_skb, off), ctx); + case BPF_JSLT: + _emit(ARM_COND_LT, ARM_B(jmp_offset), ctx); break; - case BPF_ANC | SKF_AD_VLAN_TAG: - case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: - ctx->seen |= SEEN_SKB; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); - off = offsetof(struct sk_buff, vlan_tci); - emit(ARM_LDRH_I(r_A, r_skb, off), ctx); - if (code == (BPF_ANC | SKF_AD_VLAN_TAG)) - OP_IMM3(ARM_AND, r_A, r_A, ~VLAN_TAG_PRESENT, ctx); - else { - OP_IMM3(ARM_LSR, r_A, r_A, 12, ctx); - OP_IMM3(ARM_AND, r_A, r_A, 0x1, ctx); - } + case BPF_JSLE: + _emit(ARM_COND_GE, ARM_B(jmp_offset), ctx); break; - case BPF_ANC | SKF_AD_PKTTYPE: - ctx->seen |= SEEN_SKB; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, - __pkt_type_offset[0]) != 1); - off = PKT_TYPE_OFFSET(); - emit(ARM_LDRB_I(r_A, r_skb, off), ctx); - emit(ARM_AND_I(r_A, r_A, PKT_TYPE_MAX), ctx); -#ifdef __BIG_ENDIAN_BITFIELD - emit(ARM_LSR_I(r_A, r_A, 5), ctx); -#endif + } + break; + /* JMP OFF */ + case BPF_JMP | BPF_JA: + { + if (off == 0) break; - case BPF_ANC | SKF_AD_QUEUE: - ctx->seen |= SEEN_SKB; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, - queue_mapping) != 2); - BUILD_BUG_ON(offsetof(struct sk_buff, - queue_mapping) > 0xff); - off = offsetof(struct sk_buff, queue_mapping); - emit(ARM_LDRH_I(r_A, r_skb, off), ctx); + jmp_offset = bpf2a32_offset(i+off, i, ctx); + check_imm24(jmp_offset); + emit(ARM_B(jmp_offset), ctx); + break; + } + /* tail call */ + case BPF_JMP | BPF_TAIL_CALL: + if (emit_bpf_tail_call(ctx)) + return -EFAULT; + break; + /* function call */ + case BPF_JMP | BPF_CALL: + { + const u8 *r0 = bpf2a32[BPF_REG_0]; + const u8 *r1 = bpf2a32[BPF_REG_1]; + const u8 *r2 = bpf2a32[BPF_REG_2]; + const u8 *r3 = bpf2a32[BPF_REG_3]; + const u8 *r4 = bpf2a32[BPF_REG_4]; + const u8 *r5 = bpf2a32[BPF_REG_5]; + const u32 func = (u32)__bpf_call_base + (u32)imm; + + emit_a32_mov_r64(true, r0, r1, false, false, ctx); + emit_a32_mov_r64(true, r1, r2, false, true, ctx); + emit_push_r64(r5, 0, ctx); + emit_push_r64(r4, 8, ctx); + emit_push_r64(r3, 16, ctx); + + emit_a32_mov_i(tmp[1], func, false, ctx); + emit_blx_r(tmp[1], ctx); + + emit(ARM_ADD_I(ARM_SP, ARM_SP, imm8m(24)), ctx); // callee clean + break; + } + /* function return */ + case BPF_JMP | BPF_EXIT: + /* Optimization: when last instruction is EXIT + * simply fallthrough to epilogue. + */ + if (i == ctx->prog->len - 1) break; - case BPF_ANC | SKF_AD_PAY_OFFSET: - ctx->seen |= SEEN_SKB | SEEN_CALL; + jmp_offset = epilogue_offset(ctx); + check_imm24(jmp_offset); + emit(ARM_B(jmp_offset), ctx); + break; +notyet: + pr_info_once("*** NOT YET: opcode %02x ***\n", code); + return -EFAULT; + default: + pr_err_once("unknown opcode %02x\n", code); + return -EINVAL; + } - emit(ARM_MOV_R(ARM_R0, r_skb), ctx); - emit_mov_i(ARM_R3, (unsigned int)skb_get_poff, ctx); - emit_blx_r(ARM_R3, ctx); - emit(ARM_MOV_R(r_A, ARM_R0), ctx); - break; - case BPF_LDX | BPF_W | BPF_ABS: - /* - * load a 32bit word from struct seccomp_data. - * seccomp_check_filter() will already have checked - * that k is 32bit aligned and lies within the - * struct seccomp_data. - */ - ctx->seen |= SEEN_SKB; - emit(ARM_LDR_I(r_A, r_skb, k), ctx); - break; - default: - return -1; + if (ctx->flags & FLAG_IMM_OVERFLOW) + /* + * this instruction generated an overflow when + * trying to access the literal pool, so + * delegate this filter to the kernel interpreter. + */ + return -1; + return 0; +} + +static int build_body(struct jit_ctx *ctx) +{ + const struct bpf_prog *prog = ctx->prog; + unsigned int i; + + for (i = 0; i < prog->len; i++) { + const struct bpf_insn *insn = &(prog->insnsi[i]); + int ret; + + ret = build_insn(insn, ctx); + + /* It's used with loading the 64 bit immediate value. */ + if (ret > 0) { + i++; + if (ctx->target == NULL) + ctx->offsets[i] = ctx->idx; + continue; } - if (ctx->flags & FLAG_IMM_OVERFLOW) - /* - * this instruction generated an overflow when - * trying to access the literal pool, so - * delegate this filter to the kernel interpreter. - */ - return -1; + if (ctx->target == NULL) + ctx->offsets[i] = ctx->idx; + + /* If unsuccesfull, return with error code */ + if (ret) + return ret; } + return 0; +} - /* compute offsets only during the first pass */ - if (ctx->target == NULL) - ctx->offsets[i] = ctx->idx * 4; +static int validate_code(struct jit_ctx *ctx) +{ + int i; + + for (i = 0; i < ctx->idx; i++) { + if (ctx->target[i] == __opcode_to_mem_arm(ARM_INST_UDF)) + return -1; + } return 0; } +void bpf_jit_compile(struct bpf_prog *prog) +{ + /* Nothing to do here. We support Internal BPF. */ +} -void bpf_jit_compile(struct bpf_prog *fp) +struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { + struct bpf_prog *tmp, *orig_prog = prog; struct bpf_binary_header *header; + bool tmp_blinded = false; struct jit_ctx ctx; - unsigned tmp_idx; - unsigned alloc_size; - u8 *target_ptr; + unsigned int tmp_idx; + unsigned int image_size; + u8 *image_ptr; + /* If BPF JIT was not enabled then we must fall back to + * the interpreter. + */ if (!bpf_jit_enable) - return; + return orig_prog; - memset(&ctx, 0, sizeof(ctx)); - ctx.skf = fp; - ctx.ret0_fp_idx = -1; + /* If constant blinding was enabled and we failed during blinding + * then we must fall back to the interpreter. Otherwise, we save + * the new JITed code. + */ + tmp = bpf_jit_blind_constants(prog); - ctx.offsets = kzalloc(4 * (ctx.skf->len + 1), GFP_KERNEL); - if (ctx.offsets == NULL) - return; + if (IS_ERR(tmp)) + return orig_prog; + if (tmp != prog) { + tmp_blinded = true; + prog = tmp; + } - /* fake pass to fill in the ctx->seen */ - if (unlikely(build_body(&ctx))) + memset(&ctx, 0, sizeof(ctx)); + ctx.prog = prog; + + /* Not able to allocate memory for offsets[] , then + * we must fall back to the interpreter + */ + ctx.offsets = kcalloc(prog->len, sizeof(int), GFP_KERNEL); + if (ctx.offsets == NULL) { + prog = orig_prog; goto out; + } + + /* 1) fake pass to find in the length of the JITed code, + * to compute ctx->offsets and other context variables + * needed to compute final JITed code. + * Also, calculate random starting pointer/start of JITed code + * which is prefixed by random number of fault instructions. + * + * If the first pass fails then there is no chance of it + * being successful in the second pass, so just fall back + * to the interpreter. + */ + if (build_body(&ctx)) { + prog = orig_prog; + goto out_off; + } tmp_idx = ctx.idx; build_prologue(&ctx); ctx.prologue_bytes = (ctx.idx - tmp_idx) * 4; + ctx.epilogue_offset = ctx.idx; + #if __LINUX_ARM_ARCH__ < 7 tmp_idx = ctx.idx; build_epilogue(&ctx); @@ -1021,64 +1880,83 @@ void bpf_jit_compile(struct bpf_prog *fp) ctx.idx += ctx.imm_count; if (ctx.imm_count) { - ctx.imms = kzalloc(4 * ctx.imm_count, GFP_KERNEL); - if (ctx.imms == NULL) - goto out; + ctx.imms = kcalloc(ctx.imm_count, sizeof(u32), GFP_KERNEL); + if (ctx.imms == NULL) { + prog = orig_prog; + goto out_off; + } } #else - /* there's nothing after the epilogue on ARMv7 */ + /* there's nothing about the epilogue on ARMv7 */ build_epilogue(&ctx); #endif - alloc_size = 4 * ctx.idx; - header = bpf_jit_binary_alloc(alloc_size, &target_ptr, - 4, jit_fill_hole); - if (header == NULL) - goto out; + /* Now we can get the actual image size of the JITed arm code. + * Currently, we are not considering the THUMB-2 instructions + * for jit, although it can decrease the size of the image. + * + * As each arm instruction is of length 32bit, we are translating + * number of JITed intructions into the size required to store these + * JITed code. + */ + image_size = sizeof(u32) * ctx.idx; - ctx.target = (u32 *) target_ptr; + /* Now we know the size of the structure to make */ + header = bpf_jit_binary_alloc(image_size, &image_ptr, + sizeof(u32), jit_fill_hole); + /* Not able to allocate memory for the structure then + * we must fall back to the interpretation + */ + if (header == NULL) { + prog = orig_prog; + goto out_imms; + } + + /* 2.) Actual pass to generate final JIT code */ + ctx.target = (u32 *) image_ptr; ctx.idx = 0; build_prologue(&ctx); + + /* If building the body of the JITed code fails somehow, + * we fall back to the interpretation. + */ if (build_body(&ctx) < 0) { -#if __LINUX_ARM_ARCH__ < 7 - if (ctx.imm_count) - kfree(ctx.imms); -#endif + image_ptr = NULL; bpf_jit_binary_free(header); - goto out; + prog = orig_prog; + goto out_imms; } build_epilogue(&ctx); + /* 3.) Extra pass to validate JITed Code */ + if (validate_code(&ctx)) { + image_ptr = NULL; + bpf_jit_binary_free(header); + prog = orig_prog; + goto out_imms; + } flush_icache_range((u32)header, (u32)(ctx.target + ctx.idx)); -#if __LINUX_ARM_ARCH__ < 7 - if (ctx.imm_count) - kfree(ctx.imms); -#endif - if (bpf_jit_enable > 1) /* there are 2 passes here */ - bpf_jit_dump(fp->len, alloc_size, 2, ctx.target); + bpf_jit_dump(prog->len, image_size, 2, ctx.target); set_memory_ro((unsigned long)header, header->pages); - fp->bpf_func = (void *)ctx.target; - fp->jited = 1; -out: + prog->bpf_func = (void *)ctx.target; + prog->jited = 1; + prog->jited_len = image_size; + +out_imms: +#if __LINUX_ARM_ARCH__ < 7 + if (ctx.imm_count) + kfree(ctx.imms); +#endif +out_off: kfree(ctx.offsets); - return; +out: + if (tmp_blinded) + bpf_jit_prog_release_other(prog, prog == orig_prog ? + tmp : orig_prog); + return prog; } -void bpf_jit_free(struct bpf_prog *fp) -{ - unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; - struct bpf_binary_header *header = (void *)addr; - - if (!fp->jited) - goto free_filter; - - set_memory_rw(addr, header->pages); - bpf_jit_binary_free(header); - -free_filter: - bpf_prog_unlock_free(fp); -} diff --git a/arch/arm/net/bpf_jit_32.h b/arch/arm/net/bpf_jit_32.h index c46fca2972f7..d5cf5f6208aa 100644 --- a/arch/arm/net/bpf_jit_32.h +++ b/arch/arm/net/bpf_jit_32.h @@ -11,6 +11,7 @@ #ifndef PFILTER_OPCODES_ARM_H #define PFILTER_OPCODES_ARM_H +/* ARM 32bit Registers */ #define ARM_R0 0 #define ARM_R1 1 #define ARM_R2 2 @@ -22,38 +23,43 @@ #define ARM_R8 8 #define ARM_R9 9 #define ARM_R10 10 -#define ARM_FP 11 -#define ARM_IP 12 -#define ARM_SP 13 -#define ARM_LR 14 -#define ARM_PC 15 - -#define ARM_COND_EQ 0x0 -#define ARM_COND_NE 0x1 -#define ARM_COND_CS 0x2 +#define ARM_FP 11 /* Frame Pointer */ +#define ARM_IP 12 /* Intra-procedure scratch register */ +#define ARM_SP 13 /* Stack pointer: as load/store base reg */ +#define ARM_LR 14 /* Link Register */ +#define ARM_PC 15 /* Program counter */ + +#define ARM_COND_EQ 0x0 /* == */ +#define ARM_COND_NE 0x1 /* != */ +#define ARM_COND_CS 0x2 /* unsigned >= */ #define ARM_COND_HS ARM_COND_CS -#define ARM_COND_CC 0x3 +#define ARM_COND_CC 0x3 /* unsigned < */ #define ARM_COND_LO ARM_COND_CC -#define ARM_COND_MI 0x4 -#define ARM_COND_PL 0x5 -#define ARM_COND_VS 0x6 -#define ARM_COND_VC 0x7 -#define ARM_COND_HI 0x8 -#define ARM_COND_LS 0x9 -#define ARM_COND_GE 0xa -#define ARM_COND_LT 0xb -#define ARM_COND_GT 0xc -#define ARM_COND_LE 0xd -#define ARM_COND_AL 0xe +#define ARM_COND_MI 0x4 /* < 0 */ +#define ARM_COND_PL 0x5 /* >= 0 */ +#define ARM_COND_VS 0x6 /* Signed Overflow */ +#define ARM_COND_VC 0x7 /* No Signed Overflow */ +#define ARM_COND_HI 0x8 /* unsigned > */ +#define ARM_COND_LS 0x9 /* unsigned <= */ +#define ARM_COND_GE 0xa /* Signed >= */ +#define ARM_COND_LT 0xb /* Signed < */ +#define ARM_COND_GT 0xc /* Signed > */ +#define ARM_COND_LE 0xd /* Signed <= */ +#define ARM_COND_AL 0xe /* None */ /* register shift types */ #define SRTYPE_LSL 0 #define SRTYPE_LSR 1 #define SRTYPE_ASR 2 #define SRTYPE_ROR 3 +#define SRTYPE_ASL (SRTYPE_LSL) #define ARM_INST_ADD_R 0x00800000 +#define ARM_INST_ADDS_R 0x00900000 +#define ARM_INST_ADC_R 0x00a00000 +#define ARM_INST_ADC_I 0x02a00000 #define ARM_INST_ADD_I 0x02800000 +#define ARM_INST_ADDS_I 0x02900000 #define ARM_INST_AND_R 0x00000000 #define ARM_INST_AND_I 0x02000000 @@ -76,8 +82,10 @@ #define ARM_INST_LDRH_I 0x01d000b0 #define ARM_INST_LDRH_R 0x019000b0 #define ARM_INST_LDR_I 0x05900000 +#define ARM_INST_LDR_R 0x07900000 #define ARM_INST_LDM 0x08900000 +#define ARM_INST_LDM_IA 0x08b00000 #define ARM_INST_LSL_I 0x01a00000 #define ARM_INST_LSL_R 0x01a00010 @@ -86,6 +94,7 @@ #define ARM_INST_LSR_R 0x01a00030 #define ARM_INST_MOV_R 0x01a00000 +#define ARM_INST_MOVS_R 0x01b00000 #define ARM_INST_MOV_I 0x03a00000 #define ARM_INST_MOVW 0x03000000 #define ARM_INST_MOVT 0x03400000 @@ -96,17 +105,28 @@ #define ARM_INST_PUSH 0x092d0000 #define ARM_INST_ORR_R 0x01800000 +#define ARM_INST_ORRS_R 0x01900000 #define ARM_INST_ORR_I 0x03800000 #define ARM_INST_REV 0x06bf0f30 #define ARM_INST_REV16 0x06bf0fb0 #define ARM_INST_RSB_I 0x02600000 +#define ARM_INST_RSBS_I 0x02700000 +#define ARM_INST_RSC_I 0x02e00000 #define ARM_INST_SUB_R 0x00400000 +#define ARM_INST_SUBS_R 0x00500000 +#define ARM_INST_RSB_R 0x00600000 #define ARM_INST_SUB_I 0x02400000 +#define ARM_INST_SUBS_I 0x02500000 +#define ARM_INST_SBC_I 0x02c00000 +#define ARM_INST_SBC_R 0x00c00000 +#define ARM_INST_SBCS_R 0x00d00000 #define ARM_INST_STR_I 0x05800000 +#define ARM_INST_STRB_I 0x05c00000 +#define ARM_INST_STRH_I 0x01c000b0 #define ARM_INST_TST_R 0x01100000 #define ARM_INST_TST_I 0x03100000 @@ -117,6 +137,8 @@ #define ARM_INST_MLS 0x00600090 +#define ARM_INST_UXTH 0x06ff0070 + /* * Use a suitable undefined instruction to use for ARM/Thumb2 faulting. * We need to be careful not to conflict with those used by other modules @@ -135,9 +157,15 @@ #define _AL3_R(op, rd, rn, rm) ((op ## _R) | (rd) << 12 | (rn) << 16 | (rm)) /* immediate */ #define _AL3_I(op, rd, rn, imm) ((op ## _I) | (rd) << 12 | (rn) << 16 | (imm)) +/* register with register-shift */ +#define _AL3_SR(inst) (inst | (1 << 4)) #define ARM_ADD_R(rd, rn, rm) _AL3_R(ARM_INST_ADD, rd, rn, rm) +#define ARM_ADDS_R(rd, rn, rm) _AL3_R(ARM_INST_ADDS, rd, rn, rm) #define ARM_ADD_I(rd, rn, imm) _AL3_I(ARM_INST_ADD, rd, rn, imm) +#define ARM_ADDS_I(rd, rn, imm) _AL3_I(ARM_INST_ADDS, rd, rn, imm) +#define ARM_ADC_R(rd, rn, rm) _AL3_R(ARM_INST_ADC, rd, rn, rm) +#define ARM_ADC_I(rd, rn, imm) _AL3_I(ARM_INST_ADC, rd, rn, imm) #define ARM_AND_R(rd, rn, rm) _AL3_R(ARM_INST_AND, rd, rn, rm) #define ARM_AND_I(rd, rn, imm) _AL3_I(ARM_INST_AND, rd, rn, imm) @@ -156,7 +184,9 @@ #define ARM_EOR_I(rd, rn, imm) _AL3_I(ARM_INST_EOR, rd, rn, imm) #define ARM_LDR_I(rt, rn, off) (ARM_INST_LDR_I | (rt) << 12 | (rn) << 16 \ - | (off)) + | ((off) & 0xfff)) +#define ARM_LDR_R(rt, rn, rm) (ARM_INST_LDR_R | (rt) << 12 | (rn) << 16 \ + | (rm)) #define ARM_LDRB_I(rt, rn, off) (ARM_INST_LDRB_I | (rt) << 12 | (rn) << 16 \ | (off)) #define ARM_LDRB_R(rt, rn, rm) (ARM_INST_LDRB_R | (rt) << 12 | (rn) << 16 \ @@ -167,15 +197,23 @@ | (rm)) #define ARM_LDM(rn, regs) (ARM_INST_LDM | (rn) << 16 | (regs)) +#define ARM_LDM_IA(rn, regs) (ARM_INST_LDM_IA | (rn) << 16 | (regs)) #define ARM_LSL_R(rd, rn, rm) (_AL3_R(ARM_INST_LSL, rd, 0, rn) | (rm) << 8) #define ARM_LSL_I(rd, rn, imm) (_AL3_I(ARM_INST_LSL, rd, 0, rn) | (imm) << 7) #define ARM_LSR_R(rd, rn, rm) (_AL3_R(ARM_INST_LSR, rd, 0, rn) | (rm) << 8) #define ARM_LSR_I(rd, rn, imm) (_AL3_I(ARM_INST_LSR, rd, 0, rn) | (imm) << 7) +#define ARM_ASR_R(rd, rn, rm) (_AL3_R(ARM_INST_ASR, rd, 0, rn) | (rm) << 8) +#define ARM_ASR_I(rd, rn, imm) (_AL3_I(ARM_INST_ASR, rd, 0, rn) | (imm) << 7) #define ARM_MOV_R(rd, rm) _AL3_R(ARM_INST_MOV, rd, 0, rm) +#define ARM_MOVS_R(rd, rm) _AL3_R(ARM_INST_MOVS, rd, 0, rm) #define ARM_MOV_I(rd, imm) _AL3_I(ARM_INST_MOV, rd, 0, imm) +#define ARM_MOV_SR(rd, rm, type, rs) \ + (_AL3_SR(ARM_MOV_R(rd, rm)) | (type) << 5 | (rs) << 8) +#define ARM_MOV_SI(rd, rm, type, imm6) \ + (ARM_MOV_R(rd, rm) | (type) << 5 | (imm6) << 7) #define ARM_MOVW(rd, imm) \ (ARM_INST_MOVW | ((imm) >> 12) << 16 | (rd) << 12 | ((imm) & 0x0fff)) @@ -190,19 +228,38 @@ #define ARM_ORR_R(rd, rn, rm) _AL3_R(ARM_INST_ORR, rd, rn, rm) #define ARM_ORR_I(rd, rn, imm) _AL3_I(ARM_INST_ORR, rd, rn, imm) -#define ARM_ORR_S(rd, rn, rm, type, rs) \ - (ARM_ORR_R(rd, rn, rm) | (type) << 5 | (rs) << 7) +#define ARM_ORR_SR(rd, rn, rm, type, rs) \ + (_AL3_SR(ARM_ORR_R(rd, rn, rm)) | (type) << 5 | (rs) << 8) +#define ARM_ORRS_R(rd, rn, rm) _AL3_R(ARM_INST_ORRS, rd, rn, rm) +#define ARM_ORRS_SR(rd, rn, rm, type, rs) \ + (_AL3_SR(ARM_ORRS_R(rd, rn, rm)) | (type) << 5 | (rs) << 8) +#define ARM_ORR_SI(rd, rn, rm, type, imm6) \ + (ARM_ORR_R(rd, rn, rm) | (type) << 5 | (imm6) << 7) +#define ARM_ORRS_SI(rd, rn, rm, type, imm6) \ + (ARM_ORRS_R(rd, rn, rm) | (type) << 5 | (imm6) << 7) #define ARM_REV(rd, rm) (ARM_INST_REV | (rd) << 12 | (rm)) #define ARM_REV16(rd, rm) (ARM_INST_REV16 | (rd) << 12 | (rm)) #define ARM_RSB_I(rd, rn, imm) _AL3_I(ARM_INST_RSB, rd, rn, imm) +#define ARM_RSBS_I(rd, rn, imm) _AL3_I(ARM_INST_RSBS, rd, rn, imm) +#define ARM_RSC_I(rd, rn, imm) _AL3_I(ARM_INST_RSC, rd, rn, imm) #define ARM_SUB_R(rd, rn, rm) _AL3_R(ARM_INST_SUB, rd, rn, rm) +#define ARM_SUBS_R(rd, rn, rm) _AL3_R(ARM_INST_SUBS, rd, rn, rm) +#define ARM_RSB_R(rd, rn, rm) _AL3_R(ARM_INST_RSB, rd, rn, rm) +#define ARM_SBC_R(rd, rn, rm) _AL3_R(ARM_INST_SBC, rd, rn, rm) +#define ARM_SBCS_R(rd, rn, rm) _AL3_R(ARM_INST_SBCS, rd, rn, rm) #define ARM_SUB_I(rd, rn, imm) _AL3_I(ARM_INST_SUB, rd, rn, imm) +#define ARM_SUBS_I(rd, rn, imm) _AL3_I(ARM_INST_SUBS, rd, rn, imm) +#define ARM_SBC_I(rd, rn, imm) _AL3_I(ARM_INST_SBC, rd, rn, imm) #define ARM_STR_I(rt, rn, off) (ARM_INST_STR_I | (rt) << 12 | (rn) << 16 \ - | (off)) + | ((off) & 0xfff)) +#define ARM_STRH_I(rt, rn, off) (ARM_INST_STRH_I | (rt) << 12 | (rn) << 16 \ + | (((off) & 0xf0) << 4) | ((off) & 0xf)) +#define ARM_STRB_I(rt, rn, off) (ARM_INST_STRB_I | (rt) << 12 | (rn) << 16 \ + | (((off) & 0xf0) << 4) | ((off) & 0xf)) #define ARM_TST_R(rn, rm) _AL3_R(ARM_INST_TST, 0, rn, rm) #define ARM_TST_I(rn, imm) _AL3_I(ARM_INST_TST, 0, rn, imm) @@ -214,5 +271,6 @@ #define ARM_MLS(rd, rn, rm, ra) (ARM_INST_MLS | (rd) << 16 | (rn) | (rm) << 8 \ | (ra) << 12) +#define ARM_UXTH(rd, rm) (ARM_INST_UXTH | (rd) << 12 | (rm)) #endif /* PFILTER_OPCODES_ARM_H */ diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index dfd908630631..0df64a6a56d4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -75,6 +75,7 @@ config ARM64 select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_VMAP_STACK select HAVE_ARM_SMCCC select HAVE_EBPF_JIT select HAVE_C_RECORDMCOUNT @@ -960,6 +961,18 @@ config ARM64_UAO regular load/store instructions if the cpu does not implement the feature. +config ARM64_PMEM + bool "Enable support for persistent memory" + select ARCH_HAS_PMEM_API + select ARCH_HAS_UACCESS_FLUSHCACHE + help + Say Y to enable support for the persistent memory API based on the + ARMv8.2 DCPoP feature. + + The feature is detected at runtime, and the kernel will use DC CVAC + operations if DC CVAP is not supported (following the behaviour of + DC CVAP itself if the system does not define a point of persistence). + endmenu config ARM64_MODULE_CMODEL_LARGE diff --git a/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi b/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi index e2b0da2c0bc7..105b2938082f 100644 --- a/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi +++ b/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi @@ -280,9 +280,6 @@ &decon { status = "okay"; - - i80-if-timings { - }; }; &decon_tv { @@ -1116,9 +1113,6 @@ &mic { status = "okay"; - - i80-if-timings { - }; }; &pmu_system_controller { diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi index 31fd77f82ced..d16b9cc1e825 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi @@ -653,21 +653,21 @@ }; msi1: msi-controller1@1571000 { - compatible = "fsl,1s1043a-msi"; + compatible = "fsl,ls1043a-msi"; reg = <0x0 0x1571000 0x0 0x8>; msi-controller; interrupts = <0 116 0x4>; }; msi2: msi-controller2@1572000 { - compatible = "fsl,1s1043a-msi"; + compatible = "fsl,ls1043a-msi"; reg = <0x0 0x1572000 0x0 0x8>; msi-controller; interrupts = <0 126 0x4>; }; msi3: msi-controller3@1573000 { - compatible = "fsl,1s1043a-msi"; + compatible = "fsl,ls1043a-msi"; reg = <0x0 0x1573000 0x0 0x8>; msi-controller; interrupts = <0 160 0x4>; @@ -689,7 +689,7 @@ bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */ 0x82000000 0x0 0x40000000 0x40 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */ - msi-parent = <&msi1>; + msi-parent = <&msi1>, <&msi2>, <&msi3>; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0000 0 0 1 &gic 0 110 0x4>, @@ -714,7 +714,7 @@ bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000 /* downstream I/O */ 0x82000000 0x0 0x40000000 0x48 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */ - msi-parent = <&msi2>; + msi-parent = <&msi1>, <&msi2>, <&msi3>; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0000 0 0 1 &gic 0 120 0x4>, @@ -739,7 +739,7 @@ bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x50 0x00010000 0x0 0x00010000 /* downstream I/O */ 0x82000000 0x0 0x40000000 0x50 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */ - msi-parent = <&msi3>; + msi-parent = <&msi1>, <&msi2>, <&msi3>; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0000 0 0 1 &gic 0 154 0x4>, diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi index dc1640be0345..c8ff0baddf1d 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi @@ -630,6 +630,37 @@ interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>; clocks = <&clockgen 4 1>; }; + + msi1: msi-controller@1580000 { + compatible = "fsl,ls1046a-msi"; + msi-controller; + reg = <0x0 0x1580000 0x0 0x10000>; + interrupts = <GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 111 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 112 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>; + }; + + msi2: msi-controller@1590000 { + compatible = "fsl,ls1046a-msi"; + msi-controller; + reg = <0x0 0x1590000 0x0 0x10000>; + interrupts = <GIC_SPI 126 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 122 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 123 IRQ_TYPE_LEVEL_HIGH>; + }; + + msi3: msi-controller@15a0000 { + compatible = "fsl,ls1046a-msi"; + msi-controller; + reg = <0x0 0x15a0000 0x0 0x10000>; + interrupts = <GIC_SPI 160 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 155 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 156 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 157 IRQ_TYPE_LEVEL_HIGH>; + }; + }; reserved-memory { diff --git a/arch/arm64/boot/dts/rockchip/rk3328-evb.dts b/arch/arm64/boot/dts/rockchip/rk3328-evb.dts index cf272392cebf..b9f36dad17e6 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-evb.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-evb.dts @@ -50,6 +50,23 @@ chosen { stdout-path = "serial2:1500000n8"; }; + + vcc_phy: vcc-phy-regulator { + compatible = "regulator-fixed"; + regulator-name = "vcc_phy"; + regulator-always-on; + regulator-boot-on; + }; +}; + +&gmac2phy { + phy-supply = <&vcc_phy>; + clock_in_out = "output"; + assigned-clocks = <&cru SCLK_MAC2PHY_SRC>; + assigned-clock-rate = <50000000>; + assigned-clocks = <&cru SCLK_MAC2PHY>; + assigned-clock-parents = <&cru SCLK_MAC2PHY_SRC>; + status = "okay"; }; &uart2 { diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi index 0be96cee27bd..d48bf5d9f8bd 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi @@ -63,6 +63,8 @@ i2c1 = &i2c1; i2c2 = &i2c2; i2c3 = &i2c3; + ethernet0 = &gmac2io; + ethernet1 = &gmac2phy; }; cpus { @@ -424,6 +426,43 @@ status = "disabled"; }; + gmac2phy: ethernet@ff550000 { + compatible = "rockchip,rk3328-gmac"; + reg = <0x0 0xff550000 0x0 0x10000>; + rockchip,grf = <&grf>; + interrupts = <GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq"; + clocks = <&cru SCLK_MAC2PHY_SRC>, <&cru SCLK_MAC2PHY_RXTX>, + <&cru SCLK_MAC2PHY_RXTX>, <&cru SCLK_MAC2PHY_REF>, + <&cru ACLK_MAC2PHY>, <&cru PCLK_MAC2PHY>, + <&cru SCLK_MAC2PHY_OUT>; + clock-names = "stmmaceth", "mac_clk_rx", + "mac_clk_tx", "clk_mac_ref", + "aclk_mac", "pclk_mac", + "clk_macphy"; + resets = <&cru SRST_GMAC2PHY_A>, <&cru SRST_MACPHY>; + reset-names = "stmmaceth", "mac-phy"; + phy-mode = "rmii"; + phy-handle = <&phy>; + status = "disabled"; + + mdio { + compatible = "snps,dwmac-mdio"; + #address-cells = <1>; + #size-cells = <0>; + + phy: phy@0 { + compatible = "ethernet-phy-id1234.d400", "ethernet-phy-ieee802.3-c22"; + reg = <0>; + clocks = <&cru SCLK_MAC2PHY_OUT>; + resets = <&cru SRST_MACPHY>; + pinctrl-names = "default"; + pinctrl-0 = <&fephyled_rxm1 &fephyled_linkm1>; + phy-is-integrated; + }; + }; + }; + gic: interrupt-controller@ff811000 { compatible = "arm,gic-400"; #interrupt-cells = <3>; diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index b4ca115b3be1..cdde4f56a281 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -203,6 +203,7 @@ CONFIG_MARVELL_PHY=m CONFIG_MESON_GXL_PHY=m CONFIG_MICREL_PHY=y CONFIG_REALTEK_PHY=m +CONFIG_ROCKCHIP_PHY=y CONFIG_USB_PEGASUS=m CONFIG_USB_RTL8150=m CONFIG_USB_RTL8152=m diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index d92293747d63..7ca54a76f6b9 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -18,18 +18,23 @@ config CRYPTO_SHA512_ARM64 config CRYPTO_SHA1_ARM64_CE tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)" - depends on ARM64 && KERNEL_MODE_NEON + depends on KERNEL_MODE_NEON select CRYPTO_HASH + select CRYPTO_SHA1 config CRYPTO_SHA2_ARM64_CE tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)" - depends on ARM64 && KERNEL_MODE_NEON + depends on KERNEL_MODE_NEON select CRYPTO_HASH + select CRYPTO_SHA256_ARM64 config CRYPTO_GHASH_ARM64_CE - tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions" - depends on ARM64 && KERNEL_MODE_NEON + tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions" + depends on KERNEL_MODE_NEON select CRYPTO_HASH + select CRYPTO_GF128MUL + select CRYPTO_AES + select CRYPTO_AES_ARM64 config CRYPTO_CRCT10DIF_ARM64_CE tristate "CRCT10DIF digest algorithm using PMULL instructions" @@ -49,25 +54,29 @@ config CRYPTO_AES_ARM64_CE tristate "AES core cipher using ARMv8 Crypto Extensions" depends on ARM64 && KERNEL_MODE_NEON select CRYPTO_ALGAPI + select CRYPTO_AES_ARM64 config CRYPTO_AES_ARM64_CE_CCM tristate "AES in CCM mode using ARMv8 Crypto Extensions" depends on ARM64 && KERNEL_MODE_NEON select CRYPTO_ALGAPI select CRYPTO_AES_ARM64_CE + select CRYPTO_AES_ARM64 select CRYPTO_AEAD config CRYPTO_AES_ARM64_CE_BLK tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions" - depends on ARM64 && KERNEL_MODE_NEON + depends on KERNEL_MODE_NEON select CRYPTO_BLKCIPHER select CRYPTO_AES_ARM64_CE + select CRYPTO_AES_ARM64 select CRYPTO_SIMD config CRYPTO_AES_ARM64_NEON_BLK tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions" - depends on ARM64 && KERNEL_MODE_NEON + depends on KERNEL_MODE_NEON select CRYPTO_BLKCIPHER + select CRYPTO_AES_ARM64 select CRYPTO_AES select CRYPTO_SIMD @@ -82,6 +91,7 @@ config CRYPTO_AES_ARM64_BS depends on KERNEL_MODE_NEON select CRYPTO_BLKCIPHER select CRYPTO_AES_ARM64_NEON_BLK + select CRYPTO_AES_ARM64 select CRYPTO_SIMD endif diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S index 3363560c79b7..e3a375c4cb83 100644 --- a/arch/arm64/crypto/aes-ce-ccm-core.S +++ b/arch/arm64/crypto/aes-ce-ccm-core.S @@ -1,7 +1,7 @@ /* * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions * - * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -32,7 +32,7 @@ ENTRY(ce_aes_ccm_auth_data) beq 8f /* out of input? */ cbnz w8, 0b eor v0.16b, v0.16b, v1.16b -1: ld1 {v3.16b}, [x4] /* load first round key */ +1: ld1 {v3.4s}, [x4] /* load first round key */ prfm pldl1strm, [x1] cmp w5, #12 /* which key size? */ add x6, x4, #16 @@ -42,17 +42,17 @@ ENTRY(ce_aes_ccm_auth_data) mov v5.16b, v3.16b b 4f 2: mov v4.16b, v3.16b - ld1 {v5.16b}, [x6], #16 /* load 2nd round key */ + ld1 {v5.4s}, [x6], #16 /* load 2nd round key */ 3: aese v0.16b, v4.16b aesmc v0.16b, v0.16b -4: ld1 {v3.16b}, [x6], #16 /* load next round key */ +4: ld1 {v3.4s}, [x6], #16 /* load next round key */ aese v0.16b, v5.16b aesmc v0.16b, v0.16b -5: ld1 {v4.16b}, [x6], #16 /* load next round key */ +5: ld1 {v4.4s}, [x6], #16 /* load next round key */ subs w7, w7, #3 aese v0.16b, v3.16b aesmc v0.16b, v0.16b - ld1 {v5.16b}, [x6], #16 /* load next round key */ + ld1 {v5.4s}, [x6], #16 /* load next round key */ bpl 3b aese v0.16b, v4.16b subs w2, w2, #16 /* last data? */ @@ -90,7 +90,7 @@ ENDPROC(ce_aes_ccm_auth_data) * u32 rounds); */ ENTRY(ce_aes_ccm_final) - ld1 {v3.16b}, [x2], #16 /* load first round key */ + ld1 {v3.4s}, [x2], #16 /* load first round key */ ld1 {v0.16b}, [x0] /* load mac */ cmp w3, #12 /* which key size? */ sub w3, w3, #2 /* modified # of rounds */ @@ -100,17 +100,17 @@ ENTRY(ce_aes_ccm_final) mov v5.16b, v3.16b b 2f 0: mov v4.16b, v3.16b -1: ld1 {v5.16b}, [x2], #16 /* load next round key */ +1: ld1 {v5.4s}, [x2], #16 /* load next round key */ aese v0.16b, v4.16b aesmc v0.16b, v0.16b aese v1.16b, v4.16b aesmc v1.16b, v1.16b -2: ld1 {v3.16b}, [x2], #16 /* load next round key */ +2: ld1 {v3.4s}, [x2], #16 /* load next round key */ aese v0.16b, v5.16b aesmc v0.16b, v0.16b aese v1.16b, v5.16b aesmc v1.16b, v1.16b -3: ld1 {v4.16b}, [x2], #16 /* load next round key */ +3: ld1 {v4.4s}, [x2], #16 /* load next round key */ subs w3, w3, #3 aese v0.16b, v3.16b aesmc v0.16b, v0.16b @@ -137,31 +137,31 @@ CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */ cmp w4, #12 /* which key size? */ sub w7, w4, #2 /* get modified # of rounds */ ins v1.d[1], x9 /* no carry in lower ctr */ - ld1 {v3.16b}, [x3] /* load first round key */ + ld1 {v3.4s}, [x3] /* load first round key */ add x10, x3, #16 bmi 1f bne 4f mov v5.16b, v3.16b b 3f 1: mov v4.16b, v3.16b - ld1 {v5.16b}, [x10], #16 /* load 2nd round key */ + ld1 {v5.4s}, [x10], #16 /* load 2nd round key */ 2: /* inner loop: 3 rounds, 2x interleaved */ aese v0.16b, v4.16b aesmc v0.16b, v0.16b aese v1.16b, v4.16b aesmc v1.16b, v1.16b -3: ld1 {v3.16b}, [x10], #16 /* load next round key */ +3: ld1 {v3.4s}, [x10], #16 /* load next round key */ aese v0.16b, v5.16b aesmc v0.16b, v0.16b aese v1.16b, v5.16b aesmc v1.16b, v1.16b -4: ld1 {v4.16b}, [x10], #16 /* load next round key */ +4: ld1 {v4.4s}, [x10], #16 /* load next round key */ subs w7, w7, #3 aese v0.16b, v3.16b aesmc v0.16b, v0.16b aese v1.16b, v3.16b aesmc v1.16b, v1.16b - ld1 {v5.16b}, [x10], #16 /* load next round key */ + ld1 {v5.4s}, [x10], #16 /* load next round key */ bpl 2b aese v0.16b, v4.16b aese v1.16b, v4.16b diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c index 6a7dbc7c83a6..a1254036f2b1 100644 --- a/arch/arm64/crypto/aes-ce-ccm-glue.c +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c @@ -1,7 +1,7 @@ /* * aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions * - * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -9,6 +9,7 @@ */ #include <asm/neon.h> +#include <asm/simd.h> #include <asm/unaligned.h> #include <crypto/aes.h> #include <crypto/scatterwalk.h> @@ -44,6 +45,8 @@ asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[], u32 rounds); +asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); + static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key, unsigned int key_len) { @@ -103,7 +106,45 @@ static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen) return 0; } -static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) +static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[], + u32 abytes, u32 *macp, bool use_neon) +{ + if (likely(use_neon)) { + ce_aes_ccm_auth_data(mac, in, abytes, macp, key->key_enc, + num_rounds(key)); + } else { + if (*macp > 0 && *macp < AES_BLOCK_SIZE) { + int added = min(abytes, AES_BLOCK_SIZE - *macp); + + crypto_xor(&mac[*macp], in, added); + + *macp += added; + in += added; + abytes -= added; + } + + while (abytes > AES_BLOCK_SIZE) { + __aes_arm64_encrypt(key->key_enc, mac, mac, + num_rounds(key)); + crypto_xor(mac, in, AES_BLOCK_SIZE); + + in += AES_BLOCK_SIZE; + abytes -= AES_BLOCK_SIZE; + } + + if (abytes > 0) { + __aes_arm64_encrypt(key->key_enc, mac, mac, + num_rounds(key)); + crypto_xor(mac, in, abytes); + *macp = abytes; + } else { + *macp = 0; + } + } +} + +static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[], + bool use_neon) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); @@ -122,8 +163,7 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) ltag.len = 6; } - ce_aes_ccm_auth_data(mac, (u8 *)<ag, ltag.len, &macp, ctx->key_enc, - num_rounds(ctx)); + ccm_update_mac(ctx, mac, (u8 *)<ag, ltag.len, &macp, use_neon); scatterwalk_start(&walk, req->src); do { @@ -135,8 +175,7 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) n = scatterwalk_clamp(&walk, len); } p = scatterwalk_map(&walk); - ce_aes_ccm_auth_data(mac, p, n, &macp, ctx->key_enc, - num_rounds(ctx)); + ccm_update_mac(ctx, mac, p, n, &macp, use_neon); len -= n; scatterwalk_unmap(p); @@ -145,6 +184,56 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) } while (len); } +static int ccm_crypt_fallback(struct skcipher_walk *walk, u8 mac[], u8 iv0[], + struct crypto_aes_ctx *ctx, bool enc) +{ + u8 buf[AES_BLOCK_SIZE]; + int err = 0; + + while (walk->nbytes) { + int blocks = walk->nbytes / AES_BLOCK_SIZE; + u32 tail = walk->nbytes % AES_BLOCK_SIZE; + u8 *dst = walk->dst.virt.addr; + u8 *src = walk->src.virt.addr; + u32 nbytes = walk->nbytes; + + if (nbytes == walk->total && tail > 0) { + blocks++; + tail = 0; + } + + do { + u32 bsize = AES_BLOCK_SIZE; + + if (nbytes < AES_BLOCK_SIZE) + bsize = nbytes; + + crypto_inc(walk->iv, AES_BLOCK_SIZE); + __aes_arm64_encrypt(ctx->key_enc, buf, walk->iv, + num_rounds(ctx)); + __aes_arm64_encrypt(ctx->key_enc, mac, mac, + num_rounds(ctx)); + if (enc) + crypto_xor(mac, src, bsize); + crypto_xor_cpy(dst, src, buf, bsize); + if (!enc) + crypto_xor(mac, dst, bsize); + dst += bsize; + src += bsize; + nbytes -= bsize; + } while (--blocks); + + err = skcipher_walk_done(walk, tail); + } + + if (!err) { + __aes_arm64_encrypt(ctx->key_enc, buf, iv0, num_rounds(ctx)); + __aes_arm64_encrypt(ctx->key_enc, mac, mac, num_rounds(ctx)); + crypto_xor(mac, buf, AES_BLOCK_SIZE); + } + return err; +} + static int ccm_encrypt(struct aead_request *req) { struct crypto_aead *aead = crypto_aead_reqtfm(req); @@ -153,39 +242,46 @@ static int ccm_encrypt(struct aead_request *req) u8 __aligned(8) mac[AES_BLOCK_SIZE]; u8 buf[AES_BLOCK_SIZE]; u32 len = req->cryptlen; + bool use_neon = may_use_simd(); int err; err = ccm_init_mac(req, mac, len); if (err) return err; - kernel_neon_begin_partial(6); + if (likely(use_neon)) + kernel_neon_begin(); if (req->assoclen) - ccm_calculate_auth_mac(req, mac); + ccm_calculate_auth_mac(req, mac, use_neon); /* preserve the original iv for the final round */ memcpy(buf, req->iv, AES_BLOCK_SIZE); err = skcipher_walk_aead_encrypt(&walk, req, true); - while (walk.nbytes) { - u32 tail = walk.nbytes % AES_BLOCK_SIZE; - - if (walk.nbytes == walk.total) - tail = 0; + if (likely(use_neon)) { + while (walk.nbytes) { + u32 tail = walk.nbytes % AES_BLOCK_SIZE; - ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes - tail, ctx->key_enc, - num_rounds(ctx), mac, walk.iv); + if (walk.nbytes == walk.total) + tail = 0; - err = skcipher_walk_done(&walk, tail); - } - if (!err) - ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); + ce_aes_ccm_encrypt(walk.dst.virt.addr, + walk.src.virt.addr, + walk.nbytes - tail, ctx->key_enc, + num_rounds(ctx), mac, walk.iv); - kernel_neon_end(); + err = skcipher_walk_done(&walk, tail); + } + if (!err) + ce_aes_ccm_final(mac, buf, ctx->key_enc, + num_rounds(ctx)); + kernel_neon_end(); + } else { + err = ccm_crypt_fallback(&walk, mac, buf, ctx, true); + } if (err) return err; @@ -205,38 +301,46 @@ static int ccm_decrypt(struct aead_request *req) u8 __aligned(8) mac[AES_BLOCK_SIZE]; u8 buf[AES_BLOCK_SIZE]; u32 len = req->cryptlen - authsize; + bool use_neon = may_use_simd(); int err; err = ccm_init_mac(req, mac, len); if (err) return err; - kernel_neon_begin_partial(6); + if (likely(use_neon)) + kernel_neon_begin(); if (req->assoclen) - ccm_calculate_auth_mac(req, mac); + ccm_calculate_auth_mac(req, mac, use_neon); /* preserve the original iv for the final round */ memcpy(buf, req->iv, AES_BLOCK_SIZE); err = skcipher_walk_aead_decrypt(&walk, req, true); - while (walk.nbytes) { - u32 tail = walk.nbytes % AES_BLOCK_SIZE; + if (likely(use_neon)) { + while (walk.nbytes) { + u32 tail = walk.nbytes % AES_BLOCK_SIZE; - if (walk.nbytes == walk.total) - tail = 0; + if (walk.nbytes == walk.total) + tail = 0; - ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes - tail, ctx->key_enc, - num_rounds(ctx), mac, walk.iv); + ce_aes_ccm_decrypt(walk.dst.virt.addr, + walk.src.virt.addr, + walk.nbytes - tail, ctx->key_enc, + num_rounds(ctx), mac, walk.iv); - err = skcipher_walk_done(&walk, tail); - } - if (!err) - ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); + err = skcipher_walk_done(&walk, tail); + } + if (!err) + ce_aes_ccm_final(mac, buf, ctx->key_enc, + num_rounds(ctx)); - kernel_neon_end(); + kernel_neon_end(); + } else { + err = ccm_crypt_fallback(&walk, mac, buf, ctx, false); + } if (err) return err; diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-cipher.c index 50d9fe11d0c8..6a75cd75ed11 100644 --- a/arch/arm64/crypto/aes-ce-cipher.c +++ b/arch/arm64/crypto/aes-ce-cipher.c @@ -1,7 +1,7 @@ /* * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions * - * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -9,6 +9,8 @@ */ #include <asm/neon.h> +#include <asm/simd.h> +#include <asm/unaligned.h> #include <crypto/aes.h> #include <linux/cpufeature.h> #include <linux/crypto.h> @@ -20,6 +22,9 @@ MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); +asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); +asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds); + struct aes_block { u8 b[AES_BLOCK_SIZE]; }; @@ -44,27 +49,32 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) void *dummy0; int dummy1; - kernel_neon_begin_partial(4); + if (!may_use_simd()) { + __aes_arm64_encrypt(ctx->key_enc, dst, src, num_rounds(ctx)); + return; + } + + kernel_neon_begin(); __asm__(" ld1 {v0.16b}, %[in] ;" - " ld1 {v1.16b}, [%[key]], #16 ;" + " ld1 {v1.4s}, [%[key]], #16 ;" " cmp %w[rounds], #10 ;" " bmi 0f ;" " bne 3f ;" " mov v3.16b, v1.16b ;" " b 2f ;" "0: mov v2.16b, v1.16b ;" - " ld1 {v3.16b}, [%[key]], #16 ;" + " ld1 {v3.4s}, [%[key]], #16 ;" "1: aese v0.16b, v2.16b ;" " aesmc v0.16b, v0.16b ;" - "2: ld1 {v1.16b}, [%[key]], #16 ;" + "2: ld1 {v1.4s}, [%[key]], #16 ;" " aese v0.16b, v3.16b ;" " aesmc v0.16b, v0.16b ;" - "3: ld1 {v2.16b}, [%[key]], #16 ;" + "3: ld1 {v2.4s}, [%[key]], #16 ;" " subs %w[rounds], %w[rounds], #3 ;" " aese v0.16b, v1.16b ;" " aesmc v0.16b, v0.16b ;" - " ld1 {v3.16b}, [%[key]], #16 ;" + " ld1 {v3.4s}, [%[key]], #16 ;" " bpl 1b ;" " aese v0.16b, v2.16b ;" " eor v0.16b, v0.16b, v3.16b ;" @@ -89,27 +99,32 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) void *dummy0; int dummy1; - kernel_neon_begin_partial(4); + if (!may_use_simd()) { + __aes_arm64_decrypt(ctx->key_dec, dst, src, num_rounds(ctx)); + return; + } + + kernel_neon_begin(); __asm__(" ld1 {v0.16b}, %[in] ;" - " ld1 {v1.16b}, [%[key]], #16 ;" + " ld1 {v1.4s}, [%[key]], #16 ;" " cmp %w[rounds], #10 ;" " bmi 0f ;" " bne 3f ;" " mov v3.16b, v1.16b ;" " b 2f ;" "0: mov v2.16b, v1.16b ;" - " ld1 {v3.16b}, [%[key]], #16 ;" + " ld1 {v3.4s}, [%[key]], #16 ;" "1: aesd v0.16b, v2.16b ;" " aesimc v0.16b, v0.16b ;" - "2: ld1 {v1.16b}, [%[key]], #16 ;" + "2: ld1 {v1.4s}, [%[key]], #16 ;" " aesd v0.16b, v3.16b ;" " aesimc v0.16b, v0.16b ;" - "3: ld1 {v2.16b}, [%[key]], #16 ;" + "3: ld1 {v2.4s}, [%[key]], #16 ;" " subs %w[rounds], %w[rounds], #3 ;" " aesd v0.16b, v1.16b ;" " aesimc v0.16b, v0.16b ;" - " ld1 {v3.16b}, [%[key]], #16 ;" + " ld1 {v3.4s}, [%[key]], #16 ;" " bpl 1b ;" " aesd v0.16b, v2.16b ;" " eor v0.16b, v0.16b, v3.16b ;" @@ -165,20 +180,16 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, key_len != AES_KEYSIZE_256) return -EINVAL; - memcpy(ctx->key_enc, in_key, key_len); ctx->key_length = key_len; + for (i = 0; i < kwords; i++) + ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32)); - kernel_neon_begin_partial(2); + kernel_neon_begin(); for (i = 0; i < sizeof(rcon); i++) { u32 *rki = ctx->key_enc + (i * kwords); u32 *rko = rki + kwords; -#ifndef CONFIG_CPU_BIG_ENDIAN rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0]; -#else - rko[0] = rol32(aes_sub(rki[kwords - 1]), 8) ^ (rcon[i] << 24) ^ - rki[0]; -#endif rko[1] = rko[0] ^ rki[1]; rko[2] = rko[1] ^ rki[2]; rko[3] = rko[2] ^ rki[3]; @@ -210,9 +221,9 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, key_dec[0] = key_enc[j]; for (i = 1, j--; j > 0; i++, j--) - __asm__("ld1 {v0.16b}, %[in] ;" + __asm__("ld1 {v0.4s}, %[in] ;" "aesimc v1.16b, v0.16b ;" - "st1 {v1.16b}, %[out] ;" + "st1 {v1.4s}, %[out] ;" : [out] "=Q"(key_dec[i]) : [in] "Q"(key_enc[j]) diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S index b46093d567e5..50330f5c3adc 100644 --- a/arch/arm64/crypto/aes-ce.S +++ b/arch/arm64/crypto/aes-ce.S @@ -2,7 +2,7 @@ * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with * Crypto Extensions * - * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -22,11 +22,11 @@ cmp \rounds, #12 blo 2222f /* 128 bits */ beq 1111f /* 192 bits */ - ld1 {v17.16b-v18.16b}, [\rk], #32 -1111: ld1 {v19.16b-v20.16b}, [\rk], #32 -2222: ld1 {v21.16b-v24.16b}, [\rk], #64 - ld1 {v25.16b-v28.16b}, [\rk], #64 - ld1 {v29.16b-v31.16b}, [\rk] + ld1 {v17.4s-v18.4s}, [\rk], #32 +1111: ld1 {v19.4s-v20.4s}, [\rk], #32 +2222: ld1 {v21.4s-v24.4s}, [\rk], #64 + ld1 {v25.4s-v28.4s}, [\rk], #64 + ld1 {v29.4s-v31.4s}, [\rk] .endm /* prepare for encryption with key in rk[] */ diff --git a/arch/arm64/crypto/aes-cipher-core.S b/arch/arm64/crypto/aes-cipher-core.S index f2f9cc519309..6d2445d603cc 100644 --- a/arch/arm64/crypto/aes-cipher-core.S +++ b/arch/arm64/crypto/aes-cipher-core.S @@ -10,6 +10,7 @@ #include <linux/linkage.h> #include <asm/assembler.h> +#include <asm/cache.h> .text @@ -17,94 +18,155 @@ out .req x1 in .req x2 rounds .req x3 - tt .req x4 - lt .req x2 + tt .req x2 - .macro __pair, enc, reg0, reg1, in0, in1e, in1d, shift + .macro __pair1, sz, op, reg0, reg1, in0, in1e, in1d, shift + .ifc \op\shift, b0 + ubfiz \reg0, \in0, #2, #8 + ubfiz \reg1, \in1e, #2, #8 + .else ubfx \reg0, \in0, #\shift, #8 - .if \enc ubfx \reg1, \in1e, #\shift, #8 - .else - ubfx \reg1, \in1d, #\shift, #8 .endif + + /* + * AArch64 cannot do byte size indexed loads from a table containing + * 32-bit quantities, i.e., 'ldrb w12, [tt, w12, uxtw #2]' is not a + * valid instruction. So perform the shift explicitly first for the + * high bytes (the low byte is shifted implicitly by using ubfiz rather + * than ubfx above) + */ + .ifnc \op, b ldr \reg0, [tt, \reg0, uxtw #2] ldr \reg1, [tt, \reg1, uxtw #2] + .else + .if \shift > 0 + lsl \reg0, \reg0, #2 + lsl \reg1, \reg1, #2 + .endif + ldrb \reg0, [tt, \reg0, uxtw] + ldrb \reg1, [tt, \reg1, uxtw] + .endif .endm - .macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc + .macro __pair0, sz, op, reg0, reg1, in0, in1e, in1d, shift + ubfx \reg0, \in0, #\shift, #8 + ubfx \reg1, \in1d, #\shift, #8 + ldr\op \reg0, [tt, \reg0, uxtw #\sz] + ldr\op \reg1, [tt, \reg1, uxtw #\sz] + .endm + + .macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc, sz, op ldp \out0, \out1, [rk], #8 - __pair \enc, w13, w14, \in0, \in1, \in3, 0 - __pair \enc, w15, w16, \in1, \in2, \in0, 8 - __pair \enc, w17, w18, \in2, \in3, \in1, 16 - __pair \enc, \t0, \t1, \in3, \in0, \in2, 24 - - eor \out0, \out0, w13 - eor \out1, \out1, w14 - eor \out0, \out0, w15, ror #24 - eor \out1, \out1, w16, ror #24 - eor \out0, \out0, w17, ror #16 - eor \out1, \out1, w18, ror #16 + __pair\enc \sz, \op, w12, w13, \in0, \in1, \in3, 0 + __pair\enc \sz, \op, w14, w15, \in1, \in2, \in0, 8 + __pair\enc \sz, \op, w16, w17, \in2, \in3, \in1, 16 + __pair\enc \sz, \op, \t0, \t1, \in3, \in0, \in2, 24 + + eor \out0, \out0, w12 + eor \out1, \out1, w13 + eor \out0, \out0, w14, ror #24 + eor \out1, \out1, w15, ror #24 + eor \out0, \out0, w16, ror #16 + eor \out1, \out1, w17, ror #16 eor \out0, \out0, \t0, ror #8 eor \out1, \out1, \t1, ror #8 .endm - .macro fround, out0, out1, out2, out3, in0, in1, in2, in3 - __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1 - __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1 + .macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op + __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op + __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op .endm - .macro iround, out0, out1, out2, out3, in0, in1, in2, in3 - __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0 - __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0 + .macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op + __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op + __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op .endm - .macro do_crypt, round, ttab, ltab - ldp w5, w6, [in] - ldp w7, w8, [in, #8] - ldp w9, w10, [rk], #16 - ldp w11, w12, [rk, #-8] + .macro do_crypt, round, ttab, ltab, bsz + ldp w4, w5, [in] + ldp w6, w7, [in, #8] + ldp w8, w9, [rk], #16 + ldp w10, w11, [rk, #-8] +CPU_BE( rev w4, w4 ) CPU_BE( rev w5, w5 ) CPU_BE( rev w6, w6 ) CPU_BE( rev w7, w7 ) -CPU_BE( rev w8, w8 ) + eor w4, w4, w8 eor w5, w5, w9 eor w6, w6, w10 eor w7, w7, w11 - eor w8, w8, w12 adr_l tt, \ttab - adr_l lt, \ltab tbnz rounds, #1, 1f -0: \round w9, w10, w11, w12, w5, w6, w7, w8 - \round w5, w6, w7, w8, w9, w10, w11, w12 +0: \round w8, w9, w10, w11, w4, w5, w6, w7 + \round w4, w5, w6, w7, w8, w9, w10, w11 1: subs rounds, rounds, #4 - \round w9, w10, w11, w12, w5, w6, w7, w8 - csel tt, tt, lt, hi - \round w5, w6, w7, w8, w9, w10, w11, w12 - b.hi 0b - + \round w8, w9, w10, w11, w4, w5, w6, w7 + b.ls 3f +2: \round w4, w5, w6, w7, w8, w9, w10, w11 + b 0b +3: adr_l tt, \ltab + \round w4, w5, w6, w7, w8, w9, w10, w11, \bsz, b + +CPU_BE( rev w4, w4 ) CPU_BE( rev w5, w5 ) CPU_BE( rev w6, w6 ) CPU_BE( rev w7, w7 ) -CPU_BE( rev w8, w8 ) - stp w5, w6, [out] - stp w7, w8, [out, #8] + stp w4, w5, [out] + stp w6, w7, [out, #8] ret .endm - .align 5 + .align L1_CACHE_SHIFT + .type __aes_arm64_inverse_sbox, %object +__aes_arm64_inverse_sbox: + .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 + .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb + .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 + .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb + .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d + .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e + .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 + .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 + .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 + .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 + .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda + .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 + .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a + .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 + .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 + .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b + .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea + .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 + .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 + .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e + .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 + .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b + .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 + .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 + .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 + .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f + .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d + .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef + .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 + .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 + .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 + .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + .size __aes_arm64_inverse_sbox, . - __aes_arm64_inverse_sbox + ENTRY(__aes_arm64_encrypt) - do_crypt fround, crypto_ft_tab, crypto_fl_tab + do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2 ENDPROC(__aes_arm64_encrypt) .align 5 ENTRY(__aes_arm64_decrypt) - do_crypt iround, crypto_it_tab, crypto_il_tab + do_crypt iround, crypto_it_tab, __aes_arm64_inverse_sbox, 0 ENDPROC(__aes_arm64_decrypt) diff --git a/arch/arm64/crypto/aes-ctr-fallback.h b/arch/arm64/crypto/aes-ctr-fallback.h new file mode 100644 index 000000000000..c9285717b6b5 --- /dev/null +++ b/arch/arm64/crypto/aes-ctr-fallback.h @@ -0,0 +1,53 @@ +/* + * Fallback for sync aes(ctr) in contexts where kernel mode NEON + * is not allowed + * + * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <crypto/aes.h> +#include <crypto/internal/skcipher.h> + +asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); + +static inline int aes_ctr_encrypt_fallback(struct crypto_aes_ctx *ctx, + struct skcipher_request *req) +{ + struct skcipher_walk walk; + u8 buf[AES_BLOCK_SIZE]; + int err; + + err = skcipher_walk_virt(&walk, req, true); + + while (walk.nbytes > 0) { + u8 *dst = walk.dst.virt.addr; + u8 *src = walk.src.virt.addr; + int nbytes = walk.nbytes; + int tail = 0; + + if (nbytes < walk.total) { + nbytes = round_down(nbytes, AES_BLOCK_SIZE); + tail = walk.nbytes % AES_BLOCK_SIZE; + } + + do { + int bsize = min(nbytes, AES_BLOCK_SIZE); + + __aes_arm64_encrypt(ctx->key_enc, buf, walk.iv, + 6 + ctx->key_length / 4); + crypto_xor_cpy(dst, src, buf, bsize); + crypto_inc(walk.iv, AES_BLOCK_SIZE); + + dst += AES_BLOCK_SIZE; + src += AES_BLOCK_SIZE; + nbytes -= AES_BLOCK_SIZE; + } while (nbytes > 0); + + err = skcipher_walk_done(&walk, tail); + } + return err; +} diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index bcf596b0197e..998ba519a026 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -10,6 +10,7 @@ #include <asm/neon.h> #include <asm/hwcap.h> +#include <asm/simd.h> #include <crypto/aes.h> #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> @@ -19,6 +20,7 @@ #include <crypto/xts.h> #include "aes-ce-setkey.h" +#include "aes-ctr-fallback.h" #ifdef USE_V8_CRYPTO_EXTENSIONS #define MODE "ce" @@ -241,9 +243,7 @@ static int ctr_encrypt(struct skcipher_request *req) aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, rounds, blocks, walk.iv, first); - if (tdst != tsrc) - memcpy(tdst, tsrc, nbytes); - crypto_xor(tdst, tail, nbytes); + crypto_xor_cpy(tdst, tsrc, tail, nbytes); err = skcipher_walk_done(&walk, 0); } kernel_neon_end(); @@ -251,6 +251,17 @@ static int ctr_encrypt(struct skcipher_request *req) return err; } +static int ctr_encrypt_sync(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + + if (!may_use_simd()) + return aes_ctr_encrypt_fallback(ctx, req); + + return ctr_encrypt(req); +} + static int xts_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); @@ -357,8 +368,8 @@ static struct skcipher_alg aes_algs[] = { { .ivsize = AES_BLOCK_SIZE, .chunksize = AES_BLOCK_SIZE, .setkey = skcipher_aes_setkey, - .encrypt = ctr_encrypt, - .decrypt = ctr_encrypt, + .encrypt = ctr_encrypt_sync, + .decrypt = ctr_encrypt_sync, }, { .base = { .cra_name = "__xts(aes)", @@ -460,11 +471,35 @@ static int mac_init(struct shash_desc *desc) return 0; } +static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks, + u8 dg[], int enc_before, int enc_after) +{ + int rounds = 6 + ctx->key_length / 4; + + if (may_use_simd()) { + kernel_neon_begin(); + aes_mac_update(in, ctx->key_enc, rounds, blocks, dg, enc_before, + enc_after); + kernel_neon_end(); + } else { + if (enc_before) + __aes_arm64_encrypt(ctx->key_enc, dg, dg, rounds); + + while (blocks--) { + crypto_xor(dg, in, AES_BLOCK_SIZE); + in += AES_BLOCK_SIZE; + + if (blocks || enc_after) + __aes_arm64_encrypt(ctx->key_enc, dg, dg, + rounds); + } + } +} + static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len) { struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct mac_desc_ctx *ctx = shash_desc_ctx(desc); - int rounds = 6 + tctx->key.key_length / 4; while (len > 0) { unsigned int l; @@ -476,10 +511,8 @@ static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len) len %= AES_BLOCK_SIZE; - kernel_neon_begin(); - aes_mac_update(p, tctx->key.key_enc, rounds, blocks, - ctx->dg, (ctx->len != 0), (len != 0)); - kernel_neon_end(); + mac_do_update(&tctx->key, p, blocks, ctx->dg, + (ctx->len != 0), (len != 0)); p += blocks * AES_BLOCK_SIZE; @@ -507,11 +540,8 @@ static int cbcmac_final(struct shash_desc *desc, u8 *out) { struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct mac_desc_ctx *ctx = shash_desc_ctx(desc); - int rounds = 6 + tctx->key.key_length / 4; - kernel_neon_begin(); - aes_mac_update(NULL, tctx->key.key_enc, rounds, 0, ctx->dg, 1, 0); - kernel_neon_end(); + mac_do_update(&tctx->key, NULL, 0, ctx->dg, 1, 0); memcpy(out, ctx->dg, AES_BLOCK_SIZE); @@ -522,7 +552,6 @@ static int cmac_final(struct shash_desc *desc, u8 *out) { struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct mac_desc_ctx *ctx = shash_desc_ctx(desc); - int rounds = 6 + tctx->key.key_length / 4; u8 *consts = tctx->consts; if (ctx->len != AES_BLOCK_SIZE) { @@ -530,9 +559,7 @@ static int cmac_final(struct shash_desc *desc, u8 *out) consts += AES_BLOCK_SIZE; } - kernel_neon_begin(); - aes_mac_update(consts, tctx->key.key_enc, rounds, 1, ctx->dg, 0, 1); - kernel_neon_end(); + mac_do_update(&tctx->key, consts, 1, ctx->dg, 0, 1); memcpy(out, ctx->dg, AES_BLOCK_SIZE); diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index db2501d93550..c55d68ccb89f 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -1,7 +1,7 @@ /* * Bit sliced AES using NEON instructions * - * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -9,12 +9,15 @@ */ #include <asm/neon.h> +#include <asm/simd.h> #include <crypto/aes.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/xts.h> #include <linux/module.h> +#include "aes-ctr-fallback.h" + MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); @@ -58,6 +61,11 @@ struct aesbs_cbc_ctx { u32 enc[AES_MAX_KEYLENGTH_U32]; }; +struct aesbs_ctr_ctx { + struct aesbs_ctx key; /* must be first member */ + struct crypto_aes_ctx fallback; +}; + struct aesbs_xts_ctx { struct aesbs_ctx key; u32 twkey[AES_MAX_KEYLENGTH_U32]; @@ -196,6 +204,25 @@ static int cbc_decrypt(struct skcipher_request *req) return err; } +static int aesbs_ctr_setkey_sync(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); + int err; + + err = crypto_aes_expand_key(&ctx->fallback, in_key, key_len); + if (err) + return err; + + ctx->key.rounds = 6 + key_len / 4; + + kernel_neon_begin(); + aesbs_convert_key(ctx->key.rk, ctx->fallback.key_enc, ctx->key.rounds); + kernel_neon_end(); + + return 0; +} + static int ctr_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); @@ -224,9 +251,8 @@ static int ctr_encrypt(struct skcipher_request *req) u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; - if (dst != src) - memcpy(dst, src, walk.total % AES_BLOCK_SIZE); - crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE); + crypto_xor_cpy(dst, src, final, + walk.total % AES_BLOCK_SIZE); err = skcipher_walk_done(&walk, 0); break; @@ -260,6 +286,17 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key, return aesbs_setkey(tfm, in_key, key_len); } +static int ctr_encrypt_sync(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); + + if (!may_use_simd()) + return aes_ctr_encrypt_fallback(&ctx->fallback, req); + + return ctr_encrypt(req); +} + static int __xts_crypt(struct skcipher_request *req, void (*fn)(u8 out[], u8 const in[], u8 const rk[], int rounds, int blocks, u8 iv[])) @@ -356,7 +393,7 @@ static struct skcipher_alg aes_algs[] = { { .base.cra_driver_name = "ctr-aes-neonbs", .base.cra_priority = 250 - 1, .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct aesbs_ctx), + .base.cra_ctxsize = sizeof(struct aesbs_ctr_ctx), .base.cra_module = THIS_MODULE, .min_keysize = AES_MIN_KEY_SIZE, @@ -364,9 +401,9 @@ static struct skcipher_alg aes_algs[] = { { .chunksize = AES_BLOCK_SIZE, .walksize = 8 * AES_BLOCK_SIZE, .ivsize = AES_BLOCK_SIZE, - .setkey = aesbs_setkey, - .encrypt = ctr_encrypt, - .decrypt = ctr_encrypt, + .setkey = aesbs_ctr_setkey_sync, + .encrypt = ctr_encrypt_sync, + .decrypt = ctr_encrypt_sync, }, { .base.cra_name = "__xts(aes)", .base.cra_driver_name = "__xts-aes-neonbs", diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c index a7cd575ea223..cbdb75d15cd0 100644 --- a/arch/arm64/crypto/chacha20-neon-glue.c +++ b/arch/arm64/crypto/chacha20-neon-glue.c @@ -1,7 +1,7 @@ /* * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions * - * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -26,6 +26,7 @@ #include <asm/hwcap.h> #include <asm/neon.h> +#include <asm/simd.h> asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); @@ -64,7 +65,7 @@ static int chacha20_neon(struct skcipher_request *req) u32 state[16]; int err; - if (req->cryptlen <= CHACHA20_BLOCK_SIZE) + if (!may_use_simd() || req->cryptlen <= CHACHA20_BLOCK_SIZE) return crypto_chacha20_crypt(req); err = skcipher_walk_virt(&walk, req, true); diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c index eccb1ae90064..624f4137918c 100644 --- a/arch/arm64/crypto/crc32-ce-glue.c +++ b/arch/arm64/crypto/crc32-ce-glue.c @@ -1,7 +1,7 @@ /* * Accelerated CRC32(C) using arm64 NEON and Crypto Extensions instructions * - * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,6 +19,7 @@ #include <asm/hwcap.h> #include <asm/neon.h> +#include <asm/simd.h> #include <asm/unaligned.h> #define PMULL_MIN_LEN 64L /* minimum size of buffer @@ -105,10 +106,10 @@ static int crc32_pmull_update(struct shash_desc *desc, const u8 *data, length -= l; } - if (length >= PMULL_MIN_LEN) { + if (length >= PMULL_MIN_LEN && may_use_simd()) { l = round_down(length, SCALE_F); - kernel_neon_begin_partial(10); + kernel_neon_begin(); *crc = crc32_pmull_le(data, l, *crc); kernel_neon_end(); @@ -137,10 +138,10 @@ static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data, length -= l; } - if (length >= PMULL_MIN_LEN) { + if (length >= PMULL_MIN_LEN && may_use_simd()) { l = round_down(length, SCALE_F); - kernel_neon_begin_partial(10); + kernel_neon_begin(); *crc = crc32c_pmull_le(data, l, *crc); kernel_neon_end(); diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c index 60cb590c2590..96f0cae4a022 100644 --- a/arch/arm64/crypto/crct10dif-ce-glue.c +++ b/arch/arm64/crypto/crct10dif-ce-glue.c @@ -1,7 +1,7 @@ /* * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions * - * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -18,6 +18,7 @@ #include <crypto/internal/hash.h> #include <asm/neon.h> +#include <asm/simd.h> #define CRC_T10DIF_PMULL_CHUNK_SIZE 16U @@ -48,9 +49,13 @@ static int crct10dif_update(struct shash_desc *desc, const u8 *data, } if (length > 0) { - kernel_neon_begin_partial(14); - *crc = crc_t10dif_pmull(*crc, data, length); - kernel_neon_end(); + if (may_use_simd()) { + kernel_neon_begin(); + *crc = crc_t10dif_pmull(*crc, data, length); + kernel_neon_end(); + } else { + *crc = crc_t10dif_generic(*crc, data, length); + } } return 0; diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index f0bb9f0b524f..11ebf1ae248a 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -1,7 +1,7 @@ /* * Accelerated GHASH implementation with ARMv8 PMULL instructions. * - * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> + * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -11,31 +11,215 @@ #include <linux/linkage.h> #include <asm/assembler.h> - SHASH .req v0 - SHASH2 .req v1 - T1 .req v2 - T2 .req v3 - MASK .req v4 - XL .req v5 - XM .req v6 - XH .req v7 - IN1 .req v7 + SHASH .req v0 + SHASH2 .req v1 + T1 .req v2 + T2 .req v3 + MASK .req v4 + XL .req v5 + XM .req v6 + XH .req v7 + IN1 .req v7 + + k00_16 .req v8 + k32_48 .req v9 + + t3 .req v10 + t4 .req v11 + t5 .req v12 + t6 .req v13 + t7 .req v14 + t8 .req v15 + t9 .req v16 + + perm1 .req v17 + perm2 .req v18 + perm3 .req v19 + + sh1 .req v20 + sh2 .req v21 + sh3 .req v22 + sh4 .req v23 + + ss1 .req v24 + ss2 .req v25 + ss3 .req v26 + ss4 .req v27 .text .arch armv8-a+crypto - /* - * void pmull_ghash_update(int blocks, u64 dg[], const char *src, - * struct ghash_key const *k, const char *head) - */ -ENTRY(pmull_ghash_update) + .macro __pmull_p64, rd, rn, rm + pmull \rd\().1q, \rn\().1d, \rm\().1d + .endm + + .macro __pmull2_p64, rd, rn, rm + pmull2 \rd\().1q, \rn\().2d, \rm\().2d + .endm + + .macro __pmull_p8, rq, ad, bd + ext t3.8b, \ad\().8b, \ad\().8b, #1 // A1 + ext t5.8b, \ad\().8b, \ad\().8b, #2 // A2 + ext t7.8b, \ad\().8b, \ad\().8b, #3 // A3 + + __pmull_p8_\bd \rq, \ad + .endm + + .macro __pmull2_p8, rq, ad, bd + tbl t3.16b, {\ad\().16b}, perm1.16b // A1 + tbl t5.16b, {\ad\().16b}, perm2.16b // A2 + tbl t7.16b, {\ad\().16b}, perm3.16b // A3 + + __pmull2_p8_\bd \rq, \ad + .endm + + .macro __pmull_p8_SHASH, rq, ad + __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4 + .endm + + .macro __pmull_p8_SHASH2, rq, ad + __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4 + .endm + + .macro __pmull2_p8_SHASH, rq, ad + __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4 + .endm + + .macro __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4 + pmull\t t3.8h, t3.\nb, \bd // F = A1*B + pmull\t t4.8h, \ad, \b1\().\nb // E = A*B1 + pmull\t t5.8h, t5.\nb, \bd // H = A2*B + pmull\t t6.8h, \ad, \b2\().\nb // G = A*B2 + pmull\t t7.8h, t7.\nb, \bd // J = A3*B + pmull\t t8.8h, \ad, \b3\().\nb // I = A*B3 + pmull\t t9.8h, \ad, \b4\().\nb // K = A*B4 + pmull\t \rq\().8h, \ad, \bd // D = A*B + + eor t3.16b, t3.16b, t4.16b // L = E + F + eor t5.16b, t5.16b, t6.16b // M = G + H + eor t7.16b, t7.16b, t8.16b // N = I + J + + uzp1 t4.2d, t3.2d, t5.2d + uzp2 t3.2d, t3.2d, t5.2d + uzp1 t6.2d, t7.2d, t9.2d + uzp2 t7.2d, t7.2d, t9.2d + + // t3 = (L) (P0 + P1) << 8 + // t5 = (M) (P2 + P3) << 16 + eor t4.16b, t4.16b, t3.16b + and t3.16b, t3.16b, k32_48.16b + + // t7 = (N) (P4 + P5) << 24 + // t9 = (K) (P6 + P7) << 32 + eor t6.16b, t6.16b, t7.16b + and t7.16b, t7.16b, k00_16.16b + + eor t4.16b, t4.16b, t3.16b + eor t6.16b, t6.16b, t7.16b + + zip2 t5.2d, t4.2d, t3.2d + zip1 t3.2d, t4.2d, t3.2d + zip2 t9.2d, t6.2d, t7.2d + zip1 t7.2d, t6.2d, t7.2d + + ext t3.16b, t3.16b, t3.16b, #15 + ext t5.16b, t5.16b, t5.16b, #14 + ext t7.16b, t7.16b, t7.16b, #13 + ext t9.16b, t9.16b, t9.16b, #12 + + eor t3.16b, t3.16b, t5.16b + eor t7.16b, t7.16b, t9.16b + eor \rq\().16b, \rq\().16b, t3.16b + eor \rq\().16b, \rq\().16b, t7.16b + .endm + + .macro __pmull_pre_p64 + movi MASK.16b, #0xe1 + shl MASK.2d, MASK.2d, #57 + .endm + + .macro __pmull_pre_p8 + // k00_16 := 0x0000000000000000_000000000000ffff + // k32_48 := 0x00000000ffffffff_0000ffffffffffff + movi k32_48.2d, #0xffffffff + mov k32_48.h[2], k32_48.h[0] + ushr k00_16.2d, k32_48.2d, #32 + + // prepare the permutation vectors + mov_q x5, 0x080f0e0d0c0b0a09 + movi T1.8b, #8 + dup perm1.2d, x5 + eor perm1.16b, perm1.16b, T1.16b + ushr perm2.2d, perm1.2d, #8 + ushr perm3.2d, perm1.2d, #16 + ushr T1.2d, perm1.2d, #24 + sli perm2.2d, perm1.2d, #56 + sli perm3.2d, perm1.2d, #48 + sli T1.2d, perm1.2d, #40 + + // precompute loop invariants + tbl sh1.16b, {SHASH.16b}, perm1.16b + tbl sh2.16b, {SHASH.16b}, perm2.16b + tbl sh3.16b, {SHASH.16b}, perm3.16b + tbl sh4.16b, {SHASH.16b}, T1.16b + ext ss1.8b, SHASH2.8b, SHASH2.8b, #1 + ext ss2.8b, SHASH2.8b, SHASH2.8b, #2 + ext ss3.8b, SHASH2.8b, SHASH2.8b, #3 + ext ss4.8b, SHASH2.8b, SHASH2.8b, #4 + .endm + + // + // PMULL (64x64->128) based reduction for CPUs that can do + // it in a single instruction. + // + .macro __pmull_reduce_p64 + pmull T2.1q, XL.1d, MASK.1d + eor XM.16b, XM.16b, T1.16b + + mov XH.d[0], XM.d[1] + mov XM.d[1], XL.d[0] + + eor XL.16b, XM.16b, T2.16b + ext T2.16b, XL.16b, XL.16b, #8 + pmull XL.1q, XL.1d, MASK.1d + .endm + + // + // Alternative reduction for CPUs that lack support for the + // 64x64->128 PMULL instruction + // + .macro __pmull_reduce_p8 + eor XM.16b, XM.16b, T1.16b + + mov XL.d[1], XM.d[0] + mov XH.d[0], XM.d[1] + + shl T1.2d, XL.2d, #57 + shl T2.2d, XL.2d, #62 + eor T2.16b, T2.16b, T1.16b + shl T1.2d, XL.2d, #63 + eor T2.16b, T2.16b, T1.16b + ext T1.16b, XL.16b, XH.16b, #8 + eor T2.16b, T2.16b, T1.16b + + mov XL.d[1], T2.d[0] + mov XH.d[0], T2.d[1] + + ushr T2.2d, XL.2d, #1 + eor XH.16b, XH.16b, XL.16b + eor XL.16b, XL.16b, T2.16b + ushr T2.2d, T2.2d, #6 + ushr XL.2d, XL.2d, #1 + .endm + + .macro __pmull_ghash, pn ld1 {SHASH.2d}, [x3] ld1 {XL.2d}, [x1] - movi MASK.16b, #0xe1 ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 - shl MASK.2d, MASK.2d, #57 eor SHASH2.16b, SHASH2.16b, SHASH.16b + __pmull_pre_\pn + /* do the head block first, if supplied */ cbz x4, 0f ld1 {T1.2d}, [x4] @@ -52,28 +236,209 @@ CPU_LE( rev64 T1.16b, T1.16b ) eor T1.16b, T1.16b, T2.16b eor XL.16b, XL.16b, IN1.16b + __pmull2_\pn XH, XL, SHASH // a1 * b1 + eor T1.16b, T1.16b, XL.16b + __pmull_\pn XL, XL, SHASH // a0 * b0 + __pmull_\pn XM, T1, SHASH2 // (a1 + a0)(b1 + b0) + + eor T2.16b, XL.16b, XH.16b + ext T1.16b, XL.16b, XH.16b, #8 + eor XM.16b, XM.16b, T2.16b + + __pmull_reduce_\pn + + eor T2.16b, T2.16b, XH.16b + eor XL.16b, XL.16b, T2.16b + + cbnz w0, 0b + + st1 {XL.2d}, [x1] + ret + .endm + + /* + * void pmull_ghash_update(int blocks, u64 dg[], const char *src, + * struct ghash_key const *k, const char *head) + */ +ENTRY(pmull_ghash_update_p64) + __pmull_ghash p64 +ENDPROC(pmull_ghash_update_p64) + +ENTRY(pmull_ghash_update_p8) + __pmull_ghash p8 +ENDPROC(pmull_ghash_update_p8) + + KS .req v8 + CTR .req v9 + INP .req v10 + + .macro load_round_keys, rounds, rk + cmp \rounds, #12 + blo 2222f /* 128 bits */ + beq 1111f /* 192 bits */ + ld1 {v17.4s-v18.4s}, [\rk], #32 +1111: ld1 {v19.4s-v20.4s}, [\rk], #32 +2222: ld1 {v21.4s-v24.4s}, [\rk], #64 + ld1 {v25.4s-v28.4s}, [\rk], #64 + ld1 {v29.4s-v31.4s}, [\rk] + .endm + + .macro enc_round, state, key + aese \state\().16b, \key\().16b + aesmc \state\().16b, \state\().16b + .endm + + .macro enc_block, state, rounds + cmp \rounds, #12 + b.lo 2222f /* 128 bits */ + b.eq 1111f /* 192 bits */ + enc_round \state, v17 + enc_round \state, v18 +1111: enc_round \state, v19 + enc_round \state, v20 +2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 + enc_round \state, \key + .endr + aese \state\().16b, v30.16b + eor \state\().16b, \state\().16b, v31.16b + .endm + + .macro pmull_gcm_do_crypt, enc + ld1 {SHASH.2d}, [x4] + ld1 {XL.2d}, [x1] + ldr x8, [x5, #8] // load lower counter + + movi MASK.16b, #0xe1 + ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 +CPU_LE( rev x8, x8 ) + shl MASK.2d, MASK.2d, #57 + eor SHASH2.16b, SHASH2.16b, SHASH.16b + + .if \enc == 1 + ld1 {KS.16b}, [x7] + .endif + +0: ld1 {CTR.8b}, [x5] // load upper counter + ld1 {INP.16b}, [x3], #16 + rev x9, x8 + add x8, x8, #1 + sub w0, w0, #1 + ins CTR.d[1], x9 // set lower counter + + .if \enc == 1 + eor INP.16b, INP.16b, KS.16b // encrypt input + st1 {INP.16b}, [x2], #16 + .endif + + rev64 T1.16b, INP.16b + + cmp w6, #12 + b.ge 2f // AES-192/256? + +1: enc_round CTR, v21 + + ext T2.16b, XL.16b, XL.16b, #8 + ext IN1.16b, T1.16b, T1.16b, #8 + + enc_round CTR, v22 + + eor T1.16b, T1.16b, T2.16b + eor XL.16b, XL.16b, IN1.16b + + enc_round CTR, v23 + pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1 eor T1.16b, T1.16b, XL.16b + + enc_round CTR, v24 + pmull XL.1q, SHASH.1d, XL.1d // a0 * b0 pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) + enc_round CTR, v25 + ext T1.16b, XL.16b, XH.16b, #8 eor T2.16b, XL.16b, XH.16b eor XM.16b, XM.16b, T1.16b + + enc_round CTR, v26 + eor XM.16b, XM.16b, T2.16b pmull T2.1q, XL.1d, MASK.1d + enc_round CTR, v27 + mov XH.d[0], XM.d[1] mov XM.d[1], XL.d[0] + enc_round CTR, v28 + eor XL.16b, XM.16b, T2.16b + + enc_round CTR, v29 + ext T2.16b, XL.16b, XL.16b, #8 + + aese CTR.16b, v30.16b + pmull XL.1q, XL.1d, MASK.1d eor T2.16b, T2.16b, XH.16b + + eor KS.16b, CTR.16b, v31.16b + eor XL.16b, XL.16b, T2.16b + .if \enc == 0 + eor INP.16b, INP.16b, KS.16b + st1 {INP.16b}, [x2], #16 + .endif + cbnz w0, 0b +CPU_LE( rev x8, x8 ) st1 {XL.2d}, [x1] + str x8, [x5, #8] // store lower counter + + .if \enc == 1 + st1 {KS.16b}, [x7] + .endif + + ret + +2: b.eq 3f // AES-192? + enc_round CTR, v17 + enc_round CTR, v18 +3: enc_round CTR, v19 + enc_round CTR, v20 + b 1b + .endm + + /* + * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[], + * struct ghash_key const *k, u8 ctr[], + * int rounds, u8 ks[]) + */ +ENTRY(pmull_gcm_encrypt) + pmull_gcm_do_crypt 1 +ENDPROC(pmull_gcm_encrypt) + + /* + * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[], + * struct ghash_key const *k, u8 ctr[], + * int rounds) + */ +ENTRY(pmull_gcm_decrypt) + pmull_gcm_do_crypt 0 +ENDPROC(pmull_gcm_decrypt) + + /* + * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds) + */ +ENTRY(pmull_gcm_encrypt_block) + cbz x2, 0f + load_round_keys w3, x2 +0: ld1 {v0.16b}, [x1] + enc_block v0, w3 + st1 {v0.16b}, [x0] ret -ENDPROC(pmull_ghash_update) +ENDPROC(pmull_gcm_encrypt_block) diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 833ec1e3f3e9..cfc9c92814fd 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -1,7 +1,7 @@ /* * Accelerated GHASH implementation with ARMv8 PMULL instructions. * - * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> + * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -9,22 +9,33 @@ */ #include <asm/neon.h> +#include <asm/simd.h> #include <asm/unaligned.h> +#include <crypto/aes.h> +#include <crypto/algapi.h> +#include <crypto/b128ops.h> +#include <crypto/gf128mul.h> +#include <crypto/internal/aead.h> #include <crypto/internal/hash.h> +#include <crypto/internal/skcipher.h> +#include <crypto/scatterwalk.h> #include <linux/cpufeature.h> #include <linux/crypto.h> #include <linux/module.h> -MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions"); +MODULE_DESCRIPTION("GHASH and AES-GCM using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("ghash"); #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 +#define GCM_IV_SIZE 12 struct ghash_key { u64 a; u64 b; + be128 k; }; struct ghash_desc_ctx { @@ -33,8 +44,35 @@ struct ghash_desc_ctx { u32 count; }; -asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src, - struct ghash_key const *k, const char *head); +struct gcm_aes_ctx { + struct crypto_aes_ctx aes_key; + struct ghash_key ghash_key; +}; + +asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, + const char *head); + +asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, + const char *head); + +static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, + const char *head); + +asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], + const u8 src[], struct ghash_key const *k, + u8 ctr[], int rounds, u8 ks[]); + +asmlinkage void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], + const u8 src[], struct ghash_key const *k, + u8 ctr[], int rounds); + +asmlinkage void pmull_gcm_encrypt_block(u8 dst[], u8 const src[], + u32 const rk[], int rounds); + +asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); static int ghash_init(struct shash_desc *desc) { @@ -44,6 +82,36 @@ static int ghash_init(struct shash_desc *desc) return 0; } +static void ghash_do_update(int blocks, u64 dg[], const char *src, + struct ghash_key *key, const char *head) +{ + if (likely(may_use_simd())) { + kernel_neon_begin(); + pmull_ghash_update(blocks, dg, src, key, head); + kernel_neon_end(); + } else { + be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) }; + + do { + const u8 *in = src; + + if (head) { + in = head; + blocks++; + head = NULL; + } else { + src += GHASH_BLOCK_SIZE; + } + + crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE); + gf128mul_lle(&dst, &key->k); + } while (--blocks); + + dg[0] = be64_to_cpu(dst.b); + dg[1] = be64_to_cpu(dst.a); + } +} + static int ghash_update(struct shash_desc *desc, const u8 *src, unsigned int len) { @@ -67,10 +135,9 @@ static int ghash_update(struct shash_desc *desc, const u8 *src, blocks = len / GHASH_BLOCK_SIZE; len %= GHASH_BLOCK_SIZE; - kernel_neon_begin_partial(8); - pmull_ghash_update(blocks, ctx->digest, src, key, - partial ? ctx->buf : NULL); - kernel_neon_end(); + ghash_do_update(blocks, ctx->digest, src, key, + partial ? ctx->buf : NULL); + src += blocks * GHASH_BLOCK_SIZE; partial = 0; } @@ -89,9 +156,7 @@ static int ghash_final(struct shash_desc *desc, u8 *dst) memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); - kernel_neon_begin_partial(8); - pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL); - kernel_neon_end(); + ghash_do_update(1, ctx->digest, ctx->buf, key, NULL); } put_unaligned_be64(ctx->digest[1], dst); put_unaligned_be64(ctx->digest[0], dst + 8); @@ -100,16 +165,13 @@ static int ghash_final(struct shash_desc *desc, u8 *dst) return 0; } -static int ghash_setkey(struct crypto_shash *tfm, - const u8 *inkey, unsigned int keylen) +static int __ghash_setkey(struct ghash_key *key, + const u8 *inkey, unsigned int keylen) { - struct ghash_key *key = crypto_shash_ctx(tfm); u64 a, b; - if (keylen != GHASH_BLOCK_SIZE) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } + /* needed for the fallback */ + memcpy(&key->k, inkey, GHASH_BLOCK_SIZE); /* perform multiplication by 'x' in GF(2^128) */ b = get_unaligned_be64(inkey); @@ -124,33 +186,418 @@ static int ghash_setkey(struct crypto_shash *tfm, return 0; } +static int ghash_setkey(struct crypto_shash *tfm, + const u8 *inkey, unsigned int keylen) +{ + struct ghash_key *key = crypto_shash_ctx(tfm); + + if (keylen != GHASH_BLOCK_SIZE) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + return __ghash_setkey(key, inkey, keylen); +} + static struct shash_alg ghash_alg = { - .digestsize = GHASH_DIGEST_SIZE, - .init = ghash_init, - .update = ghash_update, - .final = ghash_final, - .setkey = ghash_setkey, - .descsize = sizeof(struct ghash_desc_ctx), - .base = { - .cra_name = "ghash", - .cra_driver_name = "ghash-ce", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, - .cra_blocksize = GHASH_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct ghash_key), - .cra_module = THIS_MODULE, - }, + .base.cra_name = "ghash", + .base.cra_driver_name = "ghash-ce", + .base.cra_priority = 200, + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = GHASH_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct ghash_key), + .base.cra_module = THIS_MODULE, + + .digestsize = GHASH_DIGEST_SIZE, + .init = ghash_init, + .update = ghash_update, + .final = ghash_final, + .setkey = ghash_setkey, + .descsize = sizeof(struct ghash_desc_ctx), +}; + +static int num_rounds(struct crypto_aes_ctx *ctx) +{ + /* + * # of rounds specified by AES: + * 128 bit key 10 rounds + * 192 bit key 12 rounds + * 256 bit key 14 rounds + * => n byte key => 6 + (n/4) rounds + */ + return 6 + ctx->key_length / 4; +} + +static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, + unsigned int keylen) +{ + struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm); + u8 key[GHASH_BLOCK_SIZE]; + int ret; + + ret = crypto_aes_expand_key(&ctx->aes_key, inkey, keylen); + if (ret) { + tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + __aes_arm64_encrypt(ctx->aes_key.key_enc, key, (u8[AES_BLOCK_SIZE]){}, + num_rounds(&ctx->aes_key)); + + return __ghash_setkey(&ctx->ghash_key, key, sizeof(key)); +} + +static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) +{ + switch (authsize) { + case 4: + case 8: + case 12 ... 16: + break; + default: + return -EINVAL; + } + return 0; +} + +static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[], + int *buf_count, struct gcm_aes_ctx *ctx) +{ + if (*buf_count > 0) { + int buf_added = min(count, GHASH_BLOCK_SIZE - *buf_count); + + memcpy(&buf[*buf_count], src, buf_added); + + *buf_count += buf_added; + src += buf_added; + count -= buf_added; + } + + if (count >= GHASH_BLOCK_SIZE || *buf_count == GHASH_BLOCK_SIZE) { + int blocks = count / GHASH_BLOCK_SIZE; + + ghash_do_update(blocks, dg, src, &ctx->ghash_key, + *buf_count ? buf : NULL); + + src += blocks * GHASH_BLOCK_SIZE; + count %= GHASH_BLOCK_SIZE; + *buf_count = 0; + } + + if (count > 0) { + memcpy(buf, src, count); + *buf_count = count; + } +} + +static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[]) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); + u8 buf[GHASH_BLOCK_SIZE]; + struct scatter_walk walk; + u32 len = req->assoclen; + int buf_count = 0; + + scatterwalk_start(&walk, req->src); + + do { + u32 n = scatterwalk_clamp(&walk, len); + u8 *p; + + if (!n) { + scatterwalk_start(&walk, sg_next(walk.sg)); + n = scatterwalk_clamp(&walk, len); + } + p = scatterwalk_map(&walk); + + gcm_update_mac(dg, p, n, buf, &buf_count, ctx); + len -= n; + + scatterwalk_unmap(p); + scatterwalk_advance(&walk, n); + scatterwalk_done(&walk, 0, len); + } while (len); + + if (buf_count) { + memset(&buf[buf_count], 0, GHASH_BLOCK_SIZE - buf_count); + ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL); + } +} + +static void gcm_final(struct aead_request *req, struct gcm_aes_ctx *ctx, + u64 dg[], u8 tag[], int cryptlen) +{ + u8 mac[AES_BLOCK_SIZE]; + u128 lengths; + + lengths.a = cpu_to_be64(req->assoclen * 8); + lengths.b = cpu_to_be64(cryptlen * 8); + + ghash_do_update(1, dg, (void *)&lengths, &ctx->ghash_key, NULL); + + put_unaligned_be64(dg[1], mac); + put_unaligned_be64(dg[0], mac + 8); + + crypto_xor(tag, mac, AES_BLOCK_SIZE); +} + +static int gcm_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); + struct skcipher_walk walk; + u8 iv[AES_BLOCK_SIZE]; + u8 ks[AES_BLOCK_SIZE]; + u8 tag[AES_BLOCK_SIZE]; + u64 dg[2] = {}; + int err; + + if (req->assoclen) + gcm_calculate_auth_mac(req, dg); + + memcpy(iv, req->iv, GCM_IV_SIZE); + put_unaligned_be32(1, iv + GCM_IV_SIZE); + + if (likely(may_use_simd())) { + kernel_neon_begin(); + + pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, + num_rounds(&ctx->aes_key)); + put_unaligned_be32(2, iv + GCM_IV_SIZE); + pmull_gcm_encrypt_block(ks, iv, NULL, + num_rounds(&ctx->aes_key)); + put_unaligned_be32(3, iv + GCM_IV_SIZE); + + err = skcipher_walk_aead_encrypt(&walk, req, true); + + while (walk.nbytes >= AES_BLOCK_SIZE) { + int blocks = walk.nbytes / AES_BLOCK_SIZE; + + pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr, + walk.src.virt.addr, &ctx->ghash_key, + iv, num_rounds(&ctx->aes_key), ks); + + err = skcipher_walk_done(&walk, + walk.nbytes % AES_BLOCK_SIZE); + } + kernel_neon_end(); + } else { + __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, + num_rounds(&ctx->aes_key)); + put_unaligned_be32(2, iv + GCM_IV_SIZE); + + err = skcipher_walk_aead_encrypt(&walk, req, true); + + while (walk.nbytes >= AES_BLOCK_SIZE) { + int blocks = walk.nbytes / AES_BLOCK_SIZE; + u8 *dst = walk.dst.virt.addr; + u8 *src = walk.src.virt.addr; + + do { + __aes_arm64_encrypt(ctx->aes_key.key_enc, + ks, iv, + num_rounds(&ctx->aes_key)); + crypto_xor_cpy(dst, src, ks, AES_BLOCK_SIZE); + crypto_inc(iv, AES_BLOCK_SIZE); + + dst += AES_BLOCK_SIZE; + src += AES_BLOCK_SIZE; + } while (--blocks > 0); + + ghash_do_update(walk.nbytes / AES_BLOCK_SIZE, dg, + walk.dst.virt.addr, &ctx->ghash_key, + NULL); + + err = skcipher_walk_done(&walk, + walk.nbytes % AES_BLOCK_SIZE); + } + if (walk.nbytes) + __aes_arm64_encrypt(ctx->aes_key.key_enc, ks, iv, + num_rounds(&ctx->aes_key)); + } + + /* handle the tail */ + if (walk.nbytes) { + u8 buf[GHASH_BLOCK_SIZE]; + + crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, ks, + walk.nbytes); + + memcpy(buf, walk.dst.virt.addr, walk.nbytes); + memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes); + ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL); + + err = skcipher_walk_done(&walk, 0); + } + + if (err) + return err; + + gcm_final(req, ctx, dg, tag, req->cryptlen); + + /* copy authtag to end of dst */ + scatterwalk_map_and_copy(tag, req->dst, req->assoclen + req->cryptlen, + crypto_aead_authsize(aead), 1); + + return 0; +} + +static int gcm_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); + unsigned int authsize = crypto_aead_authsize(aead); + struct skcipher_walk walk; + u8 iv[AES_BLOCK_SIZE]; + u8 tag[AES_BLOCK_SIZE]; + u8 buf[GHASH_BLOCK_SIZE]; + u64 dg[2] = {}; + int err; + + if (req->assoclen) + gcm_calculate_auth_mac(req, dg); + + memcpy(iv, req->iv, GCM_IV_SIZE); + put_unaligned_be32(1, iv + GCM_IV_SIZE); + + if (likely(may_use_simd())) { + kernel_neon_begin(); + + pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, + num_rounds(&ctx->aes_key)); + put_unaligned_be32(2, iv + GCM_IV_SIZE); + + err = skcipher_walk_aead_decrypt(&walk, req, true); + + while (walk.nbytes >= AES_BLOCK_SIZE) { + int blocks = walk.nbytes / AES_BLOCK_SIZE; + + pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr, + walk.src.virt.addr, &ctx->ghash_key, + iv, num_rounds(&ctx->aes_key)); + + err = skcipher_walk_done(&walk, + walk.nbytes % AES_BLOCK_SIZE); + } + if (walk.nbytes) + pmull_gcm_encrypt_block(iv, iv, NULL, + num_rounds(&ctx->aes_key)); + + kernel_neon_end(); + } else { + __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, + num_rounds(&ctx->aes_key)); + put_unaligned_be32(2, iv + GCM_IV_SIZE); + + err = skcipher_walk_aead_decrypt(&walk, req, true); + + while (walk.nbytes >= AES_BLOCK_SIZE) { + int blocks = walk.nbytes / AES_BLOCK_SIZE; + u8 *dst = walk.dst.virt.addr; + u8 *src = walk.src.virt.addr; + + ghash_do_update(blocks, dg, walk.src.virt.addr, + &ctx->ghash_key, NULL); + + do { + __aes_arm64_encrypt(ctx->aes_key.key_enc, + buf, iv, + num_rounds(&ctx->aes_key)); + crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE); + crypto_inc(iv, AES_BLOCK_SIZE); + + dst += AES_BLOCK_SIZE; + src += AES_BLOCK_SIZE; + } while (--blocks > 0); + + err = skcipher_walk_done(&walk, + walk.nbytes % AES_BLOCK_SIZE); + } + if (walk.nbytes) + __aes_arm64_encrypt(ctx->aes_key.key_enc, iv, iv, + num_rounds(&ctx->aes_key)); + } + + /* handle the tail */ + if (walk.nbytes) { + memcpy(buf, walk.src.virt.addr, walk.nbytes); + memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes); + ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL); + + crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, iv, + walk.nbytes); + + err = skcipher_walk_done(&walk, 0); + } + + if (err) + return err; + + gcm_final(req, ctx, dg, tag, req->cryptlen - authsize); + + /* compare calculated auth tag with the stored one */ + scatterwalk_map_and_copy(buf, req->src, + req->assoclen + req->cryptlen - authsize, + authsize, 0); + + if (crypto_memneq(tag, buf, authsize)) + return -EBADMSG; + return 0; +} + +static struct aead_alg gcm_aes_alg = { + .ivsize = GCM_IV_SIZE, + .chunksize = AES_BLOCK_SIZE, + .maxauthsize = AES_BLOCK_SIZE, + .setkey = gcm_setkey, + .setauthsize = gcm_setauthsize, + .encrypt = gcm_encrypt, + .decrypt = gcm_decrypt, + + .base.cra_name = "gcm(aes)", + .base.cra_driver_name = "gcm-aes-ce", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct gcm_aes_ctx), + .base.cra_module = THIS_MODULE, }; static int __init ghash_ce_mod_init(void) { - return crypto_register_shash(&ghash_alg); + int ret; + + if (!(elf_hwcap & HWCAP_ASIMD)) + return -ENODEV; + + if (elf_hwcap & HWCAP_PMULL) + pmull_ghash_update = pmull_ghash_update_p64; + + else + pmull_ghash_update = pmull_ghash_update_p8; + + ret = crypto_register_shash(&ghash_alg); + if (ret) + return ret; + + if (elf_hwcap & HWCAP_PMULL) { + ret = crypto_register_aead(&gcm_aes_alg); + if (ret) + crypto_unregister_shash(&ghash_alg); + } + return ret; } static void __exit ghash_ce_mod_exit(void) { crypto_unregister_shash(&ghash_alg); + crypto_unregister_aead(&gcm_aes_alg); } -module_cpu_feature_match(PMULL, ghash_ce_mod_init); +static const struct cpu_feature ghash_cpu_feature[] = { + { cpu_feature(PMULL) }, { } +}; +MODULE_DEVICE_TABLE(cpu, ghash_cpu_feature); + +module_init(ghash_ce_mod_init); module_exit(ghash_ce_mod_exit); diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index ea319c055f5d..efbeb3e0dcfb 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -1,7 +1,7 @@ /* * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions * - * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2014 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -9,6 +9,7 @@ */ #include <asm/neon.h> +#include <asm/simd.h> #include <asm/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/sha.h> @@ -37,8 +38,11 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data, { struct sha1_ce_state *sctx = shash_desc_ctx(desc); + if (!may_use_simd()) + return crypto_sha1_update(desc, data, len); + sctx->finalize = 0; - kernel_neon_begin_partial(16); + kernel_neon_begin(); sha1_base_do_update(desc, data, len, (sha1_block_fn *)sha1_ce_transform); kernel_neon_end(); @@ -52,13 +56,16 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, struct sha1_ce_state *sctx = shash_desc_ctx(desc); bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE); + if (!may_use_simd()) + return crypto_sha1_finup(desc, data, len, out); + /* * Allow the asm code to perform the finalization if there is no * partial data and the input is a round multiple of the block size. */ sctx->finalize = finalize; - kernel_neon_begin_partial(16); + kernel_neon_begin(); sha1_base_do_update(desc, data, len, (sha1_block_fn *)sha1_ce_transform); if (!finalize) @@ -71,8 +78,11 @@ static int sha1_ce_final(struct shash_desc *desc, u8 *out) { struct sha1_ce_state *sctx = shash_desc_ctx(desc); + if (!may_use_simd()) + return crypto_sha1_finup(desc, NULL, 0, out); + sctx->finalize = 0; - kernel_neon_begin_partial(16); + kernel_neon_begin(); sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_ce_transform); kernel_neon_end(); return sha1_base_finish(desc, out); diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index 0ed9486f75dd..fd1ff2b13dfa 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -1,7 +1,7 @@ /* * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions * - * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2014 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -9,6 +9,7 @@ */ #include <asm/neon.h> +#include <asm/simd.h> #include <asm/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/sha.h> @@ -34,13 +35,19 @@ const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state, const u32 sha256_ce_offsetof_finalize = offsetof(struct sha256_ce_state, finalize); +asmlinkage void sha256_block_data_order(u32 *digest, u8 const *src, int blocks); + static int sha256_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { struct sha256_ce_state *sctx = shash_desc_ctx(desc); + if (!may_use_simd()) + return sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_block_data_order); + sctx->finalize = 0; - kernel_neon_begin_partial(28); + kernel_neon_begin(); sha256_base_do_update(desc, data, len, (sha256_block_fn *)sha2_ce_transform); kernel_neon_end(); @@ -54,13 +61,22 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data, struct sha256_ce_state *sctx = shash_desc_ctx(desc); bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE); + if (!may_use_simd()) { + if (len) + sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_block_data_order); + sha256_base_do_finalize(desc, + (sha256_block_fn *)sha256_block_data_order); + return sha256_base_finish(desc, out); + } + /* * Allow the asm code to perform the finalization if there is no * partial data and the input is a round multiple of the block size. */ sctx->finalize = finalize; - kernel_neon_begin_partial(28); + kernel_neon_begin(); sha256_base_do_update(desc, data, len, (sha256_block_fn *)sha2_ce_transform); if (!finalize) @@ -74,8 +90,14 @@ static int sha256_ce_final(struct shash_desc *desc, u8 *out) { struct sha256_ce_state *sctx = shash_desc_ctx(desc); + if (!may_use_simd()) { + sha256_base_do_finalize(desc, + (sha256_block_fn *)sha256_block_data_order); + return sha256_base_finish(desc, out); + } + sctx->finalize = 0; - kernel_neon_begin_partial(28); + kernel_neon_begin(); sha256_base_do_finalize(desc, (sha256_block_fn *)sha2_ce_transform); kernel_neon_end(); return sha256_base_finish(desc, out); diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c index a2226f841960..b064d925fe2a 100644 --- a/arch/arm64/crypto/sha256-glue.c +++ b/arch/arm64/crypto/sha256-glue.c @@ -29,6 +29,7 @@ MODULE_ALIAS_CRYPTO("sha256"); asmlinkage void sha256_block_data_order(u32 *digest, const void *data, unsigned int num_blks); +EXPORT_SYMBOL(sha256_block_data_order); asmlinkage void sha256_block_neon(u32 *digest, const void *data, unsigned int num_blks); diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index f81c7b685fc6..2326e39d5892 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -20,7 +20,6 @@ generic-y += rwsem.h generic-y += segment.h generic-y += serial.h generic-y += set_memory.h -generic-y += simd.h generic-y += sizes.h generic-y += switch_to.h generic-y += trace_clock.h diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index 8cef47fa2218..b7e3f74822da 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -116,6 +116,8 @@ static inline void gic_write_bpr1(u32 val) #define gic_read_typer(c) readq_relaxed(c) #define gic_write_irouter(v, c) writeq_relaxed(v, c) +#define gic_read_lpir(c) readq_relaxed(c) +#define gic_write_lpir(v, c) writeq_relaxed(v, c) #define gic_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l)) @@ -133,5 +135,10 @@ static inline void gic_write_bpr1(u32 val) #define gicr_write_pendbaser(v, c) writeq_relaxed(v, c) #define gicr_read_pendbaser(c) readq_relaxed(c) +#define gits_write_vpropbaser(v, c) writeq_relaxed(v, c) + +#define gits_write_vpendbaser(v, c) writeq_relaxed(v, c) +#define gits_read_vpendbaser(c) readq_relaxed(c) + #endif /* __ASSEMBLY__ */ #endif /* __ASM_ARCH_GICV3_H */ diff --git a/arch/arm64/include/asm/asm-bug.h b/arch/arm64/include/asm/asm-bug.h new file mode 100644 index 000000000000..636e755bcdca --- /dev/null +++ b/arch/arm64/include/asm/asm-bug.h @@ -0,0 +1,54 @@ +#ifndef __ASM_ASM_BUG_H +/* + * Copyright (C) 2017 ARM Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#define __ASM_ASM_BUG_H + +#include <asm/brk-imm.h> + +#ifdef CONFIG_DEBUG_BUGVERBOSE +#define _BUGVERBOSE_LOCATION(file, line) __BUGVERBOSE_LOCATION(file, line) +#define __BUGVERBOSE_LOCATION(file, line) \ + .pushsection .rodata.str,"aMS",@progbits,1; \ + 2: .string file; \ + .popsection; \ + \ + .long 2b - 0b; \ + .short line; +#else +#define _BUGVERBOSE_LOCATION(file, line) +#endif + +#ifdef CONFIG_GENERIC_BUG + +#define __BUG_ENTRY(flags) \ + .pushsection __bug_table,"aw"; \ + .align 2; \ + 0: .long 1f - 0b; \ +_BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ + .short flags; \ + .popsection; \ + 1: +#else +#define __BUG_ENTRY(flags) +#endif + +#define ASM_BUG_FLAGS(flags) \ + __BUG_ENTRY(flags) \ + brk BUG_BRK_IMM + +#define ASM_BUG() ASM_BUG_FLAGS(0) + +#endif /* __ASM_ASM_BUG_H */ diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 1b67c3782d00..d58a6253c6ab 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -230,12 +230,18 @@ lr .req x30 // link register .endm /* - * @dst: Result of per_cpu(sym, smp_processor_id()) + * @dst: Result of per_cpu(sym, smp_processor_id()), can be SP for + * non-module code * @sym: The name of the per-cpu variable * @tmp: scratch register */ .macro adr_this_cpu, dst, sym, tmp +#ifndef MODULE + adrp \tmp, \sym + add \dst, \tmp, #:lo12:\sym +#else adr_l \dst, \sym +#endif mrs \tmp, tpidr_el1 add \dst, \dst, \tmp .endm @@ -353,6 +359,12 @@ alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE alternative_else dc civac, \kaddr alternative_endif + .elseif (\op == cvap) +alternative_if ARM64_HAS_DCPOP + sys 3, c7, c12, 1, \kaddr // dc cvap +alternative_else + dc cvac, \kaddr +alternative_endif .else dc \op, \kaddr .endif @@ -403,6 +415,17 @@ alternative_endif .size __pi_##x, . - x; \ ENDPROC(x) +/* + * Annotate a function as being unsuitable for kprobes. + */ +#ifdef CONFIG_KPROBES +#define NOKPROBE(x) \ + .pushsection "_kprobe_blacklist", "aw"; \ + .quad x; \ + .popsection; +#else +#define NOKPROBE(x) +#endif /* * Emit a 64-bit absolute little endian symbol reference in a way that * ensures that it will be resolved at build time, even when building a diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index a02a57186f56..d7dc43752705 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -18,41 +18,12 @@ #ifndef _ARCH_ARM64_ASM_BUG_H #define _ARCH_ARM64_ASM_BUG_H -#include <asm/brk-imm.h> +#include <linux/stringify.h> -#ifdef CONFIG_DEBUG_BUGVERBOSE -#define _BUGVERBOSE_LOCATION(file, line) __BUGVERBOSE_LOCATION(file, line) -#define __BUGVERBOSE_LOCATION(file, line) \ - ".pushsection .rodata.str,\"aMS\",@progbits,1\n" \ - "2: .string \"" file "\"\n\t" \ - ".popsection\n\t" \ - \ - ".long 2b - 0b\n\t" \ - ".short " #line "\n\t" -#else -#define _BUGVERBOSE_LOCATION(file, line) -#endif - -#ifdef CONFIG_GENERIC_BUG - -#define __BUG_ENTRY(flags) \ - ".pushsection __bug_table,\"aw\"\n\t" \ - ".align 2\n\t" \ - "0: .long 1f - 0b\n\t" \ -_BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ - ".short " #flags "\n\t" \ - ".popsection\n" \ - "1: " -#else -#define __BUG_ENTRY(flags) "" -#endif +#include <asm/asm-bug.h> #define __BUG_FLAGS(flags) \ - asm volatile ( \ - __BUG_ENTRY(flags) \ - "brk %[imm]" :: [imm] "i" (BUG_BRK_IMM) \ - ); - + asm volatile (__stringify(ASM_BUG_FLAGS(flags))); #define BUG() do { \ __BUG_FLAGS(0); \ diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index d74a284abdc2..76d1cc85d5b1 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -67,7 +67,9 @@ */ extern void flush_icache_range(unsigned long start, unsigned long end); extern void __flush_dcache_area(void *addr, size_t len); +extern void __inval_dcache_area(void *addr, size_t len); extern void __clean_dcache_area_poc(void *addr, size_t len); +extern void __clean_dcache_area_pop(void *addr, size_t len); extern void __clean_dcache_area_pou(void *addr, size_t len); extern long __flush_cache_user_range(unsigned long start, unsigned long end); extern void sync_icache_aliases(void *kaddr, unsigned long len); @@ -150,6 +152,6 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end) { } -int set_memory_valid(unsigned long addr, unsigned long size, int enable); +int set_memory_valid(unsigned long addr, int numpages, int enable); #endif diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 8d2272c6822c..8da621627d7c 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -39,7 +39,8 @@ #define ARM64_WORKAROUND_QCOM_FALKOR_E1003 18 #define ARM64_WORKAROUND_858921 19 #define ARM64_WORKAROUND_CAVIUM_30115 20 +#define ARM64_HAS_DCPOP 21 -#define ARM64_NCAPS 21 +#define ARM64_NCAPS 22 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 8f3043aba873..b93904b16fc2 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -3,7 +3,9 @@ #include <asm/boot.h> #include <asm/cpufeature.h> +#include <asm/fpsimd.h> #include <asm/io.h> +#include <asm/memory.h> #include <asm/mmu_context.h> #include <asm/neon.h> #include <asm/ptrace.h> @@ -20,8 +22,8 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); #define arch_efi_call_virt_setup() \ ({ \ - kernel_neon_begin(); \ efi_virtmap_load(); \ + __efi_fpsimd_begin(); \ }) #define arch_efi_call_virt(p, f, args...) \ @@ -33,8 +35,8 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); #define arch_efi_call_virt_teardown() \ ({ \ + __efi_fpsimd_end(); \ efi_virtmap_unload(); \ - kernel_neon_end(); \ }) #define ARCH_EFI_IRQ_FLAGS_MASK (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT) @@ -48,6 +50,13 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); */ #define EFI_FDT_ALIGN SZ_2M /* used by allocate_new_fdt_and_exit_boot() */ +/* + * In some configurations (e.g. VMAP_STACK && 64K pages), stacks built into the + * kernel need greater alignment than we require the segments to be padded to. + */ +#define EFI_KIMG_ALIGN \ + (SEGMENT_ALIGN > THREAD_ALIGN ? SEGMENT_ALIGN : THREAD_ALIGN) + /* on arm64, the FDT may be located anywhere in system RAM */ static inline unsigned long efi_get_max_fdt_addr(unsigned long dram_base) { @@ -81,6 +90,9 @@ static inline unsigned long efi_get_max_initrd_addr(unsigned long dram_base, #define alloc_screen_info(x...) &screen_info #define free_screen_info(x...) +/* redeclare as 'hidden' so the compiler will generate relative references */ +extern struct screen_info screen_info __attribute__((__visibility__("hidden"))); + static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) { } diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 3288c2b36731..33be513ef24c 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -139,7 +139,6 @@ typedef struct user_fpsimd_state elf_fpregset_t; #define SET_PERSONALITY(ex) \ ({ \ - clear_bit(TIF_32BIT, ¤t->mm->context.flags); \ clear_thread_flag(TIF_32BIT); \ current->personality &= ~READ_IMPLIES_EXEC; \ }) @@ -195,7 +194,6 @@ typedef compat_elf_greg_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; */ #define COMPAT_SET_PERSONALITY(ex) \ ({ \ - set_bit(TIF_32BIT, ¤t->mm->context.flags); \ set_thread_flag(TIF_32BIT); \ }) #define COMPAT_ARCH_DLINFO diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 8cabd57b6348..66ed8b6b9976 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -77,16 +77,23 @@ #define ESR_ELx_EC_MASK (UL(0x3F) << ESR_ELx_EC_SHIFT) #define ESR_ELx_EC(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) -#define ESR_ELx_IL (UL(1) << 25) +#define ESR_ELx_IL_SHIFT (25) +#define ESR_ELx_IL (UL(1) << ESR_ELx_IL_SHIFT) #define ESR_ELx_ISS_MASK (ESR_ELx_IL - 1) /* ISS field definitions shared by different classes */ -#define ESR_ELx_WNR (UL(1) << 6) +#define ESR_ELx_WNR_SHIFT (6) +#define ESR_ELx_WNR (UL(1) << ESR_ELx_WNR_SHIFT) /* Shared ISS field definitions for Data/Instruction aborts */ -#define ESR_ELx_FnV (UL(1) << 10) -#define ESR_ELx_EA (UL(1) << 9) -#define ESR_ELx_S1PTW (UL(1) << 7) +#define ESR_ELx_SET_SHIFT (11) +#define ESR_ELx_SET_MASK (UL(3) << ESR_ELx_SET_SHIFT) +#define ESR_ELx_FnV_SHIFT (10) +#define ESR_ELx_FnV (UL(1) << ESR_ELx_FnV_SHIFT) +#define ESR_ELx_EA_SHIFT (9) +#define ESR_ELx_EA (UL(1) << ESR_ELx_EA_SHIFT) +#define ESR_ELx_S1PTW_SHIFT (7) +#define ESR_ELx_S1PTW (UL(1) << ESR_ELx_S1PTW_SHIFT) /* Shared ISS fault status code(IFSC/DFSC) for Data/Instruction aborts */ #define ESR_ELx_FSC (0x3F) @@ -97,15 +104,20 @@ #define ESR_ELx_FSC_PERM (0x0C) /* ISS field definitions for Data Aborts */ -#define ESR_ELx_ISV (UL(1) << 24) +#define ESR_ELx_ISV_SHIFT (24) +#define ESR_ELx_ISV (UL(1) << ESR_ELx_ISV_SHIFT) #define ESR_ELx_SAS_SHIFT (22) #define ESR_ELx_SAS (UL(3) << ESR_ELx_SAS_SHIFT) -#define ESR_ELx_SSE (UL(1) << 21) +#define ESR_ELx_SSE_SHIFT (21) +#define ESR_ELx_SSE (UL(1) << ESR_ELx_SSE_SHIFT) #define ESR_ELx_SRT_SHIFT (16) #define ESR_ELx_SRT_MASK (UL(0x1F) << ESR_ELx_SRT_SHIFT) -#define ESR_ELx_SF (UL(1) << 15) -#define ESR_ELx_AR (UL(1) << 14) -#define ESR_ELx_CM (UL(1) << 8) +#define ESR_ELx_SF_SHIFT (15) +#define ESR_ELx_SF (UL(1) << ESR_ELx_SF_SHIFT) +#define ESR_ELx_AR_SHIFT (14) +#define ESR_ELx_AR (UL(1) << ESR_ELx_AR_SHIFT) +#define ESR_ELx_CM_SHIFT (8) +#define ESR_ELx_CM (UL(1) << ESR_ELx_CM_SHIFT) /* ISS field definitions for exceptions taken in to Hyp */ #define ESR_ELx_CV (UL(1) << 24) @@ -157,9 +169,10 @@ /* * User space cache operations have the following sysreg encoding * in System instructions. - * op0=1, op1=3, op2=1, crn=7, crm={ 5, 10, 11, 14 }, WRITE (L=0) + * op0=1, op1=3, op2=1, crn=7, crm={ 5, 10, 11, 12, 14 }, WRITE (L=0) */ #define ESR_ELx_SYS64_ISS_CRM_DC_CIVAC 14 +#define ESR_ELx_SYS64_ISS_CRM_DC_CVAP 12 #define ESR_ELx_SYS64_ISS_CRM_DC_CVAU 11 #define ESR_ELx_SYS64_ISS_CRM_DC_CVAC 10 #define ESR_ELx_SYS64_ISS_CRM_IC_IVAU 5 @@ -209,6 +222,13 @@ #ifndef __ASSEMBLY__ #include <asm/types.h> +static inline bool esr_is_data_abort(u32 esr) +{ + const u32 ec = ESR_ELx_EC(esr); + + return ec == ESR_ELx_EC_DABT_LOW || ec == ESR_ELx_EC_DABT_CUR; +} + const char *esr_get_class_string(u32 esr); #endif /* __ASSEMBLY */ diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 50f559f574fe..410c48163c6a 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -41,16 +41,6 @@ struct fpsimd_state { unsigned int cpu; }; -/* - * Struct for stacking the bottom 'n' FP/SIMD registers. - */ -struct fpsimd_partial_state { - u32 fpsr; - u32 fpcr; - u32 num_regs; - __uint128_t vregs[32]; -}; - #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* Masks for extracting the FPSR and FPCR from the FPSCR */ @@ -77,9 +67,9 @@ extern void fpsimd_update_current_state(struct fpsimd_state *state); extern void fpsimd_flush_task_state(struct task_struct *target); -extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state, - u32 num_regs); -extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state); +/* For use by EFI runtime services calls only */ +extern void __efi_fpsimd_begin(void); +extern void __efi_fpsimd_end(void); #endif diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index a2daf1293028..0f5fdd388b0d 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h @@ -75,59 +75,3 @@ ldr w\tmpnr, [\state, #16 * 2 + 4] fpsimd_restore_fpcr x\tmpnr, \state .endm - -.macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2 - mrs x\tmpnr1, fpsr - str w\numnr, [\state, #8] - mrs x\tmpnr2, fpcr - stp w\tmpnr1, w\tmpnr2, [\state] - adr x\tmpnr1, 0f - add \state, \state, x\numnr, lsl #4 - sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1 - br x\tmpnr1 - stp q30, q31, [\state, #-16 * 30 - 16] - stp q28, q29, [\state, #-16 * 28 - 16] - stp q26, q27, [\state, #-16 * 26 - 16] - stp q24, q25, [\state, #-16 * 24 - 16] - stp q22, q23, [\state, #-16 * 22 - 16] - stp q20, q21, [\state, #-16 * 20 - 16] - stp q18, q19, [\state, #-16 * 18 - 16] - stp q16, q17, [\state, #-16 * 16 - 16] - stp q14, q15, [\state, #-16 * 14 - 16] - stp q12, q13, [\state, #-16 * 12 - 16] - stp q10, q11, [\state, #-16 * 10 - 16] - stp q8, q9, [\state, #-16 * 8 - 16] - stp q6, q7, [\state, #-16 * 6 - 16] - stp q4, q5, [\state, #-16 * 4 - 16] - stp q2, q3, [\state, #-16 * 2 - 16] - stp q0, q1, [\state, #-16 * 0 - 16] -0: -.endm - -.macro fpsimd_restore_partial state, tmpnr1, tmpnr2 - ldp w\tmpnr1, w\tmpnr2, [\state] - msr fpsr, x\tmpnr1 - fpsimd_restore_fpcr x\tmpnr2, x\tmpnr1 - adr x\tmpnr1, 0f - ldr w\tmpnr2, [\state, #8] - add \state, \state, x\tmpnr2, lsl #4 - sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1 - br x\tmpnr1 - ldp q30, q31, [\state, #-16 * 30 - 16] - ldp q28, q29, [\state, #-16 * 28 - 16] - ldp q26, q27, [\state, #-16 * 26 - 16] - ldp q24, q25, [\state, #-16 * 24 - 16] - ldp q22, q23, [\state, #-16 * 22 - 16] - ldp q20, q21, [\state, #-16 * 20 - 16] - ldp q18, q19, [\state, #-16 * 18 - 16] - ldp q16, q17, [\state, #-16 * 16 - 16] - ldp q14, q15, [\state, #-16 * 14 - 16] - ldp q12, q13, [\state, #-16 * 12 - 16] - ldp q10, q11, [\state, #-16 * 10 - 16] - ldp q8, q9, [\state, #-16 * 8 - 16] - ldp q6, q7, [\state, #-16 * 6 - 16] - ldp q4, q5, [\state, #-16 * 4 - 16] - ldp q2, q3, [\state, #-16 * 2 - 16] - ldp q0, q1, [\state, #-16 * 0 - 16] -0: -.endm diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index f32b42e8725d..5bb2fd4674e7 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -48,20 +48,10 @@ do { \ } while (0) static inline int -futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (int)(encoded_op << 8) >> 20; - int cmparg = (int)(encoded_op << 20) >> 20; int oldval = 0, ret, tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1U << (oparg & 0x1f); - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -91,17 +81,9 @@ futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 793bd73b0d07..1dca41bea16a 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -18,7 +18,6 @@ #ifndef __ASM_HUGETLB_H #define __ASM_HUGETLB_H -#include <asm-generic/hugetlb.h> #include <asm/page.h> static inline pte_t huge_ptep_get(pte_t *ptep) @@ -82,6 +81,14 @@ extern void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); extern void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long sz); +#define huge_pte_clear huge_pte_clear +extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz); +#define set_huge_swap_pte_at set_huge_swap_pte_at + +#include <asm-generic/hugetlb.h> #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE static inline bool gigantic_page_supported(void) { return true; } diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index b77197d941fc..5e6f77239064 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -1,45 +1,12 @@ #ifndef __ASM_IRQ_H #define __ASM_IRQ_H -#define IRQ_STACK_SIZE THREAD_SIZE -#define IRQ_STACK_START_SP THREAD_START_SP - #ifndef __ASSEMBLER__ -#include <linux/percpu.h> - #include <asm-generic/irq.h> -#include <asm/thread_info.h> struct pt_regs; -DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); - -/* - * The highest address on the stack, and the first to be used. Used to - * find the dummy-stack frame put down by el?_irq() in entry.S, which - * is structured as follows: - * - * ------------ - * | | <- irq_stack_ptr - * top ------------ - * | x19 | <- irq_stack_ptr - 0x08 - * ------------ - * | x29 | <- irq_stack_ptr - 0x10 - * ------------ - * - * where x19 holds a copy of the task stack pointer where the struct pt_regs - * from kernel_entry can be found. - * - */ -#define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) - -/* - * The offset from irq_stack_ptr where entry.S will store the original - * stack pointer. Used by unwind_frame() and dump_backtrace(). - */ -#define IRQ_STACK_TO_TASK_STACK(ptr) (*((unsigned long *)((ptr) - 0x08))) - extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); static inline int nr_legacy_irqs(void) @@ -47,14 +14,5 @@ static inline int nr_legacy_irqs(void) return 0; } -static inline bool on_irq_stack(unsigned long sp, int cpu) -{ - /* variable names the same as kernel/stacktrace.c */ - unsigned long low = (unsigned long)per_cpu(irq_stack, cpu); - unsigned long high = low + IRQ_STACK_START_SP; - - return (low <= sp && sp <= high); -} - #endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index a89cc22abadc..672c8684d5c2 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -175,18 +175,15 @@ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) static inline void kvm_set_s2pte_readonly(pte_t *pte) { - pteval_t pteval; - unsigned long tmp; - - asm volatile("// kvm_set_s2pte_readonly\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " and %0, %0, %3 // clear PTE_S2_RDWR\n" - " orr %0, %0, %4 // set PTE_S2_RDONLY\n" - " stxr %w1, %0, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*pte)) - : "L" (~PTE_S2_RDWR), "L" (PTE_S2_RDONLY)); + pteval_t old_pteval, pteval; + + pteval = READ_ONCE(pte_val(*pte)); + do { + old_pteval = pteval; + pteval &= ~PTE_S2_RDWR; + pteval |= PTE_S2_RDONLY; + pteval = cmpxchg_relaxed(&pte_val(*pte), old_pteval, pteval); + } while (pteval != old_pteval); } static inline bool kvm_s2pte_readonly(pte_t *pte) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index ef39dcb9ca6a..3585a5e26151 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -25,6 +25,7 @@ #include <linux/const.h> #include <linux/types.h> #include <asm/bug.h> +#include <asm/page-def.h> #include <asm/sizes.h> /* @@ -103,6 +104,58 @@ #define KASAN_SHADOW_SIZE (0) #endif +#define MIN_THREAD_SHIFT 14 + +/* + * VMAP'd stacks are allocated at page granularity, so we must ensure that such + * stacks are a multiple of page size. + */ +#if defined(CONFIG_VMAP_STACK) && (MIN_THREAD_SHIFT < PAGE_SHIFT) +#define THREAD_SHIFT PAGE_SHIFT +#else +#define THREAD_SHIFT MIN_THREAD_SHIFT +#endif + +#if THREAD_SHIFT >= PAGE_SHIFT +#define THREAD_SIZE_ORDER (THREAD_SHIFT - PAGE_SHIFT) +#endif + +#define THREAD_SIZE (UL(1) << THREAD_SHIFT) + +/* + * By aligning VMAP'd stacks to 2 * THREAD_SIZE, we can detect overflow by + * checking sp & (1 << THREAD_SHIFT), which we can do cheaply in the entry + * assembly. + */ +#ifdef CONFIG_VMAP_STACK +#define THREAD_ALIGN (2 * THREAD_SIZE) +#else +#define THREAD_ALIGN THREAD_SIZE +#endif + +#define IRQ_STACK_SIZE THREAD_SIZE + +#define OVERFLOW_STACK_SIZE SZ_4K + +/* + * Alignment of kernel segments (e.g. .text, .data). + */ +#if defined(CONFIG_DEBUG_ALIGN_RODATA) +/* + * 4 KB granule: 1 level 2 entry + * 16 KB granule: 128 level 3 entries, with contiguous bit + * 64 KB granule: 32 level 3 entries, with contiguous bit + */ +#define SEGMENT_ALIGN SZ_2M +#else +/* + * 4 KB granule: 16 level 3 entries, with contiguous bit + * 16 KB granule: 4 level 3 entries, without contiguous bit + * 64 KB granule: 1 level 3 entry + */ +#define SEGMENT_ALIGN SZ_64K +#endif + /* * Memory types available. */ diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 5468c834b072..0d34bf0a89c7 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -16,6 +16,8 @@ #ifndef __ASM_MMU_H #define __ASM_MMU_H +#define MMCF_AARCH32 0x1 /* mm context flag for AArch32 executables */ + typedef struct { atomic64_t id; void *vdso; diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h index ad4cdc966c0f..f922eaf780f9 100644 --- a/arch/arm64/include/asm/neon.h +++ b/arch/arm64/include/asm/neon.h @@ -8,12 +8,22 @@ * published by the Free Software Foundation. */ +#ifndef __ASM_NEON_H +#define __ASM_NEON_H + #include <linux/types.h> #include <asm/fpsimd.h> #define cpu_has_neon() system_supports_fpsimd() -#define kernel_neon_begin() kernel_neon_begin_partial(32) - -void kernel_neon_begin_partial(u32 num_regs); +void kernel_neon_begin(void); void kernel_neon_end(void); + +/* + * Temporary macro to allow the crypto code to compile. Note that the + * semantics of kernel_neon_begin_partial() are now different from the + * original as it does not allow being called in an interrupt context. + */ +#define kernel_neon_begin_partial(num_regs) kernel_neon_begin() + +#endif /* ! __ASM_NEON_H */ diff --git a/arch/arm64/include/asm/numa.h b/arch/arm64/include/asm/numa.h index bf466d1876e3..ef7b23863a7c 100644 --- a/arch/arm64/include/asm/numa.h +++ b/arch/arm64/include/asm/numa.h @@ -7,9 +7,6 @@ #define NR_NODE_MEMBLKS (MAX_NUMNODES * 2) -/* currently, arm64 implements flat NUMA topology */ -#define parent_node(node) (node) - int __node_distance(int from, int to); #define node_distance(a, b) __node_distance(a, b) diff --git a/arch/arm64/include/asm/page-def.h b/arch/arm64/include/asm/page-def.h new file mode 100644 index 000000000000..01591a29dc2e --- /dev/null +++ b/arch/arm64/include/asm/page-def.h @@ -0,0 +1,34 @@ +/* + * Based on arch/arm/include/asm/page.h + * + * Copyright (C) 1995-2003 Russell King + * Copyright (C) 2017 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#ifndef __ASM_PAGE_DEF_H +#define __ASM_PAGE_DEF_H + +#include <linux/const.h> + +/* PAGE_SHIFT determines the page size */ +/* CONT_SHIFT determines the number of pages which can be tracked together */ +#define PAGE_SHIFT CONFIG_ARM64_PAGE_SHIFT +#define CONT_SHIFT CONFIG_ARM64_CONT_SHIFT +#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1)) + +#define CONT_SIZE (_AC(1, UL) << (CONT_SHIFT + PAGE_SHIFT)) +#define CONT_MASK (~(CONT_SIZE-1)) + +#endif /* __ASM_PAGE_DEF_H */ diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 8472c6def5ef..60d02c81a3a2 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -19,17 +19,7 @@ #ifndef __ASM_PAGE_H #define __ASM_PAGE_H -#include <linux/const.h> - -/* PAGE_SHIFT determines the page size */ -/* CONT_SHIFT determines the number of pages which can be tracked together */ -#define PAGE_SHIFT CONFIG_ARM64_PAGE_SHIFT -#define CONT_SHIFT CONFIG_ARM64_CONT_SHIFT -#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) - -#define CONT_SIZE (_AC(1, UL) << (CONT_SHIFT + PAGE_SHIFT)) -#define CONT_MASK (~(CONT_SIZE-1)) +#include <asm/page-def.h> #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 2142c7726e76..0a5635fb0ef9 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -63,23 +63,21 @@ #define PAGE_S2 __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) #define PAGE_S2_DEVICE __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_UXN) -#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_PXN | PTE_UXN) +#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_PXN | PTE_UXN) #define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) #define PAGE_SHARED_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) -#define PAGE_COPY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) -#define PAGE_COPY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) -#define PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) -#define PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) -#define PAGE_EXECONLY __pgprot(_PAGE_DEFAULT | PTE_NG | PTE_PXN) +#define PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) +#define PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN) +#define PAGE_EXECONLY __pgprot(_PAGE_DEFAULT | PTE_RDONLY | PTE_NG | PTE_PXN) #define __P000 PAGE_NONE #define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY +#define __P010 PAGE_READONLY +#define __P011 PAGE_READONLY #define __P100 PAGE_EXECONLY #define __P101 PAGE_READONLY_EXEC -#define __P110 PAGE_COPY_EXEC -#define __P111 PAGE_COPY_EXEC +#define __P110 PAGE_READONLY_EXEC +#define __P111 PAGE_READONLY_EXEC #define __S000 PAGE_NONE #define __S001 PAGE_READONLY diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 6eae342ced6b..bc4e92337d16 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -39,6 +39,7 @@ #ifndef __ASSEMBLY__ +#include <asm/cmpxchg.h> #include <asm/fixmap.h> #include <linux/mmdebug.h> @@ -84,11 +85,7 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; (__boundary - 1 < (end) - 1) ? __boundary : (end); \ }) -#ifdef CONFIG_ARM64_HW_AFDBM #define pte_hw_dirty(pte) (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY)) -#else -#define pte_hw_dirty(pte) (0) -#endif #define pte_sw_dirty(pte) (!!(pte_val(pte) & PTE_DIRTY)) #define pte_dirty(pte) (pte_sw_dirty(pte) || pte_hw_dirty(pte)) @@ -124,12 +121,16 @@ static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot) static inline pte_t pte_wrprotect(pte_t pte) { - return clear_pte_bit(pte, __pgprot(PTE_WRITE)); + pte = clear_pte_bit(pte, __pgprot(PTE_WRITE)); + pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); + return pte; } static inline pte_t pte_mkwrite(pte_t pte) { - return set_pte_bit(pte, __pgprot(PTE_WRITE)); + pte = set_pte_bit(pte, __pgprot(PTE_WRITE)); + pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY)); + return pte; } static inline pte_t pte_mkclean(pte_t pte) @@ -168,11 +169,6 @@ static inline pte_t pte_mknoncont(pte_t pte) return clear_pte_bit(pte, __pgprot(PTE_CONT)); } -static inline pte_t pte_clear_rdonly(pte_t pte) -{ - return clear_pte_bit(pte, __pgprot(PTE_RDONLY)); -} - static inline pte_t pte_mkpresent(pte_t pte) { return set_pte_bit(pte, __pgprot(PTE_VALID)); @@ -220,22 +216,15 @@ extern void __sync_icache_dcache(pte_t pteval, unsigned long addr); static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - if (pte_present(pte)) { - if (pte_sw_dirty(pte) && pte_write(pte)) - pte_val(pte) &= ~PTE_RDONLY; - else - pte_val(pte) |= PTE_RDONLY; - if (pte_user_exec(pte) && !pte_special(pte)) - __sync_icache_dcache(pte, addr); - } + if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte)) + __sync_icache_dcache(pte, addr); /* * If the existing pte is valid, check for potential race with * hardware updates of the pte (ptep_set_access_flags safely changes * valid ptes without going through an invalid entry). */ - if (IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && - pte_valid(*ptep) && pte_valid(pte)) { + if (pte_valid(*ptep) && pte_valid(pte)) { VM_WARN_ONCE(!pte_young(pte), "%s: racy access flag clearing: 0x%016llx -> 0x%016llx", __func__, pte_val(*ptep), pte_val(pte)); @@ -571,7 +560,6 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) return pte_pmd(pte_modify(pmd_pte(pmd), newprot)); } -#ifdef CONFIG_ARM64_HW_AFDBM #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, @@ -593,20 +581,17 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int __ptep_test_and_clear_young(pte_t *ptep) { - pteval_t pteval; - unsigned int tmp, res; + pte_t old_pte, pte; - asm volatile("// __ptep_test_and_clear_young\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " ubfx %w3, %w0, %5, #1 // extract PTE_AF (young)\n" - " and %0, %0, %4 // clear PTE_AF\n" - " stxr %w1, %0, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*ptep)), "=&r" (res) - : "L" (~PTE_AF), "I" (ilog2(PTE_AF))); + pte = READ_ONCE(*ptep); + do { + old_pte = pte; + pte = pte_mkold(pte); + pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), + pte_val(old_pte), pte_val(pte)); + } while (pte_val(pte) != pte_val(old_pte)); - return res; + return pte_young(pte); } static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, @@ -630,17 +615,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pteval_t old_pteval; - unsigned int tmp; - - asm volatile("// ptep_get_and_clear\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " stxr %w1, xzr, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (old_pteval), "=&r" (tmp), "+Q" (pte_val(*ptep))); - - return __pte(old_pteval); + return __pte(xchg_relaxed(&pte_val(*ptep), 0)); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -653,27 +628,32 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * ptep_set_wrprotect - mark read-only while trasferring potential hardware - * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. + * ptep_set_wrprotect - mark read-only while preserving the hardware update of + * the Access Flag. */ #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pteval_t pteval; - unsigned long tmp; + pte_t old_pte, pte; - asm volatile("// ptep_set_wrprotect\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " tst %0, %4 // check for hw dirty (!PTE_RDONLY)\n" - " csel %1, %3, xzr, eq // set PTE_DIRTY|PTE_RDONLY if dirty\n" - " orr %0, %0, %1 // if !dirty, PTE_RDONLY is already set\n" - " and %0, %0, %5 // clear PTE_WRITE/PTE_DBM\n" - " stxr %w1, %0, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*ptep)) - : "r" (PTE_DIRTY|PTE_RDONLY), "L" (PTE_RDONLY), "L" (~PTE_WRITE) - : "cc"); + /* + * ptep_set_wrprotect() is only called on CoW mappings which are + * private (!VM_SHARED) with the pte either read-only (!PTE_WRITE && + * PTE_RDONLY) or writable and software-dirty (PTE_WRITE && + * !PTE_RDONLY && PTE_DIRTY); see is_cow_mapping() and + * protection_map[]. There is no race with the hardware update of the + * dirty state: clearing of PTE_RDONLY when PTE_WRITE (a.k.a. PTE_DBM) + * is set. + */ + VM_WARN_ONCE(pte_write(*ptep) && !pte_dirty(*ptep), + "%s: potential race with hardware DBM", __func__); + pte = READ_ONCE(*ptep); + do { + old_pte = pte; + pte = pte_wrprotect(pte); + pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), + pte_val(old_pte), pte_val(pte)); + } while (pte_val(pte) != pte_val(old_pte)); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -684,7 +664,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, ptep_set_wrprotect(mm, address, (pte_t *)pmdp); } #endif -#endif /* CONFIG_ARM64_HW_AFDBM */ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern pgd_t idmap_pg_dir[PTRS_PER_PGD]; diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 64c9e78f9882..29adab8138c3 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -112,7 +112,7 @@ void tls_preserve_current_state(void); static inline void start_thread_common(struct pt_regs *regs, unsigned long pc) { memset(regs, 0, sizeof(*regs)); - regs->syscallno = ~0UL; + forget_syscall(regs); regs->pc = pc; } @@ -159,7 +159,7 @@ extern struct task_struct *cpu_switch_to(struct task_struct *prev, struct task_struct *next); #define task_pt_regs(p) \ - ((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1) + ((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1) #define KSTK_EIP(tsk) ((unsigned long)task_pt_regs(tsk)->pc) #define KSTK_ESP(tsk) user_stack_pointer(task_pt_regs(tsk)) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 11403fdd0a50..6069d66e0bc2 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -72,8 +72,19 @@ #define COMPAT_PT_TEXT_ADDR 0x10000 #define COMPAT_PT_DATA_ADDR 0x10004 #define COMPAT_PT_TEXT_END_ADDR 0x10008 + +/* + * If pt_regs.syscallno == NO_SYSCALL, then the thread is not executing + * a syscall -- i.e., its most recent entry into the kernel from + * userspace was not via SVC, or otherwise a tracer cancelled the syscall. + * + * This must have the value -1, for ABI compatibility with ptrace etc. + */ +#define NO_SYSCALL (-1) + #ifndef __ASSEMBLY__ #include <linux/bug.h> +#include <linux/types.h> /* sizeof(struct user) for AArch32 */ #define COMPAT_USER_SZ 296 @@ -116,11 +127,29 @@ struct pt_regs { }; }; u64 orig_x0; - u64 syscallno; +#ifdef __AARCH64EB__ + u32 unused2; + s32 syscallno; +#else + s32 syscallno; + u32 unused2; +#endif + u64 orig_addr_limit; u64 unused; // maintain 16 byte alignment + u64 stackframe[2]; }; +static inline bool in_syscall(struct pt_regs const *regs) +{ + return regs->syscallno != NO_SYSCALL; +} + +static inline void forget_syscall(struct pt_regs *regs) +{ + regs->syscallno = NO_SYSCALL; +} + #define MAX_REG_OFFSET offsetof(struct pt_regs, pstate) #define arch_has_single_step() (1) diff --git a/arch/arm64/include/asm/signal32.h b/arch/arm64/include/asm/signal32.h index eeaa97559bab..81abea0b7650 100644 --- a/arch/arm64/include/asm/signal32.h +++ b/arch/arm64/include/asm/signal32.h @@ -22,8 +22,6 @@ #define AARCH32_KERN_SIGRET_CODE_OFFSET 0x500 -extern const compat_ulong_t aarch32_sigret_code[6]; - int compat_setup_frame(int usig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs); int compat_setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, diff --git a/arch/arm64/include/asm/simd.h b/arch/arm64/include/asm/simd.h new file mode 100644 index 000000000000..fa8b3fe932e6 --- /dev/null +++ b/arch/arm64/include/asm/simd.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#ifndef __ASM_SIMD_H +#define __ASM_SIMD_H + +#include <linux/compiler.h> +#include <linux/irqflags.h> +#include <linux/percpu.h> +#include <linux/preempt.h> +#include <linux/types.h> + +#ifdef CONFIG_KERNEL_MODE_NEON + +DECLARE_PER_CPU(bool, kernel_neon_busy); + +/* + * may_use_simd - whether it is allowable at this time to issue SIMD + * instructions or access the SIMD register file + * + * Callers must not assume that the result remains true beyond the next + * preempt_enable() or return from softirq context. + */ +static __must_check inline bool may_use_simd(void) +{ + /* + * The raw_cpu_read() is racy if called with preemption enabled. + * This is not a bug: kernel_neon_busy is only set when + * preemption is disabled, so we cannot migrate to another CPU + * while it is set, nor can we migrate to a CPU where it is set. + * So, if we find it clear on some CPU then we're guaranteed to + * find it clear on any CPU we could migrate to. + * + * If we are in between kernel_neon_begin()...kernel_neon_end(), + * the flag will be set, but preemption is also disabled, so we + * can't migrate to another CPU and spuriously see it become + * false. + */ + return !in_irq() && !irqs_disabled() && !in_nmi() && + !raw_cpu_read(kernel_neon_busy); +} + +#else /* ! CONFIG_KERNEL_MODE_NEON */ + +static __must_check inline bool may_use_simd(void) { + return false; +} + +#endif /* ! CONFIG_KERNEL_MODE_NEON */ + +#endif diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 55f08c5acfad..f82b447bd34f 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -148,7 +148,7 @@ static inline void cpu_panic_kernel(void) */ bool cpus_are_stuck_in_kernel(void); -extern void smp_send_crash_stop(void); +extern void crash_smp_send_stop(void); extern bool smp_crash_stop_failed(void); #endif /* ifndef __ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index cae331d553f8..95ad7102b63c 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -26,58 +26,6 @@ * The memory barriers are implicit with the load-acquire and store-release * instructions. */ -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - unsigned int tmp; - arch_spinlock_t lockval; - u32 owner; - - /* - * Ensure prior spin_lock operations to other locks have completed - * on this CPU before we test whether "lock" is locked. - */ - smp_mb(); - owner = READ_ONCE(lock->owner) << 16; - - asm volatile( -" sevl\n" -"1: wfe\n" -"2: ldaxr %w0, %2\n" - /* Is the lock free? */ -" eor %w1, %w0, %w0, ror #16\n" -" cbz %w1, 3f\n" - /* Lock taken -- has there been a subsequent unlock->lock transition? */ -" eor %w1, %w3, %w0, lsl #16\n" -" cbz %w1, 1b\n" - /* - * The owner has been updated, so there was an unlock->lock - * transition that we missed. That means we can rely on the - * store-release of the unlock operation paired with the - * load-acquire of the lock operation to publish any of our - * previous stores to the new lock owner and therefore don't - * need to bother with the writeback below. - */ -" b 4f\n" -"3:\n" - /* - * Serialise against any concurrent lockers by writing back the - * unlocked lock value - */ - ARM64_LSE_ATOMIC_INSN( - /* LL/SC */ -" stxr %w1, %w0, %2\n" - __nops(2), - /* LSE atomics */ -" mov %w1, %w0\n" -" cas %w0, %w0, %2\n" -" eor %w1, %w1, %w0\n") - /* Somebody else wrote to the lock, GOTO 10 and reload the value */ -" cbnz %w1, 2b\n" -"4:" - : "=&r" (lockval), "=&r" (tmp), "+Q" (*lock) - : "r" (owner) - : "memory"); -} #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) @@ -176,7 +124,11 @@ static inline int arch_spin_value_unlocked(arch_spinlock_t lock) static inline int arch_spin_is_locked(arch_spinlock_t *lock) { - smp_mb(); /* See arch_spin_unlock_wait */ + /* + * Ensure prior spin_lock operations to other locks have completed + * on this CPU before we test whether "lock" is locked. + */ + smp_mb(); /* ^^^ */ return !arch_spin_value_unlocked(READ_ONCE(*lock)); } @@ -358,14 +310,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) #define arch_read_relax(lock) cpu_relax() #define arch_write_relax(lock) cpu_relax() -/* - * Accesses appearing in program order before a spin_lock() operation - * can be reordered with accesses inside the critical section, by virtue - * of arch_spin_lock being constructed using acquire semantics. - * - * In cases where this is problematic (e.g. try_to_wake_up), an - * smp_mb__before_spinlock() can restore the required ordering. - */ -#define smp_mb__before_spinlock() smp_mb() +/* See include/linux/spinlock.h */ +#define smp_mb__after_spinlock() smp_mb() #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 5b6eafccc5d8..6ad30776e984 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -16,11 +16,15 @@ #ifndef __ASM_STACKTRACE_H #define __ASM_STACKTRACE_H -struct task_struct; +#include <linux/percpu.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> + +#include <asm/memory.h> +#include <asm/ptrace.h> struct stackframe { unsigned long fp; - unsigned long sp; unsigned long pc; #ifdef CONFIG_FUNCTION_GRAPH_TRACER unsigned int graph; @@ -32,4 +36,57 @@ extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data); extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk); +DECLARE_PER_CPU(unsigned long *, irq_stack_ptr); + +static inline bool on_irq_stack(unsigned long sp) +{ + unsigned long low = (unsigned long)raw_cpu_read(irq_stack_ptr); + unsigned long high = low + IRQ_STACK_SIZE; + + if (!low) + return false; + + return (low <= sp && sp < high); +} + +static inline bool on_task_stack(struct task_struct *tsk, unsigned long sp) +{ + unsigned long low = (unsigned long)task_stack_page(tsk); + unsigned long high = low + THREAD_SIZE; + + return (low <= sp && sp < high); +} + +#ifdef CONFIG_VMAP_STACK +DECLARE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack); + +static inline bool on_overflow_stack(unsigned long sp) +{ + unsigned long low = (unsigned long)raw_cpu_ptr(overflow_stack); + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return (low <= sp && sp < high); +} +#else +static inline bool on_overflow_stack(unsigned long sp) { return false; } +#endif + +/* + * We can only safely access per-cpu stacks from current in a non-preemptible + * context. + */ +static inline bool on_accessible_stack(struct task_struct *tsk, unsigned long sp) +{ + if (on_task_stack(tsk, sp)) + return true; + if (tsk != current || preemptible()) + return false; + if (on_irq_stack(sp)) + return true; + if (on_overflow_stack(sp)) + return true; + + return false; +} + #endif /* __ASM_STACKTRACE_H */ diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index d0aa42907569..dd95d33a5bd5 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -52,6 +52,10 @@ extern void *__memset(void *, int, __kernel_size_t); #define __HAVE_ARCH_MEMCMP extern int memcmp(const void *, const void *, size_t); +#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE +#define __HAVE_ARCH_MEMCPY_FLUSHCACHE +void memcpy_flushcache(void *dst, const void *src, size_t cnt); +#endif #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 248339e4aaf5..f707fed5886f 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -329,6 +329,7 @@ #define ID_AA64ISAR1_LRCPC_SHIFT 20 #define ID_AA64ISAR1_FCMA_SHIFT 16 #define ID_AA64ISAR1_JSCVT_SHIFT 12 +#define ID_AA64ISAR1_DPB_SHIFT 0 /* id_aa64pfr0 */ #define ID_AA64PFR0_GIC_SHIFT 24 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 46c3b93cf865..ddded6497a8a 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -23,19 +23,11 @@ #include <linux/compiler.h> -#ifdef CONFIG_ARM64_4K_PAGES -#define THREAD_SIZE_ORDER 2 -#elif defined(CONFIG_ARM64_16K_PAGES) -#define THREAD_SIZE_ORDER 0 -#endif - -#define THREAD_SIZE 16384 -#define THREAD_START_SP (THREAD_SIZE - 16) - #ifndef __ASSEMBLY__ struct task_struct; +#include <asm/memory.h> #include <asm/stack_pointer.h> #include <asm/types.h> @@ -68,6 +60,9 @@ struct thread_info { #define thread_saved_fp(tsk) \ ((unsigned long)(tsk->thread.cpu_context.fp)) +void arch_setup_new_exec(void); +#define arch_setup_new_exec arch_setup_new_exec + #endif /* @@ -86,6 +81,7 @@ struct thread_info { #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ +#define TIF_FSCHECK 5 /* Check FS is USER_DS on return */ #define TIF_NOHZ 7 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 @@ -107,11 +103,12 @@ struct thread_info { #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_UPROBE (1 << TIF_UPROBE) +#define _TIF_FSCHECK (1 << TIF_FSCHECK) #define _TIF_32BIT (1 << TIF_32BIT) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ - _TIF_UPROBE) + _TIF_UPROBE | _TIF_FSCHECK) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h index 02e9035b0685..d131501c6222 100644 --- a/arch/arm64/include/asm/traps.h +++ b/arch/arm64/include/asm/traps.h @@ -37,18 +37,11 @@ void unregister_undef_hook(struct undef_hook *hook); void arm64_notify_segfault(struct pt_regs *regs, unsigned long addr); -#ifdef CONFIG_FUNCTION_GRAPH_TRACER static inline int __in_irqentry_text(unsigned long ptr) { return ptr >= (unsigned long)&__irqentry_text_start && ptr < (unsigned long)&__irqentry_text_end; } -#else -static inline int __in_irqentry_text(unsigned long ptr) -{ - return 0; -} -#endif static inline int in_exception_text(unsigned long ptr) { @@ -60,4 +53,9 @@ static inline int in_exception_text(unsigned long ptr) return in ? : __in_irqentry_text(ptr); } +static inline int in_entry_text(unsigned long ptr) +{ + return ptr >= (unsigned long)&__entry_text_start && + ptr < (unsigned long)&__entry_text_end; +} #endif diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index fab46a0ea223..fc0f9eb66039 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -45,6 +45,9 @@ static inline void set_fs(mm_segment_t fs) { current_thread_info()->addr_limit = fs; + /* On user-mode return, check fs is correct */ + set_thread_flag(TIF_FSCHECK); + /* * Enable/disable UAO so that copy_to_user() etc can access * kernel memory with the unprivileged instructions. @@ -347,4 +350,16 @@ extern long strncpy_from_user(char *dest, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); +#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE +struct page; +void memcpy_page_flushcache(char *to, struct page *page, size_t offset, size_t len); +extern unsigned long __must_check __copy_user_flushcache(void *to, const void __user *from, unsigned long n); + +static inline int __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) +{ + kasan_check_write(dst, size); + return __copy_user_flushcache(dst, src, size); +} +#endif + #endif /* __ASM_UACCESS_H */ diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 4e187ce2a811..4b9344cba83a 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -35,5 +35,6 @@ #define HWCAP_JSCVT (1 << 13) #define HWCAP_FCMA (1 << 14) #define HWCAP_LRCPC (1 << 15) +#define HWCAP_DCPOP (1 << 16) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index e25c11e727fe..b3162715ed78 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -95,7 +95,7 @@ static int __init dt_scan_depth1_nodes(unsigned long node, * __acpi_map_table() will be called before page_init(), so early_ioremap() * or early_memremap() should be called here to for ACPI table mapping. */ -char *__init __acpi_map_table(unsigned long phys, unsigned long size) +void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size) { if (!size) return NULL; @@ -103,7 +103,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) return early_memremap(phys, size); } -void __init __acpi_unmap_table(char *map, unsigned long size) +void __init __acpi_unmap_table(void __iomem *map, unsigned long size) { if (!map || !size) return; diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index b3bb7ef97bc8..71bf088f1e4b 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -75,6 +75,7 @@ int main(void) DEFINE(S_ORIG_X0, offsetof(struct pt_regs, orig_x0)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); DEFINE(S_ORIG_ADDR_LIMIT, offsetof(struct pt_regs, orig_addr_limit)); + DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); BLANK(); DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9f9e0064c8c1..cd52d365d1f0 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -120,6 +120,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_LRCPC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_FCMA_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_JSCVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_DPB_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -888,6 +889,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .min_field_value = 0, .matches = has_no_fpsimd, }, +#ifdef CONFIG_ARM64_PMEM + { + .desc = "Data cache clean to Point of Persistence", + .capability = ARM64_HAS_DCPOP, + .def_scope = SCOPE_SYSTEM, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64ISAR1_EL1, + .field_pos = ID_AA64ISAR1_DPB_SHIFT, + .min_field_value = 1, + }, +#endif {}, }; @@ -916,6 +928,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_FPHP), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_ASIMD), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_ASIMDHP), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_DCPOP), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_JSCVT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_JSCVT), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_FCMA), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_LRCPC), diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index f495ee5049fd..311885962830 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -68,6 +68,7 @@ static const char *const hwcap_str[] = { "jscvt", "fcma", "lrcpc", + "dcpop", NULL }; diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index c44a82f146b1..6a27cd6dbfa6 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -41,27 +41,3 @@ ENTRY(fpsimd_load_state) fpsimd_restore x0, 8 ret ENDPROC(fpsimd_load_state) - -#ifdef CONFIG_KERNEL_MODE_NEON - -/* - * Save the bottom n FP registers. - * - * x0 - pointer to struct fpsimd_partial_state - */ -ENTRY(fpsimd_save_partial_state) - fpsimd_save_partial x0, 1, 8, 9 - ret -ENDPROC(fpsimd_save_partial_state) - -/* - * Load the bottom n FP registers. - * - * x0 - pointer to struct fpsimd_partial_state - */ -ENTRY(fpsimd_load_partial_state) - fpsimd_restore_partial x0, 8, 9 - ret -ENDPROC(fpsimd_load_partial_state) - -#endif diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index b738880350f9..e1c59d4008a8 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -69,8 +69,55 @@ #define BAD_FIQ 2 #define BAD_ERROR 3 - .macro kernel_entry, el, regsize = 64 + .macro kernel_ventry label + .align 7 sub sp, sp, #S_FRAME_SIZE +#ifdef CONFIG_VMAP_STACK + /* + * Test whether the SP has overflowed, without corrupting a GPR. + * Task and IRQ stacks are aligned to (1 << THREAD_SHIFT). + */ + add sp, sp, x0 // sp' = sp + x0 + sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp + tbnz x0, #THREAD_SHIFT, 0f + sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 + sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp + b \label + +0: + /* + * Either we've just detected an overflow, or we've taken an exception + * while on the overflow stack. Either way, we won't return to + * userspace, and can clobber EL0 registers to free up GPRs. + */ + + /* Stash the original SP (minus S_FRAME_SIZE) in tpidr_el0. */ + msr tpidr_el0, x0 + + /* Recover the original x0 value and stash it in tpidrro_el0 */ + sub x0, sp, x0 + msr tpidrro_el0, x0 + + /* Switch to the overflow stack */ + adr_this_cpu sp, overflow_stack + OVERFLOW_STACK_SIZE, x0 + + /* + * Check whether we were already on the overflow stack. This may happen + * after panic() re-enables interrupts. + */ + mrs x0, tpidr_el0 // sp of interrupted context + sub x0, sp, x0 // delta with top of overflow stack + tst x0, #~(OVERFLOW_STACK_SIZE - 1) // within range? + b.ne __bad_stack // no? -> bad stack pointer + + /* We were already on the overflow stack. Restore sp/x0 and carry on. */ + sub sp, sp, x0 + mrs x0, tpidrro_el0 +#endif + b \label + .endm + + .macro kernel_entry, el, regsize = 64 .if \regsize == 32 mov w0, w0 // zero upper 32 bits of x0 .endif @@ -111,6 +158,18 @@ mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] + /* + * In order to be able to dump the contents of struct pt_regs at the + * time the exception was taken (in case we attempt to walk the call + * stack later), chain it together with the stack frames. + */ + .if \el == 0 + stp xzr, xzr, [sp, #S_STACKFRAME] + .else + stp x29, x22, [sp, #S_STACKFRAME] + .endif + add x29, sp, #S_STACKFRAME + #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* * Set the TTBR0 PAN bit in SPSR. When the exception is taken from @@ -138,12 +197,10 @@ alternative_else_nop_endif stp x22, x23, [sp, #S_PC] - /* - * Set syscallno to -1 by default (overridden later if real syscall). - */ + /* Not in a syscall by default (el0_svc overwrites for real syscall) */ .if \el == 0 - mvn x21, xzr - str x21, [sp, #S_SYSCALLNO] + mov w21, #NO_SYSCALL + str w21, [sp, #S_SYSCALLNO] .endif /* @@ -259,20 +316,12 @@ alternative_else_nop_endif and x25, x25, #~(THREAD_SIZE - 1) cbnz x25, 9998f - adr_this_cpu x25, irq_stack, x26 - mov x26, #IRQ_STACK_START_SP + ldr_this_cpu x25, irq_stack_ptr, x26 + mov x26, #IRQ_STACK_SIZE add x26, x25, x26 /* switch to the irq stack */ mov sp, x26 - - /* - * Add a dummy stack frame, this non-standard format is fixed up - * by unwind_frame() - */ - stp x29, x19, [sp, #-16]! - mov x29, sp - 9998: .endm @@ -290,8 +339,9 @@ alternative_else_nop_endif * * x7 is reserved for the system call number in 32-bit mode. */ -sc_nr .req x25 // number of system calls -scno .req x26 // syscall number +wsc_nr .req w25 // number of system calls +wscno .req w26 // syscall number +xscno .req x26 // syscall number (zero-extended) stbl .req x27 // syscall table pointer tsk .req x28 // current thread_info @@ -315,34 +365,62 @@ tsk .req x28 // current thread_info .align 11 ENTRY(vectors) - ventry el1_sync_invalid // Synchronous EL1t - ventry el1_irq_invalid // IRQ EL1t - ventry el1_fiq_invalid // FIQ EL1t - ventry el1_error_invalid // Error EL1t + kernel_ventry el1_sync_invalid // Synchronous EL1t + kernel_ventry el1_irq_invalid // IRQ EL1t + kernel_ventry el1_fiq_invalid // FIQ EL1t + kernel_ventry el1_error_invalid // Error EL1t - ventry el1_sync // Synchronous EL1h - ventry el1_irq // IRQ EL1h - ventry el1_fiq_invalid // FIQ EL1h - ventry el1_error_invalid // Error EL1h + kernel_ventry el1_sync // Synchronous EL1h + kernel_ventry el1_irq // IRQ EL1h + kernel_ventry el1_fiq_invalid // FIQ EL1h + kernel_ventry el1_error_invalid // Error EL1h - ventry el0_sync // Synchronous 64-bit EL0 - ventry el0_irq // IRQ 64-bit EL0 - ventry el0_fiq_invalid // FIQ 64-bit EL0 - ventry el0_error_invalid // Error 64-bit EL0 + kernel_ventry el0_sync // Synchronous 64-bit EL0 + kernel_ventry el0_irq // IRQ 64-bit EL0 + kernel_ventry el0_fiq_invalid // FIQ 64-bit EL0 + kernel_ventry el0_error_invalid // Error 64-bit EL0 #ifdef CONFIG_COMPAT - ventry el0_sync_compat // Synchronous 32-bit EL0 - ventry el0_irq_compat // IRQ 32-bit EL0 - ventry el0_fiq_invalid_compat // FIQ 32-bit EL0 - ventry el0_error_invalid_compat // Error 32-bit EL0 + kernel_ventry el0_sync_compat // Synchronous 32-bit EL0 + kernel_ventry el0_irq_compat // IRQ 32-bit EL0 + kernel_ventry el0_fiq_invalid_compat // FIQ 32-bit EL0 + kernel_ventry el0_error_invalid_compat // Error 32-bit EL0 #else - ventry el0_sync_invalid // Synchronous 32-bit EL0 - ventry el0_irq_invalid // IRQ 32-bit EL0 - ventry el0_fiq_invalid // FIQ 32-bit EL0 - ventry el0_error_invalid // Error 32-bit EL0 + kernel_ventry el0_sync_invalid // Synchronous 32-bit EL0 + kernel_ventry el0_irq_invalid // IRQ 32-bit EL0 + kernel_ventry el0_fiq_invalid // FIQ 32-bit EL0 + kernel_ventry el0_error_invalid // Error 32-bit EL0 #endif END(vectors) +#ifdef CONFIG_VMAP_STACK + /* + * We detected an overflow in kernel_ventry, which switched to the + * overflow stack. Stash the exception regs, and head to our overflow + * handler. + */ +__bad_stack: + /* Restore the original x0 value */ + mrs x0, tpidrro_el0 + + /* + * Store the original GPRs to the new stack. The orginal SP (minus + * S_FRAME_SIZE) was stashed in tpidr_el0 by kernel_ventry. + */ + sub sp, sp, #S_FRAME_SIZE + kernel_entry 1 + mrs x0, tpidr_el0 + add x0, x0, #S_FRAME_SIZE + str x0, [sp, #S_SP] + + /* Stash the regs for handle_bad_stack */ + mov x0, sp + + /* Time to die */ + bl handle_bad_stack + ASM_BUG() +#endif /* CONFIG_VMAP_STACK */ + /* * Invalid mode handlers */ @@ -351,7 +429,8 @@ END(vectors) mov x0, sp mov x1, #\reason mrs x2, esr_el1 - b bad_mode + bl bad_mode + ASM_BUG() .endm el0_sync_invalid: @@ -448,14 +527,16 @@ el1_sp_pc: mrs x0, far_el1 enable_dbg mov x2, sp - b do_sp_pc_abort + bl do_sp_pc_abort + ASM_BUG() el1_undef: /* * Undefined instruction */ enable_dbg mov x0, sp - b do_undefinstr + bl do_undefinstr + ASM_BUG() el1_dbg: /* * Debug exception handling @@ -473,7 +554,8 @@ el1_inv: mov x0, sp mov x2, x1 mov x1, #BAD_SYNC - b bad_mode + bl bad_mode + ASM_BUG() ENDPROC(el1_sync) .align 6 @@ -577,8 +659,8 @@ el0_svc_compat: * AArch32 syscall handling */ adrp stbl, compat_sys_call_table // load compat syscall table pointer - uxtw scno, w7 // syscall number in w7 (r7) - mov sc_nr, #__NR_compat_syscalls + mov wscno, w7 // syscall number in w7 (r7) + mov wsc_nr, #__NR_compat_syscalls b el0_svc_naked .align 6 @@ -707,38 +789,6 @@ el0_irq_naked: ENDPROC(el0_irq) /* - * Register switch for AArch64. The callee-saved registers need to be saved - * and restored. On entry: - * x0 = previous task_struct (must be preserved across the switch) - * x1 = next task_struct - * Previous and next are guaranteed not to be the same. - * - */ -ENTRY(cpu_switch_to) - mov x10, #THREAD_CPU_CONTEXT - add x8, x0, x10 - mov x9, sp - stp x19, x20, [x8], #16 // store callee-saved registers - stp x21, x22, [x8], #16 - stp x23, x24, [x8], #16 - stp x25, x26, [x8], #16 - stp x27, x28, [x8], #16 - stp x29, x9, [x8], #16 - str lr, [x8] - add x8, x1, x10 - ldp x19, x20, [x8], #16 // restore callee-saved registers - ldp x21, x22, [x8], #16 - ldp x23, x24, [x8], #16 - ldp x25, x26, [x8], #16 - ldp x27, x28, [x8], #16 - ldp x29, x9, [x8], #16 - ldr lr, [x8] - mov sp, x9 - msr sp_el0, x1 - ret -ENDPROC(cpu_switch_to) - -/* * This is the fast syscall return path. We do as little as possible here, * and this includes saving x0 back into the kernel stack. */ @@ -781,36 +831,24 @@ finish_ret_to_user: ENDPROC(ret_to_user) /* - * This is how we return from a fork. - */ -ENTRY(ret_from_fork) - bl schedule_tail - cbz x19, 1f // not a kernel thread - mov x0, x20 - blr x19 -1: get_thread_info tsk - b ret_to_user -ENDPROC(ret_from_fork) - -/* * SVC handler. */ .align 6 el0_svc: adrp stbl, sys_call_table // load syscall table pointer - uxtw scno, w8 // syscall number in w8 - mov sc_nr, #__NR_syscalls + mov wscno, w8 // syscall number in w8 + mov wsc_nr, #__NR_syscalls el0_svc_naked: // compat entry point - stp x0, scno, [sp, #S_ORIG_X0] // save the original x0 and syscall number + stp x0, xscno, [sp, #S_ORIG_X0] // save the original x0 and syscall number enable_dbg_and_irq ct_user_exit 1 ldr x16, [tsk, #TSK_TI_FLAGS] // check for syscall hooks tst x16, #_TIF_SYSCALL_WORK b.ne __sys_trace - cmp scno, sc_nr // check upper syscall limit + cmp wscno, wsc_nr // check upper syscall limit b.hs ni_sys - ldr x16, [stbl, scno, lsl #3] // address in the syscall table + ldr x16, [stbl, xscno, lsl #3] // address in the syscall table blr x16 // call sys_* routine b ret_fast_syscall ni_sys: @@ -824,24 +862,23 @@ ENDPROC(el0_svc) * switches, and waiting for our parent to respond. */ __sys_trace: - mov w0, #-1 // set default errno for - cmp scno, x0 // user-issued syscall(-1) + cmp wscno, #NO_SYSCALL // user-issued syscall(-1)? b.ne 1f - mov x0, #-ENOSYS + mov x0, #-ENOSYS // set default errno if so str x0, [sp, #S_X0] 1: mov x0, sp bl syscall_trace_enter - cmp w0, #-1 // skip the syscall? + cmp w0, #NO_SYSCALL // skip the syscall? b.eq __sys_trace_return_skipped - uxtw scno, w0 // syscall number (possibly new) + mov wscno, w0 // syscall number (possibly new) mov x1, sp // pointer to regs - cmp scno, sc_nr // check upper syscall limit + cmp wscno, wsc_nr // check upper syscall limit b.hs __ni_sys_trace ldp x0, x1, [sp] // restore the syscall args ldp x2, x3, [sp, #S_X2] ldp x4, x5, [sp, #S_X4] ldp x6, x7, [sp, #S_X6] - ldr x16, [stbl, scno, lsl #3] // address in the syscall table + ldr x16, [stbl, xscno, lsl #3] // address in the syscall table blr x16 // call sys_* routine __sys_trace_return: @@ -865,3 +902,49 @@ ENTRY(sys_rt_sigreturn_wrapper) mov x0, sp b sys_rt_sigreturn ENDPROC(sys_rt_sigreturn_wrapper) + +/* + * Register switch for AArch64. The callee-saved registers need to be saved + * and restored. On entry: + * x0 = previous task_struct (must be preserved across the switch) + * x1 = next task_struct + * Previous and next are guaranteed not to be the same. + * + */ +ENTRY(cpu_switch_to) + mov x10, #THREAD_CPU_CONTEXT + add x8, x0, x10 + mov x9, sp + stp x19, x20, [x8], #16 // store callee-saved registers + stp x21, x22, [x8], #16 + stp x23, x24, [x8], #16 + stp x25, x26, [x8], #16 + stp x27, x28, [x8], #16 + stp x29, x9, [x8], #16 + str lr, [x8] + add x8, x1, x10 + ldp x19, x20, [x8], #16 // restore callee-saved registers + ldp x21, x22, [x8], #16 + ldp x23, x24, [x8], #16 + ldp x25, x26, [x8], #16 + ldp x27, x28, [x8], #16 + ldp x29, x9, [x8], #16 + ldr lr, [x8] + mov sp, x9 + msr sp_el0, x1 + ret +ENDPROC(cpu_switch_to) +NOKPROBE(cpu_switch_to) + +/* + * This is how we return from a fork. + */ +ENTRY(ret_from_fork) + bl schedule_tail + cbz x19, 1f // not a kernel thread + mov x0, x20 + blr x19 +1: get_thread_info tsk + b ret_to_user +ENDPROC(ret_from_fork) +NOKPROBE(ret_from_fork) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index c7b4995868e1..3a68cf38a6b3 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -17,16 +17,19 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/bottom_half.h> #include <linux/cpu.h> #include <linux/cpu_pm.h> #include <linux/kernel.h> #include <linux/init.h> +#include <linux/percpu.h> +#include <linux/preempt.h> #include <linux/sched/signal.h> #include <linux/signal.h> -#include <linux/hardirq.h> #include <asm/fpsimd.h> #include <asm/cputype.h> +#include <asm/simd.h> #define FPEXC_IOF (1 << 0) #define FPEXC_DZF (1 << 1) @@ -62,6 +65,13 @@ * CPU currently contain the most recent userland FPSIMD state of the current * task. * + * In order to allow softirq handlers to use FPSIMD, kernel_neon_begin() may + * save the task's FPSIMD context back to task_struct from softirq context. + * To prevent this from racing with the manipulation of the task's FPSIMD state + * from task context and thereby corrupting the state, it is necessary to + * protect any manipulation of a task's fpsimd_state or TIF_FOREIGN_FPSTATE + * flag with local_bh_disable() unless softirqs are already masked. + * * For a certain task, the sequence may look something like this: * - the task gets scheduled in; if both the task's fpsimd_state.cpu field * contains the id of the current CPU, and the CPU's fpsimd_last_state per-cpu @@ -161,11 +171,14 @@ void fpsimd_flush_thread(void) { if (!system_supports_fpsimd()) return; - preempt_disable(); + + local_bh_disable(); + memset(¤t->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); fpsimd_flush_task_state(current); set_thread_flag(TIF_FOREIGN_FPSTATE); - preempt_enable(); + + local_bh_enable(); } /* @@ -176,10 +189,13 @@ void fpsimd_preserve_current_state(void) { if (!system_supports_fpsimd()) return; - preempt_disable(); + + local_bh_disable(); + if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) fpsimd_save_state(¤t->thread.fpsimd_state); - preempt_enable(); + + local_bh_enable(); } /* @@ -191,15 +207,18 @@ void fpsimd_restore_current_state(void) { if (!system_supports_fpsimd()) return; - preempt_disable(); + + local_bh_disable(); + if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { struct fpsimd_state *st = ¤t->thread.fpsimd_state; fpsimd_load_state(st); - this_cpu_write(fpsimd_last_state, st); + __this_cpu_write(fpsimd_last_state, st); st->cpu = smp_processor_id(); } - preempt_enable(); + + local_bh_enable(); } /* @@ -211,15 +230,18 @@ void fpsimd_update_current_state(struct fpsimd_state *state) { if (!system_supports_fpsimd()) return; - preempt_disable(); + + local_bh_disable(); + fpsimd_load_state(state); if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { struct fpsimd_state *st = ¤t->thread.fpsimd_state; - this_cpu_write(fpsimd_last_state, st); + __this_cpu_write(fpsimd_last_state, st); st->cpu = smp_processor_id(); } - preempt_enable(); + + local_bh_enable(); } /* @@ -232,52 +254,122 @@ void fpsimd_flush_task_state(struct task_struct *t) #ifdef CONFIG_KERNEL_MODE_NEON -static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate); -static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate); +DEFINE_PER_CPU(bool, kernel_neon_busy); +EXPORT_PER_CPU_SYMBOL(kernel_neon_busy); /* * Kernel-side NEON support functions */ -void kernel_neon_begin_partial(u32 num_regs) + +/* + * kernel_neon_begin(): obtain the CPU FPSIMD registers for use by the calling + * context + * + * Must not be called unless may_use_simd() returns true. + * Task context in the FPSIMD registers is saved back to memory as necessary. + * + * A matching call to kernel_neon_end() must be made before returning from the + * calling context. + * + * The caller may freely use the FPSIMD registers until kernel_neon_end() is + * called. + */ +void kernel_neon_begin(void) { if (WARN_ON(!system_supports_fpsimd())) return; - if (in_interrupt()) { - struct fpsimd_partial_state *s = this_cpu_ptr( - in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); - BUG_ON(num_regs > 32); - fpsimd_save_partial_state(s, roundup(num_regs, 2)); - } else { - /* - * Save the userland FPSIMD state if we have one and if we - * haven't done so already. Clear fpsimd_last_state to indicate - * that there is no longer userland FPSIMD state in the - * registers. - */ - preempt_disable(); - if (current->mm && - !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) - fpsimd_save_state(¤t->thread.fpsimd_state); - this_cpu_write(fpsimd_last_state, NULL); - } + BUG_ON(!may_use_simd()); + + local_bh_disable(); + + __this_cpu_write(kernel_neon_busy, true); + + /* Save unsaved task fpsimd state, if any: */ + if (current->mm && !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) + fpsimd_save_state(¤t->thread.fpsimd_state); + + /* Invalidate any task state remaining in the fpsimd regs: */ + __this_cpu_write(fpsimd_last_state, NULL); + + preempt_disable(); + + local_bh_enable(); } -EXPORT_SYMBOL(kernel_neon_begin_partial); +EXPORT_SYMBOL(kernel_neon_begin); +/* + * kernel_neon_end(): give the CPU FPSIMD registers back to the current task + * + * Must be called from a context in which kernel_neon_begin() was previously + * called, with no call to kernel_neon_end() in the meantime. + * + * The caller must not use the FPSIMD registers after this function is called, + * unless kernel_neon_begin() is called again in the meantime. + */ void kernel_neon_end(void) { + bool busy; + if (!system_supports_fpsimd()) return; - if (in_interrupt()) { - struct fpsimd_partial_state *s = this_cpu_ptr( - in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); - fpsimd_load_partial_state(s); - } else { - preempt_enable(); - } + + busy = __this_cpu_xchg(kernel_neon_busy, false); + WARN_ON(!busy); /* No matching kernel_neon_begin()? */ + + preempt_enable(); } EXPORT_SYMBOL(kernel_neon_end); +static DEFINE_PER_CPU(struct fpsimd_state, efi_fpsimd_state); +static DEFINE_PER_CPU(bool, efi_fpsimd_state_used); + +/* + * EFI runtime services support functions + * + * The ABI for EFI runtime services allows EFI to use FPSIMD during the call. + * This means that for EFI (and only for EFI), we have to assume that FPSIMD + * is always used rather than being an optional accelerator. + * + * These functions provide the necessary support for ensuring FPSIMD + * save/restore in the contexts from which EFI is used. + * + * Do not use them for any other purpose -- if tempted to do so, you are + * either doing something wrong or you need to propose some refactoring. + */ + +/* + * __efi_fpsimd_begin(): prepare FPSIMD for making an EFI runtime services call + */ +void __efi_fpsimd_begin(void) +{ + if (!system_supports_fpsimd()) + return; + + WARN_ON(preemptible()); + + if (may_use_simd()) + kernel_neon_begin(); + else { + fpsimd_save_state(this_cpu_ptr(&efi_fpsimd_state)); + __this_cpu_write(efi_fpsimd_state_used, true); + } +} + +/* + * __efi_fpsimd_end(): clean up FPSIMD after an EFI runtime services call + */ +void __efi_fpsimd_end(void) +{ + if (!system_supports_fpsimd()) + return; + + if (__this_cpu_xchg(efi_fpsimd_state_used, false)) + fpsimd_load_state(this_cpu_ptr(&efi_fpsimd_state)); + else + kernel_neon_end(); +} + #endif /* CONFIG_KERNEL_MODE_NEON */ #ifdef CONFIG_CPU_PM diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index adb0910b88f5..7434ec0c7a27 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -143,8 +143,8 @@ preserve_boot_args: dmb sy // needed before dc ivac with // MMU off - add x1, x0, #0x20 // 4 x 8 bytes - b __inval_cache_range // tail call + mov x1, #0x20 // 4 x 8 bytes + b __inval_dcache_area // tail call ENDPROC(preserve_boot_args) /* @@ -221,20 +221,20 @@ __create_page_tables: * dirty cache lines being evicted. */ adrp x0, idmap_pg_dir - adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE - bl __inval_cache_range + ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) + bl __inval_dcache_area /* * Clear the idmap and swapper page tables. */ adrp x0, idmap_pg_dir - adrp x6, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE + ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) 1: stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 - cmp x0, x6 - b.lo 1b + subs x1, x1, #64 + b.ne 1b mov x7, SWAPPER_MM_MMUFLAGS @@ -307,9 +307,9 @@ __create_page_tables: * tables again to remove any speculatively loaded cache lines. */ adrp x0, idmap_pg_dir - adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE + ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) dmb sy - bl __inval_cache_range + bl __inval_dcache_area ret x28 ENDPROC(__create_page_tables) @@ -361,6 +361,9 @@ __primary_switched: ret // to __primary_switch() 0: #endif + add sp, sp, #16 + mov x29, #0 + mov x30, #0 b start_kernel ENDPROC(__primary_switched) @@ -616,6 +619,7 @@ __secondary_switched: ldr x2, [x0, #CPU_BOOT_TASK] msr sp_el0, x2 mov x29, #0 + mov x30, #0 b secondary_start_kernel ENDPROC(__secondary_switched) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index a44e13942d30..095d3c170f5d 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -330,7 +330,7 @@ static void _copy_pte(pte_t *dst_pte, pte_t *src_pte, unsigned long addr) * read only (code, rodata). Clear the RDONLY bit from * the temporary mappings we use during restore. */ - set_pte(dst_pte, pte_clear_rdonly(pte)); + set_pte(dst_pte, pte_mkwrite(pte)); } else if (debug_pagealloc_enabled() && !pte_none(pte)) { /* * debug_pagealloc will removed the PTE_VALID bit if @@ -343,7 +343,7 @@ static void _copy_pte(pte_t *dst_pte, pte_t *src_pte, unsigned long addr) */ BUG_ON(!pfn_valid(pte_pfn(pte))); - set_pte(dst_pte, pte_mkpresent(pte_clear_rdonly(pte))); + set_pte(dst_pte, pte_mkpresent(pte_mkwrite(pte))); } } diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 2386b26c0712..713561e5bcab 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -23,15 +23,16 @@ #include <linux/kernel_stat.h> #include <linux/irq.h> +#include <linux/memory.h> #include <linux/smp.h> #include <linux/init.h> #include <linux/irqchip.h> #include <linux/seq_file.h> +#include <linux/vmalloc.h> unsigned long irq_err_count; -/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. */ -DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); +DEFINE_PER_CPU(unsigned long *, irq_stack_ptr); int arch_show_interrupts(struct seq_file *p, int prec) { @@ -50,8 +51,43 @@ void __init set_handle_irq(void (*handle_irq)(struct pt_regs *)) handle_arch_irq = handle_irq; } +#ifdef CONFIG_VMAP_STACK +static void init_irq_stacks(void) +{ + int cpu; + unsigned long *p; + + for_each_possible_cpu(cpu) { + /* + * To ensure that VMAP'd stack overflow detection works + * correctly, the IRQ stacks need to have the same + * alignment as other stacks. + */ + p = __vmalloc_node_range(IRQ_STACK_SIZE, THREAD_ALIGN, + VMALLOC_START, VMALLOC_END, + THREADINFO_GFP, PAGE_KERNEL, + 0, cpu_to_node(cpu), + __builtin_return_address(0)); + + per_cpu(irq_stack_ptr, cpu) = p; + } +} +#else +/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. */ +DEFINE_PER_CPU_ALIGNED(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); + +static void init_irq_stacks(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack, cpu); +} +#endif + void __init init_IRQ(void) { + init_irq_stacks(); irqchip_init(); if (!handle_arch_irq) panic("No interrupt controller found."); diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 481f54a866c5..11121f608eb5 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -252,7 +252,7 @@ void machine_crash_shutdown(struct pt_regs *regs) local_irq_disable(); /* shutdown non-crashing cpus */ - smp_send_crash_stop(); + crash_smp_send_stop(); /* for crashing cpu */ crash_save_cpu(regs, smp_processor_id()); diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 713ca824f266..bcafd7dcfe8b 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -162,7 +162,6 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, } frame.fp = regs->regs[29]; - frame.sp = regs->sp; frame.pc = regs->pc; #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = current->curr_ret_stack; diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index b5798ba21189..9eaef51f83ff 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -202,55 +202,6 @@ static const unsigned armv8_pmuv3_perf_map[PERF_COUNT_HW_MAX] = { [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = ARMV8_PMUV3_PERFCTR_STALL_BACKEND, }; -/* ARM Cortex-A53 HW events mapping. */ -static const unsigned armv8_a53_perf_map[PERF_COUNT_HW_MAX] = { - PERF_MAP_ALL_UNSUPPORTED, - [PERF_COUNT_HW_CPU_CYCLES] = ARMV8_PMUV3_PERFCTR_CPU_CYCLES, - [PERF_COUNT_HW_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_INST_RETIRED, - [PERF_COUNT_HW_CACHE_REFERENCES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [PERF_COUNT_HW_CACHE_MISSES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED, - [PERF_COUNT_HW_BRANCH_MISSES] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [PERF_COUNT_HW_BUS_CYCLES] = ARMV8_PMUV3_PERFCTR_BUS_CYCLES, -}; - -/* ARM Cortex-A57 and Cortex-A72 events mapping. */ -static const unsigned armv8_a57_perf_map[PERF_COUNT_HW_MAX] = { - PERF_MAP_ALL_UNSUPPORTED, - [PERF_COUNT_HW_CPU_CYCLES] = ARMV8_PMUV3_PERFCTR_CPU_CYCLES, - [PERF_COUNT_HW_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_INST_RETIRED, - [PERF_COUNT_HW_CACHE_REFERENCES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [PERF_COUNT_HW_CACHE_MISSES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - [PERF_COUNT_HW_BRANCH_MISSES] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [PERF_COUNT_HW_BUS_CYCLES] = ARMV8_PMUV3_PERFCTR_BUS_CYCLES, -}; - -static const unsigned armv8_thunder_perf_map[PERF_COUNT_HW_MAX] = { - PERF_MAP_ALL_UNSUPPORTED, - [PERF_COUNT_HW_CPU_CYCLES] = ARMV8_PMUV3_PERFCTR_CPU_CYCLES, - [PERF_COUNT_HW_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_INST_RETIRED, - [PERF_COUNT_HW_CACHE_REFERENCES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [PERF_COUNT_HW_CACHE_MISSES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED, - [PERF_COUNT_HW_BRANCH_MISSES] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = ARMV8_PMUV3_PERFCTR_STALL_FRONTEND, - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = ARMV8_PMUV3_PERFCTR_STALL_BACKEND, -}; - -/* Broadcom Vulcan events mapping */ -static const unsigned armv8_vulcan_perf_map[PERF_COUNT_HW_MAX] = { - PERF_MAP_ALL_UNSUPPORTED, - [PERF_COUNT_HW_CPU_CYCLES] = ARMV8_PMUV3_PERFCTR_CPU_CYCLES, - [PERF_COUNT_HW_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_INST_RETIRED, - [PERF_COUNT_HW_CACHE_REFERENCES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [PERF_COUNT_HW_CACHE_MISSES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_BR_RETIRED, - [PERF_COUNT_HW_BRANCH_MISSES] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [PERF_COUNT_HW_BUS_CYCLES] = ARMV8_PMUV3_PERFCTR_BUS_CYCLES, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = ARMV8_PMUV3_PERFCTR_STALL_FRONTEND, - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = ARMV8_PMUV3_PERFCTR_STALL_BACKEND, -}; - static const unsigned armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = { @@ -281,27 +232,10 @@ static const unsigned armv8_a53_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = { PERF_CACHE_MAP_ALL_UNSUPPORTED, - [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [C(L1D)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, [C(L1D)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_A53_PERFCTR_PREF_LINEFILL, - [C(L1I)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE, - [C(L1I)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL, - - [C(LL)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L2D_CACHE, - [C(LL)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L2D_CACHE_REFILL, - [C(LL)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L2D_CACHE, - [C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L2D_CACHE_REFILL, - - [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL, - [C(ITLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL, - - [C(BPU)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, + [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, + [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, }; static const unsigned armv8_a57_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] @@ -314,18 +248,26 @@ static const unsigned armv8_a57_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR, - [C(L1I)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE, - [C(L1I)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL, - [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD, [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR, - [C(ITLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL, + [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, + [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, +}; - [C(BPU)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, +static const unsigned armv8_a73_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + PERF_CACHE_MAP_ALL_UNSUPPORTED, + + [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD, + [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, + + [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, + [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, + + [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, + [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, }; static const unsigned armv8_thunder_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] @@ -340,8 +282,6 @@ static const unsigned armv8_thunder_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(L1D)][C(OP_PREFETCH)][C(RESULT_ACCESS)] = ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_ACCESS, [C(L1D)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_MISS, - [C(L1I)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE, - [C(L1I)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL, [C(L1I)][C(OP_PREFETCH)][C(RESULT_ACCESS)] = ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_ACCESS, [C(L1I)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_MISS, @@ -349,13 +289,6 @@ static const unsigned armv8_thunder_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD, [C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR, [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR, - - [C(ITLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL, - - [C(BPU)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, }; static const unsigned armv8_vulcan_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] @@ -368,22 +301,11 @@ static const unsigned armv8_vulcan_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR, - [C(L1I)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE, - [C(L1I)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL, - - [C(ITLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL, - [C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB, - [C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD, [C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR, [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD, [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR, - [C(BPU)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, }; @@ -846,17 +768,14 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, struct hw_perf_event *hwc = &event->hw; unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT; - /* Always place a cycle counter into the cycle counter. */ + /* Always prefer to place a cycle counter into the cycle counter. */ if (evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) { - if (test_and_set_bit(ARMV8_IDX_CYCLE_COUNTER, cpuc->used_mask)) - return -EAGAIN; - - return ARMV8_IDX_CYCLE_COUNTER; + if (!test_and_set_bit(ARMV8_IDX_CYCLE_COUNTER, cpuc->used_mask)) + return ARMV8_IDX_CYCLE_COUNTER; } /* - * For anything other than a cycle counter, try and use - * the events counters + * Otherwise use events counters */ for (idx = ARMV8_IDX_COUNTER0; idx < cpu_pmu->num_events; ++idx) { if (!test_and_set_bit(idx, cpuc->used_mask)) @@ -924,7 +843,13 @@ static void armv8pmu_reset(void *info) ARMV8_PMU_PMCR_LC); } -static int armv8_pmuv3_map_event(struct perf_event *event) +static int __armv8_pmuv3_map_event(struct perf_event *event, + const unsigned (*extra_event_map) + [PERF_COUNT_HW_MAX], + const unsigned (*extra_cache_map) + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]) { int hw_event_id; struct arm_pmu *armpmu = to_arm_pmu(event->pmu); @@ -932,44 +857,47 @@ static int armv8_pmuv3_map_event(struct perf_event *event) hw_event_id = armpmu_map_event(event, &armv8_pmuv3_perf_map, &armv8_pmuv3_perf_cache_map, ARMV8_PMU_EVTYPE_EVENT); - if (hw_event_id < 0) - return hw_event_id; - /* disable micro/arch events not supported by this PMU */ - if ((hw_event_id < ARMV8_PMUV3_MAX_COMMON_EVENTS) && - !test_bit(hw_event_id, armpmu->pmceid_bitmap)) { - return -EOPNOTSUPP; + /* Onl expose micro/arch events supported by this PMU */ + if ((hw_event_id > 0) && (hw_event_id < ARMV8_PMUV3_MAX_COMMON_EVENTS) + && test_bit(hw_event_id, armpmu->pmceid_bitmap)) { + return hw_event_id; } - return hw_event_id; + return armpmu_map_event(event, extra_event_map, extra_cache_map, + ARMV8_PMU_EVTYPE_EVENT); +} + +static int armv8_pmuv3_map_event(struct perf_event *event) +{ + return __armv8_pmuv3_map_event(event, NULL, NULL); } static int armv8_a53_map_event(struct perf_event *event) { - return armpmu_map_event(event, &armv8_a53_perf_map, - &armv8_a53_perf_cache_map, - ARMV8_PMU_EVTYPE_EVENT); + return __armv8_pmuv3_map_event(event, NULL, &armv8_a53_perf_cache_map); } static int armv8_a57_map_event(struct perf_event *event) { - return armpmu_map_event(event, &armv8_a57_perf_map, - &armv8_a57_perf_cache_map, - ARMV8_PMU_EVTYPE_EVENT); + return __armv8_pmuv3_map_event(event, NULL, &armv8_a57_perf_cache_map); +} + +static int armv8_a73_map_event(struct perf_event *event) +{ + return __armv8_pmuv3_map_event(event, NULL, &armv8_a73_perf_cache_map); } static int armv8_thunder_map_event(struct perf_event *event) { - return armpmu_map_event(event, &armv8_thunder_perf_map, - &armv8_thunder_perf_cache_map, - ARMV8_PMU_EVTYPE_EVENT); + return __armv8_pmuv3_map_event(event, NULL, + &armv8_thunder_perf_cache_map); } static int armv8_vulcan_map_event(struct perf_event *event) { - return armpmu_map_event(event, &armv8_vulcan_perf_map, - &armv8_vulcan_perf_cache_map, - ARMV8_PMU_EVTYPE_EVENT); + return __armv8_pmuv3_map_event(event, NULL, + &armv8_vulcan_perf_cache_map); } struct armv8pmu_probe_info { @@ -1062,6 +990,22 @@ static int armv8_pmuv3_init(struct arm_pmu *cpu_pmu) return 0; } +static int armv8_a35_pmu_init(struct arm_pmu *cpu_pmu) +{ + int ret = armv8_pmu_init(cpu_pmu); + if (ret) + return ret; + + cpu_pmu->name = "armv8_cortex_a35"; + cpu_pmu->map_event = armv8_a53_map_event; + cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = + &armv8_pmuv3_events_attr_group; + cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = + &armv8_pmuv3_format_attr_group; + + return 0; +} + static int armv8_a53_pmu_init(struct arm_pmu *cpu_pmu) { int ret = armv8_pmu_init(cpu_pmu); @@ -1110,6 +1054,22 @@ static int armv8_a72_pmu_init(struct arm_pmu *cpu_pmu) return 0; } +static int armv8_a73_pmu_init(struct arm_pmu *cpu_pmu) +{ + int ret = armv8_pmu_init(cpu_pmu); + if (ret) + return ret; + + cpu_pmu->name = "armv8_cortex_a73"; + cpu_pmu->map_event = armv8_a73_map_event; + cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = + &armv8_pmuv3_events_attr_group; + cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = + &armv8_pmuv3_format_attr_group; + + return 0; +} + static int armv8_thunder_pmu_init(struct arm_pmu *cpu_pmu) { int ret = armv8_pmu_init(cpu_pmu); @@ -1144,9 +1104,11 @@ static int armv8_vulcan_pmu_init(struct arm_pmu *cpu_pmu) static const struct of_device_id armv8_pmu_of_device_ids[] = { {.compatible = "arm,armv8-pmuv3", .data = armv8_pmuv3_init}, + {.compatible = "arm,cortex-a35-pmu", .data = armv8_a35_pmu_init}, {.compatible = "arm,cortex-a53-pmu", .data = armv8_a53_pmu_init}, {.compatible = "arm,cortex-a57-pmu", .data = armv8_a57_pmu_init}, {.compatible = "arm,cortex-a72-pmu", .data = armv8_a72_pmu_init}, + {.compatible = "arm,cortex-a73-pmu", .data = armv8_a73_pmu_init}, {.compatible = "cavium,thunder-pmu", .data = armv8_thunder_pmu_init}, {.compatible = "brcm,vulcan-pmu", .data = armv8_vulcan_pmu_init}, {}, diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index 26c998534dca..636ca0119c0e 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -40,7 +40,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, probe_opcode_t insn; /* TODO: Currently we do not support AARCH32 instruction probing */ - if (test_bit(TIF_32BIT, &mm->context.flags)) + if (mm->context.flags & MMCF_AARCH32) return -ENOTSUPP; else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE)) return -EINVAL; diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 659ae8094ed5..2dc0f8482210 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -360,6 +360,8 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, /* * Complete any pending TLB or cache maintenance on this CPU in case * the thread migrates to a different CPU. + * This full barrier is also required by the membarrier system + * call. */ dsb(ish); @@ -382,15 +384,12 @@ unsigned long get_wchan(struct task_struct *p) return 0; frame.fp = thread_saved_fp(p); - frame.sp = thread_saved_sp(p); frame.pc = thread_saved_pc(p); #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = p->curr_ret_stack; #endif do { - if (frame.sp < stack_page || - frame.sp >= stack_page + THREAD_SIZE || - unwind_frame(p, &frame)) + if (unwind_frame(p, &frame)) goto out; if (!in_sched_functions(frame.pc)) { ret = frame.pc; @@ -417,3 +416,11 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) else return randomize_page(mm->brk, SZ_1G); } + +/* + * Called from setup_new_exec() after (COMPAT_)SET_PERSONALITY. + */ +void arch_setup_new_exec(void) +{ + current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0; +} diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 1b38c0150aec..9cbb6123208f 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -42,6 +42,7 @@ #include <asm/compat.h> #include <asm/debug-monitors.h> #include <asm/pgtable.h> +#include <asm/stacktrace.h> #include <asm/syscall.h> #include <asm/traps.h> #include <asm/system_misc.h> @@ -127,7 +128,7 @@ static bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) { return ((addr & ~(THREAD_SIZE - 1)) == (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))) || - on_irq_stack(addr, raw_smp_processor_id()); + on_irq_stack(addr); } /** @@ -1363,7 +1364,7 @@ static void tracehook_report_syscall(struct pt_regs *regs, if (dir == PTRACE_SYSCALL_EXIT) tracehook_report_syscall_exit(regs, 0); else if (tracehook_report_syscall_entry(regs)) - regs->syscallno = ~0UL; + forget_syscall(regs); regs->regs[regno] = saved_reg; } diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 12a87f2600f2..933adbc0f654 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -42,7 +42,6 @@ void *return_address(unsigned int level) data.addr = NULL; frame.fp = (unsigned long)__builtin_frame_address(0); - frame.sp = current_stack_pointer; frame.pc = (unsigned long)return_address; /* dummy */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = current->curr_ret_stack; diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 089c3747995d..c45214f8fb54 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -29,6 +29,7 @@ #include <linux/string.h> #include <linux/tracehook.h> #include <linux/ratelimit.h> +#include <linux/syscalls.h> #include <asm/debug-monitors.h> #include <asm/elf.h> @@ -36,6 +37,7 @@ #include <asm/ucontext.h> #include <asm/unistd.h> #include <asm/fpsimd.h> +#include <asm/ptrace.h> #include <asm/signal32.h> #include <asm/vdso.h> @@ -387,7 +389,7 @@ static int restore_sigframe(struct pt_regs *regs, /* * Avoid sys_rt_sigreturn() restarting. */ - regs->syscallno = ~0UL; + forget_syscall(regs); err |= !valid_user_regs(®s->user_regs, current); if (err == 0) @@ -673,13 +675,12 @@ static void do_signal(struct pt_regs *regs) { unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; - int syscall = (int)regs->syscallno; struct ksignal ksig; /* * If we were from a system call, check for system call restarting... */ - if (syscall >= 0) { + if (in_syscall(regs)) { continue_addr = regs->pc; restart_addr = continue_addr - (compat_thumb_mode(regs) ? 2 : 4); retval = regs->regs[0]; @@ -687,7 +688,7 @@ static void do_signal(struct pt_regs *regs) /* * Avoid additional syscall restarting via ret_to_user. */ - regs->syscallno = ~0UL; + forget_syscall(regs); /* * Prepare for system call restart. We do this here so that a @@ -731,7 +732,7 @@ static void do_signal(struct pt_regs *regs) * Handle restarting a different system call. As above, if a debugger * has chosen to restart at a different PC, ignore the restart. */ - if (syscall >= 0 && regs->pc == restart_addr) { + if (in_syscall(regs) && regs->pc == restart_addr) { if (retval == -ERESTART_RESTARTBLOCK) setup_restart_syscall(regs); user_rewind_single_step(current); @@ -749,6 +750,10 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, * Update the trace code with the current status. */ trace_hardirqs_off(); + + /* Check valid user FS if needed */ + addr_limit_user_check(); + do { if (thread_flags & _TIF_NEED_RESCHED) { schedule(); diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index c747a0fc5d7d..4e5a664be04b 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -354,7 +354,7 @@ static int compat_restore_sigframe(struct pt_regs *regs, /* * Avoid compat_sys_sigreturn() restarting. */ - regs->syscallno = ~0UL; + forget_syscall(regs); err |= !valid_user_regs(®s->user_regs, current); diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index dc66e6ec3a99..ffe089942ac4 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -154,7 +154,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) * page tables. */ secondary_data.task = idle; - secondary_data.stack = task_stack_page(idle) + THREAD_START_SP; + secondary_data.stack = task_stack_page(idle) + THREAD_SIZE; update_cpu_boot_status(CPU_MMU_OFF); __flush_dcache_area(&secondary_data, sizeof(secondary_data)); @@ -977,11 +977,21 @@ void smp_send_stop(void) } #ifdef CONFIG_KEXEC_CORE -void smp_send_crash_stop(void) +void crash_smp_send_stop(void) { + static int cpus_stopped; cpumask_t mask; unsigned long timeout; + /* + * This function can be called twice in panic path, but obviously + * we execute this only once. + */ + if (cpus_stopped) + return; + + cpus_stopped = 1; + if (num_online_cpus() == 1) return; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 09d37d66b630..3144584617e7 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -42,33 +42,17 @@ */ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) { - unsigned long high, low; unsigned long fp = frame->fp; - unsigned long irq_stack_ptr; + + if (fp & 0xf) + return -EINVAL; if (!tsk) tsk = current; - /* - * Switching between stacks is valid when tracing current and in - * non-preemptible context. - */ - if (tsk == current && !preemptible()) - irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); - else - irq_stack_ptr = 0; - - low = frame->sp; - /* irq stacks are not THREAD_SIZE aligned */ - if (on_irq_stack(frame->sp, raw_smp_processor_id())) - high = irq_stack_ptr; - else - high = ALIGN(low, THREAD_SIZE) - 0x20; - - if (fp < low || fp > high || fp & 0xf) + if (!on_accessible_stack(tsk, fp)) return -EINVAL; - frame->sp = fp + 0x10; frame->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp)); frame->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8)); @@ -86,34 +70,13 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* - * Check whether we are going to walk through from interrupt stack - * to task stack. - * If we reach the end of the stack - and its an interrupt stack, - * unpack the dummy frame to find the original elr. - * - * Check the frame->fp we read from the bottom of the irq_stack, - * and the original task stack pointer are both in current->stack. + * Frames created upon entry from EL0 have NULL FP and PC values, so + * don't bother reporting these. Frames created by __noreturn functions + * might have a valid FP even if PC is bogus, so only terminate where + * both are NULL. */ - if (frame->sp == irq_stack_ptr) { - struct pt_regs *irq_args; - unsigned long orig_sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); - - if (object_is_on_stack((void *)orig_sp) && - object_is_on_stack((void *)frame->fp)) { - frame->sp = orig_sp; - - /* orig_sp is the saved pt_regs, find the elr */ - irq_args = (struct pt_regs *)orig_sp; - frame->pc = irq_args->pc; - } else { - /* - * This frame has a non-standard format, and we - * didn't fix it, because the data looked wrong. - * Refuse to output this frame. - */ - return -EINVAL; - } - } + if (!frame->fp && !frame->pc) + return -EINVAL; return 0; } @@ -167,7 +130,6 @@ void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) data.no_sched_functions = 0; frame.fp = regs->regs[29]; - frame.sp = regs->sp; frame.pc = regs->pc; #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = current->curr_ret_stack; @@ -192,12 +154,10 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) if (tsk != current) { data.no_sched_functions = 1; frame.fp = thread_saved_fp(tsk); - frame.sp = thread_saved_sp(tsk); frame.pc = thread_saved_pc(tsk); } else { data.no_sched_functions = 0; frame.fp = (unsigned long)__builtin_frame_address(0); - frame.sp = current_stack_pointer; frame.pc = (unsigned long)save_stack_trace_tsk; } #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index da33c90248e9..a4391280fba9 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -50,7 +50,6 @@ unsigned long profile_pc(struct pt_regs *regs) return regs->pc; frame.fp = regs->regs[29]; - frame.sp = regs->sp; frame.pc = regs->pc; #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = -1; /* no task info */ diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 8a62648848e5..5ea4b85aee0e 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -32,6 +32,7 @@ #include <linux/sched/signal.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> +#include <linux/sizes.h> #include <linux/syscalls.h> #include <linux/mm_types.h> @@ -41,6 +42,7 @@ #include <asm/esr.h> #include <asm/insn.h> #include <asm/traps.h> +#include <asm/smp.h> #include <asm/stack_pointer.h> #include <asm/stacktrace.h> #include <asm/exception.h> @@ -143,7 +145,6 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; - unsigned long irq_stack_ptr; int skip; pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); @@ -154,25 +155,14 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) if (!try_get_task_stack(tsk)) return; - /* - * Switching between stacks is valid when tracing current and in - * non-preemptible context. - */ - if (tsk == current && !preemptible()) - irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); - else - irq_stack_ptr = 0; - if (tsk == current) { frame.fp = (unsigned long)__builtin_frame_address(0); - frame.sp = current_stack_pointer; frame.pc = (unsigned long)dump_backtrace; } else { /* * task blocked in __switch_to */ frame.fp = thread_saved_fp(tsk); - frame.sp = thread_saved_sp(tsk); frame.pc = thread_saved_pc(tsk); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -182,13 +172,12 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) skip = !!regs; printk("Call trace:\n"); while (1) { - unsigned long where = frame.pc; unsigned long stack; int ret; /* skip until specified stack frame */ if (!skip) { - dump_backtrace_entry(where); + dump_backtrace_entry(frame.pc); } else if (frame.fp == regs->regs[29]) { skip = 0; /* @@ -203,20 +192,12 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) ret = unwind_frame(tsk, &frame); if (ret < 0) break; - stack = frame.sp; - if (in_exception_text(where)) { - /* - * If we switched to the irq_stack before calling this - * exception handler, then the pt_regs will be on the - * task stack. The easiest way to tell is if the large - * pt_regs would overlap with the end of the irq_stack. - */ - if (stack < irq_stack_ptr && - (stack + sizeof(struct pt_regs)) > irq_stack_ptr) - stack = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + if (in_entry_text(frame.pc)) { + stack = frame.fp - offsetof(struct pt_regs, stackframe); - dump_mem("", "Exception stack", stack, - stack + sizeof(struct pt_regs)); + if (on_accessible_stack(tsk, stack)) + dump_mem("", "Exception stack", stack, + stack + sizeof(struct pt_regs)); } } @@ -257,8 +238,6 @@ static int __die(const char *str, int err, struct pt_regs *regs) end_of_stack(tsk)); if (!user_mode(regs)) { - dump_mem(KERN_EMERG, "Stack: ", regs->sp, - THREAD_SIZE + (unsigned long)task_stack_page(tsk)); dump_backtrace(regs, tsk); dump_instr(KERN_EMERG, regs); } @@ -484,6 +463,9 @@ static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) case ESR_ELx_SYS64_ISS_CRM_DC_CVAC: /* DC CVAC, gets promoted */ __user_cache_maint("dc civac", address, ret); break; + case ESR_ELx_SYS64_ISS_CRM_DC_CVAP: /* DC CVAP */ + __user_cache_maint("sys 3, c7, c12, 1", address, ret); + break; case ESR_ELx_SYS64_ISS_CRM_DC_CIVAC: /* DC CIVAC */ __user_cache_maint("dc civac", address, ret); break; @@ -593,7 +575,7 @@ asmlinkage long do_ni_syscall(struct pt_regs *regs) if (show_unhandled_signals_ratelimited()) { pr_info("%s[%d]: syscall %d\n", current->comm, - task_pid_nr(current), (int)regs->syscallno); + task_pid_nr(current), regs->syscallno); dump_instr("", regs); if (user_mode(regs)) __show_regs(regs); @@ -689,6 +671,43 @@ asmlinkage void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr) force_sig_info(info.si_signo, &info, current); } +#ifdef CONFIG_VMAP_STACK + +DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) + __aligned(16); + +asmlinkage void handle_bad_stack(struct pt_regs *regs) +{ + unsigned long tsk_stk = (unsigned long)current->stack; + unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr); + unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack); + unsigned int esr = read_sysreg(esr_el1); + unsigned long far = read_sysreg(far_el1); + + console_verbose(); + pr_emerg("Insufficient stack space to handle exception!"); + + pr_emerg("ESR: 0x%08x -- %s\n", esr, esr_get_class_string(esr)); + pr_emerg("FAR: 0x%016lx\n", far); + + pr_emerg("Task stack: [0x%016lx..0x%016lx]\n", + tsk_stk, tsk_stk + THREAD_SIZE); + pr_emerg("IRQ stack: [0x%016lx..0x%016lx]\n", + irq_stk, irq_stk + THREAD_SIZE); + pr_emerg("Overflow stack: [0x%016lx..0x%016lx]\n", + ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE); + + __show_regs(regs); + + /* + * We use nmi_panic to limit the potential for recusive overflows, and + * to get a better stack trace. + */ + nmi_panic(NULL, "kernel stack overflow"); + cpu_park_loop(); +} +#endif + void __pte_error(const char *file, int line, unsigned long val) { pr_err("%s:%d: bad pte %016lx.\n", file, line, val); diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index e8f759f764f2..2d419006ad43 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -110,12 +110,27 @@ int aarch32_setup_vectors_page(struct linux_binprm *bprm, int uses_interp) } #endif /* CONFIG_COMPAT */ +static int vdso_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) +{ + unsigned long new_size = new_vma->vm_end - new_vma->vm_start; + unsigned long vdso_size = vdso_end - vdso_start; + + if (vdso_size != new_size) + return -EINVAL; + + current->mm->context.vdso = (void *)new_vma->vm_start; + + return 0; +} + static struct vm_special_mapping vdso_spec[2] __ro_after_init = { { .name = "[vvar]", }, { .name = "[vdso]", + .mremap = vdso_mremap, }, }; diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 987a00ee446c..fe56c268a7d9 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -72,22 +72,6 @@ PECOFF_FILE_ALIGNMENT = 0x200; #define PECOFF_EDATA_PADDING #endif -#if defined(CONFIG_DEBUG_ALIGN_RODATA) -/* - * 4 KB granule: 1 level 2 entry - * 16 KB granule: 128 level 3 entries, with contiguous bit - * 64 KB granule: 32 level 3 entries, with contiguous bit - */ -#define SEGMENT_ALIGN SZ_2M -#else -/* - * 4 KB granule: 16 level 3 entries, with contiguous bit - * 16 KB granule: 4 level 3 entries, without contiguous bit - * 64 KB granule: 1 level 3 entry - */ -#define SEGMENT_ALIGN SZ_64K -#endif - SECTIONS { /* @@ -192,7 +176,7 @@ SECTIONS _data = .; _sdata = .; - RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) /* * Data written with the MMU off but read with the MMU on requires diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c index b81f4091c909..a81f5e10fc8c 100644 --- a/arch/arm64/kvm/hyp/s2-setup.c +++ b/arch/arm64/kvm/hyp/s2-setup.c @@ -70,7 +70,7 @@ u32 __hyp_text __init_stage2_translation(void) * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2. */ tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf; - if (IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && tmp) + if (tmp) val |= VTCR_EL2_HA; /* diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index c86b7909ef31..a0abc142c92b 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -17,3 +17,5 @@ CFLAGS_atomic_ll_sc.o := -fcall-used-x0 -ffixed-x1 -ffixed-x2 \ -fcall-saved-x10 -fcall-saved-x11 -fcall-saved-x12 \ -fcall-saved-x13 -fcall-saved-x14 -fcall-saved-x15 \ -fcall-saved-x18 + +lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o diff --git a/arch/arm64/lib/uaccess_flushcache.c b/arch/arm64/lib/uaccess_flushcache.c new file mode 100644 index 000000000000..b6ceafdb8b72 --- /dev/null +++ b/arch/arm64/lib/uaccess_flushcache.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2017 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/uaccess.h> +#include <asm/barrier.h> +#include <asm/cacheflush.h> + +void memcpy_flushcache(void *dst, const void *src, size_t cnt) +{ + /* + * We assume this should not be called with @dst pointing to + * non-cacheable memory, such that we don't need an explicit + * barrier to order the cache maintenance against the memcpy. + */ + memcpy(dst, src, cnt); + __clean_dcache_area_pop(dst, cnt); +} +EXPORT_SYMBOL_GPL(memcpy_flushcache); + +void memcpy_page_flushcache(char *to, struct page *page, size_t offset, + size_t len) +{ + memcpy_flushcache(to, page_address(page) + offset, len); +} + +unsigned long __copy_user_flushcache(void *to, const void __user *from, + unsigned long n) +{ + unsigned long rc = __arch_copy_from_user(to, from, n); + + /* See above */ + __clean_dcache_area_pop(to, n - rc); + return rc; +} diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 83c27b6e6dca..7f1dbe962cf5 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -109,20 +109,25 @@ ENTRY(__clean_dcache_area_pou) ENDPROC(__clean_dcache_area_pou) /* - * __dma_inv_area(start, size) - * - start - virtual start address of region + * __inval_dcache_area(kaddr, size) + * + * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * are invalidated. Any partial lines at the ends of the interval are + * also cleaned to PoC to prevent data loss. + * + * - kaddr - kernel address * - size - size in question */ -__dma_inv_area: - add x1, x1, x0 +ENTRY(__inval_dcache_area) /* FALLTHROUGH */ /* - * __inval_cache_range(start, end) - * - start - start address of region - * - end - end address of region + * __dma_inv_area(start, size) + * - start - virtual start address of region + * - size - size in question */ -ENTRY(__inval_cache_range) +__dma_inv_area: + add x1, x1, x0 dcache_line_size x2, x3 sub x3, x2, #1 tst x1, x3 // end cache line aligned? @@ -140,7 +145,7 @@ ENTRY(__inval_cache_range) b.lo 2b dsb sy ret -ENDPIPROC(__inval_cache_range) +ENDPIPROC(__inval_dcache_area) ENDPROC(__dma_inv_area) /* @@ -167,6 +172,20 @@ ENDPIPROC(__clean_dcache_area_poc) ENDPROC(__dma_clean_area) /* + * __clean_dcache_area_pop(kaddr, size) + * + * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * are cleaned to the PoP. + * + * - kaddr - kernel address + * - size - size in question + */ +ENTRY(__clean_dcache_area_pop) + dcache_by_line_op cvap, sy, x0, x1, x2, x3 + ret +ENDPIPROC(__clean_dcache_area_pop) + +/* * __dma_flush_area(start, size) * * clean & invalidate D / U line diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index f27d4dd04384..614af886b7ef 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -42,7 +42,7 @@ static pgprot_t __get_dma_pgprot(unsigned long attrs, pgprot_t prot, return prot; } -static struct gen_pool *atomic_pool; +static struct gen_pool *atomic_pool __ro_after_init; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; @@ -425,7 +425,7 @@ static int __init atomic_pool_init(void) gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, - (void *)PAGE_SHIFT); + NULL); pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", atomic_pool_size / 1024); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 1f22a41565a3..89993c4be1be 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -34,6 +34,7 @@ #include <linux/hugetlb.h> #include <asm/bug.h> +#include <asm/cmpxchg.h> #include <asm/cpufeature.h> #include <asm/exception.h> #include <asm/debug-monitors.h> @@ -82,6 +83,49 @@ static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr) } #endif +static void data_abort_decode(unsigned int esr) +{ + pr_alert("Data abort info:\n"); + + if (esr & ESR_ELx_ISV) { + pr_alert(" Access size = %u byte(s)\n", + 1U << ((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT)); + pr_alert(" SSE = %lu, SRT = %lu\n", + (esr & ESR_ELx_SSE) >> ESR_ELx_SSE_SHIFT, + (esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT); + pr_alert(" SF = %lu, AR = %lu\n", + (esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT, + (esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT); + } else { + pr_alert(" ISV = 0, ISS = 0x%08lu\n", esr & ESR_ELx_ISS_MASK); + } + + pr_alert(" CM = %lu, WnR = %lu\n", + (esr & ESR_ELx_CM) >> ESR_ELx_CM_SHIFT, + (esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT); +} + +/* + * Decode mem abort information + */ +static void mem_abort_decode(unsigned int esr) +{ + pr_alert("Mem abort info:\n"); + + pr_alert(" Exception class = %s, IL = %u bits\n", + esr_get_class_string(esr), + (esr & ESR_ELx_IL) ? 32 : 16); + pr_alert(" SET = %lu, FnV = %lu\n", + (esr & ESR_ELx_SET_MASK) >> ESR_ELx_SET_SHIFT, + (esr & ESR_ELx_FnV) >> ESR_ELx_FnV_SHIFT); + pr_alert(" EA = %lu, S1PTW = %lu\n", + (esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT, + (esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT); + + if (esr_is_data_abort(esr)) + data_abort_decode(esr); +} + /* * Dump out the page tables associated with 'addr' in the currently active mm. */ @@ -139,7 +183,6 @@ void show_pte(unsigned long addr) pr_cont("\n"); } -#ifdef CONFIG_ARM64_HW_AFDBM /* * This function sets the access flags (dirty, accessed), as well as write * permission, and only to a more permissive setting. @@ -154,18 +197,13 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { - pteval_t old_pteval; - unsigned int tmp; + pteval_t old_pteval, pteval; if (pte_same(*ptep, entry)) return 0; /* only preserve the access flags and write permission */ - pte_val(entry) &= PTE_AF | PTE_WRITE | PTE_DIRTY; - - /* set PTE_RDONLY if actual read-only or clean PTE */ - if (!pte_write(entry) || !pte_sw_dirty(entry)) - pte_val(entry) |= PTE_RDONLY; + pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY; /* * Setting the flags must be done atomically to avoid racing with the @@ -174,21 +212,18 @@ int ptep_set_access_flags(struct vm_area_struct *vma, * (calculated as: a & b == ~(~a | ~b)). */ pte_val(entry) ^= PTE_RDONLY; - asm volatile("// ptep_set_access_flags\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " eor %0, %0, %3 // negate PTE_RDONLY in *ptep\n" - " orr %0, %0, %4 // set flags\n" - " eor %0, %0, %3 // negate final PTE_RDONLY\n" - " stxr %w1, %0, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (old_pteval), "=&r" (tmp), "+Q" (pte_val(*ptep)) - : "L" (PTE_RDONLY), "r" (pte_val(entry))); + pteval = READ_ONCE(pte_val(*ptep)); + do { + old_pteval = pteval; + pteval ^= PTE_RDONLY; + pteval |= pte_val(entry); + pteval ^= PTE_RDONLY; + pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval); + } while (pteval != old_pteval); flush_tlb_fix_spurious_fault(vma, address); return 1; } -#endif static bool is_el1_instruction_abort(unsigned int esr) { @@ -248,6 +283,8 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr, pr_alert("Unable to handle kernel %s at virtual address %08lx\n", msg, addr); + mem_abort_decode(esr); + show_pte(addr); die("Oops", regs, esr); bust_spinlocks(0); @@ -705,6 +742,8 @@ asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr, pr_alert("Unhandled fault: %s (0x%08x) at 0x%016lx\n", inf->name, esr, addr); + mem_abort_decode(esr); + info.si_signo = inf->sig; info.si_errno = 0; info.si_code = inf->code; diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 21a8d828cbf4..e36ed5087b5c 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -83,3 +83,19 @@ EXPORT_SYMBOL(flush_dcache_page); * Additional functions defined in assembly. */ EXPORT_SYMBOL(flush_icache_range); + +#ifdef CONFIG_ARCH_HAS_PMEM_API +void arch_wb_cache_pmem(void *addr, size_t size) +{ + /* Ensure order against any prior non-cacheable writes */ + dmb(osh); + __clean_dcache_area_pop(addr, size); +} +EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); + +void arch_invalidate_pmem(void *addr, size_t size) +{ + __inval_dcache_area(addr, size); +} +EXPORT_SYMBOL_GPL(arch_invalidate_pmem); +#endif diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 656e0ece2289..6cb0fa92a651 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -41,6 +41,16 @@ int pud_huge(pud_t pud) #endif } +/* + * Select all bits except the pfn + */ +static inline pgprot_t pte_pgprot(pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + + return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); +} + static int find_num_contig(struct mm_struct *mm, unsigned long addr, pte_t *ptep, size_t *pgsize) { @@ -58,15 +68,107 @@ static int find_num_contig(struct mm_struct *mm, unsigned long addr, return CONT_PTES; } +static inline int num_contig_ptes(unsigned long size, size_t *pgsize) +{ + int contig_ptes = 0; + + *pgsize = size; + + switch (size) { +#ifdef CONFIG_ARM64_4K_PAGES + case PUD_SIZE: +#endif + case PMD_SIZE: + contig_ptes = 1; + break; + case CONT_PMD_SIZE: + *pgsize = PMD_SIZE; + contig_ptes = CONT_PMDS; + break; + case CONT_PTE_SIZE: + *pgsize = PAGE_SIZE; + contig_ptes = CONT_PTES; + break; + } + + return contig_ptes; +} + +/* + * Changing some bits of contiguous entries requires us to follow a + * Break-Before-Make approach, breaking the whole contiguous set + * before we can change any entries. See ARM DDI 0487A.k_iss10775, + * "Misprogramming of the Contiguous bit", page D4-1762. + * + * This helper performs the break step. + */ +static pte_t get_clear_flush(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, + unsigned long pgsize, + unsigned long ncontig) +{ + struct vm_area_struct vma = { .vm_mm = mm }; + pte_t orig_pte = huge_ptep_get(ptep); + bool valid = pte_valid(orig_pte); + unsigned long i, saddr = addr; + + for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) { + pte_t pte = ptep_get_and_clear(mm, addr, ptep); + + /* + * If HW_AFDBM is enabled, then the HW could turn on + * the dirty bit for any page in the set, so check + * them all. All hugetlb entries are already young. + */ + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + } + + if (valid) + flush_tlb_range(&vma, saddr, addr); + return orig_pte; +} + +/* + * Changing some bits of contiguous entries requires us to follow a + * Break-Before-Make approach, breaking the whole contiguous set + * before we can change any entries. See ARM DDI 0487A.k_iss10775, + * "Misprogramming of the Contiguous bit", page D4-1762. + * + * This helper performs the break step for use cases where the + * original pte is not needed. + */ +static void clear_flush(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, + unsigned long pgsize, + unsigned long ncontig) +{ + struct vm_area_struct vma = { .vm_mm = mm }; + unsigned long i, saddr = addr; + + for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) + pte_clear(mm, addr, ptep); + + flush_tlb_range(&vma, saddr, addr); +} + void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { size_t pgsize; int i; int ncontig; - unsigned long pfn; + unsigned long pfn, dpfn; pgprot_t hugeprot; + /* + * Code needs to be expanded to handle huge swap and migration + * entries. Needed for HUGETLB and MEMORY_FAILURE. + */ + WARN_ON(!pte_present(pte)); + if (!pte_cont(pte)) { set_pte_at(mm, addr, ptep, pte); return; @@ -74,17 +176,30 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, ncontig = find_num_contig(mm, addr, ptep, &pgsize); pfn = pte_pfn(pte); - hugeprot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); - for (i = 0; i < ncontig; i++) { + dpfn = pgsize >> PAGE_SHIFT; + hugeprot = pte_pgprot(pte); + + clear_flush(mm, addr, ptep, pgsize, ncontig); + + for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) { pr_debug("%s: set pte %p to 0x%llx\n", __func__, ptep, pte_val(pfn_pte(pfn, hugeprot))); set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); - ptep++; - pfn += pgsize >> PAGE_SHIFT; - addr += pgsize; } } +void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ + int i, ncontig; + size_t pgsize; + + ncontig = num_contig_ptes(sz, &pgsize); + + for (i = 0; i < ncontig; i++, ptep++) + set_pte(ptep, pte); +} + pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { @@ -144,19 +259,28 @@ pte_t *huge_pte_offset(struct mm_struct *mm, return NULL; pud = pud_offset(pgd, addr); - if (pud_none(*pud)) + if (sz != PUD_SIZE && pud_none(*pud)) return NULL; - /* swap or huge page */ - if (!pud_present(*pud) || pud_huge(*pud)) + /* hugepage or swap? */ + if (pud_huge(*pud) || !pud_present(*pud)) return (pte_t *)pud; /* table; check the next level */ + if (sz == CONT_PMD_SIZE) + addr &= CONT_PMD_MASK; + pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) + if (!(sz == PMD_SIZE || sz == CONT_PMD_SIZE) && + pmd_none(*pmd)) return NULL; - if (!pmd_present(*pmd) || pmd_huge(*pmd)) + if (pmd_huge(*pmd) || !pmd_present(*pmd)) return (pte_t *)pmd; + if (sz == CONT_PTE_SIZE) { + pte_t *pte = pte_offset_kernel(pmd, (addr & CONT_PTE_MASK)); + return pte; + } + return NULL; } @@ -176,111 +300,133 @@ pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, return entry; } +void huge_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long sz) +{ + int i, ncontig; + size_t pgsize; + + ncontig = num_contig_ptes(sz, &pgsize); + + for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) + pte_clear(mm, addr, ptep); +} + pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_t pte; + int ncontig; + size_t pgsize; + pte_t orig_pte = huge_ptep_get(ptep); - if (pte_cont(*ptep)) { - int ncontig, i; - size_t pgsize; - bool is_dirty = false; - - ncontig = find_num_contig(mm, addr, ptep, &pgsize); - /* save the 1st pte to return */ - pte = ptep_get_and_clear(mm, addr, ptep); - for (i = 1, addr += pgsize; i < ncontig; ++i, addr += pgsize) { - /* - * If HW_AFDBM is enabled, then the HW could - * turn on the dirty bit for any of the page - * in the set, so check them all. - */ - ++ptep; - if (pte_dirty(ptep_get_and_clear(mm, addr, ptep))) - is_dirty = true; - } - if (is_dirty) - return pte_mkdirty(pte); - else - return pte; - } else { + if (!pte_cont(orig_pte)) return ptep_get_and_clear(mm, addr, ptep); - } + + ncontig = find_num_contig(mm, addr, ptep, &pgsize); + + return get_clear_flush(mm, addr, ptep, pgsize, ncontig); } int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { - if (pte_cont(pte)) { - int ncontig, i, changed = 0; - size_t pgsize = 0; - unsigned long pfn = pte_pfn(pte); - /* Select all bits except the pfn */ - pgprot_t hugeprot = - __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ - pte_val(pte)); - - pfn = pte_pfn(pte); - ncontig = find_num_contig(vma->vm_mm, addr, ptep, - &pgsize); - for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) { - changed |= ptep_set_access_flags(vma, addr, ptep, - pfn_pte(pfn, - hugeprot), - dirty); - pfn += pgsize >> PAGE_SHIFT; - } - return changed; - } else { + int ncontig, i, changed = 0; + size_t pgsize = 0; + unsigned long pfn = pte_pfn(pte), dpfn; + pgprot_t hugeprot; + pte_t orig_pte; + + if (!pte_cont(pte)) return ptep_set_access_flags(vma, addr, ptep, pte, dirty); - } + + ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize); + dpfn = pgsize >> PAGE_SHIFT; + + orig_pte = get_clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig); + if (!pte_same(orig_pte, pte)) + changed = 1; + + /* Make sure we don't lose the dirty state */ + if (pte_dirty(orig_pte)) + pte = pte_mkdirty(pte); + + hugeprot = pte_pgprot(pte); + for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) + set_pte_at(vma->vm_mm, addr, ptep, pfn_pte(pfn, hugeprot)); + + return changed; } void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - if (pte_cont(*ptep)) { - int ncontig, i; - size_t pgsize = 0; - - ncontig = find_num_contig(mm, addr, ptep, &pgsize); - for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) - ptep_set_wrprotect(mm, addr, ptep); - } else { + unsigned long pfn, dpfn; + pgprot_t hugeprot; + int ncontig, i; + size_t pgsize; + pte_t pte; + + if (!pte_cont(*ptep)) { ptep_set_wrprotect(mm, addr, ptep); + return; } + + ncontig = find_num_contig(mm, addr, ptep, &pgsize); + dpfn = pgsize >> PAGE_SHIFT; + + pte = get_clear_flush(mm, addr, ptep, pgsize, ncontig); + pte = pte_wrprotect(pte); + + hugeprot = pte_pgprot(pte); + pfn = pte_pfn(pte); + + for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) + set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); } void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - if (pte_cont(*ptep)) { - int ncontig, i; - size_t pgsize = 0; - - ncontig = find_num_contig(vma->vm_mm, addr, ptep, - &pgsize); - for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) - ptep_clear_flush(vma, addr, ptep); - } else { + size_t pgsize; + int ncontig; + + if (!pte_cont(*ptep)) { ptep_clear_flush(vma, addr, ptep); + return; } + + ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize); + clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig); } static __init int setup_hugepagesz(char *opt) { unsigned long ps = memparse(opt, &opt); - if (ps == PMD_SIZE) { - hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - } else if (ps == PUD_SIZE) { - hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); - } else { - hugetlb_bad_size(); - pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); - return 0; + switch (ps) { +#ifdef CONFIG_ARM64_4K_PAGES + case PUD_SIZE: +#endif + case PMD_SIZE * CONT_PMDS: + case PMD_SIZE: + case PAGE_SIZE * CONT_PTES: + hugetlb_add_hstate(ilog2(ps) - PAGE_SHIFT); + return 1; } - return 1; + + hugetlb_bad_size(); + pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); + return 0; } __setup("hugepagesz=", setup_hugepagesz); + +#ifdef CONFIG_ARM64_64K_PAGES +static __init int add_default_hugepagesz(void) +{ + if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL) + hugetlb_add_hstate(CONT_PTE_SHIFT); + return 0; +} +arch_initcall(add_default_hugepagesz); +#endif diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index b02a9268dfbf..783de51a6c4e 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -44,8 +44,12 @@ #define A64_COND_NE AARCH64_INSN_COND_NE /* != */ #define A64_COND_CS AARCH64_INSN_COND_CS /* unsigned >= */ #define A64_COND_HI AARCH64_INSN_COND_HI /* unsigned > */ +#define A64_COND_LS AARCH64_INSN_COND_LS /* unsigned <= */ +#define A64_COND_CC AARCH64_INSN_COND_CC /* unsigned < */ #define A64_COND_GE AARCH64_INSN_COND_GE /* signed >= */ #define A64_COND_GT AARCH64_INSN_COND_GT /* signed > */ +#define A64_COND_LE AARCH64_INSN_COND_LE /* signed <= */ +#define A64_COND_LT AARCH64_INSN_COND_LT /* signed < */ #define A64_B_(cond, imm19) A64_COND_BRANCH(cond, (imm19) << 2) /* Unconditional branch (immediate) */ diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index f32144b2e07f..ba38d403abb2 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -527,10 +527,14 @@ emit_bswap_uxt: /* IF (dst COND src) JUMP off */ case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JLT | BPF_X: case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JLE | BPF_X: case BPF_JMP | BPF_JNE | BPF_X: case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_X: case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_X: emit(A64_CMP(1, dst, src), ctx); emit_cond_jmp: jmp_offset = bpf2a64_offset(i + off, i, ctx); @@ -542,9 +546,15 @@ emit_cond_jmp: case BPF_JGT: jmp_cond = A64_COND_HI; break; + case BPF_JLT: + jmp_cond = A64_COND_CC; + break; case BPF_JGE: jmp_cond = A64_COND_CS; break; + case BPF_JLE: + jmp_cond = A64_COND_LS; + break; case BPF_JSET: case BPF_JNE: jmp_cond = A64_COND_NE; @@ -552,9 +562,15 @@ emit_cond_jmp: case BPF_JSGT: jmp_cond = A64_COND_GT; break; + case BPF_JSLT: + jmp_cond = A64_COND_LT; + break; case BPF_JSGE: jmp_cond = A64_COND_GE; break; + case BPF_JSLE: + jmp_cond = A64_COND_LE; + break; default: return -EFAULT; } @@ -566,10 +582,14 @@ emit_cond_jmp: /* IF (dst COND imm) JUMP off */ case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: emit_a64_mov_i(1, tmp, imm, ctx); emit(A64_CMP(1, dst, tmp), ctx); goto emit_cond_jmp; diff --git a/arch/blackfin/include/asm/spinlock.h b/arch/blackfin/include/asm/spinlock.h index c58f4a83ed6f..f6431439d15d 100644 --- a/arch/blackfin/include/asm/spinlock.h +++ b/arch/blackfin/include/asm/spinlock.h @@ -48,11 +48,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) __raw_spin_unlock_asm(&lock->lock); } -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - static inline int arch_read_can_lock(arch_rwlock_t *rw) { return __raw_uncached_fetch_asm(&rw->lock) > 0; diff --git a/arch/blackfin/kernel/module.c b/arch/blackfin/kernel/module.c index 0188c933b155..15af5768c403 100644 --- a/arch/blackfin/kernel/module.c +++ b/arch/blackfin/kernel/module.c @@ -4,8 +4,6 @@ * Licensed under the GPL-2 or later */ -#define pr_fmt(fmt) "module %s: " fmt, mod->name - #include <linux/moduleloader.h> #include <linux/elf.h> #include <linux/vmalloc.h> @@ -16,6 +14,11 @@ #include <asm/cacheflush.h> #include <linux/uaccess.h> +#define mod_err(mod, fmt, ...) \ + pr_err("module %s: " fmt, (mod)->name, ##__VA_ARGS__) +#define mod_debug(mod, fmt, ...) \ + pr_debug("module %s: " fmt, (mod)->name, ##__VA_ARGS__) + /* Transfer the section to the L1 memory */ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, @@ -44,7 +47,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_inst_sram_alloc(s->sh_size); mod->arch.text_l1 = dest; if (dest == NULL) { - pr_err("L1 inst memory allocation failed\n"); + mod_err(mod, "L1 inst memory allocation failed\n"); return -1; } dma_memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -56,7 +59,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_data_sram_alloc(s->sh_size); mod->arch.data_a_l1 = dest; if (dest == NULL) { - pr_err("L1 data memory allocation failed\n"); + mod_err(mod, "L1 data memory allocation failed\n"); return -1; } memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -68,7 +71,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_data_sram_zalloc(s->sh_size); mod->arch.bss_a_l1 = dest; if (dest == NULL) { - pr_err("L1 data memory allocation failed\n"); + mod_err(mod, "L1 data memory allocation failed\n"); return -1; } @@ -77,7 +80,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_data_B_sram_alloc(s->sh_size); mod->arch.data_b_l1 = dest; if (dest == NULL) { - pr_err("L1 data memory allocation failed\n"); + mod_err(mod, "L1 data memory allocation failed\n"); return -1; } memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -87,7 +90,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_data_B_sram_alloc(s->sh_size); mod->arch.bss_b_l1 = dest; if (dest == NULL) { - pr_err("L1 data memory allocation failed\n"); + mod_err(mod, "L1 data memory allocation failed\n"); return -1; } memset(dest, 0, s->sh_size); @@ -99,7 +102,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l2_sram_alloc(s->sh_size); mod->arch.text_l2 = dest; if (dest == NULL) { - pr_err("L2 SRAM allocation failed\n"); + mod_err(mod, "L2 SRAM allocation failed\n"); return -1; } memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -111,7 +114,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l2_sram_alloc(s->sh_size); mod->arch.data_l2 = dest; if (dest == NULL) { - pr_err("L2 SRAM allocation failed\n"); + mod_err(mod, "L2 SRAM allocation failed\n"); return -1; } memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -123,7 +126,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l2_sram_zalloc(s->sh_size); mod->arch.bss_l2 = dest; if (dest == NULL) { - pr_err("L2 SRAM allocation failed\n"); + mod_err(mod, "L2 SRAM allocation failed\n"); return -1; } @@ -157,8 +160,8 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, Elf32_Sym *sym; unsigned long location, value, size; - pr_debug("applying relocate section %u to %u\n", - relsec, sechdrs[relsec].sh_info); + mod_debug(mod, "applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { /* This is where to make the change */ @@ -174,14 +177,14 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, #ifdef CONFIG_SMP if (location >= COREB_L1_DATA_A_START) { - pr_err("cannot relocate in L1: %u (SMP kernel)\n", + mod_err(mod, "cannot relocate in L1: %u (SMP kernel)\n", ELF32_R_TYPE(rel[i].r_info)); return -ENOEXEC; } #endif - pr_debug("location is %lx, value is %lx type is %d\n", - location, value, ELF32_R_TYPE(rel[i].r_info)); + mod_debug(mod, "location is %lx, value is %lx type is %d\n", + location, value, ELF32_R_TYPE(rel[i].r_info)); switch (ELF32_R_TYPE(rel[i].r_info)) { @@ -200,12 +203,12 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, case R_BFIN_PCREL12_JUMP: case R_BFIN_PCREL12_JUMP_S: case R_BFIN_PCREL10: - pr_err("unsupported relocation: %u (no -mlong-calls?)\n", + mod_err(mod, "unsupported relocation: %u (no -mlong-calls?)\n", ELF32_R_TYPE(rel[i].r_info)); return -ENOEXEC; default: - pr_err("unknown relocation: %u\n", + mod_err(mod, "unknown relocation: %u\n", ELF32_R_TYPE(rel[i].r_info)); return -ENOEXEC; } @@ -222,7 +225,7 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, isram_memcpy((void *)location, &value, size); break; default: - pr_err("invalid relocation for %#lx\n", location); + mod_err(mod, "invalid relocation for %#lx\n", location); return -ENOEXEC; } } diff --git a/arch/cris/arch-v32/mach-a3/arbiter.c b/arch/cris/arch-v32/mach-a3/arbiter.c index ab5c421a4de8..735a9b0abdb8 100644 --- a/arch/cris/arch-v32/mach-a3/arbiter.c +++ b/arch/cris/arch-v32/mach-a3/arbiter.c @@ -227,7 +227,7 @@ static void crisv32_arbiter_config(int arbiter, int region, int unused_slots) } } -extern char _stext, _etext; +extern char _stext[], _etext[]; static void crisv32_arbiter_init(void) { @@ -265,7 +265,7 @@ static void crisv32_arbiter_init(void) #ifndef CONFIG_ETRAX_KGDB /* Global watch for writes to kernel text segment. */ - crisv32_arbiter_watch(virt_to_phys(&_stext), &_etext - &_stext, + crisv32_arbiter_watch(virt_to_phys(_stext), _etext - _stext, MARB_CLIENTS(arbiter_all_clients, arbiter_bar_all_clients), arbiter_all_write, NULL); #endif diff --git a/arch/cris/arch-v32/mach-fs/arbiter.c b/arch/cris/arch-v32/mach-fs/arbiter.c index c97f4d8120f9..047c70bdbb23 100644 --- a/arch/cris/arch-v32/mach-fs/arbiter.c +++ b/arch/cris/arch-v32/mach-fs/arbiter.c @@ -158,7 +158,7 @@ static void crisv32_arbiter_config(int region, int unused_slots) } } -extern char _stext, _etext; +extern char _stext[], _etext[]; static void crisv32_arbiter_init(void) { @@ -190,7 +190,7 @@ static void crisv32_arbiter_init(void) #ifndef CONFIG_ETRAX_KGDB /* Global watch for writes to kernel text segment. */ - crisv32_arbiter_watch(virt_to_phys(&_stext), &_etext - &_stext, + crisv32_arbiter_watch(virt_to_phys(_stext), _etext - _stext, arbiter_all_clients, arbiter_all_write, NULL); #endif } diff --git a/arch/cris/kernel/traps.c b/arch/cris/kernel/traps.c index a01636a12a6e..d98131c45bb5 100644 --- a/arch/cris/kernel/traps.c +++ b/arch/cris/kernel/traps.c @@ -42,7 +42,7 @@ void (*nmi_handler)(struct pt_regs *); void show_trace(unsigned long *stack) { unsigned long addr, module_start, module_end; - extern char _stext, _etext; + extern char _stext[], _etext[]; int i; pr_err("\nCall Trace: "); @@ -69,8 +69,8 @@ void show_trace(unsigned long *stack) * down the cause of the crash will be able to figure * out the call path that was taken. */ - if (((addr >= (unsigned long)&_stext) && - (addr <= (unsigned long)&_etext)) || + if (((addr >= (unsigned long)_stext) && + (addr <= (unsigned long)_etext)) || ((addr >= module_start) && (addr <= module_end))) { #ifdef CONFIG_KALLSYMS print_ip_sym(addr); diff --git a/arch/frv/include/asm/futex.h b/arch/frv/include/asm/futex.h index 2e1da71e27a4..ab346f5f8820 100644 --- a/arch/frv/include/asm/futex.h +++ b/arch/frv/include/asm/futex.h @@ -7,7 +7,8 @@ #include <asm/errno.h> #include <linux/uaccess.h> -extern int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr); +extern int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr); static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h index f1e3b20dce9f..9abf02d6855a 100644 --- a/arch/frv/include/uapi/asm/socket.h +++ b/arch/frv/include/uapi/asm/socket.h @@ -102,5 +102,7 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/frv/kernel/futex.c b/arch/frv/kernel/futex.c index d155ca9e5098..37f7b2bf7f73 100644 --- a/arch/frv/kernel/futex.c +++ b/arch/frv/kernel/futex.c @@ -186,20 +186,10 @@ static inline int atomic_futex_op_xchg_xor(int oparg, u32 __user *uaddr, int *_o /* * do the futex operations */ -int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -225,18 +215,9 @@ int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; break; - } - } + if (!ret) + *oval = oldval; return ret; -} /* end futex_atomic_op_inuser() */ +} /* end arch_futex_atomic_op_inuser() */ diff --git a/arch/h8300/include/asm/traps.h b/arch/h8300/include/asm/traps.h index 15e701130b27..1c5a30ec2df8 100644 --- a/arch/h8300/include/asm/traps.h +++ b/arch/h8300/include/asm/traps.h @@ -33,9 +33,9 @@ extern unsigned long *_interrupt_redirect_table; #define TRAP2_VEC 10 #define TRAP3_VEC 11 -extern char _start, _etext; +extern char _start[], _etext[]; #define check_kernel_text(addr) \ - ((addr >= (unsigned long)(&_start)) && \ - (addr < (unsigned long)(&_etext)) && !(addr & 1)) + ((addr >= (unsigned long)(_start)) && \ + (addr < (unsigned long)(_etext)) && !(addr & 1)) #endif /* _H8300_TRAPS_H */ diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h index a62ba368b27d..fb3dfb2a667e 100644 --- a/arch/hexagon/include/asm/atomic.h +++ b/arch/hexagon/include/asm/atomic.h @@ -42,6 +42,8 @@ static inline void atomic_set(atomic_t *v, int new) ); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + /** * atomic_read - reads a word, atomically * @v: pointer to atomic value diff --git a/arch/hexagon/include/asm/futex.h b/arch/hexagon/include/asm/futex.h index 7e597f8434da..c607b77c8215 100644 --- a/arch/hexagon/include/asm/futex.h +++ b/arch/hexagon/include/asm/futex.h @@ -31,18 +31,9 @@ static inline int -futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; pagefault_disable(); @@ -72,30 +63,9 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/hexagon/include/asm/spinlock.h b/arch/hexagon/include/asm/spinlock.h index a1c55788c5d6..53a8d5885887 100644 --- a/arch/hexagon/include/asm/spinlock.h +++ b/arch/hexagon/include/asm/spinlock.h @@ -179,11 +179,6 @@ static inline unsigned int arch_spin_trylock(arch_spinlock_t *lock) */ #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - #define arch_spin_is_locked(x) ((x)->lock != 0) #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h index a3d0211970e9..c86a947f5368 100644 --- a/arch/ia64/include/asm/acpi.h +++ b/arch/ia64/include/asm/acpi.h @@ -112,8 +112,6 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf) buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP; } -#define acpi_unlazy_tlb(x) - #ifdef CONFIG_ACPI_NUMA extern cpumask_t early_cpu_possible_map; #define for_each_possible_early_cpu(cpu) \ diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h index 76acbcd5c060..6d67dc1eaf2b 100644 --- a/arch/ia64/include/asm/futex.h +++ b/arch/ia64/include/asm/futex.h @@ -45,18 +45,9 @@ do { \ } while (0) static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -84,17 +75,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index ca9e76149a4a..df2c121164b8 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -76,22 +76,6 @@ static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) ACCESS_ONCE(*p) = (tmp + 2) & ~1; } -static __always_inline void __ticket_spin_unlock_wait(arch_spinlock_t *lock) -{ - int *p = (int *)&lock->lock, ticket; - - ia64_invala(); - - for (;;) { - asm volatile ("ld4.c.nc %0=[%1]" : "=r"(ticket) : "r"(p) : "memory"); - if (!(((ticket >> TICKET_SHIFT) ^ ticket) & TICKET_MASK)) - return; - cpu_relax(); - } - - smp_acquire__after_ctrl_dep(); -} - static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) { long tmp = ACCESS_ONCE(lock->lock); @@ -143,11 +127,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, arch_spin_lock(lock); } -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - __ticket_spin_unlock_wait(lock); -} - #define arch_read_can_lock(rw) (*(volatile int *)(rw) >= 0) #define arch_write_can_lock(rw) (*(volatile int *)(rw) == 0) diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index 5dd5c5d0d642..002eb85a6941 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -111,4 +111,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 7508c306aa9e..1d29b2f8726b 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -159,12 +159,12 @@ int acpi_request_vector(u32 int_type) return vector; } -char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size) +void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size) { - return __va(phys_addr); + return __va(phys); } -void __init __acpi_unmap_table(char *map, unsigned long size) +void __init __acpi_unmap_table(void __iomem *map, unsigned long size) { } diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 121295637d0d..81416000c5e0 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -757,14 +757,14 @@ efi_memmap_intersects (unsigned long phys_addr, unsigned long size) return 0; } -u32 +int efi_mem_type (unsigned long phys_addr) { efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); if (md) return md->type; - return 0; + return -EINVAL; } u64 diff --git a/arch/m32r/include/asm/flat.h b/arch/m32r/include/asm/flat.h index 455ce7ddbf14..dfcb0e4eb256 100644 --- a/arch/m32r/include/asm/flat.h +++ b/arch/m32r/include/asm/flat.h @@ -95,7 +95,7 @@ static inline unsigned long m32r_flat_get_addr_from_rp (u32 *rp, return ~0; /* bogus value */ } -static inline void flat_put_addr_at_rp(u32 *rp, u32 addr, u32 relval) +static inline int flat_put_addr_at_rp(u32 *rp, u32 addr, u32 relval) { unsigned int reloc = flat_m32r_get_reloc_type (relval); if (reloc & 0xf0) { @@ -133,6 +133,7 @@ static inline void flat_put_addr_at_rp(u32 *rp, u32 addr, u32 relval) break; } } + return 0; } // kludge - text_len is a local variable in the only user. diff --git a/arch/m32r/include/asm/spinlock.h b/arch/m32r/include/asm/spinlock.h index 323c7fc953cd..a56825592b90 100644 --- a/arch/m32r/include/asm/spinlock.h +++ b/arch/m32r/include/asm/spinlock.h @@ -30,11 +30,6 @@ #define arch_spin_is_locked(x) (*(volatile int *)(&(x)->slock) <= 0) #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, VAL > 0); -} - /** * arch_spin_trylock - Try spin lock and return a result * @lock: Pointer to the lock variable diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h index f8f7b47e247f..e268e51a38d1 100644 --- a/arch/m32r/include/uapi/asm/socket.h +++ b/arch/m32r/include/uapi/asm/socket.h @@ -102,4 +102,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index ddff1164aff0..54191f6fc715 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -49,6 +49,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -465,8 +466,10 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_MSM6242=m CONFIG_RTC_DRV_RP5C01=m # CONFIG_IOMMU_SUPPORT is not set @@ -477,7 +480,6 @@ CONFIG_SERIAL_CONSOLE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -587,12 +589,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 17384dc959a5..fb4663904428 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -47,6 +47,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -427,8 +428,10 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_HEARTBEAT=y @@ -436,7 +439,6 @@ CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -546,12 +548,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 53a641d62f85..4ab393e86e52 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -47,6 +47,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -442,7 +443,10 @@ CONFIG_DMASOUND_ATARI=m CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m +# CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_HEARTBEAT=y @@ -457,7 +461,6 @@ CONFIG_ATARI_DSP56K=m CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -567,12 +570,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 3925ae3a5eb3..1dd8d697545b 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -45,6 +45,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -420,15 +421,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -538,12 +540,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index f4a134b390b4..02b39f50076e 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -47,6 +47,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -430,15 +431,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -548,12 +550,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 9ed0cef632b7..044dcb2bf8fb 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -46,6 +46,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -452,15 +453,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -570,12 +572,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index efed0d48fd53..3ad04682077a 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -56,6 +56,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -520,8 +521,10 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_MSM6242=m CONFIG_RTC_DRV_RP5C01=m CONFIG_RTC_DRV_GENERIC=m @@ -540,7 +543,6 @@ CONFIG_SERIAL_CONSOLE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -650,12 +652,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 9040457c7f9c..dc2dd61948cd 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -44,6 +44,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -420,15 +421,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -538,12 +540,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 8b17f00e0484..54e7b523fc3d 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -45,6 +45,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -420,15 +421,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -538,12 +540,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 5f3718c62c85..d63d8a15f6db 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -45,6 +45,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -442,8 +443,10 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_HEARTBEAT=y @@ -451,7 +454,6 @@ CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -561,12 +563,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 8c979a68fca5..d0924c22f52a 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -42,6 +42,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -422,15 +423,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -540,12 +542,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y CONFIG_CRYPTO_RSA=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index a1e79530e806..3001ee1e5dc5 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -42,6 +42,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -422,15 +423,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -540,12 +542,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/include/asm/asm-prototypes.h b/arch/m68k/include/asm/asm-prototypes.h new file mode 100644 index 000000000000..22ccb9c97576 --- /dev/null +++ b/arch/m68k/include/asm/asm-prototypes.h @@ -0,0 +1,5 @@ +extern int __divsi3(int, int); +extern int __modsi3(int, int); +extern int __mulsi3(int, int); +extern unsigned int __udivsi3(unsigned int, unsigned int); +extern unsigned int __umodsi3(unsigned int, unsigned int); diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c index 8aa8792e3174..d96348a52362 100644 --- a/arch/m68k/mac/misc.c +++ b/arch/m68k/mac/misc.c @@ -357,6 +357,17 @@ static void cuda_shutdown(void) struct adb_request req; if (cuda_request(&req, NULL, 2, CUDA_PACKET, CUDA_POWERDOWN) < 0) return; + + /* Avoid infinite polling loop when PSU is not under Cuda control */ + switch (macintosh_config->ident) { + case MAC_MODEL_C660: + case MAC_MODEL_Q605: + case MAC_MODEL_Q605_ACC: + case MAC_MODEL_P475: + case MAC_MODEL_P475F: + return; + } + while (!req.complete) cuda_poll(); } @@ -463,8 +474,9 @@ void mac_poweroff(void) pmu_shutdown(); #endif } - local_irq_enable(); + pr_crit("It is now safe to turn off your Macintosh.\n"); + local_irq_disable(); while(1); } @@ -554,8 +566,8 @@ void mac_reset(void) } /* should never get here */ - local_irq_enable(); pr_crit("Restart failed. Please restart manually.\n"); + local_irq_disable(); while(1); } diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig index 5b7a45d99cfb..7d8b322e5101 100644 --- a/arch/metag/Kconfig +++ b/arch/metag/Kconfig @@ -26,6 +26,7 @@ config METAG select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNDERSCORE_SYMBOL_PREFIX select IRQ_DOMAIN + select GENERIC_IRQ_EFFECTIVE_AFF_MASK select MODULES_USE_ELF_RELA select OF select OF_EARLY_FLATTREE diff --git a/arch/metag/include/asm/atomic_lock1.h b/arch/metag/include/asm/atomic_lock1.h index 6c1380a8a0d4..eee779f26cc4 100644 --- a/arch/metag/include/asm/atomic_lock1.h +++ b/arch/metag/include/asm/atomic_lock1.h @@ -37,6 +37,8 @@ static inline int atomic_set(atomic_t *v, int i) return i; } +#define atomic_set_release(v, i) atomic_set((v), (i)) + #define ATOMIC_OP(op, c_op) \ static inline void atomic_##op(int i, atomic_t *v) \ { \ diff --git a/arch/metag/include/asm/spinlock.h b/arch/metag/include/asm/spinlock.h index c0c7a22be1ae..ddf7fe5708a6 100644 --- a/arch/metag/include/asm/spinlock.h +++ b/arch/metag/include/asm/spinlock.h @@ -15,11 +15,6 @@ * locked. */ -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) diff --git a/arch/metag/include/asm/topology.h b/arch/metag/include/asm/topology.h index e95f874ded1b..707c7f7b6bea 100644 --- a/arch/metag/include/asm/topology.h +++ b/arch/metag/include/asm/topology.h @@ -4,7 +4,6 @@ #ifdef CONFIG_NUMA #define cpu_to_node(cpu) ((void)(cpu), 0) -#define parent_node(node) ((void)(node), 0) #define cpumask_of_node(node) ((void)node, cpu_online_mask) diff --git a/arch/microblaze/include/asm/flat.h b/arch/microblaze/include/asm/flat.h index f23c3d266bae..3d2747d4c967 100644 --- a/arch/microblaze/include/asm/flat.h +++ b/arch/microblaze/include/asm/flat.h @@ -60,7 +60,7 @@ static inline int flat_get_addr_from_rp(u32 __user *rp, u32 relval, u32 flags, * unaligned. */ -static inline void +static inline int flat_put_addr_at_rp(u32 __user *rp, u32 addr, u32 relval) { u32 *p = (__force u32 *)rp; diff --git a/arch/microblaze/include/asm/futex.h b/arch/microblaze/include/asm/futex.h index 01848f056f43..a9dad9e5e132 100644 --- a/arch/microblaze/include/asm/futex.h +++ b/arch/microblaze/include/asm/futex.h @@ -29,18 +29,9 @@ }) static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -66,30 +57,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig index 7d32fbbca962..3598d58aac30 100644 --- a/arch/mips/configs/pistachio_defconfig +++ b/arch/mips/configs/pistachio_defconfig @@ -207,7 +207,7 @@ CONFIG_IMGPDC_WDT=y CONFIG_REGULATOR_FIXED_VOLTAGE=y CONFIG_REGULATOR_GPIO=y CONFIG_MEDIA_SUPPORT=y -CONFIG_MEDIA_RC_SUPPORT=y +CONFIG_RC_CORE=y # CONFIG_RC_DECODERS is not set CONFIG_RC_DEVICES=y CONFIG_IR_IMG=y diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h index 1de190bdfb9c..a9e61ea54ca9 100644 --- a/arch/mips/include/asm/futex.h +++ b/arch/mips/include/asm/futex.h @@ -83,18 +83,9 @@ } static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -125,17 +116,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 655e2fb5395b..da3216007fe0 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -91,20 +91,12 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 -/* - * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. - * This gives us 6 bits, which is enough until someone invents 128 bit address - * spaces. - * - * Assume these are all power of twos. - * When 0 use the default page size. - */ -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f - #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 882823bec153..6c755bc07975 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -120,4 +120,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 6bace7695788..c7cbddfcdc3b 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -648,12 +648,12 @@ EXPORT_SYMBOL(flush_tlb_one); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static DEFINE_PER_CPU(atomic_t, tick_broadcast_count); -static DEFINE_PER_CPU(struct call_single_data, tick_broadcast_csd); +static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd); void tick_broadcast(const struct cpumask *mask) { atomic_t *count; - struct call_single_data *csd; + call_single_data_t *csd; int cpu; for_each_cpu(cpu, mask) { @@ -674,7 +674,7 @@ static void tick_broadcast_callee(void *info) static int __init tick_broadcast_init(void) { - struct call_single_data *csd; + call_single_data_t *csd; int cpu; for (cpu = 0; cpu < NR_CPUS; cpu++) { diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 3f87b96da5c4..7646891c4e9b 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -113,6 +113,7 @@ struct jit_ctx { u64 *reg_val_types; unsigned int long_b_conversion:1; unsigned int gen_b_offsets:1; + unsigned int use_bbit_insns:1; }; static void set_reg_val_type(u64 *rvt, int reg, enum reg_val_type type) @@ -655,19 +656,6 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx, int this_idx) return build_int_epilogue(ctx, MIPS_R_T9); } -static bool use_bbit_insns(void) -{ - switch (current_cpu_type()) { - case CPU_CAVIUM_OCTEON: - case CPU_CAVIUM_OCTEON_PLUS: - case CPU_CAVIUM_OCTEON2: - case CPU_CAVIUM_OCTEON3: - return true; - default: - return false; - } -} - static bool is_bad_offset(int b_off) { return b_off > 0x1ffff || b_off < -0x20000; @@ -682,6 +670,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, unsigned int target; u64 t64; s64 t64s; + int bpf_op = BPF_OP(insn->code); switch (insn->code) { case BPF_ALU64 | BPF_ADD | BPF_K: /* ALU64_IMM */ @@ -770,13 +759,13 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit_instr(ctx, sll, dst, dst, 0); if (insn->imm == 1) { /* div by 1 is a nop, mod by 1 is zero */ - if (BPF_OP(insn->code) == BPF_MOD) + if (bpf_op == BPF_MOD) emit_instr(ctx, addu, dst, MIPS_R_ZERO, MIPS_R_ZERO); break; } gen_imm_to_reg(insn, MIPS_R_AT, ctx); emit_instr(ctx, divu, dst, MIPS_R_AT); - if (BPF_OP(insn->code) == BPF_DIV) + if (bpf_op == BPF_DIV) emit_instr(ctx, mflo, dst); else emit_instr(ctx, mfhi, dst); @@ -798,13 +787,13 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, if (insn->imm == 1) { /* div by 1 is a nop, mod by 1 is zero */ - if (BPF_OP(insn->code) == BPF_MOD) + if (bpf_op == BPF_MOD) emit_instr(ctx, addu, dst, MIPS_R_ZERO, MIPS_R_ZERO); break; } gen_imm_to_reg(insn, MIPS_R_AT, ctx); emit_instr(ctx, ddivu, dst, MIPS_R_AT); - if (BPF_OP(insn->code) == BPF_DIV) + if (bpf_op == BPF_DIV) emit_instr(ctx, mflo, dst); else emit_instr(ctx, mfhi, dst); @@ -829,7 +818,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32); did_move = false; if (insn->src_reg == BPF_REG_10) { - if (BPF_OP(insn->code) == BPF_MOV) { + if (bpf_op == BPF_MOV) { emit_instr(ctx, daddiu, dst, MIPS_R_SP, MAX_BPF_STACK); did_move = true; } else { @@ -839,7 +828,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, } else if (get_reg_val_type(ctx, this_idx, insn->src_reg) == REG_32BIT) { int tmp_reg = MIPS_R_AT; - if (BPF_OP(insn->code) == BPF_MOV) { + if (bpf_op == BPF_MOV) { tmp_reg = dst; did_move = true; } @@ -847,7 +836,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit_instr(ctx, dinsu, tmp_reg, MIPS_R_ZERO, 32, 32); src = MIPS_R_AT; } - switch (BPF_OP(insn->code)) { + switch (bpf_op) { case BPF_MOV: if (!did_move) emit_instr(ctx, daddu, dst, src, MIPS_R_ZERO); @@ -879,7 +868,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off); emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src); emit_instr(ctx, ddivu, dst, src); - if (BPF_OP(insn->code) == BPF_DIV) + if (bpf_op == BPF_DIV) emit_instr(ctx, mflo, dst); else emit_instr(ctx, mfhi, dst); @@ -923,7 +912,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, if (ts == REG_64BIT || ts == REG_32BIT_ZERO_EX) { int tmp_reg = MIPS_R_AT; - if (BPF_OP(insn->code) == BPF_MOV) { + if (bpf_op == BPF_MOV) { tmp_reg = dst; did_move = true; } @@ -931,7 +920,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit_instr(ctx, sll, tmp_reg, src, 0); src = MIPS_R_AT; } - switch (BPF_OP(insn->code)) { + switch (bpf_op) { case BPF_MOV: if (!did_move) emit_instr(ctx, addu, dst, src, MIPS_R_ZERO); @@ -962,7 +951,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off); emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src); emit_instr(ctx, divu, dst, src); - if (BPF_OP(insn->code) == BPF_DIV) + if (bpf_op == BPF_DIV) emit_instr(ctx, mflo, dst); else emit_instr(ctx, mfhi, dst); @@ -989,7 +978,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, break; case BPF_JMP | BPF_JEQ | BPF_K: /* JMP_IMM */ case BPF_JMP | BPF_JNE | BPF_K: /* JMP_IMM */ - cmp_eq = (BPF_OP(insn->code) == BPF_JEQ); + cmp_eq = (bpf_op == BPF_JEQ); dst = ebpf_to_mips_reg(ctx, insn, dst_reg_fp_ok); if (dst < 0) return dst; @@ -1002,8 +991,12 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, goto jeq_common; case BPF_JMP | BPF_JEQ | BPF_X: /* JMP_REG */ case BPF_JMP | BPF_JNE | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_X: case BPF_JMP | BPF_JSGT | BPF_X: case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JLT | BPF_X: + case BPF_JMP | BPF_JLE | BPF_X: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JSET | BPF_X: @@ -1020,33 +1013,39 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit_instr(ctx, sll, MIPS_R_AT, dst, 0); dst = MIPS_R_AT; } - if (BPF_OP(insn->code) == BPF_JSET) { + if (bpf_op == BPF_JSET) { emit_instr(ctx, and, MIPS_R_AT, dst, src); cmp_eq = false; dst = MIPS_R_AT; src = MIPS_R_ZERO; - } else if (BPF_OP(insn->code) == BPF_JSGT) { + } else if (bpf_op == BPF_JSGT || bpf_op == BPF_JSLE) { emit_instr(ctx, dsubu, MIPS_R_AT, dst, src); if ((insn + 1)->code == (BPF_JMP | BPF_EXIT) && insn->off == 1) { b_off = b_imm(exit_idx, ctx); if (is_bad_offset(b_off)) return -E2BIG; - emit_instr(ctx, blez, MIPS_R_AT, b_off); + if (bpf_op == BPF_JSGT) + emit_instr(ctx, blez, MIPS_R_AT, b_off); + else + emit_instr(ctx, bgtz, MIPS_R_AT, b_off); emit_instr(ctx, nop); return 2; /* We consumed the exit. */ } b_off = b_imm(this_idx + insn->off + 1, ctx); if (is_bad_offset(b_off)) return -E2BIG; - emit_instr(ctx, bgtz, MIPS_R_AT, b_off); + if (bpf_op == BPF_JSGT) + emit_instr(ctx, bgtz, MIPS_R_AT, b_off); + else + emit_instr(ctx, blez, MIPS_R_AT, b_off); emit_instr(ctx, nop); break; - } else if (BPF_OP(insn->code) == BPF_JSGE) { + } else if (bpf_op == BPF_JSGE || bpf_op == BPF_JSLT) { emit_instr(ctx, slt, MIPS_R_AT, dst, src); - cmp_eq = true; + cmp_eq = bpf_op == BPF_JSGE; dst = MIPS_R_AT; src = MIPS_R_ZERO; - } else if (BPF_OP(insn->code) == BPF_JGT) { + } else if (bpf_op == BPF_JGT || bpf_op == BPF_JLE) { /* dst or src could be AT */ emit_instr(ctx, dsubu, MIPS_R_T8, dst, src); emit_instr(ctx, sltu, MIPS_R_AT, dst, src); @@ -1054,16 +1053,16 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit_instr(ctx, movz, MIPS_R_T9, MIPS_R_SP, MIPS_R_T8); emit_instr(ctx, movn, MIPS_R_T9, MIPS_R_ZERO, MIPS_R_T8); emit_instr(ctx, or, MIPS_R_AT, MIPS_R_T9, MIPS_R_AT); - cmp_eq = true; + cmp_eq = bpf_op == BPF_JGT; dst = MIPS_R_AT; src = MIPS_R_ZERO; - } else if (BPF_OP(insn->code) == BPF_JGE) { + } else if (bpf_op == BPF_JGE || bpf_op == BPF_JLT) { emit_instr(ctx, sltu, MIPS_R_AT, dst, src); - cmp_eq = true; + cmp_eq = bpf_op == BPF_JGE; dst = MIPS_R_AT; src = MIPS_R_ZERO; } else { /* JNE/JEQ case */ - cmp_eq = (BPF_OP(insn->code) == BPF_JEQ); + cmp_eq = (bpf_op == BPF_JEQ); } jeq_common: /* @@ -1122,7 +1121,9 @@ jeq_common: break; case BPF_JMP | BPF_JSGT | BPF_K: /* JMP_IMM */ case BPF_JMP | BPF_JSGE | BPF_K: /* JMP_IMM */ - cmp_eq = (BPF_OP(insn->code) == BPF_JSGE); + case BPF_JMP | BPF_JSLT | BPF_K: /* JMP_IMM */ + case BPF_JMP | BPF_JSLE | BPF_K: /* JMP_IMM */ + cmp_eq = (bpf_op == BPF_JSGE); dst = ebpf_to_mips_reg(ctx, insn, dst_reg_fp_ok); if (dst < 0) return dst; @@ -1132,65 +1133,92 @@ jeq_common: b_off = b_imm(exit_idx, ctx); if (is_bad_offset(b_off)) return -E2BIG; - if (cmp_eq) - emit_instr(ctx, bltz, dst, b_off); - else + switch (bpf_op) { + case BPF_JSGT: emit_instr(ctx, blez, dst, b_off); + break; + case BPF_JSGE: + emit_instr(ctx, bltz, dst, b_off); + break; + case BPF_JSLT: + emit_instr(ctx, bgez, dst, b_off); + break; + case BPF_JSLE: + emit_instr(ctx, bgtz, dst, b_off); + break; + } emit_instr(ctx, nop); return 2; /* We consumed the exit. */ } b_off = b_imm(this_idx + insn->off + 1, ctx); if (is_bad_offset(b_off)) return -E2BIG; - if (cmp_eq) - emit_instr(ctx, bgez, dst, b_off); - else + switch (bpf_op) { + case BPF_JSGT: emit_instr(ctx, bgtz, dst, b_off); + break; + case BPF_JSGE: + emit_instr(ctx, bgez, dst, b_off); + break; + case BPF_JSLT: + emit_instr(ctx, bltz, dst, b_off); + break; + case BPF_JSLE: + emit_instr(ctx, blez, dst, b_off); + break; + } emit_instr(ctx, nop); break; } /* * only "LT" compare available, so we must use imm + 1 - * to generate "GT" + * to generate "GT" and imm -1 to generate LE */ - t64s = insn->imm + (cmp_eq ? 0 : 1); + if (bpf_op == BPF_JSGT) + t64s = insn->imm + 1; + else if (bpf_op == BPF_JSLE) + t64s = insn->imm + 1; + else + t64s = insn->imm; + + cmp_eq = bpf_op == BPF_JSGT || bpf_op == BPF_JSGE; if (t64s >= S16_MIN && t64s <= S16_MAX) { emit_instr(ctx, slti, MIPS_R_AT, dst, (int)t64s); src = MIPS_R_AT; dst = MIPS_R_ZERO; - cmp_eq = true; goto jeq_common; } emit_const_to_reg(ctx, MIPS_R_AT, (u64)t64s); emit_instr(ctx, slt, MIPS_R_AT, dst, MIPS_R_AT); src = MIPS_R_AT; dst = MIPS_R_ZERO; - cmp_eq = true; goto jeq_common; case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: - cmp_eq = (BPF_OP(insn->code) == BPF_JGE); + case BPF_JMP | BPF_JLT | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: + cmp_eq = (bpf_op == BPF_JGE); dst = ebpf_to_mips_reg(ctx, insn, dst_reg_fp_ok); if (dst < 0) return dst; /* * only "LT" compare available, so we must use imm + 1 - * to generate "GT" + * to generate "GT" and imm -1 to generate LE */ - t64s = (u64)(u32)(insn->imm) + (cmp_eq ? 0 : 1); - if (t64s >= 0 && t64s <= S16_MAX) { - emit_instr(ctx, sltiu, MIPS_R_AT, dst, (int)t64s); - src = MIPS_R_AT; - dst = MIPS_R_ZERO; - cmp_eq = true; - goto jeq_common; - } + if (bpf_op == BPF_JGT) + t64s = (u64)(u32)(insn->imm) + 1; + else if (bpf_op == BPF_JLE) + t64s = (u64)(u32)(insn->imm) + 1; + else + t64s = (u64)(u32)(insn->imm); + + cmp_eq = bpf_op == BPF_JGT || bpf_op == BPF_JGE; + emit_const_to_reg(ctx, MIPS_R_AT, (u64)t64s); emit_instr(ctx, sltu, MIPS_R_AT, dst, MIPS_R_AT); src = MIPS_R_AT; dst = MIPS_R_ZERO; - cmp_eq = true; goto jeq_common; case BPF_JMP | BPF_JSET | BPF_K: /* JMP_IMM */ @@ -1198,7 +1226,7 @@ jeq_common: if (dst < 0) return dst; - if (use_bbit_insns() && hweight32((u32)insn->imm) == 1) { + if (ctx->use_bbit_insns && hweight32((u32)insn->imm) == 1) { if ((insn + 1)->code == (BPF_JMP | BPF_EXIT) && insn->off == 1) { b_off = b_imm(exit_idx, ctx); if (is_bad_offset(b_off)) @@ -1724,10 +1752,14 @@ static int reg_val_propagate_range(struct jit_ctx *ctx, u64 initial_rvt, case BPF_JEQ: case BPF_JGT: case BPF_JGE: + case BPF_JLT: + case BPF_JLE: case BPF_JSET: case BPF_JNE: case BPF_JSGT: case BPF_JSGE: + case BPF_JSLT: + case BPF_JSLE: if (follow_taken) { rvt[idx] |= RVT_BRANCH_TAKEN; idx += insn->off; @@ -1853,6 +1885,19 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) memset(&ctx, 0, sizeof(ctx)); + preempt_disable(); + switch (current_cpu_type()) { + case CPU_CAVIUM_OCTEON: + case CPU_CAVIUM_OCTEON_PLUS: + case CPU_CAVIUM_OCTEON2: + case CPU_CAVIUM_OCTEON3: + ctx.use_bbit_insns = 1; + break; + default: + ctx.use_bbit_insns = 0; + } + preempt_enable(); + ctx.offsets = kcalloc(prog->len + 1, sizeof(*ctx.offsets), GFP_KERNEL); if (ctx.offsets == NULL) goto out_err; diff --git a/arch/mn10300/include/asm/spinlock.h b/arch/mn10300/include/asm/spinlock.h index 9c7b8f7942d8..fe413b41df6c 100644 --- a/arch/mn10300/include/asm/spinlock.h +++ b/arch/mn10300/include/asm/spinlock.h @@ -26,11 +26,6 @@ #define arch_spin_is_locked(x) (*(volatile signed char *)(&(x)->slock) != 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, !VAL); -} - static inline void arch_spin_unlock(arch_spinlock_t *lock) { asm volatile( diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h index c710db354ff2..ac82a3f26dbf 100644 --- a/arch/mn10300/include/uapi/asm/socket.h +++ b/arch/mn10300/include/uapi/asm/socket.h @@ -102,4 +102,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/openrisc/include/asm/futex.h b/arch/openrisc/include/asm/futex.h index 778087341977..8fed278a24b8 100644 --- a/arch/openrisc/include/asm/futex.h +++ b/arch/openrisc/include/asm/futex.h @@ -30,20 +30,10 @@ }) static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -68,30 +58,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index dda1f558ef35..13648519bd41 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -9,6 +9,9 @@ config PARISC select ARCH_WANT_FRAME_POINTERS select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_STRICT_KERNEL_RWX + select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_WANTS_UBSAN_NO_NULL + select ARCH_SUPPORTS_MEMORY_FAILURE select RTC_CLASS select RTC_DRV_GENERIC select INIT_ALL_POSSIBLE @@ -17,6 +20,12 @@ config PARISC select BUG select BUILDTIME_EXTABLE_SORT select HAVE_PERF_EVENTS + select HAVE_KERNEL_BZIP2 + select HAVE_KERNEL_GZIP + select HAVE_KERNEL_LZ4 + select HAVE_KERNEL_LZMA + select HAVE_KERNEL_LZO + select HAVE_KERNEL_XZ select GENERIC_ATOMIC64 if !64BIT select GENERIC_IRQ_PROBE select GENERIC_PCI_IOMAP diff --git a/arch/parisc/Makefile b/arch/parisc/Makefile index 75cb451b1f03..58fae5d2449d 100644 --- a/arch/parisc/Makefile +++ b/arch/parisc/Makefile @@ -24,15 +24,20 @@ KBUILD_DEFCONFIG := default_defconfig NM = sh $(srctree)/arch/parisc/nm CHECKFLAGS += -D__hppa__=1 LIBGCC = $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name) +export LIBGCC ifdef CONFIG_64BIT UTS_MACHINE := parisc64 CHECKFLAGS += -D__LP64__=1 -m64 CC_ARCHES = hppa64 +LD_BFD := elf64-hppa-linux else # 32-bit CC_ARCHES = hppa hppa2.0 hppa1.1 +LD_BFD := elf32-hppa-linux endif +export LD_BFD + ifneq ($(SUBARCH),$(UTS_MACHINE)) ifeq ($(CROSS_COMPILE),) CC_SUFFIXES = linux linux-gnu unknown-linux-gnu @@ -88,6 +93,8 @@ libs-y += arch/parisc/lib/ $(LIBGCC) drivers-$(CONFIG_OPROFILE) += arch/parisc/oprofile/ +boot := arch/parisc/boot + PALO := $(shell if (which palo 2>&1); then : ; \ elif [ -x /sbin/palo ]; then echo /sbin/palo; \ fi) @@ -116,11 +123,14 @@ INSTALL_TARGETS = zinstall install PHONY += bzImage $(BOOT_TARGETS) $(INSTALL_TARGETS) -bzImage zImage: vmlinuz +zImage: vmlinuz Image: vmlinux -vmlinuz: vmlinux - @gzip -cf -9 $< > $@ +bzImage: vmlinux + $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ + +vmlinuz: bzImage + $(OBJCOPY) $(boot)/bzImage $@ install: $(CONFIG_SHELL) $(src)/arch/parisc/install.sh \ diff --git a/arch/parisc/boot/.gitignore b/arch/parisc/boot/.gitignore new file mode 100644 index 000000000000..017d5912ad2d --- /dev/null +++ b/arch/parisc/boot/.gitignore @@ -0,0 +1,2 @@ +image +bzImage diff --git a/arch/parisc/boot/Makefile b/arch/parisc/boot/Makefile new file mode 100644 index 000000000000..cad68a584884 --- /dev/null +++ b/arch/parisc/boot/Makefile @@ -0,0 +1,26 @@ +# +# Makefile for the linux parisc-specific parts of the boot image creator. +# + +COMPILE_VERSION := __linux_compile_version_id__`hostname | \ + tr -c '[0-9A-Za-z]' '_'`__`date | \ + tr -c '[0-9A-Za-z]' '_'`_t + +ccflags-y := -DCOMPILE_VERSION=$(COMPILE_VERSION) -gstabs -I. + +targets := image +targets += bzImage +subdir- := compressed + +$(obj)/image: vmlinux FORCE + $(call if_changed,objcopy) + +$(obj)/bzImage: $(obj)/compressed/vmlinux FORCE + $(call if_changed,objcopy) + +$(obj)/compressed/vmlinux: FORCE + $(Q)$(MAKE) $(build)=$(obj)/compressed $@ + +install: $(CONFIGURE) $(obj)/bzImage + sh -x $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/bzImage \ + System.map "$(INSTALL_PATH)" diff --git a/arch/parisc/boot/compressed/.gitignore b/arch/parisc/boot/compressed/.gitignore new file mode 100644 index 000000000000..ae06b9b4c02f --- /dev/null +++ b/arch/parisc/boot/compressed/.gitignore @@ -0,0 +1,3 @@ +sizes.h +vmlinux +vmlinux.lds diff --git a/arch/parisc/boot/compressed/Makefile b/arch/parisc/boot/compressed/Makefile new file mode 100644 index 000000000000..5450a11c9d10 --- /dev/null +++ b/arch/parisc/boot/compressed/Makefile @@ -0,0 +1,86 @@ +# +# linux/arch/parisc/boot/compressed/Makefile +# +# create a compressed self-extracting vmlinux image from the original vmlinux +# + +KCOV_INSTRUMENT := n +GCOV_PROFILE := n +UBSAN_SANITIZE := n + +targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 +targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4 +targets += misc.o piggy.o sizes.h head.o real2.o firmware.o + +KBUILD_CFLAGS := -D__KERNEL__ -O2 -DBOOTLOADER +KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS += $(cflags-y) -fno-delete-null-pointer-checks +KBUILD_CFLAGS += -fno-PIE -mno-space-regs -mdisable-fpregs +ifndef CONFIG_64BIT +KBUILD_CFLAGS += -mfast-indirect-calls +endif + +OBJECTS += $(obj)/head.o $(obj)/real2.o $(obj)/firmware.o $(obj)/misc.o $(obj)/piggy.o + +# LDFLAGS_vmlinux := -X --whole-archive -e startup -T +LDFLAGS_vmlinux := -X -e startup --as-needed -T +$(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS) $(LIBGCC) + $(call if_changed,ld) + +sed-sizes := -e 's/^\([0-9a-fA-F]*\) . \(__bss_start\|_end\|parisc_kernel_start\)$$/\#define SZ\2 0x\1/p' + +quiet_cmd_sizes = GEN $@ + cmd_sizes = $(NM) $< | sed -n $(sed-sizes) > $@ + +$(obj)/sizes.h: vmlinux + $(call if_changed,sizes) + +AFLAGS_head.o += -I$(objtree)/$(obj) -DBOOTLOADER +$(obj)/head.o: $(obj)/sizes.h + +CFLAGS_misc.o += -I$(objtree)/$(obj) +$(obj)/misc.o: $(obj)/sizes.h + +$(obj)/firmware.o: $(obj)/firmware.c +$(obj)/firmware.c: $(srctree)/arch/$(SRCARCH)/kernel/firmware.c + $(call cmd,shipped) + +AFLAGS_real2.o += -DBOOTLOADER +$(obj)/real2.o: $(obj)/real2.S +$(obj)/real2.S: $(srctree)/arch/$(SRCARCH)/kernel/real2.S + $(call cmd,shipped) + +$(obj)/misc.o: $(obj)/sizes.h + +CPPFLAGS_vmlinux.lds += -I$(objtree)/$(obj) -DBOOTLOADER +$(obj)/vmlinux.lds: $(obj)/sizes.h + +OBJCOPYFLAGS_vmlinux.bin := -O binary -R .comment -S +$(obj)/vmlinux.bin: vmlinux + $(call if_changed,objcopy) + +vmlinux.bin.all-y := $(obj)/vmlinux.bin + +suffix-$(CONFIG_KERNEL_GZIP) := gz +suffix-$(CONFIG_KERNEL_BZIP2) := bz2 +suffix-$(CONFIG_KERNEL_LZ4) := lz4 +suffix-$(CONFIG_KERNEL_LZMA) := lzma +suffix-$(CONFIG_KERNEL_LZO) := lzo +suffix-$(CONFIG_KERNEL_XZ) := xz + +$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) + $(call if_changed,gzip) +$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) + $(call if_changed,bzip2) +$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) + $(call if_changed,lz4) +$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) + $(call if_changed,lzma) +$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) + $(call if_changed,lzo) +$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) + $(call if_changed,xzkern) + +LDFLAGS_piggy.o := -r --format binary --oformat $(LD_BFD) -T +$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y) + $(call if_changed,ld) diff --git a/arch/parisc/boot/compressed/head.S b/arch/parisc/boot/compressed/head.S new file mode 100644 index 000000000000..5aba20fa48aa --- /dev/null +++ b/arch/parisc/boot/compressed/head.S @@ -0,0 +1,85 @@ +/* + * Startup glue code to uncompress the kernel + * + * (C) 2017 Helge Deller <deller@gmx.de> + */ + +#include <linux/init.h> +#include <linux/linkage.h> +#include <asm/asm-offsets.h> +#include <asm/page.h> +#include <asm/psw.h> +#include <asm/pdc.h> +#include <asm/assembly.h> +#include "sizes.h" + +#define BOOTADDR(x) (x) + +#ifndef CONFIG_64BIT + .import $global$ /* forward declaration */ +#endif /*!CONFIG_64BIT*/ + + __HEAD + +ENTRY(startup) + .level LEVEL + +#define PSW_W_SM 0x200 +#define PSW_W_BIT 36 + + ;! nuke the W bit, saving original value + .level 2.0 + rsm PSW_W_SM, %r1 + + .level 1.1 + extrw,u %r1, PSW_W_BIT-32, 1, %r1 + copy %r1, %arg0 + + /* Make sure sr4-sr7 are set to zero for the kernel address space */ + mtsp %r0,%sr4 + mtsp %r0,%sr5 + mtsp %r0,%sr6 + mtsp %r0,%sr7 + + /* Clear BSS */ + + .import _bss,data + .import _ebss,data + + load32 BOOTADDR(_bss),%r3 + load32 BOOTADDR(_ebss),%r4 + ldo FRAME_SIZE(%r4),%sp /* stack at end of bss */ +$bss_loop: + cmpb,<<,n %r3,%r4,$bss_loop + stw,ma %r0,4(%r3) + + /* Initialize the global data pointer */ + loadgp + + /* arg0..arg4 were set by palo. */ + copy %arg1, %r6 /* command line */ + copy %arg2, %r7 /* rd-start */ + copy %arg3, %r8 /* rd-end */ + load32 BOOTADDR(decompress_kernel),%r3 + +#ifdef CONFIG_64BIT + .level LEVEL + ssm PSW_W_SM, %r0 /* set W-bit */ + depdi 0, 31, 32, %r3 +#endif + load32 BOOTADDR(startup_continue), %r2 + bv,n 0(%r3) + +startup_continue: +#ifdef CONFIG_64BIT + .level LEVEL + rsm PSW_W_SM, %r0 /* clear W-bit */ +#endif + + load32 KERNEL_BINARY_TEXT_START, %arg0 /* free mem */ + copy %r6, %arg1 /* command line */ + copy %r7, %arg2 /* rd-start */ + copy %r8, %arg3 /* rd-end */ + + bv,n 0(%ret0) +END(startup) diff --git a/arch/parisc/boot/compressed/misc.c b/arch/parisc/boot/compressed/misc.c new file mode 100644 index 000000000000..13a4bf9ac4da --- /dev/null +++ b/arch/parisc/boot/compressed/misc.c @@ -0,0 +1,301 @@ +/* + * Definitions and wrapper functions for kernel decompressor + * + * (C) 2017 Helge Deller <deller@gmx.de> + */ + +#include <linux/uaccess.h> +#include <asm/unaligned.h> +#include <asm/page.h> +#include "sizes.h" + +/* + * gzip declarations + */ +#define STATIC static + +#undef memmove +#define memmove memmove +#define memzero(s, n) memset((s), 0, (n)) + +#define malloc malloc_gzip +#define free free_gzip + +/* Symbols defined by linker scripts */ +extern char input_data[]; +extern int input_len; +extern __le32 output_len; /* at unaligned address, little-endian */ +extern char _text, _end; +extern char _bss, _ebss; +extern char _startcode_end; +extern void startup_continue(void *entry, unsigned long cmdline, + unsigned long rd_start, unsigned long rd_end) __noreturn; + +void error(char *m) __noreturn; + +static unsigned long free_mem_ptr; +static unsigned long free_mem_end_ptr; + +#ifdef CONFIG_KERNEL_GZIP +#include "../../../../lib/decompress_inflate.c" +#endif + +#ifdef CONFIG_KERNEL_BZIP2 +#include "../../../../lib/decompress_bunzip2.c" +#endif + +#ifdef CONFIG_KERNEL_LZ4 +#include "../../../../lib/decompress_unlz4.c" +#endif + +#ifdef CONFIG_KERNEL_LZMA +#include "../../../../lib/decompress_unlzma.c" +#endif + +#ifdef CONFIG_KERNEL_LZO +#include "../../../../lib/decompress_unlzo.c" +#endif + +#ifdef CONFIG_KERNEL_XZ +#include "../../../../lib/decompress_unxz.c" +#endif + +void *memmove(void *dest, const void *src, size_t n) +{ + const char *s = src; + char *d = dest; + + if (d <= s) { + while (n--) + *d++ = *s++; + } else { + d += n; + s += n; + while (n--) + *--d = *--s; + } + return dest; +} + +void *memset(void *s, int c, size_t count) +{ + char *xs = (char *)s; + + while (count--) + *xs++ = c; + return s; +} + +void *memcpy(void *d, const void *s, size_t len) +{ + char *dest = (char *)d; + const char *source = (const char *)s; + + while (len--) + *dest++ = *source++; + return d; +} + +size_t strlen(const char *s) +{ + const char *sc; + + for (sc = s; *sc != '\0'; ++sc) + ; + return sc - s; +} + +char *strchr(const char *s, int c) +{ + while (*s) { + if (*s == (char)c) + return (char *)s; + ++s; + } + return NULL; +} + +int puts(const char *s) +{ + const char *nuline = s; + + while ((nuline = strchr(s, '\n')) != NULL) { + if (nuline != s) + pdc_iodc_print(s, nuline - s); + pdc_iodc_print("\r\n", 2); + s = nuline + 1; + } + if (*s != '\0') + pdc_iodc_print(s, strlen(s)); + + return 0; +} + +static int putchar(int c) +{ + char buf[2]; + + buf[0] = c; + buf[1] = '\0'; + puts(buf); + return c; +} + +void __noreturn error(char *x) +{ + puts("\n\n"); + puts(x); + puts("\n\n -- System halted"); + while (1) /* wait forever */ + ; +} + +static int print_hex(unsigned long num) +{ + const char hex[] = "0123456789abcdef"; + char str[40]; + int i = sizeof(str)-1; + + str[i--] = '\0'; + do { + str[i--] = hex[num & 0x0f]; + num >>= 4; + } while (num); + + str[i--] = 'x'; + str[i] = '0'; + puts(&str[i]); + + return 0; +} + +int printf(const char *fmt, ...) +{ + va_list args; + int i = 0; + + va_start(args, fmt); + + while (fmt[i]) { + if (fmt[i] != '%') { +put: + putchar(fmt[i++]); + continue; + } + + if (fmt[++i] == '%') + goto put; + ++i; + print_hex(va_arg(args, unsigned long)); + } + + va_end(args); + return 0; +} + +/* helper functions for libgcc */ +void abort(void) +{ + error("aborted."); +} + +#undef malloc +void *malloc(size_t size) +{ + return malloc_gzip(size); +} + +#undef free +void free(void *ptr) +{ + return free_gzip(ptr); +} + + +static void flush_data_cache(char *start, unsigned long length) +{ + char *end = start + length; + + do { + asm volatile("fdc 0(%0)" : : "r" (start)); + asm volatile("fic 0(%%sr0,%0)" : : "r" (start)); + start += 16; + } while (start < end); + asm volatile("fdc 0(%0)" : : "r" (end)); + + asm ("sync"); +} + +unsigned long decompress_kernel(unsigned int started_wide, + unsigned int command_line, + const unsigned int rd_start, + const unsigned int rd_end) +{ + char *output; + unsigned long len, len_all; + +#ifdef CONFIG_64BIT + parisc_narrow_firmware = 0; +#endif + + set_firmware_width_unlocked(); + + putchar('U'); /* if you get this p and no more, string storage */ + /* in $GLOBAL$ is wrong or %dp is wrong */ + puts("ncompressing ...\n"); + + output = (char *) KERNEL_BINARY_TEXT_START; + len_all = __pa(SZ_end) - __pa(SZparisc_kernel_start); + + if ((unsigned long) &_startcode_end > (unsigned long) output) + error("Bootcode overlaps kernel code"); + + len = get_unaligned_le32(&output_len); + if (len > len_all) + error("Output len too big."); + else + memset(&output[len], 0, len_all - len); + + /* + * Initialize free_mem_ptr and free_mem_end_ptr. + */ + free_mem_ptr = (unsigned long) &_ebss; + free_mem_ptr += 2*1024*1024; /* leave 2 MB for stack */ + + /* Limit memory for bootoader to 1GB */ + #define ARTIFICIAL_LIMIT (1*1024*1024*1024) + free_mem_end_ptr = PAGE0->imm_max_mem; + if (free_mem_end_ptr > ARTIFICIAL_LIMIT) + free_mem_end_ptr = ARTIFICIAL_LIMIT; + +#ifdef CONFIG_BLK_DEV_INITRD + /* if we have ramdisk this is at end of memory */ + if (rd_start && rd_start < free_mem_end_ptr) + free_mem_end_ptr = rd_start; +#endif + +#ifdef DEBUG + printf("startcode_end = %x\n", &_startcode_end); + printf("commandline = %x\n", command_line); + printf("rd_start = %x\n", rd_start); + printf("rd_end = %x\n", rd_end); + + printf("free_ptr = %x\n", free_mem_ptr); + printf("free_ptr_end = %x\n", free_mem_end_ptr); + + printf("input_data = %x\n", input_data); + printf("input_len = %x\n", input_len); + printf("output = %x\n", output); + printf("output_len = %x\n", len); + printf("output_max = %x\n", len_all); +#endif + + __decompress(input_data, input_len, NULL, NULL, + output, 0, NULL, error); + + flush_data_cache(output, len); + + printf("Booting kernel ...\n\n"); + + return (unsigned long) output; +} diff --git a/arch/parisc/boot/compressed/vmlinux.lds.S b/arch/parisc/boot/compressed/vmlinux.lds.S new file mode 100644 index 000000000000..a4ce3314e78e --- /dev/null +++ b/arch/parisc/boot/compressed/vmlinux.lds.S @@ -0,0 +1,101 @@ +#include <asm-generic/vmlinux.lds.h> +#include <asm/page.h> +#include "sizes.h" + +#ifndef CONFIG_64BIT +OUTPUT_FORMAT("elf32-hppa-linux") +OUTPUT_ARCH(hppa) +#else +OUTPUT_FORMAT("elf64-hppa-linux") +OUTPUT_ARCH(hppa:hppa2.0w) +#endif + +ENTRY(startup) + +SECTIONS +{ + /* palo loads at 0x60000 */ + /* loaded kernel will move to 0x10000 */ + . = 0xe0000; /* should not overwrite palo code */ + + .head.text : { + _head = . ; + HEAD_TEXT + _ehead = . ; + } + + /* keep __gp below 0x1000000 */ +#ifdef CONFIG_64BIT + . = ALIGN(16); + /* Linkage tables */ + .opd : { + *(.opd) + } PROVIDE (__gp = .); + .plt : { + *(.plt) + } + .dlt : { + *(.dlt) + } +#endif + _startcode_end = .; + + /* bootloader code and data starts behind area of extracted kernel */ + . = (SZ_end - SZparisc_kernel_start + KERNEL_BINARY_TEXT_START); + + /* align on next page boundary */ + . = ALIGN(4096); + .text : { + _text = .; /* Text */ + *(.text) + *(.text.*) + _etext = . ; + } + . = ALIGN(8); + .data : { + _data = . ; + *(.data) + *(.data.*) + _edata = . ; + } + . = ALIGN(8); + .rodata : { + _rodata = . ; + *(.rodata) /* read-only data */ + *(.rodata.*) + _erodata = . ; + } + . = ALIGN(8); + .rodata.compressed : { + *(.rodata.compressed) + } + . = ALIGN(8); + .bss : { + _bss = . ; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(4096); + _ebss = .; + } + + STABS_DEBUG + .note 0 : { *(.note) } + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { +#ifdef CONFIG_64BIT + /* temporary hack until binutils is fixed to not emit these + * for static binaries + */ + *(.PARISC.unwind) /* no unwind data */ + *(.interp) + *(.dynsym) + *(.dynstr) + *(.dynamic) + *(.hash) + *(.gnu.hash) +#endif + } +} diff --git a/arch/parisc/boot/compressed/vmlinux.scr b/arch/parisc/boot/compressed/vmlinux.scr new file mode 100644 index 000000000000..dac2d142bcfa --- /dev/null +++ b/arch/parisc/boot/compressed/vmlinux.scr @@ -0,0 +1,10 @@ +SECTIONS +{ + .rodata.compressed : { + input_len = .; + LONG(input_data_end - input_data) input_data = .; + *(.data) + output_len = . - 4; /* can be at unaligned address */ + input_data_end = .; + } +} diff --git a/arch/parisc/boot/install.sh b/arch/parisc/boot/install.sh new file mode 100644 index 000000000000..8f7c365fad83 --- /dev/null +++ b/arch/parisc/boot/install.sh @@ -0,0 +1,65 @@ +#!/bin/sh +# +# arch/parisc/install.sh, derived from arch/i386/boot/install.sh +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# Copyright (C) 1995 by Linus Torvalds +# +# Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin +# +# "make install" script for i386 architecture +# +# Arguments: +# $1 - kernel version +# $2 - kernel image file +# $3 - kernel map file +# $4 - default install path (blank if root directory) +# + +verify () { + if [ ! -f "$1" ]; then + echo "" 1>&2 + echo " *** Missing file: $1" 1>&2 + echo ' *** You need to run "make" before "make install".' 1>&2 + echo "" 1>&2 + exit 1 + fi +} + +# Make sure the files actually exist + +verify "$2" +verify "$3" + +# User may have a custom install script + +if [ -n "${INSTALLKERNEL}" ]; then + if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi + if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi +fi + +# Default install + +if [ "$(basename $2)" = "zImage" ]; then +# Compressed install + echo "Installing compressed kernel" + base=vmlinuz +else +# Normal install + echo "Installing normal kernel" + base=vmlinux +fi + +if [ -f $4/$base-$1 ]; then + mv $4/$base-$1 $4/$base-$1.old +fi +cat $2 > $4/$base-$1 + +# Install system map file +if [ -f $4/System.map-$1 ]; then + mv $4/System.map-$1 $4/System.map-$1.old +fi +cp $3 $4/System.map-$1 diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig index 0764d3971cf6..8d41a73bd71b 100644 --- a/arch/parisc/configs/c3000_defconfig +++ b/arch/parisc/configs/c3000_defconfig @@ -31,7 +31,6 @@ CONFIG_IP_PNP_BOOTP=y CONFIG_INET6_IPCOMP=m CONFIG_IPV6_TUNNEL=m CONFIG_NETFILTER=y -CONFIG_NETFILTER_DEBUG=y CONFIG_NET_PKTGEN=m CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h index 5394b9c5f914..17b98a87e5e2 100644 --- a/arch/parisc/include/asm/atomic.h +++ b/arch/parisc/include/asm/atomic.h @@ -65,6 +65,8 @@ static __inline__ void atomic_set(atomic_t *v, int i) _atomic_spin_unlock_irqrestore(v, flags); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + static __inline__ int atomic_read(const atomic_t *v) { return READ_ONCE((v)->counter); diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h index 0ba14300cd8e..c601aab2fb36 100644 --- a/arch/parisc/include/asm/futex.h +++ b/arch/parisc/include/asm/futex.h @@ -32,22 +32,12 @@ _futex_spin_unlock_irqrestore(u32 __user *uaddr, unsigned long int *flags) } static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { unsigned long int flags; - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval, ret; u32 tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(*uaddr))) - return -EFAULT; - _futex_spin_lock_irqsave(uaddr, &flags); pagefault_disable(); @@ -85,17 +75,9 @@ out_pagefault_enable: pagefault_enable(); _futex_spin_unlock_irqrestore(uaddr, &flags); - if (ret == 0) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/parisc/include/asm/mmu_context.h b/arch/parisc/include/asm/mmu_context.h index a81226257878..e4a657094058 100644 --- a/arch/parisc/include/asm/mmu_context.h +++ b/arch/parisc/include/asm/mmu_context.h @@ -63,6 +63,9 @@ static inline void switch_mm(struct mm_struct *prev, { unsigned long flags; + if (prev == next) + return; + local_irq_save(flags); switch_mm_irqs_off(prev, next, tsk); local_irq_restore(flags); diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h index 80e742a1c162..bfed09d80bae 100644 --- a/arch/parisc/include/asm/page.h +++ b/arch/parisc/include/asm/page.h @@ -116,11 +116,15 @@ extern int npmem_ranges; /* This governs the relationship between virtual and physical addresses. * If you alter it, make sure to take care of our various fixed mapping * segments in fixmap.h */ +#if defined(BOOTLOADER) +#define __PAGE_OFFSET (0) /* bootloader uses physical addresses */ +#else #ifdef CONFIG_64BIT #define __PAGE_OFFSET (0x40000000) /* 1GB */ #else #define __PAGE_OFFSET (0x10000000) /* 256MB */ #endif +#endif /* BOOTLOADER */ #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) diff --git a/arch/parisc/include/asm/pdc.h b/arch/parisc/include/asm/pdc.h index 7569627a032b..26b4455baa83 100644 --- a/arch/parisc/include/asm/pdc.h +++ b/arch/parisc/include/asm/pdc.h @@ -5,6 +5,8 @@ #if !defined(__ASSEMBLY__) +extern int parisc_narrow_firmware; + extern int pdc_type; extern unsigned long parisc_cell_num; /* cell number the CPU runs on (PAT) */ extern unsigned long parisc_cell_loc; /* cell location of CPU (PAT) */ diff --git a/arch/parisc/include/asm/pdcpat.h b/arch/parisc/include/asm/pdcpat.h index e3c0586260d8..a468a172ee33 100644 --- a/arch/parisc/include/asm/pdcpat.h +++ b/arch/parisc/include/asm/pdcpat.h @@ -223,6 +223,18 @@ struct pdc_pat_mem_retinfo { /* PDC_PAT_MEM/PDC_PAT_MEM_PD_INFO (return info) */ unsigned long clear_time; /* last PDT clear time (since Jan 1970) */ }; +struct pdc_pat_mem_cell_pdt_retinfo { /* PDC_PAT_MEM/PDC_PAT_MEM_CELL_INFO */ + u64 reserved:32; + u64 cs:1; /* clear status: cleared since the last call? */ + u64 current_pdt_entries:15; + u64 ic:1; /* interleaving had to be changed ? */ + u64 max_pdt_entries:15; + unsigned long good_mem; + unsigned long first_dbe_loc; /* first location of double bit error */ + unsigned long clear_time; /* last PDT clear time (since Jan 1970) */ +}; + + struct pdc_pat_mem_read_pd_retinfo { /* PDC_PAT_MEM/PDC_PAT_MEM_PD_READ */ unsigned long actual_count_bytes; unsigned long pdt_entries; @@ -325,6 +337,8 @@ extern int pdc_pat_io_pci_cfg_read(unsigned long pci_addr, int pci_size, u32 *va extern int pdc_pat_io_pci_cfg_write(unsigned long pci_addr, int pci_size, u32 val); extern int pdc_pat_mem_pdt_info(struct pdc_pat_mem_retinfo *rinfo); +extern int pdc_pat_mem_pdt_cell_info(struct pdc_pat_mem_cell_pdt_retinfo *rinfo, + unsigned long cell); extern int pdc_pat_mem_read_cell_pdt(struct pdc_pat_mem_read_pd_retinfo *pret, unsigned long *pdt_entries_ptr, unsigned long max_entries); extern int pdc_pat_mem_read_pd_pdt(struct pdc_pat_mem_read_pd_retinfo *pret, diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h index e32936cd7f10..55bfe4affca3 100644 --- a/arch/parisc/include/asm/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h @@ -14,13 +14,6 @@ static inline int arch_spin_is_locked(arch_spinlock_t *x) #define arch_spin_lock(lock) arch_spin_lock_flags(lock, 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *x) -{ - volatile unsigned int *a = __ldcw_align(x); - - smp_cond_load_acquire(a, VAL); -} - static inline void arch_spin_lock_flags(arch_spinlock_t *x, unsigned long flags) { diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 5979745815a5..775b5d5e41a1 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -40,9 +40,6 @@ #define MADV_SEQUENTIAL 2 /* expect sequential page references */ #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_DONTNEED 4 /* don't need these pages */ -#define MADV_SPACEAVAIL 5 /* insure that resources are reserved */ -#define MADV_VPS_PURGE 6 /* Purge pages from VM page cache */ -#define MADV_VPS_INHERIT 7 /* Inherit parents page size */ /* common/generic parameters */ #define MADV_FREE 8 /* free pages only if memory pressure */ @@ -60,21 +57,16 @@ overrides the coredump filter bits */ #define MADV_DODUMP 70 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 71 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 72 /* Undo MADV_WIPEONFORK */ + +#define MADV_HWPOISON 100 /* poison a page for testing */ +#define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */ + /* compatibility flags */ #define MAP_FILE 0 #define MAP_VARIABLE 0 -/* - * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. - * This gives us 6 bits, which is enough until someone invents 128 bit address - * spaces. - * - * Assume these are all power of twos. - * When 0 use the default page size. - */ -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f - #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index a0d4dc9f4eb2..3b2bf7ae703b 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -101,4 +101,6 @@ #define SO_PEERGROUPS 0x4034 +#define SO_ZEROCOPY 0x4035 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c index f622a311d04a..ab80e5c6f651 100644 --- a/arch/parisc/kernel/firmware.c +++ b/arch/parisc/kernel/firmware.c @@ -69,7 +69,15 @@ #include <asm/pdcpat.h> #include <asm/processor.h> /* for boot_cpu_data */ +#if defined(BOOTLOADER) +# undef spin_lock_irqsave +# define spin_lock_irqsave(a, b) { b = 1; } +# undef spin_unlock_irqrestore +# define spin_unlock_irqrestore(a, b) +#else static DEFINE_SPINLOCK(pdc_lock); +#endif + extern unsigned long pdc_result[NUM_PDC_RESULT]; extern unsigned long pdc_result2[NUM_PDC_RESULT]; @@ -142,8 +150,8 @@ static void convert_to_wide(unsigned long *addr) int i; unsigned int *p = (unsigned int *)addr; - if(unlikely(parisc_narrow_firmware)) { - for(i = 31; i >= 0; --i) + if (unlikely(parisc_narrow_firmware)) { + for (i = (NUM_PDC_RESULT-1); i >= 0; --i) addr[i] = p[i]; } #endif @@ -186,6 +194,8 @@ void set_firmware_width(void) } #endif /*CONFIG_64BIT*/ + +#if !defined(BOOTLOADER) /** * pdc_emergency_unlock - Unlock the linux pdc lock * @@ -979,16 +989,22 @@ int pdc_mem_pdt_read_entries(struct pdc_mem_read_pdt *pret, spin_lock_irqsave(&pdc_lock, flags); retval = mem_pdc_call(PDC_MEM, PDC_MEM_READ_PDT, __pa(pdc_result), - __pa(pdc_result2)); + __pa(pdt_entries_ptr)); if (retval == PDC_OK) { convert_to_wide(pdc_result); memcpy(pret, pdc_result, sizeof(*pret)); - convert_to_wide(pdc_result2); - memcpy(pdt_entries_ptr, pdc_result2, - pret->pdt_entries * sizeof(*pdt_entries_ptr)); } spin_unlock_irqrestore(&pdc_lock, flags); +#ifdef CONFIG_64BIT + /* + * 64-bit kernels should not call this PDT function in narrow mode. + * The pdt_entries_ptr array above will now contain 32-bit values + */ + if (WARN_ON_ONCE((retval == PDC_OK) && parisc_narrow_firmware)) + return PDC_ERROR; +#endif + return retval; } @@ -1143,6 +1159,8 @@ void pdc_io_reset_devices(void) spin_unlock_irqrestore(&pdc_lock, flags); } +#endif /* defined(BOOTLOADER) */ + /* locked by pdc_console_lock */ static int __attribute__((aligned(8))) iodc_retbuf[32]; static char __attribute__((aligned(64))) iodc_dbuf[4096]; @@ -1187,6 +1205,7 @@ print: return i; } +#if !defined(BOOTLOADER) /** * pdc_iodc_getc - Read a character (non-blocking) from the PDC console. * @@ -1440,6 +1459,29 @@ int pdc_pat_mem_pdt_info(struct pdc_pat_mem_retinfo *rinfo) } /** + * pdc_pat_mem_pdt_cell_info - Retrieve information about page deallocation + * table of a cell + * @rinfo: memory pdt information + * @cell: cell number + * + */ +int pdc_pat_mem_pdt_cell_info(struct pdc_pat_mem_cell_pdt_retinfo *rinfo, + unsigned long cell) +{ + int retval; + unsigned long flags; + + spin_lock_irqsave(&pdc_lock, flags); + retval = mem_pdc_call(PDC_PAT_MEM, PDC_PAT_MEM_CELL_INFO, + __pa(&pdc_result), cell); + if (retval == PDC_OK) + memcpy(rinfo, &pdc_result, sizeof(*rinfo)); + spin_unlock_irqrestore(&pdc_lock, flags); + + return retval; +} + +/** * pdc_pat_mem_read_cell_pdt - Read PDT entries from (old) PAT firmware * @pret: array of PDT entries * @pdt_entries_ptr: ptr to hold number of PDT entries @@ -1455,14 +1497,14 @@ int pdc_pat_mem_read_cell_pdt(struct pdc_pat_mem_read_pd_retinfo *pret, spin_lock_irqsave(&pdc_lock, flags); /* PDC_PAT_MEM_CELL_READ is available on early PAT machines only */ retval = mem_pdc_call(PDC_PAT_MEM, PDC_PAT_MEM_CELL_READ, - __pa(&pdc_result), parisc_cell_num, __pa(&pdc_result2)); + __pa(&pdc_result), parisc_cell_num, + __pa(pdt_entries_ptr)); if (retval == PDC_OK) { /* build up return value as for PDC_PAT_MEM_PD_READ */ entries = min(pdc_result[0], max_entries); pret->pdt_entries = entries; pret->actual_count_bytes = entries * sizeof(unsigned long); - memcpy(pdt_entries_ptr, &pdc_result2, pret->actual_count_bytes); } spin_unlock_irqrestore(&pdc_lock, flags); @@ -1474,6 +1516,8 @@ int pdc_pat_mem_read_cell_pdt(struct pdc_pat_mem_read_pd_retinfo *pret, * pdc_pat_mem_read_pd_pdt - Read PDT entries from (newer) PAT firmware * @pret: array of PDT entries * @pdt_entries_ptr: ptr to hold number of PDT entries + * @count: number of bytes to read + * @offset: offset to start (in bytes) * */ int pdc_pat_mem_read_pd_pdt(struct pdc_pat_mem_read_pd_retinfo *pret, @@ -1524,6 +1568,7 @@ int pdc_pat_mem_get_dimm_phys_location( return retval; } #endif /* CONFIG_64BIT */ +#endif /* defined(BOOTLOADER) */ /***************** 32-bit real-mode calls ***********/ @@ -1633,4 +1678,3 @@ long real64_call(unsigned long fn, ...) } #endif /* CONFIG_64BIT */ - diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index 5f0067a62738..bd4c0a7471d3 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -41,7 +41,7 @@ static unsigned long pcxl_used_bytes __read_mostly = 0; static unsigned long pcxl_used_pages __read_mostly = 0; extern unsigned long pcxl_dma_start; /* Start of pcxl dma mapping area */ -static spinlock_t pcxl_res_lock; +static DEFINE_SPINLOCK(pcxl_res_lock); static char *pcxl_res_map; static int pcxl_res_hint; static int pcxl_res_size; @@ -390,7 +390,6 @@ pcxl_dma_init(void) if (pcxl_dma_start == 0) return 0; - spin_lock_init(&pcxl_res_lock); pcxl_res_size = PCXL_DMA_MAP_SIZE >> (PAGE_SHIFT + 3); pcxl_res_hint = 0; pcxl_res_map = (char *)__get_free_pages(GFP_KERNEL, diff --git a/arch/parisc/kernel/pdt.c b/arch/parisc/kernel/pdt.c index d02874ecb94d..05730a83895c 100644 --- a/arch/parisc/kernel/pdt.c +++ b/arch/parisc/kernel/pdt.c @@ -1,19 +1,20 @@ /* * Page Deallocation Table (PDT) support * - * The Page Deallocation Table (PDT) holds a table with pointers to bad - * memory (broken RAM modules) which is maintained by firmware. + * The Page Deallocation Table (PDT) is maintained by firmware and holds a + * list of memory addresses in which memory errors were detected. + * The list contains both single-bit (correctable) and double-bit + * (uncorrectable) errors. * * Copyright 2017 by Helge Deller <deller@gmx.de> * - * TODO: - * - check regularily for new bad memory - * - add userspace interface with procfs or sysfs - * - increase number of PDT entries dynamically + * possible future enhancements: + * - add userspace interface via procfs or sysfs to clear PDT */ #include <linux/memblock.h> #include <linux/seq_file.h> +#include <linux/kthread.h> #include <asm/pdc.h> #include <asm/pdcpat.h> @@ -24,11 +25,16 @@ enum pdt_access_type { PDT_NONE, PDT_PDC, PDT_PAT_NEW, - PDT_PAT_OLD + PDT_PAT_CELL }; static enum pdt_access_type pdt_type; +/* PDT poll interval: 1 minute if errors, 5 minutes if everything OK. */ +#define PDT_POLL_INTERVAL_DEFAULT (5*60*HZ) +#define PDT_POLL_INTERVAL_SHORT (1*60*HZ) +static unsigned long pdt_poll_interval = PDT_POLL_INTERVAL_DEFAULT; + /* global PDT status information */ static struct pdc_mem_retinfo pdt_status; @@ -36,6 +42,21 @@ static struct pdc_mem_retinfo pdt_status; #define MAX_PDT_ENTRIES (MAX_PDT_TABLE_SIZE / sizeof(unsigned long)) static unsigned long pdt_entry[MAX_PDT_ENTRIES] __page_aligned_bss; +/* + * Constants for the pdt_entry format: + * A pdt_entry holds the physical address in bits 0-57, bits 58-61 are + * reserved, bit 62 is the perm bit and bit 63 is the error_type bit. + * The perm bit indicates whether the error have been verified as a permanent + * error (value of 1) or has not been verified, and may be transient (value + * of 0). The error_type bit indicates whether the error is a single bit error + * (value of 1) or a multiple bit error. + * On non-PAT machines phys_addr is encoded in bits 0-59 and error_type in bit + * 63. Those machines don't provide the perm bit. + */ + +#define PDT_ADDR_PHYS_MASK (pdt_type != PDT_PDC ? ~0x3f : ~0x0f) +#define PDT_ADDR_PERM_ERR (pdt_type != PDT_PDC ? 2UL : 0UL) +#define PDT_ADDR_SINGLE_ERR 1UL /* report PDT entries via /proc/meminfo */ void arch_report_meminfo(struct seq_file *m) @@ -49,6 +70,68 @@ void arch_report_meminfo(struct seq_file *m) pdt_status.pdt_entries); } +static int get_info_pat_new(void) +{ + struct pdc_pat_mem_retinfo pat_rinfo; + int ret; + + /* newer PAT machines like C8000 report info for all cells */ + if (is_pdc_pat()) + ret = pdc_pat_mem_pdt_info(&pat_rinfo); + else + return PDC_BAD_PROC; + + pdt_status.pdt_size = pat_rinfo.max_pdt_entries; + pdt_status.pdt_entries = pat_rinfo.current_pdt_entries; + pdt_status.pdt_status = 0; + pdt_status.first_dbe_loc = pat_rinfo.first_dbe_loc; + pdt_status.good_mem = pat_rinfo.good_mem; + + return ret; +} + +static int get_info_pat_cell(void) +{ + struct pdc_pat_mem_cell_pdt_retinfo cell_rinfo; + int ret; + + /* older PAT machines like rp5470 report cell info only */ + if (is_pdc_pat()) + ret = pdc_pat_mem_pdt_cell_info(&cell_rinfo, parisc_cell_num); + else + return PDC_BAD_PROC; + + pdt_status.pdt_size = cell_rinfo.max_pdt_entries; + pdt_status.pdt_entries = cell_rinfo.current_pdt_entries; + pdt_status.pdt_status = 0; + pdt_status.first_dbe_loc = cell_rinfo.first_dbe_loc; + pdt_status.good_mem = cell_rinfo.good_mem; + + return ret; +} + +static void report_mem_err(unsigned long pde) +{ + struct pdc_pat_mem_phys_mem_location loc; + unsigned long addr; + char dimm_txt[32]; + + addr = pde & PDT_ADDR_PHYS_MASK; + + /* show DIMM slot description on PAT machines */ + if (is_pdc_pat()) { + pdc_pat_mem_get_dimm_phys_location(&loc, addr); + sprintf(dimm_txt, "DIMM slot %02x, ", loc.dimm_slot); + } else + dimm_txt[0] = 0; + + pr_warn("PDT: BAD MEMORY at 0x%08lx, %s%s%s-bit error.\n", + addr, dimm_txt, + pde & PDT_ADDR_PERM_ERR ? "permanent ":"", + pde & PDT_ADDR_SINGLE_ERR ? "single":"multi"); +} + + /* * pdc_pdt_init() * @@ -63,18 +146,17 @@ void __init pdc_pdt_init(void) unsigned long entries; struct pdc_mem_read_pdt pdt_read_ret; - if (is_pdc_pat()) { - struct pdc_pat_mem_retinfo pat_rinfo; + pdt_type = PDT_PAT_NEW; + ret = get_info_pat_new(); - pdt_type = PDT_PAT_NEW; - ret = pdc_pat_mem_pdt_info(&pat_rinfo); - pdt_status.pdt_size = pat_rinfo.max_pdt_entries; - pdt_status.pdt_entries = pat_rinfo.current_pdt_entries; - pdt_status.pdt_status = 0; - pdt_status.first_dbe_loc = pat_rinfo.first_dbe_loc; - pdt_status.good_mem = pat_rinfo.good_mem; - } else { + if (ret != PDC_OK) { + pdt_type = PDT_PAT_CELL; + ret = get_info_pat_cell(); + } + + if (ret != PDC_OK) { pdt_type = PDT_PDC; + /* non-PAT machines provide the standard PDC call */ ret = pdc_mem_pdt_info(&pdt_status); } @@ -86,13 +168,17 @@ void __init pdc_pdt_init(void) } entries = pdt_status.pdt_entries; - WARN_ON(entries > MAX_PDT_ENTRIES); + if (WARN_ON(entries > MAX_PDT_ENTRIES)) + entries = pdt_status.pdt_entries = MAX_PDT_ENTRIES; - pr_info("PDT: size %lu, entries %lu, status %lu, dbe_loc 0x%lx," - " good_mem %lu\n", + pr_info("PDT: type %s, size %lu, entries %lu, status %lu, dbe_loc 0x%lx," + " good_mem %lu MB\n", + pdt_type == PDT_PDC ? __stringify(PDT_PDC) : + pdt_type == PDT_PAT_CELL ? __stringify(PDT_PAT_CELL) + : __stringify(PDT_PAT_NEW), pdt_status.pdt_size, pdt_status.pdt_entries, pdt_status.pdt_status, pdt_status.first_dbe_loc, - pdt_status.good_mem); + pdt_status.good_mem / 1024 / 1024); if (entries == 0) { pr_info("PDT: Firmware reports all memory OK.\n"); @@ -112,15 +198,12 @@ void __init pdc_pdt_init(void) #ifdef CONFIG_64BIT struct pdc_pat_mem_read_pd_retinfo pat_pret; - /* try old obsolete PAT firmware function first */ - pdt_type = PDT_PAT_OLD; - ret = pdc_pat_mem_read_cell_pdt(&pat_pret, pdt_entry, - MAX_PDT_ENTRIES); - if (ret != PDC_OK) { - pdt_type = PDT_PAT_NEW; + if (pdt_type == PDT_PAT_CELL) + ret = pdc_pat_mem_read_cell_pdt(&pat_pret, pdt_entry, + MAX_PDT_ENTRIES); + else ret = pdc_pat_mem_read_pd_pdt(&pat_pret, pdt_entry, MAX_PDT_TABLE_SIZE, 0); - } #else ret = PDC_BAD_PROC; #endif @@ -128,27 +211,142 @@ void __init pdc_pdt_init(void) if (ret != PDC_OK) { pdt_type = PDT_NONE; - pr_debug("PDT type %d, retval = %d\n", pdt_type, ret); + pr_warn("PDT: Get PDT entries failed with %d\n", ret); return; } for (i = 0; i < pdt_status.pdt_entries; i++) { - struct pdc_pat_mem_phys_mem_location loc; + report_mem_err(pdt_entry[i]); + + /* mark memory page bad */ + memblock_reserve(pdt_entry[i] & PAGE_MASK, PAGE_SIZE); + } +} - /* get DIMM slot number */ - loc.dimm_slot = 0xff; + +/* + * This is the PDT kernel thread main loop. + */ + +static int pdt_mainloop(void *unused) +{ + struct pdc_mem_read_pdt pdt_read_ret; + struct pdc_pat_mem_read_pd_retinfo pat_pret __maybe_unused; + unsigned long old_num_entries; + unsigned long *bad_mem_ptr; + int num, ret; + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + old_num_entries = pdt_status.pdt_entries; + + schedule_timeout(pdt_poll_interval); + if (kthread_should_stop()) + break; + + /* Do we have new PDT entries? */ + switch (pdt_type) { + case PDT_PAT_NEW: + ret = get_info_pat_new(); + break; + case PDT_PAT_CELL: + ret = get_info_pat_cell(); + break; + default: + ret = pdc_mem_pdt_info(&pdt_status); + break; + } + + if (ret != PDC_OK) { + pr_warn("PDT: unexpected failure %d\n", ret); + return -EINVAL; + } + + /* if no new PDT entries, just wait again */ + num = pdt_status.pdt_entries - old_num_entries; + if (num <= 0) + continue; + + /* decrease poll interval in case we found memory errors */ + if (pdt_status.pdt_entries && + pdt_poll_interval == PDT_POLL_INTERVAL_DEFAULT) + pdt_poll_interval = PDT_POLL_INTERVAL_SHORT; + + /* limit entries to get */ + if (num > MAX_PDT_ENTRIES) { + num = MAX_PDT_ENTRIES; + pdt_status.pdt_entries = old_num_entries + num; + } + + /* get new entries */ + switch (pdt_type) { #ifdef CONFIG_64BIT - pdc_pat_mem_get_dimm_phys_location(&loc, pdt_entry[i]); + case PDT_PAT_CELL: + if (pdt_status.pdt_entries > MAX_PDT_ENTRIES) { + pr_crit("PDT: too many entries.\n"); + return -ENOMEM; + } + ret = pdc_pat_mem_read_cell_pdt(&pat_pret, pdt_entry, + MAX_PDT_ENTRIES); + bad_mem_ptr = &pdt_entry[old_num_entries]; + break; + case PDT_PAT_NEW: + ret = pdc_pat_mem_read_pd_pdt(&pat_pret, + pdt_entry, + num * sizeof(unsigned long), + old_num_entries * sizeof(unsigned long)); + bad_mem_ptr = &pdt_entry[0]; + break; #endif + default: + ret = pdc_mem_pdt_read_entries(&pdt_read_ret, + pdt_entry); + bad_mem_ptr = &pdt_entry[old_num_entries]; + break; + } - pr_warn("PDT: BAD PAGE #%d at 0x%08lx, " - "DIMM slot %02x (error_type = %lu)\n", - i, - pdt_entry[i] & PAGE_MASK, - loc.dimm_slot, - pdt_entry[i] & 1); + /* report and mark memory broken */ + while (num--) { + unsigned long pde = *bad_mem_ptr++; - /* mark memory page bad */ - memblock_reserve(pdt_entry[i] & PAGE_MASK, PAGE_SIZE); + report_mem_err(pde); + +#ifdef CONFIG_MEMORY_FAILURE + if ((pde & PDT_ADDR_PERM_ERR) || + ((pde & PDT_ADDR_SINGLE_ERR) == 0)) + memory_failure(pde >> PAGE_SHIFT, 0, 0); + else + soft_offline_page( + pfn_to_page(pde >> PAGE_SHIFT), 0); +#else + pr_crit("PDT: memory error at 0x%lx ignored.\n" + "Rebuild kernel with CONFIG_MEMORY_FAILURE=y " + "for real handling.\n", + pde & PDT_ADDR_PHYS_MASK); +#endif + + } } + + return 0; } + + +static int __init pdt_initcall(void) +{ + struct task_struct *kpdtd_task; + + if (pdt_type == PDT_NONE) + return -ENODEV; + + kpdtd_task = kthread_create(pdt_mainloop, NULL, "kpdtd"); + if (IS_ERR(kpdtd_task)) + return PTR_ERR(kpdtd_task); + + wake_up_process(kpdtd_task); + + return 0; +} + +late_initcall(pdt_initcall); diff --git a/arch/parisc/kernel/perf.c b/arch/parisc/kernel/perf.c index 6017a5af2e6e..0813359049ae 100644 --- a/arch/parisc/kernel/perf.c +++ b/arch/parisc/kernel/perf.c @@ -69,7 +69,7 @@ struct rdr_tbl_ent { static int perf_processor_interface __read_mostly = UNKNOWN_INTF; static int perf_enabled __read_mostly; -static spinlock_t perf_lock; +static DEFINE_SPINLOCK(perf_lock); struct parisc_device *cpu_device __read_mostly; /* RDRs to write for PCX-W */ @@ -533,8 +533,6 @@ static int __init perf_init(void) /* Patch the images to match the system */ perf_patch_images(); - spin_lock_init(&perf_lock); - /* TODO: this only lets us access the first cpu.. what to do for SMP? */ cpu_device = per_cpu(cpu_data, 0).dev; printk("Performance monitoring counters enabled for %s\n", diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index 0ab32779dfa7..a778bd3c107c 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -30,6 +30,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/seq_file.h> +#include <linux/random.h> #include <linux/slab.h> #include <linux/cpu.h> #include <asm/param.h> @@ -89,7 +90,7 @@ init_percpu_prof(unsigned long cpunum) * (return 1). If so, initialize the chip and tell other partners in crime * they have work to do. */ -static int processor_probe(struct parisc_device *dev) +static int __init processor_probe(struct parisc_device *dev) { unsigned long txn_addr; unsigned long cpuid; @@ -237,28 +238,45 @@ static int processor_probe(struct parisc_device *dev) */ void __init collect_boot_cpu_data(void) { + unsigned long cr16_seed; + memset(&boot_cpu_data, 0, sizeof(boot_cpu_data)); + cr16_seed = get_cycles(); + add_device_randomness(&cr16_seed, sizeof(cr16_seed)); + boot_cpu_data.cpu_hz = 100 * PAGE0->mem_10msec; /* Hz of this PARISC */ /* get CPU-Model Information... */ #define p ((unsigned long *)&boot_cpu_data.pdc.model) - if (pdc_model_info(&boot_cpu_data.pdc.model) == PDC_OK) + if (pdc_model_info(&boot_cpu_data.pdc.model) == PDC_OK) { printk(KERN_INFO "model %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8]); + + add_device_randomness(&boot_cpu_data.pdc.model, + sizeof(boot_cpu_data.pdc.model)); + } #undef p - if (pdc_model_versions(&boot_cpu_data.pdc.versions, 0) == PDC_OK) + if (pdc_model_versions(&boot_cpu_data.pdc.versions, 0) == PDC_OK) { printk(KERN_INFO "vers %08lx\n", boot_cpu_data.pdc.versions); - if (pdc_model_cpuid(&boot_cpu_data.pdc.cpuid) == PDC_OK) + add_device_randomness(&boot_cpu_data.pdc.versions, + sizeof(boot_cpu_data.pdc.versions)); + } + + if (pdc_model_cpuid(&boot_cpu_data.pdc.cpuid) == PDC_OK) { printk(KERN_INFO "CPUID vers %ld rev %ld (0x%08lx)\n", (boot_cpu_data.pdc.cpuid >> 5) & 127, boot_cpu_data.pdc.cpuid & 31, boot_cpu_data.pdc.cpuid); + add_device_randomness(&boot_cpu_data.pdc.cpuid, + sizeof(boot_cpu_data.pdc.cpuid)); + } + if (pdc_model_capabilities(&boot_cpu_data.pdc.capabilities) == PDC_OK) printk(KERN_INFO "capabilities 0x%lx\n", boot_cpu_data.pdc.capabilities); @@ -414,12 +432,12 @@ show_cpuinfo (struct seq_file *m, void *v) return 0; } -static const struct parisc_device_id processor_tbl[] = { +static const struct parisc_device_id processor_tbl[] __initconst = { { HPHW_NPROC, HVERSION_REV_ANY_ID, HVERSION_ANY_ID, SVERSION_ANY_ID }, { 0, } }; -static struct parisc_driver cpu_driver = { +static struct parisc_driver cpu_driver __refdata = { .name = "CPU", .id_table = processor_tbl, .probe = processor_probe diff --git a/arch/parisc/kernel/real2.S b/arch/parisc/kernel/real2.S index 1db58e546230..cc9963421a19 100644 --- a/arch/parisc/kernel/real2.S +++ b/arch/parisc/kernel/real2.S @@ -162,6 +162,7 @@ ENDPROC_CFI(restore_control_regs) .text .align 128 ENTRY_CFI(rfi_virt2real) +#if !defined(BOOTLOADER) /* switch to real mode... */ rsm PSW_SM_I,%r0 load32 PA(rfi_v2r_1), %r1 @@ -191,6 +192,7 @@ ENTRY_CFI(rfi_virt2real) nop rfi_v2r_1: tophys_r1 %r2 +#endif /* defined(BOOTLOADER) */ bv 0(%r2) nop ENDPROC_CFI(rfi_virt2real) @@ -198,6 +200,7 @@ ENDPROC_CFI(rfi_virt2real) .text .align 128 ENTRY_CFI(rfi_real2virt) +#if !defined(BOOTLOADER) rsm PSW_SM_I,%r0 load32 (rfi_r2v_1), %r1 nop @@ -226,6 +229,7 @@ ENTRY_CFI(rfi_real2virt) nop rfi_r2v_1: tovirt_r1 %r2 +#endif /* defined(BOOTLOADER) */ bv 0(%r2) nop ENDPROC_CFI(rfi_real2virt) diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c index 1b73690477c5..48dc7d4d20bb 100644 --- a/arch/parisc/kernel/unwind.c +++ b/arch/parisc/kernel/unwind.c @@ -34,7 +34,7 @@ extern struct unwind_table_entry __start___unwind[]; extern struct unwind_table_entry __stop___unwind[]; -static spinlock_t unwind_lock; +static DEFINE_SPINLOCK(unwind_lock); /* * the kernel unwind block is not dynamically allocated so that * we can call unwind_init as early in the bootup process as @@ -181,8 +181,6 @@ int __init unwind_init(void) start = (long)&__start___unwind[0]; stop = (long)&__stop___unwind[0]; - spin_lock_init(&unwind_lock); - printk("unwind_init: start = 0x%lx, end = 0x%lx, entries = %lu\n", start, stop, (stop - start) / sizeof(struct unwind_table_entry)); diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c index 99115cd9e790..865a7f796c7f 100644 --- a/arch/parisc/lib/memcpy.c +++ b/arch/parisc/lib/memcpy.c @@ -27,8 +27,6 @@ #include <linux/compiler.h> #include <linux/uaccess.h> -DECLARE_PER_CPU(struct exception_data, exception_data); - #define get_user_space() (uaccess_kernel() ? 0 : mfsp(3)) #define get_kernel_space() (0) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 81b0031f909f..809c468edab1 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -85,6 +85,17 @@ config NMI_IPI depends on SMP && (DEBUGGER || KEXEC_CORE || HARDLOCKUP_DETECTOR) default y +config PPC_WATCHDOG + bool + depends on HARDLOCKUP_DETECTOR + depends on HAVE_HARDLOCKUP_DETECTOR_ARCH + default y + help + This is a placeholder when the powerpc hardlockup detector + watchdog is selected (arch/powerpc/kernel/watchdog.c). It is + seleted via the generic lockup detector menu which is why we + have no standalone config option for it here. + config STACKTRACE_SUPPORT bool default y @@ -165,7 +176,7 @@ config PPC select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK - select ARCH_HAS_STRICT_KERNEL_RWX if (PPC_BOOK3S_64 && !RELOCATABLE && !HIBERNATION) + select ARCH_HAS_STRICT_KERNEL_RWX if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION) select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select HAVE_CBPF_JIT if !PPC64 select HAVE_CONTEXT_TRACKING if PPC64 @@ -356,10 +367,6 @@ config PPC_ADV_DEBUG_DAC_RANGE depends on PPC_ADV_DEBUG_REGS && 44x default y -config PPC_EMULATE_SSTEP - bool - default y if KPROBES || UPROBES || XMON || HAVE_HW_BREAKPOINT - config ZONE_DMA32 bool default y if PPC64 @@ -394,7 +401,7 @@ config HUGETLB_PAGE_SIZE_VARIABLE config MATH_EMULATION bool "Math emulation" - depends on 4xx || 8xx || PPC_MPC832x || BOOKE + depends on 4xx || PPC_8xx || PPC_MPC832x || BOOKE ---help--- Some PowerPC chips designed for embedded applications do not have a floating-point unit and therefore do not implement the @@ -956,9 +963,9 @@ config PPC_PCI_CHOICE config PCI bool "PCI support" if PPC_PCI_CHOICE - default y if !40x && !CPM2 && !8xx && !PPC_83xx \ + default y if !40x && !CPM2 && !PPC_8xx && !PPC_83xx \ && !PPC_85xx && !PPC_86xx && !GAMECUBE_COMMON - default PCI_QSPAN if !4xx && !CPM2 && 8xx + default PCI_QSPAN if PPC_8xx select GENERIC_PCI_IOMAP help Find out whether your system includes a PCI bus. PCI is the name of @@ -974,7 +981,7 @@ config PCI_SYSCALL config PCI_QSPAN bool "QSpan PCI" - depends on !4xx && !CPM2 && 8xx + depends on PPC_8xx select PPC_I8259 help Say Y here if you have a system based on a Motorola 8xx-series @@ -1165,12 +1172,23 @@ config CONSISTENT_SIZE config PIN_TLB bool "Pinned Kernel TLBs (860 ONLY)" - depends on ADVANCED_OPTIONS && 8xx + depends on ADVANCED_OPTIONS && PPC_8xx && \ + !DEBUG_PAGEALLOC && !STRICT_KERNEL_RWX + +config PIN_TLB_DATA + bool "Pinned TLB for DATA" + depends on PIN_TLB + default y config PIN_TLB_IMMR bool "Pinned TLB for IMMR" depends on PIN_TLB default y + +config PIN_TLB_TEXT + bool "Pinned TLB for TEXT" + depends on PIN_TLB + default y endmenu if PPC64 diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index e2b3e7a00c9e..1381693a4a51 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -250,7 +250,7 @@ KBUILD_AFLAGS += $(aflags-y) KBUILD_CFLAGS += $(cflags-y) head-y := arch/powerpc/kernel/head_$(BITS).o -head-$(CONFIG_8xx) := arch/powerpc/kernel/head_8xx.o +head-$(CONFIG_PPC_8xx) := arch/powerpc/kernel/head_8xx.o head-$(CONFIG_40x) := arch/powerpc/kernel/head_40x.o head-$(CONFIG_44x) := arch/powerpc/kernel/head_44x.o head-$(CONFIG_FSL_BOOKE) := arch/powerpc/kernel/head_fsl_booke.o @@ -317,6 +317,10 @@ PHONY += ppc64le_defconfig ppc64le_defconfig: $(call merge_into_defconfig,ppc64_defconfig,le) +PHONY += powernv_be_defconfig +powernv_be_defconfig: + $(call merge_into_defconfig,powernv_defconfig,be) + PHONY += mpc85xx_defconfig mpc85xx_defconfig: $(call merge_into_defconfig,mpc85xx_basic_defconfig,\ diff --git a/arch/powerpc/boot/4xx.c b/arch/powerpc/boot/4xx.c index 9d3bd4c45a24..f7da65169124 100644 --- a/arch/powerpc/boot/4xx.c +++ b/arch/powerpc/boot/4xx.c @@ -564,7 +564,7 @@ void ibm405gp_fixup_clocks(unsigned int sys_clk, unsigned int ser_clk) fbdv = 16; cbdv = ((pllmr & 0x00060000) >> 17) + 1; /* CPU:PLB */ opdv = ((pllmr & 0x00018000) >> 15) + 1; /* PLB:OPB */ - ppdv = ((pllmr & 0x00001800) >> 13) + 1; /* PLB:PCI */ + ppdv = ((pllmr & 0x00006000) >> 13) + 1; /* PLB:PCI */ epdv = ((pllmr & 0x00001800) >> 11) + 2; /* PLB:EBC */ udiv = ((cpc0_cr0 & 0x3e) >> 1) + 1; diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 6f952fe1f084..c4e6fe35c075 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -107,17 +107,18 @@ src-wlib-y := string.S crt0.S stdio.c decompress.c main.c \ $(libfdt) libfdt-wrapper.c \ ns16550.c serial.c simple_alloc.c div64.S util.S \ elf_util.c $(zlib-y) devtree.c stdlib.c \ - oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \ - uartlite.c mpc52xx-psc.c opal.c + oflib.c ofconsole.c cuboot.c cpm-serial.c \ + uartlite.c opal.c +src-wlib-$(CONFIG_PPC_MPC52XX) += mpc52xx-psc.c src-wlib-$(CONFIG_PPC64_BOOT_WRAPPER) += opal-calls.S ifndef CONFIG_PPC64_BOOT_WRAPPER src-wlib-y += crtsavres.S endif src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c src-wlib-$(CONFIG_44x) += 4xx.c ebony.c bamboo.c -src-wlib-$(CONFIG_8xx) += mpc8xx.c planetcore.c fsl-soc.c +src-wlib-$(CONFIG_PPC_8xx) += mpc8xx.c planetcore.c fsl-soc.c src-wlib-$(CONFIG_PPC_82xx) += pq2.c fsl-soc.c planetcore.c -src-wlib-$(CONFIG_EMBEDDED6xx) += mv64x60.c mv64x60_i2c.c ugecon.c fsl-soc.c +src-wlib-$(CONFIG_EMBEDDED6xx) += mpsc.c mv64x60.c mv64x60_i2c.c ugecon.c fsl-soc.c src-plat-y := of.c epapr.c src-plat-$(CONFIG_40x) += fixed-head.S ep405.c cuboot-hotfoot.c \ @@ -132,7 +133,7 @@ src-plat-$(CONFIG_44x) += treeboot-ebony.c cuboot-ebony.c treeboot-bamboo.c \ treeboot-iss4xx.c treeboot-currituck.c \ treeboot-akebono.c \ simpleboot.c fixed-head.S virtex.c -src-plat-$(CONFIG_8xx) += cuboot-8xx.c fixed-head.S ep88xc.c redboot-8xx.c +src-plat-$(CONFIG_PPC_8xx) += cuboot-8xx.c fixed-head.S ep88xc.c redboot-8xx.c src-plat-$(CONFIG_PPC_MPC52xx) += cuboot-52xx.c src-plat-$(CONFIG_PPC_82xx) += cuboot-pq2.c fixed-head.S ep8248e.c cuboot-824x.c src-plat-$(CONFIG_PPC_83xx) += cuboot-83xx.c fixed-head.S redboot-83xx.c diff --git a/arch/powerpc/boot/crt0.S b/arch/powerpc/boot/crt0.S index 12866ccb5694..dcf2f15e6797 100644 --- a/arch/powerpc/boot/crt0.S +++ b/arch/powerpc/boot/crt0.S @@ -26,17 +26,17 @@ _zimage_start_opd: #ifdef __powerpc64__ .balign 8 -p_start: .llong _start -p_etext: .llong _etext -p_bss_start: .llong __bss_start -p_end: .llong _end - -p_toc: .llong __toc_start + 0x8000 - p_base -p_dyn: .llong __dynamic_start - p_base -p_rela: .llong __rela_dyn_start - p_base -p_prom: .llong 0 +p_start: .8byte _start +p_etext: .8byte _etext +p_bss_start: .8byte __bss_start +p_end: .8byte _end + +p_toc: .8byte __toc_start + 0x8000 - p_base +p_dyn: .8byte __dynamic_start - p_base +p_rela: .8byte __rela_dyn_start - p_base +p_prom: .8byte 0 .weak _platform_stack_top -p_pstack: .llong _platform_stack_top +p_pstack: .8byte _platform_stack_top #else p_start: .long _start p_etext: .long _etext diff --git a/arch/powerpc/boot/dts/fsp2.dts b/arch/powerpc/boot/dts/fsp2.dts index 475953ada707..f10a64aeb83b 100644 --- a/arch/powerpc/boot/dts/fsp2.dts +++ b/arch/powerpc/boot/dts/fsp2.dts @@ -52,6 +52,7 @@ clocks { mmc_clk: mmc_clk { compatible = "fixed-clock"; + #clock-cells = <0>; clock-frequency = <50000000>; clock-output-names = "mmc_clk"; }; @@ -359,20 +360,6 @@ interrupts = <31 0x4 15 0x84>; }; - mmc0: sdhci@020c0000 { - compatible = "st,sdhci-stih407", "st,sdhci"; - status = "disabled"; - reg = <0x020c0000 0x20000>; - reg-names = "mmc"; - interrupt-parent = <&UIC1_3>; - interrupts = <21 0x4 22 0x4>; - interrupt-names = "mmcirq"; - pinctrl-names = "default"; - pinctrl-0 = <>; - clock-names = "mmc"; - clocks = <&mmc_clk>; - }; - plb6 { compatible = "ibm,plb6"; #address-cells = <2>; @@ -501,6 +488,24 @@ /*RXDE*/ 4 &UIC1_2 13 0x4>; }; + mmc0: mmc@20c0000 { + compatible = "st,sdhci-stih407", "st,sdhci"; + reg = <0x020c0000 0x20000>; + reg-names = "mmc"; + interrupts = <21 0x4>; + interrupt-parent = <&UIC1_3>; + interrupt-names = "mmcirq"; + pinctrl-names = "default"; + pinctrl-0 = <>; + clock-names = "mmc"; + clocks = <&mmc_clk>; + bus-width = <4>; + non-removable; + sd-uhs-sdr50; + sd-uhs-sdr104; + sd-uhs-ddr50; + }; + opb { compatible = "ibm,opb"; #address-cells = <1>; diff --git a/arch/powerpc/boot/ppc_asm.h b/arch/powerpc/boot/ppc_asm.h index 68e388ee94fe..c63299f9fdd9 100644 --- a/arch/powerpc/boot/ppc_asm.h +++ b/arch/powerpc/boot/ppc_asm.h @@ -80,4 +80,12 @@ .long 0xa6037b7d; /* mtsrr1 r11 */ \ .long 0x2400004c /* rfid */ +#ifdef CONFIG_PPC_8xx +#define MFTBL(dest) mftb dest +#define MFTBU(dest) mftbu dest +#else +#define MFTBL(dest) mfspr dest, SPRN_TBRL +#define MFTBU(dest) mfspr dest, SPRN_TBRU +#endif + #endif /* _PPC64_PPC_ASM_H */ diff --git a/arch/powerpc/boot/serial.c b/arch/powerpc/boot/serial.c index e04c1e4063ae..7b5c02b1afd0 100644 --- a/arch/powerpc/boot/serial.c +++ b/arch/powerpc/boot/serial.c @@ -120,15 +120,19 @@ int serial_console_init(void) if (dt_is_compatible(devp, "ns16550") || dt_is_compatible(devp, "pnpPNP,501")) rc = ns16550_console_init(devp, &serial_cd); +#ifdef CONFIG_EMBEDDED6xx else if (dt_is_compatible(devp, "marvell,mv64360-mpsc")) rc = mpsc_console_init(devp, &serial_cd); +#endif else if (dt_is_compatible(devp, "fsl,cpm1-scc-uart") || dt_is_compatible(devp, "fsl,cpm1-smc-uart") || dt_is_compatible(devp, "fsl,cpm2-scc-uart") || dt_is_compatible(devp, "fsl,cpm2-smc-uart")) rc = cpm_console_init(devp, &serial_cd); +#ifdef CONFIG_PPC_MPC52XX else if (dt_is_compatible(devp, "fsl,mpc5200-psc-uart")) rc = mpc5200_psc_console_init(devp, &serial_cd); +#endif else if (dt_is_compatible(devp, "xlnx,opb-uartlite-1.00.b") || dt_is_compatible(devp, "xlnx,xps-uartlite-1.00.a")) rc = uartlite_console_init(devp, &serial_cd); diff --git a/arch/powerpc/boot/util.S b/arch/powerpc/boot/util.S index 243b8497d58b..ec069177d942 100644 --- a/arch/powerpc/boot/util.S +++ b/arch/powerpc/boot/util.S @@ -71,32 +71,18 @@ udelay: add r4,r4,r5 addi r4,r4,-1 divw r4,r4,r5 /* BUS ticks */ -#ifdef CONFIG_8xx -1: mftbu r5 - mftb r6 - mftbu r7 -#else -1: mfspr r5, SPRN_TBRU - mfspr r6, SPRN_TBRL - mfspr r7, SPRN_TBRU -#endif +1: MFTBU(r5) + MFTBL(r6) + MFTBU(r7) cmpw 0,r5,r7 bne 1b /* Get [synced] base time */ addc r9,r6,r4 /* Compute end time */ addze r8,r5 -#ifdef CONFIG_8xx -2: mftbu r5 -#else -2: mfspr r5, SPRN_TBRU -#endif +2: MFTBU(r5) cmpw 0,r5,r8 blt 2b bgt 3f -#ifdef CONFIG_8xx - mftb r6 -#else - mfspr r6, SPRN_TBRL -#endif + MFTBL(r6) cmpw 0,r6,r9 blt 2b 3: blr diff --git a/arch/powerpc/configs/40x/acadia_defconfig b/arch/powerpc/configs/40x/acadia_defconfig index 3438ed99c088..e57344c3b0d7 100644 --- a/arch/powerpc/configs/40x/acadia_defconfig +++ b/arch/powerpc/configs/40x/acadia_defconfig @@ -64,4 +64,3 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/40x/ep405_defconfig b/arch/powerpc/configs/40x/ep405_defconfig index 36c44c0b560c..0f66f8a87be8 100644 --- a/arch/powerpc/configs/40x/ep405_defconfig +++ b/arch/powerpc/configs/40x/ep405_defconfig @@ -64,4 +64,3 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/40x/kilauea_defconfig b/arch/powerpc/configs/40x/kilauea_defconfig index ad2156c6e2fc..b5cc7426c21f 100644 --- a/arch/powerpc/configs/40x/kilauea_defconfig +++ b/arch/powerpc/configs/40x/kilauea_defconfig @@ -72,4 +72,3 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/40x/klondike_defconfig b/arch/powerpc/configs/40x/klondike_defconfig index 28adb782ec51..caab658d1da1 100644 --- a/arch/powerpc/configs/40x/klondike_defconfig +++ b/arch/powerpc/configs/40x/klondike_defconfig @@ -26,7 +26,6 @@ CONFIG_SCSI_SAS_ATTRS=y # CONFIG_VT is not set # CONFIG_UNIX98_PTYS is not set # CONFIG_LEGACY_PTYS is not set -# CONFIG_DEVKMEM is not set # CONFIG_HW_RANDOM is not set # CONFIG_HWMON is not set # CONFIG_USB_SUPPORT is not set diff --git a/arch/powerpc/configs/40x/makalu_defconfig b/arch/powerpc/configs/40x/makalu_defconfig index a00f434c4d47..e0b1489b7c7b 100644 --- a/arch/powerpc/configs/40x/makalu_defconfig +++ b/arch/powerpc/configs/40x/makalu_defconfig @@ -62,4 +62,3 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/40x/obs600_defconfig b/arch/powerpc/configs/40x/obs600_defconfig index e500e6a12b3e..aac06d2ad01a 100644 --- a/arch/powerpc/configs/40x/obs600_defconfig +++ b/arch/powerpc/configs/40x/obs600_defconfig @@ -72,4 +72,3 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/40x/virtex_defconfig b/arch/powerpc/configs/40x/virtex_defconfig index 65dc084a154c..a2b2770eee8f 100644 --- a/arch/powerpc/configs/40x/virtex_defconfig +++ b/arch/powerpc/configs/40x/virtex_defconfig @@ -41,9 +41,9 @@ CONFIG_NETDEVICES=y CONFIG_SERIO_XILINX_XPS_PS2=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y CONFIG_SERIAL_UARTLITE=y CONFIG_SERIAL_UARTLITE_CONSOLE=y -CONFIG_SERIAL_OF_PLATFORM=y CONFIG_XILINX_HWICAP=y CONFIG_GPIOLIB=y CONFIG_GPIO_SYSFS=y @@ -74,4 +74,3 @@ CONFIG_FONT_8x16=y CONFIG_PRINTK_TIME=y CONFIG_DEBUG_INFO=y CONFIG_DEBUG_KERNEL=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/40x/walnut_defconfig b/arch/powerpc/configs/40x/walnut_defconfig index 567f99bd64a3..6faa03cd661c 100644 --- a/arch/powerpc/configs/40x/walnut_defconfig +++ b/arch/powerpc/configs/40x/walnut_defconfig @@ -57,4 +57,3 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/44x/akebono_defconfig b/arch/powerpc/configs/44x/akebono_defconfig index 143b2fbddb46..9fcd361607e2 100644 --- a/arch/powerpc/configs/44x/akebono_defconfig +++ b/arch/powerpc/configs/44x/akebono_defconfig @@ -123,7 +123,6 @@ CONFIG_NLS_DEFAULT="n" CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DETECT_HUNG_TASK=y CONFIG_XMON=y @@ -135,5 +134,4 @@ CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_SHA1_PPC=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/configs/44x/bamboo_defconfig b/arch/powerpc/configs/44x/bamboo_defconfig index 477d99fefd9a..6f3a6ecc81e7 100644 --- a/arch/powerpc/configs/44x/bamboo_defconfig +++ b/arch/powerpc/configs/44x/bamboo_defconfig @@ -55,4 +55,3 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/44x/currituck_defconfig b/arch/powerpc/configs/44x/currituck_defconfig index 3799a26de6f4..5f1df5fe4453 100644 --- a/arch/powerpc/configs/44x/currituck_defconfig +++ b/arch/powerpc/configs/44x/currituck_defconfig @@ -81,7 +81,6 @@ CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=y CONFIG_NLS_DEFAULT="n" CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DETECT_HUNG_TASK=y CONFIG_XMON=y @@ -94,5 +93,4 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/configs/44x/ebony_defconfig b/arch/powerpc/configs/44x/ebony_defconfig index c265f54ab9e5..e2b6578993d5 100644 --- a/arch/powerpc/configs/44x/ebony_defconfig +++ b/arch/powerpc/configs/44x/ebony_defconfig @@ -59,5 +59,4 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/configs/44x/eiger_defconfig b/arch/powerpc/configs/44x/eiger_defconfig index bb6bd6d90821..f6dc23fef683 100644 --- a/arch/powerpc/configs/44x/eiger_defconfig +++ b/arch/powerpc/configs/44x/eiger_defconfig @@ -84,18 +84,14 @@ CONFIG_CRYPTO_CCM=y CONFIG_CRYPTO_GCM=y CONFIG_CRYPTO_CBC=y CONFIG_CRYPTO_CTS=y -CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_LRW=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_XTS=y -CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_XCBC=y CONFIG_CRYPTO_MD4=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_SHA1=y -CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_SHA512=y CONFIG_CRYPTO_ARC4=y CONFIG_CRYPTO_BLOWFISH=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/44x/fsp2_defconfig b/arch/powerpc/configs/44x/fsp2_defconfig index e8e6a6999852..bae6b26bcfba 100644 --- a/arch/powerpc/configs/44x/fsp2_defconfig +++ b/arch/powerpc/configs/44x/fsp2_defconfig @@ -92,8 +92,10 @@ CONFIG_MMC_DEBUG=y CONFIG_MMC_SDHCI=y CONFIG_MMC_SDHCI_PLTFM=y CONFIG_MMC_SDHCI_OF_ARASAN=y +CONFIG_MMC_SDHCI_ST=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_M41T80=y +CONFIG_RESET_CONTROLLER=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y @@ -115,7 +117,6 @@ CONFIG_PRINTK_TIME=y CONFIG_MESSAGE_LOGLEVEL_DEFAULT=3 CONFIG_DYNAMIC_DEBUG=y CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DETECT_HUNG_TASK=y CONFIG_CRYPTO_CBC=y diff --git a/arch/powerpc/configs/44x/icon_defconfig b/arch/powerpc/configs/44x/icon_defconfig index 060f2edddb71..4453a4590b1a 100644 --- a/arch/powerpc/configs/44x/icon_defconfig +++ b/arch/powerpc/configs/44x/icon_defconfig @@ -47,8 +47,6 @@ CONFIG_FUSION_LOGGING=y CONFIG_NETDEVICES=y CONFIG_IBM_EMAC=y # CONFIG_WLAN is not set -CONFIG_INPUT_MOUSEDEV_SCREEN_X=640 -CONFIG_INPUT_MOUSEDEV_SCREEN_Y=480 # CONFIG_MOUSE_PS2_ALPS is not set # CONFIG_MOUSE_PS2_LOGIPS2PP is not set # CONFIG_MOUSE_PS2_SYNAPTICS is not set @@ -94,4 +92,3 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/44x/iss476-smp_defconfig b/arch/powerpc/configs/44x/iss476-smp_defconfig index 115a6b2be18b..d24bfa6ecd62 100644 --- a/arch/powerpc/configs/44x/iss476-smp_defconfig +++ b/arch/powerpc/configs/44x/iss476-smp_defconfig @@ -63,7 +63,6 @@ CONFIG_TMPFS=y CONFIG_CRAMFS=y # CONFIG_NETWORK_FILESYSTEMS is not set CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DETECT_HUNG_TASK=y CONFIG_PPC_EARLY_DEBUG=y @@ -72,5 +71,4 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/configs/44x/katmai_defconfig b/arch/powerpc/configs/44x/katmai_defconfig index b999048c4ae6..5d3f685a7af8 100644 --- a/arch/powerpc/configs/44x/katmai_defconfig +++ b/arch/powerpc/configs/44x/katmai_defconfig @@ -60,4 +60,3 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/44x/rainier_defconfig b/arch/powerpc/configs/44x/rainier_defconfig index b8c9ee45d0a2..7b8355a5698d 100644 --- a/arch/powerpc/configs/44x/rainier_defconfig +++ b/arch/powerpc/configs/44x/rainier_defconfig @@ -66,4 +66,3 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/44x/redwood_defconfig b/arch/powerpc/configs/44x/redwood_defconfig index a4bb048448da..918cfb63f0c8 100644 --- a/arch/powerpc/configs/44x/redwood_defconfig +++ b/arch/powerpc/configs/44x/redwood_defconfig @@ -83,18 +83,14 @@ CONFIG_CRYPTO_CCM=y CONFIG_CRYPTO_GCM=y CONFIG_CRYPTO_CBC=y CONFIG_CRYPTO_CTS=y -CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_LRW=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_XTS=y -CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_XCBC=y CONFIG_CRYPTO_MD4=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_SHA1=y -CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_SHA512=y CONFIG_CRYPTO_ARC4=y CONFIG_CRYPTO_BLOWFISH=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/44x/sequoia_defconfig b/arch/powerpc/configs/44x/sequoia_defconfig index b3792fd8111d..1e04122912f3 100644 --- a/arch/powerpc/configs/44x/sequoia_defconfig +++ b/arch/powerpc/configs/44x/sequoia_defconfig @@ -67,4 +67,3 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/44x/taishan_defconfig b/arch/powerpc/configs/44x/taishan_defconfig index ff6f86241418..42cc7b4ed95f 100644 --- a/arch/powerpc/configs/44x/taishan_defconfig +++ b/arch/powerpc/configs/44x/taishan_defconfig @@ -61,4 +61,3 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/44x/virtex5_defconfig b/arch/powerpc/configs/44x/virtex5_defconfig index ce052064bcbb..99cc3dc02df1 100644 --- a/arch/powerpc/configs/44x/virtex5_defconfig +++ b/arch/powerpc/configs/44x/virtex5_defconfig @@ -40,9 +40,9 @@ CONFIG_NETDEVICES=y CONFIG_SERIO_XILINX_XPS_PS2=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y CONFIG_SERIAL_UARTLITE=y CONFIG_SERIAL_UARTLITE_CONSOLE=y -CONFIG_SERIAL_OF_PLATFORM=y CONFIG_XILINX_HWICAP=y CONFIG_GPIOLIB=y CONFIG_GPIO_SYSFS=y @@ -73,4 +73,3 @@ CONFIG_FONT_8x16=y CONFIG_PRINTK_TIME=y CONFIG_DEBUG_INFO=y CONFIG_DEBUG_KERNEL=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/44x/warp_defconfig b/arch/powerpc/configs/44x/warp_defconfig index ab932488e68b..b5c866073efd 100644 --- a/arch/powerpc/configs/44x/warp_defconfig +++ b/arch/powerpc/configs/44x/warp_defconfig @@ -97,4 +97,3 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DETECT_HUNG_TASK=y # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_BUGVERBOSE is not set -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/52xx/cm5200_defconfig b/arch/powerpc/configs/52xx/cm5200_defconfig index c1faac800806..73948e88ac82 100644 --- a/arch/powerpc/configs/52xx/cm5200_defconfig +++ b/arch/powerpc/configs/52xx/cm5200_defconfig @@ -77,4 +77,3 @@ CONFIG_DETECT_HUNG_TASK=y # CONFIG_DEBUG_BUGVERBOSE is not set CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/52xx/lite5200b_defconfig b/arch/powerpc/configs/52xx/lite5200b_defconfig index 9493b02ac660..6fc7f786c83c 100644 --- a/arch/powerpc/configs/52xx/lite5200b_defconfig +++ b/arch/powerpc/configs/52xx/lite5200b_defconfig @@ -14,6 +14,7 @@ CONFIG_PPC_MPC52xx=y CONFIG_PPC_MPC5200_SIMPLE=y CONFIG_PPC_LITE5200=y # CONFIG_PPC_PMAC is not set +CONFIG_GEN_RTC=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -44,7 +45,6 @@ CONFIG_SERIAL_MPC52xx=y CONFIG_SERIAL_MPC52xx_CONSOLE=y CONFIG_SERIAL_MPC52xx_CONSOLE_BAUD=115200 # CONFIG_HW_RANDOM is not set -CONFIG_GEN_RTC=y CONFIG_I2C=y CONFIG_I2C_CHARDEV=y CONFIG_I2C_MPC=y @@ -62,4 +62,3 @@ CONFIG_PRINTK_TIME=y CONFIG_DEBUG_INFO=y CONFIG_DETECT_HUNG_TASK=y # CONFIG_DEBUG_BUGVERBOSE is not set -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/52xx/motionpro_defconfig b/arch/powerpc/configs/52xx/motionpro_defconfig index fe8126bc1655..ae2a1f74103b 100644 --- a/arch/powerpc/configs/52xx/motionpro_defconfig +++ b/arch/powerpc/configs/52xx/motionpro_defconfig @@ -41,16 +41,16 @@ CONFIG_ATA=y CONFIG_PATA_MPC52xx=y CONFIG_NETDEVICES=y CONFIG_FEC_MPC52xx=y -CONFIG_MARVELL_PHY=y +CONFIG_MDIO_BITBANG=y +CONFIG_BROADCOM_PHY=y +CONFIG_CICADA_PHY=y CONFIG_DAVICOM_PHY=y -CONFIG_QSEMI_PHY=y +CONFIG_ICPLUS_PHY=y CONFIG_LXT_PHY=y -CONFIG_CICADA_PHY=y -CONFIG_VITESSE_PHY=y +CONFIG_MARVELL_PHY=y +CONFIG_QSEMI_PHY=y CONFIG_SMSC_PHY=y -CONFIG_BROADCOM_PHY=y -CONFIG_ICPLUS_PHY=y -CONFIG_MDIO_BITBANG=y +CONFIG_VITESSE_PHY=y # CONFIG_INPUT is not set # CONFIG_SERIO is not set # CONFIG_VT is not set @@ -90,4 +90,3 @@ CONFIG_DETECT_HUNG_TASK=y # CONFIG_DEBUG_BUGVERBOSE is not set CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/52xx/tqm5200_defconfig b/arch/powerpc/configs/52xx/tqm5200_defconfig index b8b316b884aa..0777e6efd22d 100644 --- a/arch/powerpc/configs/52xx/tqm5200_defconfig +++ b/arch/powerpc/configs/52xx/tqm5200_defconfig @@ -48,7 +48,6 @@ CONFIG_PATA_PLATFORM=y CONFIG_NETDEVICES=y CONFIG_FEC_MPC52xx=y CONFIG_LXT_PHY=y -CONFIG_FIXED_PHY=y CONFIG_SERIAL_MPC52xx=y CONFIG_SERIAL_MPC52xx_CONSOLE=y CONFIG_SERIAL_MPC52xx_CONSOLE_BAUD=115200 @@ -92,4 +91,3 @@ CONFIG_DETECT_HUNG_TASK=y # CONFIG_DEBUG_BUGVERBOSE is not set CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/83xx/asp8347_defconfig b/arch/powerpc/configs/83xx/asp8347_defconfig index b60cac088a7b..dd884df32dfd 100644 --- a/arch/powerpc/configs/83xx/asp8347_defconfig +++ b/arch/powerpc/configs/83xx/asp8347_defconfig @@ -42,7 +42,6 @@ CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=32768 CONFIG_NETDEVICES=y CONFIG_GIANFAR=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -71,4 +70,3 @@ CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/83xx/kmeter1_defconfig b/arch/powerpc/configs/83xx/kmeter1_defconfig index 9547dcdd6489..d21b5cb365f2 100644 --- a/arch/powerpc/configs/83xx/kmeter1_defconfig +++ b/arch/powerpc/configs/83xx/kmeter1_defconfig @@ -55,7 +55,6 @@ CONFIG_HDLC=y # CONFIG_INPUT is not set # CONFIG_SERIO is not set # CONFIG_VT is not set -# CONFIG_DEVKMEM is not set CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_HW_RANDOM=y diff --git a/arch/powerpc/configs/83xx/mpc8313_rdb_defconfig b/arch/powerpc/configs/83xx/mpc8313_rdb_defconfig index 80aa844c1428..1f69f4edf074 100644 --- a/arch/powerpc/configs/83xx/mpc8313_rdb_defconfig +++ b/arch/powerpc/configs/83xx/mpc8313_rdb_defconfig @@ -48,8 +48,6 @@ CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_E100=y CONFIG_CICADA_PHY=y -CONFIG_FIXED_PHY=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -87,4 +85,3 @@ CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y CONFIG_DETECT_HUNG_TASK=y CONFIG_CRYPTO_PCBC=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/83xx/mpc8315_rdb_defconfig b/arch/powerpc/configs/83xx/mpc8315_rdb_defconfig index d89d13bc6901..797fc3ffddee 100644 --- a/arch/powerpc/configs/83xx/mpc8315_rdb_defconfig +++ b/arch/powerpc/configs/83xx/mpc8315_rdb_defconfig @@ -47,7 +47,6 @@ CONFIG_MD_RAID1=y CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_E100=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -85,4 +84,3 @@ CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y CONFIG_DETECT_HUNG_TASK=y CONFIG_CRYPTO_PCBC=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/83xx/mpc832x_mds_defconfig b/arch/powerpc/configs/83xx/mpc832x_mds_defconfig index e789518a2881..4f914906ee4b 100644 --- a/arch/powerpc/configs/83xx/mpc832x_mds_defconfig +++ b/arch/powerpc/configs/83xx/mpc832x_mds_defconfig @@ -14,7 +14,6 @@ CONFIG_PARTITION_ADVANCED=y # CONFIG_PPC_PMAC is not set CONFIG_PPC_83xx=y CONFIG_MPC832x_MDS=y -CONFIG_QUICC_ENGINE=y CONFIG_MATH_EMULATION=y CONFIG_PCI=y CONFIG_NET=y @@ -36,7 +35,6 @@ CONFIG_SCSI=y CONFIG_NETDEVICES=y CONFIG_UCC_GETH=y CONFIG_DAVICOM_PHY=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -50,6 +48,7 @@ CONFIG_I2C_MPC=y CONFIG_WATCHDOG=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_DS1374=y +CONFIG_QUICC_ENGINE=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y CONFIG_PROC_KCORE=y @@ -59,4 +58,3 @@ CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/83xx/mpc832x_rdb_defconfig b/arch/powerpc/configs/83xx/mpc832x_rdb_defconfig index 917a49ca2bd1..a484eb8401e8 100644 --- a/arch/powerpc/configs/83xx/mpc832x_rdb_defconfig +++ b/arch/powerpc/configs/83xx/mpc832x_rdb_defconfig @@ -14,7 +14,7 @@ CONFIG_LDM_PARTITION=y # CONFIG_PPC_PMAC is not set CONFIG_PPC_83xx=y CONFIG_MPC832x_RDB=y -CONFIG_QUICC_ENGINE=y +CONFIG_GEN_RTC=y CONFIG_MATH_EMULATION=y CONFIG_PCI=y CONFIG_NET=y @@ -38,7 +38,6 @@ CONFIG_NETDEVICES=y CONFIG_UCC_GETH=y CONFIG_E1000=y CONFIG_ICPLUS_PHY=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -46,7 +45,6 @@ CONFIG_ICPLUS_PHY=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_HW_RANDOM=y -CONFIG_GEN_RTC=y CONFIG_I2C=y CONFIG_I2C_CHARDEV=y CONFIG_I2C_MPC=y @@ -62,6 +60,7 @@ CONFIG_USB_OHCI_HCD_PPC_OF_BE=y CONFIG_USB_STORAGE=y CONFIG_MMC=y CONFIG_MMC_SPI=y +CONFIG_QUICC_ENGINE=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y CONFIG_MSDOS_FS=y @@ -78,4 +77,3 @@ CONFIG_NLS_ISO8859_1=y CONFIG_CRC_T10DIF=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/83xx/mpc834x_itx_defconfig b/arch/powerpc/configs/83xx/mpc834x_itx_defconfig index 00f636e95cc8..37f4d93b3f81 100644 --- a/arch/powerpc/configs/83xx/mpc834x_itx_defconfig +++ b/arch/powerpc/configs/83xx/mpc834x_itx_defconfig @@ -49,7 +49,6 @@ CONFIG_MD_RAID1=y CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_CICADA_PHY=y -CONFIG_FIXED_PHY=y # CONFIG_INPUT is not set # CONFIG_SERIO is not set # CONFIG_VT is not set @@ -84,4 +83,3 @@ CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y CONFIG_CRC_T10DIF=y CONFIG_CRYPTO_PCBC=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/83xx/mpc834x_itxgp_defconfig b/arch/powerpc/configs/83xx/mpc834x_itxgp_defconfig index a539d44d1dba..7adb6708a761 100644 --- a/arch/powerpc/configs/83xx/mpc834x_itxgp_defconfig +++ b/arch/powerpc/configs/83xx/mpc834x_itxgp_defconfig @@ -75,4 +75,3 @@ CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y CONFIG_CRC_T10DIF=y CONFIG_CRYPTO_PCBC=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/83xx/mpc834x_mds_defconfig b/arch/powerpc/configs/83xx/mpc834x_mds_defconfig index 9f0ddc830c82..d7ce3551529d 100644 --- a/arch/powerpc/configs/83xx/mpc834x_mds_defconfig +++ b/arch/powerpc/configs/83xx/mpc834x_mds_defconfig @@ -35,7 +35,6 @@ CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_E100=y CONFIG_MARVELL_PHY=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -58,4 +57,3 @@ CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/83xx/mpc836x_mds_defconfig b/arch/powerpc/configs/83xx/mpc836x_mds_defconfig index ceed4c1f0ab5..92134cee3f37 100644 --- a/arch/powerpc/configs/83xx/mpc836x_mds_defconfig +++ b/arch/powerpc/configs/83xx/mpc836x_mds_defconfig @@ -14,7 +14,6 @@ CONFIG_PARTITION_ADVANCED=y # CONFIG_PPC_PMAC is not set CONFIG_PPC_83xx=y CONFIG_MPC836x_MDS=y -CONFIG_QUICC_ENGINE=y CONFIG_PCI=y CONFIG_NET=y CONFIG_PACKET=y @@ -41,7 +40,6 @@ CONFIG_SCSI=y CONFIG_NETDEVICES=y CONFIG_UCC_GETH=y CONFIG_MARVELL_PHY=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -55,6 +53,7 @@ CONFIG_I2C_MPC=y CONFIG_WATCHDOG=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_DS1374=y +CONFIG_QUICC_ENGINE=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y CONFIG_PROC_KCORE=y @@ -64,4 +63,3 @@ CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/83xx/mpc836x_rdk_defconfig b/arch/powerpc/configs/83xx/mpc836x_rdk_defconfig index a6819bf3ef5e..97f7ea5f205f 100644 --- a/arch/powerpc/configs/83xx/mpc836x_rdk_defconfig +++ b/arch/powerpc/configs/83xx/mpc836x_rdk_defconfig @@ -12,7 +12,6 @@ CONFIG_PARTITION_ADVANCED=y # CONFIG_PPC_PMAC is not set CONFIG_PPC_83xx=y CONFIG_MPC836x_RDK=y -CONFIG_QUICC_ENGINE=y CONFIG_QE_GPIO=y CONFIG_PCI=y CONFIG_NET=y @@ -39,11 +38,9 @@ CONFIG_BLK_DEV_RAM_SIZE=32768 CONFIG_NETDEVICES=y CONFIG_UCC_GETH=y CONFIG_BROADCOM_PHY=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set -# CONFIG_DEVKMEM is not set CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_QE=y @@ -63,6 +60,7 @@ CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set # CONFIG_USB_SUPPORT is not set +CONFIG_QUICC_ENGINE=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y CONFIG_PROC_KCORE=y @@ -72,4 +70,3 @@ CONFIG_NFS_FS=y CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y CONFIG_PPC_EARLY_DEBUG=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/83xx/mpc837x_mds_defconfig b/arch/powerpc/configs/83xx/mpc837x_mds_defconfig index 4bd1992e4d98..ee7510a33d06 100644 --- a/arch/powerpc/configs/83xx/mpc837x_mds_defconfig +++ b/arch/powerpc/configs/83xx/mpc837x_mds_defconfig @@ -11,6 +11,7 @@ CONFIG_PARTITION_ADVANCED=y # CONFIG_PPC_PMAC is not set CONFIG_PPC_83xx=y CONFIG_MPC837x_MDS=y +CONFIG_GEN_RTC=y CONFIG_PCI=y CONFIG_NET=y CONFIG_PACKET=y @@ -35,7 +36,6 @@ CONFIG_SATA_FSL=y CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_MARVELL_PHY=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -43,7 +43,6 @@ CONFIG_MARVELL_PHY=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y # CONFIG_HW_RANDOM is not set -CONFIG_GEN_RTC=y CONFIG_I2C=y CONFIG_I2C_CHARDEV=y CONFIG_I2C_MPC=y @@ -58,4 +57,3 @@ CONFIG_ROOT_NFS=y CONFIG_CRC_T10DIF=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/83xx/mpc837x_rdb_defconfig b/arch/powerpc/configs/83xx/mpc837x_rdb_defconfig index 2d4bb63882b8..8966a9af4230 100644 --- a/arch/powerpc/configs/83xx/mpc837x_rdb_defconfig +++ b/arch/powerpc/configs/83xx/mpc837x_rdb_defconfig @@ -11,6 +11,7 @@ CONFIG_PARTITION_ADVANCED=y # CONFIG_PPC_PMAC is not set CONFIG_PPC_83xx=y CONFIG_MPC837x_RDB=y +CONFIG_GEN_RTC=y CONFIG_PCI=y CONFIG_NET=y CONFIG_PACKET=y @@ -41,9 +42,7 @@ CONFIG_MD_RAID456=y CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_MARVELL_PHY=y -CONFIG_FIXED_PHY=y CONFIG_INPUT_FF_MEMLESS=m -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -51,7 +50,6 @@ CONFIG_INPUT_FF_MEMLESS=m CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y # CONFIG_HW_RANDOM is not set -CONFIG_GEN_RTC=y CONFIG_I2C=y CONFIG_I2C_CHARDEV=y CONFIG_I2C_MPC=y @@ -86,4 +84,3 @@ CONFIG_CRC_T10DIF=y # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/83xx/sbc834x_defconfig b/arch/powerpc/configs/83xx/sbc834x_defconfig index b3380dbd1925..7d74699334da 100644 --- a/arch/powerpc/configs/83xx/sbc834x_defconfig +++ b/arch/powerpc/configs/83xx/sbc834x_defconfig @@ -11,6 +11,7 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_PPC_PMAC is not set CONFIG_PPC_83xx=y CONFIG_SBC834x=y +CONFIG_GEN_RTC=y CONFIG_PCI=y CONFIG_NET=y CONFIG_PACKET=y @@ -41,7 +42,6 @@ CONFIG_BLK_DEV_SD=y CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_BROADCOM_PHY=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -52,7 +52,6 @@ CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_NR_UARTS=2 CONFIG_SERIAL_8250_RUNTIME_UARTS=2 # CONFIG_HW_RANDOM is not set -CONFIG_GEN_RTC=y CONFIG_I2C=y CONFIG_I2C_CHARDEV=y CONFIG_I2C_MPC=y @@ -72,5 +71,4 @@ CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/configs/85xx/ge_imp3a_defconfig b/arch/powerpc/configs/85xx/ge_imp3a_defconfig index a917f7afb4f9..dd98f43b2fb8 100644 --- a/arch/powerpc/configs/85xx/ge_imp3a_defconfig +++ b/arch/powerpc/configs/85xx/ge_imp3a_defconfig @@ -22,7 +22,6 @@ CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set CONFIG_GE_IMP3A=y -CONFIG_QUICC_ENGINE=y CONFIG_QE_GPIO=y CONFIG_CPM2=y CONFIG_HIGHMEM=y @@ -161,6 +160,7 @@ CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_RX8581=y CONFIG_DMADEVICES=y CONFIG_FSL_DMA=y +CONFIG_QUICC_ENGINE=y CONFIG_EXT2_FS=y CONFIG_EXT2_FS_XATTR=y CONFIG_EXT2_FS_POSIX_ACL=y @@ -233,5 +233,4 @@ CONFIG_CRYPTO_CBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_DEV_TALITOS=y diff --git a/arch/powerpc/configs/85xx/ksi8560_defconfig b/arch/powerpc/configs/85xx/ksi8560_defconfig index bd814dfb0bbd..9ce6f48cfb61 100644 --- a/arch/powerpc/configs/85xx/ksi8560_defconfig +++ b/arch/powerpc/configs/85xx/ksi8560_defconfig @@ -8,6 +8,7 @@ CONFIG_PARTITION_ADVANCED=y # CONFIG_MSDOS_PARTITION is not set CONFIG_KSI8560=y CONFIG_CPM2=y +CONFIG_GEN_RTC=y CONFIG_HIGHMEM=y CONFIG_BINFMT_MISC=y CONFIG_MATH_EMULATION=y @@ -39,14 +40,12 @@ CONFIG_FS_ENET=y CONFIG_FS_ENET_MDIO_FCC=y CONFIG_GIANFAR=y CONFIG_MARVELL_PHY=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set # CONFIG_VT is not set CONFIG_SERIAL_CPM=y CONFIG_SERIAL_CPM_CONSOLE=y -CONFIG_GEN_RTC=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y CONFIG_PROC_KCORE=y @@ -57,4 +56,3 @@ CONFIG_DEBUG_FS=y CONFIG_DETECT_HUNG_TASK=y CONFIG_DEBUG_MUTEXES=y # CONFIG_DEBUG_BUGVERBOSE is not set -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/85xx/mpc8540_ads_defconfig b/arch/powerpc/configs/85xx/mpc8540_ads_defconfig index 32af10def641..5fbc3f904046 100644 --- a/arch/powerpc/configs/85xx/mpc8540_ads_defconfig +++ b/arch/powerpc/configs/85xx/mpc8540_ads_defconfig @@ -9,6 +9,7 @@ CONFIG_EXPERT=y CONFIG_PARTITION_ADVANCED=y # CONFIG_MSDOS_PARTITION is not set CONFIG_MPC8540_ADS=y +CONFIG_GEN_RTC=y CONFIG_BINFMT_MISC=y CONFIG_MATH_EMULATION=y # CONFIG_SECCOMP is not set @@ -30,7 +31,6 @@ CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=32768 CONFIG_NETDEVICES=y CONFIG_GIANFAR=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -38,7 +38,6 @@ CONFIG_GIANFAR=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y # CONFIG_HW_RANDOM is not set -CONFIG_GEN_RTC=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y CONFIG_PROC_KCORE=y @@ -47,4 +46,3 @@ CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y CONFIG_DETECT_HUNG_TASK=y CONFIG_DEBUG_MUTEXES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/85xx/mpc8560_ads_defconfig b/arch/powerpc/configs/85xx/mpc8560_ads_defconfig index a52b2170ee33..ff981d7905c7 100644 --- a/arch/powerpc/configs/85xx/mpc8560_ads_defconfig +++ b/arch/powerpc/configs/85xx/mpc8560_ads_defconfig @@ -7,6 +7,7 @@ CONFIG_EXPERT=y CONFIG_PARTITION_ADVANCED=y # CONFIG_MSDOS_PARTITION is not set CONFIG_MPC8560_ADS=y +CONFIG_GEN_RTC=y CONFIG_BINFMT_MISC=y CONFIG_MATH_EMULATION=y # CONFIG_SECCOMP is not set @@ -32,16 +33,14 @@ CONFIG_FS_ENET=y # CONFIG_FS_ENET_HAS_SCC is not set CONFIG_GIANFAR=y CONFIG_E1000=y -CONFIG_MARVELL_PHY=y CONFIG_DAVICOM_PHY=y -# CONFIG_INPUT_MOUSEDEV is not set +CONFIG_MARVELL_PHY=y # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set # CONFIG_VT is not set CONFIG_SERIAL_CPM=y CONFIG_SERIAL_CPM_CONSOLE=y -CONFIG_GEN_RTC=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y CONFIG_PROC_KCORE=y @@ -50,4 +49,3 @@ CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y CONFIG_DETECT_HUNG_TASK=y CONFIG_DEBUG_MUTEXES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/85xx/mpc85xx_cds_defconfig b/arch/powerpc/configs/85xx/mpc85xx_cds_defconfig index 002bb48abaa3..974f0706d777 100644 --- a/arch/powerpc/configs/85xx/mpc85xx_cds_defconfig +++ b/arch/powerpc/configs/85xx/mpc85xx_cds_defconfig @@ -9,6 +9,7 @@ CONFIG_EXPERT=y CONFIG_PARTITION_ADVANCED=y # CONFIG_MSDOS_PARTITION is not set CONFIG_MPC85xx_CDS=y +CONFIG_GEN_RTC=y CONFIG_BINFMT_MISC=y CONFIG_MATH_EMULATION=y # CONFIG_SECCOMP is not set @@ -35,7 +36,6 @@ CONFIG_BLK_DEV_VIA82CXXX=y CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_E1000=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -43,7 +43,6 @@ CONFIG_E1000=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y # CONFIG_HW_RANDOM is not set -CONFIG_GEN_RTC=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y CONFIG_PROC_KCORE=y @@ -52,4 +51,3 @@ CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y CONFIG_DETECT_HUNG_TASK=y CONFIG_DEBUG_MUTEXES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/85xx/sbc8548_defconfig b/arch/powerpc/configs/85xx/sbc8548_defconfig index 97ae02377cf3..7e3e84a842e4 100644 --- a/arch/powerpc/configs/85xx/sbc8548_defconfig +++ b/arch/powerpc/configs/85xx/sbc8548_defconfig @@ -6,6 +6,7 @@ CONFIG_EXPERT=y CONFIG_SLAB=y # CONFIG_BLK_DEV_BSG is not set CONFIG_SBC8548=y +CONFIG_GEN_RTC=y CONFIG_BINFMT_MISC=y CONFIG_MATH_EMULATION=y # CONFIG_SECCOMP is not set @@ -36,7 +37,6 @@ CONFIG_BLK_DEV_RAM=y CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_BROADCOM_PHY=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -44,10 +44,8 @@ CONFIG_BROADCOM_PHY=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y # CONFIG_HW_RANDOM is not set -CONFIG_GEN_RTC=y # CONFIG_USB_SUPPORT is not set CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/85xx/socrates_defconfig b/arch/powerpc/configs/85xx/socrates_defconfig index 13579cb30539..6106fadbbd8b 100644 --- a/arch/powerpc/configs/85xx/socrates_defconfig +++ b/arch/powerpc/configs/85xx/socrates_defconfig @@ -42,8 +42,6 @@ CONFIG_BLK_DEV_SD=y CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_MARVELL_PHY=y -CONFIG_INPUT_MOUSEDEV_SCREEN_X=800 -CONFIG_INPUT_MOUSEDEV_SCREEN_Y=480 CONFIG_INPUT_EVDEV=y # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set @@ -86,4 +84,3 @@ CONFIG_CRAMFS=y CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y CONFIG_FONTS=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/85xx/stx_gp3_defconfig b/arch/powerpc/configs/85xx/stx_gp3_defconfig index 384926f3ce1d..5b9cc01b9098 100644 --- a/arch/powerpc/configs/85xx/stx_gp3_defconfig +++ b/arch/powerpc/configs/85xx/stx_gp3_defconfig @@ -39,8 +39,6 @@ CONFIG_SCSI_CONSTANTS=y CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_MARVELL_PHY=y -CONFIG_INPUT_MOUSEDEV_SCREEN_X=1280 -CONFIG_INPUT_MOUSEDEV_SCREEN_Y=1024 CONFIG_INPUT_JOYDEV=m CONFIG_INPUT_EVDEV=m # CONFIG_VT is not set @@ -68,4 +66,3 @@ CONFIG_CRC_T10DIF=m CONFIG_DETECT_HUNG_TASK=y # CONFIG_DEBUG_BUGVERBOSE is not set CONFIG_BDI_SWITCH=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/85xx/tqm8540_defconfig b/arch/powerpc/configs/85xx/tqm8540_defconfig index 908f3885f4a5..98982a0e82d8 100644 --- a/arch/powerpc/configs/85xx/tqm8540_defconfig +++ b/arch/powerpc/configs/85xx/tqm8540_defconfig @@ -9,6 +9,7 @@ CONFIG_EXPERT=y CONFIG_PARTITION_ADVANCED=y # CONFIG_MSDOS_PARTITION is not set CONFIG_TQM8540=y +CONFIG_GEN_RTC=y CONFIG_MATH_EMULATION=y CONFIG_PCI=y CONFIG_NET=y @@ -35,14 +36,12 @@ CONFIG_BLK_DEV_VIA82CXXX=y CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_E100=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set # CONFIG_VT is not set CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_GEN_RTC=y CONFIG_I2C=y CONFIG_I2C_CHARDEV=y CONFIG_I2C_MPC=y @@ -56,4 +55,3 @@ CONFIG_JFFS2_FS=y CONFIG_CRAMFS=y CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/85xx/tqm8541_defconfig b/arch/powerpc/configs/85xx/tqm8541_defconfig index f47e57610b7c..a6e21db1dafe 100644 --- a/arch/powerpc/configs/85xx/tqm8541_defconfig +++ b/arch/powerpc/configs/85xx/tqm8541_defconfig @@ -9,6 +9,7 @@ CONFIG_EXPERT=y CONFIG_PARTITION_ADVANCED=y # CONFIG_MSDOS_PARTITION is not set CONFIG_TQM8541=y +CONFIG_GEN_RTC=y CONFIG_MATH_EMULATION=y CONFIG_PCI=y CONFIG_NET=y @@ -35,7 +36,6 @@ CONFIG_BLK_DEV_VIA82CXXX=y CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_E100=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -44,7 +44,6 @@ CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_CPM=y CONFIG_SERIAL_CPM_CONSOLE=y -CONFIG_GEN_RTC=y CONFIG_I2C=y CONFIG_I2C_CHARDEV=y CONFIG_I2C_MPC=y @@ -58,4 +57,3 @@ CONFIG_JFFS2_FS=y CONFIG_CRAMFS=y CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/85xx/tqm8548_defconfig b/arch/powerpc/configs/85xx/tqm8548_defconfig index 42f5d0a7698e..2697e4e8a761 100644 --- a/arch/powerpc/configs/85xx/tqm8548_defconfig +++ b/arch/powerpc/configs/85xx/tqm8548_defconfig @@ -43,7 +43,6 @@ CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=32768 CONFIG_NETDEVICES=y CONFIG_GIANFAR=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -66,4 +65,3 @@ CONFIG_ROOT_NFS=y CONFIG_DETECT_HUNG_TASK=y CONFIG_DEBUG_MUTEXES=y # CONFIG_DEBUG_BUGVERBOSE is not set -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/85xx/tqm8555_defconfig b/arch/powerpc/configs/85xx/tqm8555_defconfig index 71552b7929cd..ca1de3979474 100644 --- a/arch/powerpc/configs/85xx/tqm8555_defconfig +++ b/arch/powerpc/configs/85xx/tqm8555_defconfig @@ -9,6 +9,7 @@ CONFIG_EXPERT=y CONFIG_PARTITION_ADVANCED=y # CONFIG_MSDOS_PARTITION is not set CONFIG_TQM8555=y +CONFIG_GEN_RTC=y CONFIG_MATH_EMULATION=y CONFIG_PCI=y CONFIG_NET=y @@ -35,7 +36,6 @@ CONFIG_BLK_DEV_VIA82CXXX=y CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_E100=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -44,7 +44,6 @@ CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_CPM=y CONFIG_SERIAL_CPM_CONSOLE=y -CONFIG_GEN_RTC=y CONFIG_I2C=y CONFIG_I2C_CHARDEV=y CONFIG_I2C_MPC=y @@ -58,4 +57,3 @@ CONFIG_JFFS2_FS=y CONFIG_CRAMFS=y CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/85xx/tqm8560_defconfig b/arch/powerpc/configs/85xx/tqm8560_defconfig index 25aac973d6d7..ca3b8c8ef30f 100644 --- a/arch/powerpc/configs/85xx/tqm8560_defconfig +++ b/arch/powerpc/configs/85xx/tqm8560_defconfig @@ -9,6 +9,7 @@ CONFIG_EXPERT=y CONFIG_PARTITION_ADVANCED=y # CONFIG_MSDOS_PARTITION is not set CONFIG_TQM8560=y +CONFIG_GEN_RTC=y CONFIG_MATH_EMULATION=y CONFIG_PCI=y CONFIG_NET=y @@ -35,7 +36,6 @@ CONFIG_BLK_DEV_VIA82CXXX=y CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_E100=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -44,7 +44,6 @@ CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_CPM=y CONFIG_SERIAL_CPM_CONSOLE=y -CONFIG_GEN_RTC=y CONFIG_I2C=y CONFIG_I2C_CHARDEV=y CONFIG_I2C_MPC=y @@ -58,4 +57,3 @@ CONFIG_JFFS2_FS=y CONFIG_CRAMFS=y CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig b/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig index 72900b84d3e0..6531139a8a8d 100644 --- a/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig +++ b/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig @@ -54,7 +54,6 @@ CONFIG_IP_PIMSM_V2=y # CONFIG_INET_XFRM_MODE_TRANSPORT is not set # CONFIG_INET_XFRM_MODE_TUNNEL is not set # CONFIG_INET_XFRM_MODE_BEET is not set -CONFIG_IPV6=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y CONFIG_MTD_REDBOOT_PARTS=y @@ -86,7 +85,6 @@ CONFIG_DUMMY=y CONFIG_GIANFAR=y CONFIG_E1000=y CONFIG_BROADCOM_PHY=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set CONFIG_SERIO_LIBPS2=y @@ -107,8 +105,8 @@ CONFIG_SENSORS_LM90=y CONFIG_WATCHDOG=y CONFIG_USB=y CONFIG_USB_MON=y -CONFIG_USB_ISP1760=y CONFIG_USB_STORAGE=y +CONFIG_USB_ISP1760=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y CONFIG_LEDS_PCA955X=y @@ -143,4 +141,3 @@ CONFIG_DETECT_HUNG_TASK=y # CONFIG_DEBUG_BUGVERBOSE is not set CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_MD5=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/adder875_defconfig b/arch/powerpc/configs/adder875_defconfig index 6a3f825452e9..935ea3ade7de 100644 --- a/arch/powerpc/configs/adder875_defconfig +++ b/arch/powerpc/configs/adder875_defconfig @@ -12,6 +12,7 @@ CONFIG_PARTITION_ADVANCED=y # CONFIG_IOSCHED_CFQ is not set CONFIG_PPC_ADDER875=y CONFIG_8xx_COPYBACK=y +CONFIG_GEN_RTC=y CONFIG_HZ_1000=y # CONFIG_SECCOMP is not set CONFIG_NET=y @@ -41,7 +42,6 @@ CONFIG_DAVICOM_PHY=y # CONFIG_LEGACY_PTYS is not set CONFIG_SERIAL_CPM=y CONFIG_SERIAL_CPM_CONSOLE=y -CONFIG_GEN_RTC=y # CONFIG_HWMON is not set CONFIG_THERMAL=y # CONFIG_USB_SUPPORT is not set diff --git a/arch/powerpc/configs/amigaone_defconfig b/arch/powerpc/configs/amigaone_defconfig index 8d3e3c41258d..12f397d403c6 100644 --- a/arch/powerpc/configs/amigaone_defconfig +++ b/arch/powerpc/configs/amigaone_defconfig @@ -45,7 +45,6 @@ CONFIG_PARPORT_PC_FIFO=y CONFIG_BLK_DEV_FD=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y -CONFIG_SCSI=y CONFIG_BLK_DEV_SD=y CONFIG_CHR_DEV_ST=y CONFIG_BLK_DEV_SR=y @@ -120,5 +119,4 @@ CONFIG_XMON=y CONFIG_XMON_DEFAULT=y CONFIG_CRYPTO_CBC=m CONFIG_CRYPTO_PCBC=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/configs/be.config b/arch/powerpc/configs/be.config new file mode 100644 index 000000000000..c5cdc99a6530 --- /dev/null +++ b/arch/powerpc/configs/be.config @@ -0,0 +1 @@ +CONFIG_CPU_BIG_ENDIAN=y diff --git a/arch/powerpc/configs/c2k_defconfig b/arch/powerpc/configs/c2k_defconfig index 7c9d95370150..f1552af9eecc 100644 --- a/arch/powerpc/configs/c2k_defconfig +++ b/arch/powerpc/configs/c2k_defconfig @@ -27,6 +27,7 @@ CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_PERFORMANCE=y CONFIG_CPU_FREQ_GOV_POWERSAVE=m CONFIG_CPU_FREQ_GOV_ONDEMAND=m +CONFIG_GEN_RTC=y CONFIG_HIGHMEM=y CONFIG_PREEMPT_VOLUNTARY=y CONFIG_BINFMT_MISC=y @@ -197,7 +198,6 @@ CONFIG_TUN=m # CONFIG_ATM_DRIVERS is not set CONFIG_MV643XX_ETH=y CONFIG_VITESSE_PHY=y -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_EVDEV=y # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set @@ -209,7 +209,6 @@ CONFIG_SERIAL_NONSTANDARD=y CONFIG_SERIAL_MPSC=y CONFIG_SERIAL_MPSC_CONSOLE=y CONFIG_NVRAM=m -CONFIG_GEN_RTC=m CONFIG_RAW_DRIVER=y CONFIG_MAX_RAW_DEVS=8192 CONFIG_I2C=m @@ -390,7 +389,6 @@ CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y CONFIG_SECURITY_SELINUX_DISABLE=y -CONFIG_CRYPTO_NULL=m CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_SHA1=y @@ -402,4 +400,3 @@ CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/cell_defconfig b/arch/powerpc/configs/cell_defconfig index aa564599e368..560a93a84efe 100644 --- a/arch/powerpc/configs/cell_defconfig +++ b/arch/powerpc/configs/cell_defconfig @@ -4,7 +4,6 @@ CONFIG_ALTIVEC=y CONFIG_SMP=y CONFIG_NR_CPUS=4 CONFIG_SYSVIPC=y -CONFIG_FHANDLE=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_IKCONFIG=y @@ -34,10 +33,10 @@ CONFIG_CPU_FREQ_GOV_POWERSAVE=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y +CONFIG_GEN_RTC=y CONFIG_BINFMT_MISC=m CONFIG_IRQ_ALL_CPUS=y CONFIG_NUMA=y -CONFIG_MEMORY_HOTREMOVE=y CONFIG_PPC_64K_PAGES=y CONFIG_SCHED_SMT=y CONFIG_PCIEPORTBUS=y @@ -53,7 +52,6 @@ CONFIG_IP_PNP_RARP=y CONFIG_NET_IPIP=y CONFIG_SYN_COOKIES=y # CONFIG_INET_XFRM_MODE_BEET is not set -CONFIG_IPV6=y CONFIG_INET6_AH=m CONFIG_INET6_ESP=m CONFIG_INET6_IPCOMP=m @@ -141,7 +139,6 @@ CONFIG_SKY2=m CONFIG_GELIC_NET=m CONFIG_GELIC_WIRELESS=y CONFIG_SPIDER_NET=y -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO_I8042 is not set @@ -149,8 +146,6 @@ CONFIG_SPIDER_NET=y CONFIG_SERIAL_NONSTANDARD=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_SERIAL_TXX9_NR_UARTS=2 -CONFIG_SERIAL_TXX9_CONSOLE=y CONFIG_SERIAL_OF_PLATFORM=y CONFIG_HVC_RTAS=y CONFIG_IPMI_HANDLER=m @@ -159,7 +154,6 @@ CONFIG_IPMI_SI=m CONFIG_IPMI_WATCHDOG=m CONFIG_IPMI_POWEROFF=m # CONFIG_HW_RANDOM is not set -CONFIG_GEN_RTC=y CONFIG_I2C=y CONFIG_WATCHDOG=y # CONFIG_VGA_CONSOLE is not set @@ -207,7 +201,6 @@ CONFIG_NLS_ISO8859_13=m CONFIG_NLS_ISO8859_14=m CONFIG_NLS_ISO8859_15=m # CONFIG_ENABLE_MUST_CHECK is not set -CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_MUTEXES=y diff --git a/arch/powerpc/configs/chrp32_defconfig b/arch/powerpc/configs/chrp32_defconfig index 1f6f90cd8aff..a203b1cf67d3 100644 --- a/arch/powerpc/configs/chrp32_defconfig +++ b/arch/powerpc/configs/chrp32_defconfig @@ -16,6 +16,7 @@ CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_PARTITION_ADVANCED=y CONFIG_MAC_PARTITION=y # CONFIG_PPC_PMAC is not set +CONFIG_GEN_RTC=y CONFIG_HIGHMEM=y CONFIG_BINFMT_MISC=y CONFIG_IRQ_ALL_CPUS=y @@ -79,7 +80,6 @@ CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y # CONFIG_HW_RANDOM is not set CONFIG_NVRAM=y -CONFIG_GEN_RTC=y # CONFIG_HWMON is not set CONFIG_FB=y CONFIG_FIRMWARE_EDID=y @@ -124,5 +124,4 @@ CONFIG_XMON=y CONFIG_XMON_DEFAULT=y CONFIG_CRYPTO_CBC=m CONFIG_CRYPTO_PCBC=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/configs/ep8248e_defconfig b/arch/powerpc/configs/ep8248e_defconfig index 3403b85f9d81..2e6c8a45ae88 100644 --- a/arch/powerpc/configs/ep8248e_defconfig +++ b/arch/powerpc/configs/ep8248e_defconfig @@ -70,5 +70,4 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/configs/ep88xc_defconfig b/arch/powerpc/configs/ep88xc_defconfig index 95411aeeeb8d..7cb590e8f8fd 100644 --- a/arch/powerpc/configs/ep88xc_defconfig +++ b/arch/powerpc/configs/ep88xc_defconfig @@ -14,6 +14,7 @@ CONFIG_PARTITION_ADVANCED=y # CONFIG_IOSCHED_CFQ is not set CONFIG_PPC_EP88XC=y CONFIG_8xx_COPYBACK=y +CONFIG_GEN_RTC=y CONFIG_HZ_100=y # CONFIG_SECCOMP is not set CONFIG_NET=y @@ -45,7 +46,6 @@ CONFIG_LXT_PHY=y # CONFIG_LEGACY_PTYS is not set CONFIG_SERIAL_CPM=y CONFIG_SERIAL_CPM_CONSOLE=y -CONFIG_GEN_RTC=y # CONFIG_HWMON is not set # CONFIG_USB_SUPPORT is not set # CONFIG_DNOTIFY is not set diff --git a/arch/powerpc/configs/g5_defconfig b/arch/powerpc/configs/g5_defconfig index e18f2e06553f..e084fa548d73 100644 --- a/arch/powerpc/configs/g5_defconfig +++ b/arch/powerpc/configs/g5_defconfig @@ -4,7 +4,6 @@ CONFIG_SMP=y CONFIG_NR_CPUS=4 CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_FHANDLE=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_IKCONFIG=y @@ -25,6 +24,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_GOV_POWERSAVE=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_PMAC64=y +CONFIG_GEN_RTC=y CONFIG_KEXEC=y CONFIG_IRQ_ALL_CPUS=y CONFIG_PCI_MSI=y @@ -115,7 +115,6 @@ CONFIG_USB_USBNET=m # CONFIG_USB_NET_NET1080 is not set # CONFIG_USB_NET_CDC_SUBSET is not set # CONFIG_USB_NET_ZAURUS is not set -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_JOYDEV=m CONFIG_INPUT_EVDEV=y # CONFIG_KEYBOARD_ATKBD is not set @@ -123,7 +122,6 @@ CONFIG_INPUT_EVDEV=y # CONFIG_SERIO_I8042 is not set # CONFIG_SERIO_SERPORT is not set # CONFIG_HW_RANDOM is not set -CONFIG_GEN_RTC=y CONFIG_RAW_DRIVER=y CONFIG_I2C_CHARDEV=y CONFIG_AGP=m @@ -213,20 +211,20 @@ CONFIG_USB_SERIAL_CYBERJACK=m CONFIG_USB_SERIAL_XIRCOM=m CONFIG_USB_SERIAL_OMNINET=m CONFIG_USB_APPLEDISPLAY=m -CONFIG_FS_DAX=y CONFIG_EXT2_FS=y CONFIG_EXT2_FS_XATTR=y CONFIG_EXT2_FS_POSIX_ACL=y CONFIG_EXT2_FS_SECURITY=y +CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y -CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=y CONFIG_REISERFS_FS_XATTR=y CONFIG_REISERFS_FS_POSIX_ACL=y CONFIG_REISERFS_FS_SECURITY=y CONFIG_XFS_FS=m CONFIG_XFS_POSIX_ACL=y +CONFIG_FS_DAX=y CONFIG_ISO9660_FS=y CONFIG_JOLIET=y CONFIG_ZISOFS=y @@ -254,14 +252,12 @@ CONFIG_NLS_ISO8859_1=y CONFIG_NLS_ISO8859_15=y CONFIG_NLS_UTF8=y CONFIG_CRC_T10DIF=y -CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_MUTEXES=y CONFIG_LATENCYTOP=y CONFIG_BOOTX_TEXT=y CONFIG_PPC_EARLY_DEBUG=y -CONFIG_CRYPTO_NULL=m CONFIG_CRYPTO_TEST=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_HMAC=y @@ -276,5 +272,4 @@ CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/configs/gamecube_defconfig b/arch/powerpc/configs/gamecube_defconfig index c0eec4a5df4e..79bbc8238b32 100644 --- a/arch/powerpc/configs/gamecube_defconfig +++ b/arch/powerpc/configs/gamecube_defconfig @@ -45,7 +45,6 @@ CONFIG_BLK_DEV_RAM_COUNT=2 CONFIG_NETDEVICES=y # CONFIG_WLAN is not set CONFIG_INPUT_FF_MEMLESS=m -# CONFIG_INPUT_MOUSEDEV is not set CONFIG_INPUT_JOYDEV=y CONFIG_INPUT_EVDEV=y # CONFIG_KEYBOARD_ATKBD is not set @@ -54,7 +53,6 @@ CONFIG_INPUT_JOYSTICK=y # CONFIG_SERIO_I8042 is not set # CONFIG_SERIO_SERPORT is not set CONFIG_LEGACY_PTY_COUNT=64 -# CONFIG_DEVKMEM is not set # CONFIG_HW_RANDOM is not set # CONFIG_HWMON is not set CONFIG_FB=y diff --git a/arch/powerpc/configs/holly_defconfig b/arch/powerpc/configs/holly_defconfig index e56e80090529..71d8d2430b6c 100644 --- a/arch/powerpc/configs/holly_defconfig +++ b/arch/powerpc/configs/holly_defconfig @@ -11,6 +11,7 @@ CONFIG_PARTITION_ADVANCED=y # CONFIG_PPC_PMAC is not set CONFIG_EMBEDDED6xx=y CONFIG_PPC_HOLLY=y +CONFIG_GEN_RTC=y CONFIG_BINFMT_MISC=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE="console=ttyS0,115200" @@ -37,7 +38,6 @@ CONFIG_NETDEVICES=y CONFIG_VORTEX=y CONFIG_TSI108_ETH=y CONFIG_PHYLIB=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -49,7 +49,6 @@ CONFIG_SERIAL_8250_EXTENDED=y CONFIG_SERIAL_8250_SHARE_IRQ=y CONFIG_SERIAL_OF_PLATFORM=y # CONFIG_HW_RANDOM is not set -CONFIG_GEN_RTC=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y CONFIG_PROC_KCORE=y diff --git a/arch/powerpc/configs/linkstation_defconfig b/arch/powerpc/configs/linkstation_defconfig index b413c19d7031..477794c41d50 100644 --- a/arch/powerpc/configs/linkstation_defconfig +++ b/arch/powerpc/configs/linkstation_defconfig @@ -26,7 +26,6 @@ CONFIG_IP_PNP_BOOTP=y # CONFIG_IPV6 is not set CONFIG_NETFILTER=y CONFIG_NF_CONNTRACK=m -CONFIG_NF_CT_PROTO_SCTP=m CONFIG_NF_CONNTRACK_AMANDA=m CONFIG_NF_CONNTRACK_FTP=m CONFIG_NF_CONNTRACK_H323=m @@ -79,7 +78,6 @@ CONFIG_NET_TULIP=y CONFIG_TULIP=y CONFIG_TULIP_MMIO=y CONFIG_R8169=y -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_EVDEV=m # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set @@ -142,4 +140,3 @@ CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TWOFISH=m CONFIG_CRYPTO_DEFLATE=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/maple_defconfig b/arch/powerpc/configs/maple_defconfig index c4018179e219..078cdb427fc9 100644 --- a/arch/powerpc/configs/maple_defconfig +++ b/arch/powerpc/configs/maple_defconfig @@ -3,7 +3,6 @@ CONFIG_SMP=y CONFIG_NR_CPUS=4 CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_FHANDLE=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_IKCONFIG=y @@ -24,6 +23,7 @@ CONFIG_MAC_PARTITION=y # CONFIG_PPC_PMAC is not set CONFIG_PPC_MAPLE=y CONFIG_UDBG_RTAS_CONSOLE=y +CONFIG_GEN_RTC=y CONFIG_KEXEC=y CONFIG_IRQ_ALL_CPUS=y CONFIG_PCI_MSI=y @@ -53,9 +53,6 @@ CONFIG_AMD8111_ETH=y CONFIG_TIGON3=y CONFIG_E1000=y CONFIG_USB_PEGASUS=y -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set -CONFIG_INPUT_MOUSEDEV_SCREEN_X=1600 -CONFIG_INPUT_MOUSEDEV_SCREEN_Y=1200 # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -63,7 +60,6 @@ CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_HVC_RTAS=y # CONFIG_HW_RANDOM is not set -CONFIG_GEN_RTC=y CONFIG_I2C=y CONFIG_I2C_CHARDEV=y CONFIG_I2C_AMD8111=y @@ -100,8 +96,8 @@ CONFIG_USB_SERIAL_KEYSPAN_USA49W=y CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y CONFIG_USB_SERIAL_TI=m CONFIG_EXT2_FS=y -CONFIG_FS_DAX=y CONFIG_EXT4_FS=y +CONFIG_FS_DAX=y CONFIG_MSDOS_FS=y CONFIG_VFAT_FS=y CONFIG_PROC_KCORE=y @@ -127,5 +123,4 @@ CONFIG_BOOTX_TEXT=y CONFIG_PPC_EARLY_DEBUG=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/configs/mgcoge_defconfig b/arch/powerpc/configs/mgcoge_defconfig index 197acaa026eb..5d5f08e5b8d9 100644 --- a/arch/powerpc/configs/mgcoge_defconfig +++ b/arch/powerpc/configs/mgcoge_defconfig @@ -46,7 +46,6 @@ CONFIG_BLK_DEV_RAM=y CONFIG_NETDEVICES=y CONFIG_FS_ENET=y CONFIG_FS_ENET_MDIO_FCC=y -CONFIG_FIXED_PHY=y # CONFIG_WLAN is not set # CONFIG_INPUT is not set # CONFIG_SERIO is not set @@ -83,5 +82,4 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_BDI_SWITCH=y CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/configs/mpc512x_defconfig b/arch/powerpc/configs/mpc512x_defconfig index 0b4854cf26cb..10be5773ad5d 100644 --- a/arch/powerpc/configs/mpc512x_defconfig +++ b/arch/powerpc/configs/mpc512x_defconfig @@ -12,6 +12,7 @@ CONFIG_PARTITION_ADVANCED=y # CONFIG_IOSCHED_CFQ is not set # CONFIG_PPC_CHRP is not set CONFIG_PPC_MPC512x=y +CONFIG_MPC512x_LPBFIFO=y CONFIG_MPC5121_ADS=y CONFIG_MPC512x_GENERIC=y CONFIG_PDM360NG=y @@ -61,25 +62,22 @@ CONFIG_BLK_DEV_SD=y CONFIG_CHR_DEV_SG=y CONFIG_NETDEVICES=y CONFIG_FS_ENET=y -CONFIG_MARVELL_PHY=y -CONFIG_DAVICOM_PHY=y -CONFIG_QSEMI_PHY=y -CONFIG_LXT_PHY=y -CONFIG_CICADA_PHY=y -CONFIG_VITESSE_PHY=y -CONFIG_SMSC_PHY=y +CONFIG_MDIO_BITBANG=y CONFIG_BROADCOM_PHY=y +CONFIG_CICADA_PHY=y +CONFIG_DAVICOM_PHY=y CONFIG_ICPLUS_PHY=y -CONFIG_REALTEK_PHY=y +CONFIG_LSI_ET1011C_PHY=y +CONFIG_LXT_PHY=y +CONFIG_MARVELL_PHY=y CONFIG_NATIONAL_PHY=y +CONFIG_QSEMI_PHY=y +CONFIG_REALTEK_PHY=y +CONFIG_SMSC_PHY=y CONFIG_STE10XP=y -CONFIG_LSI_ET1011C_PHY=y -CONFIG_FIXED_PHY=y -CONFIG_MDIO_BITBANG=y +CONFIG_VITESSE_PHY=y # CONFIG_WLAN is not set -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_EVDEV=y -# CONFIG_DEVKMEM is not set CONFIG_SERIAL_MPC52xx=y CONFIG_SERIAL_MPC52xx_CONSOLE=y CONFIG_SERIAL_MPC52xx_CONSOLE_BAUD=115200 @@ -111,10 +109,9 @@ CONFIG_RTC_DRV_M41T80=y CONFIG_RTC_DRV_MPC5121=y CONFIG_DMADEVICES=y CONFIG_MPC512X_DMA=y -CONFIG_MPC512x_LPBFIFO=y -CONFIG_FS_DAX=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y +CONFIG_FS_DAX=y # CONFIG_DNOTIFY is not set CONFIG_VFAT_FS=y CONFIG_TMPFS=y @@ -126,5 +123,4 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set -# CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/configs/mpc5200_defconfig b/arch/powerpc/configs/mpc5200_defconfig index 88336d0df0d6..7a2b2aa37def 100644 --- a/arch/powerpc/configs/mpc5200_defconfig +++ b/arch/powerpc/configs/mpc5200_defconfig @@ -50,7 +50,6 @@ CONFIG_NETDEVICES=y CONFIG_FEC_MPC52xx=y CONFIG_AMD_PHY=y CONFIG_LXT_PHY=y -CONFIG_FIXED_PHY=y # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -71,12 +70,10 @@ CONFIG_SENSORS_LM87=m CONFIG_WATCHDOG=y CONFIG_MFD_SM501=m CONFIG_DRM=y -CONFIG_FB=y CONFIG_FB_FOREIGN_ENDIAN=y CONFIG_FB_RADEON=y CONFIG_FB_SM501=m # CONFIG_VGA_CONSOLE is not set -CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_SOUND=y CONFIG_SND=y @@ -130,4 +127,3 @@ CONFIG_PRINTK_TIME=y CONFIG_DEBUG_INFO=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/mpc7448_hpc2_defconfig b/arch/powerpc/configs/mpc7448_hpc2_defconfig index d933326b4cf9..4b14c02b437c 100644 --- a/arch/powerpc/configs/mpc7448_hpc2_defconfig +++ b/arch/powerpc/configs/mpc7448_hpc2_defconfig @@ -11,6 +11,7 @@ CONFIG_PARTITION_ADVANCED=y # CONFIG_PPC_PMAC is not set CONFIG_EMBEDDED6xx=y CONFIG_MPC7448HPC2=y +CONFIG_GEN_RTC=y CONFIG_BINFMT_MISC=y # CONFIG_SECCOMP is not set CONFIG_NET=y @@ -38,7 +39,6 @@ CONFIG_8139TOO=y # CONFIG_8139TOO_PIO is not set CONFIG_TSI108_ETH=y CONFIG_PHYLIB=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -46,7 +46,6 @@ CONFIG_PHYLIB=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y # CONFIG_HW_RANDOM is not set -CONFIG_GEN_RTC=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y CONFIG_PROC_KCORE=y @@ -54,4 +53,3 @@ CONFIG_TMPFS=y CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y CONFIG_CRC_T10DIF=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/mpc8272_ads_defconfig b/arch/powerpc/configs/mpc8272_ads_defconfig index 4cb0f617c0d6..b1e88b64536b 100644 --- a/arch/powerpc/configs/mpc8272_ads_defconfig +++ b/arch/powerpc/configs/mpc8272_ads_defconfig @@ -77,5 +77,4 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/configs/mpc83xx_defconfig b/arch/powerpc/configs/mpc83xx_defconfig index 6574477fd726..d1b82035d35f 100644 --- a/arch/powerpc/configs/mpc83xx_defconfig +++ b/arch/powerpc/configs/mpc83xx_defconfig @@ -21,7 +21,6 @@ CONFIG_MPC837x_MDS=y CONFIG_MPC837x_RDB=y CONFIG_SBC834x=y CONFIG_ASP834x=y -CONFIG_QUICC_ENGINE=y CONFIG_QE_GPIO=y CONFIG_MATH_EMULATION=y CONFIG_PCI=y @@ -60,13 +59,11 @@ CONFIG_SATA_SIL=y CONFIG_NETDEVICES=y CONFIG_UCC_GETH=y CONFIG_GIANFAR=y -CONFIG_MARVELL_PHY=y CONFIG_DAVICOM_PHY=y -CONFIG_VITESSE_PHY=y CONFIG_ICPLUS_PHY=y -CONFIG_FIXED_PHY=y +CONFIG_MARVELL_PHY=y +CONFIG_VITESSE_PHY=y CONFIG_INPUT_FF_MEMLESS=m -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -99,6 +96,7 @@ CONFIG_USB_EHCI_FSL=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_DS1307=y CONFIG_RTC_DRV_DS1374=y +CONFIG_QUICC_ENGINE=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y CONFIG_PROC_KCORE=y @@ -109,7 +107,5 @@ CONFIG_ROOT_NFS=y CONFIG_CRC_T10DIF=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_SHA512=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_DEV_TALITOS=y diff --git a/arch/powerpc/configs/mpc866_ads_defconfig b/arch/powerpc/configs/mpc866_ads_defconfig index 998454471a48..f1f176c29fa3 100644 --- a/arch/powerpc/configs/mpc866_ads_defconfig +++ b/arch/powerpc/configs/mpc866_ads_defconfig @@ -14,6 +14,7 @@ CONFIG_PARTITION_ADVANCED=y CONFIG_MPC86XADS=y CONFIG_8xx_COPYBACK=y CONFIG_8xx_CPU6=y +CONFIG_GEN_RTC=y CONFIG_HZ_1000=y CONFIG_MATH_EMULATION=y # CONFIG_SECCOMP is not set @@ -28,12 +29,10 @@ CONFIG_SYN_COOKIES=y CONFIG_BLK_DEV_LOOP=y CONFIG_NETDEVICES=y CONFIG_FS_ENET=y -CONFIG_FIXED_PHY=y # CONFIG_VT is not set # CONFIG_LEGACY_PTYS is not set CONFIG_SERIAL_CPM=y CONFIG_SERIAL_CPM_CONSOLE=y -CONFIG_GEN_RTC=y CONFIG_EXT2_FS=y CONFIG_EXT2_FS_XATTR=y CONFIG_EXT4_FS=y @@ -43,4 +42,3 @@ CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y CONFIG_CRC_CCITT=y CONFIG_CRC32_SLICEBY4=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/mpc86xx_basic_defconfig b/arch/powerpc/configs/mpc86xx_basic_defconfig index 3283f0586e11..67bd1fa036ee 100644 --- a/arch/powerpc/configs/mpc86xx_basic_defconfig +++ b/arch/powerpc/configs/mpc86xx_basic_defconfig @@ -1,11 +1,11 @@ -CONFIG_HIGHMEM=y -CONFIG_KEXEC=y CONFIG_PPC_86xx=y -CONFIG_PROC_KCORE=y +CONFIG_MPC8641_HPCN=y +CONFIG_SBC8641D=y +CONFIG_MPC8610_HPCD=y CONFIG_GEF_PPC9A=y CONFIG_GEF_SBC310=y CONFIG_GEF_SBC610=y -CONFIG_MPC8610_HPCD=y -CONFIG_MPC8641_HPCN=y -CONFIG_SBC8641D=y CONFIG_MVME7100=y +CONFIG_HIGHMEM=y +CONFIG_KEXEC=y +CONFIG_PROC_KCORE=y diff --git a/arch/powerpc/configs/mpc885_ads_defconfig b/arch/powerpc/configs/mpc885_ads_defconfig index 91f53f1bec5d..ec3fcc2bf737 100644 --- a/arch/powerpc/configs/mpc885_ads_defconfig +++ b/arch/powerpc/configs/mpc885_ads_defconfig @@ -13,6 +13,7 @@ CONFIG_EXPERT=y CONFIG_PARTITION_ADVANCED=y # CONFIG_IOSCHED_CFQ is not set CONFIG_8xx_COPYBACK=y +CONFIG_GEN_RTC=y CONFIG_HZ_100=y # CONFIG_SECCOMP is not set CONFIG_NET=y @@ -51,7 +52,6 @@ CONFIG_DAVICOM_PHY=y # CONFIG_LEGACY_PTYS is not set CONFIG_SERIAL_CPM=y CONFIG_SERIAL_CPM_CONSOLE=y -CONFIG_GEN_RTC=y # CONFIG_HWMON is not set # CONFIG_USB_SUPPORT is not set # CONFIG_DNOTIFY is not set diff --git a/arch/powerpc/configs/mvme5100_defconfig b/arch/powerpc/configs/mvme5100_defconfig index 139add95a16a..63e38c7220f1 100644 --- a/arch/powerpc/configs/mvme5100_defconfig +++ b/arch/powerpc/configs/mvme5100_defconfig @@ -35,7 +35,6 @@ CONFIG_IP_PNP_BOOTP=y # CONFIG_IPV6 is not set CONFIG_NETFILTER=y CONFIG_NF_CONNTRACK=m -CONFIG_NF_CT_PROTO_SCTP=m CONFIG_NF_CONNTRACK_AMANDA=m CONFIG_NF_CONNTRACK_FTP=m CONFIG_NF_CONNTRACK_H323=m @@ -70,7 +69,6 @@ CONFIG_TUN=m # CONFIG_NET_VENDOR_3COM is not set CONFIG_E100=y # CONFIG_WLAN is not set -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -130,4 +128,3 @@ CONFIG_CRYPTO_DES=y CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TWOFISH=m CONFIG_CRYPTO_DEFLATE=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/pasemi_defconfig b/arch/powerpc/configs/pasemi_defconfig index fe43ff47bd2f..8cf4a46bef86 100644 --- a/arch/powerpc/configs/pasemi_defconfig +++ b/arch/powerpc/configs/pasemi_defconfig @@ -3,7 +3,6 @@ CONFIG_ALTIVEC=y CONFIG_SMP=y CONFIG_NR_CPUS=2 CONFIG_SYSVIPC=y -CONFIG_FHANDLE=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_BLK_DEV_INITRD=y @@ -145,6 +144,7 @@ CONFIG_EDAC=y CONFIG_EDAC_PASEMI=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_DS1307=y +CONFIG_RAS=y CONFIG_EXT2_FS=y CONFIG_EXT2_FS_XATTR=y CONFIG_EXT2_FS_POSIX_ACL=y @@ -167,7 +167,6 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_CRC_CCITT=y CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y @@ -175,7 +174,5 @@ CONFIG_DETECT_HUNG_TASK=y CONFIG_XMON=y CONFIG_XMON_DEFAULT=y CONFIG_CRYPTO_MD4=y -CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_SHA512=y CONFIG_CRYPTO_BLOWFISH=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig index fc1e7a7388b8..8e798b1fbc99 100644 --- a/arch/powerpc/configs/pmac32_defconfig +++ b/arch/powerpc/configs/pmac32_defconfig @@ -21,6 +21,7 @@ CONFIG_CPU_FREQ_GOV_POWERSAVE=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_PMAC=y CONFIG_PPC601_SYNC_FIX=y +CONFIG_GEN_RTC=y CONFIG_HIGHMEM=y CONFIG_BINFMT_MISC=m CONFIG_HIBERNATION=y @@ -179,10 +180,10 @@ CONFIG_PPP_ASYNC=y CONFIG_PPP_SYNC_TTY=m CONFIG_USB_USBNET=m # CONFIG_USB_NET_CDC_SUBSET is not set -CONFIG_PRISM54=m CONFIG_B43=m CONFIG_B43LEGACY=m CONFIG_P54_COMMON=m +CONFIG_PRISM54=m CONFIG_INPUT_EVDEV=y # CONFIG_KEYBOARD_ATKBD is not set # CONFIG_MOUSE_PS2 is not set @@ -193,7 +194,6 @@ CONFIG_SERIAL_8250=m CONFIG_SERIAL_PMACZILOG=m CONFIG_SERIAL_PMACZILOG_TTYS=y CONFIG_NVRAM=y -CONFIG_GEN_RTC=y CONFIG_I2C_CHARDEV=m CONFIG_APM_POWER=y CONFIG_BATTERY_PMU=y @@ -201,8 +201,9 @@ CONFIG_HWMON=m CONFIG_AGP=m CONFIG_AGP_UNINORTH=m CONFIG_DRM=m -CONFIG_DRM_R128=m CONFIG_DRM_RADEON=m +CONFIG_DRM_LEGACY=y +CONFIG_DRM_R128=m CONFIG_FB=y CONFIG_FB_OF=y CONFIG_FB_CONTROL=y @@ -300,8 +301,6 @@ CONFIG_NFSD_V4=y CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_ISO8859_1=m CONFIG_CRC_T10DIF=y -CONFIG_LIBCRC32C=m -CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y @@ -310,7 +309,6 @@ CONFIG_XMON=y CONFIG_XMON_DEFAULT=y CONFIG_BOOTX_TEXT=y CONFIG_PPC_EARLY_DEBUG=y -CONFIG_CRYPTO_NULL=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_SHA512=m @@ -325,4 +323,3 @@ CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m CONFIG_CRYPTO_DEFLATE=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index 34fc9bbfca9e..caee834760d2 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -1,10 +1,8 @@ CONFIG_PPC64=y -CONFIG_SMP=y CONFIG_NR_CPUS=2048 CONFIG_CPU_LITTLE_ENDIAN=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_FHANDLE=y CONFIG_AUDIT=y CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ=y @@ -26,8 +24,8 @@ CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_BPF=y CONFIG_CGROUP_PERF=y +CONFIG_CGROUP_BPF=y CONFIG_USER_NS=y CONFIG_BLK_DEV_INITRD=y CONFIG_BPF_SYSCALL=y @@ -62,7 +60,6 @@ CONFIG_PPC_64K_PAGES=y CONFIG_PPC_SUBPAGE_PROT=y CONFIG_SCHED_SMT=y CONFIG_PM=y -CONFIG_PCI_MSI=y CONFIG_HOTPLUG_PCI=y CONFIG_NET=y CONFIG_PACKET=y @@ -158,7 +155,6 @@ CONFIG_NETCONSOLE=y CONFIG_TUN=m CONFIG_VETH=m CONFIG_VIRTIO_NET=m -CONFIG_VHOST_NET=m CONFIG_VORTEX=m CONFIG_ACENIC=m CONFIG_ACENIC_OMIT_TIGON_I=y @@ -184,16 +180,13 @@ CONFIG_PPP_DEFLATE=m CONFIG_PPPOE=m CONFIG_PPP_ASYNC=m CONFIG_PPP_SYNC_TTY=m -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_EVDEV=m CONFIG_INPUT_MISC=y # CONFIG_SERIO_SERPORT is not set -CONFIG_DEVPTS_MULTIPLE_INSTANCES=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_JSM=m CONFIG_VIRTIO_CONSOLE=m -CONFIG_POWERNV_OP_PANEL=m CONFIG_IPMI_HANDLER=y CONFIG_IPMI_DEVICE_INTERFACE=y CONFIG_IPMI_POWERNV=y @@ -296,9 +289,12 @@ CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_HARDLOCKUP_DETECTOR=y CONFIG_LATENCYTOP=y +CONFIG_FTRACE=y +CONFIG_FUNCTION_TRACER=y +CONFIG_FUNCTION_GRAPH_TRACER=y CONFIG_SCHED_TRACER=y +CONFIG_FTRACE_SYSCALLS=y CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_UPROBE_EVENT=y CONFIG_CODE_PATCHING_SELFTEST=y CONFIG_FTR_FIXUP_SELFTEST=y CONFIG_MSI_BITMAP_SELFTEST=y @@ -310,6 +306,7 @@ CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_CRC32C_VPMSUM=m CONFIG_CRYPTO_MD5_PPC=m CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_SHA1_PPC=m CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m @@ -318,14 +315,13 @@ CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAST6=m CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SALSA20=m -CONFIG_CRYPTO_SHA1_PPC=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m CONFIG_CRYPTO_LZO=m CONFIG_CRYPTO_DEV_NX=y CONFIG_CRYPTO_DEV_VMX=y -CONFIG_CRYPTO_DEV_VMX_ENCRYPT=m CONFIG_VIRTUALIZATION=y CONFIG_KVM_BOOK3S_64=m CONFIG_KVM_BOOK3S_64_HV=m +CONFIG_VHOST_NET=m diff --git a/arch/powerpc/configs/ppc40x_defconfig b/arch/powerpc/configs/ppc40x_defconfig index 370c0bbcff71..10fb1df63b46 100644 --- a/arch/powerpc/configs/ppc40x_defconfig +++ b/arch/powerpc/configs/ppc40x_defconfig @@ -51,9 +51,9 @@ CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_EXTENDED=y CONFIG_SERIAL_8250_SHARE_IRQ=y +CONFIG_SERIAL_OF_PLATFORM=y CONFIG_SERIAL_UARTLITE=y CONFIG_SERIAL_UARTLITE_CONSOLE=y -CONFIG_SERIAL_OF_PLATFORM=y # CONFIG_HW_RANDOM is not set CONFIG_XILINX_HWICAP=m CONFIG_I2C=m @@ -85,4 +85,3 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/ppc44x_defconfig b/arch/powerpc/configs/ppc44x_defconfig index 2766e8f590bc..66dd6bf45cde 100644 --- a/arch/powerpc/configs/ppc44x_defconfig +++ b/arch/powerpc/configs/ppc44x_defconfig @@ -68,9 +68,9 @@ CONFIG_SERIAL_8250_CONSOLE=y # CONFIG_SERIAL_8250_PCI is not set CONFIG_SERIAL_8250_EXTENDED=y CONFIG_SERIAL_8250_SHARE_IRQ=y +CONFIG_SERIAL_OF_PLATFORM=y CONFIG_SERIAL_UARTLITE=y CONFIG_SERIAL_UARTLITE_CONSOLE=y -CONFIG_SERIAL_OF_PLATFORM=y # CONFIG_HW_RANDOM is not set CONFIG_XILINX_HWICAP=m CONFIG_I2C=m @@ -94,7 +94,6 @@ CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_JFFS2_FS=y CONFIG_UBIFS_FS=m -CONFIG_LOGFS=m CONFIG_CRAMFS=y CONFIG_SQUASHFS=m CONFIG_SQUASHFS_XATTR=y @@ -108,6 +107,5 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DETECT_HUNG_TASK=y CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set CONFIG_VIRTUALIZATION=y diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index c5246d29f385..791db775a09c 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -1,8 +1,6 @@ CONFIG_PPC64=y -CONFIG_SMP=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_FHANDLE=y CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y @@ -28,9 +26,10 @@ CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y CONFIG_PARTITION_ADVANCED=y CONFIG_PPC_SPLPAR=y +CONFIG_DTL=y CONFIG_SCANLOG=m CONFIG_PPC_SMLPAR=y -CONFIG_DTL=y +CONFIG_IBMEBUS=y CONFIG_PPC_MAPLE=y CONFIG_PPC_PASEMI=y CONFIG_PPC_PASEMI_IOMMU=y @@ -41,9 +40,8 @@ CONFIG_PS3_FLASH=m CONFIG_PS3_LPM=m CONFIG_PPC_IBM_CELL_BLADE=y CONFIG_RTAS_FLASH=m -CONFIG_IBMEBUS=y -CONFIG_CPU_FREQ_PMAC64=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y +CONFIG_CPU_FREQ_PMAC64=y CONFIG_HZ_100=y CONFIG_BINFMT_MISC=m CONFIG_PPC_TRANSACTIONAL_MEM=y @@ -51,14 +49,15 @@ CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y CONFIG_CRASH_DUMP=y CONFIG_IRQ_ALL_CPUS=y -CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y +CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_PPC_64K_PAGES=y CONFIG_SCHED_SMT=y -CONFIG_PCCARD=y -CONFIG_ELECTRA_CF=y CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_RPA=m CONFIG_HOTPLUG_PCI_RPA_DLPAR=m +CONFIG_PCCARD=y +CONFIG_ELECTRA_CF=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -154,7 +153,6 @@ CONFIG_DUMMY=m CONFIG_NETCONSOLE=y CONFIG_TUN=m CONFIG_VIRTIO_NET=m -CONFIG_VHOST_NET=m CONFIG_VORTEX=m CONFIG_ACENIC=m CONFIG_ACENIC_OMIT_TIGON_I=y @@ -181,15 +179,14 @@ CONFIG_SUNGEM=y CONFIG_GELIC_NET=m CONFIG_GELIC_WIRELESS=y CONFIG_SPIDER_NET=m -CONFIG_MARVELL_PHY=y CONFIG_BROADCOM_PHY=m +CONFIG_MARVELL_PHY=y CONFIG_PPP=m CONFIG_PPP_BSDCOMP=m CONFIG_PPP_DEFLATE=m CONFIG_PPPOE=m CONFIG_PPP_ASYNC=m CONFIG_PPP_SYNC_TTY=m -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_EVDEV=m CONFIG_INPUT_MISC=y CONFIG_INPUT_PCSPKR=m @@ -197,7 +194,6 @@ CONFIG_INPUT_PCSPKR=m CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_ICOM=m -CONFIG_SERIAL_TXX9_CONSOLE=y CONFIG_SERIAL_JSM=m CONFIG_HVC_CONSOLE=y CONFIG_HVC_RTAS=y @@ -250,6 +246,9 @@ CONFIG_USB_EHCI_HCD=y CONFIG_USB_OHCI_HCD=y CONFIG_USB_STORAGE=m CONFIG_USB_APPLEDISPLAY=m +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=m +CONFIG_LEDS_POWERNV=m CONFIG_INFINIBAND=m CONFIG_INFINIBAND_USER_MAD=m CONFIG_INFINIBAND_USER_ACCESS=m @@ -267,7 +266,7 @@ CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_DS1307=y CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m -CONFIG_FS_DAX=y +CONFIG_RAS=y CONFIG_EXT2_FS=y CONFIG_EXT2_FS_XATTR=y CONFIG_EXT2_FS_POSIX_ACL=y @@ -287,6 +286,7 @@ CONFIG_XFS_POSIX_ACL=y CONFIG_BTRFS_FS=m CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_NILFS2_FS=m +CONFIG_FS_DAX=y CONFIG_AUTOFS4_FS=m CONFIG_FUSE_FS=m CONFIG_OVERLAY_FS=m @@ -328,9 +328,11 @@ CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_HARDLOCKUP_DETECTOR=y CONFIG_DEBUG_MUTEXES=y CONFIG_LATENCYTOP=y +CONFIG_FTRACE=y +CONFIG_FUNCTION_TRACER=y +CONFIG_FUNCTION_GRAPH_TRACER=y CONFIG_SCHED_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_UPROBE_EVENT=y CONFIG_CODE_PATCHING_SELFTEST=y CONFIG_FTR_FIXUP_SELFTEST=y CONFIG_MSI_BITMAP_SELFTEST=y @@ -343,6 +345,7 @@ CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_CRC32C_VPMSUM=m CONFIG_CRYPTO_MD5_PPC=m CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_SHA1_PPC=m CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m @@ -351,19 +354,14 @@ CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAST6=m CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SALSA20=m -CONFIG_CRYPTO_SHA1_PPC=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m CONFIG_CRYPTO_LZO=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_DEV_NX=y CONFIG_CRYPTO_DEV_NX_ENCRYPT=m CONFIG_CRYPTO_DEV_VMX=y -CONFIG_CRYPTO_DEV_VMX_ENCRYPT=m CONFIG_VIRTUALIZATION=y CONFIG_KVM_BOOK3S_64=m CONFIG_KVM_BOOK3S_64_HV=m -CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=m -CONFIG_LEDS_POWERNV=m +CONFIG_VHOST_NET=m diff --git a/arch/powerpc/configs/ppc64e_defconfig b/arch/powerpc/configs/ppc64e_defconfig index 6340e6c53c54..d0fe0f8f77c2 100644 --- a/arch/powerpc/configs/ppc64e_defconfig +++ b/arch/powerpc/configs/ppc64e_defconfig @@ -3,7 +3,6 @@ CONFIG_PPC_BOOK3E_64=y CONFIG_SMP=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_FHANDLE=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_TASKSTATS=y @@ -30,8 +29,8 @@ CONFIG_BINFMT_MISC=m CONFIG_IRQ_ALL_CPUS=y CONFIG_SPARSEMEM_MANUAL=y CONFIG_PCI_MSI=y -CONFIG_PCCARD=y CONFIG_HOTPLUG_PCI=y +CONFIG_PCCARD=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -108,15 +107,14 @@ CONFIG_E100=y CONFIG_E1000=y CONFIG_IXGB=m CONFIG_SUNGEM=y -CONFIG_MARVELL_PHY=y CONFIG_BROADCOM_PHY=m +CONFIG_MARVELL_PHY=y CONFIG_PPP=m CONFIG_PPP_BSDCOMP=m CONFIG_PPP_DEFLATE=m CONFIG_PPPOE=m CONFIG_PPP_ASYNC=m CONFIG_PPP_SYNC_TTY=m -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_EVDEV=m CONFIG_INPUT_MISC=y # CONFIG_SERIO_SERPORT is not set @@ -172,10 +170,8 @@ CONFIG_INFINIBAND=m CONFIG_INFINIBAND_MTHCA=m CONFIG_INFINIBAND_IPOIB=m CONFIG_INFINIBAND_ISER=m -CONFIG_EDAC=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_DS1307=y -CONFIG_FS_DAX=y CONFIG_EXT2_FS=y CONFIG_EXT2_FS_XATTR=y CONFIG_EXT2_FS_POSIX_ACL=y @@ -192,6 +188,7 @@ CONFIG_JFS_POSIX_ACL=y CONFIG_JFS_SECURITY=y CONFIG_XFS_FS=m CONFIG_XFS_POSIX_ACL=y +CONFIG_FS_DAX=y CONFIG_AUTOFS4_FS=m CONFIG_ISO9660_FS=y CONFIG_UDF_FS=m @@ -251,5 +248,4 @@ CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m CONFIG_CRYPTO_LZO=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index 18d0d60dadbf..ae6eba482d75 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -11,10 +11,10 @@ CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_CGROUPS=y -CONFIG_CGROUP_DEVICE=y -CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_SCHED=y CONFIG_RT_GROUP_SCHED=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y CONFIG_USER_NS=y CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set @@ -61,7 +61,7 @@ CONFIG_SBC8641D=y CONFIG_MPC8610_HPCD=y CONFIG_GEF_SBC610=y CONFIG_CPU_FREQ=y -CONFIG_CPU_FREQ_STAT=m +CONFIG_CPU_FREQ_STAT=y CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_PERFORMANCE=y CONFIG_CPU_FREQ_GOV_POWERSAVE=m @@ -70,7 +70,6 @@ CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m CONFIG_CPU_FREQ_PMAC=y CONFIG_TAU=y CONFIG_TAU_AVERAGE=y -CONFIG_QUICC_ENGINE=y CONFIG_QE_GPIO=y CONFIG_MCU_MPC8349EMITX=y CONFIG_HIGHMEM=y @@ -141,7 +140,6 @@ CONFIG_NETFILTER=y CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_SECMARK=y CONFIG_NF_CONNTRACK_EVENTS=y -CONFIG_NF_CT_PROTO_UDPLITE=m CONFIG_NF_CONNTRACK_AMANDA=m CONFIG_NF_CONNTRACK_FTP=m CONFIG_NF_CONNTRACK_H323=m @@ -187,7 +185,6 @@ CONFIG_NETFILTER_XT_MATCH_QUOTA=m CONFIG_NETFILTER_XT_MATCH_RATEEST=m CONFIG_NETFILTER_XT_MATCH_REALM=m CONFIG_NETFILTER_XT_MATCH_RECENT=m -CONFIG_NETFILTER_XT_MATCH_SOCKET=m CONFIG_NETFILTER_XT_MATCH_STATE=m CONFIG_NETFILTER_XT_MATCH_STATISTIC=m CONFIG_NETFILTER_XT_MATCH_STRING=m @@ -195,7 +192,6 @@ CONFIG_NETFILTER_XT_MATCH_TCPMSS=m CONFIG_NETFILTER_XT_MATCH_TIME=m CONFIG_NETFILTER_XT_MATCH_U32=m CONFIG_NF_CONNTRACK_IPV4=m -# CONFIG_NF_CONNTRACK_PROC_COMPAT is not set CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -334,9 +330,7 @@ CONFIG_BT_BNEP_MC_FILTER=y CONFIG_BT_BNEP_PROTO_FILTER=y CONFIG_BT_HIDP=m CONFIG_BT_HCIUART=m -CONFIG_BT_HCIUART_H4=y CONFIG_BT_HCIUART_BCSP=y -CONFIG_BT_HCIUART_LL=y CONFIG_BT_HCIBCM203X=m CONFIG_BT_HCIBPA10X=m CONFIG_BT_HCIBFUSB=m @@ -370,7 +364,6 @@ CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=16384 CONFIG_CDROM_PKTCDVD=m CONFIG_VIRTIO_BLK=m -CONFIG_BLK_DEV_HD=y CONFIG_ENCLOSURE_SERVICES=m CONFIG_SENSORS_TSL2550=m CONFIG_EEPROM_AT24=m @@ -548,16 +541,16 @@ CONFIG_PCMCIA_XIRC2PS=m CONFIG_FDDI=y CONFIG_SKFP=m CONFIG_NET_SB1000=m -CONFIG_MARVELL_PHY=m -CONFIG_DAVICOM_PHY=m -CONFIG_QSEMI_PHY=m -CONFIG_LXT_PHY=m -CONFIG_CICADA_PHY=m -CONFIG_VITESSE_PHY=m -CONFIG_SMSC_PHY=m CONFIG_BROADCOM_PHY=m +CONFIG_CICADA_PHY=m +CONFIG_DAVICOM_PHY=m CONFIG_ICPLUS_PHY=m +CONFIG_LXT_PHY=m +CONFIG_MARVELL_PHY=m +CONFIG_QSEMI_PHY=m CONFIG_REALTEK_PHY=m +CONFIG_SMSC_PHY=m +CONFIG_VITESSE_PHY=m CONFIG_PLIP=m CONFIG_PPP_DEFLATE=m CONFIG_PPP_FILTER=y @@ -585,7 +578,6 @@ CONFIG_USB_ALI_M5632=y CONFIG_USB_AN2720=y CONFIG_USB_EPSON2888=y CONFIG_USB_KC2190=y -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_JOYDEV=m CONFIG_INPUT_EVDEV=y CONFIG_MOUSE_SERIAL=m @@ -647,7 +639,6 @@ CONFIG_SYNCLINKMP=m CONFIG_SYNCLINK_GT=m CONFIG_NOZOMI=m CONFIG_N_HDLC=m -# CONFIG_DEVKMEM is not set CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_CS=m @@ -657,13 +648,13 @@ CONFIG_SERIAL_8250_MANY_PORTS=y CONFIG_SERIAL_8250_SHARE_IRQ=y CONFIG_SERIAL_8250_DETECT_IRQ=y CONFIG_SERIAL_8250_RSA=y +CONFIG_SERIAL_OF_PLATFORM=y CONFIG_SERIAL_UARTLITE=m CONFIG_SERIAL_PMACZILOG=m CONFIG_SERIAL_MPC52xx=y CONFIG_SERIAL_MPC52xx_CONSOLE=y CONFIG_SERIAL_MPC52xx_CONSOLE_BAUD=115200 CONFIG_SERIAL_JSM=m -CONFIG_SERIAL_OF_PLATFORM=y CONFIG_PRINTER=m CONFIG_LP_CONSOLE=y CONFIG_PPDEV=m @@ -748,9 +739,10 @@ CONFIG_MFD_SM501_GPIO=y CONFIG_AGP=y CONFIG_AGP_UNINORTH=y CONFIG_DRM=m +CONFIG_DRM_RADEON=m +CONFIG_DRM_LEGACY=y CONFIG_DRM_TDFX=m CONFIG_DRM_R128=m -CONFIG_DRM_RADEON=m CONFIG_DRM_MGA=m CONFIG_DRM_SIS=m CONFIG_DRM_VIA=m @@ -899,7 +891,7 @@ CONFIG_USB=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y CONFIG_USB_EHCI_HCD=m -CONFIG_USB_EHCI_FSL=y +CONFIG_USB_EHCI_FSL=m CONFIG_USB_OHCI_HCD=m CONFIG_USB_OHCI_HCD_PPC_OF_BE=y CONFIG_USB_OHCI_HCD_PPC_OF_LE=y @@ -967,7 +959,6 @@ CONFIG_USB_ADUTUX=m CONFIG_USB_SEVSEG=m CONFIG_USB_LEGOTOWER=m CONFIG_USB_LCD=m -CONFIG_USB_LED=m CONFIG_USB_IDMOUSE=m CONFIG_USB_FTDI_ELAN=m CONFIG_USB_APPLEDISPLAY=m @@ -1020,15 +1011,14 @@ CONFIG_UIO_CIF=m CONFIG_UIO_PDRV_GENIRQ=m CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m -CONFIG_FS_DAX=y +CONFIG_QUICC_ENGINE=y CONFIG_EXT2_FS=m CONFIG_EXT2_FS_XATTR=y CONFIG_EXT2_FS_POSIX_ACL=y CONFIG_EXT2_FS_SECURITY=y -CONFIG_EXT4_FS=m +CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y -CONFIG_EXT4_FS=y CONFIG_JBD2_DEBUG=y CONFIG_REISERFS_FS=m CONFIG_REISERFS_PROC_INFO=y @@ -1042,6 +1032,7 @@ CONFIG_XFS_FS=m CONFIG_XFS_QUOTA=y CONFIG_XFS_POSIX_ACL=y CONFIG_GFS2_FS=m +CONFIG_FS_DAX=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS4_FS=m CONFIG_FUSE_FS=m @@ -1146,7 +1137,6 @@ CONFIG_DEBUG_VM=y CONFIG_DEBUG_HIGHMEM=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_DEBUG_SHIRQ=y -CONFIG_TIMER_STATS=y CONFIG_DEBUG_RT_MUTEXES=y CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_MUTEXES=y @@ -1173,7 +1163,6 @@ CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_GCM=m CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m @@ -1201,7 +1190,6 @@ CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m CONFIG_CRYPTO_LZO=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_DEV_HIFN_795X=m CONFIG_CRYPTO_DEV_HIFN_795X_RNG=y CONFIG_CRYPTO_DEV_TALITOS=m diff --git a/arch/powerpc/configs/pq2fads_defconfig b/arch/powerpc/configs/pq2fads_defconfig index 50b2bad51d0a..0ededa8c837d 100644 --- a/arch/powerpc/configs/pq2fads_defconfig +++ b/arch/powerpc/configs/pq2fads_defconfig @@ -79,4 +79,3 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/ps3_defconfig b/arch/powerpc/configs/ps3_defconfig index ee0ec5a682fc..2efa025bf483 100644 --- a/arch/powerpc/configs/ps3_defconfig +++ b/arch/powerpc/configs/ps3_defconfig @@ -5,7 +5,6 @@ CONFIG_SMP=y CONFIG_NR_CPUS=2 CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_FHANDLE=y CONFIG_HIGH_RES_TIMERS=y CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y @@ -94,7 +93,6 @@ CONFIG_USB_USBNET=m # CONFIG_USB_NET_CDC_SUBSET is not set # CONFIG_USB_NET_ZAURUS is not set CONFIG_INPUT_FF_MEMLESS=m -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_JOYDEV=m CONFIG_INPUT_EVDEV=m # CONFIG_INPUT_KEYBOARD is not set @@ -161,7 +159,6 @@ CONFIG_NLS_ISO8859_1=y CONFIG_CRC_CCITT=m CONFIG_CRC_T10DIF=y CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_DEBUG_STACKOVERFLOW=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index fd5d98a0b95c..3d935969e5a2 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -1,11 +1,8 @@ CONFIG_PPC64=y -CONFIG_SMP=y CONFIG_NR_CPUS=2048 CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_FHANDLE=y CONFIG_AUDIT=y -CONFIG_AUDITSYSCALL=y CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y @@ -18,17 +15,16 @@ CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_LOG_CPU_MAX_BUF_SHIFT=13 CONFIG_NUMA_BALANCING=y -CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y CONFIG_CGROUPS=y +CONFIG_MEMCG=y +CONFIG_MEMCG_SWAP=y +CONFIG_CGROUP_SCHED=y CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_DEVICE=y CONFIG_CPUSETS=y +CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_BPF=y -CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y CONFIG_CGROUP_PERF=y -CONFIG_CGROUP_SCHED=y +CONFIG_CGROUP_BPF=y CONFIG_USER_NS=y CONFIG_BLK_DEV_INITRD=y CONFIG_BPF_SYSCALL=y @@ -43,12 +39,12 @@ CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y CONFIG_PARTITION_ADVANCED=y CONFIG_PPC_SPLPAR=y +CONFIG_DTL=y CONFIG_SCANLOG=m CONFIG_PPC_SMLPAR=y -CONFIG_DTL=y +CONFIG_IBMEBUS=y # CONFIG_PPC_PMAC is not set CONFIG_RTAS_FLASH=m -CONFIG_IBMEBUS=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_HZ_100=y CONFIG_BINFMT_MISC=m @@ -155,7 +151,6 @@ CONFIG_NETCONSOLE=y CONFIG_TUN=m CONFIG_VETH=m CONFIG_VIRTIO_NET=m -CONFIG_VHOST_NET=m CONFIG_VORTEX=m CONFIG_ACENIC=m CONFIG_ACENIC_OMIT_TIGON_I=y @@ -183,12 +178,10 @@ CONFIG_PPP_DEFLATE=m CONFIG_PPPOE=m CONFIG_PPP_ASYNC=m CONFIG_PPP_SYNC_TTY=m -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_EVDEV=m CONFIG_INPUT_MISC=y CONFIG_INPUT_PCSPKR=m # CONFIG_SERIO_SERPORT is not set -CONFIG_DEVPTS_MULTIPLE_INSTANCES=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_ICOM=m @@ -198,8 +191,6 @@ CONFIG_HVC_RTAS=y CONFIG_HVCS=m CONFIG_VIRTIO_CONSOLE=m CONFIG_IBM_BSR=m -CONFIG_RTC_CLASS=y -CONFIG_RTC_DRV_GENERIC=y CONFIG_RAW_DRIVER=y CONFIG_MAX_RAW_DEVS=1024 CONFIG_FB=y @@ -227,6 +218,9 @@ CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_HCD_PPC_OF is not set CONFIG_USB_OHCI_HCD=y CONFIG_USB_STORAGE=m +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=m +CONFIG_LEDS_POWERNV=m CONFIG_INFINIBAND=m CONFIG_INFINIBAND_USER_MAD=m CONFIG_INFINIBAND_USER_ACCESS=m @@ -238,9 +232,10 @@ CONFIG_INFINIBAND_IPOIB=m CONFIG_INFINIBAND_IPOIB_CM=y CONFIG_INFINIBAND_SRP=m CONFIG_INFINIBAND_ISER=m +CONFIG_RTC_CLASS=y +CONFIG_RTC_DRV_GENERIC=y CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m -CONFIG_FS_DAX=y CONFIG_EXT2_FS=y CONFIG_EXT2_FS_XATTR=y CONFIG_EXT2_FS_POSIX_ACL=y @@ -256,6 +251,7 @@ CONFIG_XFS_POSIX_ACL=y CONFIG_BTRFS_FS=m CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_NILFS2_FS=m +CONFIG_FS_DAX=y CONFIG_AUTOFS4_FS=m CONFIG_FUSE_FS=m CONFIG_OVERLAY_FS=m @@ -294,9 +290,11 @@ CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_HARDLOCKUP_DETECTOR=y CONFIG_LATENCYTOP=y +CONFIG_FTRACE=y +CONFIG_FUNCTION_TRACER=y +CONFIG_FUNCTION_GRAPH_TRACER=y CONFIG_SCHED_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_UPROBE_EVENT=y CONFIG_CODE_PATCHING_SELFTEST=y CONFIG_FTR_FIXUP_SELFTEST=y CONFIG_MSI_BITMAP_SELFTEST=y @@ -307,6 +305,7 @@ CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_CRC32C_VPMSUM=m CONFIG_CRYPTO_MD5_PPC=m CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_SHA1_PPC=m CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m @@ -315,19 +314,14 @@ CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAST6=m CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SALSA20=m -CONFIG_CRYPTO_SHA1_PPC=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m CONFIG_CRYPTO_LZO=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_DEV_NX=y CONFIG_CRYPTO_DEV_NX_ENCRYPT=m CONFIG_CRYPTO_DEV_VMX=y -CONFIG_CRYPTO_DEV_VMX_ENCRYPT=m CONFIG_VIRTUALIZATION=y CONFIG_KVM_BOOK3S_64=m CONFIG_KVM_BOOK3S_64_HV=m -CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=m -CONFIG_LEDS_POWERNV=m +CONFIG_VHOST_NET=m diff --git a/arch/powerpc/configs/tqm8xx_defconfig b/arch/powerpc/configs/tqm8xx_defconfig index 78fddf24b5d3..cd72193fac0a 100644 --- a/arch/powerpc/configs/tqm8xx_defconfig +++ b/arch/powerpc/configs/tqm8xx_defconfig @@ -18,6 +18,7 @@ CONFIG_PARTITION_ADVANCED=y CONFIG_TQM8XX=y CONFIG_8xx_COPYBACK=y # CONFIG_8xx_CPU15 is not set +CONFIG_GEN_RTC=y CONFIG_HZ_100=y # CONFIG_SECCOMP is not set CONFIG_NET=y @@ -44,7 +45,6 @@ CONFIG_MTD_PHYSMAP_OF=y CONFIG_NETDEVICES=y CONFIG_FS_ENET=y CONFIG_DAVICOM_PHY=y -CONFIG_FIXED_PHY=y # CONFIG_WLAN is not set # CONFIG_INPUT is not set # CONFIG_SERIO is not set @@ -53,7 +53,6 @@ CONFIG_FIXED_PHY=y CONFIG_SERIAL_CPM=y CONFIG_SERIAL_CPM_CONSOLE=y CONFIG_HW_RANDOM=y -CONFIG_GEN_RTC=y # CONFIG_HWMON is not set # CONFIG_USB_SUPPORT is not set # CONFIG_DNOTIFY is not set diff --git a/arch/powerpc/configs/wii_defconfig b/arch/powerpc/configs/wii_defconfig index dcdd51b57783..aef41b17a8bc 100644 --- a/arch/powerpc/configs/wii_defconfig +++ b/arch/powerpc/configs/wii_defconfig @@ -55,9 +55,6 @@ CONFIG_B43_SDIO=y # CONFIG_B43_PHY_LP is not set CONFIG_B43_DEBUG=y CONFIG_INPUT_FF_MEMLESS=m -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set -CONFIG_INPUT_MOUSEDEV_SCREEN_X=640 -CONFIG_INPUT_MOUSEDEV_SCREEN_Y=480 CONFIG_INPUT_JOYDEV=y CONFIG_INPUT_EVDEV=y # CONFIG_KEYBOARD_ATKBD is not set @@ -68,7 +65,6 @@ CONFIG_INPUT_UINPUT=y # CONFIG_SERIO_I8042 is not set # CONFIG_SERIO_SERPORT is not set CONFIG_LEGACY_PTY_COUNT=64 -# CONFIG_DEVKMEM is not set # CONFIG_HW_RANDOM is not set CONFIG_NVRAM=y CONFIG_I2C=y @@ -119,5 +115,4 @@ CONFIG_SCHED_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_DMA_API_DEBUG=y CONFIG_PPC_EARLY_DEBUG=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 5c4fbc80dc6c..2542ea15d338 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -8,3 +8,4 @@ generic-y += mcs_spinlock.h generic-y += preempt.h generic-y += rwsem.h generic-y += vtime.h +generic-y += msi.h diff --git a/arch/powerpc/include/asm/asm-compat.h b/arch/powerpc/include/asm/asm-compat.h index cee3aa087653..7f2a7702596c 100644 --- a/arch/powerpc/include/asm/asm-compat.h +++ b/arch/powerpc/include/asm/asm-compat.h @@ -25,7 +25,7 @@ #define PPC_LCMPI stringify_in_c(cmpdi) #define PPC_LCMPLI stringify_in_c(cmpldi) #define PPC_LCMP stringify_in_c(cmpd) -#define PPC_LONG stringify_in_c(.llong) +#define PPC_LONG stringify_in_c(.8byte) #define PPC_LONG_ALIGN stringify_in_c(.balign 8) #define PPC_TLNEI stringify_in_c(tdnei) #define PPC_LLARX(t, a, b, eh) PPC_LDARX(t, a, b, eh) diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index 25d42bd3f114..9c601adfc500 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -74,13 +74,6 @@ do { \ ___p1; \ }) -/* - * This must resolve to hwsync on SMP for the context switch path. - * See _switch, and core scheduler context switch memory ordering - * comments. - */ -#define smp_mb__before_spinlock() smp_mb() - #include <asm-generic/barrier.h> #endif /* _ASM_POWERPC_BARRIER_H */ diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 7fb755880409..4d453f979553 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -294,13 +294,11 @@ static inline void __ptep_set_access_flags(struct mm_struct *mm, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) -extern int get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, - pmd_t **pmdp); - int map_kernel_page(unsigned long va, phys_addr_t pa, int flags); /* Generic accessors to PTE bits */ static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_RW);} +static inline int pte_read(pte_t pte) { return 1; } static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); } static inline int pte_young(pte_t pte) { return !!(pte_val(pte) & _PAGE_ACCESSED); } static inline int pte_special(pte_t pte) { return !!(pte_val(pte) & _PAGE_SPECIAL); } diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 36fc7bfe9e11..f88452019114 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -40,7 +40,7 @@ * Define the address range of the kernel non-linear virtual area */ #define H_KERN_VIRT_START ASM_CONST(0xD000000000000000) -#define H_KERN_VIRT_SIZE ASM_CONST(0x0000100000000000) +#define H_KERN_VIRT_SIZE ASM_CONST(0x0000400000000000) /* 64T */ /* * The vmalloc space starts at the beginning of that region, and @@ -48,9 +48,11 @@ * (we keep a quarter for the virtual memmap) */ #define H_VMALLOC_START H_KERN_VIRT_START -#define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE >> 1) +#define H_VMALLOC_SIZE ASM_CONST(0x380000000000) /* 56T */ #define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) +#define H_KERN_IO_START H_VMALLOC_END + /* * Region IDs */ diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index 5c28bd6f2ae1..2d1ca488ca44 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -54,9 +54,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE static inline bool gigantic_page_supported(void) { - if (radix_enabled()) - return true; - return false; + return true; } #endif diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 6981a52b3887..f28d21c69f79 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -468,7 +468,7 @@ extern int htab_bolt_mapping(unsigned long vstart, unsigned long vend, int psize, int ssize); int htab_remove_mapping(unsigned long vstart, unsigned long vend, int psize, int ssize); -extern void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages); +extern void pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages); extern void demote_segment_4k(struct mm_struct *mm, unsigned long addr); #ifdef CONFIG_PPC_PSERIES diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 5b4023c616f7..c3b00e8ff791 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -83,6 +83,9 @@ typedef struct { mm_context_id_t id; u16 user_psize; /* page size index */ + /* Number of bits in the mm_cpumask */ + atomic_t active_cpus; + /* NPU NMMU context */ struct npu_context *npu_context; @@ -97,11 +100,6 @@ typedef struct { #ifdef CONFIG_PPC_SUBPAGE_PROT struct subpage_prot_table spt; #endif /* CONFIG_PPC_SUBPAGE_PROT */ -#ifdef CONFIG_PPC_ICSWX - struct spinlock *cop_lockp; /* guard acop and cop_pid */ - unsigned long acop; /* mask of enabled coprocessor types */ - unsigned int cop_pid; /* pid value used with coprocessors */ -#endif /* CONFIG_PPC_ICSWX */ #ifdef CONFIG_PPC_64K_PAGES /* for 4K PTE fragment support */ void *pte_frag; diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index e2329db9d6f4..1fcfa425cefa 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -41,8 +41,6 @@ extern struct kmem_cache *pgtable_cache[]; pgtable_cache[(shift) - 1]; \ }) -#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO - extern pte_t *pte_fragment_alloc(struct mm_struct *, unsigned long, int); extern void pte_fragment_free(unsigned long *, int); extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 818a58fc3f4f..b9aff515b4de 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -272,8 +272,10 @@ extern unsigned long __vmalloc_end; extern unsigned long __kernel_virt_start; extern unsigned long __kernel_virt_size; +extern unsigned long __kernel_io_start; #define KERN_VIRT_START __kernel_virt_start #define KERN_VIRT_SIZE __kernel_virt_size +#define KERN_IO_START __kernel_io_start extern struct page *vmemmap; extern unsigned long ioremap_bot; extern unsigned long pci_io_base; @@ -298,7 +300,6 @@ extern unsigned long pci_io_base; * PHB_IO_BASE = ISA_IO_BASE + 64K to ISA_IO_BASE + 2G, PHB IO spaces * IOREMAP_BASE = ISA_IO_BASE + 2G to VMALLOC_START + PGTABLE_RANGE */ -#define KERN_IO_START (KERN_VIRT_START + (KERN_VIRT_SIZE >> 1)) #define FULL_IO_SIZE 0x80000000ul #define ISA_IO_BASE (KERN_IO_START) #define ISA_IO_END (KERN_IO_START + 0x10000ul) @@ -409,6 +410,11 @@ static inline int pte_write(pte_t pte) return __pte_write(pte) || pte_savedwrite(pte); } +static inline int pte_read(pte_t pte) +{ + return !!(pte_raw(pte) & cpu_to_be64(_PAGE_READ)); +} + #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -1167,6 +1173,7 @@ static inline bool arch_needs_pgtable_deposit(void) return false; return true; } +extern void serialize_against_pte_lookup(struct mm_struct *mm); static inline pmd_t pmd_mkdevmap(pmd_t pmd) diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 544440b5aff3..1e5ba94e62ef 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -110,6 +110,8 @@ */ #define RADIX_VMEMMAP_BASE (RADIX_VMALLOC_END) +#define RADIX_KERN_IO_START (RADIX_KERN_VIRT_START + (RADIX_KERN_VIRT_SIZE >> 1)) + #ifndef __ASSEMBLY__ #define RADIX_PTE_TABLE_SIZE (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE) #define RADIX_PMD_TABLE_SIZE (sizeof(pmd_t) << RADIX_PMD_INDEX_SIZE) diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h index cc7fbde4f53c..9b433a624bf3 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -22,22 +22,21 @@ extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end extern void radix__local_flush_tlb_mm(struct mm_struct *mm); extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); -extern void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr); extern void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize); extern void radix__tlb_flush(struct mmu_gather *tlb); #ifdef CONFIG_SMP extern void radix__flush_tlb_mm(struct mm_struct *mm); extern void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); -extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr); extern void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize); #else #define radix__flush_tlb_mm(mm) radix__local_flush_tlb_mm(mm) #define radix__flush_tlb_page(vma,addr) radix__local_flush_tlb_page(vma,addr) #define radix__flush_tlb_page_psize(mm,addr,p) radix__local_flush_tlb_page_psize(mm,addr,p) -#define radix__flush_tlb_pwc(tlb, addr) radix__local_flush_tlb_pwc(tlb, addr) #endif +extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr); +extern void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr); extern void radix__flush_tlb_lpid_va(unsigned long lpid, unsigned long gpa, unsigned long page_size); extern void radix__flush_tlb_lpid(unsigned long lpid); diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index 87fcc1948817..7ee763d3bea9 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -133,6 +133,7 @@ extern int do_page_fault(struct pt_regs *, unsigned long, unsigned long); extern void bad_page_fault(struct pt_regs *, unsigned long, int); extern void _exception(int, struct pt_regs *, int, unsigned long); extern void die(const char *, struct pt_regs *, long); +extern bool die_will_crash(void); #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h index 5a90292afbad..d122f7f957ce 100644 --- a/arch/powerpc/include/asm/cache.h +++ b/arch/powerpc/include/asm/cache.h @@ -5,7 +5,7 @@ /* bytes per L1 cache line */ -#if defined(CONFIG_8xx) || defined(CONFIG_403GCX) +#if defined(CONFIG_PPC_8xx) || defined(CONFIG_403GCX) #define L1_CACHE_SHIFT 4 #define MAX_COPY_PREFETCH 1 #elif defined(CONFIG_PPC_E500MC) diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index 52586f9956bb..eb43b5c3a7b5 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -67,6 +67,17 @@ #define ERR_DEEP_STATE_ESL_MISMATCH -2 #ifndef __ASSEMBLY__ +/* Additional SPRs that need to be saved/restored during stop */ +struct stop_sprs { + u64 pid; + u64 ldbar; + u64 fscr; + u64 hfscr; + u64 mmcr1; + u64 mmcr2; + u64 mmcra; +}; + extern u32 pnv_fastsleep_workaround_at_entry[]; extern u32 pnv_fastsleep_workaround_at_exit[]; @@ -90,20 +101,4 @@ static inline void report_invalid_psscr_val(u64 psscr_val, int err) #endif -/* Idle state entry routines */ -#ifdef CONFIG_PPC_P7_NAP -#define IDLE_STATE_ENTER_SEQ(IDLE_INST) \ - /* Magic NAP/SLEEP/WINKLE mode enter sequence */ \ - std r0,0(r1); \ - ptesync; \ - ld r0,0(r1); \ -236: cmpd cr0,r0,r0; \ - bne 236b; \ - IDLE_INST; \ - -#define IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST) \ - IDLE_STATE_ENTER_SEQ(IDLE_INST) \ - b . -#endif /* CONFIG_PPC_P7_NAP */ - #endif diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index d02ad93bf708..a9bf921f4efc 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -513,7 +513,7 @@ enum { #else CPU_FTRS_GENERIC_32 | #endif -#ifdef CONFIG_8xx +#ifdef CONFIG_PPC_8xx CPU_FTRS_8XX | #endif #ifdef CONFIG_40x @@ -565,7 +565,7 @@ enum { #else CPU_FTRS_GENERIC_32 & #endif -#ifdef CONFIG_8xx +#ifdef CONFIG_PPC_8xx CPU_FTRS_8XX & #endif #ifdef CONFIG_40x diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 8e37b71674f4..9847ae3a12d1 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -131,7 +131,6 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe) struct eeh_dev { int mode; /* EEH mode */ int class_code; /* Class code of the device */ - int config_addr; /* Config address */ int pe_config_addr; /* PE config address */ u32 config_space[16]; /* Saved PCI config space */ int pcix_cap; /* Saved PCIx capability */ @@ -141,7 +140,6 @@ struct eeh_dev { struct eeh_pe *pe; /* Associated PE */ struct list_head list; /* Form link list in the PE */ struct list_head rmv_list; /* Record the removed edevs */ - struct pci_controller *phb; /* Associated PHB */ struct pci_dn *pdn; /* Associated PCI device node */ struct pci_dev *pdev; /* Associated PCI device */ bool in_error; /* Error flag for edev */ @@ -262,7 +260,8 @@ typedef void *(*eeh_traverse_func)(void *data, void *flag); void eeh_set_pe_aux_size(int size); int eeh_phb_pe_create(struct pci_controller *phb); struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb); -struct eeh_pe *eeh_pe_get(struct eeh_dev *edev); +struct eeh_pe *eeh_pe_get(struct pci_controller *phb, + int pe_no, int config_addr); int eeh_add_to_parent_pe(struct eeh_dev *edev); int eeh_rmv_from_parent_pe(struct eeh_dev *edev); void eeh_pe_update_time_stamp(struct eeh_pe *pe); diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index ce88bbe1d809..5a23010af600 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -209,11 +209,13 @@ extern int early_init_dt_scan_fw_dump(unsigned long node, extern int fadump_reserve_mem(void); extern int setup_fadump(void); extern int is_fadump_active(void); +extern int should_fadump_crash(void); extern void crash_fadump(struct pt_regs *, const char *); extern void fadump_cleanup(void); #else /* CONFIG_FA_DUMP */ static inline int is_fadump_active(void) { return 0; } +static inline int should_fadump_crash(void) { return 0; } static inline void crash_fadump(struct pt_regs *regs, const char *str) { } #endif #endif diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h index 2de2319b99e2..8f88f771cc55 100644 --- a/arch/powerpc/include/asm/feature-fixups.h +++ b/arch/powerpc/include/asm/feature-fixups.h @@ -19,11 +19,11 @@ */ #if defined(CONFIG_PPC64) && !defined(__powerpc64__) /* 64 bits kernel, 32 bits code (ie. vdso32) */ -#define FTR_ENTRY_LONG .llong +#define FTR_ENTRY_LONG .8byte #define FTR_ENTRY_OFFSET .long 0xffffffff; .long #elif defined(CONFIG_PPC64) -#define FTR_ENTRY_LONG .llong -#define FTR_ENTRY_OFFSET .llong +#define FTR_ENTRY_LONG .8byte +#define FTR_ENTRY_OFFSET .8byte #else #define FTR_ENTRY_LONG .long #define FTR_ENTRY_OFFSET .long diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h index 4508b322f2cd..6c40dfda5912 100644 --- a/arch/powerpc/include/asm/fixmap.h +++ b/arch/powerpc/include/asm/fixmap.h @@ -17,6 +17,7 @@ #ifndef __ASSEMBLY__ #include <linux/kernel.h> #include <asm/page.h> +#include <asm/pgtable.h> #ifdef CONFIG_HIGHMEM #include <linux/threads.h> #include <asm/kmap_types.h> @@ -62,9 +63,6 @@ enum fixed_addresses { __end_of_fixed_addresses }; -extern void __set_fixmap (enum fixed_addresses idx, - phys_addr_t phys, pgprot_t flags); - #define __FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) #define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) @@ -72,5 +70,11 @@ extern void __set_fixmap (enum fixed_addresses idx, #include <asm-generic/fixmap.h> +static inline void __set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) +{ + map_kernel_page(fix_to_virt(idx), phys, pgprot_val(flags)); +} + #endif /* !__ASSEMBLY__ */ #endif diff --git a/arch/powerpc/include/asm/fs_pd.h b/arch/powerpc/include/asm/fs_pd.h index f79d6c74eb2a..8def56ec05c6 100644 --- a/arch/powerpc/include/asm/fs_pd.h +++ b/arch/powerpc/include/asm/fs_pd.h @@ -26,7 +26,7 @@ #define cpm2_unmap(addr) do {} while(0) #endif -#ifdef CONFIG_8xx +#ifdef CONFIG_PPC_8xx #include <asm/8xx_immap.h> extern immap_t __iomem *mpc8xx_immr; diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h index eaada6c92344..719ed9b61ea7 100644 --- a/arch/powerpc/include/asm/futex.h +++ b/arch/powerpc/include/asm/futex.h @@ -29,18 +29,10 @@ : "b" (uaddr), "i" (-EFAULT), "r" (oparg) \ : "cr0", "memory") -static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -66,17 +58,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/powerpc/include/asm/hardirq.h b/arch/powerpc/include/asm/hardirq.h index 8add8b861e8d..c97603d617e3 100644 --- a/arch/powerpc/include/asm/hardirq.h +++ b/arch/powerpc/include/asm/hardirq.h @@ -12,6 +12,10 @@ typedef struct { unsigned int mce_exceptions; unsigned int spurious_irqs; unsigned int hmi_exceptions; + unsigned int sreset_irqs; +#ifdef CONFIG_PPC_WATCHDOG + unsigned int soft_nmi_irqs; +#endif #ifdef CONFIG_PPC_DOORBELL unsigned int doorbell_irqs; #endif diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 7f4025a6c69e..b8a0fb442c64 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -218,18 +218,4 @@ static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, } #endif /* CONFIG_HUGETLB_PAGE */ -/* - * FSL Book3E platforms require special gpage handling - the gpages - * are reserved early in the boot process by memblock instead of via - * the .dts as on IBM platforms. - */ -#if defined(CONFIG_HUGETLB_PAGE) && (defined(CONFIG_PPC_FSL_BOOK3E) || \ - defined(CONFIG_PPC_8xx)) -extern void __init reserve_hugetlb_gpages(void); -#else -static inline void reserve_hugetlb_gpages(void) -{ -} -#endif - #endif /* _ASM_POWERPC_HUGETLB_H */ diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 57d38b504ff7..3d34dc0869f6 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -280,7 +280,18 @@ #define H_RESIZE_HPT_COMMIT 0x370 #define H_REGISTER_PROC_TBL 0x37C #define H_SIGNAL_SYS_RESET 0x380 -#define MAX_HCALL_OPCODE H_SIGNAL_SYS_RESET +#define H_INT_GET_SOURCE_INFO 0x3A8 +#define H_INT_SET_SOURCE_CONFIG 0x3AC +#define H_INT_GET_SOURCE_CONFIG 0x3B0 +#define H_INT_GET_QUEUE_INFO 0x3B4 +#define H_INT_SET_QUEUE_CONFIG 0x3B8 +#define H_INT_GET_QUEUE_CONFIG 0x3BC +#define H_INT_SET_OS_REPORTING_LINE 0x3C0 +#define H_INT_GET_OS_REPORTING_LINE 0x3C4 +#define H_INT_ESB 0x3C8 +#define H_INT_SYNC 0x3CC +#define H_INT_RESET 0x3D0 +#define MAX_HCALL_OPCODE H_INT_RESET /* H_VIOCTL functions */ #define H_GET_VIOA_DUMP_SIZE 0x01 diff --git a/arch/powerpc/include/asm/icswx.h b/arch/powerpc/include/asm/icswx.h index 27e588f6c72e..6a2c87577541 100644 --- a/arch/powerpc/include/asm/icswx.h +++ b/arch/powerpc/include/asm/icswx.h @@ -69,7 +69,10 @@ struct coprocessor_completion_block { #define CSB_CC_WR_PROTECTION (16) #define CSB_CC_UNKNOWN_CODE (17) #define CSB_CC_ABORT (18) +#define CSB_CC_EXCEED_BYTE_COUNT (19) /* P9 or later */ #define CSB_CC_TRANSPORT (20) +#define CSB_CC_INVALID_CRB (21) /* P9 or later */ +#define CSB_CC_INVALID_DDE (30) /* P9 or later */ #define CSB_CC_SEGMENTED_DDL (31) #define CSB_CC_PROGRESS_POINT (32) #define CSB_CC_DDE_OVERFLOW (33) diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h new file mode 100644 index 000000000000..7f74c282710f --- /dev/null +++ b/arch/powerpc/include/asm/imc-pmu.h @@ -0,0 +1,128 @@ +#ifndef __ASM_POWERPC_IMC_PMU_H +#define __ASM_POWERPC_IMC_PMU_H + +/* + * IMC Nest Performance Monitor counter support. + * + * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation. + * (C) 2017 Anju T Sudhakar, IBM Corporation. + * (C) 2017 Hemant K Shaw, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or later version. + */ + +#include <linux/perf_event.h> +#include <linux/slab.h> +#include <linux/of.h> +#include <linux/io.h> +#include <asm/opal.h> + +/* + * For static allocation of some of the structures. + */ +#define IMC_MAX_PMUS 32 + +/* + * Compatibility macros for IMC devices + */ +#define IMC_DTB_COMPAT "ibm,opal-in-memory-counters" +#define IMC_DTB_UNIT_COMPAT "ibm,imc-counters" + + +/* + * LDBAR: Counter address and Enable/Disable macro. + * perf/imc-pmu.c has the LDBAR layout information. + */ +#define THREAD_IMC_LDBAR_MASK 0x0003ffffffffe000ULL +#define THREAD_IMC_ENABLE 0x8000000000000000ULL + +/* + * Structure to hold memory address information for imc units. + */ +struct imc_mem_info { + u64 *vbase; + u32 id; +}; + +/* + * Place holder for nest pmu events and values. + */ +struct imc_events { + u32 value; + char *name; + char *unit; + char *scale; +}; + +/* Event attribute array index */ +#define IMC_FORMAT_ATTR 0 +#define IMC_EVENT_ATTR 1 +#define IMC_CPUMASK_ATTR 2 +#define IMC_NULL_ATTR 3 + +/* PMU Format attribute macros */ +#define IMC_EVENT_OFFSET_MASK 0xffffffffULL + +/* + * Device tree parser code detects IMC pmu support and + * registers new IMC pmus. This structure will hold the + * pmu functions, events, counter memory information + * and attrs for each imc pmu and will be referenced at + * the time of pmu registration. + */ +struct imc_pmu { + struct pmu pmu; + struct imc_mem_info *mem_info; + struct imc_events **events; + /* + * Attribute groups for the PMU. Slot 0 used for + * format attribute, slot 1 used for cpusmask attribute, + * slot 2 used for event attribute. Slot 3 keep as + * NULL. + */ + const struct attribute_group *attr_groups[4]; + u32 counter_mem_size; + int domain; + /* + * flag to notify whether the memory is mmaped + * or allocated by kernel. + */ + bool imc_counter_mmaped; +}; + +/* + * Structure to hold id, lock and reference count for the imc events which + * are inited. + */ +struct imc_pmu_ref { + struct mutex lock; + unsigned int id; + int refc; +}; + +/* + * In-Memory Collection Counters type. + * Data comes from Device tree. + * Three device type are supported. + */ + +enum { + IMC_TYPE_THREAD = 0x1, + IMC_TYPE_CORE = 0x4, + IMC_TYPE_CHIP = 0x10, +}; + +/* + * Domains for IMC PMUs + */ +#define IMC_DOMAIN_NEST 1 +#define IMC_DOMAIN_CORE 2 +#define IMC_DOMAIN_THREAD 3 + +extern int init_imc_pmu(struct device_node *parent, + struct imc_pmu *pmu_ptr, int pmu_id); +extern void thread_imc_disable(void); +#endif /* __ASM_POWERPC_IMC_PMU_H */ diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 7cea76f11c26..83596f32f50b 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -104,6 +104,10 @@ struct kvmppc_host_state { u8 napping; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + /* + * hwthread_req/hwthread_state pair is used to pull sibling threads + * out of guest on pre-ISAv3.0B CPUs where threads share MMU. + */ u8 hwthread_req; u8 hwthread_state; u8 host_ipi; diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index cd2fc1cc1cc7..73b92017b6d7 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -76,7 +76,6 @@ struct machdep_calls { void __noreturn (*restart)(char *cmd); void __noreturn (*halt)(void); - void (*panic)(char *str); void (*cpu_die)(void); long (*time_init)(void); /* Optional, may be NULL */ diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 35bec1c5bd5a..309592589e30 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -77,76 +77,8 @@ extern void switch_cop(struct mm_struct *next); extern int use_cop(unsigned long acop, struct mm_struct *mm); extern void drop_cop(unsigned long acop, struct mm_struct *mm); -/* - * switch_mm is the entry point called from the architecture independent - * code in kernel/sched/core.c - */ -static inline void switch_mm_irqs_off(struct mm_struct *prev, - struct mm_struct *next, - struct task_struct *tsk) -{ - bool new_on_cpu = false; - - /* Mark this context has been used on the new CPU */ - if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) { - cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); - - /* - * This full barrier orders the store to the cpumask above vs - * a subsequent operation which allows this CPU to begin loading - * translations for next. - * - * When using the radix MMU that operation is the load of the - * MMU context id, which is then moved to SPRN_PID. - * - * For the hash MMU it is either the first load from slb_cache - * in switch_slb(), and/or the store of paca->mm_ctx_id in - * copy_mm_to_paca(). - * - * On the read side the barrier is in pte_xchg(), which orders - * the store to the PTE vs the load of mm_cpumask. - */ - smp_mb(); - - new_on_cpu = true; - } - - /* 32-bit keeps track of the current PGDIR in the thread struct */ -#ifdef CONFIG_PPC32 - tsk->thread.pgdir = next->pgd; -#endif /* CONFIG_PPC32 */ - - /* 64-bit Book3E keeps track of current PGD in the PACA */ -#ifdef CONFIG_PPC_BOOK3E_64 - get_paca()->pgd = next->pgd; -#endif - /* Nothing else to do if we aren't actually switching */ - if (prev == next) - return; - -#ifdef CONFIG_PPC_ICSWX - /* Switch coprocessor context only if prev or next uses a coprocessor */ - if (prev->context.acop || next->context.acop) - switch_cop(next); -#endif /* CONFIG_PPC_ICSWX */ - - /* We must stop all altivec streams before changing the HW - * context - */ -#ifdef CONFIG_ALTIVEC - if (cpu_has_feature(CPU_FTR_ALTIVEC)) - asm volatile ("dssall"); -#endif /* CONFIG_ALTIVEC */ - - if (new_on_cpu) - radix_kvm_prefetch_workaround(next); - - /* - * The actual HW switching method differs between the various - * sub architectures. Out of line for now - */ - switch_mmu_context(prev, next, tsk); -} +extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk); static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) @@ -168,11 +100,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, */ static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) { - unsigned long flags; - - local_irq_save(flags); switch_mm(prev, next, current); - local_irq_restore(flags); } /* We don't currently use enter_lazy_tlb() for anything */ diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h index 6f8e79cd35d8..3760150a0ff0 100644 --- a/arch/powerpc/include/asm/nmi.h +++ b/arch/powerpc/include/asm/nmi.h @@ -1,9 +1,8 @@ #ifndef _ASM_NMI_H #define _ASM_NMI_H -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_PPC_WATCHDOG extern void arch_touch_nmi_watchdog(void); - extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self); #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 91314268f04f..185c6a47f9ba 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -121,7 +121,7 @@ extern int icache_44x_need_flush; #include <asm/nohash/pte-book3e.h> #elif defined(CONFIG_FSL_BOOKE) #include <asm/nohash/32/pte-fsl-booke.h> -#elif defined(CONFIG_8xx) +#elif defined(CONFIG_PPC_8xx) #include <asm/nohash/32/pte-8xx.h> #endif @@ -337,9 +337,6 @@ static inline void __ptep_set_access_flags(struct mm_struct *mm, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) -extern int get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, - pmd_t **pmdp); - int map_kernel_page(unsigned long va, phys_addr_t pa, int flags); #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index e5805ad78e12..17989c3d9a24 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -14,6 +14,7 @@ static inline int pte_write(pte_t pte) { return (pte_val(pte) & (_PAGE_RW | _PAGE_RO)) != _PAGE_RO; } +static inline int pte_read(pte_t pte) { return 1; } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } static inline int pte_special(pte_t pte) { return pte_val(pte) & _PAGE_SPECIAL; } diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 3130a73652c7..450a60b81d2a 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -42,6 +42,7 @@ #define OPAL_I2C_STOP_ERR -24 #define OPAL_XIVE_PROVISIONING -31 #define OPAL_XIVE_FREE_ACTIVE -32 +#define OPAL_TIMEOUT -33 /* API Tokens (in r0) */ #define OPAL_INVALID_CALL -1 @@ -190,7 +191,16 @@ #define OPAL_NPU_INIT_CONTEXT 146 #define OPAL_NPU_DESTROY_CONTEXT 147 #define OPAL_NPU_MAP_LPAR 148 -#define OPAL_LAST 148 +#define OPAL_IMC_COUNTERS_INIT 149 +#define OPAL_IMC_COUNTERS_START 150 +#define OPAL_IMC_COUNTERS_STOP 151 +#define OPAL_GET_POWERCAP 152 +#define OPAL_SET_POWERCAP 153 +#define OPAL_GET_POWER_SHIFT_RATIO 154 +#define OPAL_SET_POWER_SHIFT_RATIO 155 +#define OPAL_SENSOR_GROUP_CLEAR 156 +#define OPAL_PCI_SET_P2P 157 +#define OPAL_LAST 157 /* Device tree flags */ @@ -1084,6 +1094,18 @@ enum { XIVE_DUMP_EMU_STATE = 5, }; +/* "type" argument options for OPAL_IMC_COUNTERS_* calls */ +enum { + OPAL_IMC_COUNTERS_NEST = 1, + OPAL_IMC_COUNTERS_CORE = 2, +}; + + +/* PCI p2p descriptor */ +#define OPAL_PCI_P2P_ENABLE 0x1 +#define OPAL_PCI_P2P_LOAD 0x2 +#define OPAL_PCI_P2P_STORE 0x4 + #endif /* __ASSEMBLY__ */ #endif /* __OPAL_API_H */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 588fb1c23af9..726c23304a57 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -50,7 +50,7 @@ int64_t opal_tpo_write(uint64_t token, uint32_t year_mon_day, uint32_t hour_min); int64_t opal_cec_power_down(uint64_t request); int64_t opal_cec_reboot(void); -int64_t opal_cec_reboot2(uint32_t reboot_type, char *diag); +int64_t opal_cec_reboot2(uint32_t reboot_type, const char *diag); int64_t opal_read_nvram(uint64_t buffer, uint64_t size, uint64_t offset); int64_t opal_write_nvram(uint64_t buffer, uint64_t size, uint64_t offset); int64_t opal_handle_interrupt(uint64_t isn, __be64 *outstanding_event_mask); @@ -267,6 +267,19 @@ int64_t opal_xive_allocate_irq(uint32_t chip_id); int64_t opal_xive_free_irq(uint32_t girq); int64_t opal_xive_sync(uint32_t type, uint32_t id); int64_t opal_xive_dump(uint32_t type, uint32_t id); +int64_t opal_pci_set_p2p(uint64_t phb_init, uint64_t phb_target, + uint64_t desc, uint16_t pe_number); + +int64_t opal_imc_counters_init(uint32_t type, uint64_t address, + uint64_t cpu_pir); +int64_t opal_imc_counters_start(uint32_t type, uint64_t cpu_pir); +int64_t opal_imc_counters_stop(uint32_t type, uint64_t cpu_pir); + +int opal_get_powercap(u32 handle, int token, u32 *pcap); +int opal_set_powercap(u32 handle, int token, u32 pcap); +int opal_get_power_shift_ratio(u32 handle, int token, u32 *psr); +int opal_set_power_shift_ratio(u32 handle, int token, u32 psr); +int opal_sensor_group_clear(u32 group_hndl, int token); /* Internal functions */ extern int early_init_dt_scan_opal(unsigned long node, const char *uname, @@ -345,6 +358,10 @@ static inline int opal_get_async_rc(struct opal_msg msg) void opal_wake_poller(void); +void opal_powercap_init(void); +void opal_psr_init(void); +void opal_sensor_groups_init(void); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_OPAL_H */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index dc88a31cc79a..04b60af027ae 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -31,6 +31,7 @@ #endif #include <asm/accounting.h> #include <asm/hmi.h> +#include <asm/cpuidle.h> register struct paca_struct *local_paca asm("r13"); @@ -183,6 +184,12 @@ struct paca_struct { struct paca_struct **thread_sibling_pacas; /* The PSSCR value that the kernel requested before going to stop */ u64 requested_psscr; + + /* + * Save area for additional SPRs that need to be + * saved/restored during cpuidle stop. + */ + struct stop_sprs stop_sprs; #endif #ifdef CONFIG_PPC_STD_MMU_64 diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 56c67d3f0108..0b8aa1fe2d5f 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -195,7 +195,6 @@ struct pci_dn { struct pci_dn *parent; struct pci_controller *phb; /* for pci devices */ struct iommu_table_group *table_group; /* for phb's or bridges */ - struct device_node *node; /* back-pointer to the device_node */ int pci_ext_config_space; /* for pci devices */ diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index d795c5d5789c..45ae1212ab8a 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -17,6 +17,8 @@ static inline gfp_t pgtable_gfp_flags(struct mm_struct *mm, gfp_t gfp) } #endif /* MODULE */ +#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) + #ifdef CONFIG_PPC_BOOK3S #include <asm/book3s/pgalloc.h> #else diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index afae9a336136..7d0d38f58243 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -66,22 +66,14 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, #ifndef CONFIG_TRANSPARENT_HUGEPAGE #define pmd_large(pmd) 0 #endif -pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, - bool *is_thp, unsigned *shift); -static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, - bool *is_thp, unsigned *shift) -{ - VM_WARN(!arch_irqs_disabled(), - "%s called with irq enabled\n", __func__); - return __find_linux_pte_or_hugepte(pgdir, ea, is_thp, shift); -} +/* can we use this in kvm */ unsigned long vmalloc_to_phys(void *vmalloc_addr); void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); void pgtable_cache_init(void); -#ifdef CONFIG_STRICT_KERNEL_RWX +#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_PPC32) void mark_initmem_nx(void); #else static inline void mark_initmem_nx(void) { } diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h index de9681034353..3e5cf251ad9a 100644 --- a/arch/powerpc/include/asm/pnv-pci.h +++ b/arch/powerpc/include/asm/pnv-pci.h @@ -26,6 +26,8 @@ extern int pnv_pci_get_presence_state(uint64_t id, uint8_t *state); extern int pnv_pci_get_power_state(uint64_t id, uint8_t *state); extern int pnv_pci_set_power_state(uint64_t id, uint8_t state, struct opal_msg *msg); +extern int pnv_pci_set_p2p(struct pci_dev *initiator, struct pci_dev *target, + u64 desc); int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode); int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq, diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index fa9ebaead91e..ce0930d68857 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -193,6 +193,7 @@ #define PPC_INST_CLRBHRB 0x7c00035c #define PPC_INST_COPY 0x7c20060c #define PPC_INST_CP_ABORT 0x7c00068c +#define PPC_INST_DARN 0x7c0005e6 #define PPC_INST_DCBA 0x7c0005ec #define PPC_INST_DCBA_MASK 0xfc0007fe #define PPC_INST_DCBAL 0x7c2005ec @@ -204,6 +205,8 @@ #define PPC_INST_ISEL_MASK 0xfc00003e #define PPC_INST_LDARX 0x7c0000a8 #define PPC_INST_STDCX 0x7c0001ad +#define PPC_INST_LQARX 0x7c000228 +#define PPC_INST_STQCX 0x7c00016d #define PPC_INST_LSWI 0x7c0004aa #define PPC_INST_LSWX 0x7c00042a #define PPC_INST_LWARX 0x7c000028 @@ -261,7 +264,7 @@ #define PPC_INST_TLBSRX_DOT 0x7c0006a5 #define PPC_INST_VPMSUMW 0x10000488 #define PPC_INST_VPMSUMD 0x100004c8 -#define PPC_INST_XXLOR 0xf0000510 +#define PPC_INST_XXLOR 0xf0000490 #define PPC_INST_XXSWAPD 0xf0000250 #define PPC_INST_XVCPSGNDP 0xf0000780 #define PPC_INST_TRECHKPT 0x7c0007dd @@ -395,16 +398,25 @@ #define PPC_CP_ABORT stringify_in_c(.long PPC_INST_CP_ABORT) #define PPC_COPY(a, b) stringify_in_c(.long PPC_INST_COPY | \ ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_DARN(t, l) stringify_in_c(.long PPC_INST_DARN | \ + ___PPC_RT(t) | \ + (((l) & 0x3) << 16)) #define PPC_DCBAL(a, b) stringify_in_c(.long PPC_INST_DCBAL | \ __PPC_RA(a) | __PPC_RB(b)) #define PPC_DCBZL(a, b) stringify_in_c(.long PPC_INST_DCBZL | \ __PPC_RA(a) | __PPC_RB(b)) +#define PPC_LQARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LQARX | \ + ___PPC_RT(t) | ___PPC_RA(a) | \ + ___PPC_RB(b) | __PPC_EH(eh)) #define PPC_LDARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LDARX | \ ___PPC_RT(t) | ___PPC_RA(a) | \ ___PPC_RB(b) | __PPC_EH(eh)) #define PPC_LWARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LWARX | \ ___PPC_RT(t) | ___PPC_RA(a) | \ ___PPC_RB(b) | __PPC_EH(eh)) +#define PPC_STQCX(t, a, b) stringify_in_c(.long PPC_INST_STQCX | \ + ___PPC_RT(t) | ___PPC_RA(a) | \ + ___PPC_RB(b)) #define PPC_MSGSND(b) stringify_in_c(.long PPC_INST_MSGSND | \ ___PPC_RB(b)) #define PPC_MSGSYNC stringify_in_c(.long PPC_INST_MSGSYNC) @@ -414,6 +426,8 @@ ___PPC_RB(b)) #define PPC_MSGCLRP(b) stringify_in_c(.long PPC_INST_MSGCLRP | \ ___PPC_RB(b)) +#define PPC_PASTE(a, b) stringify_in_c(.long PPC_INST_PASTE | \ + ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_POPCNTB(a, s) stringify_in_c(.long PPC_INST_POPCNTB | \ __PPC_RA(a) | __PPC_RS(s)) #define PPC_POPCNTD(a, s) stringify_in_c(.long PPC_INST_POPCNTD | \ diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 6baeeb9acd0d..36f3e41c9fbe 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -378,10 +378,16 @@ BEGIN_FTR_SECTION_NESTED(96); \ cmpwi dest,0; \ beq- 90b; \ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96) -#elif defined(CONFIG_8xx) -#define MFTB(dest) mftb dest #else -#define MFTB(dest) mfspr dest, SPRN_TBRL +#define MFTB(dest) MFTBL(dest) +#endif + +#ifdef CONFIG_PPC_8xx +#define MFTBL(dest) mftb dest +#define MFTBU(dest) mftbu dest +#else +#define MFTBL(dest) mfspr dest, SPRN_TBRL +#define MFTBU(dest) mfspr dest, SPRN_TBRU #endif #ifndef CONFIG_SMP @@ -411,7 +417,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) * and they must be used. */ -#if !defined(CONFIG_4xx) && !defined(CONFIG_8xx) +#if !defined(CONFIG_4xx) && !defined(CONFIG_PPC_8xx) #define tlbia \ li r4,1024; \ mtctr r4; \ @@ -439,7 +445,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) .machine push ; \ .machine "power4" ; \ lis scratch,0x60000000@h; \ - dcbt r0,scratch,0b01010; \ + dcbt 0,scratch,0b01010; \ .machine pop /* diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 35c00d7a0cf8..825bd5998701 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -159,7 +159,10 @@ struct of_drconf_cell { #define OV5_PFO_HW_842 0x1140 /* PFO Compression Accelerator */ #define OV5_PFO_HW_ENCR 0x1120 /* PFO Encryption Accelerator */ #define OV5_SUB_PROCESSORS 0x1501 /* 1,2,or 4 Sub-Processors supported */ -#define OV5_XIVE_EXPLOIT 0x1701 /* XIVE exploitation supported */ +#define OV5_XIVE_SUPPORT 0x17C0 /* XIVE Exploitation Support Mask */ +#define OV5_XIVE_LEGACY 0x1700 /* XIVE legacy mode Only */ +#define OV5_XIVE_EXPLOIT 0x1740 /* XIVE exploitation mode Only */ +#define OV5_XIVE_EITHER 0x1780 /* XIVE legacy or exploitation mode */ /* MMU Base Architecture */ #define OV5_MMU_SUPPORT 0x18C0 /* MMU Mode Support Mask */ #define OV5_MMU_HASH 0x1800 /* Hash MMU Only */ diff --git a/arch/powerpc/include/asm/pte-walk.h b/arch/powerpc/include/asm/pte-walk.h new file mode 100644 index 000000000000..2d633e9d686c --- /dev/null +++ b/arch/powerpc/include/asm/pte-walk.h @@ -0,0 +1,35 @@ +#ifndef _ASM_POWERPC_PTE_WALK_H +#define _ASM_POWERPC_PTE_WALK_H + +#include <linux/sched.h> + +/* Don't use this directly */ +extern pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, + bool *is_thp, unsigned *hshift); + +static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea, + bool *is_thp, unsigned *hshift) +{ + VM_WARN(!arch_irqs_disabled(), "%s called with irq enabled\n", __func__); + return __find_linux_pte(pgdir, ea, is_thp, hshift); +} + +static inline pte_t *find_init_mm_pte(unsigned long ea, unsigned *hshift) +{ + pgd_t *pgdir = init_mm.pgd; + return __find_linux_pte(pgdir, ea, NULL, hshift); +} +/* + * This is what we should always use. Any other lockless page table lookup needs + * careful audit against THP split. + */ +static inline pte_t *find_current_mm_pte(pgd_t *pgdir, unsigned long ea, + bool *is_thp, unsigned *hshift) +{ + VM_WARN(!arch_irqs_disabled(), "%s called with irq enabled\n", __func__); + VM_WARN(pgdir != current->mm->pgd, + "%s lock less page table lookup called on wrong mm\n", __func__); + return __find_linux_pte(pgdir, ea, is_thp, hshift); +} + +#endif /* _ASM_POWERPC_PTE_WALK_H */ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index a3b6575c7842..f92eaf7a4c0d 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -22,9 +22,9 @@ #include <asm/reg_fsl_emb.h> #endif -#ifdef CONFIG_8xx +#ifdef CONFIG_PPC_8xx #include <asm/reg_8xx.h> -#endif /* CONFIG_8xx */ +#endif /* CONFIG_PPC_8xx */ #define MSR_SF_LG 63 /* Enable 64 bit mode */ #define MSR_ISF_LG 61 /* Interrupt 64b mode valid on 630 */ @@ -135,7 +135,7 @@ #define MSR_KERNEL (MSR_ | MSR_64BIT) #define MSR_USER32 (MSR_ | MSR_PR | MSR_EE) #define MSR_USER64 (MSR_USER32 | MSR_64BIT) -#elif defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_8xx) +#elif defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) /* Default MSR for kernel mode. */ #define MSR_KERNEL (MSR_ME|MSR_RI|MSR_IR|MSR_DR) #define MSR_USER (MSR_KERNEL|MSR_PR|MSR_EE) @@ -272,16 +272,65 @@ #define SPRN_DAR 0x013 /* Data Address Register */ #define SPRN_DBCR 0x136 /* e300 Data Breakpoint Control Reg */ #define SPRN_DSISR 0x012 /* Data Storage Interrupt Status Register */ -#define DSISR_NOHPTE 0x40000000 /* no translation found */ -#define DSISR_PROTFAULT 0x08000000 /* protection fault */ -#define DSISR_BADACCESS 0x04000000 /* bad access to CI or G */ -#define DSISR_ISSTORE 0x02000000 /* access was a store */ -#define DSISR_DABRMATCH 0x00400000 /* hit data breakpoint */ -#define DSISR_NOSEGMENT 0x00200000 /* SLB miss */ -#define DSISR_KEYFAULT 0x00200000 /* Key fault */ -#define DSISR_UNSUPP_MMU 0x00080000 /* Unsupported MMU config */ -#define DSISR_SET_RC 0x00040000 /* Failed setting of R/C bits */ -#define DSISR_PGDIRFAULT 0x00020000 /* Fault on page directory */ +#define DSISR_BAD_DIRECT_ST 0x80000000 /* Obsolete: Direct store error */ +#define DSISR_NOHPTE 0x40000000 /* no translation found */ +#define DSISR_ATTR_CONFLICT 0x20000000 /* P9: Process vs. Partition attr */ +#define DSISR_NOEXEC_OR_G 0x10000000 /* Alias of SRR1 bit, see below */ +#define DSISR_PROTFAULT 0x08000000 /* protection fault */ +#define DSISR_BADACCESS 0x04000000 /* bad access to CI or G */ +#define DSISR_ISSTORE 0x02000000 /* access was a store */ +#define DSISR_DABRMATCH 0x00400000 /* hit data breakpoint */ +#define DSISR_NOSEGMENT 0x00200000 /* STAB miss (unsupported) */ +#define DSISR_KEYFAULT 0x00200000 /* Storage Key fault */ +#define DSISR_BAD_EXT_CTRL 0x00100000 /* Obsolete: External ctrl error */ +#define DSISR_UNSUPP_MMU 0x00080000 /* P9: Unsupported MMU config */ +#define DSISR_SET_RC 0x00040000 /* P9: Failed setting of R/C bits */ +#define DSISR_PRTABLE_FAULT 0x00020000 /* P9: Fault on process table */ +#define DSISR_ICSWX_NO_CT 0x00004000 /* P7: icswx unavailable cp type */ +#define DSISR_BAD_COPYPASTE 0x00000008 /* P9: Copy/Paste on wrong memtype */ +#define DSISR_BAD_AMO 0x00000004 /* P9: Incorrect AMO opcode */ +#define DSISR_BAD_CI_LDST 0x00000002 /* P8: Bad HV CI load/store */ + +/* + * DSISR_NOEXEC_OR_G doesn't actually exist. This bit is always + * 0 on DSIs. However, on ISIs, the corresponding bit in SRR1 + * indicates an attempt at executing from a no-execute PTE + * or segment or from a guarded page. + * + * We add a definition here for completeness as we alias + * DSISR and SRR1 in do_page_fault. + */ + +/* + * DSISR bits that are treated as a fault. Any bit set + * here will skip hash_page, and cause do_page_fault to + * trigger a SIGBUS or SIGSEGV: + */ +#define DSISR_BAD_FAULT_32S (DSISR_BAD_DIRECT_ST | \ + DSISR_BADACCESS | \ + DSISR_BAD_EXT_CTRL) +#define DSISR_BAD_FAULT_64S (DSISR_BAD_FAULT_32S | \ + DSISR_ATTR_CONFLICT | \ + DSISR_KEYFAULT | \ + DSISR_UNSUPP_MMU | \ + DSISR_PRTABLE_FAULT | \ + DSISR_ICSWX_NO_CT | \ + DSISR_BAD_COPYPASTE | \ + DSISR_BAD_AMO | \ + DSISR_BAD_CI_LDST) +/* + * These bits are equivalent in SRR1 and DSISR for 0x400 + * instruction access interrupts on Book3S + */ +#define DSISR_SRR1_MATCH_32S (DSISR_NOHPTE | \ + DSISR_NOEXEC_OR_G | \ + DSISR_PROTFAULT) +#define DSISR_SRR1_MATCH_64S (DSISR_SRR1_MATCH_32S | \ + DSISR_KEYFAULT | \ + DSISR_UNSUPP_MMU | \ + DSISR_SET_RC | \ + DSISR_PRTABLE_FAULT) + #define SPRN_TBRL 0x10C /* Time Base Read Lower Register (user, R/O) */ #define SPRN_TBRU 0x10D /* Time Base Read Upper Register (user, R/O) */ #define SPRN_CIR 0x11B /* Chip Information Register (hyper, R/0) */ @@ -307,6 +356,7 @@ #define SPRN_PMSR 0x355 /* Power Management Status Reg */ #define SPRN_PMMAR 0x356 /* Power Management Memory Activity Register */ #define SPRN_PSSCR 0x357 /* Processor Stop Status and Control Register (ISA 3.0) */ +#define SPRN_PSSCR_PR 0x337 /* PSSCR ISA 3.0, privileged mode access */ #define SPRN_PMCR 0x374 /* Power Management Control Register */ /* HFSCR and FSCR bit numbers are the same */ @@ -675,6 +725,7 @@ * may not be recoverable */ #define SRR1_WS_DEEPER 0x00020000 /* Some resources not maintained */ #define SRR1_WS_DEEP 0x00010000 /* All resources maintained */ +#define SRR1_PROGTM 0x00200000 /* TM Bad Thing */ #define SRR1_PROGFPE 0x00100000 /* Floating Point Enabled */ #define SRR1_PROGILL 0x00080000 /* Illegal instruction */ #define SRR1_PROGPRIV 0x00040000 /* Privileged instruction */ @@ -1114,7 +1165,7 @@ #endif #endif -#ifdef CONFIG_8xx +#ifdef CONFIG_PPC_8xx #define SPRN_SPRG_SCRATCH0 SPRN_SPRG0 #define SPRN_SPRG_SCRATCH1 SPRN_SPRG1 #define SPRN_SPRG_SCRATCH2 SPRN_SPRG2 @@ -1197,10 +1248,8 @@ * differentiated by the version number in the Communication Processor * Module (CPM). */ -#define PVR_821 0x00500000 -#define PVR_823 PVR_821 -#define PVR_850 PVR_821 -#define PVR_860 PVR_821 +#define PVR_8xx 0x00500000 + #define PVR_8240 0x00810100 #define PVR_8245 0x80811014 #define PVR_8260 PVR_8240 @@ -1295,12 +1344,12 @@ static inline void msr_check_and_clear(unsigned long bits) ".section __ftr_fixup,\"a\"\n" \ ".align 3\n" \ "98:\n" \ - " .llong %1\n" \ - " .llong %1\n" \ - " .llong 97b-98b\n" \ - " .llong 99b-98b\n" \ - " .llong 0\n" \ - " .llong 0\n" \ + " .8byte %1\n" \ + " .8byte %1\n" \ + " .8byte 97b-98b\n" \ + " .8byte 99b-98b\n" \ + " .8byte 0\n" \ + " .8byte 0\n" \ ".previous" \ : "=r" (rval) \ : "i" (CPU_FTR_CELL_TB_BUG), "i" (SPRN_TBRL) : "cr0"); \ @@ -1313,7 +1362,7 @@ static inline void msr_check_and_clear(unsigned long bits) #else /* __powerpc64__ */ -#if defined(CONFIG_8xx) +#if defined(CONFIG_PPC_8xx) #define mftbl() ({unsigned long rval; \ asm volatile("mftbl %0" : "=r" (rval)); rval;}) #define mftbu() ({unsigned long rval; \ diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 737e012ef56e..eb2a33d5df26 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -221,10 +221,7 @@ #define SPRN_CSRR0 SPRN_SRR2 /* Critical Save and Restore Register 0 */ #define SPRN_CSRR1 SPRN_SRR3 /* Critical Save and Restore Register 1 */ #endif - -#ifdef CONFIG_PPC_ICSWX #define SPRN_HACOP 0x15F /* Hypervisor Available Coprocessor Register */ -#endif /* Bit definitions for CCR1. */ #define CCR1_DPC 0x00000100 /* Disable L1 I-Cache/D-Cache parity checking */ diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 654d64c9f3ac..3a3fb0ca68f5 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -23,7 +23,6 @@ extern void reloc_got2(unsigned long); void check_for_initrd(void); void initmem_init(void); -void setup_panic(void); #define ARCH_PANIC_TIMEOUT 180 #ifdef CONFIG_PPC_PSERIES diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 8ea98504f900..fac963e10d39 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -97,6 +97,7 @@ static inline void set_hard_smp_processor_id(int cpu, int phys) #endif DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DECLARE_PER_CPU(cpumask_var_t, cpu_l2_cache_map); DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); static inline struct cpumask *cpu_sibling_mask(int cpu) @@ -109,6 +110,11 @@ static inline struct cpumask *cpu_core_mask(int cpu) return per_cpu(cpu_core_map, cpu); } +static inline struct cpumask *cpu_l2_cache_mask(int cpu) +{ + return per_cpu(cpu_l2_cache_map, cpu); +} + extern int cpu_to_core_id(int cpu); /* Since OpenPIC has only 4 IPIs, we use slightly different message numbers. diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 8c1b913de6d7..edbe571bcc54 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -170,39 +170,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) lock->slock = 0; } -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - arch_spinlock_t lock_val; - - smp_mb(); - - /* - * Atomically load and store back the lock value (unchanged). This - * ensures that our observation of the lock value is ordered with - * respect to other lock operations. - */ - __asm__ __volatile__( -"1: " PPC_LWARX(%0, 0, %2, 0) "\n" -" stwcx. %0, 0, %2\n" -" bne- 1b\n" - : "=&r" (lock_val), "+m" (*lock) - : "r" (lock) - : "cr0", "xer"); - - if (arch_spin_value_unlocked(lock_val)) - goto out; - - while (lock->slock) { - HMT_low(); - if (SHARED_PROCESSOR) - __spin_yield(lock); - } - HMT_medium(); - -out: - smp_mb(); -} - /* * Read-write spinlocks, allowing multiple readers * but only one writer. @@ -342,5 +309,8 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) #define arch_read_relax(lock) __rw_yield(lock) #define arch_write_relax(lock) __rw_yield(lock) +/* See include/linux/spinlock.h */ +#define smp_mb__after_spinlock() smp_mb() + #endif /* __KERNEL__ */ #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h index d3a42cc45a82..ab9d849644d0 100644 --- a/arch/powerpc/include/asm/sstep.h +++ b/arch/powerpc/include/asm/sstep.h @@ -23,12 +23,9 @@ struct pt_regs; #define IS_RFID(instr) (((instr) & 0xfc0007fe) == 0x4c000024) #define IS_RFI(instr) (((instr) & 0xfc0007fe) == 0x4c000064) -/* Emulate instructions that cause a transfer of control. */ -extern int emulate_step(struct pt_regs *regs, unsigned int instr); - enum instruction_type { COMPUTE, /* arith/logical/CR op, etc. */ - LOAD, + LOAD, /* load and store types need to be contiguous */ LOAD_MULTI, LOAD_FP, LOAD_VMX, @@ -55,10 +52,31 @@ enum instruction_type { #define INSTR_TYPE_MASK 0x1f +#define OP_IS_LOAD_STORE(type) (LOAD <= (type) && (type) <= STCX) + +/* Compute flags, ORed in with type */ +#define SETREG 0x20 +#define SETCC 0x40 +#define SETXER 0x80 + +/* Branch flags, ORed in with type */ +#define SETLK 0x20 +#define BRTAKEN 0x40 +#define DECCTR 0x80 + /* Load/store flags, ORed in with type */ #define SIGNEXT 0x20 #define UPDATE 0x40 /* matches bit in opcode 31 instructions */ #define BYTEREV 0x80 +#define FPCONV 0x100 + +/* Barrier type field, ORed in with type */ +#define BARRIER_MASK 0xe0 +#define BARRIER_SYNC 0x00 +#define BARRIER_ISYNC 0x20 +#define BARRIER_EIEIO 0x40 +#define BARRIER_LWSYNC 0x60 +#define BARRIER_PTESYNC 0x80 /* Cacheop values, ORed in with type */ #define CACHEOP_MASK 0x700 @@ -67,10 +85,17 @@ enum instruction_type { #define DCBTST 0x200 #define DCBT 0x300 #define ICBI 0x400 +#define DCBZ 0x500 + +/* VSX flags values */ +#define VSX_FPCONV 1 /* do floating point SP/DP conversion */ +#define VSX_SPLAT 2 /* store loaded value into all elements */ +#define VSX_LDLEFT 4 /* load VSX register from left */ +#define VSX_CHECK_VEC 8 /* check MSR_VEC not MSR_VSX for reg >= 32 */ /* Size field in type word */ -#define SIZE(n) ((n) << 8) -#define GETSIZE(w) ((w) >> 8) +#define SIZE(n) ((n) << 12) +#define GETSIZE(w) ((w) >> 12) #define MKOP(t, f, s) ((t) | (f) | SIZE(s)) @@ -83,7 +108,63 @@ struct instruction_op { int update_reg; /* For MFSPR */ int spr; + u32 ccval; + u32 xerval; + u8 element_size; /* for VSX/VMX loads/stores */ + u8 vsx_flags; +}; + +union vsx_reg { + u8 b[16]; + u16 h[8]; + u32 w[4]; + unsigned long d[2]; + float fp[4]; + double dp[2]; + __vector128 v; }; -extern int analyse_instr(struct instruction_op *op, struct pt_regs *regs, +/* + * Decode an instruction, and return information about it in *op + * without changing *regs. + * + * Return value is 1 if the instruction can be emulated just by + * updating *regs with the information in *op, -1 if we need the + * GPRs but *regs doesn't contain the full register set, or 0 + * otherwise. + */ +extern int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, unsigned int instr); + +/* + * Emulate an instruction that can be executed just by updating + * fields in *regs. + */ +void emulate_update_regs(struct pt_regs *reg, struct instruction_op *op); + +/* + * Emulate instructions that cause a transfer of control, + * arithmetic/logical instructions, loads and stores, + * cache operations and barriers. + * + * Returns 1 if the instruction was emulated successfully, + * 0 if it could not be emulated, or -1 for an instruction that + * should not be emulated (rfid, mtmsrd clearing MSR_RI, etc.). + */ +extern int emulate_step(struct pt_regs *regs, unsigned int instr); + +/* + * Emulate a load or store instruction by reading/writing the + * memory of the current process. FP/VMX/VSX registers are assumed + * to hold live values if the appropriate enable bit in regs->msr is + * set; otherwise this will use the saved values in the thread struct + * for user-mode accesses. + */ +extern int emulate_loadstore(struct pt_regs *regs, struct instruction_op *op); + +extern void emulate_vsx_load(struct instruction_op *op, union vsx_reg *reg, + const void *mem, bool cross_endian); +extern void emulate_vsx_store(struct instruction_op *op, + const union vsx_reg *reg, void *mem, + bool cross_endian); +extern int emulate_dcbz(unsigned long ea, struct pt_regs *regs); diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h index da3cdffca440..cc9addefb51c 100644 --- a/arch/powerpc/include/asm/string.h +++ b/arch/powerpc/include/asm/string.h @@ -10,6 +10,7 @@ #define __HAVE_ARCH_MEMMOVE #define __HAVE_ARCH_MEMCMP #define __HAVE_ARCH_MEMCHR +#define __HAVE_ARCH_MEMSET16 extern char * strcpy(char *,const char *); extern char * strncpy(char *,const char *, __kernel_size_t); @@ -23,6 +24,31 @@ extern void * memmove(void *,const void *,__kernel_size_t); extern int memcmp(const void *,const void *,__kernel_size_t); extern void * memchr(const void *,int,__kernel_size_t); +#ifdef CONFIG_PPC64 +#define __HAVE_ARCH_MEMSET32 +#define __HAVE_ARCH_MEMSET64 + +extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t); +extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t); +extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t); + +static inline void *memset16(uint16_t *p, uint16_t v, __kernel_size_t n) +{ + return __memset16(p, v, n * 2); +} + +static inline void *memset32(uint32_t *p, uint32_t v, __kernel_size_t n) +{ + return __memset32(p, v, n * 4); +} + +static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n) +{ + return __memset64(p, v, n * 8); +} +#else +extern void *memset16(uint16_t *, uint16_t, __kernel_size_t); +#endif #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_STRING_H */ diff --git a/arch/powerpc/include/asm/timex.h b/arch/powerpc/include/asm/timex.h index 2cf846edb3fc..cb61eae5b7ed 100644 --- a/arch/powerpc/include/asm/timex.h +++ b/arch/powerpc/include/asm/timex.h @@ -29,7 +29,7 @@ static inline cycles_t get_cycles(void) ret = 0; __asm__ __volatile__( -#ifdef CONFIG_8xx +#ifdef CONFIG_PPC_8xx "97: mftb %0\n" #else "97: mfspr %0, %2\n" @@ -45,11 +45,7 @@ static inline cycles_t get_cycles(void) " .long 0\n" " .long 0\n" ".previous" -#ifdef CONFIG_8xx - : "=r" (ret) : "i" (CPU_FTR_601)); -#else : "=r" (ret) : "i" (CPU_FTR_601), "i" (SPRN_TBRL)); -#endif return ret; #endif } diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index 609557569f65..a7eabff27a0f 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -69,13 +69,22 @@ static inline int mm_is_core_local(struct mm_struct *mm) topology_sibling_cpumask(smp_processor_id())); } +#ifdef CONFIG_PPC_BOOK3S_64 +static inline int mm_is_thread_local(struct mm_struct *mm) +{ + if (atomic_read(&mm->context.active_cpus) > 1) + return false; + return cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm)); +} +#else /* CONFIG_PPC_BOOK3S_64 */ static inline int mm_is_thread_local(struct mm_struct *mm) { return cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())); } +#endif /* !CONFIG_PPC_BOOK3S_64 */ -#else +#else /* CONFIG_SMP */ static inline int mm_is_core_local(struct mm_struct *mm) { return 1; diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index dc4e15937ccf..2d84bca8d053 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -16,8 +16,6 @@ struct device_node; #include <asm/mmzone.h> -#define parent_node(node) (node) - #define cpumask_of_node(node) ((node) == -1 ? \ cpu_all_mask : \ node_to_cpumask_map[node]) diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h new file mode 100644 index 000000000000..fd5963acd658 --- /dev/null +++ b/arch/powerpc/include/asm/vas.h @@ -0,0 +1,159 @@ +/* + * Copyright 2016-17 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ASM_POWERPC_VAS_H +#define _ASM_POWERPC_VAS_H + +/* + * Min and max FIFO sizes are based on Version 1.05 Section 3.1.4.25 + * (Local FIFO Size Register) of the VAS workbook. + */ +#define VAS_RX_FIFO_SIZE_MIN (1 << 10) /* 1KB */ +#define VAS_RX_FIFO_SIZE_MAX (8 << 20) /* 8MB */ + +/* + * Threshold Control Mode: Have paste operation fail if the number of + * requests in receive FIFO exceeds a threshold. + * + * NOTE: No special error code yet if paste is rejected because of these + * limits. So users can't distinguish between this and other errors. + */ +#define VAS_THRESH_DISABLED 0 +#define VAS_THRESH_FIFO_GT_HALF_FULL 1 +#define VAS_THRESH_FIFO_GT_QTR_FULL 2 +#define VAS_THRESH_FIFO_GT_EIGHTH_FULL 3 + +/* + * Get/Set bit fields + */ +#define GET_FIELD(m, v) (((v) & (m)) >> MASK_LSH(m)) +#define MASK_LSH(m) (__builtin_ffsl(m) - 1) +#define SET_FIELD(m, v, val) \ + (((v) & ~(m)) | ((((typeof(v))(val)) << MASK_LSH(m)) & (m))) + +/* + * Co-processor Engine type. + */ +enum vas_cop_type { + VAS_COP_TYPE_FAULT, + VAS_COP_TYPE_842, + VAS_COP_TYPE_842_HIPRI, + VAS_COP_TYPE_GZIP, + VAS_COP_TYPE_GZIP_HIPRI, + VAS_COP_TYPE_FTW, + VAS_COP_TYPE_MAX, +}; + +/* + * Receive window attributes specified by the (in-kernel) owner of window. + */ +struct vas_rx_win_attr { + void *rx_fifo; + int rx_fifo_size; + int wcreds_max; + + bool pin_win; + bool rej_no_credit; + bool tx_wcred_mode; + bool rx_wcred_mode; + bool tx_win_ord_mode; + bool rx_win_ord_mode; + bool data_stamp; + bool nx_win; + bool fault_win; + bool user_win; + bool notify_disable; + bool intr_disable; + bool notify_early; + + int lnotify_lpid; + int lnotify_pid; + int lnotify_tid; + u32 pswid; + + int tc_mode; +}; + +/* + * Window attributes specified by the in-kernel owner of a send window. + */ +struct vas_tx_win_attr { + enum vas_cop_type cop; + int wcreds_max; + int lpid; + int pidr; /* hardware PID (from SPRN_PID) */ + int pid; /* linux process id */ + int pswid; + int rsvd_txbuf_count; + int tc_mode; + + bool user_win; + bool pin_win; + bool rej_no_credit; + bool rsvd_txbuf_enable; + bool tx_wcred_mode; + bool rx_wcred_mode; + bool tx_win_ord_mode; + bool rx_win_ord_mode; +}; + +/* + * Helper to initialize receive window attributes to defaults for an + * NX window. + */ +void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, enum vas_cop_type cop); + +/* + * Open a VAS receive window for the instance of VAS identified by @vasid + * Use @attr to initialize the attributes of the window. + * + * Return a handle to the window or ERR_PTR() on error. + */ +struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop, + struct vas_rx_win_attr *attr); + +/* + * Helper to initialize send window attributes to defaults for an NX window. + */ +extern void vas_init_tx_win_attr(struct vas_tx_win_attr *txattr, + enum vas_cop_type cop); + +/* + * Open a VAS send window for the instance of VAS identified by @vasid + * and the co-processor type @cop. Use @attr to initialize attributes + * of the window. + * + * Note: The instance of VAS must already have an open receive window for + * the coprocessor type @cop. + * + * Return a handle to the send window or ERR_PTR() on error. + */ +struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, + struct vas_tx_win_attr *attr); + +/* + * Close the send or receive window identified by @win. For receive windows + * return -EAGAIN if there are active send windows attached to this receive + * window. + */ +int vas_win_close(struct vas_window *win); + +/* + * Copy the co-processor request block (CRB) @crb into the local L2 cache. + */ +int vas_copy_crb(void *crb, int offset); + +/* + * Paste a previously copied CRB (see vas_copy_crb()) from the L2 cache to + * the hardware address associated with the window @win. @re is expected/ + * assumed to be true for NX windows. + */ +int vas_paste_crb(struct vas_window *win, int offset, bool re); + +#endif /* __ASM_POWERPC_VAS_H */ diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index c23ff4389ca2..371fbebf1ec9 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -45,6 +45,7 @@ struct xive_irq_data { void __iomem *trig_mmio; u32 esb_shift; int src_chip; + u32 hw_irq; /* Setup/used by frontend */ int target; @@ -55,6 +56,7 @@ struct xive_irq_data { #define XIVE_IRQ_FLAG_SHIFT_BUG 0x04 #define XIVE_IRQ_FLAG_MASK_FW 0x08 #define XIVE_IRQ_FLAG_EOI_FW 0x10 +#define XIVE_IRQ_FLAG_H_INT_ESB 0x20 #define XIVE_INVALID_CHIP_ID -1 @@ -110,11 +112,13 @@ extern bool __xive_enabled; static inline bool xive_enabled(void) { return __xive_enabled; } +extern bool xive_spapr_init(void); extern bool xive_native_init(void); extern void xive_smp_probe(void); extern int xive_smp_prepare_cpu(unsigned int cpu); extern void xive_smp_setup_cpu(void); extern void xive_smp_disable_cpu(void); +extern void xive_teardown_cpu(void); extern void xive_kexec_teardown_cpu(int secondary); extern void xive_shutdown(void); extern void xive_flush_interrupt(void); @@ -147,6 +151,7 @@ extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id) static inline bool xive_enabled(void) { return false; } +static inline bool xive_spapr_init(void) { return false; } static inline bool xive_native_init(void) { return false; } static inline void xive_smp_probe(void) { } extern inline int xive_smp_prepare_cpu(unsigned int cpu) { return -EINVAL; } diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h index ab45cc2f3101..03c06ba7464f 100644 --- a/arch/powerpc/include/uapi/asm/mman.h +++ b/arch/powerpc/include/uapi/asm/mman.h @@ -29,20 +29,4 @@ #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ -/* - * When MAP_HUGETLB is set, bits [26:31] of the flags argument to mmap(2), - * encode the log2 of the huge page size. A value of zero indicates that the - * default huge page size should be used. To use a non-default huge page size, - * one of these defines can be used, or the size can be encoded by hand. Note - * that on most systems only a subset, or possibly none, of these sizes will be - * available. - */ -#define MAP_HUGE_512KB (19 << MAP_HUGE_SHIFT) /* 512KB HugeTLB Page */ -#define MAP_HUGE_1MB (20 << MAP_HUGE_SHIFT) /* 1MB HugeTLB Page */ -#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) /* 2MB HugeTLB Page */ -#define MAP_HUGE_8MB (23 << MAP_HUGE_SHIFT) /* 8MB HugeTLB Page */ -#define MAP_HUGE_16MB (24 << MAP_HUGE_SHIFT) /* 16MB HugeTLB Page */ -#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) /* 1GB HugeTLB Page */ -#define MAP_HUGE_16GB (34 << MAP_HUGE_SHIFT) /* 16GB HugeTLB Page */ - #endif /* _UAPI_ASM_POWERPC_MMAN_H */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 4aa7c147e447..91960f83039c 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -38,7 +38,7 @@ obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o nvram_64.o firmware.o obj-$(CONFIG_VDSO32) += vdso32/ -obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog.o +obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o @@ -83,7 +83,7 @@ extra-y := head_$(BITS).o extra-$(CONFIG_40x) := head_40x.o extra-$(CONFIG_44x) := head_44x.o extra-$(CONFIG_FSL_BOOKE) := head_fsl_booke.o -extra-$(CONFIG_8xx) := head_8xx.o +extra-$(CONFIG_PPC_8xx) := head_8xx.o extra-y += vmlinux.lds obj-$(CONFIG_RELOCATABLE) += reloc_$(BITS).o diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index ec7a8b099dd9..26b9994d27ee 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -27,6 +27,7 @@ #include <asm/switch_to.h> #include <asm/disassemble.h> #include <asm/cpu_has_feature.h> +#include <asm/sstep.h> struct aligninfo { unsigned char len; @@ -40,364 +41,9 @@ struct aligninfo { #define LD 0 /* load */ #define ST 1 /* store */ #define SE 2 /* sign-extend value, or FP ld/st as word */ -#define F 4 /* to/from fp regs */ -#define U 8 /* update index register */ -#define M 0x10 /* multiple load/store */ #define SW 0x20 /* byte swap */ -#define S 0x40 /* single-precision fp or... */ -#define SX 0x40 /* ... byte count in XER */ -#define HARD 0x80 /* string, stwcx. */ #define E4 0x40 /* SPE endianness is word */ #define E8 0x80 /* SPE endianness is double word */ -#define SPLT 0x80 /* VSX SPLAT load */ - -/* DSISR bits reported for a DCBZ instruction: */ -#define DCBZ 0x5f /* 8xx/82xx dcbz faults when cache not enabled */ - -/* - * The PowerPC stores certain bits of the instruction that caused the - * alignment exception in the DSISR register. This array maps those - * bits to information about the operand length and what the - * instruction would do. - */ -static struct aligninfo aligninfo[128] = { - { 4, LD }, /* 00 0 0000: lwz / lwarx */ - INVALID, /* 00 0 0001 */ - { 4, ST }, /* 00 0 0010: stw */ - INVALID, /* 00 0 0011 */ - { 2, LD }, /* 00 0 0100: lhz */ - { 2, LD+SE }, /* 00 0 0101: lha */ - { 2, ST }, /* 00 0 0110: sth */ - { 4, LD+M }, /* 00 0 0111: lmw */ - { 4, LD+F+S }, /* 00 0 1000: lfs */ - { 8, LD+F }, /* 00 0 1001: lfd */ - { 4, ST+F+S }, /* 00 0 1010: stfs */ - { 8, ST+F }, /* 00 0 1011: stfd */ - { 16, LD }, /* 00 0 1100: lq */ - { 8, LD }, /* 00 0 1101: ld/ldu/lwa */ - INVALID, /* 00 0 1110 */ - { 8, ST }, /* 00 0 1111: std/stdu */ - { 4, LD+U }, /* 00 1 0000: lwzu */ - INVALID, /* 00 1 0001 */ - { 4, ST+U }, /* 00 1 0010: stwu */ - INVALID, /* 00 1 0011 */ - { 2, LD+U }, /* 00 1 0100: lhzu */ - { 2, LD+SE+U }, /* 00 1 0101: lhau */ - { 2, ST+U }, /* 00 1 0110: sthu */ - { 4, ST+M }, /* 00 1 0111: stmw */ - { 4, LD+F+S+U }, /* 00 1 1000: lfsu */ - { 8, LD+F+U }, /* 00 1 1001: lfdu */ - { 4, ST+F+S+U }, /* 00 1 1010: stfsu */ - { 8, ST+F+U }, /* 00 1 1011: stfdu */ - { 16, LD+F }, /* 00 1 1100: lfdp */ - INVALID, /* 00 1 1101 */ - { 16, ST+F }, /* 00 1 1110: stfdp */ - INVALID, /* 00 1 1111 */ - { 8, LD }, /* 01 0 0000: ldx */ - INVALID, /* 01 0 0001 */ - { 8, ST }, /* 01 0 0010: stdx */ - INVALID, /* 01 0 0011 */ - INVALID, /* 01 0 0100 */ - { 4, LD+SE }, /* 01 0 0101: lwax */ - INVALID, /* 01 0 0110 */ - INVALID, /* 01 0 0111 */ - { 4, LD+M+HARD+SX }, /* 01 0 1000: lswx */ - { 4, LD+M+HARD }, /* 01 0 1001: lswi */ - { 4, ST+M+HARD+SX }, /* 01 0 1010: stswx */ - { 4, ST+M+HARD }, /* 01 0 1011: stswi */ - INVALID, /* 01 0 1100 */ - { 8, LD+U }, /* 01 0 1101: ldu */ - INVALID, /* 01 0 1110 */ - { 8, ST+U }, /* 01 0 1111: stdu */ - { 8, LD+U }, /* 01 1 0000: ldux */ - INVALID, /* 01 1 0001 */ - { 8, ST+U }, /* 01 1 0010: stdux */ - INVALID, /* 01 1 0011 */ - INVALID, /* 01 1 0100 */ - { 4, LD+SE+U }, /* 01 1 0101: lwaux */ - INVALID, /* 01 1 0110 */ - INVALID, /* 01 1 0111 */ - INVALID, /* 01 1 1000 */ - INVALID, /* 01 1 1001 */ - INVALID, /* 01 1 1010 */ - INVALID, /* 01 1 1011 */ - INVALID, /* 01 1 1100 */ - INVALID, /* 01 1 1101 */ - INVALID, /* 01 1 1110 */ - INVALID, /* 01 1 1111 */ - INVALID, /* 10 0 0000 */ - INVALID, /* 10 0 0001 */ - INVALID, /* 10 0 0010: stwcx. */ - INVALID, /* 10 0 0011 */ - INVALID, /* 10 0 0100 */ - INVALID, /* 10 0 0101 */ - INVALID, /* 10 0 0110 */ - INVALID, /* 10 0 0111 */ - { 4, LD+SW }, /* 10 0 1000: lwbrx */ - INVALID, /* 10 0 1001 */ - { 4, ST+SW }, /* 10 0 1010: stwbrx */ - INVALID, /* 10 0 1011 */ - { 2, LD+SW }, /* 10 0 1100: lhbrx */ - { 4, LD+SE }, /* 10 0 1101 lwa */ - { 2, ST+SW }, /* 10 0 1110: sthbrx */ - { 16, ST }, /* 10 0 1111: stq */ - INVALID, /* 10 1 0000 */ - INVALID, /* 10 1 0001 */ - INVALID, /* 10 1 0010 */ - INVALID, /* 10 1 0011 */ - INVALID, /* 10 1 0100 */ - INVALID, /* 10 1 0101 */ - INVALID, /* 10 1 0110 */ - INVALID, /* 10 1 0111 */ - INVALID, /* 10 1 1000 */ - INVALID, /* 10 1 1001 */ - INVALID, /* 10 1 1010 */ - INVALID, /* 10 1 1011 */ - INVALID, /* 10 1 1100 */ - INVALID, /* 10 1 1101 */ - INVALID, /* 10 1 1110 */ - { 0, ST+HARD }, /* 10 1 1111: dcbz */ - { 4, LD }, /* 11 0 0000: lwzx */ - INVALID, /* 11 0 0001 */ - { 4, ST }, /* 11 0 0010: stwx */ - INVALID, /* 11 0 0011 */ - { 2, LD }, /* 11 0 0100: lhzx */ - { 2, LD+SE }, /* 11 0 0101: lhax */ - { 2, ST }, /* 11 0 0110: sthx */ - INVALID, /* 11 0 0111 */ - { 4, LD+F+S }, /* 11 0 1000: lfsx */ - { 8, LD+F }, /* 11 0 1001: lfdx */ - { 4, ST+F+S }, /* 11 0 1010: stfsx */ - { 8, ST+F }, /* 11 0 1011: stfdx */ - { 16, LD+F }, /* 11 0 1100: lfdpx */ - { 4, LD+F+SE }, /* 11 0 1101: lfiwax */ - { 16, ST+F }, /* 11 0 1110: stfdpx */ - { 4, ST+F }, /* 11 0 1111: stfiwx */ - { 4, LD+U }, /* 11 1 0000: lwzux */ - INVALID, /* 11 1 0001 */ - { 4, ST+U }, /* 11 1 0010: stwux */ - INVALID, /* 11 1 0011 */ - { 2, LD+U }, /* 11 1 0100: lhzux */ - { 2, LD+SE+U }, /* 11 1 0101: lhaux */ - { 2, ST+U }, /* 11 1 0110: sthux */ - INVALID, /* 11 1 0111 */ - { 4, LD+F+S+U }, /* 11 1 1000: lfsux */ - { 8, LD+F+U }, /* 11 1 1001: lfdux */ - { 4, ST+F+S+U }, /* 11 1 1010: stfsux */ - { 8, ST+F+U }, /* 11 1 1011: stfdux */ - INVALID, /* 11 1 1100 */ - { 4, LD+F }, /* 11 1 1101: lfiwzx */ - INVALID, /* 11 1 1110 */ - INVALID, /* 11 1 1111 */ -}; - -/* - * The dcbz (data cache block zero) instruction - * gives an alignment fault if used on non-cacheable - * memory. We handle the fault mainly for the - * case when we are running with the cache disabled - * for debugging. - */ -static int emulate_dcbz(struct pt_regs *regs, unsigned char __user *addr) -{ - long __user *p; - int i, size; - -#ifdef __powerpc64__ - size = ppc64_caches.l1d.block_size; -#else - size = L1_CACHE_BYTES; -#endif - p = (long __user *) (regs->dar & -size); - if (user_mode(regs) && !access_ok(VERIFY_WRITE, p, size)) - return -EFAULT; - for (i = 0; i < size / sizeof(long); ++i) - if (__put_user_inatomic(0, p+i)) - return -EFAULT; - return 1; -} - -/* - * Emulate load & store multiple instructions - * On 64-bit machines, these instructions only affect/use the - * bottom 4 bytes of each register, and the loads clear the - * top 4 bytes of the affected register. - */ -#ifdef __BIG_ENDIAN__ -#ifdef CONFIG_PPC64 -#define REG_BYTE(rp, i) *((u8 *)((rp) + ((i) >> 2)) + ((i) & 3) + 4) -#else -#define REG_BYTE(rp, i) *((u8 *)(rp) + (i)) -#endif -#else -#define REG_BYTE(rp, i) (*(((u8 *)((rp) + ((i)>>2)) + ((i)&3)))) -#endif - -#define SWIZ_PTR(p) ((unsigned char __user *)((p) ^ swiz)) - -static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr, - unsigned int reg, unsigned int nb, - unsigned int flags, unsigned int instr, - unsigned long swiz) -{ - unsigned long *rptr; - unsigned int nb0, i, bswiz; - unsigned long p; - - /* - * We do not try to emulate 8 bytes multiple as they aren't really - * available in our operating environments and we don't try to - * emulate multiples operations in kernel land as they should never - * be used/generated there at least not on unaligned boundaries - */ - if (unlikely((nb > 4) || !user_mode(regs))) - return 0; - - /* lmw, stmw, lswi/x, stswi/x */ - nb0 = 0; - if (flags & HARD) { - if (flags & SX) { - nb = regs->xer & 127; - if (nb == 0) - return 1; - } else { - unsigned long pc = regs->nip ^ (swiz & 4); - - if (__get_user_inatomic(instr, - (unsigned int __user *)pc)) - return -EFAULT; - if (swiz == 0 && (flags & SW)) - instr = cpu_to_le32(instr); - nb = (instr >> 11) & 0x1f; - if (nb == 0) - nb = 32; - } - if (nb + reg * 4 > 128) { - nb0 = nb + reg * 4 - 128; - nb = 128 - reg * 4; - } -#ifdef __LITTLE_ENDIAN__ - /* - * String instructions are endian neutral but the code - * below is not. Force byte swapping on so that the - * effects of swizzling are undone in the load/store - * loops below. - */ - flags ^= SW; -#endif - } else { - /* lwm, stmw */ - nb = (32 - reg) * 4; - } - - if (!access_ok((flags & ST ? VERIFY_WRITE: VERIFY_READ), addr, nb+nb0)) - return -EFAULT; /* bad address */ - - rptr = ®s->gpr[reg]; - p = (unsigned long) addr; - bswiz = (flags & SW)? 3: 0; - - if (!(flags & ST)) { - /* - * This zeroes the top 4 bytes of the affected registers - * in 64-bit mode, and also zeroes out any remaining - * bytes of the last register for lsw*. - */ - memset(rptr, 0, ((nb + 3) / 4) * sizeof(unsigned long)); - if (nb0 > 0) - memset(®s->gpr[0], 0, - ((nb0 + 3) / 4) * sizeof(unsigned long)); - - for (i = 0; i < nb; ++i, ++p) - if (__get_user_inatomic(REG_BYTE(rptr, i ^ bswiz), - SWIZ_PTR(p))) - return -EFAULT; - if (nb0 > 0) { - rptr = ®s->gpr[0]; - addr += nb; - for (i = 0; i < nb0; ++i, ++p) - if (__get_user_inatomic(REG_BYTE(rptr, - i ^ bswiz), - SWIZ_PTR(p))) - return -EFAULT; - } - - } else { - for (i = 0; i < nb; ++i, ++p) - if (__put_user_inatomic(REG_BYTE(rptr, i ^ bswiz), - SWIZ_PTR(p))) - return -EFAULT; - if (nb0 > 0) { - rptr = ®s->gpr[0]; - addr += nb; - for (i = 0; i < nb0; ++i, ++p) - if (__put_user_inatomic(REG_BYTE(rptr, - i ^ bswiz), - SWIZ_PTR(p))) - return -EFAULT; - } - } - return 1; -} - -/* - * Emulate floating-point pair loads and stores. - * Only POWER6 has these instructions, and it does true little-endian, - * so we don't need the address swizzling. - */ -static int emulate_fp_pair(unsigned char __user *addr, unsigned int reg, - unsigned int flags) -{ - char *ptr0 = (char *) ¤t->thread.TS_FPR(reg); - char *ptr1 = (char *) ¤t->thread.TS_FPR(reg+1); - int i, ret, sw = 0; - - if (reg & 1) - return 0; /* invalid form: FRS/FRT must be even */ - if (flags & SW) - sw = 7; - ret = 0; - for (i = 0; i < 8; ++i) { - if (!(flags & ST)) { - ret |= __get_user(ptr0[i^sw], addr + i); - ret |= __get_user(ptr1[i^sw], addr + i + 8); - } else { - ret |= __put_user(ptr0[i^sw], addr + i); - ret |= __put_user(ptr1[i^sw], addr + i + 8); - } - } - if (ret) - return -EFAULT; - return 1; /* exception handled and fixed up */ -} - -#ifdef CONFIG_PPC64 -static int emulate_lq_stq(struct pt_regs *regs, unsigned char __user *addr, - unsigned int reg, unsigned int flags) -{ - char *ptr0 = (char *)®s->gpr[reg]; - char *ptr1 = (char *)®s->gpr[reg+1]; - int i, ret, sw = 0; - - if (reg & 1) - return 0; /* invalid form: GPR must be even */ - if (flags & SW) - sw = 7; - ret = 0; - for (i = 0; i < 8; ++i) { - if (!(flags & ST)) { - ret |= __get_user(ptr0[i^sw], addr + i); - ret |= __get_user(ptr1[i^sw], addr + i + 8); - } else { - ret |= __put_user(ptr0[i^sw], addr + i); - ret |= __put_user(ptr1[i^sw], addr + i + 8); - } - } - if (ret) - return -EFAULT; - return 1; /* exception handled and fixed up */ -} -#endif /* CONFIG_PPC64 */ #ifdef CONFIG_SPE @@ -636,133 +282,21 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg, } #endif /* CONFIG_SPE */ -#ifdef CONFIG_VSX -/* - * Emulate VSX instructions... - */ -static int emulate_vsx(unsigned char __user *addr, unsigned int reg, - unsigned int areg, struct pt_regs *regs, - unsigned int flags, unsigned int length, - unsigned int elsize) -{ - char *ptr; - unsigned long *lptr; - int ret = 0; - int sw = 0; - int i, j; - - /* userland only */ - if (unlikely(!user_mode(regs))) - return 0; - - flush_vsx_to_thread(current); - - if (reg < 32) - ptr = (char *) ¤t->thread.fp_state.fpr[reg][0]; - else - ptr = (char *) ¤t->thread.vr_state.vr[reg - 32]; - - lptr = (unsigned long *) ptr; - -#ifdef __LITTLE_ENDIAN__ - if (flags & SW) { - elsize = length; - sw = length-1; - } else { - /* - * The elements are BE ordered, even in LE mode, so process - * them in reverse order. - */ - addr += length - elsize; - - /* 8 byte memory accesses go in the top 8 bytes of the VR */ - if (length == 8) - ptr += 8; - } -#else - if (flags & SW) - sw = elsize-1; -#endif - - for (j = 0; j < length; j += elsize) { - for (i = 0; i < elsize; ++i) { - if (flags & ST) - ret |= __put_user(ptr[i^sw], addr + i); - else - ret |= __get_user(ptr[i^sw], addr + i); - } - ptr += elsize; -#ifdef __LITTLE_ENDIAN__ - addr -= elsize; -#else - addr += elsize; -#endif - } - -#ifdef __BIG_ENDIAN__ -#define VSX_HI 0 -#define VSX_LO 1 -#else -#define VSX_HI 1 -#define VSX_LO 0 -#endif - - if (!ret) { - if (flags & U) - regs->gpr[areg] = regs->dar; - - /* Splat load copies the same data to top and bottom 8 bytes */ - if (flags & SPLT) - lptr[VSX_LO] = lptr[VSX_HI]; - /* For 8 byte loads, zero the low 8 bytes */ - else if (!(flags & ST) && (8 == length)) - lptr[VSX_LO] = 0; - } else - return -EFAULT; - - return 1; -} -#endif - /* * Called on alignment exception. Attempts to fixup * * Return 1 on success * Return 0 if unable to handle the interrupt * Return -EFAULT if data address is bad + * Other negative return values indicate that the instruction can't + * be emulated, and the process should be given a SIGBUS. */ int fix_alignment(struct pt_regs *regs) { - unsigned int instr, nb, flags, instruction = 0; - unsigned int reg, areg; - unsigned int dsisr; - unsigned char __user *addr; - unsigned long p, swiz; - int ret, i; - union data { - u64 ll; - double dd; - unsigned char v[8]; - struct { -#ifdef __LITTLE_ENDIAN__ - int low32; - unsigned hi32; -#else - unsigned hi32; - int low32; -#endif - } x32; - struct { -#ifdef __LITTLE_ENDIAN__ - short low16; - unsigned char hi48[6]; -#else - unsigned char hi48[6]; - short low16; -#endif - } x16; - } data; + unsigned int instr; + struct instruction_op op; + int r, type; /* * We require a complete register set, if not, then our assembly @@ -770,121 +304,23 @@ int fix_alignment(struct pt_regs *regs) */ CHECK_FULL_REGS(regs); - dsisr = regs->dsisr; - - /* Some processors don't provide us with a DSISR we can use here, - * let's make one up from the instruction - */ - if (cpu_has_feature(CPU_FTR_NODSISRALIGN)) { - unsigned long pc = regs->nip; - - if (cpu_has_feature(CPU_FTR_PPC_LE) && (regs->msr & MSR_LE)) - pc ^= 4; - if (unlikely(__get_user_inatomic(instr, - (unsigned int __user *)pc))) - return -EFAULT; - if (cpu_has_feature(CPU_FTR_REAL_LE) && (regs->msr & MSR_LE)) - instr = cpu_to_le32(instr); - dsisr = make_dsisr(instr); - instruction = instr; + if (unlikely(__get_user(instr, (unsigned int __user *)regs->nip))) + return -EFAULT; + if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) { + /* We don't handle PPC little-endian any more... */ + if (cpu_has_feature(CPU_FTR_PPC_LE)) + return -EIO; + instr = swab32(instr); } - /* extract the operation and registers from the dsisr */ - reg = (dsisr >> 5) & 0x1f; /* source/dest register */ - areg = dsisr & 0x1f; /* register to update */ - #ifdef CONFIG_SPE if ((instr >> 26) == 0x4) { + int reg = (instr >> 21) & 0x1f; PPC_WARN_ALIGNMENT(spe, regs); return emulate_spe(regs, reg, instr); } #endif - instr = (dsisr >> 10) & 0x7f; - instr |= (dsisr >> 13) & 0x60; - - /* Lookup the operation in our table */ - nb = aligninfo[instr].len; - flags = aligninfo[instr].flags; - - /* - * Handle some cases which give overlaps in the DSISR values. - */ - if (IS_XFORM(instruction)) { - switch (get_xop(instruction)) { - case 532: /* ldbrx */ - nb = 8; - flags = LD+SW; - break; - case 660: /* stdbrx */ - nb = 8; - flags = ST+SW; - break; - case 20: /* lwarx */ - case 84: /* ldarx */ - case 116: /* lharx */ - case 276: /* lqarx */ - return 0; /* not emulated ever */ - } - } - - /* Byteswap little endian loads and stores */ - swiz = 0; - if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) { - flags ^= SW; -#ifdef __BIG_ENDIAN__ - /* - * So-called "PowerPC little endian" mode works by - * swizzling addresses rather than by actually doing - * any byte-swapping. To emulate this, we XOR each - * byte address with 7. We also byte-swap, because - * the processor's address swizzling depends on the - * operand size (it xors the address with 7 for bytes, - * 6 for halfwords, 4 for words, 0 for doublewords) but - * we will xor with 7 and load/store each byte separately. - */ - if (cpu_has_feature(CPU_FTR_PPC_LE)) - swiz = 7; -#endif - } - - /* DAR has the operand effective address */ - addr = (unsigned char __user *)regs->dar; - -#ifdef CONFIG_VSX - if ((instruction & 0xfc00003e) == 0x7c000018) { - unsigned int elsize; - - /* Additional register addressing bit (64 VSX vs 32 FPR/GPR) */ - reg |= (instruction & 0x1) << 5; - /* Simple inline decoder instead of a table */ - /* VSX has only 8 and 16 byte memory accesses */ - nb = 8; - if (instruction & 0x200) - nb = 16; - - /* Vector stores in little-endian mode swap individual - elements, so process them separately */ - elsize = 4; - if (instruction & 0x80) - elsize = 8; - - flags = 0; - if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) - flags |= SW; - if (instruction & 0x100) - flags |= ST; - if (instruction & 0x040) - flags |= U; - /* splat load needs a special decoder */ - if ((instruction & 0x400) == 0){ - flags |= SPLT; - nb = 8; - } - PPC_WARN_ALIGNMENT(vsx, regs); - return emulate_vsx(addr, reg, areg, regs, flags, nb, elsize); - } -#endif /* * ISA 3.0 (such as P9) copy, copy_first, paste and paste_last alignment @@ -896,173 +332,27 @@ int fix_alignment(struct pt_regs *regs) * when pasting to a co-processor. Furthermore, paste_last is the * synchronisation point for preceding copy/paste sequences. */ - if ((instruction & 0xfc0006fe) == PPC_INST_COPY) + if ((instr & 0xfc0006fe) == PPC_INST_COPY) return -EIO; - /* A size of 0 indicates an instruction we don't support, with - * the exception of DCBZ which is handled as a special case here - */ - if (instr == DCBZ) { - PPC_WARN_ALIGNMENT(dcbz, regs); - return emulate_dcbz(regs, addr); - } - if (unlikely(nb == 0)) - return 0; - - /* Load/Store Multiple instructions are handled in their own - * function - */ - if (flags & M) { - PPC_WARN_ALIGNMENT(multiple, regs); - return emulate_multiple(regs, addr, reg, nb, - flags, instr, swiz); - } - - /* Verify the address of the operand */ - if (unlikely(user_mode(regs) && - !access_ok((flags & ST ? VERIFY_WRITE : VERIFY_READ), - addr, nb))) - return -EFAULT; - - /* Force the fprs into the save area so we can reference them */ - if (flags & F) { - /* userland only */ - if (unlikely(!user_mode(regs))) - return 0; - flush_fp_to_thread(current); - } + r = analyse_instr(&op, regs, instr); + if (r < 0) + return -EINVAL; - if (nb == 16) { - if (flags & F) { - /* Special case for 16-byte FP loads and stores */ - PPC_WARN_ALIGNMENT(fp_pair, regs); - return emulate_fp_pair(addr, reg, flags); - } else { -#ifdef CONFIG_PPC64 - /* Special case for 16-byte loads and stores */ - PPC_WARN_ALIGNMENT(lq_stq, regs); - return emulate_lq_stq(regs, addr, reg, flags); -#else - return 0; -#endif - } - } - - PPC_WARN_ALIGNMENT(unaligned, regs); - - /* If we are loading, get the data from user space, else - * get it from register values - */ - if (!(flags & ST)) { - unsigned int start = 0; - - switch (nb) { - case 4: - start = offsetof(union data, x32.low32); - break; - case 2: - start = offsetof(union data, x16.low16); - break; - } - - data.ll = 0; - ret = 0; - p = (unsigned long)addr; - - for (i = 0; i < nb; i++) - ret |= __get_user_inatomic(data.v[start + i], - SWIZ_PTR(p++)); - - if (unlikely(ret)) - return -EFAULT; - - } else if (flags & F) { - data.ll = current->thread.TS_FPR(reg); - if (flags & S) { - /* Single-precision FP store requires conversion... */ -#ifdef CONFIG_PPC_FPU - preempt_disable(); - enable_kernel_fp(); - cvt_df(&data.dd, (float *)&data.x32.low32); - disable_kernel_fp(); - preempt_enable(); -#else - return 0; -#endif - } - } else - data.ll = regs->gpr[reg]; - - if (flags & SW) { - switch (nb) { - case 8: - data.ll = swab64(data.ll); - break; - case 4: - data.x32.low32 = swab32(data.x32.low32); - break; - case 2: - data.x16.low16 = swab16(data.x16.low16); - break; - } - } - - /* Perform other misc operations like sign extension - * or floating point single precision conversion - */ - switch (flags & ~(U|SW)) { - case LD+SE: /* sign extending integer loads */ - case LD+F+SE: /* sign extend for lfiwax */ - if ( nb == 2 ) - data.ll = data.x16.low16; - else /* nb must be 4 */ - data.ll = data.x32.low32; - break; - - /* Single-precision FP load requires conversion... */ - case LD+F+S: -#ifdef CONFIG_PPC_FPU - preempt_disable(); - enable_kernel_fp(); - cvt_fd((float *)&data.x32.low32, &data.dd); - disable_kernel_fp(); - preempt_enable(); -#else - return 0; -#endif - break; + type = op.type & INSTR_TYPE_MASK; + if (!OP_IS_LOAD_STORE(type)) { + if (type != CACHEOP + DCBZ) + return -EINVAL; + PPC_WARN_ALIGNMENT(dcbz, regs); + r = emulate_dcbz(op.ea, regs); + } else { + if (type == LARX || type == STCX) + return -EIO; + PPC_WARN_ALIGNMENT(unaligned, regs); + r = emulate_loadstore(regs, &op); } - /* Store result to memory or update registers */ - if (flags & ST) { - unsigned int start = 0; - - switch (nb) { - case 4: - start = offsetof(union data, x32.low32); - break; - case 2: - start = offsetof(union data, x16.low16); - break; - } - - ret = 0; - p = (unsigned long)addr; - - for (i = 0; i < nb; i++) - ret |= __put_user_inatomic(data.v[start + i], - SWIZ_PTR(p++)); - - if (unlikely(ret)) - return -EFAULT; - } else if (flags & F) - current->thread.TS_FPR(reg) = data.ll; - else - regs->gpr[reg] = data.ll; - - /* Update RA as needed */ - if (flags & U) - regs->gpr[areg] = regs->dar; - - return 1; + if (!r) + return 1; + return r; } diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 6e95c2c19a7e..8cfb20e38cfe 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -746,6 +746,14 @@ int main(void) OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask); OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas); OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr); +#define STOP_SPR(x, f) OFFSET(x, paca_struct, stop_sprs.f) + STOP_SPR(STOP_PID, pid); + STOP_SPR(STOP_LDBAR, ldbar); + STOP_SPR(STOP_FSCR, fscr); + STOP_SPR(STOP_HFSCR, hfscr); + STOP_SPR(STOP_MMCR1, mmcr1); + STOP_SPR(STOP_MMCR2, mmcr2); + STOP_SPR(STOP_MMCRA, mmcra); #endif DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index 8275858a434d..3f46ca1c59f9 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -253,7 +253,7 @@ int __init btext_find_display(int allow_nonstdout) for_each_node_by_type(np, "display") { if (of_get_property(np, "linux,opened", NULL)) { - printk("trying %s ...\n", np->full_name); + printk("trying %pOF ...\n", np); rc = btext_initialize(np); printk("result: %d\n", rc); } diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index c641983bbdd6..a8f20e5928e1 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -167,10 +167,10 @@ static void release_cache_debugcheck(struct cache *cache) list_for_each_entry(iter, &cache_list, list) WARN_ONCE(iter->next_local == cache, - "cache for %s(%s) refers to cache for %s(%s)\n", - iter->ofnode->full_name, + "cache for %pOF(%s) refers to cache for %pOF(%s)\n", + iter->ofnode, cache_type_string(iter), - cache->ofnode->full_name, + cache->ofnode, cache_type_string(cache)); } @@ -179,8 +179,8 @@ static void release_cache(struct cache *cache) if (!cache) return; - pr_debug("freeing L%d %s cache for %s\n", cache->level, - cache_type_string(cache), cache->ofnode->full_name); + pr_debug("freeing L%d %s cache for %pOF\n", cache->level, + cache_type_string(cache), cache->ofnode); release_cache_debugcheck(cache); list_del(&cache->list); @@ -194,8 +194,8 @@ static void cache_cpu_set(struct cache *cache, int cpu) while (next) { WARN_ONCE(cpumask_test_cpu(cpu, &next->shared_cpu_map), - "CPU %i already accounted in %s(%s)\n", - cpu, next->ofnode->full_name, + "CPU %i already accounted in %pOF(%s)\n", + cpu, next->ofnode, cache_type_string(next)); cpumask_set_cpu(cpu, &next->shared_cpu_map); next = next->next_local; @@ -355,7 +355,7 @@ static int cache_is_unified_d(const struct device_node *np) */ static struct cache *cache_do_one_devnode_unified(struct device_node *node, int level) { - pr_debug("creating L%d ucache for %s\n", level, node->full_name); + pr_debug("creating L%d ucache for %pOF\n", level, node); return new_cache(cache_is_unified_d(node), level, node); } @@ -365,8 +365,8 @@ static struct cache *cache_do_one_devnode_split(struct device_node *node, { struct cache *dcache, *icache; - pr_debug("creating L%d dcache and icache for %s\n", level, - node->full_name); + pr_debug("creating L%d dcache and icache for %pOF\n", level, + node); dcache = new_cache(CACHE_TYPE_DATA, level, node); icache = new_cache(CACHE_TYPE_INSTRUCTION, level, node); @@ -679,7 +679,6 @@ static struct kobj_type cache_index_type = { static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) { - const char *cache_name; const char *cache_type; struct cache *cache; char *buf; @@ -690,7 +689,6 @@ static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) return; cache = dir->cache; - cache_name = cache->ofnode->full_name; cache_type = cache_type_string(cache); /* We don't want to create an attribute that can't provide a @@ -707,14 +705,14 @@ static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) rc = attr->show(&dir->kobj, attr, buf); if (rc <= 0) { pr_debug("not creating %s attribute for " - "%s(%s) (rc = %zd)\n", - attr->attr.name, cache_name, + "%pOF(%s) (rc = %zd)\n", + attr->attr.name, cache->ofnode, cache_type, rc); continue; } if (sysfs_create_file(&dir->kobj, &attr->attr)) - pr_debug("could not create %s attribute for %s(%s)\n", - attr->attr.name, cache_name, cache_type); + pr_debug("could not create %s attribute for %pOF(%s)\n", + attr->attr.name, cache->ofnode, cache_type); } kfree(buf); @@ -831,8 +829,8 @@ static void cache_cpu_clear(struct cache *cache, int cpu) struct cache *next = cache->next_local; WARN_ONCE(!cpumask_test_cpu(cpu, &cache->shared_cpu_map), - "CPU %i not accounted in %s(%s)\n", - cpu, cache->ofnode->full_name, + "CPU %i not accounted in %pOF(%s)\n", + cpu, cache->ofnode, cache_type_string(cache)); cpumask_clear_cpu(cpu, &cache->shared_cpu_map); diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 6f849832a669..760872916013 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -1259,10 +1259,10 @@ static struct cpu_spec __initdata cpu_specs[] = { .platform = "ppc603", }, #endif /* CONFIG_PPC_BOOK3S_32 */ -#ifdef CONFIG_8xx +#ifdef CONFIG_PPC_8xx { /* 8xx */ .pvr_mask = 0xffff0000, - .pvr_value = 0x00500000, + .pvr_value = PVR_8xx, .cpu_name = "8xx", /* CPU_FTR_MAYBE_CAN_DOZE is possible, * if the 8xx code is there.... */ @@ -1274,7 +1274,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_8xx, .platform = "ppc823", }, -#endif /* CONFIG_8xx */ +#endif /* CONFIG_PPC_8xx */ #ifdef CONFIG_40x { /* 403GC */ .pvr_mask = 0xffffff00, @@ -1936,6 +1936,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_440A, .platform = "ppc440", }, +#ifdef CONFIG_PPC_47x { /* 476 DD2 core */ .pvr_mask = 0xffffffff, .pvr_value = 0x11a52080, @@ -1992,6 +1993,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_47x, .platform = "ppc470", }, +#endif /* CONFIG_PPC_47x */ { /* default match */ .pvr_mask = 0x00000000, .pvr_value = 0x00000000, diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 63992b2d8e15..9e816787c0d4 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -44,6 +44,7 @@ #include <asm/machdep.h> #include <asm/ppc-pci.h> #include <asm/rtas.h> +#include <asm/pte-walk.h> /** Overview: @@ -169,10 +170,10 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) char buffer[128]; n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n", - edev->phb->global_number, pdn->busno, + pdn->phb->global_number, pdn->busno, PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n", - edev->phb->global_number, pdn->busno, + pdn->phb->global_number, pdn->busno, PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); eeh_ops->read_config(pdn, PCI_VENDOR_ID, 4, &cfg); @@ -352,8 +353,7 @@ static inline unsigned long eeh_token_to_phys(unsigned long token) * worried about _PAGE_SPLITTING/collapse. Also we will not hit * page table free, because of init_mm. */ - ptep = __find_linux_pte_or_hugepte(init_mm.pgd, token, - NULL, &hugepage_shift); + ptep = find_init_mm_pte(token, &hugepage_shift); if (!ptep) return token; WARN_ON(hugepage_shift); @@ -435,7 +435,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) int ret; int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); unsigned long flags; - struct pci_dn *pdn; + struct device_node *dn; struct pci_dev *dev; struct eeh_pe *pe, *parent_pe, *phb_pe; int rc = 0; @@ -493,9 +493,10 @@ int eeh_dev_check_failure(struct eeh_dev *edev) if (pe->state & EEH_PE_ISOLATED) { pe->check_count++; if (pe->check_count % EEH_MAX_FAILS == 0) { - pdn = eeh_dev_to_pdn(edev); - if (pdn->node) - location = of_get_property(pdn->node, "ibm,loc-code", NULL); + dn = pci_device_to_OF_node(dev); + if (dn) + location = of_get_property(dn, "ibm,loc-code", + NULL); printk(KERN_ERR "EEH: %d reads ignored for recovering device at " "location=%s driver=%s pci addr=%s\n", pe->check_count, @@ -1064,7 +1065,7 @@ core_initcall_sync(eeh_init); */ void eeh_add_device_early(struct pci_dn *pdn) { - struct pci_controller *phb; + struct pci_controller *phb = pdn ? pdn->phb : NULL; struct eeh_dev *edev = pdn_to_eeh_dev(pdn); if (!edev) @@ -1074,7 +1075,6 @@ void eeh_add_device_early(struct pci_dn *pdn) return; /* USB Bus children of PCI devices will not have BUID's */ - phb = edev->phb; if (NULL == phb || (eeh_has_flag(EEH_PROBE_MODE_DEVTREE) && 0 == phb->buid)) return; diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c index d6b2ca70d14d..ad04ecd63c20 100644 --- a/arch/powerpc/kernel/eeh_dev.c +++ b/arch/powerpc/kernel/eeh_dev.c @@ -50,21 +50,16 @@ */ struct eeh_dev *eeh_dev_init(struct pci_dn *pdn) { - struct pci_controller *phb = pdn->phb; struct eeh_dev *edev; /* Allocate EEH device */ edev = kzalloc(sizeof(*edev), GFP_KERNEL); - if (!edev) { - pr_warn("%s: out of memory\n", - __func__); + if (!edev) return NULL; - } /* Associate EEH device with OF node */ pdn->edev = edev; edev->pdn = pdn; - edev->phb = phb; INIT_LIST_HEAD(&edev->list); INIT_LIST_HEAD(&edev->rmv_list); diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index c405c79e50cd..8b840191df59 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -428,7 +428,7 @@ static void *eeh_add_virt_device(void *data, void *userdata) if (!(edev->physfn)) { pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n", - __func__, edev->phb->global_number, pdn->busno, + __func__, pdn->phb->global_number, pdn->busno, PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); return NULL; } diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index cc4b206f77e4..2e8d1b2b5af4 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -230,10 +230,15 @@ void *eeh_pe_dev_traverse(struct eeh_pe *root, * Bus/Device/Function number. The extra data referred by flag * indicates which type of address should be used. */ +struct eeh_pe_get_flag { + int pe_no; + int config_addr; +}; + static void *__eeh_pe_get(void *data, void *flag) { struct eeh_pe *pe = (struct eeh_pe *)data; - struct eeh_dev *edev = (struct eeh_dev *)flag; + struct eeh_pe_get_flag *tmp = (struct eeh_pe_get_flag *) flag; /* Unexpected PHB PE */ if (pe->type & EEH_PE_PHB) @@ -244,17 +249,17 @@ static void *__eeh_pe_get(void *data, void *flag) * have non-zero PE address */ if (eeh_has_flag(EEH_VALID_PE_ZERO)) { - if (edev->pe_config_addr == pe->addr) + if (tmp->pe_no == pe->addr) return pe; } else { - if (edev->pe_config_addr && - (edev->pe_config_addr == pe->addr)) + if (tmp->pe_no && + (tmp->pe_no == pe->addr)) return pe; } /* Try BDF address */ - if (edev->config_addr && - (edev->config_addr == pe->config_addr)) + if (tmp->config_addr && + (tmp->config_addr == pe->config_addr)) return pe; return NULL; @@ -262,7 +267,9 @@ static void *__eeh_pe_get(void *data, void *flag) /** * eeh_pe_get - Search PE based on the given address - * @edev: EEH device + * @phb: PCI controller + * @pe_no: PE number + * @config_addr: Config address * * Search the corresponding PE based on the specified address which * is included in the eeh device. The function is used to check if @@ -271,12 +278,14 @@ static void *__eeh_pe_get(void *data, void *flag) * which is composed of PCI bus/device/function number, or unified * PE address. */ -struct eeh_pe *eeh_pe_get(struct eeh_dev *edev) +struct eeh_pe *eeh_pe_get(struct pci_controller *phb, + int pe_no, int config_addr) { - struct eeh_pe *root = eeh_phb_pe_get(edev->phb); + struct eeh_pe *root = eeh_phb_pe_get(phb); + struct eeh_pe_get_flag tmp = { pe_no, config_addr }; struct eeh_pe *pe; - pe = eeh_pe_traverse(root, __eeh_pe_get, edev); + pe = eeh_pe_traverse(root, __eeh_pe_get, &tmp); return pe; } @@ -330,11 +339,13 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev) int eeh_add_to_parent_pe(struct eeh_dev *edev) { struct eeh_pe *pe, *parent; + struct pci_dn *pdn = eeh_dev_to_pdn(edev); + int config_addr = (pdn->busno << 8) | (pdn->devfn); /* Check if the PE number is valid */ if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) { pr_err("%s: Invalid PE#0 for edev 0x%x on PHB#%x\n", - __func__, edev->config_addr, edev->phb->global_number); + __func__, config_addr, pdn->phb->global_number); return -EINVAL; } @@ -344,7 +355,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) * PE should be composed of PCI bus and its subordinate * components. */ - pe = eeh_pe_get(edev); + pe = eeh_pe_get(pdn->phb, edev->pe_config_addr, config_addr); if (pe && !(pe->type & EEH_PE_INVALID)) { /* Mark the PE as type of PCI bus */ pe->type = EEH_PE_BUS; @@ -353,11 +364,11 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) /* Put the edev to PE */ list_add_tail(&edev->list, &pe->edevs); pr_debug("EEH: Add %04x:%02x:%02x.%01x to Bus PE#%x\n", - edev->phb->global_number, - edev->config_addr >> 8, - PCI_SLOT(edev->config_addr & 0xFF), - PCI_FUNC(edev->config_addr & 0xFF), - pe->addr); + pdn->phb->global_number, + pdn->busno, + PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn), + pe->addr); return 0; } else if (pe && (pe->type & EEH_PE_INVALID)) { list_add_tail(&edev->list, &pe->edevs); @@ -376,25 +387,25 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) pr_debug("EEH: Add %04x:%02x:%02x.%01x to Device " "PE#%x, Parent PE#%x\n", - edev->phb->global_number, - edev->config_addr >> 8, - PCI_SLOT(edev->config_addr & 0xFF), - PCI_FUNC(edev->config_addr & 0xFF), - pe->addr, pe->parent->addr); + pdn->phb->global_number, + pdn->busno, + PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn), + pe->addr, pe->parent->addr); return 0; } /* Create a new EEH PE */ if (edev->physfn) - pe = eeh_pe_alloc(edev->phb, EEH_PE_VF); + pe = eeh_pe_alloc(pdn->phb, EEH_PE_VF); else - pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE); + pe = eeh_pe_alloc(pdn->phb, EEH_PE_DEVICE); if (!pe) { pr_err("%s: out of memory!\n", __func__); return -ENOMEM; } pe->addr = edev->pe_config_addr; - pe->config_addr = edev->config_addr; + pe->config_addr = config_addr; /* * Put the new EEH PE into hierarchy tree. If the parent @@ -404,10 +415,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) */ parent = eeh_pe_get_parent(edev); if (!parent) { - parent = eeh_phb_pe_get(edev->phb); + parent = eeh_phb_pe_get(pdn->phb); if (!parent) { pr_err("%s: No PHB PE is found (PHB Domain=%d)\n", - __func__, edev->phb->global_number); + __func__, pdn->phb->global_number); edev->pe = NULL; kfree(pe); return -EEXIST; @@ -424,10 +435,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) edev->pe = pe; pr_debug("EEH: Add %04x:%02x:%02x.%01x to " "Device PE#%x, Parent PE#%x\n", - edev->phb->global_number, - edev->config_addr >> 8, - PCI_SLOT(edev->config_addr & 0xFF), - PCI_FUNC(edev->config_addr & 0xFF), + pdn->phb->global_number, + pdn->busno, + PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn), pe->addr, pe->parent->addr); return 0; @@ -446,13 +457,14 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) { struct eeh_pe *pe, *parent, *child; int cnt; + struct pci_dn *pdn = eeh_dev_to_pdn(edev); if (!edev->pe) { pr_debug("%s: No PE found for device %04x:%02x:%02x.%01x\n", - __func__, edev->phb->global_number, - edev->config_addr >> 8, - PCI_SLOT(edev->config_addr & 0xFF), - PCI_FUNC(edev->config_addr & 0xFF)); + __func__, pdn->phb->global_number, + pdn->busno, + PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn)); return -EEXIST; } @@ -712,10 +724,10 @@ static void eeh_bridge_check_link(struct eeh_dev *edev) return; pr_debug("%s: Check PCIe link for %04x:%02x:%02x.%01x ...\n", - __func__, edev->phb->global_number, - edev->config_addr >> 8, - PCI_SLOT(edev->config_addr & 0xFF), - PCI_FUNC(edev->config_addr & 0xFF)); + __func__, pdn->phb->global_number, + pdn->busno, + PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn)); /* Check slot status */ cap = edev->pcie_cap; diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c index 1ceecdda810b..797549289798 100644 --- a/arch/powerpc/kernel/eeh_sysfs.c +++ b/arch/powerpc/kernel/eeh_sysfs.c @@ -51,7 +51,6 @@ static ssize_t eeh_show_##_name(struct device *dev, \ static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL); EEH_SHOW_ATTR(eeh_mode, mode, "0x%x"); -EEH_SHOW_ATTR(eeh_config_addr, config_addr, "0x%x"); EEH_SHOW_ATTR(eeh_pe_config_addr, pe_config_addr, "0x%x"); static ssize_t eeh_pe_state_show(struct device *dev, @@ -103,7 +102,6 @@ void eeh_sysfs_add_device(struct pci_dev *pdev) return; rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode); - rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr); rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_state); @@ -128,7 +126,6 @@ void eeh_sysfs_remove_device(struct pci_dev *pdev) } device_remove_file(&pdev->dev, &dev_attr_eeh_mode); - device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr); device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); device_remove_file(&pdev->dev, &dev_attr_eeh_pe_state); diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 8587059ad848..e780e1fbf6c2 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -43,6 +43,13 @@ #define LOAD_MSR_KERNEL(r, x) li r,(x) #endif +/* + * Align to 4k in order to ensure that all functions modyfing srr0/srr1 + * fit into one page in order to not encounter a TLB miss between the + * modification of srr0/srr1 and the associated rfi. + */ + .align 12 + #ifdef CONFIG_BOOKE .globl mcheck_transfer_to_handler mcheck_transfer_to_handler: @@ -586,6 +593,10 @@ ppc_swapcontext: handle_page_fault: stw r4,_DAR(r1) addi r3,r1,STACK_FRAME_OVERHEAD +#ifdef CONFIG_6xx + andis. r0,r5,DSISR_DABRMATCH@h + bne- handle_dabr_fault +#endif bl do_page_fault cmpwi r3,0 beq+ ret_from_except @@ -599,6 +610,17 @@ handle_page_fault: bl bad_page_fault b ret_from_except_full +#ifdef CONFIG_6xx + /* We have a data breakpoint exception - handle it */ +handle_dabr_fault: + SAVE_NVGPRS(r1) + lwz r0,_TRAP(r1) + clrrwi r0,r0,1 + stw r0,_TRAP(r1) + bl do_break + b ret_from_except_full +#endif + /* * This routine switches between two different tasks. The process * state of one is saved on its kernel stack. Then the state diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index e925c1c99c71..4a0fd4f40245 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -966,16 +966,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) #ifdef CONFIG_PPC_BOOK3E cmpwi cr0,r3,0x280 #else - BEGIN_FTR_SECTION - cmpwi cr0,r3,0xe80 - FTR_SECTION_ELSE - cmpwi cr0,r3,0xa00 - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) + cmpwi cr0,r3,0xa00 #endif /* CONFIG_PPC_BOOK3E */ bne 1f addi r3,r1,STACK_FRAME_OVERHEAD; bl doorbell_exception - b ret_from_except #endif /* CONFIG_PPC_DOORBELL */ 1: b ret_from_except /* What else to do here ? */ @@ -1109,7 +1104,7 @@ _ASM_NOKPROBE_SYMBOL(__enter_rtas) _ASM_NOKPROBE_SYMBOL(rtas_return_loc) .align 3 -1: .llong rtas_restore_regs +1: .8byte rtas_restore_regs rtas_restore_regs: /* relocation is on at this point */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index f14f3c04ec7e..48da0f5d2f7f 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -541,7 +541,7 @@ EXC_COMMON_BEGIN(instruction_access_common) RECONCILE_IRQ_STATE(r10, r11) ld r12,_MSR(r1) ld r3,_NIP(r1) - andis. r4,r12,0x5820 + andis. r4,r12,DSISR_BAD_FAULT_64S@h li r5,0x400 std r3,_DAR(r1) std r4,_DSISR(r1) @@ -1314,7 +1314,7 @@ EXC_REAL_NONE(0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) #endif -#if defined(CONFIG_HARDLOCKUP_DETECTOR) && defined(CONFIG_HAVE_HARDLOCKUP_DETECTOR_ARCH) +#ifdef CONFIG_PPC_WATCHDOG #define MASKED_DEC_HANDLER_LABEL 3f @@ -1343,10 +1343,10 @@ EXC_COMMON_BEGIN(soft_nmi_common) ADD_NVGPRS;ADD_RECONCILE) b ret_from_except -#else +#else /* CONFIG_PPC_WATCHDOG */ #define MASKED_DEC_HANDLER_LABEL 2f /* normal return */ #define MASKED_DEC_HANDLER(_H) -#endif +#endif /* CONFIG_PPC_WATCHDOG */ /* * An interrupt came in while soft-disabled. We set paca->irq_happened, then: @@ -1370,19 +1370,16 @@ masked_##_H##interrupt: \ ori r10,r10,0xffff; \ mtspr SPRN_DEC,r10; \ b MASKED_DEC_HANDLER_LABEL; \ -1: cmpwi r10,PACA_IRQ_DBELL; \ - beq 2f; \ - cmpwi r10,PACA_IRQ_HMI; \ - beq 2f; \ +1: andi. r10,r10,(PACA_IRQ_DBELL|PACA_IRQ_HMI); \ + bne 2f; \ mfspr r10,SPRN_##_H##SRR1; \ - rldicl r10,r10,48,1; /* clear MSR_EE */ \ - rotldi r10,r10,16; \ + xori r10,r10,MSR_EE; /* clear MSR_EE */ \ mtspr SPRN_##_H##SRR1,r10; \ 2: mtcrf 0x80,r9; \ ld r9,PACA_EXGEN+EX_R9(r13); \ ld r10,PACA_EXGEN+EX_R10(r13); \ ld r11,PACA_EXGEN+EX_R11(r13); \ - GET_SCRATCH0(r13); \ + /* returns to kernel where r13 must be set up, so don't restore it */ \ ##_H##rfid; \ b .; \ MASKED_DEC_HANDLER(_H) @@ -1485,8 +1482,10 @@ USE_TEXT_SECTION() */ .balign IFETCH_ALIGN_BYTES do_hash_page: -#ifdef CONFIG_PPC_STD_MMU_64 - andis. r0,r4,0xa450 /* weird error? */ + #ifdef CONFIG_PPC_STD_MMU_64 + lis r0,DSISR_BAD_FAULT_64S@h + ori r0,r0,DSISR_BAD_FAULT_64S@l + and. r0,r4,r0 /* weird error? */ bne- handle_page_fault /* if not, try to insert a HPTE */ CURRENT_THREAD_INFO(r11, r1) lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ @@ -1669,25 +1668,27 @@ _GLOBAL(__replay_interrupt) * we don't give a damn about, so we don't bother storing them. */ mfmsr r12 - LOAD_REG_ADDR(r11, 1f) + LOAD_REG_ADDR(r11, replay_interrupt_return) mfcr r9 ori r12,r12,MSR_EE cmpwi r3,0x900 beq decrementer_common cmpwi r3,0x500 +BEGIN_FTR_SECTION + beq h_virt_irq_common +FTR_SECTION_ELSE beq hardware_interrupt_common +ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_300) BEGIN_FTR_SECTION - cmpwi r3,0xe80 + cmpwi r3,0xa00 beq h_doorbell_common_msgclr - cmpwi r3,0xea0 - beq h_virt_irq_common cmpwi r3,0xe60 beq hmi_exception_common FTR_SECTION_ELSE cmpwi r3,0xa00 beq doorbell_super_common_msgclr ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) -1: +replay_interrupt_return: blr _ASM_NOKPROBE_SYMBOL(__replay_interrupt) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index dc0c49cfd90a..e1431800bfb9 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -125,6 +125,13 @@ int is_fadump_boot_memory_area(u64 addr, ulong size) return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size; } +int should_fadump_crash(void) +{ + if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr) + return 0; + return 1; +} + int is_fadump_active(void) { return fw_dump.dump_active; @@ -518,7 +525,7 @@ void crash_fadump(struct pt_regs *regs, const char *str) struct fadump_crash_info_header *fdh = NULL; int old_cpu, this_cpu; - if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr) + if (!should_fadump_crash()) return; /* @@ -1446,6 +1453,25 @@ static void fadump_init_files(void) return; } +static int fadump_panic_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + /* + * If firmware-assisted dump has been registered then trigger + * firmware-assisted dump and let firmware handle everything + * else. If this returns, then fadump was not registered, so + * go through the rest of the panic path. + */ + crash_fadump(NULL, ptr); + + return NOTIFY_DONE; +} + +static struct notifier_block fadump_panic_block = { + .notifier_call = fadump_panic_event, + .priority = INT_MIN /* may not return; must be done last */ +}; + /* * Prepare for firmware-assisted dump. */ @@ -1478,6 +1504,9 @@ int __init setup_fadump(void) init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); fadump_init_files(); + atomic_notifier_chain_register(&panic_notifier_list, + &fadump_panic_block); + return 1; } subsys_initcall(setup_fadump); diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index e22734278458..8c54166491e7 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -388,7 +388,7 @@ DataAccess: EXCEPTION_PROLOG mfspr r10,SPRN_DSISR stw r10,_DSISR(r11) - andis. r0,r10,0xa470 /* weird error? */ + andis. r0,r10,DSISR_BAD_FAULT_32S@h bne 1f /* if not, try to put a PTE */ mfspr r4,SPRN_DAR /* into the hash table */ rlwinm r3,r10,32-15,21,21 /* DSISR_STORE -> _PAGE_RW */ @@ -403,13 +403,13 @@ DataAccess: DO_KVM 0x400 InstructionAccess: EXCEPTION_PROLOG - andis. r0,r9,0x4000 /* no pte found? */ + andis. r0,r9,SRR1_ISI_NOPT@h /* no pte found? */ beq 1f /* if so, try to put a PTE */ li r3,0 /* into the hash table */ mr r4,r12 /* SRR0 is fault address */ bl hash_page 1: mr r4,r12 - mr r5,r9 + andis. r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */ EXC_XFER_LITE(0x400, handle_page_fault) /* External interrupt */ diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 0ddc602b33a4..ff8511d6d8ea 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -92,13 +92,13 @@ END_FTR_SECTION(0, 1) .balign 8 .globl __secondary_hold_spinloop __secondary_hold_spinloop: - .llong 0x0 + .8byte 0x0 /* Secondary processors write this value with their cpu # */ /* after they enter the spin loop immediately below. */ .globl __secondary_hold_acknowledge __secondary_hold_acknowledge: - .llong 0x0 + .8byte 0x0 #ifdef CONFIG_RELOCATABLE /* This flag is set to 1 by a loader if the kernel should run @@ -650,7 +650,7 @@ __after_prom_start: bctr .balign 8 -p_end: .llong _end - copy_to_here +p_end: .8byte _end - copy_to_here 4: /* @@ -892,7 +892,7 @@ _GLOBAL(relative_toc) blr .balign 8 -p_toc: .llong __toc_start + 0x8000 - 0b +p_toc: .8byte __toc_start + 0x8000 - 0b /* * This is where the main kernel code starts. diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index c032fe8c2d26..4fee00d414e8 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -50,18 +50,20 @@ mtspr spr, reg #endif -/* Macro to test if an address is a kernel address */ #if CONFIG_TASK_SIZE <= 0x80000000 && CONFIG_PAGE_OFFSET >= 0x80000000 -#define IS_KERNEL(tmp, addr) \ - andis. tmp, addr, 0x8000 /* Address >= 0x80000000 */ -#define BRANCH_UNLESS_KERNEL(label) beq label -#else -#define IS_KERNEL(tmp, addr) \ - rlwinm tmp, addr, 16, 16, 31; \ - cmpli cr0, tmp, PAGE_OFFSET >> 16 -#define BRANCH_UNLESS_KERNEL(label) blt label +/* By simply checking Address >= 0x80000000, we know if its a kernel address */ +#define SIMPLE_KERNEL_ADDRESS 1 #endif +/* + * We need an ITLB miss handler for kernel addresses if: + * - Either we have modules + * - Or we have not pinned the first 8M + */ +#if defined(CONFIG_MODULES) || !defined(CONFIG_PIN_TLB_TEXT) || \ + defined(CONFIG_DEBUG_PAGEALLOC) +#define ITLB_MISS_KERNEL 1 +#endif /* * Value for the bits that have fixed value in RPN entries. @@ -123,7 +125,6 @@ turn_on_mmu: lis r0,start_here@h ori r0,r0,start_here@l mtspr SPRN_SRR0,r0 - SYNC rfi /* enables MMU */ /* @@ -170,7 +171,7 @@ turn_on_mmu: stw r1,0(r11); \ tovirt(r1,r11); /* set new kernel sp */ \ li r10,MSR_KERNEL & ~(MSR_IR|MSR_DR); /* can take exceptions */ \ - MTMSRD(r10); /* (except for mach check in rtas) */ \ + mtmsr r10; \ stw r0,GPR0(r11); \ SAVE_4GPRS(3, r11); \ SAVE_2GPRS(7, r11) @@ -300,7 +301,7 @@ SystemCall: /* On the MPC8xx, this is a software emulation interrupt. It occurs * for all unimplemented and illegal instructions. */ - EXCEPTION(0x1000, SoftEmu, SoftwareEmulation, EXC_XFER_STD) + EXCEPTION(0x1000, SoftEmu, program_check_exception, EXC_XFER_STD) . = 0x1100 /* @@ -325,7 +326,7 @@ SystemCall: #endif InstructionTLBMiss: -#if defined(CONFIG_8xx_CPU6) || defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) || defined (CONFIG_HUGETLB_PAGE) +#if defined(CONFIG_8xx_CPU6) || defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE) mtspr SPRN_SPRG_SCRATCH2, r3 #endif EXCEPTION_PROLOG_0 @@ -343,15 +344,32 @@ InstructionTLBMiss: INVALIDATE_ADJACENT_PAGES_CPU15(r11, r10) /* Only modules will cause ITLB Misses as we always * pin the first 8MB of kernel memory */ -#if defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) || defined (CONFIG_HUGETLB_PAGE) +#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE) mfcr r3 #endif -#if defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) - IS_KERNEL(r11, r10) +#ifdef ITLB_MISS_KERNEL +#if defined(SIMPLE_KERNEL_ADDRESS) && defined(CONFIG_PIN_TLB_TEXT) + andis. r11, r10, 0x8000 /* Address >= 0x80000000 */ +#else + rlwinm r11, r10, 16, 0xfff8 + cmpli cr0, r11, PAGE_OFFSET@h +#ifndef CONFIG_PIN_TLB_TEXT + /* It is assumed that kernel code fits into the first 8M page */ +_ENTRY(ITLBMiss_cmp) + cmpli cr7, r11, (PAGE_OFFSET + 0x0800000)@h +#endif +#endif #endif mfspr r11, SPRN_M_TW /* Get level 1 table */ -#if defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) - BRANCH_UNLESS_KERNEL(3f) +#ifdef ITLB_MISS_KERNEL +#if defined(SIMPLE_KERNEL_ADDRESS) && defined(CONFIG_PIN_TLB_TEXT) + beq+ 3f +#else + blt+ 3f +#endif +#ifndef CONFIG_PIN_TLB_TEXT + blt cr7, ITLBMissLinear +#endif lis r11, (swapper_pg_dir-PAGE_OFFSET)@ha 3: #endif @@ -369,7 +387,7 @@ InstructionTLBMiss: rlwimi r10, r11, 0, 0, 32 - PAGE_SHIFT - 1 /* Add level 2 base */ lwz r10, 0(r10) /* Get the pte */ 4: -#if defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) || defined (CONFIG_HUGETLB_PAGE) +#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE) mtcr r3 #endif /* Insert the APG into the TWC from the Linux PTE. */ @@ -400,7 +418,7 @@ InstructionTLBMiss: MTSPR_CPU6(SPRN_MI_RPN, r10, r3) /* Update TLB entry */ /* Restore registers */ -#if defined(CONFIG_8xx_CPU6) || defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) || defined (CONFIG_HUGETLB_PAGE) +#if defined(CONFIG_8xx_CPU6) || defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE) mfspr r3, SPRN_SPRG_SCRATCH2 #endif EXCEPTION_EPILOG_0 @@ -447,23 +465,23 @@ DataStoreTLBMiss: * kernel page tables. */ mfspr r10, SPRN_MD_EPN - rlwinm r10, r10, 16, 0xfff8 - cmpli cr0, r10, PAGE_OFFSET@h + rlwinm r11, r10, 16, 0xfff8 + cmpli cr0, r11, PAGE_OFFSET@h mfspr r11, SPRN_M_TW /* Get level 1 table */ blt+ 3f + rlwinm r11, r10, 16, 0xfff8 #ifndef CONFIG_PIN_TLB_IMMR - cmpli cr0, r10, VIRT_IMMR_BASE@h + cmpli cr0, r11, VIRT_IMMR_BASE@h #endif _ENTRY(DTLBMiss_cmp) - cmpli cr7, r10, (PAGE_OFFSET + 0x1800000)@h - lis r11, (swapper_pg_dir-PAGE_OFFSET)@ha + cmpli cr7, r11, (PAGE_OFFSET + 0x1800000)@h #ifndef CONFIG_PIN_TLB_IMMR _ENTRY(DTLBMiss_jmp) beq- DTLBMissIMMR #endif blt cr7, DTLBMissLinear + lis r11, (swapper_pg_dir-PAGE_OFFSET)@ha 3: - mfspr r10, SPRN_MD_EPN /* Insert level 1 index */ rlwimi r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, 29 @@ -569,8 +587,8 @@ _ENTRY(DTLBMiss_jmp) InstructionTLBError: EXCEPTION_PROLOG mr r4,r12 - mr r5,r9 - andis. r10,r5,0x4000 + andis. r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */ + andis. r10,r9,SRR1_ISI_NOPT@h beq+ 1f tlbie r4 itlbie: @@ -595,7 +613,7 @@ DARFixed:/* Return from dcbx instruction bug workaround */ mfspr r5,SPRN_DSISR stw r5,_DSISR(r11) mfspr r4,SPRN_DAR - andis. r10,r5,0x4000 + andis. r10,r5,DSISR_NOHPTE@h beq+ 1f tlbie r4 dtlbie: @@ -684,7 +702,7 @@ DTLBMissLinear: /* Set 8M byte page and mark it valid */ li r11, MD_PS8MEG | MD_SVALID MTSPR_CPU6(SPRN_MD_TWC, r11, r3) - rlwinm r10, r10, 16, 0x0f800000 /* 8xx supports max 256Mb RAM */ + rlwinm r10, r10, 0, 0x0f800000 /* 8xx supports max 256Mb RAM */ ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SHARED | _PAGE_DIRTY | \ _PAGE_PRESENT MTSPR_CPU6(SPRN_MD_RPN, r10, r11) /* Update TLB entry */ @@ -695,6 +713,22 @@ DTLBMissLinear: EXCEPTION_EPILOG_0 rfi +#ifndef CONFIG_PIN_TLB_TEXT +ITLBMissLinear: + mtcr r3 + /* Set 8M byte page and mark it valid */ + li r11, MI_PS8MEG | MI_SVALID | _PAGE_EXEC + MTSPR_CPU6(SPRN_MI_TWC, r11, r3) + rlwinm r10, r10, 0, 0x0f800000 /* 8xx supports max 256Mb RAM */ + ori r10, r10, 0xf0 | MI_SPS16K | _PAGE_SHARED | _PAGE_DIRTY | \ + _PAGE_PRESENT + MTSPR_CPU6(SPRN_MI_RPN, r10, r11) /* Update TLB entry */ + + mfspr r3, SPRN_SPRG_SCRATCH2 + EXCEPTION_EPILOG_0 + rfi +#endif + /* This is the procedure to calculate the data EA for buggy dcbx,dcbi instructions * by decoding the registers used by the dcbx instruction and adding them. * DAR is set to the calculated address. @@ -705,9 +739,10 @@ FixupDAR:/* Entry point for dcbx workaround. */ mtspr SPRN_SPRG_SCRATCH2, r10 /* fetch instruction from memory. */ mfspr r10, SPRN_SRR0 - IS_KERNEL(r11, r10) + rlwinm r11, r10, 16, 0xfff8 + cmpli cr0, r11, PAGE_OFFSET@h mfspr r11, SPRN_M_TW /* Get level 1 table */ - BRANCH_UNLESS_KERNEL(3f) + blt+ 3f rlwinm r11, r10, 16, 0xfff8 _ENTRY(FixupDAR_cmp) cmpli cr7, r11, (PAGE_OFFSET + 0x1800000)@h @@ -915,10 +950,8 @@ start_here: rfi /* Load up the kernel context */ 2: - SYNC /* Force all PTE updates to finish */ tlbia /* Clear all TLB entries */ sync /* wait for tlbia/tlbie to finish */ - TLBSYNC /* ... on all CPUs */ /* set up the PTE pointers for the Abatron bdiGDB. */ @@ -955,15 +988,14 @@ initial_mmu: mtspr SPRN_MD_CTR, r10 /* remove PINNED DTLB entries */ tlbia /* Invalidate all TLB entries */ -/* Always pin the first 8 MB ITLB to prevent ITLB - misses while mucking around with SRR0/SRR1 in asm -*/ +#ifdef CONFIG_PIN_TLB_TEXT lis r8, MI_RSV4I@h ori r8, r8, 0x1c00 mtspr SPRN_MI_CTR, r8 /* Set instruction MMU control */ +#endif -#ifdef CONFIG_PIN_TLB +#ifdef CONFIG_PIN_TLB_DATA oris r10, r10, MD_RSV4I@h mtspr SPRN_MD_CTR, r10 /* Set data TLB control */ #endif @@ -989,6 +1021,7 @@ initial_mmu: * internal registers (among other things). */ #ifdef CONFIG_PIN_TLB_IMMR + oris r10, r10, MD_RSV4I@h ori r10, r10, 0x1c00 mtspr SPRN_MD_CTR, r10 diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index e6252c5a57a4..1125c9be9e06 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -85,7 +85,61 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) std r3,_WORT(r1) mfspr r3,SPRN_WORC std r3,_WORC(r1) +/* + * On POWER9, there are idle states such as stop4, invoked via cpuidle, + * that lose hypervisor resources. In such cases, we need to save + * additional SPRs before entering those idle states so that they can + * be restored to their older values on wakeup from the idle state. + * + * On POWER8, the only such deep idle state is winkle which is used + * only in the context of CPU-Hotplug, where these additional SPRs are + * reinitiazed to a sane value. Hence there is no need to save/restore + * these SPRs. + */ +BEGIN_FTR_SECTION + blr +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + +power9_save_additional_sprs: + mfspr r3, SPRN_PID + mfspr r4, SPRN_LDBAR + std r3, STOP_PID(r13) + std r4, STOP_LDBAR(r13) + mfspr r3, SPRN_FSCR + mfspr r4, SPRN_HFSCR + std r3, STOP_FSCR(r13) + std r4, STOP_HFSCR(r13) + + mfspr r3, SPRN_MMCRA + mfspr r4, SPRN_MMCR1 + std r3, STOP_MMCRA(r13) + std r4, STOP_MMCR1(r13) + + mfspr r3, SPRN_MMCR2 + std r3, STOP_MMCR2(r13) + blr + +power9_restore_additional_sprs: + ld r3,_LPCR(r1) + ld r4, STOP_PID(r13) + mtspr SPRN_LPCR,r3 + mtspr SPRN_PID, r4 + + ld r3, STOP_LDBAR(r13) + ld r4, STOP_FSCR(r13) + mtspr SPRN_LDBAR, r3 + mtspr SPRN_FSCR, r4 + + ld r3, STOP_HFSCR(r13) + ld r4, STOP_MMCRA(r13) + mtspr SPRN_HFSCR, r3 + mtspr SPRN_MMCRA, r4 + /* We have already restored PACA_MMCR0 */ + ld r3, STOP_MMCR1(r13) + ld r4, STOP_MMCR2(r13) + mtspr SPRN_MMCR1, r3 + mtspr SPRN_MMCR2, r4 blr /* @@ -141,7 +195,16 @@ pnv_powersave_common: std r5,_CCR(r1) std r1,PACAR1(r13) +BEGIN_FTR_SECTION + /* + * POWER9 does not require real mode to stop, and presently does not + * set hwthread_state for KVM (threads don't share MMU context), so + * we can remain in virtual mode for this. + */ + bctr +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) /* + * POWER8 * Go to real mode to do the nap, as required by the architecture. * Also, we need to be in real mode before setting hwthread_state, * because as soon as we do that, another thread can switch @@ -151,6 +214,20 @@ pnv_powersave_common: mtmsrd r7,0 bctr +/* + * This is the sequence required to execute idle instructions, as + * specified in ISA v2.07 (and earlier). MSR[IR] and MSR[DR] must be 0. + */ +#define IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST) \ + /* Magic NAP/SLEEP/WINKLE mode enter sequence */ \ + std r0,0(r1); \ + ptesync; \ + ld r0,0(r1); \ +236: cmpd cr0,r0,r0; \ + bne 236b; \ + IDLE_INST; + + .globl pnv_enter_arch207_idle_mode pnv_enter_arch207_idle_mode: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -242,20 +319,27 @@ enter_winkle: /* * r3 - PSSCR value corresponding to the requested stop state. */ -power_enter_stop: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - /* Tell KVM we're entering idle */ +power_enter_stop_kvm_rm: + /* + * This is currently unused because POWER9 KVM does not have to + * gather secondary threads into sibling mode, but the code is + * here in case that function is required. + * + * Tell KVM we're entering idle. + */ li r4,KVM_HWTHREAD_IN_IDLE /* DO THIS IN REAL MODE! See comment above. */ stb r4,HSTATE_HWTHREAD_STATE(r13) #endif +power_enter_stop: /* * Check if we are executing the lite variant with ESL=EC=0 */ andis. r4,r3,PSSCR_EC_ESL_MASK_SHIFTED clrldi r3,r3,60 /* r3 = Bits[60:63] = Requested Level (RL) */ bne .Lhandle_esl_ec_set - IDLE_STATE_ENTER_SEQ(PPC_STOP) + PPC_STOP li r3,0 /* Since we didn't lose state, return 0 */ /* @@ -288,7 +372,8 @@ power_enter_stop: ld r4,ADDROFF(pnv_first_deep_stop_state)(r5) cmpd r3,r4 bge .Lhandle_deep_stop - IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP) + PPC_STOP /* Does not return (system reset interrupt) */ + .Lhandle_deep_stop: /* * Entering deep idle state. @@ -310,7 +395,7 @@ lwarx_loop_stop: bl save_sprs_to_stack - IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP) + PPC_STOP /* Does not return (system reset interrupt) */ /* * Entered with MSR[EE]=0 and no soft-masked interrupts pending. @@ -411,6 +496,18 @@ pnv_powersave_wakeup_mce: b pnv_powersave_wakeup +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +kvm_start_guest_check: + li r0,KVM_HWTHREAD_IN_KERNEL + stb r0,HSTATE_HWTHREAD_STATE(r13) + /* Order setting hwthread_state vs. testing hwthread_req */ + sync + lbz r0,HSTATE_HWTHREAD_REQ(r13) + cmpwi r0,0 + beqlr + b kvm_start_guest +#endif + /* * Called from reset vector for powersave wakeups. * cr3 - set to gt if waking up with partial/complete hypervisor state loss @@ -435,15 +532,9 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) mr r3,r12 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - li r0,KVM_HWTHREAD_IN_KERNEL - stb r0,HSTATE_HWTHREAD_STATE(r13) - /* Order setting hwthread_state vs. testing hwthread_req */ - sync - lbz r0,HSTATE_HWTHREAD_REQ(r13) - cmpwi r0,0 - beq 1f - b kvm_start_guest -1: +BEGIN_FTR_SECTION + bl kvm_start_guest_check +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #endif /* Return SRR1 from power7_nap() */ @@ -809,9 +900,16 @@ no_segments: mtctr r12 bctrl +/* + * On POWER9, we can come here on wakeup from a cpuidle stop state. + * Hence restore the additional SPRs to the saved value. + * + * On POWER8, we come here only on winkle. Since winkle is used + * only in the case of CPU-Hotplug, we don't need to restore + * the additional SPRs. + */ BEGIN_FTR_SECTION - ld r4,_LPCR(r1) - mtspr SPRN_LPCR,r4 + bl power9_restore_additional_sprs END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) hypervisor_state_restored: diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index a582e0d42525..aa9f1b8261db 100644 --- a/arch/powerpc/kernel/io-workarounds.c +++ b/arch/powerpc/kernel/io-workarounds.c @@ -19,6 +19,8 @@ #include <asm/pgtable.h> #include <asm/ppc-pci.h> #include <asm/io-workarounds.h> +#include <asm/pte-walk.h> + #define IOWA_MAX_BUS 8 @@ -75,8 +77,7 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) * We won't find huge pages here (iomem). Also can't hit * a page table free due to init_mm */ - ptep = __find_linux_pte_or_hugepte(init_mm.pgd, vaddr, - NULL, &hugepage_shift); + ptep = find_init_mm_pte(vaddr, &hugepage_shift); if (ptep == NULL) paddr = 0; else { @@ -192,7 +193,7 @@ void iowa_register_bus(struct pci_controller *phb, struct ppc_pci_io *ops, if (iowa_bus_count >= IOWA_MAX_BUS) { pr_err("IOWA:Too many pci bridges, " - "workarounds disabled for %s\n", np->full_name); + "workarounds disabled for %pOF\n", np); return; } @@ -207,6 +208,6 @@ void iowa_register_bus(struct pci_controller *phb, struct ppc_pci_io *ops, iowa_bus_count++; - pr_debug("IOWA:[%d]Add bus, %s.\n", iowa_bus_count-1, np->full_name); + pr_debug("IOWA:[%d]Add bus, %pOF.\n", iowa_bus_count-1, np); } diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 233ca3fe4754..af7a20dc6e09 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -127,8 +127,7 @@ static ssize_t fail_iommu_store(struct device *dev, return count; } -static DEVICE_ATTR(fail_iommu, S_IRUGO|S_IWUSR, fail_iommu_show, - fail_iommu_store); +static DEVICE_ATTR_RW(fail_iommu); static int fail_iommu_bus_notify(struct notifier_block *nb, unsigned long action, void *data) @@ -190,7 +189,7 @@ static unsigned long iommu_range_alloc(struct device *dev, unsigned int pool_nr; struct iommu_pool *pool; - align_mask = 0xffffffffffffffffl >> (64 - align_order); + align_mask = (1ull << align_order) - 1; /* This allocator was derived from x86_64's bit string search */ @@ -208,7 +207,7 @@ static unsigned long iommu_range_alloc(struct device *dev, * We don't need to disable preemption here because any CPU can * safely use any IOMMU pool. */ - pool_nr = __this_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1); + pool_nr = raw_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1); if (largealloc) pool = &(tbl->large_pool); diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index f291f7826abc..4e65bf82f5e0 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -24,7 +24,7 @@ * mask register (of which only 16 are defined), hence the weird shifting * and complement of the cached_irq_mask. I want to be able to stuff * this right into the SIU SMASK register. - * Many of the prep/chrp functions are conditional compiled on CONFIG_8xx + * Many of the prep/chrp functions are conditional compiled on CONFIG_PPC_8xx * to reduce code space and undefined function references. */ @@ -143,9 +143,10 @@ notrace unsigned int __check_irq_replay(void) */ unsigned char happened = local_paca->irq_happened; - /* Clear bit 0 which we wouldn't clear otherwise */ - local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; if (happened & PACA_IRQ_HARD_DIS) { + /* Clear bit 0 which we wouldn't clear otherwise */ + local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; + /* * We may have missed a decrementer interrupt if hard disabled. * Check the decrementer register in case we had a rollover @@ -173,41 +174,39 @@ notrace unsigned int __check_irq_replay(void) * This is a higher priority interrupt than the others, so * replay it first. */ - local_paca->irq_happened &= ~PACA_IRQ_HMI; - if (happened & PACA_IRQ_HMI) + if (happened & PACA_IRQ_HMI) { + local_paca->irq_happened &= ~PACA_IRQ_HMI; return 0xe60; + } - /* - * We may have missed a decrementer interrupt. We check the - * decrementer itself rather than the paca irq_happened field - * in case we also had a rollover while hard disabled - */ - local_paca->irq_happened &= ~PACA_IRQ_DEC; - if (happened & PACA_IRQ_DEC) + if (happened & PACA_IRQ_DEC) { + local_paca->irq_happened &= ~PACA_IRQ_DEC; return 0x900; + } - /* Finally check if an external interrupt happened */ - local_paca->irq_happened &= ~PACA_IRQ_EE; - if (happened & PACA_IRQ_EE) + if (happened & PACA_IRQ_EE) { + local_paca->irq_happened &= ~PACA_IRQ_EE; return 0x500; + } #ifdef CONFIG_PPC_BOOK3E - /* Finally check if an EPR external interrupt happened - * this bit is typically set if we need to handle another - * "edge" interrupt from within the MPIC "EPR" handler + /* + * Check if an EPR external interrupt happened this bit is typically + * set if we need to handle another "edge" interrupt from within the + * MPIC "EPR" handler. */ - local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE; - if (happened & PACA_IRQ_EE_EDGE) + if (happened & PACA_IRQ_EE_EDGE) { + local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE; return 0x500; + } - local_paca->irq_happened &= ~PACA_IRQ_DBELL; - if (happened & PACA_IRQ_DBELL) + if (happened & PACA_IRQ_DBELL) { + local_paca->irq_happened &= ~PACA_IRQ_DBELL; return 0x280; + } #else - local_paca->irq_happened &= ~PACA_IRQ_DBELL; if (happened & PACA_IRQ_DBELL) { - if (cpu_has_feature(CPU_FTR_HVMODE)) - return 0xe80; + local_paca->irq_happened &= ~PACA_IRQ_DBELL; return 0xa00; } #endif /* CONFIG_PPC_BOOK3E */ @@ -483,6 +482,18 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, " Hypervisor Maintenance Interrupts\n"); } + seq_printf(p, "%*s: ", prec, "NMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat, j).sreset_irqs); + seq_printf(p, " System Reset interrupts\n"); + +#ifdef CONFIG_PPC_WATCHDOG + seq_printf(p, "%*s: ", prec, "WDG"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat, j).soft_nmi_irqs); + seq_printf(p, " Watchdog soft-NMI interrupts\n"); +#endif + #ifdef CONFIG_PPC_DOORBELL if (cpu_has_feature(CPU_FTR_DBELL)) { seq_printf(p, "%*s: ", prec, "DBL"); @@ -507,6 +518,10 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += per_cpu(irq_stat, cpu).spurious_irqs; sum += per_cpu(irq_stat, cpu).timer_irqs_others; sum += per_cpu(irq_stat, cpu).hmi_exceptions; + sum += per_cpu(irq_stat, cpu).sreset_irqs; +#ifdef CONFIG_PPC_WATCHDOG + sum += per_cpu(irq_stat, cpu).soft_nmi_irqs; +#endif #ifdef CONFIG_PPC_DOORBELL sum += per_cpu(irq_stat, cpu).doorbell_irqs; #endif diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index bb6f8993412e..1df6c74aa731 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -164,7 +164,7 @@ void __init isa_bridge_find_early(struct pci_controller *hose) /* Set the global ISA io base to indicate we have an ISA bridge */ isa_io_base = ISA_IO_BASE; - pr_debug("ISA bridge (early) is %s\n", np->full_name); + pr_debug("ISA bridge (early) is %pOF\n", np); } /** @@ -187,15 +187,15 @@ void __init isa_bridge_init_non_pci(struct device_node *np) pna = of_n_addr_cells(np); if (of_property_read_u32(np, "#address-cells", &na) || of_property_read_u32(np, "#size-cells", &ns)) { - pr_warn("ISA: Non-PCI bridge %s is missing address format\n", - np->full_name); + pr_warn("ISA: Non-PCI bridge %pOF is missing address format\n", + np); return; } /* Check it's a supported address format */ if (na != 2 || ns != 1) { - pr_warn("ISA: Non-PCI bridge %s has unsupported address format\n", - np->full_name); + pr_warn("ISA: Non-PCI bridge %pOF has unsupported address format\n", + np); return; } rs = na + ns + pna; @@ -203,8 +203,8 @@ void __init isa_bridge_init_non_pci(struct device_node *np) /* Grab the ranges property */ ranges = of_get_property(np, "ranges", &rlen); if (ranges == NULL || rlen < rs) { - pr_warn("ISA: Non-PCI bridge %s has absent or invalid ranges\n", - np->full_name); + pr_warn("ISA: Non-PCI bridge %pOF has absent or invalid ranges\n", + np); return; } @@ -220,8 +220,8 @@ void __init isa_bridge_init_non_pci(struct device_node *np) /* Got something ? */ if (!size || !pbasep) { - pr_warn("ISA: Non-PCI bridge %s has no usable IO range\n", - np->full_name); + pr_warn("ISA: Non-PCI bridge %pOF has no usable IO range\n", + np); return; } @@ -233,15 +233,15 @@ void __init isa_bridge_init_non_pci(struct device_node *np) /* Map pbase */ pbase = of_translate_address(np, pbasep); if (pbase == OF_BAD_ADDR) { - pr_warn("ISA: Non-PCI bridge %s failed to translate IO base\n", - np->full_name); + pr_warn("ISA: Non-PCI bridge %pOF failed to translate IO base\n", + np); return; } /* We need page alignment */ if ((cbase & ~PAGE_MASK) || (pbase & ~PAGE_MASK)) { - pr_warn("ISA: Non-PCI bridge %s has non aligned IO range\n", - np->full_name); + pr_warn("ISA: Non-PCI bridge %pOF has non aligned IO range\n", + np); return; } @@ -255,7 +255,7 @@ void __init isa_bridge_init_non_pci(struct device_node *np) __ioremap_at(pbase, (void *)ISA_IO_BASE, size, pgprot_val(pgprot_noncached(__pgprot(0)))); - pr_debug("ISA: Non-PCI bridge is %s\n", np->full_name); + pr_debug("ISA: Non-PCI bridge is %pOF\n", np); } /** @@ -277,8 +277,8 @@ static void isa_bridge_find_late(struct pci_dev *pdev, /* Set the global ISA io base to indicate we have an ISA bridge */ isa_io_base = ISA_IO_BASE; - pr_debug("ISA bridge (late) is %s on %s\n", - devnode->full_name, pci_name(pdev)); + pr_debug("ISA bridge (late) is %pOF on %s\n", + devnode, pci_name(pdev)); } /** diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index dbf098121ce6..35e240a0a408 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -67,9 +67,9 @@ static struct hard_trap_info #endif #else /* ! (defined(CONFIG_40x) || defined(CONFIG_BOOKE)) */ { 0x0d00, 0x05 /* SIGTRAP */ }, /* single-step */ -#if defined(CONFIG_8xx) +#if defined(CONFIG_PPC_8xx) { 0x1000, 0x04 /* SIGILL */ }, /* software emulation */ -#else /* ! CONFIG_8xx */ +#else /* ! CONFIG_PPC_8xx */ { 0x0f00, 0x04 /* SIGILL */ }, /* performance monitor */ { 0x0f20, 0x08 /* SIGFPE */ }, /* altivec unavailable */ { 0x1300, 0x05 /* SIGTRAP */ }, /* instruction address break */ diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 1086ea37c832..9ad37f827a97 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -25,7 +25,6 @@ #include <linux/kvm_para.h> #include <linux/slab.h> #include <linux/of.h> -#include <linux/nmi.h> /* hardlockup_detector_disable() */ #include <asm/reg.h> #include <asm/sections.h> @@ -719,12 +718,6 @@ static __init void kvm_free_tmp(void) static int __init kvm_guest_init(void) { - /* - * The hardlockup detector is likely to get false positives in - * KVM guests, so disable it by default. - */ - hardlockup_detector_disable(); - if (!kvm_para_available()) goto free_tmp; diff --git a/arch/powerpc/kernel/l2cr_6xx.S b/arch/powerpc/kernel/l2cr_6xx.S index 97ec8557f974..6408f09dbbd9 100644 --- a/arch/powerpc/kernel/l2cr_6xx.S +++ b/arch/powerpc/kernel/l2cr_6xx.S @@ -181,7 +181,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPEC7450) mtctr r4 li r4,0 1: - lwzx r0,r0,r4 + lwzx r0,0,r4 addi r4,r4,32 /* Go to start of next cache line */ bdnz 1b isync @@ -328,7 +328,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_L3CR) mtctr r4 li r4,0 1: - lwzx r0,r0,r4 + lwzx r0,0,r4 dcbf 0,r4 addi r4,r4,32 /* Go to start of next cache line */ bdnz 1b diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c index 0694d20f85b6..5e5a64a8b4e4 100644 --- a/arch/powerpc/kernel/legacy_serial.c +++ b/arch/powerpc/kernel/legacy_serial.c @@ -147,8 +147,8 @@ static int __init add_legacy_port(struct device_node *np, int want_index, legacy_serial_ports[index].serial_out = tsi_serial_out; } - printk(KERN_DEBUG "Found legacy serial port %d for %s\n", - index, np->full_name); + printk(KERN_DEBUG "Found legacy serial port %d for %pOF\n", + index, np); printk(KERN_DEBUG " %s=%llx, taddr=%llx, irq=%lx, clk=%d, speed=%d\n", (iotype == UPIO_PORT) ? "port" : "mem", (unsigned long long)base, (unsigned long long)taddr, irq, @@ -207,7 +207,7 @@ static int __init add_legacy_isa_port(struct device_node *np, int index = -1; u64 taddr; - DBG(" -> add_legacy_isa_port(%s)\n", np->full_name); + DBG(" -> add_legacy_isa_port(%pOF)\n", np); /* Get the ISA port number */ reg = of_get_property(np, "reg", NULL); @@ -256,7 +256,7 @@ static int __init add_legacy_pci_port(struct device_node *np, unsigned int flags; int iotype, index = -1, lindex = 0; - DBG(" -> add_legacy_pci_port(%s)\n", np->full_name); + DBG(" -> add_legacy_pci_port(%pOF)\n", np); /* We only support ports that have a clock frequency properly * encoded in the device-tree (that is have an fcode). Anything @@ -374,7 +374,7 @@ void __init find_legacy_serial_ports(void) if (path != NULL) { stdout = of_find_node_by_path(path); if (stdout) - DBG("stdout is %s\n", stdout->full_name); + DBG("stdout is %pOF\n", stdout); } else { DBG(" no linux,stdout-path !\n"); } @@ -603,7 +603,7 @@ static int __init check_legacy_serial_console(void) DBG(" can't find stdout package %s !\n", name); return -ENODEV; } - DBG("stdout is %s\n", prom_stdout->full_name); + DBG("stdout is %pOF\n", prom_stdout); name = of_get_property(prom_stdout, "name", NULL); if (!name) { diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index e0e131e662ed..9b2ea7e71c06 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -22,11 +22,14 @@ #undef DEBUG #define pr_fmt(fmt) "mce: " fmt +#include <linux/hardirq.h> #include <linux/types.h> #include <linux/ptrace.h> #include <linux/percpu.h> #include <linux/export.h> #include <linux/irq_work.h> + +#include <asm/machdep.h> #include <asm/mce.h> static DEFINE_PER_CPU(int, mce_nest_count); @@ -446,3 +449,33 @@ uint64_t get_mce_fault_addr(struct machine_check_event *evt) return 0; } EXPORT_SYMBOL(get_mce_fault_addr); + +/* + * This function is called in real mode. Strictly no printk's please. + * + * regs->nip and regs->msr contains srr0 and ssr1. + */ +long machine_check_early(struct pt_regs *regs) +{ + long handled = 0; + + __this_cpu_inc(irq_stat.mce_exceptions); + + if (cur_cpu_spec && cur_cpu_spec->machine_check_early) + handled = cur_cpu_spec->machine_check_early(regs); + return handled; +} + +long hmi_exception_realmode(struct pt_regs *regs) +{ + __this_cpu_inc(irq_stat.hmi_exceptions); + + wait_for_subcore_guest_exit(); + + if (ppc_md.hmi_exception_early) + ppc_md.hmi_exception_early(regs); + + wait_for_tb_resync(); + + return 0; +} diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index 34aeac54f120..becaec990140 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -45,7 +45,7 @@ static int of_pci_phb_probe(struct platform_device *dev) if (ppc_md.pci_setup_phb == NULL) return -ENODEV; - pr_info("Setting up PCI bus %s\n", dev->dev.of_node->full_name); + pr_info("Setting up PCI bus %pOF\n", dev->dev.of_node); /* Alloc and setup PHB data structure */ phb = pcibios_alloc_controller(dev->dev.of_node); diff --git a/arch/powerpc/kernel/optprobes_head.S b/arch/powerpc/kernel/optprobes_head.S index 4937bef7652f..52fc864cdec4 100644 --- a/arch/powerpc/kernel/optprobes_head.S +++ b/arch/powerpc/kernel/optprobes_head.S @@ -60,10 +60,6 @@ optprobe_template_entry: std r5,_CCR(r1) lbz r5,PACASOFTIRQEN(r13) std r5,SOFTE(r1) - mfdar r5 - std r5,_DAR(r1) - mfdsisr r5 - std r5,_DSISR(r1) /* * We may get here from a module, so load the kernel TOC in r2. @@ -122,10 +118,6 @@ optprobe_template_call_emulate: mtxer r5 ld r5,_CCR(r1) mtcr r5 - ld r5,_DAR(r1) - mtdar r5 - ld r5,_DSISR(r1) - mtdsisr r5 REST_GPR(0,r1) REST_10GPRS(2,r1) REST_10GPRS(12,r1) diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 8d63627e067f..70f073d6c3b2 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -99,18 +99,27 @@ static inline void free_lppacas(void) { } * If you make the number of persistent SLB entries dynamic, please also * update PR KVM to flush and restore them accordingly. */ -static struct slb_shadow *slb_shadow; +static struct slb_shadow * __initdata slb_shadow; static void __init allocate_slb_shadows(int nr_cpus, int limit) { int size = PAGE_ALIGN(sizeof(struct slb_shadow) * nr_cpus); + + if (early_radix_enabled()) + return; + slb_shadow = __va(memblock_alloc_base(size, PAGE_SIZE, limit)); memset(slb_shadow, 0, size); } static struct slb_shadow * __init init_slb_shadow(int cpu) { - struct slb_shadow *s = &slb_shadow[cpu]; + struct slb_shadow *s; + + if (early_radix_enabled()) + return NULL; + + s = &slb_shadow[cpu]; /* * When we come through here to initialise boot_paca, the slb_shadow diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 341a7469cab8..02831a396419 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -373,9 +373,8 @@ static int pci_read_irq_line(struct pci_dev *pci_dev) if (virq) irq_set_irq_type(virq, IRQ_TYPE_LEVEL_LOW); } else { - pr_debug(" Got one, spec %d cells (0x%08x 0x%08x...) on %s\n", - oirq.args_count, oirq.args[0], oirq.args[1], - of_node_full_name(oirq.np)); + pr_debug(" Got one, spec %d cells (0x%08x 0x%08x...) on %pOF\n", + oirq.args_count, oirq.args[0], oirq.args[1], oirq.np); virq = irq_create_of_mapping(&oirq); } @@ -741,8 +740,8 @@ void pci_process_bridge_OF_ranges(struct pci_controller *hose, struct of_pci_range range; struct of_pci_range_parser parser; - printk(KERN_INFO "PCI host bridge %s %s ranges:\n", - dev->full_name, primary ? "(primary)" : ""); + printk(KERN_INFO "PCI host bridge %pOF %s ranges:\n", + dev, primary ? "(primary)" : ""); /* Check for ranges property */ if (of_pci_range_parser_init(&parser, dev)) @@ -1556,8 +1555,8 @@ static void pcibios_setup_phb_resources(struct pci_controller *hose, if (!res->flags) { pr_debug("PCI: I/O resource not set for host" - " bridge %s (domain %d)\n", - hose->dn->full_name, hose->global_number); + " bridge %pOF (domain %d)\n", + hose->dn, hose->global_number); } else { offset = pcibios_io_space_offset(hose); @@ -1668,7 +1667,7 @@ void pcibios_scan_phb(struct pci_controller *hose) struct device_node *node = hose->dn; int mode; - pr_debug("PCI: Scanning PHB %s\n", of_node_full_name(node)); + pr_debug("PCI: Scanning PHB %pOF\n", node); /* Get some IO space for the new PHB */ pcibios_setup_phb_io_space(hose); diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index 41c86c6b6e4d..1d817f4d97d9 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -79,8 +79,8 @@ make_one_node_map(struct device_node* node, u8 pci_bus) return; bus_range = of_get_property(node, "bus-range", &len); if (bus_range == NULL || len < 2 * sizeof(int)) { - printk(KERN_WARNING "Can't get bus-range for %s, " - "assuming it starts at 0\n", node->full_name); + printk(KERN_WARNING "Can't get bus-range for %pOF, " + "assuming it starts at 0\n", node); pci_to_OF_bus_map[pci_bus] = 0; } else pci_to_OF_bus_map[pci_bus] = bus_range[0]; diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index ed5e9ff61a68..932b9741aa8f 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -111,7 +111,7 @@ int pcibios_unmap_io_space(struct pci_bus *bus) if (hose->io_base_alloc == NULL) return 0; - pr_debug("IO unmapping for PHB %s\n", hose->dn->full_name); + pr_debug("IO unmapping for PHB %pOF\n", hose->dn); pr_debug(" alloc=0x%p\n", hose->io_base_alloc); /* This is a PHB, we fully unmap the IO area */ @@ -151,7 +151,7 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose) hose->io_base_virt = (void __iomem *)(area->addr + hose->io_base_phys - phys_page); - pr_debug("IO mapping for PHB %s\n", hose->dn->full_name); + pr_debug("IO mapping for PHB %pOF\n", hose->dn); pr_debug(" phys=0x%016llx, virt=0x%p (alloc=0x%p)\n", hose->io_base_phys, hose->io_base_virt, hose->io_base_alloc); pr_debug(" size=0x%016llx (alloc=0x%016lx)\n", diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index 592693437070..0e395afbf0f4 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -139,7 +139,6 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev) #ifdef CONFIG_PCI_IOV static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, - struct pci_dev *pdev, int vf_index, int busno, int devfn) { @@ -150,10 +149,8 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, return NULL; pdn = kzalloc(sizeof(*pdn), GFP_KERNEL); - if (!pdn) { - dev_warn(&pdev->dev, "%s: Out of memory!\n", __func__); + if (!pdn) return NULL; - } pdn->phb = parent->phb; pdn->parent = parent; @@ -167,13 +164,6 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, INIT_LIST_HEAD(&pdn->list); list_add_tail(&pdn->list, &parent->child_list); - /* - * If we already have PCI device instance, lets - * bind them. - */ - if (pdev) - pdev->dev.archdata.pci_data = pdn; - return pdn; } #endif @@ -201,7 +191,7 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev) for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { struct eeh_dev *edev __maybe_unused; - pdn = add_one_dev_pci_data(parent, NULL, i, + pdn = add_one_dev_pci_data(parent, i, pci_iov_virtfn_bus(pdev, i), pci_iov_virtfn_devfn(pdev, i)); if (!pdn) { @@ -303,7 +293,6 @@ struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, if (pdn == NULL) return NULL; dn->data = pdn; - pdn->node = dn; pdn->phb = hose; #ifdef CONFIG_PPC_POWERNV pdn->pe_number = IODA_INVALID_PE; @@ -352,6 +341,7 @@ EXPORT_SYMBOL_GPL(pci_add_device_node_info); void pci_remove_device_node_info(struct device_node *dn) { struct pci_dn *pdn = dn ? PCI_DN(dn) : NULL; + struct device_node *parent; #ifdef CONFIG_EEH struct eeh_dev *edev = pdn_to_eeh_dev(pdn); @@ -364,8 +354,10 @@ void pci_remove_device_node_info(struct device_node *dn) WARN_ON(!list_empty(&pdn->child_list)); list_del(&pdn->list); - if (pdn->parent) - of_node_put(pdn->parent->node); + + parent = of_get_parent(dn); + if (parent) + of_node_put(parent); dn->data = NULL; kfree(pdn); diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c index ea3d98115b88..0d790f8432d2 100644 --- a/arch/powerpc/kernel/pci_of_scan.c +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -211,19 +211,19 @@ void of_scan_pci_bridge(struct pci_dev *dev) unsigned int flags; u64 size; - pr_debug("of_scan_pci_bridge(%s)\n", node->full_name); + pr_debug("of_scan_pci_bridge(%pOF)\n", node); /* parse bus-range property */ busrange = of_get_property(node, "bus-range", &len); if (busrange == NULL || len != 8) { - printk(KERN_DEBUG "Can't get bus-range for PCI-PCI bridge %s\n", - node->full_name); + printk(KERN_DEBUG "Can't get bus-range for PCI-PCI bridge %pOF\n", + node); return; } ranges = of_get_property(node, "ranges", &len); if (ranges == NULL) { - printk(KERN_DEBUG "Can't get ranges for PCI-PCI bridge %s\n", - node->full_name); + printk(KERN_DEBUG "Can't get ranges for PCI-PCI bridge %pOF\n", + node); return; } @@ -233,8 +233,8 @@ void of_scan_pci_bridge(struct pci_dev *dev) bus = pci_add_new_bus(dev->bus, dev, of_read_number(busrange, 1)); if (!bus) { - printk(KERN_ERR "Failed to create pci bus for %s\n", - node->full_name); + printk(KERN_ERR "Failed to create pci bus for %pOF\n", + node); return; } } @@ -262,13 +262,13 @@ void of_scan_pci_bridge(struct pci_dev *dev) res = bus->resource[0]; if (res->flags) { printk(KERN_ERR "PCI: ignoring extra I/O range" - " for bridge %s\n", node->full_name); + " for bridge %pOF\n", node); continue; } } else { if (i >= PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES) { printk(KERN_ERR "PCI: too many memory ranges" - " for bridge %s\n", node->full_name); + " for bridge %pOF\n", node); continue; } res = bus->resource[i]; @@ -307,7 +307,7 @@ static struct pci_dev *of_scan_pci_dev(struct pci_bus *bus, struct eeh_dev *edev = pdn_to_eeh_dev(PCI_DN(dn)); #endif - pr_debug(" * %s\n", dn->full_name); + pr_debug(" * %pOF\n", dn); if (!of_device_is_available(dn)) return NULL; @@ -350,8 +350,8 @@ static void __of_scan_bus(struct device_node *node, struct pci_bus *bus, struct device_node *child; struct pci_dev *dev; - pr_debug("of_scan_bus(%s) bus no %d...\n", - node->full_name, bus->number); + pr_debug("of_scan_bus(%pOF) bus no %d...\n", + node, bus->number); /* Scan direct children */ for_each_child_of_node(node, child) { diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 1f0fd361e09b..a0c74bbf3454 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -230,7 +230,8 @@ void enable_kernel_fp(void) } EXPORT_SYMBOL(enable_kernel_fp); -static int restore_fp(struct task_struct *tsk) { +static int restore_fp(struct task_struct *tsk) +{ if (tsk->thread.load_fp || msr_tm_active(tsk->thread.regs->msr)) { load_fp_state(¤t->thread.fp_state); current->thread.load_fp++; @@ -330,11 +331,19 @@ static inline int restore_altivec(struct task_struct *tsk) { return 0; } #ifdef CONFIG_VSX static void __giveup_vsx(struct task_struct *tsk) { - if (tsk->thread.regs->msr & MSR_FP) + unsigned long msr = tsk->thread.regs->msr; + + /* + * We should never be ssetting MSR_VSX without also setting + * MSR_FP and MSR_VEC + */ + WARN_ON((msr & MSR_VSX) && !((msr & MSR_FP) && (msr & MSR_VEC))); + + /* __giveup_fpu will clear MSR_VSX */ + if (msr & MSR_FP) __giveup_fpu(tsk); - if (tsk->thread.regs->msr & MSR_VEC) + if (msr & MSR_VEC) __giveup_altivec(tsk); - tsk->thread.regs->msr &= ~MSR_VSX; } static void giveup_vsx(struct task_struct *tsk) @@ -346,14 +355,6 @@ static void giveup_vsx(struct task_struct *tsk) msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX); } -static void save_vsx(struct task_struct *tsk) -{ - if (tsk->thread.regs->msr & MSR_FP) - save_fpu(tsk); - if (tsk->thread.regs->msr & MSR_VEC) - save_altivec(tsk); -} - void enable_kernel_vsx(void) { unsigned long cpumsr; @@ -374,10 +375,6 @@ void enable_kernel_vsx(void) */ if(!msr_tm_active(cpumsr) && msr_tm_active(current->thread.regs->msr)) return; - if (current->thread.regs->msr & MSR_FP) - __giveup_fpu(current); - if (current->thread.regs->msr & MSR_VEC) - __giveup_altivec(current); __giveup_vsx(current); } } @@ -407,7 +404,6 @@ static int restore_vsx(struct task_struct *tsk) } #else static inline int restore_vsx(struct task_struct *tsk) { return 0; } -static inline void save_vsx(struct task_struct *tsk) { } #endif /* CONFIG_VSX */ #ifdef CONFIG_SPE @@ -487,6 +483,8 @@ void giveup_all(struct task_struct *tsk) msr_check_and_set(msr_all_available); check_if_tm_restore_required(tsk); + WARN_ON((usermsr & MSR_VSX) && !((usermsr & MSR_FP) && (usermsr & MSR_VEC))); + #ifdef CONFIG_PPC_FPU if (usermsr & MSR_FP) __giveup_fpu(tsk); @@ -495,10 +493,6 @@ void giveup_all(struct task_struct *tsk) if (usermsr & MSR_VEC) __giveup_altivec(tsk); #endif -#ifdef CONFIG_VSX - if (usermsr & MSR_VSX) - __giveup_vsx(tsk); -#endif #ifdef CONFIG_SPE if (usermsr & MSR_SPE) __giveup_spe(tsk); @@ -553,19 +547,13 @@ void save_all(struct task_struct *tsk) msr_check_and_set(msr_all_available); - /* - * Saving the way the register space is in hardware, save_vsx boils - * down to a save_fpu() and save_altivec() - */ - if (usermsr & MSR_VSX) { - save_vsx(tsk); - } else { - if (usermsr & MSR_FP) - save_fpu(tsk); + WARN_ON((usermsr & MSR_VSX) && !((usermsr & MSR_FP) && (usermsr & MSR_VEC))); - if (usermsr & MSR_VEC) - save_altivec(tsk); - } + if (usermsr & MSR_FP) + save_fpu(tsk); + + if (usermsr & MSR_VEC) + save_altivec(tsk); if (usermsr & MSR_SPE) __giveup_spe(tsk); @@ -1392,13 +1380,13 @@ void show_regs(struct pt_regs * regs) show_regs_print_info(KERN_DEFAULT); - printk("NIP: "REG" LR: "REG" CTR: "REG"\n", + printk("NIP: "REG" LR: "REG" CTR: "REG"\n", regs->nip, regs->link, regs->ctr); printk("REGS: %p TRAP: %04lx %s (%s)\n", regs, regs->trap, print_tainted(), init_utsname()->release); - printk("MSR: "REG" ", regs->msr); + printk("MSR: "REG" ", regs->msr); print_msr_bits(regs->msr); - printk(" CR: %08lx XER: %08lx\n", regs->ccr, regs->xer); + pr_cont(" CR: %08lx XER: %08lx\n", regs->ccr, regs->xer); trap = TRAP(regs); if ((regs->trap != 0xc00) && cpu_has_feature(CPU_FTR_CFAR)) pr_cont("CFAR: "REG" ", regs->orig_gpr3); @@ -1991,11 +1979,25 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) void notrace __ppc64_runlatch_on(void) { struct thread_info *ti = current_thread_info(); - unsigned long ctrl; - ctrl = mfspr(SPRN_CTRLF); - ctrl |= CTRL_RUNLATCH; - mtspr(SPRN_CTRLT, ctrl); + if (cpu_has_feature(CPU_FTR_ARCH_206)) { + /* + * Least significant bit (RUN) is the only writable bit of + * the CTRL register, so we can avoid mfspr. 2.06 is not the + * earliest ISA where this is the case, but it's convenient. + */ + mtspr(SPRN_CTRLT, CTRL_RUNLATCH); + } else { + unsigned long ctrl; + + /* + * Some architectures (e.g., Cell) have writable fields other + * than RUN, so do the read-modify-write. + */ + ctrl = mfspr(SPRN_CTRLF); + ctrl |= CTRL_RUNLATCH; + mtspr(SPRN_CTRLT, ctrl); + } ti->local_flags |= _TLF_RUNLATCH; } @@ -2004,13 +2006,18 @@ void notrace __ppc64_runlatch_on(void) void notrace __ppc64_runlatch_off(void) { struct thread_info *ti = current_thread_info(); - unsigned long ctrl; ti->local_flags &= ~_TLF_RUNLATCH; - ctrl = mfspr(SPRN_CTRLF); - ctrl &= ~CTRL_RUNLATCH; - mtspr(SPRN_CTRLT, ctrl); + if (cpu_has_feature(CPU_FTR_ARCH_206)) { + mtspr(SPRN_CTRLT, 0); + } else { + unsigned long ctrl; + + ctrl = mfspr(SPRN_CTRLF); + ctrl &= ~CTRL_RUNLATCH; + mtspr(SPRN_CTRLT, ctrl); + } } #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 613f79f03877..02190e90c7ae 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -177,6 +177,7 @@ struct platform_support { bool hash_mmu; bool radix_mmu; bool radix_gtse; + bool xive; }; /* Platforms codes are now obsolete in the kernel. Now only used within this @@ -1041,6 +1042,27 @@ static void __init prom_parse_mmu_model(u8 val, } } +static void __init prom_parse_xive_model(u8 val, + struct platform_support *support) +{ + switch (val) { + case OV5_FEAT(OV5_XIVE_EITHER): /* Either Available */ + prom_debug("XIVE - either mode supported\n"); + support->xive = true; + break; + case OV5_FEAT(OV5_XIVE_EXPLOIT): /* Only Exploitation mode */ + prom_debug("XIVE - exploitation mode supported\n"); + support->xive = true; + break; + case OV5_FEAT(OV5_XIVE_LEGACY): /* Only Legacy mode */ + prom_debug("XIVE - legacy mode supported\n"); + break; + default: + prom_debug("Unknown xive support option: 0x%x\n", val); + break; + } +} + static void __init prom_parse_platform_support(u8 index, u8 val, struct platform_support *support) { @@ -1054,6 +1076,10 @@ static void __init prom_parse_platform_support(u8 index, u8 val, support->radix_gtse = true; } break; + case OV5_INDX(OV5_XIVE_SUPPORT): /* Interrupt mode */ + prom_parse_xive_model(val & OV5_FEAT(OV5_XIVE_SUPPORT), + support); + break; } } @@ -1062,7 +1088,8 @@ static void __init prom_check_platform_support(void) struct platform_support supported = { .hash_mmu = false, .radix_mmu = false, - .radix_gtse = false + .radix_gtse = false, + .xive = false }; int prop_len = prom_getproplen(prom.chosen, "ibm,arch-vec-5-platform-support"); @@ -1095,6 +1122,11 @@ static void __init prom_check_platform_support(void) /* We're probably on a legacy hypervisor */ prom_debug("Assuming legacy hash support\n"); } + + if (supported.xive) { + prom_debug("Asking for XIVE\n"); + ibm_architecture_vec.vec5.intarch = OV5_FEAT(OV5_XIVE_EXPLOIT); + } } static void __init prom_send_capabilities(void) diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 660ed39e9c9a..07cd22e35405 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -1594,11 +1594,8 @@ static int ppr_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - int ret; - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.ppr, 0, sizeof(u64)); - return ret; + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.ppr, 0, sizeof(u64)); } static int ppr_set(struct task_struct *target, @@ -1606,11 +1603,8 @@ static int ppr_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - int ret; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.ppr, 0, sizeof(u64)); - return ret; + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.ppr, 0, sizeof(u64)); } static int dscr_get(struct task_struct *target, @@ -1618,22 +1612,16 @@ static int dscr_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - int ret; - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.dscr, 0, sizeof(u64)); - return ret; + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.dscr, 0, sizeof(u64)); } static int dscr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - int ret; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.dscr, 0, sizeof(u64)); - return ret; + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.dscr, 0, sizeof(u64)); } #endif #ifdef CONFIG_PPC_BOOK3S_64 @@ -1642,22 +1630,16 @@ static int tar_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - int ret; - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.tar, 0, sizeof(u64)); - return ret; + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.tar, 0, sizeof(u64)); } static int tar_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - int ret; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.tar, 0, sizeof(u64)); - return ret; + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.tar, 0, sizeof(u64)); } static int ebb_active(struct task_struct *target, diff --git a/arch/powerpc/kernel/reloc_64.S b/arch/powerpc/kernel/reloc_64.S index d88736fbece6..e8cfc69f59ae 100644 --- a/arch/powerpc/kernel/reloc_64.S +++ b/arch/powerpc/kernel/reloc_64.S @@ -82,7 +82,7 @@ _GLOBAL(relocate) 6: blr .balign 8 -p_dyn: .llong __dynamic_start - 0b -p_rela: .llong __rela_dyn_start - 0b -p_st: .llong _stext - 0b +p_dyn: .8byte __dynamic_start - 0b +p_rela: .8byte __rela_dyn_start - 0b +p_st: .8byte _stext - 0b diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c index 73f1934582c2..c2b148b1634a 100644 --- a/arch/powerpc/kernel/rtas_pci.c +++ b/arch/powerpc/kernel/rtas_pci.c @@ -91,26 +91,14 @@ static int rtas_pci_read_config(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) { - struct device_node *busdn, *dn; struct pci_dn *pdn; - bool found = false; int ret; - /* Search only direct children of the bus */ *val = 0xFFFFFFFF; - busdn = pci_bus_to_OF_node(bus); - for (dn = busdn->child; dn; dn = dn->sibling) { - pdn = PCI_DN(dn); - if (pdn && pdn->devfn == devfn - && of_device_is_available(dn)) { - found = true; - break; - } - } - if (!found) - return PCIBIOS_DEVICE_NOT_FOUND; + pdn = pci_get_pdn_by_devfn(bus, devfn); + /* Validity of pdn is checked in here */ ret = rtas_read_config(pdn, where, size, val); if (*val == EEH_IO_ERROR_VALUE(size) && eeh_dev_check_failure(pdn_to_eeh_dev(pdn))) @@ -153,24 +141,11 @@ static int rtas_pci_write_config(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val) { - struct device_node *busdn, *dn; struct pci_dn *pdn; - bool found = false; - - /* Search only direct children of the bus */ - busdn = pci_bus_to_OF_node(bus); - for (dn = busdn->child; dn; dn = dn->sibling) { - pdn = PCI_DN(dn); - if (pdn && pdn->devfn == devfn - && of_device_is_available(dn)) { - found = true; - break; - } - } - if (!found) - return PCIBIOS_DEVICE_NOT_FOUND; + pdn = pci_get_pdn_by_devfn(bus, devfn); + /* Validity of pdn is checked in here. */ return rtas_write_config(pdn, where, size, val); } diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 94a948207cd2..7de73589d8e2 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -481,7 +481,7 @@ void __init smp_setup_cpu_maps(void) __be32 cpu_be; int j, len; - DBG(" * %s...\n", dn->full_name); + DBG(" * %pOF...\n", dn); intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len); @@ -704,30 +704,6 @@ int check_legacy_ioport(unsigned long base_port) } EXPORT_SYMBOL(check_legacy_ioport); -static int ppc_panic_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - /* - * If firmware-assisted dump has been registered then trigger - * firmware-assisted dump and let firmware handle everything else. - */ - crash_fadump(NULL, ptr); - ppc_md.panic(ptr); /* May not return */ - return NOTIFY_DONE; -} - -static struct notifier_block ppc_panic_block = { - .notifier_call = ppc_panic_event, - .priority = INT_MIN /* may not return; must be done last */ -}; - -void __init setup_panic(void) -{ - if (!ppc_md.panic) - return; - atomic_notifier_chain_register(&panic_notifier_list, &ppc_panic_block); -} - #ifdef CONFIG_CHECK_CACHE_COHERENCY /* * For platforms that have configurable cache-coherency. This function @@ -872,9 +848,6 @@ void __init setup_arch(char **cmdline_p) /* Probe the machine type, establish ppc_md. */ probe_machine(); - /* Setup panic notifier if requested by the platform. */ - setup_panic(); - /* * Configure ppc_md.power_save (ppc32 only, 64-bit machines do * it from their respective probe() function. @@ -916,13 +889,6 @@ void __init setup_arch(char **cmdline_p) /* Reserve large chunks of memory for use by CMA for KVM. */ kvm_cma_reserve(); - /* - * Reserve any gigantic pages requested on the command line. - * memblock needs to have been initialized by the time this is - * called since this will reserve memory. - */ - reserve_hugetlb_gpages(); - klp_init_thread_info(&init_thread_info); init_mm.start_code = (unsigned long)_stext; diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 2f88f6cf1a42..51ebc01fff52 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -98,6 +98,9 @@ extern unsigned int memset_nocache_branch; /* Insn to be replaced by NOP */ notrace void __init machine_init(u64 dt_ptr) { + unsigned int *addr = &memset_nocache_branch; + unsigned long insn; + /* Configure static keys first, now that we're relocated. */ setup_feature_keys(); @@ -105,7 +108,9 @@ notrace void __init machine_init(u64 dt_ptr) udbg_early_init(); patch_instruction((unsigned int *)&memcpy, PPC_INST_NOP); - patch_instruction(&memset_nocache_branch, PPC_INST_NOP); + + insn = create_cond_branch(addr, branch_target(addr), 0x820000); + patch_instruction(addr, insn); /* replace b by bne cr0 */ /* Do some early initialization based on the flat device tree */ early_init_devtree(__va(dt_ptr)); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index af23d4b576ec..b89c6aac48c9 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -564,6 +564,9 @@ static __init u64 safe_stack_limit(void) /* Other BookE, we assume the first GB is bolted */ return 1ul << 30; #else + if (early_radix_enabled()) + return ULONG_MAX; + /* BookS, the first segment is bolted */ if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) return 1UL << SID_SHIFT_1T; @@ -578,7 +581,8 @@ void __init irqstack_early_init(void) /* * Interrupt stacks must be in the first segment since we - * cannot afford to take SLB misses on them. + * cannot afford to take SLB misses on them. They are not + * accessed in realmode. */ for_each_possible_cpu(i) { softirq_ctx[i] = (struct thread_info *) @@ -649,8 +653,9 @@ void __init emergency_stack_init(void) * aligned. * * Since we use these as temporary stacks during secondary CPU - * bringup, we need to get at them in real mode. This means they - * must also be within the RMO region. + * bringup, machine check, system reset, and HMI, we need to get + * at them in real mode. This means they must also be within the RMO + * region. * * The IRQ stacks allocated elsewhere in this file are zeroed and * initialized in kernel/irq.c. These are initialized here in order @@ -751,3 +756,31 @@ unsigned long memory_block_size_bytes(void) struct ppc_pci_io ppc_pci_io; EXPORT_SYMBOL(ppc_pci_io); #endif + +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF +u64 hw_nmi_get_sample_period(int watchdog_thresh) +{ + return ppc_proc_freq * watchdog_thresh; +} +#endif + +/* + * The perf based hardlockup detector breaks PMU event based branches, so + * disable it by default. Book3S has a soft-nmi hardlockup detector based + * on the decrementer interrupt, so it does not suffer from this problem. + * + * It is likely to get false positives in VM guests, so disable it there + * by default too. + */ +static int __init disable_hardlockup_detector(void) +{ +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF + hardlockup_detector_disable(); +#else + if (firmware_has_feature(FW_FEATURE_LPAR)) + hardlockup_detector_disable(); +#endif + + return 0; +} +early_initcall(disable_hardlockup_detector); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 8d3320562c70..e0a4c1f82e25 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -75,9 +75,11 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 }; struct thread_info *secondary_ti; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map); DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); +EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); /* SMP operations for this machine */ @@ -571,6 +573,26 @@ static void smp_store_cpu_info(int id) #endif } +/* + * Relationships between CPUs are maintained in a set of per-cpu cpumasks so + * rather than just passing around the cpumask we pass around a function that + * returns the that cpumask for the given CPU. + */ +static void set_cpus_related(int i, int j, struct cpumask *(*get_cpumask)(int)) +{ + cpumask_set_cpu(i, get_cpumask(j)); + cpumask_set_cpu(j, get_cpumask(i)); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void set_cpus_unrelated(int i, int j, + struct cpumask *(*get_cpumask)(int)) +{ + cpumask_clear_cpu(i, get_cpumask(j)); + cpumask_clear_cpu(j, get_cpumask(i)); +} +#endif + void __init smp_prepare_cpus(unsigned int max_cpus) { unsigned int cpu; @@ -590,6 +612,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus) for_each_possible_cpu(cpu) { zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu), GFP_KERNEL, cpu_to_node(cpu)); + zalloc_cpumask_var_node(&per_cpu(cpu_l2_cache_map, cpu), + GFP_KERNEL, cpu_to_node(cpu)); zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), GFP_KERNEL, cpu_to_node(cpu)); /* @@ -602,7 +626,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus) } } + /* Init the cpumasks so the boot CPU is related to itself */ cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid)); + cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid)); cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid)); if (smp_ops && smp_ops->probe) @@ -828,33 +854,6 @@ int cpu_first_thread_of_core(int core) } EXPORT_SYMBOL_GPL(cpu_first_thread_of_core); -static void traverse_siblings_chip_id(int cpu, bool add, int chipid) -{ - const struct cpumask *mask; - struct device_node *np; - int i, plen; - const __be32 *prop; - - mask = add ? cpu_online_mask : cpu_present_mask; - for_each_cpu(i, mask) { - np = of_get_cpu_node(i, NULL); - if (!np) - continue; - prop = of_get_property(np, "ibm,chip-id", &plen); - if (prop && plen == sizeof(int) && - of_read_number(prop, 1) == chipid) { - if (add) { - cpumask_set_cpu(cpu, cpu_core_mask(i)); - cpumask_set_cpu(i, cpu_core_mask(cpu)); - } else { - cpumask_clear_cpu(cpu, cpu_core_mask(i)); - cpumask_clear_cpu(i, cpu_core_mask(cpu)); - } - } - of_node_put(np); - } -} - /* Must be called when no change can occur to cpu_present_mask, * i.e. during cpu online or offline. */ @@ -877,52 +876,93 @@ static struct device_node *cpu_to_l2cache(int cpu) return cache; } -static void traverse_core_siblings(int cpu, bool add) +static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int)) { struct device_node *l2_cache, *np; - const struct cpumask *mask; - int i, chip, plen; - const __be32 *prop; - - /* First see if we have ibm,chip-id properties in cpu nodes */ - np = of_get_cpu_node(cpu, NULL); - if (np) { - chip = -1; - prop = of_get_property(np, "ibm,chip-id", &plen); - if (prop && plen == sizeof(int)) - chip = of_read_number(prop, 1); - of_node_put(np); - if (chip >= 0) { - traverse_siblings_chip_id(cpu, add, chip); - return; - } - } + int i; l2_cache = cpu_to_l2cache(cpu); - mask = add ? cpu_online_mask : cpu_present_mask; - for_each_cpu(i, mask) { + if (!l2_cache) + return false; + + for_each_cpu(i, cpu_online_mask) { + /* + * when updating the marks the current CPU has not been marked + * online, but we need to update the cache masks + */ np = cpu_to_l2cache(i); if (!np) continue; - if (np == l2_cache) { - if (add) { - cpumask_set_cpu(cpu, cpu_core_mask(i)); - cpumask_set_cpu(i, cpu_core_mask(cpu)); - } else { - cpumask_clear_cpu(cpu, cpu_core_mask(i)); - cpumask_clear_cpu(i, cpu_core_mask(cpu)); - } - } + + if (np == l2_cache) + set_cpus_related(cpu, i, mask_fn); + of_node_put(np); } of_node_put(l2_cache); + + return true; } +#ifdef CONFIG_HOTPLUG_CPU +static void remove_cpu_from_masks(int cpu) +{ + int i; + + /* NB: cpu_core_mask is a superset of the others */ + for_each_cpu(i, cpu_core_mask(cpu)) { + set_cpus_unrelated(cpu, i, cpu_core_mask); + set_cpus_unrelated(cpu, i, cpu_l2_cache_mask); + set_cpus_unrelated(cpu, i, cpu_sibling_mask); + } +} +#endif + +static void add_cpu_to_masks(int cpu) +{ + int first_thread = cpu_first_thread_sibling(cpu); + int chipid = cpu_to_chip_id(cpu); + int i; + + /* + * This CPU will not be in the online mask yet so we need to manually + * add it to it's own thread sibling mask. + */ + cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); + + for (i = first_thread; i < first_thread + threads_per_core; i++) + if (cpu_online(i)) + set_cpus_related(i, cpu, cpu_sibling_mask); + + /* + * Copy the thread sibling mask into the cache sibling mask + * and mark any CPUs that share an L2 with this CPU. + */ + for_each_cpu(i, cpu_sibling_mask(cpu)) + set_cpus_related(cpu, i, cpu_l2_cache_mask); + update_mask_by_l2(cpu, cpu_l2_cache_mask); + + /* + * Copy the cache sibling mask into core sibling mask and mark + * any CPUs on the same chip as this CPU. + */ + for_each_cpu(i, cpu_l2_cache_mask(cpu)) + set_cpus_related(cpu, i, cpu_core_mask); + + if (chipid == -1) + return; + + for_each_cpu(i, cpu_online_mask) + if (cpu_to_chip_id(i) == chipid) + set_cpus_related(cpu, i, cpu_core_mask); +} + +static bool shared_caches; + /* Activate a secondary processor. */ void start_secondary(void *unused) { unsigned int cpu = smp_processor_id(); - int i, base; mmgrab(&init_mm); current->active_mm = &init_mm; @@ -945,22 +985,15 @@ void start_secondary(void *unused) vdso_getcpu_init(); #endif - /* Update sibling maps */ - base = cpu_first_thread_sibling(cpu); - for (i = 0; i < threads_per_core; i++) { - if (cpu_is_offline(base + i) && (cpu != base + i)) - continue; - cpumask_set_cpu(cpu, cpu_sibling_mask(base + i)); - cpumask_set_cpu(base + i, cpu_sibling_mask(cpu)); + /* Update topology CPU masks */ + add_cpu_to_masks(cpu); - /* cpu_core_map should be a superset of - * cpu_sibling_map even if we don't have cache - * information, so update the former here, too. - */ - cpumask_set_cpu(cpu, cpu_core_mask(base + i)); - cpumask_set_cpu(base + i, cpu_core_mask(cpu)); - } - traverse_core_siblings(cpu, true); + /* + * Check for any shared caches. Note that this must be done on a + * per-core basis because one core in the pair might be disabled. + */ + if (!cpumask_equal(cpu_l2_cache_mask(cpu), cpu_sibling_mask(cpu))) + shared_caches = true; set_numa_node(numa_cpu_lookup_table[cpu]); set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu])); @@ -1003,6 +1036,35 @@ static struct sched_domain_topology_level powerpc_topology[] = { { NULL, }, }; +/* + * P9 has a slightly odd architecture where pairs of cores share an L2 cache. + * This topology makes it *much* cheaper to migrate tasks between adjacent cores + * since the migrated task remains cache hot. We want to take advantage of this + * at the scheduler level so an extra topology level is required. + */ +static int powerpc_shared_cache_flags(void) +{ + return SD_SHARE_PKG_RESOURCES; +} + +/* + * We can't just pass cpu_l2_cache_mask() directly because + * returns a non-const pointer and the compiler barfs on that. + */ +static const struct cpumask *shared_cache_mask(int cpu) +{ + return cpu_l2_cache_mask(cpu); +} + +static struct sched_domain_topology_level power9_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, +#endif + { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) }, + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + void __init smp_cpus_done(unsigned int max_cpus) { /* @@ -1016,14 +1078,23 @@ void __init smp_cpus_done(unsigned int max_cpus) dump_numa_cpu_topology(); - set_sched_topology(powerpc_topology); + /* + * If any CPU detects that it's sharing a cache with another CPU then + * use the deeper topology that is aware of this sharing. + */ + if (shared_caches) { + pr_info("Using shared cache scheduler topology\n"); + set_sched_topology(power9_topology); + } else { + pr_info("Using standard scheduler topology\n"); + set_sched_topology(powerpc_topology); + } } #ifdef CONFIG_HOTPLUG_CPU int __cpu_disable(void) { int cpu = smp_processor_id(); - int base, i; int err; if (!smp_ops->cpu_disable) @@ -1034,14 +1105,7 @@ int __cpu_disable(void) return err; /* Update sibling maps */ - base = cpu_first_thread_sibling(cpu); - for (i = 0; i < threads_per_core && base + i < nr_cpu_ids; i++) { - cpumask_clear_cpu(cpu, cpu_sibling_mask(base + i)); - cpumask_clear_cpu(base + i, cpu_sibling_mask(cpu)); - cpumask_clear_cpu(cpu, cpu_core_mask(base + i)); - cpumask_clear_cpu(base + i, cpu_core_mask(cpu)); - } - traverse_core_siblings(cpu, false); + remove_cpu_from_masks(cpu); return 0; } diff --git a/arch/powerpc/kernel/swsusp_asm64.S b/arch/powerpc/kernel/swsusp_asm64.S index 988f38dced0f..82d8aae81c6a 100644 --- a/arch/powerpc/kernel/swsusp_asm64.S +++ b/arch/powerpc/kernel/swsusp_asm64.S @@ -179,7 +179,7 @@ nothing_to_copy: sld r3, r3, r0 li r0, 0 1: - dcbf r0,r3 + dcbf 0,r3 addi r3,r3,0x20 bdnz 1b diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S index 4d6b1d3a747f..7ccb7f81f8db 100644 --- a/arch/powerpc/kernel/systbl.S +++ b/arch/powerpc/kernel/systbl.S @@ -17,13 +17,13 @@ #include <asm/ppc_asm.h> #ifdef CONFIG_PPC64 -#define SYSCALL(func) .llong DOTSYM(sys_##func),DOTSYM(sys_##func) -#define COMPAT_SYS(func) .llong DOTSYM(sys_##func),DOTSYM(compat_sys_##func) -#define PPC_SYS(func) .llong DOTSYM(ppc_##func),DOTSYM(ppc_##func) -#define OLDSYS(func) .llong DOTSYM(sys_ni_syscall),DOTSYM(sys_ni_syscall) -#define SYS32ONLY(func) .llong DOTSYM(sys_ni_syscall),DOTSYM(compat_sys_##func) -#define PPC64ONLY(func) .llong DOTSYM(ppc_##func),DOTSYM(sys_ni_syscall) -#define SYSX(f, f3264, f32) .llong DOTSYM(f),DOTSYM(f3264) +#define SYSCALL(func) .8byte DOTSYM(sys_##func),DOTSYM(sys_##func) +#define COMPAT_SYS(func) .8byte DOTSYM(sys_##func),DOTSYM(compat_sys_##func) +#define PPC_SYS(func) .8byte DOTSYM(ppc_##func),DOTSYM(ppc_##func) +#define OLDSYS(func) .8byte DOTSYM(sys_ni_syscall),DOTSYM(sys_ni_syscall) +#define SYS32ONLY(func) .8byte DOTSYM(sys_ni_syscall),DOTSYM(compat_sys_##func) +#define PPC64ONLY(func) .8byte DOTSYM(ppc_##func),DOTSYM(sys_ni_syscall) +#define SYSX(f, f3264, f32) .8byte DOTSYM(f),DOTSYM(f3264) #else #define SYSCALL(func) .long sys_##func #define COMPAT_SYS(func) .long sys_##func diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index bfcfd9ef09f2..ec74e203ee04 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -114,6 +114,28 @@ static void pmac_backlight_unblank(void) static inline void pmac_backlight_unblank(void) { } #endif +/* + * If oops/die is expected to crash the machine, return true here. + * + * This should not be expected to be 100% accurate, there may be + * notifiers registered or other unexpected conditions that may bring + * down the kernel. Or if the current process in the kernel is holding + * locks or has other critical state, the kernel may become effectively + * unusable anyway. + */ +bool die_will_crash(void) +{ + if (should_fadump_crash()) + return true; + if (kexec_should_crash(current)) + return true; + if (in_interrupt() || panic_on_oops || + !current->pid || is_global_init(current)) + return true; + + return false; +} + static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; @@ -162,21 +184,9 @@ static void oops_end(unsigned long flags, struct pt_regs *regs, crash_fadump(regs, "die oops"); - /* - * A system reset (0x100) is a request to dump, so we always send - * it through the crashdump code. - */ - if (kexec_should_crash(current) || (TRAP(regs) == 0x100)) { + if (kexec_should_crash(current)) crash_kexec(regs); - /* - * We aren't the primary crash CPU. We need to send it - * to a holding pattern to avoid it ending up in the panic - * code. - */ - crash_kexec_secondary(regs); - } - if (!signr) return; @@ -202,18 +212,25 @@ NOKPROBE_SYMBOL(oops_end); static int __die(const char *str, struct pt_regs *regs, long err) { printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP NR_CPUS=%d ", NR_CPUS); -#endif + + if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN)) + printk("LE "); + else + printk("BE "); + + if (IS_ENABLED(CONFIG_PREEMPT)) + pr_cont("PREEMPT "); + + if (IS_ENABLED(CONFIG_SMP)) + pr_cont("SMP NR_CPUS=%d ", NR_CPUS); + if (debug_pagealloc_enabled()) - printk("DEBUG_PAGEALLOC "); -#ifdef CONFIG_NUMA - printk("NUMA "); -#endif - printk("%s\n", ppc_md.name ? ppc_md.name : ""); + pr_cont("DEBUG_PAGEALLOC "); + + if (IS_ENABLED(CONFIG_NUMA)) + pr_cont("NUMA "); + + pr_cont("%s\n", ppc_md.name ? ppc_md.name : ""); if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) == NOTIFY_STOP) return 1; @@ -288,23 +305,52 @@ void system_reset_exception(struct pt_regs *regs) if (!nested) nmi_enter(); + __this_cpu_inc(irq_stat.sreset_irqs); + /* See if any machine dependent calls */ if (ppc_md.system_reset_exception) { if (ppc_md.system_reset_exception(regs)) goto out; } - die("System Reset", regs, SIGABRT); + if (debugger(regs)) + goto out; + + /* + * A system reset is a request to dump, so we always send + * it through the crashdump code (if fadump or kdump are + * registered). + */ + crash_fadump(regs, "System Reset"); + + crash_kexec(regs); + + /* + * We aren't the primary crash CPU. We need to send it + * to a holding pattern to avoid it ending up in the panic + * code. + */ + crash_kexec_secondary(regs); + + /* + * No debugger or crash dump registered, print logs then + * panic. + */ + __die("System Reset", regs, SIGABRT); + + mdelay(2*MSEC_PER_SEC); /* Wait a little while for others to print */ + add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); + nmi_panic(regs, "System Reset"); out: #ifdef CONFIG_PPC_BOOK3S_64 BUG_ON(get_paca()->in_nmi == 0); if (get_paca()->in_nmi > 1) - panic("Unrecoverable nested System Reset"); + nmi_panic(regs, "Unrecoverable nested System Reset"); #endif /* Must die if the interrupt is not recoverable */ if (!(regs->msr & MSR_RI)) - panic("Unrecoverable System Reset"); + nmi_panic(regs, "Unrecoverable System Reset"); if (!nested) nmi_exit(); @@ -312,39 +358,6 @@ out: /* What should we do here? We could issue a shutdown or hard reset. */ } -#ifdef CONFIG_PPC64 -/* - * This function is called in real mode. Strictly no printk's please. - * - * regs->nip and regs->msr contains srr0 and ssr1. - */ -long machine_check_early(struct pt_regs *regs) -{ - long handled = 0; - - __this_cpu_inc(irq_stat.mce_exceptions); - - if (cur_cpu_spec && cur_cpu_spec->machine_check_early) - handled = cur_cpu_spec->machine_check_early(regs); - return handled; -} - -long hmi_exception_realmode(struct pt_regs *regs) -{ - __this_cpu_inc(irq_stat.hmi_exceptions); - - wait_for_subcore_guest_exit(); - - if (ppc_md.hmi_exception_early) - ppc_md.hmi_exception_early(regs); - - wait_for_tb_resync(); - - return 0; -} - -#endif - /* * I/O accesses can cause machine checks on powermacs. * Check if the NIP corresponds to the address of a sync @@ -397,11 +410,6 @@ static inline int check_io_access(struct pt_regs *regs) /* On 4xx, the reason for the machine check or program exception is in the ESR. */ #define get_reason(regs) ((regs)->dsisr) -#ifndef CONFIG_FSL_BOOKE -#define get_mc_reason(regs) ((regs)->dsisr) -#else -#define get_mc_reason(regs) (mfspr(SPRN_MCSR)) -#endif #define REASON_FP ESR_FP #define REASON_ILLEGAL (ESR_PIL | ESR_PUO) #define REASON_PRIVILEGED ESR_PPR @@ -415,108 +423,17 @@ static inline int check_io_access(struct pt_regs *regs) /* On non-4xx, the reason for the machine check or program exception is in the MSR. */ #define get_reason(regs) ((regs)->msr) -#define get_mc_reason(regs) ((regs)->msr) -#define REASON_TM 0x200000 -#define REASON_FP 0x100000 -#define REASON_ILLEGAL 0x80000 -#define REASON_PRIVILEGED 0x40000 -#define REASON_TRAP 0x20000 +#define REASON_TM SRR1_PROGTM +#define REASON_FP SRR1_PROGFPE +#define REASON_ILLEGAL SRR1_PROGILL +#define REASON_PRIVILEGED SRR1_PROGPRIV +#define REASON_TRAP SRR1_PROGTRAP #define single_stepping(regs) ((regs)->msr & MSR_SE) #define clear_single_step(regs) ((regs)->msr &= ~MSR_SE) #endif -#if defined(CONFIG_4xx) -int machine_check_4xx(struct pt_regs *regs) -{ - unsigned long reason = get_mc_reason(regs); - - if (reason & ESR_IMCP) { - printk("Instruction"); - mtspr(SPRN_ESR, reason & ~ESR_IMCP); - } else - printk("Data"); - printk(" machine check in kernel mode.\n"); - - return 0; -} - -int machine_check_440A(struct pt_regs *regs) -{ - unsigned long reason = get_mc_reason(regs); - - printk("Machine check in kernel mode.\n"); - if (reason & ESR_IMCP){ - printk("Instruction Synchronous Machine Check exception\n"); - mtspr(SPRN_ESR, reason & ~ESR_IMCP); - } - else { - u32 mcsr = mfspr(SPRN_MCSR); - if (mcsr & MCSR_IB) - printk("Instruction Read PLB Error\n"); - if (mcsr & MCSR_DRB) - printk("Data Read PLB Error\n"); - if (mcsr & MCSR_DWB) - printk("Data Write PLB Error\n"); - if (mcsr & MCSR_TLBP) - printk("TLB Parity Error\n"); - if (mcsr & MCSR_ICP){ - flush_instruction_cache(); - printk("I-Cache Parity Error\n"); - } - if (mcsr & MCSR_DCSP) - printk("D-Cache Search Parity Error\n"); - if (mcsr & MCSR_DCFP) - printk("D-Cache Flush Parity Error\n"); - if (mcsr & MCSR_IMPE) - printk("Machine Check exception is imprecise\n"); - - /* Clear MCSR */ - mtspr(SPRN_MCSR, mcsr); - } - return 0; -} - -int machine_check_47x(struct pt_regs *regs) -{ - unsigned long reason = get_mc_reason(regs); - u32 mcsr; - - printk(KERN_ERR "Machine check in kernel mode.\n"); - if (reason & ESR_IMCP) { - printk(KERN_ERR - "Instruction Synchronous Machine Check exception\n"); - mtspr(SPRN_ESR, reason & ~ESR_IMCP); - return 0; - } - mcsr = mfspr(SPRN_MCSR); - if (mcsr & MCSR_IB) - printk(KERN_ERR "Instruction Read PLB Error\n"); - if (mcsr & MCSR_DRB) - printk(KERN_ERR "Data Read PLB Error\n"); - if (mcsr & MCSR_DWB) - printk(KERN_ERR "Data Write PLB Error\n"); - if (mcsr & MCSR_TLBP) - printk(KERN_ERR "TLB Parity Error\n"); - if (mcsr & MCSR_ICP) { - flush_instruction_cache(); - printk(KERN_ERR "I-Cache Parity Error\n"); - } - if (mcsr & MCSR_DCSP) - printk(KERN_ERR "D-Cache Search Parity Error\n"); - if (mcsr & PPC47x_MCSR_GPR) - printk(KERN_ERR "GPR Parity Error\n"); - if (mcsr & PPC47x_MCSR_FPR) - printk(KERN_ERR "FPR Parity Error\n"); - if (mcsr & PPC47x_MCSR_IPR) - printk(KERN_ERR "Machine Check exception is imprecise\n"); - - /* Clear MCSR */ - mtspr(SPRN_MCSR, mcsr); - - return 0; -} -#elif defined(CONFIG_E500) +#if defined(CONFIG_E500) int machine_check_e500mc(struct pt_regs *regs) { unsigned long mcsr = mfspr(SPRN_MCSR); @@ -618,7 +535,7 @@ silent_out: int machine_check_e500(struct pt_regs *regs) { - unsigned long reason = get_mc_reason(regs); + unsigned long reason = mfspr(SPRN_MCSR); if (reason & MCSR_BUS_RBERR) { if (fsl_rio_mcheck_exception(regs)) @@ -665,7 +582,7 @@ int machine_check_generic(struct pt_regs *regs) #elif defined(CONFIG_E200) int machine_check_e200(struct pt_regs *regs) { - unsigned long reason = get_mc_reason(regs); + unsigned long reason = mfspr(SPRN_MCSR); printk("Machine check in kernel mode.\n"); printk("Caused by (from MCSR=%lx): ", reason); @@ -687,35 +604,10 @@ int machine_check_e200(struct pt_regs *regs) return 0; } -#elif defined(CONFIG_PPC_8xx) -int machine_check_8xx(struct pt_regs *regs) -{ - unsigned long reason = get_mc_reason(regs); - - pr_err("Machine check in kernel mode.\n"); - pr_err("Caused by (from SRR1=%lx): ", reason); - if (reason & 0x40000000) - pr_err("Fetch error at address %lx\n", regs->nip); - else - pr_err("Data access error at address %lx\n", regs->dar); - -#ifdef CONFIG_PCI - /* the qspan pci read routines can cause machine checks -- Cort - * - * yuck !!! that totally needs to go away ! There are better ways - * to deal with that than having a wart in the mcheck handler. - * -- BenH - */ - bad_page_fault(regs, regs->dar, SIGBUS); - return 1; -#else - return 0; -#endif -} -#else +#elif defined(CONFIG_PPC32) int machine_check_generic(struct pt_regs *regs) { - unsigned long reason = get_mc_reason(regs); + unsigned long reason = regs->msr; printk("Machine check in kernel mode.\n"); printk("Caused by (from SRR1=%lx): ", reason); @@ -752,10 +644,14 @@ int machine_check_generic(struct pt_regs *regs) void machine_check_exception(struct pt_regs *regs) { - enum ctx_state prev_state = exception_enter(); int recover = 0; + bool nested = in_nmi(); + if (!nested) + nmi_enter(); - __this_cpu_inc(irq_stat.mce_exceptions); + /* 64s accounts the mce in machine_check_early when in HVMODE */ + if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64) || !cpu_has_feature(CPU_FTR_HVMODE)) + __this_cpu_inc(irq_stat.mce_exceptions); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); @@ -783,10 +679,11 @@ void machine_check_exception(struct pt_regs *regs) /* Must die if the interrupt is not recoverable */ if (!(regs->msr & MSR_RI)) - panic("Unrecoverable Machine check"); + nmi_panic(regs, "Unrecoverable Machine check"); bail: - exception_exit(prev_state); + if (!nested) + nmi_exit(); } void SMIException(struct pt_regs *regs) @@ -1672,24 +1569,6 @@ void performance_monitor_exception(struct pt_regs *regs) perf_irq(regs); } -#ifdef CONFIG_8xx -void SoftwareEmulation(struct pt_regs *regs) -{ - CHECK_FULL_REGS(regs); - - if (!user_mode(regs)) { - debugger(regs); - die("Kernel Mode Unimplemented Instruction or SW FPU Emulation", - regs, SIGFPE); - } - - if (!emulate_math(regs)) - return; - - _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); -} -#endif /* CONFIG_8xx */ - #ifdef CONFIG_PPC_ADV_DEBUG_REGS static void handle_debug(struct pt_regs *regs, unsigned long debug_status) { diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c index 003b20964ea0..5d105b8eeece 100644 --- a/arch/powerpc/kernel/uprobes.c +++ b/arch/powerpc/kernel/uprobes.c @@ -205,3 +205,12 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs return orig_ret_vaddr; } + +bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) +{ + if (ctx == RP_CHECK_CHAIN_CALL) + return regs->gpr[1] <= ret->stack; + else + return regs->gpr[1] < ret->stack; +} diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index 6b2b69616e77..769c2624e0a6 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -232,15 +232,9 @@ __do_get_tspec: lwz r6,(CFG_TB_ORIG_STAMP+4)(r9) /* Get a stable TB value */ -#ifdef CONFIG_8xx -2: mftbu r3 - mftbl r4 - mftbu r0 -#else -2: mfspr r3, SPRN_TBRU - mfspr r4, SPRN_TBRL - mfspr r0, SPRN_TBRU -#endif +2: MFTBU(r3) + MFTBL(r4) + MFTBU(r0) cmplw cr0,r3,r0 bne- 2b diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index b1a250560198..882628fa6987 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -8,7 +8,7 @@ #include <asm/cache.h> #include <asm/thread_info.h> -#ifdef CONFIG_STRICT_KERNEL_RWX +#if defined(CONFIG_STRICT_KERNEL_RWX) && !defined(CONFIG_PPC32) #define STRICT_ALIGN_SIZE (1 << 24) #else #define STRICT_ALIGN_SIZE PAGE_SIZE diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 34721a257a77..2f6eadd9408d 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -216,6 +216,9 @@ void soft_nmi_interrupt(struct pt_regs *regs) return; nmi_enter(); + + __this_cpu_inc(irq_stat.soft_nmi_irqs); + tb = get_tb(); if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) { per_cpu(wd_timer_tb, cpu) = tb; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index b42812e014c0..67075e065ef2 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -37,6 +37,7 @@ #include <asm/synch.h> #include <asm/ppc-opcode.h> #include <asm/cputable.h> +#include <asm/pte-walk.h> #include "trace_hv.h" @@ -599,8 +600,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, * hugepage split and collapse. */ local_irq_save(flags); - ptep = find_linux_pte_or_hugepte(current->mm->pgd, - hva, NULL, NULL); + ptep = find_current_mm_pte(current->mm->pgd, + hva, NULL, NULL); if (ptep) { pte = kvmppc_read_update_linux_pte(ptep, 1); if (__pte_write(pte)) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index f6b3e67c5762..c5d7435455f1 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -17,6 +17,7 @@ #include <asm/mmu.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> +#include <asm/pte-walk.h> /* * Supported radix tree geometry. @@ -322,13 +323,13 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, gpa = vcpu->arch.fault_gpa & ~0xfffUL; gpa &= ~0xF000000000000000ul; gfn = gpa >> PAGE_SHIFT; - if (!(dsisr & DSISR_PGDIRFAULT)) + if (!(dsisr & DSISR_PRTABLE_FAULT)) gpa |= ea & 0xfff; memslot = gfn_to_memslot(kvm, gfn); /* No memslot means it's an emulated MMIO region */ if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { - if (dsisr & (DSISR_PGDIRFAULT | DSISR_BADACCESS | + if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS | DSISR_SET_RC)) { /* * Bad address in guest page table tree, or other @@ -359,8 +360,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, if (writing) pgflags |= _PAGE_DIRTY; local_irq_save(flags); - ptep = __find_linux_pte_or_hugepte(current->mm->pgd, hva, - NULL, NULL); + ptep = find_current_mm_pte(current->mm->pgd, hva, NULL, NULL); if (ptep) { pte = READ_ONCE(*ptep); if (pte_present(pte) && @@ -374,8 +374,12 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, spin_unlock(&kvm->mmu_lock); return RESUME_GUEST; } - ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, - gpa, NULL, &shift); + /* + * We are walking the secondary page table here. We can do this + * without disabling irq. + */ + ptep = __find_linux_pte(kvm->arch.pgtable, + gpa, NULL, &shift); if (ptep && pte_present(*ptep)) { kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift); @@ -427,8 +431,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, pgflags |= _PAGE_WRITE; } else { local_irq_save(flags); - ptep = __find_linux_pte_or_hugepte(current->mm->pgd, - hva, NULL, NULL); + ptep = find_current_mm_pte(current->mm->pgd, + hva, NULL, NULL); if (ptep && pte_write(*ptep) && pte_dirty(*ptep)) pgflags |= _PAGE_WRITE; local_irq_restore(flags); @@ -499,8 +503,7 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned int shift; unsigned long old; - ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, - NULL, &shift); + ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep)) { old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, gpa, shift); @@ -525,8 +528,7 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned int shift; int ref = 0; - ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, - NULL, &shift); + ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep) && pte_young(*ptep)) { kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0, gpa, shift); @@ -545,8 +547,7 @@ int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned int shift; int ref = 0; - ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, - NULL, &shift); + ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep) && pte_young(*ptep)) ref = 1; return ref; @@ -562,8 +563,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm, unsigned int shift; int ret = 0; - ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, - NULL, &shift); + ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) { ret = 1; if (shift) diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 3adfd2f5301c..c32e9bfe75b1 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -39,6 +39,7 @@ #include <asm/udbg.h> #include <asm/iommu.h> #include <asm/tce.h> +#include <asm/pte-walk.h> #ifdef CONFIG_BUG @@ -353,7 +354,16 @@ static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu, pte_t *ptep, pte; unsigned shift = 0; - ptep = __find_linux_pte_or_hugepte(vcpu->arch.pgdir, ua, NULL, &shift); + /* + * Called in real mode with MSR_EE = 0. We are safe here. + * It is ok to do the lookup with arch.pgdir here, because + * we are doing this on secondary cpus and current task there + * is not the hypervisor. Also this is safe against THP in the + * host, because an IPI to primary thread will wait for the secondary + * to exit which will agains result in the below page table walk + * to finish. + */ + ptep = __find_linux_pte(vcpu->arch.pgdir, ua, NULL, &shift); if (!ptep || !pte_present(*ptep)) return -ENXIO; pte = *ptep; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 359c79cdf0cc..ebcf97cb5c98 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2111,6 +2111,15 @@ static int kvmppc_grab_hwthread(int cpu) struct paca_struct *tpaca; long timeout = 10000; + /* + * ISA v3.0 idle routines do not set hwthread_state or test + * hwthread_req, so they can not grab idle threads. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + WARN(1, "KVM: can not control sibling threads\n"); + return -EBUSY; + } + tpaca = &paca[cpu]; /* Ensure the thread won't go into the kernel if it wakes */ @@ -2145,10 +2154,12 @@ static void kvmppc_release_hwthread(int cpu) struct paca_struct *tpaca; tpaca = &paca[cpu]; - tpaca->kvm_hstate.hwthread_req = 0; tpaca->kvm_hstate.kvm_vcpu = NULL; tpaca->kvm_hstate.kvm_vcore = NULL; tpaca->kvm_hstate.kvm_split_mode = NULL; + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + tpaca->kvm_hstate.hwthread_req = 0; + } static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 584c74c8119f..fedb0139524c 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -22,6 +22,7 @@ #include <asm/hvcall.h> #include <asm/synch.h> #include <asm/ppc-opcode.h> +#include <asm/pte-walk.h> /* Translate address of a vmalloc'd thing to a linear map address */ static void *real_vmalloc_addr(void *x) @@ -31,9 +32,9 @@ static void *real_vmalloc_addr(void *x) /* * assume we don't have huge pages in vmalloc space... * So don't worry about THP collapse/split. Called - * Only in realmode, hence won't need irq_save/restore. + * Only in realmode with MSR_EE = 0, hence won't need irq_save/restore. */ - p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL, NULL); + p = find_init_mm_pte(addr, NULL); if (!p || !pte_present(*p)) return NULL; addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK); @@ -230,14 +231,13 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, * If we had a page table table change after lookup, we would * retry via mmu_notifier_retry. */ - if (realmode) - ptep = __find_linux_pte_or_hugepte(pgdir, hva, NULL, - &hpage_shift); - else { + if (!realmode) local_irq_save(irq_flags); - ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL, - &hpage_shift); - } + /* + * If called in real mode we have MSR_EE = 0. Otherwise + * we disable irq above. + */ + ptep = __find_linux_pte(pgdir, hva, NULL, &hpage_shift); if (ptep) { pte_t pte; unsigned int host_pte_size; diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 9c9c983b864f..2259b6cde119 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -149,9 +149,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) subf r4, r4, r3 mtspr SPRN_DEC, r4 +BEGIN_FTR_SECTION /* hwthread_req may have got set by cede or no vcpu, so clear it */ li r0, 0 stb r0, HSTATE_HWTHREAD_REQ(r13) +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) /* * For external interrupts we need to call the Linux @@ -314,6 +316,7 @@ kvm_novcpu_exit: * Relocation is off and most register values are lost. * r13 points to the PACA. * r3 contains the SRR1 wakeup value, SRR1 is trashed. + * This is not used by ISAv3.0B processors. */ .globl kvm_start_guest kvm_start_guest: @@ -432,6 +435,9 @@ kvm_secondary_got_guest: * While waiting we also need to check if we get given a vcpu to run. */ kvm_no_guest: +BEGIN_FTR_SECTION + twi 31,0,0 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) lbz r3, HSTATE_HWTHREAD_REQ(r13) cmpwi r3, 0 bne 53f @@ -2512,8 +2518,10 @@ kvm_do_nap: clrrdi r0, r0, 1 mtspr SPRN_CTRLT, r0 +BEGIN_FTR_SECTION li r0,1 stb r0,HSTATE_HWTHREAD_REQ(r13) +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) mfspr r5,SPRN_LPCR ori r5,r5,LPCR_PECE0 | LPCR_PECE1 BEGIN_FTR_SECTION diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 77fd043b3ecc..c6c734424c70 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -30,6 +30,7 @@ #include <linux/vmalloc.h> #include <linux/hugetlb.h> #include <asm/kvm_ppc.h> +#include <asm/pte-walk.h> #include "e500.h" #include "timing.h" @@ -476,7 +477,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, * can't run hence pfn won't change. */ local_irq_save(flags); - ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL, NULL); + ptep = find_linux_pte(pgdir, hva, NULL, NULL); if (ptep) { pte_t pte = READ_ONCE(*ptep); diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 3c3146ba62da..50d5bf954cff 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -31,7 +31,8 @@ obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o obj-y += checksum_$(BITS).o checksum_wrappers.o -obj-$(CONFIG_PPC_EMULATE_SSTEP) += sstep.o ldstfp.o +obj-y += sstep.o ldstfp.o quad.o +obj64-y += quad.o obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S index 8aedbb5f4b86..da425bb6b369 100644 --- a/arch/powerpc/lib/copy_32.S +++ b/arch/powerpc/lib/copy_32.S @@ -67,6 +67,20 @@ CACHELINE_BYTES = L1_CACHE_BYTES LG_CACHELINE_BYTES = L1_CACHE_SHIFT CACHELINE_MASK = (L1_CACHE_BYTES-1) +_GLOBAL(memset16) + rlwinm. r0 ,r5, 31, 1, 31 + addi r6, r3, -4 + beq- 2f + rlwimi r4 ,r4 ,16 ,0 ,15 + mtctr r0 +1: stwu r4, 4(r6) + bdnz 1b +2: andi. r0, r5, 1 + beqlr + sth r4, 4(r6) + blr +EXPORT_SYMBOL(memset16) + /* * Use dcbz on the complete cache lines in the destination * to set them to zero. This requires that the destination @@ -77,22 +91,24 @@ CACHELINE_MASK = (L1_CACHE_BYTES-1) * replaced by a nop once cache is active. This is done in machine_init() */ _GLOBAL(memset) + cmplwi 0,r5,4 + blt 7f + rlwimi r4,r4,8,16,23 rlwimi r4,r4,16,0,15 - addi r6,r3,-4 - cmplwi 0,r5,4 - blt 7f - stwu r4,4(r6) + stw r4,0(r3) beqlr - andi. r0,r6,3 + andi. r0,r3,3 add r5,r0,r5 - subf r6,r0,r6 + subf r6,r0,r3 cmplwi 0,r4,0 - bne 2f /* Use normal procedure if r4 is not zero */ -EXPORT_SYMBOL(memset) + /* + * Skip optimised bloc until cache is enabled. Will be replaced + * by 'bne' during boot to use normal procedure if r4 is not zero + */ _GLOBAL(memset_nocache_branch) - b 2f /* Skip optimised bloc until cache is enabled */ + b 2f clrlwi r7,r6,32-LG_CACHELINE_BYTES add r8,r7,r5 @@ -119,7 +135,6 @@ _GLOBAL(memset_nocache_branch) 1: stwu r4,4(r6) bdnz 1b 6: andi. r5,r5,3 -7: cmpwi 0,r5,0 beqlr mtctr r5 addi r6,r6,3 @@ -127,6 +142,15 @@ _GLOBAL(memset_nocache_branch) bdnz 8b blr +7: cmpwi 0,r5,0 + beqlr + mtctr r5 + addi r6,r3,-1 +9: stbu r4,1(r6) + bdnz 9b + blr +EXPORT_SYMBOL(memset) + /* * This version uses dcbz on the complete cache lines in the * destination area to reduce memory traffic. This requires that diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S index a84d333ecb09..ca5fc8fa7efc 100644 --- a/arch/powerpc/lib/copypage_power7.S +++ b/arch/powerpc/lib/copypage_power7.S @@ -45,13 +45,13 @@ _GLOBAL(copypage_power7) .machine push .machine "power4" /* setup read stream 0 */ - dcbt r0,r4,0b01000 /* addr from */ - dcbt r0,r7,0b01010 /* length and depth from */ + dcbt 0,r4,0b01000 /* addr from */ + dcbt 0,r7,0b01010 /* length and depth from */ /* setup write stream 1 */ - dcbtst r0,r9,0b01000 /* addr to */ - dcbtst r0,r10,0b01010 /* length and depth to */ + dcbtst 0,r9,0b01000 /* addr to */ + dcbtst 0,r10,0b01010 /* length and depth to */ eieio - dcbt r0,r8,0b01010 /* all streams GO */ + dcbt 0,r8,0b01010 /* all streams GO */ .machine pop #ifdef CONFIG_ALTIVEC @@ -83,7 +83,7 @@ _GLOBAL(copypage_power7) li r12,112 .align 5 -1: lvx v7,r0,r4 +1: lvx v7,0,r4 lvx v6,r4,r6 lvx v5,r4,r7 lvx v4,r4,r8 @@ -92,7 +92,7 @@ _GLOBAL(copypage_power7) lvx v1,r4,r11 lvx v0,r4,r12 addi r4,r4,128 - stvx v7,r0,r3 + stvx v7,0,r3 stvx v6,r3,r6 stvx v5,r3,r7 stvx v4,r3,r8 diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S index 706b7cc19846..d416a4a66578 100644 --- a/arch/powerpc/lib/copyuser_power7.S +++ b/arch/powerpc/lib/copyuser_power7.S @@ -315,13 +315,13 @@ err1; stb r0,0(r3) .machine push .machine "power4" /* setup read stream 0 */ - dcbt r0,r6,0b01000 /* addr from */ - dcbt r0,r7,0b01010 /* length and depth from */ + dcbt 0,r6,0b01000 /* addr from */ + dcbt 0,r7,0b01010 /* length and depth from */ /* setup write stream 1 */ - dcbtst r0,r9,0b01000 /* addr to */ - dcbtst r0,r10,0b01010 /* length and depth to */ + dcbtst 0,r9,0b01000 /* addr to */ + dcbtst 0,r10,0b01010 /* length and depth to */ eieio - dcbt r0,r8,0b01010 /* all streams GO */ + dcbt 0,r8,0b01010 /* all streams GO */ .machine pop beq cr1,.Lunwind_stack_nonvmx_copy @@ -376,26 +376,26 @@ err3; std r0,0(r3) li r11,48 bf cr7*4+3,5f -err3; lvx v1,r0,r4 +err3; lvx v1,0,r4 addi r4,r4,16 -err3; stvx v1,r0,r3 +err3; stvx v1,0,r3 addi r3,r3,16 5: bf cr7*4+2,6f -err3; lvx v1,r0,r4 +err3; lvx v1,0,r4 err3; lvx v0,r4,r9 addi r4,r4,32 -err3; stvx v1,r0,r3 +err3; stvx v1,0,r3 err3; stvx v0,r3,r9 addi r3,r3,32 6: bf cr7*4+1,7f -err3; lvx v3,r0,r4 +err3; lvx v3,0,r4 err3; lvx v2,r4,r9 err3; lvx v1,r4,r10 err3; lvx v0,r4,r11 addi r4,r4,64 -err3; stvx v3,r0,r3 +err3; stvx v3,0,r3 err3; stvx v2,r3,r9 err3; stvx v1,r3,r10 err3; stvx v0,r3,r11 @@ -421,7 +421,7 @@ err3; stvx v0,r3,r11 */ .align 5 8: -err4; lvx v7,r0,r4 +err4; lvx v7,0,r4 err4; lvx v6,r4,r9 err4; lvx v5,r4,r10 err4; lvx v4,r4,r11 @@ -430,7 +430,7 @@ err4; lvx v2,r4,r14 err4; lvx v1,r4,r15 err4; lvx v0,r4,r16 addi r4,r4,128 -err4; stvx v7,r0,r3 +err4; stvx v7,0,r3 err4; stvx v6,r3,r9 err4; stvx v5,r3,r10 err4; stvx v4,r3,r11 @@ -451,29 +451,29 @@ err4; stvx v0,r3,r16 mtocrf 0x01,r6 bf cr7*4+1,9f -err3; lvx v3,r0,r4 +err3; lvx v3,0,r4 err3; lvx v2,r4,r9 err3; lvx v1,r4,r10 err3; lvx v0,r4,r11 addi r4,r4,64 -err3; stvx v3,r0,r3 +err3; stvx v3,0,r3 err3; stvx v2,r3,r9 err3; stvx v1,r3,r10 err3; stvx v0,r3,r11 addi r3,r3,64 9: bf cr7*4+2,10f -err3; lvx v1,r0,r4 +err3; lvx v1,0,r4 err3; lvx v0,r4,r9 addi r4,r4,32 -err3; stvx v1,r0,r3 +err3; stvx v1,0,r3 err3; stvx v0,r3,r9 addi r3,r3,32 10: bf cr7*4+3,11f -err3; lvx v1,r0,r4 +err3; lvx v1,0,r4 addi r4,r4,16 -err3; stvx v1,r0,r3 +err3; stvx v1,0,r3 addi r3,r3,16 /* Up to 15B to go */ @@ -553,25 +553,25 @@ err3; lvx v0,0,r4 addi r4,r4,16 bf cr7*4+3,5f -err3; lvx v1,r0,r4 +err3; lvx v1,0,r4 VPERM(v8,v0,v1,v16) addi r4,r4,16 -err3; stvx v8,r0,r3 +err3; stvx v8,0,r3 addi r3,r3,16 vor v0,v1,v1 5: bf cr7*4+2,6f -err3; lvx v1,r0,r4 +err3; lvx v1,0,r4 VPERM(v8,v0,v1,v16) err3; lvx v0,r4,r9 VPERM(v9,v1,v0,v16) addi r4,r4,32 -err3; stvx v8,r0,r3 +err3; stvx v8,0,r3 err3; stvx v9,r3,r9 addi r3,r3,32 6: bf cr7*4+1,7f -err3; lvx v3,r0,r4 +err3; lvx v3,0,r4 VPERM(v8,v0,v3,v16) err3; lvx v2,r4,r9 VPERM(v9,v3,v2,v16) @@ -580,7 +580,7 @@ err3; lvx v1,r4,r10 err3; lvx v0,r4,r11 VPERM(v11,v1,v0,v16) addi r4,r4,64 -err3; stvx v8,r0,r3 +err3; stvx v8,0,r3 err3; stvx v9,r3,r9 err3; stvx v10,r3,r10 err3; stvx v11,r3,r11 @@ -606,7 +606,7 @@ err3; stvx v11,r3,r11 */ .align 5 8: -err4; lvx v7,r0,r4 +err4; lvx v7,0,r4 VPERM(v8,v0,v7,v16) err4; lvx v6,r4,r9 VPERM(v9,v7,v6,v16) @@ -623,7 +623,7 @@ err4; lvx v1,r4,r15 err4; lvx v0,r4,r16 VPERM(v15,v1,v0,v16) addi r4,r4,128 -err4; stvx v8,r0,r3 +err4; stvx v8,0,r3 err4; stvx v9,r3,r9 err4; stvx v10,r3,r10 err4; stvx v11,r3,r11 @@ -644,7 +644,7 @@ err4; stvx v15,r3,r16 mtocrf 0x01,r6 bf cr7*4+1,9f -err3; lvx v3,r0,r4 +err3; lvx v3,0,r4 VPERM(v8,v0,v3,v16) err3; lvx v2,r4,r9 VPERM(v9,v3,v2,v16) @@ -653,27 +653,27 @@ err3; lvx v1,r4,r10 err3; lvx v0,r4,r11 VPERM(v11,v1,v0,v16) addi r4,r4,64 -err3; stvx v8,r0,r3 +err3; stvx v8,0,r3 err3; stvx v9,r3,r9 err3; stvx v10,r3,r10 err3; stvx v11,r3,r11 addi r3,r3,64 9: bf cr7*4+2,10f -err3; lvx v1,r0,r4 +err3; lvx v1,0,r4 VPERM(v8,v0,v1,v16) err3; lvx v0,r4,r9 VPERM(v9,v1,v0,v16) addi r4,r4,32 -err3; stvx v8,r0,r3 +err3; stvx v8,0,r3 err3; stvx v9,r3,r9 addi r3,r3,32 10: bf cr7*4+3,11f -err3; lvx v1,r0,r4 +err3; lvx v1,0,r4 VPERM(v8,v0,v1,v16) addi r4,r4,16 -err3; stvx v8,r0,r3 +err3; stvx v8,0,r3 addi r3,r3,16 /* Up to 15B to go */ diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S index a58777c1b2cb..ae15eba49c1f 100644 --- a/arch/powerpc/lib/ldstfp.S +++ b/arch/powerpc/lib/ldstfp.S @@ -21,27 +21,19 @@ #define STKFRM (PPC_MIN_STKFRM + 16) - .macro inst32 op -reg = 0 - .rept 32 -20: \op reg,0,r4 - b 3f - EX_TABLE(20b,99f) -reg = reg + 1 - .endr - .endm - -/* Get the contents of frN into fr0; N is in r3. */ +/* Get the contents of frN into *p; N is in r3 and p is in r4. */ _GLOBAL(get_fpr) mflr r0 + mfmsr r6 + ori r7, r6, MSR_FP + MTMSRD(r7) + isync rlwinm r3,r3,3,0xf8 bcl 20,31,1f - blr /* fr0 is already in fr0 */ - nop -reg = 1 - .rept 31 - fmr fr0,reg - blr +reg = 0 + .rept 32 + stfd reg, 0(r4) + b 2f reg = reg + 1 .endr 1: mflr r5 @@ -49,18 +41,23 @@ reg = reg + 1 mtctr r5 mtlr r0 bctr +2: MTMSRD(r6) + isync + blr -/* Put the contents of fr0 into frN; N is in r3. */ +/* Put the contents of *p into frN; N is in r3 and p is in r4. */ _GLOBAL(put_fpr) mflr r0 + mfmsr r6 + ori r7, r6, MSR_FP + MTMSRD(r7) + isync rlwinm r3,r3,3,0xf8 bcl 20,31,1f - blr /* fr0 is already in fr0 */ - nop -reg = 1 - .rept 31 - fmr reg,fr0 - blr +reg = 0 + .rept 32 + lfd reg, 0(r4) + b 2f reg = reg + 1 .endr 1: mflr r5 @@ -68,127 +65,24 @@ reg = reg + 1 mtctr r5 mtlr r0 bctr - -/* Load FP reg N from float at *p. N is in r3, p in r4. */ -_GLOBAL(do_lfs) - PPC_STLU r1,-STKFRM(r1) - mflr r0 - PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) - mfmsr r6 - ori r7,r6,MSR_FP - cmpwi cr7,r3,0 - MTMSRD(r7) - isync - beq cr7,1f - stfd fr0,STKFRM-16(r1) -1: li r9,-EFAULT -2: lfs fr0,0(r4) - li r9,0 -3: bl put_fpr - beq cr7,4f - lfd fr0,STKFRM-16(r1) -4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) - mtlr r0 - MTMSRD(r6) - isync - mr r3,r9 - addi r1,r1,STKFRM - blr - EX_TABLE(2b,3b) - -/* Load FP reg N from double at *p. N is in r3, p in r4. */ -_GLOBAL(do_lfd) - PPC_STLU r1,-STKFRM(r1) - mflr r0 - PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) - mfmsr r6 - ori r7,r6,MSR_FP - cmpwi cr7,r3,0 - MTMSRD(r7) - isync - beq cr7,1f - stfd fr0,STKFRM-16(r1) -1: li r9,-EFAULT -2: lfd fr0,0(r4) - li r9,0 -3: beq cr7,4f - bl put_fpr - lfd fr0,STKFRM-16(r1) -4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) - mtlr r0 - MTMSRD(r6) +2: MTMSRD(r6) isync - mr r3,r9 - addi r1,r1,STKFRM blr - EX_TABLE(2b,3b) -/* Store FP reg N to float at *p. N is in r3, p in r4. */ -_GLOBAL(do_stfs) - PPC_STLU r1,-STKFRM(r1) - mflr r0 - PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) - mfmsr r6 - ori r7,r6,MSR_FP - cmpwi cr7,r3,0 - MTMSRD(r7) - isync - beq cr7,1f - stfd fr0,STKFRM-16(r1) - bl get_fpr -1: li r9,-EFAULT -2: stfs fr0,0(r4) - li r9,0 -3: beq cr7,4f - lfd fr0,STKFRM-16(r1) -4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) - mtlr r0 - MTMSRD(r6) - isync - mr r3,r9 - addi r1,r1,STKFRM - blr - EX_TABLE(2b,3b) - -/* Store FP reg N to double at *p. N is in r3, p in r4. */ -_GLOBAL(do_stfd) - PPC_STLU r1,-STKFRM(r1) +#ifdef CONFIG_ALTIVEC +/* Get the contents of vrN into *p; N is in r3 and p is in r4. */ +_GLOBAL(get_vr) mflr r0 - PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) mfmsr r6 - ori r7,r6,MSR_FP - cmpwi cr7,r3,0 + oris r7, r6, MSR_VEC@h MTMSRD(r7) isync - beq cr7,1f - stfd fr0,STKFRM-16(r1) - bl get_fpr -1: li r9,-EFAULT -2: stfd fr0,0(r4) - li r9,0 -3: beq cr7,4f - lfd fr0,STKFRM-16(r1) -4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) - mtlr r0 - MTMSRD(r6) - isync - mr r3,r9 - addi r1,r1,STKFRM - blr - EX_TABLE(2b,3b) - -#ifdef CONFIG_ALTIVEC -/* Get the contents of vrN into v0; N is in r3. */ -_GLOBAL(get_vr) - mflr r0 rlwinm r3,r3,3,0xf8 bcl 20,31,1f - blr /* v0 is already in v0 */ - nop -reg = 1 - .rept 31 - vor v0,reg,reg /* assembler doesn't know vmr? */ - blr +reg = 0 + .rept 32 + stvx reg, 0, r4 + b 2f reg = reg + 1 .endr 1: mflr r5 @@ -196,18 +90,23 @@ reg = reg + 1 mtctr r5 mtlr r0 bctr +2: MTMSRD(r6) + isync + blr -/* Put the contents of v0 into vrN; N is in r3. */ +/* Put the contents of *p into vrN; N is in r3 and p is in r4. */ _GLOBAL(put_vr) mflr r0 + mfmsr r6 + oris r7, r6, MSR_VEC@h + MTMSRD(r7) + isync rlwinm r3,r3,3,0xf8 bcl 20,31,1f - blr /* v0 is already in v0 */ - nop -reg = 1 - .rept 31 - vor reg,v0,v0 - blr +reg = 0 + .rept 32 + lvx reg, 0, r4 + b 2f reg = reg + 1 .endr 1: mflr r5 @@ -215,62 +114,9 @@ reg = reg + 1 mtctr r5 mtlr r0 bctr - -/* Load vector reg N from *p. N is in r3, p in r4. */ -_GLOBAL(do_lvx) - PPC_STLU r1,-STKFRM(r1) - mflr r0 - PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) - mfmsr r6 - oris r7,r6,MSR_VEC@h - cmpwi cr7,r3,0 - li r8,STKFRM-16 - MTMSRD(r7) - isync - beq cr7,1f - stvx v0,r1,r8 -1: li r9,-EFAULT -2: lvx v0,0,r4 - li r9,0 -3: beq cr7,4f - bl put_vr - lvx v0,r1,r8 -4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) - mtlr r0 - MTMSRD(r6) +2: MTMSRD(r6) isync - mr r3,r9 - addi r1,r1,STKFRM - blr - EX_TABLE(2b,3b) - -/* Store vector reg N to *p. N is in r3, p in r4. */ -_GLOBAL(do_stvx) - PPC_STLU r1,-STKFRM(r1) - mflr r0 - PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) - mfmsr r6 - oris r7,r6,MSR_VEC@h - cmpwi cr7,r3,0 - li r8,STKFRM-16 - MTMSRD(r7) - isync - beq cr7,1f - stvx v0,r1,r8 - bl get_vr -1: li r9,-EFAULT -2: stvx v0,0,r4 - li r9,0 -3: beq cr7,4f - lvx v0,r1,r8 -4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) - mtlr r0 - MTMSRD(r6) - isync - mr r3,r9 - addi r1,r1,STKFRM blr - EX_TABLE(2b,3b) #endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_VSX @@ -313,7 +159,7 @@ reg = reg + 1 bctr /* Load VSX reg N from vector doubleword *p. N is in r3, p in r4. */ -_GLOBAL(do_lxvd2x) +_GLOBAL(load_vsrn) PPC_STLU r1,-STKFRM(r1) mflr r0 PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) @@ -325,49 +171,74 @@ _GLOBAL(do_lxvd2x) isync beq cr7,1f STXVD2X(0,R1,R8) -1: li r9,-EFAULT -2: LXVD2X(0,R0,R4) - li r9,0 -3: beq cr7,4f +1: LXVD2X(0,R0,R4) +#ifdef __LITTLE_ENDIAN__ + XXSWAPD(0,0) +#endif + beq cr7,4f bl put_vsr LXVD2X(0,R1,R8) 4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) mtlr r0 MTMSRD(r6) isync - mr r3,r9 addi r1,r1,STKFRM blr - EX_TABLE(2b,3b) /* Store VSX reg N to vector doubleword *p. N is in r3, p in r4. */ -_GLOBAL(do_stxvd2x) +_GLOBAL(store_vsrn) PPC_STLU r1,-STKFRM(r1) mflr r0 PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) mfmsr r6 oris r7,r6,MSR_VSX@h - cmpwi cr7,r3,0 li r8,STKFRM-16 MTMSRD(r7) isync - beq cr7,1f STXVD2X(0,R1,R8) bl get_vsr -1: li r9,-EFAULT -2: STXVD2X(0,R0,R4) - li r9,0 -3: beq cr7,4f +#ifdef __LITTLE_ENDIAN__ + XXSWAPD(0,0) +#endif + STXVD2X(0,R0,R4) LXVD2X(0,R1,R8) -4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) mtlr r0 MTMSRD(r6) isync mr r3,r9 addi r1,r1,STKFRM blr - EX_TABLE(2b,3b) - #endif /* CONFIG_VSX */ +/* Convert single-precision to double, without disturbing FPRs. */ +/* conv_sp_to_dp(float *sp, double *dp) */ +_GLOBAL(conv_sp_to_dp) + mfmsr r6 + ori r7, r6, MSR_FP + MTMSRD(r7) + isync + stfd fr0, -16(r1) + lfs fr0, 0(r3) + stfd fr0, 0(r4) + lfd fr0, -16(r1) + MTMSRD(r6) + isync + blr + +/* Convert single-precision to double, without disturbing FPRs. */ +/* conv_sp_to_dp(double *dp, float *sp) */ +_GLOBAL(conv_dp_to_sp) + mfmsr r6 + ori r7, r6, MSR_FP + MTMSRD(r7) + isync + stfd fr0, -16(r1) + lfd fr0, 0(r3) + stfs fr0, 0(r4) + lfd fr0, -16(r1) + MTMSRD(r6) + isync + blr + #endif /* CONFIG_PPC_FPU */ diff --git a/arch/powerpc/lib/mem_64.S b/arch/powerpc/lib/mem_64.S index 85fa9869aec5..ec531de99996 100644 --- a/arch/powerpc/lib/mem_64.S +++ b/arch/powerpc/lib/mem_64.S @@ -13,6 +13,23 @@ #include <asm/ppc_asm.h> #include <asm/export.h> +_GLOBAL(__memset16) + rlwimi r4,r4,16,0,15 + /* fall through */ + +_GLOBAL(__memset32) + rldimi r4,r4,32,0 + /* fall through */ + +_GLOBAL(__memset64) + neg r0,r3 + andi. r0,r0,7 + cmplw cr1,r5,r0 + b .Lms +EXPORT_SYMBOL(__memset16) +EXPORT_SYMBOL(__memset32) +EXPORT_SYMBOL(__memset64) + _GLOBAL(memset) neg r0,r3 rlwimi r4,r4,8,16,23 @@ -20,7 +37,7 @@ _GLOBAL(memset) rlwimi r4,r4,16,0,15 cmplw cr1,r5,r0 /* do we get that far? */ rldimi r4,r4,32,0 - PPC_MTOCRF(1,r0) +.Lms: PPC_MTOCRF(1,r0) mr r6,r3 blt cr1,8f beq+ 3f /* if already 8-byte aligned */ diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S index 786234fd4e91..193909abd18b 100644 --- a/arch/powerpc/lib/memcpy_power7.S +++ b/arch/powerpc/lib/memcpy_power7.S @@ -261,12 +261,12 @@ _GLOBAL(memcpy_power7) .machine push .machine "power4" - dcbt r0,r6,0b01000 - dcbt r0,r7,0b01010 - dcbtst r0,r9,0b01000 - dcbtst r0,r10,0b01010 + dcbt 0,r6,0b01000 + dcbt 0,r7,0b01010 + dcbtst 0,r9,0b01000 + dcbtst 0,r10,0b01010 eieio - dcbt r0,r8,0b01010 /* GO */ + dcbt 0,r8,0b01010 /* GO */ .machine pop beq cr1,.Lunwind_stack_nonvmx_copy @@ -321,26 +321,26 @@ _GLOBAL(memcpy_power7) li r11,48 bf cr7*4+3,5f - lvx v1,r0,r4 + lvx v1,0,r4 addi r4,r4,16 - stvx v1,r0,r3 + stvx v1,0,r3 addi r3,r3,16 5: bf cr7*4+2,6f - lvx v1,r0,r4 + lvx v1,0,r4 lvx v0,r4,r9 addi r4,r4,32 - stvx v1,r0,r3 + stvx v1,0,r3 stvx v0,r3,r9 addi r3,r3,32 6: bf cr7*4+1,7f - lvx v3,r0,r4 + lvx v3,0,r4 lvx v2,r4,r9 lvx v1,r4,r10 lvx v0,r4,r11 addi r4,r4,64 - stvx v3,r0,r3 + stvx v3,0,r3 stvx v2,r3,r9 stvx v1,r3,r10 stvx v0,r3,r11 @@ -366,7 +366,7 @@ _GLOBAL(memcpy_power7) */ .align 5 8: - lvx v7,r0,r4 + lvx v7,0,r4 lvx v6,r4,r9 lvx v5,r4,r10 lvx v4,r4,r11 @@ -375,7 +375,7 @@ _GLOBAL(memcpy_power7) lvx v1,r4,r15 lvx v0,r4,r16 addi r4,r4,128 - stvx v7,r0,r3 + stvx v7,0,r3 stvx v6,r3,r9 stvx v5,r3,r10 stvx v4,r3,r11 @@ -396,29 +396,29 @@ _GLOBAL(memcpy_power7) mtocrf 0x01,r6 bf cr7*4+1,9f - lvx v3,r0,r4 + lvx v3,0,r4 lvx v2,r4,r9 lvx v1,r4,r10 lvx v0,r4,r11 addi r4,r4,64 - stvx v3,r0,r3 + stvx v3,0,r3 stvx v2,r3,r9 stvx v1,r3,r10 stvx v0,r3,r11 addi r3,r3,64 9: bf cr7*4+2,10f - lvx v1,r0,r4 + lvx v1,0,r4 lvx v0,r4,r9 addi r4,r4,32 - stvx v1,r0,r3 + stvx v1,0,r3 stvx v0,r3,r9 addi r3,r3,32 10: bf cr7*4+3,11f - lvx v1,r0,r4 + lvx v1,0,r4 addi r4,r4,16 - stvx v1,r0,r3 + stvx v1,0,r3 addi r3,r3,16 /* Up to 15B to go */ @@ -499,25 +499,25 @@ _GLOBAL(memcpy_power7) addi r4,r4,16 bf cr7*4+3,5f - lvx v1,r0,r4 + lvx v1,0,r4 VPERM(v8,v0,v1,v16) addi r4,r4,16 - stvx v8,r0,r3 + stvx v8,0,r3 addi r3,r3,16 vor v0,v1,v1 5: bf cr7*4+2,6f - lvx v1,r0,r4 + lvx v1,0,r4 VPERM(v8,v0,v1,v16) lvx v0,r4,r9 VPERM(v9,v1,v0,v16) addi r4,r4,32 - stvx v8,r0,r3 + stvx v8,0,r3 stvx v9,r3,r9 addi r3,r3,32 6: bf cr7*4+1,7f - lvx v3,r0,r4 + lvx v3,0,r4 VPERM(v8,v0,v3,v16) lvx v2,r4,r9 VPERM(v9,v3,v2,v16) @@ -526,7 +526,7 @@ _GLOBAL(memcpy_power7) lvx v0,r4,r11 VPERM(v11,v1,v0,v16) addi r4,r4,64 - stvx v8,r0,r3 + stvx v8,0,r3 stvx v9,r3,r9 stvx v10,r3,r10 stvx v11,r3,r11 @@ -552,7 +552,7 @@ _GLOBAL(memcpy_power7) */ .align 5 8: - lvx v7,r0,r4 + lvx v7,0,r4 VPERM(v8,v0,v7,v16) lvx v6,r4,r9 VPERM(v9,v7,v6,v16) @@ -569,7 +569,7 @@ _GLOBAL(memcpy_power7) lvx v0,r4,r16 VPERM(v15,v1,v0,v16) addi r4,r4,128 - stvx v8,r0,r3 + stvx v8,0,r3 stvx v9,r3,r9 stvx v10,r3,r10 stvx v11,r3,r11 @@ -590,7 +590,7 @@ _GLOBAL(memcpy_power7) mtocrf 0x01,r6 bf cr7*4+1,9f - lvx v3,r0,r4 + lvx v3,0,r4 VPERM(v8,v0,v3,v16) lvx v2,r4,r9 VPERM(v9,v3,v2,v16) @@ -599,27 +599,27 @@ _GLOBAL(memcpy_power7) lvx v0,r4,r11 VPERM(v11,v1,v0,v16) addi r4,r4,64 - stvx v8,r0,r3 + stvx v8,0,r3 stvx v9,r3,r9 stvx v10,r3,r10 stvx v11,r3,r11 addi r3,r3,64 9: bf cr7*4+2,10f - lvx v1,r0,r4 + lvx v1,0,r4 VPERM(v8,v0,v1,v16) lvx v0,r4,r9 VPERM(v9,v1,v0,v16) addi r4,r4,32 - stvx v8,r0,r3 + stvx v8,0,r3 stvx v9,r3,r9 addi r3,r3,32 10: bf cr7*4+3,11f - lvx v1,r0,r4 + lvx v1,0,r4 VPERM(v8,v0,v1,v16) addi r4,r4,16 - stvx v8,r0,r3 + stvx v8,0,r3 addi r3,r3,16 /* Up to 15B to go */ diff --git a/arch/powerpc/lib/quad.S b/arch/powerpc/lib/quad.S new file mode 100644 index 000000000000..c4d12fae8724 --- /dev/null +++ b/arch/powerpc/lib/quad.S @@ -0,0 +1,62 @@ +/* + * Quadword loads and stores + * for use in instruction emulation. + * + * Copyright 2017 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/ppc-opcode.h> +#include <asm/reg.h> +#include <asm/asm-offsets.h> +#include <linux/errno.h> + +/* do_lq(unsigned long ea, unsigned long *regs) */ +_GLOBAL(do_lq) +1: lq r6, 0(r3) + std r6, 0(r4) + std r7, 8(r4) + li r3, 0 + blr +2: li r3, -EFAULT + blr + EX_TABLE(1b, 2b) + +/* do_stq(unsigned long ea, unsigned long val0, unsigned long val1) */ +_GLOBAL(do_stq) +1: stq r4, 0(r3) + li r3, 0 + blr +2: li r3, -EFAULT + blr + EX_TABLE(1b, 2b) + +/* do_lqarx(unsigned long ea, unsigned long *regs) */ +_GLOBAL(do_lqarx) +1: PPC_LQARX(6, 0, 3, 0) + std r6, 0(r4) + std r7, 8(r4) + li r3, 0 + blr +2: li r3, -EFAULT + blr + EX_TABLE(1b, 2b) + +/* do_stqcx(unsigned long ea, unsigned long val0, unsigned long val1, + unsigned int *crp) */ + +_GLOBAL(do_stqcx) +1: PPC_STQCX(4, 0, 3) + mfcr r5 + stw r5, 0(r6) + li r3, 0 + blr +2: li r3, -EFAULT + blr + EX_TABLE(1b, 2b) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index ee33327686ae..fb9f58b868e7 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -36,14 +36,33 @@ extern char system_call_common[]; /* * Functions in ldstfp.S */ -extern int do_lfs(int rn, unsigned long ea); -extern int do_lfd(int rn, unsigned long ea); -extern int do_stfs(int rn, unsigned long ea); -extern int do_stfd(int rn, unsigned long ea); -extern int do_lvx(int rn, unsigned long ea); -extern int do_stvx(int rn, unsigned long ea); -extern int do_lxvd2x(int rn, unsigned long ea); -extern int do_stxvd2x(int rn, unsigned long ea); +extern void get_fpr(int rn, double *p); +extern void put_fpr(int rn, const double *p); +extern void get_vr(int rn, __vector128 *p); +extern void put_vr(int rn, __vector128 *p); +extern void load_vsrn(int vsr, const void *p); +extern void store_vsrn(int vsr, void *p); +extern void conv_sp_to_dp(const float *sp, double *dp); +extern void conv_dp_to_sp(const double *dp, float *sp); +#endif + +#ifdef __powerpc64__ +/* + * Functions in quad.S + */ +extern int do_lq(unsigned long ea, unsigned long *regs); +extern int do_stq(unsigned long ea, unsigned long val0, unsigned long val1); +extern int do_lqarx(unsigned long ea, unsigned long *regs); +extern int do_stqcx(unsigned long ea, unsigned long val0, unsigned long val1, + unsigned int *crp); +#endif + +#ifdef __LITTLE_ENDIAN__ +#define IS_LE 1 +#define IS_BE 0 +#else +#define IS_LE 0 +#define IS_BE 1 #endif /* @@ -62,15 +81,17 @@ static nokprobe_inline unsigned long truncate_if_32bit(unsigned long msr, /* * Determine whether a conditional branch instruction would branch. */ -static nokprobe_inline int branch_taken(unsigned int instr, struct pt_regs *regs) +static nokprobe_inline int branch_taken(unsigned int instr, + const struct pt_regs *regs, + struct instruction_op *op) { unsigned int bo = (instr >> 21) & 0x1f; unsigned int bi; if ((bo & 4) == 0) { /* decrement counter */ - --regs->ctr; - if (((bo >> 1) & 1) ^ (regs->ctr == 0)) + op->type |= DECCTR; + if (((bo >> 1) & 1) ^ (regs->ctr == 1)) return 0; } if ((bo & 0x10) == 0) { @@ -82,17 +103,26 @@ static nokprobe_inline int branch_taken(unsigned int instr, struct pt_regs *regs return 1; } -static nokprobe_inline long address_ok(struct pt_regs *regs, unsigned long ea, int nb) +static nokprobe_inline long address_ok(struct pt_regs *regs, + unsigned long ea, int nb) { if (!user_mode(regs)) return 1; - return __access_ok(ea, nb, USER_DS); + if (__access_ok(ea, nb, USER_DS)) + return 1; + if (__access_ok(ea, 1, USER_DS)) + /* Access overlaps the end of the user region */ + regs->dar = USER_DS.seg; + else + regs->dar = ea; + return 0; } /* * Calculate effective address for a D-form instruction */ -static nokprobe_inline unsigned long dform_ea(unsigned int instr, struct pt_regs *regs) +static nokprobe_inline unsigned long dform_ea(unsigned int instr, + const struct pt_regs *regs) { int ra; unsigned long ea; @@ -102,14 +132,15 @@ static nokprobe_inline unsigned long dform_ea(unsigned int instr, struct pt_regs if (ra) ea += regs->gpr[ra]; - return truncate_if_32bit(regs->msr, ea); + return ea; } #ifdef __powerpc64__ /* * Calculate effective address for a DS-form instruction */ -static nokprobe_inline unsigned long dsform_ea(unsigned int instr, struct pt_regs *regs) +static nokprobe_inline unsigned long dsform_ea(unsigned int instr, + const struct pt_regs *regs) { int ra; unsigned long ea; @@ -119,7 +150,24 @@ static nokprobe_inline unsigned long dsform_ea(unsigned int instr, struct pt_reg if (ra) ea += regs->gpr[ra]; - return truncate_if_32bit(regs->msr, ea); + return ea; +} + +/* + * Calculate effective address for a DQ-form instruction + */ +static nokprobe_inline unsigned long dqform_ea(unsigned int instr, + const struct pt_regs *regs) +{ + int ra; + unsigned long ea; + + ra = (instr >> 16) & 0x1f; + ea = (signed short) (instr & ~0xf); /* sign-extend */ + if (ra) + ea += regs->gpr[ra]; + + return ea; } #endif /* __powerpc64 */ @@ -127,7 +175,7 @@ static nokprobe_inline unsigned long dsform_ea(unsigned int instr, struct pt_reg * Calculate effective address for an X-form instruction */ static nokprobe_inline unsigned long xform_ea(unsigned int instr, - struct pt_regs *regs) + const struct pt_regs *regs) { int ra, rb; unsigned long ea; @@ -138,7 +186,7 @@ static nokprobe_inline unsigned long xform_ea(unsigned int instr, if (ra) ea += regs->gpr[ra]; - return truncate_if_32bit(regs->msr, ea); + return ea; } /* @@ -151,7 +199,6 @@ static nokprobe_inline unsigned long max_align(unsigned long x) return x & -x; /* isolates rightmost bit */ } - static nokprobe_inline unsigned long byterev_2(unsigned long x) { return ((x >> 8) & 0xff) | ((x & 0xff) << 8); @@ -170,8 +217,36 @@ static nokprobe_inline unsigned long byterev_8(unsigned long x) } #endif +static nokprobe_inline void do_byte_reverse(void *ptr, int nb) +{ + switch (nb) { + case 2: + *(u16 *)ptr = byterev_2(*(u16 *)ptr); + break; + case 4: + *(u32 *)ptr = byterev_4(*(u32 *)ptr); + break; +#ifdef __powerpc64__ + case 8: + *(unsigned long *)ptr = byterev_8(*(unsigned long *)ptr); + break; + case 16: { + unsigned long *up = (unsigned long *)ptr; + unsigned long tmp; + tmp = byterev_8(up[0]); + up[0] = byterev_8(up[1]); + up[1] = tmp; + break; + } +#endif + default: + WARN_ON_ONCE(1); + } +} + static nokprobe_inline int read_mem_aligned(unsigned long *dest, - unsigned long ea, int nb) + unsigned long ea, int nb, + struct pt_regs *regs) { int err = 0; unsigned long x = 0; @@ -194,59 +269,77 @@ static nokprobe_inline int read_mem_aligned(unsigned long *dest, } if (!err) *dest = x; + else + regs->dar = ea; return err; } -static nokprobe_inline int read_mem_unaligned(unsigned long *dest, - unsigned long ea, int nb, struct pt_regs *regs) +/* + * Copy from userspace to a buffer, using the largest possible + * aligned accesses, up to sizeof(long). + */ +static int nokprobe_inline copy_mem_in(u8 *dest, unsigned long ea, int nb, + struct pt_regs *regs) { - int err; - unsigned long x, b, c; -#ifdef __LITTLE_ENDIAN__ - int len = nb; /* save a copy of the length for byte reversal */ -#endif + int err = 0; + int c; - /* unaligned, do this in pieces */ - x = 0; for (; nb > 0; nb -= c) { -#ifdef __LITTLE_ENDIAN__ - c = 1; -#endif -#ifdef __BIG_ENDIAN__ c = max_align(ea); -#endif if (c > nb) c = max_align(nb); - err = read_mem_aligned(&b, ea, c); - if (err) - return err; - x = (x << (8 * c)) + b; - ea += c; - } -#ifdef __LITTLE_ENDIAN__ - switch (len) { - case 2: - *dest = byterev_2(x); - break; - case 4: - *dest = byterev_4(x); - break; + switch (c) { + case 1: + err = __get_user(*dest, (unsigned char __user *) ea); + break; + case 2: + err = __get_user(*(u16 *)dest, + (unsigned short __user *) ea); + break; + case 4: + err = __get_user(*(u32 *)dest, + (unsigned int __user *) ea); + break; #ifdef __powerpc64__ - case 8: - *dest = byterev_8(x); - break; + case 8: + err = __get_user(*(unsigned long *)dest, + (unsigned long __user *) ea); + break; #endif + } + if (err) { + regs->dar = ea; + return err; + } + dest += c; + ea += c; } -#endif -#ifdef __BIG_ENDIAN__ - *dest = x; -#endif return 0; } +static nokprobe_inline int read_mem_unaligned(unsigned long *dest, + unsigned long ea, int nb, + struct pt_regs *regs) +{ + union { + unsigned long ul; + u8 b[sizeof(unsigned long)]; + } u; + int i; + int err; + + u.ul = 0; + i = IS_BE ? sizeof(unsigned long) - nb : 0; + err = copy_mem_in(&u.b[i], ea, nb, regs); + if (!err) + *dest = u.ul; + return err; +} + /* * Read memory at address ea for nb bytes, return 0 for success - * or -EFAULT if an error occurred. + * or -EFAULT if an error occurred. N.B. nb must be 1, 2, 4 or 8. + * If nb < sizeof(long), the result is right-justified on BE systems. */ static int read_mem(unsigned long *dest, unsigned long ea, int nb, struct pt_regs *regs) @@ -254,13 +347,14 @@ static int read_mem(unsigned long *dest, unsigned long ea, int nb, if (!address_ok(regs, ea, nb)) return -EFAULT; if ((ea & (nb - 1)) == 0) - return read_mem_aligned(dest, ea, nb); + return read_mem_aligned(dest, ea, nb, regs); return read_mem_unaligned(dest, ea, nb, regs); } NOKPROBE_SYMBOL(read_mem); static nokprobe_inline int write_mem_aligned(unsigned long val, - unsigned long ea, int nb) + unsigned long ea, int nb, + struct pt_regs *regs) { int err = 0; @@ -280,51 +374,72 @@ static nokprobe_inline int write_mem_aligned(unsigned long val, break; #endif } + if (err) + regs->dar = ea; return err; } -static nokprobe_inline int write_mem_unaligned(unsigned long val, - unsigned long ea, int nb, struct pt_regs *regs) +/* + * Copy from a buffer to userspace, using the largest possible + * aligned accesses, up to sizeof(long). + */ +static int nokprobe_inline copy_mem_out(u8 *dest, unsigned long ea, int nb, + struct pt_regs *regs) { - int err; - unsigned long c; + int err = 0; + int c; -#ifdef __LITTLE_ENDIAN__ - switch (nb) { - case 2: - val = byterev_2(val); - break; - case 4: - val = byterev_4(val); - break; -#ifdef __powerpc64__ - case 8: - val = byterev_8(val); - break; -#endif - } -#endif - /* unaligned or little-endian, do this in pieces */ for (; nb > 0; nb -= c) { -#ifdef __LITTLE_ENDIAN__ - c = 1; -#endif -#ifdef __BIG_ENDIAN__ c = max_align(ea); -#endif if (c > nb) c = max_align(nb); - err = write_mem_aligned(val >> (nb - c) * 8, ea, c); - if (err) + switch (c) { + case 1: + err = __put_user(*dest, (unsigned char __user *) ea); + break; + case 2: + err = __put_user(*(u16 *)dest, + (unsigned short __user *) ea); + break; + case 4: + err = __put_user(*(u32 *)dest, + (unsigned int __user *) ea); + break; +#ifdef __powerpc64__ + case 8: + err = __put_user(*(unsigned long *)dest, + (unsigned long __user *) ea); + break; +#endif + } + if (err) { + regs->dar = ea; return err; + } + dest += c; ea += c; } return 0; } +static nokprobe_inline int write_mem_unaligned(unsigned long val, + unsigned long ea, int nb, + struct pt_regs *regs) +{ + union { + unsigned long ul; + u8 b[sizeof(unsigned long)]; + } u; + int i; + + u.ul = val; + i = IS_BE ? sizeof(unsigned long) - nb : 0; + return copy_mem_out(&u.b[i], ea, nb, regs); +} + /* * Write memory at address ea for nb bytes, return 0 for success - * or -EFAULT if an error occurred. + * or -EFAULT if an error occurred. N.B. nb must be 1, 2, 4 or 8. */ static int write_mem(unsigned long val, unsigned long ea, int nb, struct pt_regs *regs) @@ -332,163 +447,465 @@ static int write_mem(unsigned long val, unsigned long ea, int nb, if (!address_ok(regs, ea, nb)) return -EFAULT; if ((ea & (nb - 1)) == 0) - return write_mem_aligned(val, ea, nb); + return write_mem_aligned(val, ea, nb, regs); return write_mem_unaligned(val, ea, nb, regs); } NOKPROBE_SYMBOL(write_mem); #ifdef CONFIG_PPC_FPU /* - * Check the address and alignment, and call func to do the actual - * load or store. + * These access either the real FP register or the image in the + * thread_struct, depending on regs->msr & MSR_FP. */ -static int do_fp_load(int rn, int (*func)(int, unsigned long), - unsigned long ea, int nb, - struct pt_regs *regs) +static int do_fp_load(struct instruction_op *op, unsigned long ea, + struct pt_regs *regs, bool cross_endian) { - int err; + int err, rn, nb; union { - double dbl; - unsigned long ul[2]; - struct { -#ifdef __BIG_ENDIAN__ - unsigned _pad_; - unsigned word; -#endif -#ifdef __LITTLE_ENDIAN__ - unsigned word; - unsigned _pad_; -#endif - } single; - } data; - unsigned long ptr; - + int i; + unsigned int u; + float f; + double d[2]; + unsigned long l[2]; + u8 b[2 * sizeof(double)]; + } u; + + nb = GETSIZE(op->type); if (!address_ok(regs, ea, nb)) return -EFAULT; - if ((ea & 3) == 0) - return (*func)(rn, ea); - ptr = (unsigned long) &data.ul; - if (sizeof(unsigned long) == 8 || nb == 4) { - err = read_mem_unaligned(&data.ul[0], ea, nb, regs); - if (nb == 4) - ptr = (unsigned long)&(data.single.word); - } else { - /* reading a double on 32-bit */ - err = read_mem_unaligned(&data.ul[0], ea, 4, regs); - if (!err) - err = read_mem_unaligned(&data.ul[1], ea + 4, 4, regs); - } + rn = op->reg; + err = copy_mem_in(u.b, ea, nb, regs); if (err) return err; - return (*func)(rn, ptr); + if (unlikely(cross_endian)) { + do_byte_reverse(u.b, min(nb, 8)); + if (nb == 16) + do_byte_reverse(&u.b[8], 8); + } + preempt_disable(); + if (nb == 4) { + if (op->type & FPCONV) + conv_sp_to_dp(&u.f, &u.d[0]); + else if (op->type & SIGNEXT) + u.l[0] = u.i; + else + u.l[0] = u.u; + } + if (regs->msr & MSR_FP) + put_fpr(rn, &u.d[0]); + else + current->thread.TS_FPR(rn) = u.l[0]; + if (nb == 16) { + /* lfdp */ + rn |= 1; + if (regs->msr & MSR_FP) + put_fpr(rn, &u.d[1]); + else + current->thread.TS_FPR(rn) = u.l[1]; + } + preempt_enable(); + return 0; } NOKPROBE_SYMBOL(do_fp_load); -static int do_fp_store(int rn, int (*func)(int, unsigned long), - unsigned long ea, int nb, - struct pt_regs *regs) +static int do_fp_store(struct instruction_op *op, unsigned long ea, + struct pt_regs *regs, bool cross_endian) { - int err; + int rn, nb; union { - double dbl; - unsigned long ul[2]; - struct { -#ifdef __BIG_ENDIAN__ - unsigned _pad_; - unsigned word; -#endif -#ifdef __LITTLE_ENDIAN__ - unsigned word; - unsigned _pad_; -#endif - } single; - } data; - unsigned long ptr; - + unsigned int u; + float f; + double d[2]; + unsigned long l[2]; + u8 b[2 * sizeof(double)]; + } u; + + nb = GETSIZE(op->type); if (!address_ok(regs, ea, nb)) return -EFAULT; - if ((ea & 3) == 0) - return (*func)(rn, ea); - ptr = (unsigned long) &data.ul[0]; - if (sizeof(unsigned long) == 8 || nb == 4) { - if (nb == 4) - ptr = (unsigned long)&(data.single.word); - err = (*func)(rn, ptr); - if (err) - return err; - err = write_mem_unaligned(data.ul[0], ea, nb, regs); - } else { - /* writing a double on 32-bit */ - err = (*func)(rn, ptr); - if (err) - return err; - err = write_mem_unaligned(data.ul[0], ea, 4, regs); - if (!err) - err = write_mem_unaligned(data.ul[1], ea + 4, 4, regs); + rn = op->reg; + preempt_disable(); + if (regs->msr & MSR_FP) + get_fpr(rn, &u.d[0]); + else + u.l[0] = current->thread.TS_FPR(rn); + if (nb == 4) { + if (op->type & FPCONV) + conv_dp_to_sp(&u.d[0], &u.f); + else + u.u = u.l[0]; } - return err; + if (nb == 16) { + rn |= 1; + if (regs->msr & MSR_FP) + get_fpr(rn, &u.d[1]); + else + u.l[1] = current->thread.TS_FPR(rn); + } + preempt_enable(); + if (unlikely(cross_endian)) { + do_byte_reverse(u.b, min(nb, 8)); + if (nb == 16) + do_byte_reverse(&u.b[8], 8); + } + return copy_mem_out(u.b, ea, nb, regs); } NOKPROBE_SYMBOL(do_fp_store); #endif #ifdef CONFIG_ALTIVEC /* For Altivec/VMX, no need to worry about alignment */ -static nokprobe_inline int do_vec_load(int rn, int (*func)(int, unsigned long), - unsigned long ea, struct pt_regs *regs) +static nokprobe_inline int do_vec_load(int rn, unsigned long ea, + int size, struct pt_regs *regs, + bool cross_endian) { + int err; + union { + __vector128 v; + u8 b[sizeof(__vector128)]; + } u = {}; + if (!address_ok(regs, ea & ~0xfUL, 16)) return -EFAULT; - return (*func)(rn, ea); + /* align to multiple of size */ + ea &= ~(size - 1); + err = copy_mem_in(&u.b[ea & 0xf], ea, size, regs); + if (err) + return err; + if (unlikely(cross_endian)) + do_byte_reverse(&u.b[ea & 0xf], size); + preempt_disable(); + if (regs->msr & MSR_VEC) + put_vr(rn, &u.v); + else + current->thread.vr_state.vr[rn] = u.v; + preempt_enable(); + return 0; } -static nokprobe_inline int do_vec_store(int rn, int (*func)(int, unsigned long), - unsigned long ea, struct pt_regs *regs) +static nokprobe_inline int do_vec_store(int rn, unsigned long ea, + int size, struct pt_regs *regs, + bool cross_endian) { + union { + __vector128 v; + u8 b[sizeof(__vector128)]; + } u; + if (!address_ok(regs, ea & ~0xfUL, 16)) return -EFAULT; - return (*func)(rn, ea); + /* align to multiple of size */ + ea &= ~(size - 1); + + preempt_disable(); + if (regs->msr & MSR_VEC) + get_vr(rn, &u.v); + else + u.v = current->thread.vr_state.vr[rn]; + preempt_enable(); + if (unlikely(cross_endian)) + do_byte_reverse(&u.b[ea & 0xf], size); + return copy_mem_out(&u.b[ea & 0xf], ea, size, regs); } #endif /* CONFIG_ALTIVEC */ -#ifdef CONFIG_VSX -static nokprobe_inline int do_vsx_load(int rn, int (*func)(int, unsigned long), - unsigned long ea, struct pt_regs *regs) +#ifdef __powerpc64__ +static nokprobe_inline int emulate_lq(struct pt_regs *regs, unsigned long ea, + int reg, bool cross_endian) { int err; - unsigned long val[2]; if (!address_ok(regs, ea, 16)) return -EFAULT; - if ((ea & 3) == 0) - return (*func)(rn, ea); - err = read_mem_unaligned(&val[0], ea, 8, regs); - if (!err) - err = read_mem_unaligned(&val[1], ea + 8, 8, regs); - if (!err) - err = (*func)(rn, (unsigned long) &val[0]); + /* if aligned, should be atomic */ + if ((ea & 0xf) == 0) { + err = do_lq(ea, ®s->gpr[reg]); + } else { + err = read_mem(®s->gpr[reg + IS_LE], ea, 8, regs); + if (!err) + err = read_mem(®s->gpr[reg + IS_BE], ea + 8, 8, regs); + } + if (!err && unlikely(cross_endian)) + do_byte_reverse(®s->gpr[reg], 16); return err; } -static nokprobe_inline int do_vsx_store(int rn, int (*func)(int, unsigned long), - unsigned long ea, struct pt_regs *regs) +static nokprobe_inline int emulate_stq(struct pt_regs *regs, unsigned long ea, + int reg, bool cross_endian) { int err; - unsigned long val[2]; + unsigned long vals[2]; if (!address_ok(regs, ea, 16)) return -EFAULT; - if ((ea & 3) == 0) - return (*func)(rn, ea); - err = (*func)(rn, (unsigned long) &val[0]); - if (err) - return err; - err = write_mem_unaligned(val[0], ea, 8, regs); + vals[0] = regs->gpr[reg]; + vals[1] = regs->gpr[reg + 1]; + if (unlikely(cross_endian)) + do_byte_reverse(vals, 16); + + /* if aligned, should be atomic */ + if ((ea & 0xf) == 0) + return do_stq(ea, vals[0], vals[1]); + + err = write_mem(vals[IS_LE], ea, 8, regs); if (!err) - err = write_mem_unaligned(val[1], ea + 8, 8, regs); + err = write_mem(vals[IS_BE], ea + 8, 8, regs); return err; } +#endif /* __powerpc64 */ + +#ifdef CONFIG_VSX +void emulate_vsx_load(struct instruction_op *op, union vsx_reg *reg, + const void *mem, bool rev) +{ + int size, read_size; + int i, j; + const unsigned int *wp; + const unsigned short *hp; + const unsigned char *bp; + + size = GETSIZE(op->type); + reg->d[0] = reg->d[1] = 0; + + switch (op->element_size) { + case 16: + /* whole vector; lxv[x] or lxvl[l] */ + if (size == 0) + break; + memcpy(reg, mem, size); + if (IS_LE && (op->vsx_flags & VSX_LDLEFT)) + rev = !rev; + if (rev) + do_byte_reverse(reg, 16); + break; + case 8: + /* scalar loads, lxvd2x, lxvdsx */ + read_size = (size >= 8) ? 8 : size; + i = IS_LE ? 8 : 8 - read_size; + memcpy(®->b[i], mem, read_size); + if (rev) + do_byte_reverse(®->b[i], 8); + if (size < 8) { + if (op->type & SIGNEXT) { + /* size == 4 is the only case here */ + reg->d[IS_LE] = (signed int) reg->d[IS_LE]; + } else if (op->vsx_flags & VSX_FPCONV) { + preempt_disable(); + conv_sp_to_dp(®->fp[1 + IS_LE], + ®->dp[IS_LE]); + preempt_enable(); + } + } else { + if (size == 16) { + unsigned long v = *(unsigned long *)(mem + 8); + reg->d[IS_BE] = !rev ? v : byterev_8(v); + } else if (op->vsx_flags & VSX_SPLAT) + reg->d[IS_BE] = reg->d[IS_LE]; + } + break; + case 4: + /* lxvw4x, lxvwsx */ + wp = mem; + for (j = 0; j < size / 4; ++j) { + i = IS_LE ? 3 - j : j; + reg->w[i] = !rev ? *wp++ : byterev_4(*wp++); + } + if (op->vsx_flags & VSX_SPLAT) { + u32 val = reg->w[IS_LE ? 3 : 0]; + for (; j < 4; ++j) { + i = IS_LE ? 3 - j : j; + reg->w[i] = val; + } + } + break; + case 2: + /* lxvh8x */ + hp = mem; + for (j = 0; j < size / 2; ++j) { + i = IS_LE ? 7 - j : j; + reg->h[i] = !rev ? *hp++ : byterev_2(*hp++); + } + break; + case 1: + /* lxvb16x */ + bp = mem; + for (j = 0; j < size; ++j) { + i = IS_LE ? 15 - j : j; + reg->b[i] = *bp++; + } + break; + } +} +EXPORT_SYMBOL_GPL(emulate_vsx_load); +NOKPROBE_SYMBOL(emulate_vsx_load); + +void emulate_vsx_store(struct instruction_op *op, const union vsx_reg *reg, + void *mem, bool rev) +{ + int size, write_size; + int i, j; + union vsx_reg buf; + unsigned int *wp; + unsigned short *hp; + unsigned char *bp; + + size = GETSIZE(op->type); + + switch (op->element_size) { + case 16: + /* stxv, stxvx, stxvl, stxvll */ + if (size == 0) + break; + if (IS_LE && (op->vsx_flags & VSX_LDLEFT)) + rev = !rev; + if (rev) { + /* reverse 16 bytes */ + buf.d[0] = byterev_8(reg->d[1]); + buf.d[1] = byterev_8(reg->d[0]); + reg = &buf; + } + memcpy(mem, reg, size); + break; + case 8: + /* scalar stores, stxvd2x */ + write_size = (size >= 8) ? 8 : size; + i = IS_LE ? 8 : 8 - write_size; + if (size < 8 && op->vsx_flags & VSX_FPCONV) { + buf.d[0] = buf.d[1] = 0; + preempt_disable(); + conv_dp_to_sp(®->dp[IS_LE], &buf.fp[1 + IS_LE]); + preempt_enable(); + reg = &buf; + } + memcpy(mem, ®->b[i], write_size); + if (size == 16) + memcpy(mem + 8, ®->d[IS_BE], 8); + if (unlikely(rev)) { + do_byte_reverse(mem, write_size); + if (size == 16) + do_byte_reverse(mem + 8, 8); + } + break; + case 4: + /* stxvw4x */ + wp = mem; + for (j = 0; j < size / 4; ++j) { + i = IS_LE ? 3 - j : j; + *wp++ = !rev ? reg->w[i] : byterev_4(reg->w[i]); + } + break; + case 2: + /* stxvh8x */ + hp = mem; + for (j = 0; j < size / 2; ++j) { + i = IS_LE ? 7 - j : j; + *hp++ = !rev ? reg->h[i] : byterev_2(reg->h[i]); + } + break; + case 1: + /* stvxb16x */ + bp = mem; + for (j = 0; j < size; ++j) { + i = IS_LE ? 15 - j : j; + *bp++ = reg->b[i]; + } + break; + } +} +EXPORT_SYMBOL_GPL(emulate_vsx_store); +NOKPROBE_SYMBOL(emulate_vsx_store); + +static nokprobe_inline int do_vsx_load(struct instruction_op *op, + unsigned long ea, struct pt_regs *regs, + bool cross_endian) +{ + int reg = op->reg; + u8 mem[16]; + union vsx_reg buf; + int size = GETSIZE(op->type); + + if (!address_ok(regs, ea, size) || copy_mem_in(mem, ea, size, regs)) + return -EFAULT; + + emulate_vsx_load(op, &buf, mem, cross_endian); + preempt_disable(); + if (reg < 32) { + /* FP regs + extensions */ + if (regs->msr & MSR_FP) { + load_vsrn(reg, &buf); + } else { + current->thread.fp_state.fpr[reg][0] = buf.d[0]; + current->thread.fp_state.fpr[reg][1] = buf.d[1]; + } + } else { + if (regs->msr & MSR_VEC) + load_vsrn(reg, &buf); + else + current->thread.vr_state.vr[reg - 32] = buf.v; + } + preempt_enable(); + return 0; +} + +static nokprobe_inline int do_vsx_store(struct instruction_op *op, + unsigned long ea, struct pt_regs *regs, + bool cross_endian) +{ + int reg = op->reg; + u8 mem[16]; + union vsx_reg buf; + int size = GETSIZE(op->type); + + if (!address_ok(regs, ea, size)) + return -EFAULT; + + preempt_disable(); + if (reg < 32) { + /* FP regs + extensions */ + if (regs->msr & MSR_FP) { + store_vsrn(reg, &buf); + } else { + buf.d[0] = current->thread.fp_state.fpr[reg][0]; + buf.d[1] = current->thread.fp_state.fpr[reg][1]; + } + } else { + if (regs->msr & MSR_VEC) + store_vsrn(reg, &buf); + else + buf.v = current->thread.vr_state.vr[reg - 32]; + } + preempt_enable(); + emulate_vsx_store(op, &buf, mem, cross_endian); + return copy_mem_out(mem, ea, size, regs); +} #endif /* CONFIG_VSX */ +int emulate_dcbz(unsigned long ea, struct pt_regs *regs) +{ + int err; + unsigned long i, size; + +#ifdef __powerpc64__ + size = ppc64_caches.l1d.block_size; + if (!(regs->msr & MSR_64BIT)) + ea &= 0xffffffffUL; +#else + size = L1_CACHE_BYTES; +#endif + ea &= ~(size - 1); + if (!address_ok(regs, ea, size)) + return -EFAULT; + for (i = 0; i < size; i += sizeof(long)) { + err = __put_user(0, (unsigned long __user *) (ea + i)); + if (err) { + regs->dar = ea; + return err; + } + } + return 0; +} +NOKPROBE_SYMBOL(emulate_dcbz); + #define __put_user_asmx(x, addr, err, op, cr) \ __asm__ __volatile__( \ "1: " op " %2,0,%3\n" \ @@ -526,24 +943,27 @@ static nokprobe_inline int do_vsx_store(int rn, int (*func)(int, unsigned long), : "=r" (err) \ : "r" (addr), "i" (-EFAULT), "0" (err)) -static nokprobe_inline void set_cr0(struct pt_regs *regs, int rd) +static nokprobe_inline void set_cr0(const struct pt_regs *regs, + struct instruction_op *op, int rd) { long val = regs->gpr[rd]; - regs->ccr = (regs->ccr & 0x0fffffff) | ((regs->xer >> 3) & 0x10000000); + op->type |= SETCC; + op->ccval = (regs->ccr & 0x0fffffff) | ((regs->xer >> 3) & 0x10000000); #ifdef __powerpc64__ if (!(regs->msr & MSR_64BIT)) val = (int) val; #endif if (val < 0) - regs->ccr |= 0x80000000; + op->ccval |= 0x80000000; else if (val > 0) - regs->ccr |= 0x40000000; + op->ccval |= 0x40000000; else - regs->ccr |= 0x20000000; + op->ccval |= 0x20000000; } -static nokprobe_inline void add_with_carry(struct pt_regs *regs, int rd, +static nokprobe_inline void add_with_carry(const struct pt_regs *regs, + struct instruction_op *op, int rd, unsigned long val1, unsigned long val2, unsigned long carry_in) { @@ -551,24 +971,29 @@ static nokprobe_inline void add_with_carry(struct pt_regs *regs, int rd, if (carry_in) ++val; - regs->gpr[rd] = val; + op->type = COMPUTE + SETREG + SETXER; + op->reg = rd; + op->val = val; #ifdef __powerpc64__ if (!(regs->msr & MSR_64BIT)) { val = (unsigned int) val; val1 = (unsigned int) val1; } #endif + op->xerval = regs->xer; if (val < val1 || (carry_in && val == val1)) - regs->xer |= XER_CA; + op->xerval |= XER_CA; else - regs->xer &= ~XER_CA; + op->xerval &= ~XER_CA; } -static nokprobe_inline void do_cmp_signed(struct pt_regs *regs, long v1, long v2, - int crfld) +static nokprobe_inline void do_cmp_signed(const struct pt_regs *regs, + struct instruction_op *op, + long v1, long v2, int crfld) { unsigned int crval, shift; + op->type = COMPUTE + SETCC; crval = (regs->xer >> 31) & 1; /* get SO bit */ if (v1 < v2) crval |= 8; @@ -577,14 +1002,17 @@ static nokprobe_inline void do_cmp_signed(struct pt_regs *regs, long v1, long v2 else crval |= 2; shift = (7 - crfld) * 4; - regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift); + op->ccval = (regs->ccr & ~(0xf << shift)) | (crval << shift); } -static nokprobe_inline void do_cmp_unsigned(struct pt_regs *regs, unsigned long v1, - unsigned long v2, int crfld) +static nokprobe_inline void do_cmp_unsigned(const struct pt_regs *regs, + struct instruction_op *op, + unsigned long v1, + unsigned long v2, int crfld) { unsigned int crval, shift; + op->type = COMPUTE + SETCC; crval = (regs->xer >> 31) & 1; /* get SO bit */ if (v1 < v2) crval |= 8; @@ -593,7 +1021,90 @@ static nokprobe_inline void do_cmp_unsigned(struct pt_regs *regs, unsigned long else crval |= 2; shift = (7 - crfld) * 4; - regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift); + op->ccval = (regs->ccr & ~(0xf << shift)) | (crval << shift); +} + +static nokprobe_inline void do_cmpb(const struct pt_regs *regs, + struct instruction_op *op, + unsigned long v1, unsigned long v2) +{ + unsigned long long out_val, mask; + int i; + + out_val = 0; + for (i = 0; i < 8; i++) { + mask = 0xffUL << (i * 8); + if ((v1 & mask) == (v2 & mask)) + out_val |= mask; + } + op->val = out_val; +} + +/* + * The size parameter is used to adjust the equivalent popcnt instruction. + * popcntb = 8, popcntw = 32, popcntd = 64 + */ +static nokprobe_inline void do_popcnt(const struct pt_regs *regs, + struct instruction_op *op, + unsigned long v1, int size) +{ + unsigned long long out = v1; + + out -= (out >> 1) & 0x5555555555555555; + out = (0x3333333333333333 & out) + (0x3333333333333333 & (out >> 2)); + out = (out + (out >> 4)) & 0x0f0f0f0f0f0f0f0f; + + if (size == 8) { /* popcntb */ + op->val = out; + return; + } + out += out >> 8; + out += out >> 16; + if (size == 32) { /* popcntw */ + op->val = out & 0x0000003f0000003f; + return; + } + + out = (out + (out >> 32)) & 0x7f; + op->val = out; /* popcntd */ +} + +#ifdef CONFIG_PPC64 +static nokprobe_inline void do_bpermd(const struct pt_regs *regs, + struct instruction_op *op, + unsigned long v1, unsigned long v2) +{ + unsigned char perm, idx; + unsigned int i; + + perm = 0; + for (i = 0; i < 8; i++) { + idx = (v1 >> (i * 8)) & 0xff; + if (idx < 64) + if (v2 & PPC_BIT(idx)) + perm |= 1 << i; + } + op->val = perm; +} +#endif /* CONFIG_PPC64 */ +/* + * The size parameter adjusts the equivalent prty instruction. + * prtyw = 32, prtyd = 64 + */ +static nokprobe_inline void do_prty(const struct pt_regs *regs, + struct instruction_op *op, + unsigned long v, int size) +{ + unsigned long long res = v ^ (v >> 8); + + res ^= res >> 16; + if (size == 32) { /* prtyw */ + op->val = res & 0x0000000100000001; + return; + } + + res ^= res >> 32; + op->val = res & 1; /*prtyd */ } static nokprobe_inline int trap_compare(long v1, long v2) @@ -629,14 +1140,18 @@ static nokprobe_inline int trap_compare(long v1, long v2) #define ROTATE(x, n) ((n) ? (((x) << (n)) | ((x) >> (8 * sizeof(long) - (n)))) : (x)) /* - * Decode an instruction, and execute it if that can be done just by - * modifying *regs (i.e. integer arithmetic and logical instructions, - * branches, and barrier instructions). - * Returns 1 if the instruction has been executed, or 0 if not. - * Sets *op to indicate what the instruction does. + * Decode an instruction, and return information about it in *op + * without changing *regs. + * Integer arithmetic and logical instructions, branches, and barrier + * instructions can be emulated just using the information in *op. + * + * Return value is 1 if the instruction can be emulated just by + * updating *regs with the information in *op, -1 if we need the + * GPRs but *regs doesn't contain the full register set, or 0 + * otherwise. */ -int analyse_instr(struct instruction_op *op, struct pt_regs *regs, - unsigned int instr) +int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, + unsigned int instr) { unsigned int opcode, ra, rb, rd, spr, u; unsigned long int imm; @@ -653,12 +1168,11 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, imm = (signed short)(instr & 0xfffc); if ((instr & 2) == 0) imm += regs->nip; - regs->nip += 4; - regs->nip = truncate_if_32bit(regs->msr, regs->nip); + op->val = truncate_if_32bit(regs->msr, imm); if (instr & 1) - regs->link = regs->nip; - if (branch_taken(instr, regs)) - regs->nip = truncate_if_32bit(regs->msr, imm); + op->type |= SETLK; + if (branch_taken(instr, regs, op)) + op->type |= BRTAKEN; return 1; #ifdef CONFIG_PPC64 case 17: /* sc */ @@ -669,38 +1183,37 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, return 0; #endif case 18: /* b */ - op->type = BRANCH; + op->type = BRANCH | BRTAKEN; imm = instr & 0x03fffffc; if (imm & 0x02000000) imm -= 0x04000000; if ((instr & 2) == 0) imm += regs->nip; + op->val = truncate_if_32bit(regs->msr, imm); if (instr & 1) - regs->link = truncate_if_32bit(regs->msr, regs->nip + 4); - imm = truncate_if_32bit(regs->msr, imm); - regs->nip = imm; + op->type |= SETLK; return 1; case 19: switch ((instr >> 1) & 0x3ff) { case 0: /* mcrf */ + op->type = COMPUTE + SETCC; rd = 7 - ((instr >> 23) & 0x7); ra = 7 - ((instr >> 18) & 0x7); rd *= 4; ra *= 4; val = (regs->ccr >> ra) & 0xf; - regs->ccr = (regs->ccr & ~(0xfUL << rd)) | (val << rd); - goto instr_done; + op->ccval = (regs->ccr & ~(0xfUL << rd)) | (val << rd); + return 1; case 16: /* bclr */ case 528: /* bcctr */ op->type = BRANCH; imm = (instr & 0x400)? regs->ctr: regs->link; - regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4); - imm = truncate_if_32bit(regs->msr, imm); + op->val = truncate_if_32bit(regs->msr, imm); if (instr & 1) - regs->link = regs->nip; - if (branch_taken(instr, regs)) - regs->nip = imm; + op->type |= SETLK; + if (branch_taken(instr, regs, op)) + op->type |= BRTAKEN; return 1; case 18: /* rfid, scary */ @@ -710,9 +1223,8 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, return 0; case 150: /* isync */ - op->type = BARRIER; - isync(); - goto instr_done; + op->type = BARRIER | BARRIER_ISYNC; + return 1; case 33: /* crnor */ case 129: /* crandc */ @@ -722,45 +1234,44 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, case 289: /* creqv */ case 417: /* crorc */ case 449: /* cror */ + op->type = COMPUTE + SETCC; ra = (instr >> 16) & 0x1f; rb = (instr >> 11) & 0x1f; rd = (instr >> 21) & 0x1f; ra = (regs->ccr >> (31 - ra)) & 1; rb = (regs->ccr >> (31 - rb)) & 1; val = (instr >> (6 + ra * 2 + rb)) & 1; - regs->ccr = (regs->ccr & ~(1UL << (31 - rd))) | + op->ccval = (regs->ccr & ~(1UL << (31 - rd))) | (val << (31 - rd)); - goto instr_done; + return 1; } break; case 31: switch ((instr >> 1) & 0x3ff) { case 598: /* sync */ - op->type = BARRIER; + op->type = BARRIER + BARRIER_SYNC; #ifdef __powerpc64__ switch ((instr >> 21) & 3) { case 1: /* lwsync */ - asm volatile("lwsync" : : : "memory"); - goto instr_done; + op->type = BARRIER + BARRIER_LWSYNC; + break; case 2: /* ptesync */ - asm volatile("ptesync" : : : "memory"); - goto instr_done; + op->type = BARRIER + BARRIER_PTESYNC; + break; } #endif - mb(); - goto instr_done; + return 1; case 854: /* eieio */ - op->type = BARRIER; - eieio(); - goto instr_done; + op->type = BARRIER + BARRIER_EIEIO; + return 1; } break; } /* Following cases refer to regs->gpr[], so we need all regs */ if (!FULL_REGS(regs)) - return 0; + return -1; rd = (instr >> 21) & 0x1f; ra = (instr >> 16) & 0x1f; @@ -771,21 +1282,21 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, case 2: /* tdi */ if (rd & trap_compare(regs->gpr[ra], (short) instr)) goto trap; - goto instr_done; + return 1; #endif case 3: /* twi */ if (rd & trap_compare((int)regs->gpr[ra], (short) instr)) goto trap; - goto instr_done; + return 1; case 7: /* mulli */ - regs->gpr[rd] = regs->gpr[ra] * (short) instr; - goto instr_done; + op->val = regs->gpr[ra] * (short) instr; + goto compute_done; case 8: /* subfic */ imm = (short) instr; - add_with_carry(regs, rd, ~regs->gpr[ra], imm, 1); - goto instr_done; + add_with_carry(regs, op, rd, ~regs->gpr[ra], imm, 1); + return 1; case 10: /* cmpli */ imm = (unsigned short) instr; @@ -794,8 +1305,8 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, if ((rd & 1) == 0) val = (unsigned int) val; #endif - do_cmp_unsigned(regs, val, imm, rd >> 2); - goto instr_done; + do_cmp_unsigned(regs, op, val, imm, rd >> 2); + return 1; case 11: /* cmpi */ imm = (short) instr; @@ -804,47 +1315,58 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, if ((rd & 1) == 0) val = (int) val; #endif - do_cmp_signed(regs, val, imm, rd >> 2); - goto instr_done; + do_cmp_signed(regs, op, val, imm, rd >> 2); + return 1; case 12: /* addic */ imm = (short) instr; - add_with_carry(regs, rd, regs->gpr[ra], imm, 0); - goto instr_done; + add_with_carry(regs, op, rd, regs->gpr[ra], imm, 0); + return 1; case 13: /* addic. */ imm = (short) instr; - add_with_carry(regs, rd, regs->gpr[ra], imm, 0); - set_cr0(regs, rd); - goto instr_done; + add_with_carry(regs, op, rd, regs->gpr[ra], imm, 0); + set_cr0(regs, op, rd); + return 1; case 14: /* addi */ imm = (short) instr; if (ra) imm += regs->gpr[ra]; - regs->gpr[rd] = imm; - goto instr_done; + op->val = imm; + goto compute_done; case 15: /* addis */ imm = ((short) instr) << 16; if (ra) imm += regs->gpr[ra]; - regs->gpr[rd] = imm; - goto instr_done; + op->val = imm; + goto compute_done; + + case 19: + if (((instr >> 1) & 0x1f) == 2) { + /* addpcis */ + imm = (short) (instr & 0xffc1); /* d0 + d2 fields */ + imm |= (instr >> 15) & 0x3e; /* d1 field */ + op->val = regs->nip + (imm << 16) + 4; + goto compute_done; + } + op->type = UNKNOWN; + return 0; case 20: /* rlwimi */ mb = (instr >> 6) & 0x1f; me = (instr >> 1) & 0x1f; val = DATA32(regs->gpr[rd]); imm = MASK32(mb, me); - regs->gpr[ra] = (regs->gpr[ra] & ~imm) | (ROTATE(val, rb) & imm); + op->val = (regs->gpr[ra] & ~imm) | (ROTATE(val, rb) & imm); goto logical_done; case 21: /* rlwinm */ mb = (instr >> 6) & 0x1f; me = (instr >> 1) & 0x1f; val = DATA32(regs->gpr[rd]); - regs->gpr[ra] = ROTATE(val, rb) & MASK32(mb, me); + op->val = ROTATE(val, rb) & MASK32(mb, me); goto logical_done; case 23: /* rlwnm */ @@ -852,40 +1374,37 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, me = (instr >> 1) & 0x1f; rb = regs->gpr[rb] & 0x1f; val = DATA32(regs->gpr[rd]); - regs->gpr[ra] = ROTATE(val, rb) & MASK32(mb, me); + op->val = ROTATE(val, rb) & MASK32(mb, me); goto logical_done; case 24: /* ori */ - imm = (unsigned short) instr; - regs->gpr[ra] = regs->gpr[rd] | imm; - goto instr_done; + op->val = regs->gpr[rd] | (unsigned short) instr; + goto logical_done_nocc; case 25: /* oris */ imm = (unsigned short) instr; - regs->gpr[ra] = regs->gpr[rd] | (imm << 16); - goto instr_done; + op->val = regs->gpr[rd] | (imm << 16); + goto logical_done_nocc; case 26: /* xori */ - imm = (unsigned short) instr; - regs->gpr[ra] = regs->gpr[rd] ^ imm; - goto instr_done; + op->val = regs->gpr[rd] ^ (unsigned short) instr; + goto logical_done_nocc; case 27: /* xoris */ imm = (unsigned short) instr; - regs->gpr[ra] = regs->gpr[rd] ^ (imm << 16); - goto instr_done; + op->val = regs->gpr[rd] ^ (imm << 16); + goto logical_done_nocc; case 28: /* andi. */ - imm = (unsigned short) instr; - regs->gpr[ra] = regs->gpr[rd] & imm; - set_cr0(regs, ra); - goto instr_done; + op->val = regs->gpr[rd] & (unsigned short) instr; + set_cr0(regs, op, ra); + goto logical_done_nocc; case 29: /* andis. */ imm = (unsigned short) instr; - regs->gpr[ra] = regs->gpr[rd] & (imm << 16); - set_cr0(regs, ra); - goto instr_done; + op->val = regs->gpr[rd] & (imm << 16); + set_cr0(regs, op, ra); + goto logical_done_nocc; #ifdef __powerpc64__ case 30: /* rld* */ @@ -896,48 +1415,60 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, val = ROTATE(val, sh); switch ((instr >> 2) & 3) { case 0: /* rldicl */ - regs->gpr[ra] = val & MASK64_L(mb); - goto logical_done; + val &= MASK64_L(mb); + break; case 1: /* rldicr */ - regs->gpr[ra] = val & MASK64_R(mb); - goto logical_done; + val &= MASK64_R(mb); + break; case 2: /* rldic */ - regs->gpr[ra] = val & MASK64(mb, 63 - sh); - goto logical_done; + val &= MASK64(mb, 63 - sh); + break; case 3: /* rldimi */ imm = MASK64(mb, 63 - sh); - regs->gpr[ra] = (regs->gpr[ra] & ~imm) | + val = (regs->gpr[ra] & ~imm) | (val & imm); - goto logical_done; } + op->val = val; + goto logical_done; } else { sh = regs->gpr[rb] & 0x3f; val = ROTATE(val, sh); switch ((instr >> 1) & 7) { case 0: /* rldcl */ - regs->gpr[ra] = val & MASK64_L(mb); + op->val = val & MASK64_L(mb); goto logical_done; case 1: /* rldcr */ - regs->gpr[ra] = val & MASK64_R(mb); + op->val = val & MASK64_R(mb); goto logical_done; } } #endif - break; /* illegal instruction */ + op->type = UNKNOWN; /* illegal instruction */ + return 0; case 31: + /* isel occupies 32 minor opcodes */ + if (((instr >> 1) & 0x1f) == 15) { + mb = (instr >> 6) & 0x1f; /* bc field */ + val = (regs->ccr >> (31 - mb)) & 1; + val2 = (ra) ? regs->gpr[ra] : 0; + + op->val = (val) ? val2 : regs->gpr[rb]; + goto compute_done; + } + switch ((instr >> 1) & 0x3ff) { case 4: /* tw */ if (rd == 0x1f || (rd & trap_compare((int)regs->gpr[ra], (int)regs->gpr[rb]))) goto trap; - goto instr_done; + return 1; #ifdef __powerpc64__ case 68: /* td */ if (rd & trap_compare(regs->gpr[ra], regs->gpr[rb])) goto trap; - goto instr_done; + return 1; #endif case 83: /* mfmsr */ if (regs->msr & MSR_PR) @@ -966,74 +1497,50 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, #endif case 19: /* mfcr */ + imm = 0xffffffffUL; if ((instr >> 20) & 1) { imm = 0xf0000000UL; for (sh = 0; sh < 8; ++sh) { - if (instr & (0x80000 >> sh)) { - regs->gpr[rd] = regs->ccr & imm; + if (instr & (0x80000 >> sh)) break; - } imm >>= 4; } - - goto instr_done; } - - regs->gpr[rd] = regs->ccr; - regs->gpr[rd] &= 0xffffffffUL; - goto instr_done; + op->val = regs->ccr & imm; + goto compute_done; case 144: /* mtcrf */ + op->type = COMPUTE + SETCC; imm = 0xf0000000UL; val = regs->gpr[rd]; + op->val = regs->ccr; for (sh = 0; sh < 8; ++sh) { if (instr & (0x80000 >> sh)) - regs->ccr = (regs->ccr & ~imm) | + op->val = (op->val & ~imm) | (val & imm); imm >>= 4; } - goto instr_done; + return 1; case 339: /* mfspr */ spr = ((instr >> 16) & 0x1f) | ((instr >> 6) & 0x3e0); - switch (spr) { - case SPRN_XER: /* mfxer */ - regs->gpr[rd] = regs->xer; - regs->gpr[rd] &= 0xffffffffUL; - goto instr_done; - case SPRN_LR: /* mflr */ - regs->gpr[rd] = regs->link; - goto instr_done; - case SPRN_CTR: /* mfctr */ - regs->gpr[rd] = regs->ctr; - goto instr_done; - default: - op->type = MFSPR; - op->reg = rd; - op->spr = spr; - return 0; - } - break; + op->type = MFSPR; + op->reg = rd; + op->spr = spr; + if (spr == SPRN_XER || spr == SPRN_LR || + spr == SPRN_CTR) + return 1; + return 0; case 467: /* mtspr */ spr = ((instr >> 16) & 0x1f) | ((instr >> 6) & 0x3e0); - switch (spr) { - case SPRN_XER: /* mtxer */ - regs->xer = (regs->gpr[rd] & 0xffffffffUL); - goto instr_done; - case SPRN_LR: /* mtlr */ - regs->link = regs->gpr[rd]; - goto instr_done; - case SPRN_CTR: /* mtctr */ - regs->ctr = regs->gpr[rd]; - goto instr_done; - default: - op->type = MTSPR; - op->val = regs->gpr[rd]; - op->spr = spr; - return 0; - } - break; + op->type = MTSPR; + op->val = regs->gpr[rd]; + op->spr = spr; + if (spr == SPRN_XER || spr == SPRN_LR || + spr == SPRN_CTR) + return 1; + return 0; /* * Compare instructions @@ -1048,8 +1555,8 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, val2 = (int) val2; } #endif - do_cmp_signed(regs, val, val2, rd >> 2); - goto instr_done; + do_cmp_signed(regs, op, val, val2, rd >> 2); + return 1; case 32: /* cmpl */ val = regs->gpr[ra]; @@ -1061,109 +1568,113 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, val2 = (unsigned int) val2; } #endif - do_cmp_unsigned(regs, val, val2, rd >> 2); - goto instr_done; + do_cmp_unsigned(regs, op, val, val2, rd >> 2); + return 1; + + case 508: /* cmpb */ + do_cmpb(regs, op, regs->gpr[rd], regs->gpr[rb]); + goto logical_done_nocc; /* * Arithmetic instructions */ case 8: /* subfc */ - add_with_carry(regs, rd, ~regs->gpr[ra], + add_with_carry(regs, op, rd, ~regs->gpr[ra], regs->gpr[rb], 1); goto arith_done; #ifdef __powerpc64__ case 9: /* mulhdu */ - asm("mulhdu %0,%1,%2" : "=r" (regs->gpr[rd]) : + asm("mulhdu %0,%1,%2" : "=r" (op->val) : "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); goto arith_done; #endif case 10: /* addc */ - add_with_carry(regs, rd, regs->gpr[ra], + add_with_carry(regs, op, rd, regs->gpr[ra], regs->gpr[rb], 0); goto arith_done; case 11: /* mulhwu */ - asm("mulhwu %0,%1,%2" : "=r" (regs->gpr[rd]) : + asm("mulhwu %0,%1,%2" : "=r" (op->val) : "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); goto arith_done; case 40: /* subf */ - regs->gpr[rd] = regs->gpr[rb] - regs->gpr[ra]; + op->val = regs->gpr[rb] - regs->gpr[ra]; goto arith_done; #ifdef __powerpc64__ case 73: /* mulhd */ - asm("mulhd %0,%1,%2" : "=r" (regs->gpr[rd]) : + asm("mulhd %0,%1,%2" : "=r" (op->val) : "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); goto arith_done; #endif case 75: /* mulhw */ - asm("mulhw %0,%1,%2" : "=r" (regs->gpr[rd]) : + asm("mulhw %0,%1,%2" : "=r" (op->val) : "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); goto arith_done; case 104: /* neg */ - regs->gpr[rd] = -regs->gpr[ra]; + op->val = -regs->gpr[ra]; goto arith_done; case 136: /* subfe */ - add_with_carry(regs, rd, ~regs->gpr[ra], regs->gpr[rb], - regs->xer & XER_CA); + add_with_carry(regs, op, rd, ~regs->gpr[ra], + regs->gpr[rb], regs->xer & XER_CA); goto arith_done; case 138: /* adde */ - add_with_carry(regs, rd, regs->gpr[ra], regs->gpr[rb], - regs->xer & XER_CA); + add_with_carry(regs, op, rd, regs->gpr[ra], + regs->gpr[rb], regs->xer & XER_CA); goto arith_done; case 200: /* subfze */ - add_with_carry(regs, rd, ~regs->gpr[ra], 0L, + add_with_carry(regs, op, rd, ~regs->gpr[ra], 0L, regs->xer & XER_CA); goto arith_done; case 202: /* addze */ - add_with_carry(regs, rd, regs->gpr[ra], 0L, + add_with_carry(regs, op, rd, regs->gpr[ra], 0L, regs->xer & XER_CA); goto arith_done; case 232: /* subfme */ - add_with_carry(regs, rd, ~regs->gpr[ra], -1L, + add_with_carry(regs, op, rd, ~regs->gpr[ra], -1L, regs->xer & XER_CA); goto arith_done; #ifdef __powerpc64__ case 233: /* mulld */ - regs->gpr[rd] = regs->gpr[ra] * regs->gpr[rb]; + op->val = regs->gpr[ra] * regs->gpr[rb]; goto arith_done; #endif case 234: /* addme */ - add_with_carry(regs, rd, regs->gpr[ra], -1L, + add_with_carry(regs, op, rd, regs->gpr[ra], -1L, regs->xer & XER_CA); goto arith_done; case 235: /* mullw */ - regs->gpr[rd] = (unsigned int) regs->gpr[ra] * + op->val = (unsigned int) regs->gpr[ra] * (unsigned int) regs->gpr[rb]; goto arith_done; case 266: /* add */ - regs->gpr[rd] = regs->gpr[ra] + regs->gpr[rb]; + op->val = regs->gpr[ra] + regs->gpr[rb]; goto arith_done; #ifdef __powerpc64__ case 457: /* divdu */ - regs->gpr[rd] = regs->gpr[ra] / regs->gpr[rb]; + op->val = regs->gpr[ra] / regs->gpr[rb]; goto arith_done; #endif case 459: /* divwu */ - regs->gpr[rd] = (unsigned int) regs->gpr[ra] / + op->val = (unsigned int) regs->gpr[ra] / (unsigned int) regs->gpr[rb]; goto arith_done; #ifdef __powerpc64__ case 489: /* divd */ - regs->gpr[rd] = (long int) regs->gpr[ra] / + op->val = (long int) regs->gpr[ra] / (long int) regs->gpr[rb]; goto arith_done; #endif case 491: /* divw */ - regs->gpr[rd] = (int) regs->gpr[ra] / + op->val = (int) regs->gpr[ra] / (int) regs->gpr[rb]; goto arith_done; @@ -1172,57 +1683,79 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, * Logical instructions */ case 26: /* cntlzw */ - asm("cntlzw %0,%1" : "=r" (regs->gpr[ra]) : - "r" (regs->gpr[rd])); + op->val = __builtin_clz((unsigned int) regs->gpr[rd]); goto logical_done; #ifdef __powerpc64__ case 58: /* cntlzd */ - asm("cntlzd %0,%1" : "=r" (regs->gpr[ra]) : - "r" (regs->gpr[rd])); + op->val = __builtin_clzl(regs->gpr[rd]); goto logical_done; #endif case 28: /* and */ - regs->gpr[ra] = regs->gpr[rd] & regs->gpr[rb]; + op->val = regs->gpr[rd] & regs->gpr[rb]; goto logical_done; case 60: /* andc */ - regs->gpr[ra] = regs->gpr[rd] & ~regs->gpr[rb]; + op->val = regs->gpr[rd] & ~regs->gpr[rb]; goto logical_done; + case 122: /* popcntb */ + do_popcnt(regs, op, regs->gpr[rd], 8); + goto logical_done_nocc; + case 124: /* nor */ - regs->gpr[ra] = ~(regs->gpr[rd] | regs->gpr[rb]); + op->val = ~(regs->gpr[rd] | regs->gpr[rb]); goto logical_done; + case 154: /* prtyw */ + do_prty(regs, op, regs->gpr[rd], 32); + goto logical_done_nocc; + + case 186: /* prtyd */ + do_prty(regs, op, regs->gpr[rd], 64); + goto logical_done_nocc; +#ifdef CONFIG_PPC64 + case 252: /* bpermd */ + do_bpermd(regs, op, regs->gpr[rd], regs->gpr[rb]); + goto logical_done_nocc; +#endif case 284: /* xor */ - regs->gpr[ra] = ~(regs->gpr[rd] ^ regs->gpr[rb]); + op->val = ~(regs->gpr[rd] ^ regs->gpr[rb]); goto logical_done; case 316: /* xor */ - regs->gpr[ra] = regs->gpr[rd] ^ regs->gpr[rb]; + op->val = regs->gpr[rd] ^ regs->gpr[rb]; goto logical_done; + case 378: /* popcntw */ + do_popcnt(regs, op, regs->gpr[rd], 32); + goto logical_done_nocc; + case 412: /* orc */ - regs->gpr[ra] = regs->gpr[rd] | ~regs->gpr[rb]; + op->val = regs->gpr[rd] | ~regs->gpr[rb]; goto logical_done; case 444: /* or */ - regs->gpr[ra] = regs->gpr[rd] | regs->gpr[rb]; + op->val = regs->gpr[rd] | regs->gpr[rb]; goto logical_done; case 476: /* nand */ - regs->gpr[ra] = ~(regs->gpr[rd] & regs->gpr[rb]); + op->val = ~(regs->gpr[rd] & regs->gpr[rb]); goto logical_done; - +#ifdef CONFIG_PPC64 + case 506: /* popcntd */ + do_popcnt(regs, op, regs->gpr[rd], 64); + goto logical_done_nocc; +#endif case 922: /* extsh */ - regs->gpr[ra] = (signed short) regs->gpr[rd]; + op->val = (signed short) regs->gpr[rd]; goto logical_done; case 954: /* extsb */ - regs->gpr[ra] = (signed char) regs->gpr[rd]; + op->val = (signed char) regs->gpr[rd]; goto logical_done; #ifdef __powerpc64__ case 986: /* extsw */ - regs->gpr[ra] = (signed int) regs->gpr[rd]; + op->val = (signed int) regs->gpr[rd]; goto logical_done; #endif @@ -1232,75 +1765,83 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, case 24: /* slw */ sh = regs->gpr[rb] & 0x3f; if (sh < 32) - regs->gpr[ra] = (regs->gpr[rd] << sh) & 0xffffffffUL; + op->val = (regs->gpr[rd] << sh) & 0xffffffffUL; else - regs->gpr[ra] = 0; + op->val = 0; goto logical_done; case 536: /* srw */ sh = regs->gpr[rb] & 0x3f; if (sh < 32) - regs->gpr[ra] = (regs->gpr[rd] & 0xffffffffUL) >> sh; + op->val = (regs->gpr[rd] & 0xffffffffUL) >> sh; else - regs->gpr[ra] = 0; + op->val = 0; goto logical_done; case 792: /* sraw */ + op->type = COMPUTE + SETREG + SETXER; sh = regs->gpr[rb] & 0x3f; ival = (signed int) regs->gpr[rd]; - regs->gpr[ra] = ival >> (sh < 32 ? sh : 31); + op->val = ival >> (sh < 32 ? sh : 31); + op->xerval = regs->xer; if (ival < 0 && (sh >= 32 || (ival & ((1ul << sh) - 1)) != 0)) - regs->xer |= XER_CA; + op->xerval |= XER_CA; else - regs->xer &= ~XER_CA; + op->xerval &= ~XER_CA; goto logical_done; case 824: /* srawi */ + op->type = COMPUTE + SETREG + SETXER; sh = rb; ival = (signed int) regs->gpr[rd]; - regs->gpr[ra] = ival >> sh; + op->val = ival >> sh; + op->xerval = regs->xer; if (ival < 0 && (ival & ((1ul << sh) - 1)) != 0) - regs->xer |= XER_CA; + op->xerval |= XER_CA; else - regs->xer &= ~XER_CA; + op->xerval &= ~XER_CA; goto logical_done; #ifdef __powerpc64__ case 27: /* sld */ sh = regs->gpr[rb] & 0x7f; if (sh < 64) - regs->gpr[ra] = regs->gpr[rd] << sh; + op->val = regs->gpr[rd] << sh; else - regs->gpr[ra] = 0; + op->val = 0; goto logical_done; case 539: /* srd */ sh = regs->gpr[rb] & 0x7f; if (sh < 64) - regs->gpr[ra] = regs->gpr[rd] >> sh; + op->val = regs->gpr[rd] >> sh; else - regs->gpr[ra] = 0; + op->val = 0; goto logical_done; case 794: /* srad */ + op->type = COMPUTE + SETREG + SETXER; sh = regs->gpr[rb] & 0x7f; ival = (signed long int) regs->gpr[rd]; - regs->gpr[ra] = ival >> (sh < 64 ? sh : 63); + op->val = ival >> (sh < 64 ? sh : 63); + op->xerval = regs->xer; if (ival < 0 && (sh >= 64 || (ival & ((1ul << sh) - 1)) != 0)) - regs->xer |= XER_CA; + op->xerval |= XER_CA; else - regs->xer &= ~XER_CA; + op->xerval &= ~XER_CA; goto logical_done; case 826: /* sradi with sh_5 = 0 */ case 827: /* sradi with sh_5 = 1 */ + op->type = COMPUTE + SETREG + SETXER; sh = rb | ((instr & 2) << 4); ival = (signed long int) regs->gpr[rd]; - regs->gpr[ra] = ival >> sh; + op->val = ival >> sh; + op->xerval = regs->xer; if (ival < 0 && (ival & ((1ul << sh) - 1)) != 0) - regs->xer |= XER_CA; + op->xerval |= XER_CA; else - regs->xer &= ~XER_CA; + op->xerval &= ~XER_CA; goto logical_done; #endif /* __powerpc64__ */ @@ -1333,18 +1874,24 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, op->type = MKOP(CACHEOP, ICBI, 0); op->ea = xform_ea(instr, regs); return 0; + + case 1014: /* dcbz */ + op->type = MKOP(CACHEOP, DCBZ, 0); + op->ea = xform_ea(instr, regs); + return 0; } break; } - /* - * Loads and stores. - */ +/* + * Loads and stores. + */ op->type = UNKNOWN; op->update_reg = ra; op->reg = rd; op->val = regs->gpr[rd]; u = (instr >> 20) & UPDATE; + op->vsx_flags = 0; switch (opcode) { case 31: @@ -1368,9 +1915,30 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, op->type = MKOP(STCX, 0, 8); break; - case 21: /* ldx */ - case 53: /* ldux */ - op->type = MKOP(LOAD, u, 8); + case 52: /* lbarx */ + op->type = MKOP(LARX, 0, 1); + break; + + case 694: /* stbcx. */ + op->type = MKOP(STCX, 0, 1); + break; + + case 116: /* lharx */ + op->type = MKOP(LARX, 0, 2); + break; + + case 726: /* sthcx. */ + op->type = MKOP(STCX, 0, 2); + break; + + case 276: /* lqarx */ + if (!((rd & 1) || rd == ra || rd == rb)) + op->type = MKOP(LARX, 0, 16); + break; + + case 182: /* stqcx. */ + if (!(rd & 1)) + op->type = MKOP(STCX, 0, 16); break; #endif @@ -1385,22 +1953,58 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, break; #ifdef CONFIG_ALTIVEC + /* + * Note: for the load/store vector element instructions, + * bits of the EA say which field of the VMX register to use. + */ + case 7: /* lvebx */ + op->type = MKOP(LOAD_VMX, 0, 1); + op->element_size = 1; + break; + + case 39: /* lvehx */ + op->type = MKOP(LOAD_VMX, 0, 2); + op->element_size = 2; + break; + + case 71: /* lvewx */ + op->type = MKOP(LOAD_VMX, 0, 4); + op->element_size = 4; + break; + case 103: /* lvx */ case 359: /* lvxl */ - if (!(regs->msr & MSR_VEC)) - goto vecunavail; op->type = MKOP(LOAD_VMX, 0, 16); + op->element_size = 16; + break; + + case 135: /* stvebx */ + op->type = MKOP(STORE_VMX, 0, 1); + op->element_size = 1; + break; + + case 167: /* stvehx */ + op->type = MKOP(STORE_VMX, 0, 2); + op->element_size = 2; + break; + + case 199: /* stvewx */ + op->type = MKOP(STORE_VMX, 0, 4); + op->element_size = 4; break; case 231: /* stvx */ case 487: /* stvxl */ - if (!(regs->msr & MSR_VEC)) - goto vecunavail; op->type = MKOP(STORE_VMX, 0, 16); break; #endif /* CONFIG_ALTIVEC */ #ifdef __powerpc64__ + case 21: /* ldx */ + case 53: /* ldux */ + op->type = MKOP(LOAD, u, 8); + break; + case 149: /* stdx */ case 181: /* stdux */ op->type = MKOP(STORE, u, 8); @@ -1457,41 +2061,52 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, if (rb == 0) rb = 32; /* # bytes to load */ op->type = MKOP(LOAD_MULTI, 0, rb); - op->ea = 0; - if (ra) - op->ea = truncate_if_32bit(regs->msr, - regs->gpr[ra]); + op->ea = ra ? regs->gpr[ra] : 0; break; #ifdef CONFIG_PPC_FPU case 535: /* lfsx */ case 567: /* lfsux */ - if (!(regs->msr & MSR_FP)) - goto fpunavail; - op->type = MKOP(LOAD_FP, u, 4); + op->type = MKOP(LOAD_FP, u | FPCONV, 4); break; case 599: /* lfdx */ case 631: /* lfdux */ - if (!(regs->msr & MSR_FP)) - goto fpunavail; op->type = MKOP(LOAD_FP, u, 8); break; case 663: /* stfsx */ case 695: /* stfsux */ - if (!(regs->msr & MSR_FP)) - goto fpunavail; - op->type = MKOP(STORE_FP, u, 4); + op->type = MKOP(STORE_FP, u | FPCONV, 4); break; case 727: /* stfdx */ case 759: /* stfdux */ - if (!(regs->msr & MSR_FP)) - goto fpunavail; op->type = MKOP(STORE_FP, u, 8); break; -#endif + +#ifdef __powerpc64__ + case 791: /* lfdpx */ + op->type = MKOP(LOAD_FP, 0, 16); + break; + + case 855: /* lfiwax */ + op->type = MKOP(LOAD_FP, SIGNEXT, 4); + break; + + case 887: /* lfiwzx */ + op->type = MKOP(LOAD_FP, 0, 4); + break; + + case 919: /* stfdpx */ + op->type = MKOP(STORE_FP, 0, 16); + break; + + case 983: /* stfiwx */ + op->type = MKOP(STORE_FP, 0, 4); + break; +#endif /* __powerpc64 */ +#endif /* CONFIG_PPC_FPU */ #ifdef __powerpc64__ case 660: /* stdbrx */ @@ -1509,14 +2124,11 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, op->val = byterev_4(regs->gpr[rd]); break; - case 725: + case 725: /* stswi */ if (rb == 0) rb = 32; /* # bytes to store */ op->type = MKOP(STORE_MULTI, 0, rb); - op->ea = 0; - if (ra) - op->ea = truncate_if_32bit(regs->msr, - regs->gpr[ra]); + op->ea = ra ? regs->gpr[ra] : 0; break; case 790: /* lhbrx */ @@ -1529,20 +2141,184 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, break; #ifdef CONFIG_VSX + case 12: /* lxsiwzx */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 4); + op->element_size = 8; + break; + + case 76: /* lxsiwax */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(LOAD_VSX, SIGNEXT, 4); + op->element_size = 8; + break; + + case 140: /* stxsiwx */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 4); + op->element_size = 8; + break; + + case 268: /* lxvx */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 16); + op->element_size = 16; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 269: /* lxvl */ + case 301: { /* lxvll */ + int nb; + op->reg = rd | ((instr & 1) << 5); + op->ea = ra ? regs->gpr[ra] : 0; + nb = regs->gpr[rb] & 0xff; + if (nb > 16) + nb = 16; + op->type = MKOP(LOAD_VSX, 0, nb); + op->element_size = 16; + op->vsx_flags = ((instr & 0x20) ? VSX_LDLEFT : 0) | + VSX_CHECK_VEC; + break; + } + case 332: /* lxvdsx */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 8); + op->element_size = 8; + op->vsx_flags = VSX_SPLAT; + break; + + case 364: /* lxvwsx */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 4); + op->element_size = 4; + op->vsx_flags = VSX_SPLAT | VSX_CHECK_VEC; + break; + + case 396: /* stxvx */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 16); + op->element_size = 16; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 397: /* stxvl */ + case 429: { /* stxvll */ + int nb; + op->reg = rd | ((instr & 1) << 5); + op->ea = ra ? regs->gpr[ra] : 0; + nb = regs->gpr[rb] & 0xff; + if (nb > 16) + nb = 16; + op->type = MKOP(STORE_VSX, 0, nb); + op->element_size = 16; + op->vsx_flags = ((instr & 0x20) ? VSX_LDLEFT : 0) | + VSX_CHECK_VEC; + break; + } + case 524: /* lxsspx */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 4); + op->element_size = 8; + op->vsx_flags = VSX_FPCONV; + break; + + case 588: /* lxsdx */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 8); + op->element_size = 8; + break; + + case 652: /* stxsspx */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 4); + op->element_size = 8; + op->vsx_flags = VSX_FPCONV; + break; + + case 716: /* stxsdx */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 8); + op->element_size = 8; + break; + + case 780: /* lxvw4x */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 16); + op->element_size = 4; + break; + + case 781: /* lxsibzx */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 1); + op->element_size = 8; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 812: /* lxvh8x */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 16); + op->element_size = 2; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 813: /* lxsihzx */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 2); + op->element_size = 8; + op->vsx_flags = VSX_CHECK_VEC; + break; + case 844: /* lxvd2x */ - case 876: /* lxvd2ux */ - if (!(regs->msr & MSR_VSX)) - goto vsxunavail; op->reg = rd | ((instr & 1) << 5); - op->type = MKOP(LOAD_VSX, u, 16); + op->type = MKOP(LOAD_VSX, 0, 16); + op->element_size = 8; + break; + + case 876: /* lxvb16x */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 16); + op->element_size = 1; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 908: /* stxvw4x */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 16); + op->element_size = 4; + break; + + case 909: /* stxsibx */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 1); + op->element_size = 8; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 940: /* stxvh8x */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 16); + op->element_size = 2; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 941: /* stxsihx */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 2); + op->element_size = 8; + op->vsx_flags = VSX_CHECK_VEC; break; case 972: /* stxvd2x */ - case 1004: /* stxvd2ux */ - if (!(regs->msr & MSR_VSX)) - goto vsxunavail; op->reg = rd | ((instr & 1) << 5); - op->type = MKOP(STORE_VSX, u, 16); + op->type = MKOP(STORE_VSX, 0, 16); + op->element_size = 8; + break; + + case 1004: /* stxvb16x */ + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 16); + op->element_size = 1; + op->vsx_flags = VSX_CHECK_VEC; break; #endif /* CONFIG_VSX */ @@ -1606,38 +2382,63 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, #ifdef CONFIG_PPC_FPU case 48: /* lfs */ case 49: /* lfsu */ - if (!(regs->msr & MSR_FP)) - goto fpunavail; - op->type = MKOP(LOAD_FP, u, 4); + op->type = MKOP(LOAD_FP, u | FPCONV, 4); op->ea = dform_ea(instr, regs); break; case 50: /* lfd */ case 51: /* lfdu */ - if (!(regs->msr & MSR_FP)) - goto fpunavail; op->type = MKOP(LOAD_FP, u, 8); op->ea = dform_ea(instr, regs); break; case 52: /* stfs */ case 53: /* stfsu */ - if (!(regs->msr & MSR_FP)) - goto fpunavail; - op->type = MKOP(STORE_FP, u, 4); + op->type = MKOP(STORE_FP, u | FPCONV, 4); op->ea = dform_ea(instr, regs); break; case 54: /* stfd */ case 55: /* stfdu */ - if (!(regs->msr & MSR_FP)) - goto fpunavail; op->type = MKOP(STORE_FP, u, 8); op->ea = dform_ea(instr, regs); break; #endif #ifdef __powerpc64__ + case 56: /* lq */ + if (!((rd & 1) || (rd == ra))) + op->type = MKOP(LOAD, 0, 16); + op->ea = dqform_ea(instr, regs); + break; +#endif + +#ifdef CONFIG_VSX + case 57: /* lfdp, lxsd, lxssp */ + op->ea = dsform_ea(instr, regs); + switch (instr & 3) { + case 0: /* lfdp */ + if (rd & 1) + break; /* reg must be even */ + op->type = MKOP(LOAD_FP, 0, 16); + break; + case 2: /* lxsd */ + op->reg = rd + 32; + op->type = MKOP(LOAD_VSX, 0, 8); + op->element_size = 8; + op->vsx_flags = VSX_CHECK_VEC; + break; + case 3: /* lxssp */ + op->reg = rd + 32; + op->type = MKOP(LOAD_VSX, 0, 4); + op->element_size = 8; + op->vsx_flags = VSX_FPCONV | VSX_CHECK_VEC; + break; + } + break; +#endif /* CONFIG_VSX */ + +#ifdef __powerpc64__ case 58: /* ld[u], lwa */ op->ea = dsform_ea(instr, regs); switch (instr & 3) { @@ -1652,7 +2453,57 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, break; } break; +#endif +#ifdef CONFIG_VSX + case 61: /* stfdp, lxv, stxsd, stxssp, stxv */ + switch (instr & 7) { + case 0: /* stfdp with LSB of DS field = 0 */ + case 4: /* stfdp with LSB of DS field = 1 */ + op->ea = dsform_ea(instr, regs); + op->type = MKOP(STORE_FP, 0, 16); + break; + + case 1: /* lxv */ + op->ea = dqform_ea(instr, regs); + if (instr & 8) + op->reg = rd + 32; + op->type = MKOP(LOAD_VSX, 0, 16); + op->element_size = 16; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 2: /* stxsd with LSB of DS field = 0 */ + case 6: /* stxsd with LSB of DS field = 1 */ + op->ea = dsform_ea(instr, regs); + op->reg = rd + 32; + op->type = MKOP(STORE_VSX, 0, 8); + op->element_size = 8; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 3: /* stxssp with LSB of DS field = 0 */ + case 7: /* stxssp with LSB of DS field = 1 */ + op->ea = dsform_ea(instr, regs); + op->reg = rd + 32; + op->type = MKOP(STORE_VSX, 0, 4); + op->element_size = 8; + op->vsx_flags = VSX_FPCONV | VSX_CHECK_VEC; + break; + + case 5: /* stxv */ + op->ea = dqform_ea(instr, regs); + if (instr & 8) + op->reg = rd + 32; + op->type = MKOP(STORE_VSX, 0, 16); + op->element_size = 16; + op->vsx_flags = VSX_CHECK_VEC; + break; + } + break; +#endif /* CONFIG_VSX */ + +#ifdef __powerpc64__ case 62: /* std[u] */ op->ea = dsform_ea(instr, regs); switch (instr & 3) { @@ -1662,6 +2513,10 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, case 1: /* stdu */ op->type = MKOP(STORE, UPDATE, 8); break; + case 2: /* stq */ + if (!(rd & 1)) + op->type = MKOP(STORE, 0, 16); + break; } break; #endif /* __powerpc64__ */ @@ -1671,15 +2526,18 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, logical_done: if (instr & 1) - set_cr0(regs, ra); - goto instr_done; + set_cr0(regs, op, ra); + logical_done_nocc: + op->reg = ra; + op->type |= SETREG; + return 1; arith_done: if (instr & 1) - set_cr0(regs, rd); - - instr_done: - regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4); + set_cr0(regs, op, rd); + compute_done: + op->reg = rd; + op->type |= SETREG; return 1; priv: @@ -1691,24 +2549,6 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs, op->type = INTERRUPT | 0x700; op->val = SRR1_PROGTRAP; return 0; - -#ifdef CONFIG_PPC_FPU - fpunavail: - op->type = INTERRUPT | 0x800; - return 0; -#endif - -#ifdef CONFIG_ALTIVEC - vecunavail: - op->type = INTERRUPT | 0xf20; - return 0; -#endif - -#ifdef CONFIG_VSX - vsxunavail: - op->type = INTERRUPT | 0xf40; - return 0; -#endif } EXPORT_SYMBOL_GPL(analyse_instr); NOKPROBE_SYMBOL(analyse_instr); @@ -1771,190 +2611,412 @@ static nokprobe_inline void do_byterev(unsigned long *valp, int size) } /* - * Emulate instructions that cause a transfer of control, - * loads and stores, and a few other instructions. - * Returns 1 if the step was emulated, 0 if not, - * or -1 if the instruction is one that should not be stepped, - * such as an rfid, or a mtmsrd that would clear MSR_RI. + * Emulate an instruction that can be executed just by updating + * fields in *regs. */ -int emulate_step(struct pt_regs *regs, unsigned int instr) +void emulate_update_regs(struct pt_regs *regs, struct instruction_op *op) { - struct instruction_op op; - int r, err, size; - unsigned long val; - unsigned int cr; - int i, rd, nb; + unsigned long next_pc; + + next_pc = truncate_if_32bit(regs->msr, regs->nip + 4); + switch (op->type & INSTR_TYPE_MASK) { + case COMPUTE: + if (op->type & SETREG) + regs->gpr[op->reg] = op->val; + if (op->type & SETCC) + regs->ccr = op->ccval; + if (op->type & SETXER) + regs->xer = op->xerval; + break; - r = analyse_instr(&op, regs, instr); - if (r != 0) - return r; + case BRANCH: + if (op->type & SETLK) + regs->link = next_pc; + if (op->type & BRTAKEN) + next_pc = op->val; + if (op->type & DECCTR) + --regs->ctr; + break; - err = 0; - size = GETSIZE(op.type); - switch (op.type & INSTR_TYPE_MASK) { - case CACHEOP: - if (!address_ok(regs, op.ea, 8)) - return 0; - switch (op.type & CACHEOP_MASK) { - case DCBST: - __cacheop_user_asmx(op.ea, err, "dcbst"); + case BARRIER: + switch (op->type & BARRIER_MASK) { + case BARRIER_SYNC: + mb(); break; - case DCBF: - __cacheop_user_asmx(op.ea, err, "dcbf"); + case BARRIER_ISYNC: + isync(); break; - case DCBTST: - if (op.reg == 0) - prefetchw((void *) op.ea); + case BARRIER_EIEIO: + eieio(); break; - case DCBT: - if (op.reg == 0) - prefetch((void *) op.ea); + case BARRIER_LWSYNC: + asm volatile("lwsync" : : : "memory"); break; - case ICBI: - __cacheop_user_asmx(op.ea, err, "icbi"); + case BARRIER_PTESYNC: + asm volatile("ptesync" : : : "memory"); break; } - if (err) - return 0; - goto instr_done; + break; + + case MFSPR: + switch (op->spr) { + case SPRN_XER: + regs->gpr[op->reg] = regs->xer & 0xffffffffUL; + break; + case SPRN_LR: + regs->gpr[op->reg] = regs->link; + break; + case SPRN_CTR: + regs->gpr[op->reg] = regs->ctr; + break; + default: + WARN_ON_ONCE(1); + } + break; + + case MTSPR: + switch (op->spr) { + case SPRN_XER: + regs->xer = op->val & 0xffffffffUL; + break; + case SPRN_LR: + regs->link = op->val; + break; + case SPRN_CTR: + regs->ctr = op->val; + break; + default: + WARN_ON_ONCE(1); + } + break; + + default: + WARN_ON_ONCE(1); + } + regs->nip = next_pc; +} + +/* + * Emulate a previously-analysed load or store instruction. + * Return values are: + * 0 = instruction emulated successfully + * -EFAULT = address out of range or access faulted (regs->dar + * contains the faulting address) + * -EACCES = misaligned access, instruction requires alignment + * -EINVAL = unknown operation in *op + */ +int emulate_loadstore(struct pt_regs *regs, struct instruction_op *op) +{ + int err, size, type; + int i, rd, nb; + unsigned int cr; + unsigned long val; + unsigned long ea; + bool cross_endian; + + err = 0; + size = GETSIZE(op->type); + type = op->type & INSTR_TYPE_MASK; + cross_endian = (regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE); + ea = truncate_if_32bit(regs->msr, op->ea); + switch (type) { case LARX: - if (op.ea & (size - 1)) - break; /* can't handle misaligned */ - if (!address_ok(regs, op.ea, size)) - return 0; + if (ea & (size - 1)) + return -EACCES; /* can't handle misaligned */ + if (!address_ok(regs, ea, size)) + return -EFAULT; err = 0; + val = 0; switch (size) { +#ifdef __powerpc64__ + case 1: + __get_user_asmx(val, ea, err, "lbarx"); + break; + case 2: + __get_user_asmx(val, ea, err, "lharx"); + break; +#endif case 4: - __get_user_asmx(val, op.ea, err, "lwarx"); + __get_user_asmx(val, ea, err, "lwarx"); break; #ifdef __powerpc64__ case 8: - __get_user_asmx(val, op.ea, err, "ldarx"); + __get_user_asmx(val, ea, err, "ldarx"); + break; + case 16: + err = do_lqarx(ea, ®s->gpr[op->reg]); break; #endif default: - return 0; + return -EINVAL; } - if (!err) - regs->gpr[op.reg] = val; - goto ldst_done; + if (err) { + regs->dar = ea; + break; + } + if (size < 16) + regs->gpr[op->reg] = val; + break; case STCX: - if (op.ea & (size - 1)) - break; /* can't handle misaligned */ - if (!address_ok(regs, op.ea, size)) - return 0; + if (ea & (size - 1)) + return -EACCES; /* can't handle misaligned */ + if (!address_ok(regs, ea, size)) + return -EFAULT; err = 0; switch (size) { +#ifdef __powerpc64__ + case 1: + __put_user_asmx(op->val, ea, err, "stbcx.", cr); + break; + case 2: + __put_user_asmx(op->val, ea, err, "stbcx.", cr); + break; +#endif case 4: - __put_user_asmx(op.val, op.ea, err, "stwcx.", cr); + __put_user_asmx(op->val, ea, err, "stwcx.", cr); break; #ifdef __powerpc64__ case 8: - __put_user_asmx(op.val, op.ea, err, "stdcx.", cr); + __put_user_asmx(op->val, ea, err, "stdcx.", cr); + break; + case 16: + err = do_stqcx(ea, regs->gpr[op->reg], + regs->gpr[op->reg + 1], &cr); break; #endif default: - return 0; + return -EINVAL; } if (!err) regs->ccr = (regs->ccr & 0x0fffffff) | (cr & 0xe0000000) | ((regs->xer >> 3) & 0x10000000); - goto ldst_done; + else + regs->dar = ea; + break; case LOAD: - err = read_mem(®s->gpr[op.reg], op.ea, size, regs); +#ifdef __powerpc64__ + if (size == 16) { + err = emulate_lq(regs, ea, op->reg, cross_endian); + break; + } +#endif + err = read_mem(®s->gpr[op->reg], ea, size, regs); if (!err) { - if (op.type & SIGNEXT) - do_signext(®s->gpr[op.reg], size); - if (op.type & BYTEREV) - do_byterev(®s->gpr[op.reg], size); + if (op->type & SIGNEXT) + do_signext(®s->gpr[op->reg], size); + if ((op->type & BYTEREV) == (cross_endian ? 0 : BYTEREV)) + do_byterev(®s->gpr[op->reg], size); } - goto ldst_done; + break; #ifdef CONFIG_PPC_FPU case LOAD_FP: - if (size == 4) - err = do_fp_load(op.reg, do_lfs, op.ea, size, regs); - else - err = do_fp_load(op.reg, do_lfd, op.ea, size, regs); - goto ldst_done; + /* + * If the instruction is in userspace, we can emulate it even + * if the VMX state is not live, because we have the state + * stored in the thread_struct. If the instruction is in + * the kernel, we must not touch the state in the thread_struct. + */ + if (!(regs->msr & MSR_PR) && !(regs->msr & MSR_FP)) + return 0; + err = do_fp_load(op, ea, regs, cross_endian); + break; #endif #ifdef CONFIG_ALTIVEC case LOAD_VMX: - err = do_vec_load(op.reg, do_lvx, op.ea & ~0xfUL, regs); - goto ldst_done; + if (!(regs->msr & MSR_PR) && !(regs->msr & MSR_VEC)) + return 0; + err = do_vec_load(op->reg, ea, size, regs, cross_endian); + break; #endif #ifdef CONFIG_VSX - case LOAD_VSX: - err = do_vsx_load(op.reg, do_lxvd2x, op.ea, regs); - goto ldst_done; + case LOAD_VSX: { + unsigned long msrbit = MSR_VSX; + + /* + * Some VSX instructions check the MSR_VEC bit rather than MSR_VSX + * when the target of the instruction is a vector register. + */ + if (op->reg >= 32 && (op->vsx_flags & VSX_CHECK_VEC)) + msrbit = MSR_VEC; + if (!(regs->msr & MSR_PR) && !(regs->msr & msrbit)) + return 0; + err = do_vsx_load(op, ea, regs, cross_endian); + break; + } #endif case LOAD_MULTI: - if (regs->msr & MSR_LE) - return 0; - rd = op.reg; + if (!address_ok(regs, ea, size)) + return -EFAULT; + rd = op->reg; for (i = 0; i < size; i += 4) { + unsigned int v32 = 0; + nb = size - i; if (nb > 4) nb = 4; - err = read_mem(®s->gpr[rd], op.ea, nb, regs); + err = copy_mem_in((u8 *) &v32, ea, nb, regs); if (err) - return 0; - if (nb < 4) /* left-justify last bytes */ - regs->gpr[rd] <<= 32 - 8 * nb; - op.ea += 4; - ++rd; + break; + if (unlikely(cross_endian)) + v32 = byterev_4(v32); + regs->gpr[rd] = v32; + ea += 4; + /* reg number wraps from 31 to 0 for lsw[ix] */ + rd = (rd + 1) & 0x1f; } - goto instr_done; + break; case STORE: - if ((op.type & UPDATE) && size == sizeof(long) && - op.reg == 1 && op.update_reg == 1 && +#ifdef __powerpc64__ + if (size == 16) { + err = emulate_stq(regs, ea, op->reg, cross_endian); + break; + } +#endif + if ((op->type & UPDATE) && size == sizeof(long) && + op->reg == 1 && op->update_reg == 1 && !(regs->msr & MSR_PR) && - op.ea >= regs->gpr[1] - STACK_INT_FRAME_SIZE) { - err = handle_stack_update(op.ea, regs); - goto ldst_done; + ea >= regs->gpr[1] - STACK_INT_FRAME_SIZE) { + err = handle_stack_update(ea, regs); + break; } - err = write_mem(op.val, op.ea, size, regs); - goto ldst_done; + if (unlikely(cross_endian)) + do_byterev(&op->val, size); + err = write_mem(op->val, ea, size, regs); + break; #ifdef CONFIG_PPC_FPU case STORE_FP: - if (size == 4) - err = do_fp_store(op.reg, do_stfs, op.ea, size, regs); - else - err = do_fp_store(op.reg, do_stfd, op.ea, size, regs); - goto ldst_done; + if (!(regs->msr & MSR_PR) && !(regs->msr & MSR_FP)) + return 0; + err = do_fp_store(op, ea, regs, cross_endian); + break; #endif #ifdef CONFIG_ALTIVEC case STORE_VMX: - err = do_vec_store(op.reg, do_stvx, op.ea & ~0xfUL, regs); - goto ldst_done; + if (!(regs->msr & MSR_PR) && !(regs->msr & MSR_VEC)) + return 0; + err = do_vec_store(op->reg, ea, size, regs, cross_endian); + break; #endif #ifdef CONFIG_VSX - case STORE_VSX: - err = do_vsx_store(op.reg, do_stxvd2x, op.ea, regs); - goto ldst_done; + case STORE_VSX: { + unsigned long msrbit = MSR_VSX; + + /* + * Some VSX instructions check the MSR_VEC bit rather than MSR_VSX + * when the target of the instruction is a vector register. + */ + if (op->reg >= 32 && (op->vsx_flags & VSX_CHECK_VEC)) + msrbit = MSR_VEC; + if (!(regs->msr & MSR_PR) && !(regs->msr & msrbit)) + return 0; + err = do_vsx_store(op, ea, regs, cross_endian); + break; + } #endif case STORE_MULTI: - if (regs->msr & MSR_LE) - return 0; - rd = op.reg; + if (!address_ok(regs, ea, size)) + return -EFAULT; + rd = op->reg; for (i = 0; i < size; i += 4) { - val = regs->gpr[rd]; + unsigned int v32 = regs->gpr[rd]; + nb = size - i; if (nb > 4) nb = 4; - else - val >>= 32 - 8 * nb; - err = write_mem(val, op.ea, nb, regs); + if (unlikely(cross_endian)) + v32 = byterev_4(v32); + err = copy_mem_out((u8 *) &v32, ea, nb, regs); if (err) - return 0; - op.ea += 4; - ++rd; + break; + ea += 4; + /* reg number wraps from 31 to 0 for stsw[ix] */ + rd = (rd + 1) & 0x1f; + } + break; + + default: + return -EINVAL; + } + + if (err) + return err; + + if (op->type & UPDATE) + regs->gpr[op->update_reg] = op->ea; + + return 0; +} +NOKPROBE_SYMBOL(emulate_loadstore); + +/* + * Emulate instructions that cause a transfer of control, + * loads and stores, and a few other instructions. + * Returns 1 if the step was emulated, 0 if not, + * or -1 if the instruction is one that should not be stepped, + * such as an rfid, or a mtmsrd that would clear MSR_RI. + */ +int emulate_step(struct pt_regs *regs, unsigned int instr) +{ + struct instruction_op op; + int r, err, type; + unsigned long val; + unsigned long ea; + + r = analyse_instr(&op, regs, instr); + if (r < 0) + return r; + if (r > 0) { + emulate_update_regs(regs, &op); + return 1; + } + + err = 0; + type = op.type & INSTR_TYPE_MASK; + + if (OP_IS_LOAD_STORE(type)) { + err = emulate_loadstore(regs, &op); + if (err) + return 0; + goto instr_done; + } + + switch (type) { + case CACHEOP: + ea = truncate_if_32bit(regs->msr, op.ea); + if (!address_ok(regs, ea, 8)) + return 0; + switch (op.type & CACHEOP_MASK) { + case DCBST: + __cacheop_user_asmx(ea, err, "dcbst"); + break; + case DCBF: + __cacheop_user_asmx(ea, err, "dcbf"); + break; + case DCBTST: + if (op.reg == 0) + prefetchw((void *) ea); + break; + case DCBT: + if (op.reg == 0) + prefetch((void *) ea); + break; + case ICBI: + __cacheop_user_asmx(ea, err, "icbi"); + break; + case DCBZ: + err = emulate_dcbz(ea, regs); + break; + } + if (err) { + regs->dar = ea; + return 0; } goto instr_done; @@ -1998,12 +3060,6 @@ int emulate_step(struct pt_regs *regs, unsigned int instr) } return 0; - ldst_done: - if (err) - return 0; - if (op.type & UPDATE) - regs->gpr[op.update_reg] = op.ea; - instr_done: regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4); return 1; diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S index d5b4d9498c54..56aac4c22025 100644 --- a/arch/powerpc/lib/string_64.S +++ b/arch/powerpc/lib/string_64.S @@ -184,7 +184,7 @@ err1; std r0,8(r3) mtctr r6 mr r8,r3 14: -err1; dcbz r0,r3 +err1; dcbz 0,r3 add r3,r3,r9 bdnz 14b diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index f4c6472f2fc4..f29212e40f40 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -22,8 +22,11 @@ extern int __map_without_ltlbs; +static unsigned long block_mapped_ram; + /* - * Return PA for this VA if it is in IMMR area, or 0 + * Return PA for this VA if it is in an area mapped with LTLBs. + * Otherwise, returns 0 */ phys_addr_t v_block_mapped(unsigned long va) { @@ -33,11 +36,13 @@ phys_addr_t v_block_mapped(unsigned long va) return 0; if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE) return p + va - VIRT_IMMR_BASE; + if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram) + return __pa(va); return 0; } /* - * Return VA for a given PA or 0 if not mapped + * Return VA for a given PA mapped with LTLBs or 0 if not mapped */ unsigned long p_block_mapped(phys_addr_t pa) { @@ -47,6 +52,8 @@ unsigned long p_block_mapped(phys_addr_t pa) return 0; if (pa >= p && pa < p + IMMR_SIZE) return VIRT_IMMR_BASE + pa - p; + if (pa < block_mapped_ram) + return (unsigned long)__va(pa); return 0; } @@ -58,7 +65,7 @@ unsigned long p_block_mapped(phys_addr_t pa) void __init MMU_init_hw(void) { /* PIN up to the 3 first 8Mb after IMMR in DTLB table */ -#ifdef CONFIG_PIN_TLB +#ifdef CONFIG_PIN_TLB_DATA unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000; unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SHARED | _PAGE_DIRTY; #ifdef CONFIG_PIN_TLB_IMMR @@ -80,7 +87,7 @@ void __init MMU_init_hw(void) #endif } -static void mmu_mapin_immr(void) +static void __init mmu_mapin_immr(void) { unsigned long p = PHYS_IMMR_BASE; unsigned long v = VIRT_IMMR_BASE; @@ -96,8 +103,11 @@ static void mmu_mapin_immr(void) extern unsigned int DTLBMiss_jmp; #endif extern unsigned int DTLBMiss_cmp, FixupDAR_cmp; +#ifndef CONFIG_PIN_TLB_TEXT +extern unsigned int ITLBMiss_cmp; +#endif -void mmu_patch_cmp_limit(unsigned int *addr, unsigned long mapped) +static void __init mmu_patch_cmp_limit(unsigned int *addr, unsigned long mapped) { unsigned int instr = *addr; @@ -116,6 +126,9 @@ unsigned long __init mmu_mapin_ram(unsigned long top) #ifndef CONFIG_PIN_TLB_IMMR patch_instruction(&DTLBMiss_jmp, PPC_INST_NOP); #endif +#ifndef CONFIG_PIN_TLB_TEXT + mmu_patch_cmp_limit(&ITLBMiss_cmp, 0); +#endif } else { mapped = top & ~(LARGE_PAGE_SIZE_8M - 1); } @@ -133,11 +146,13 @@ unsigned long __init mmu_mapin_ram(unsigned long top) if (mapped) memblock_set_current_limit(mapped); + block_mapped_ram = mapped; + return mapped; } -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) +void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) { /* We don't currently support the first MEMBLOCK not mapping 0 * physical on those processors diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 7414034df1c3..fb844d2f266e 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -8,7 +8,7 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ - init-common.o + init-common.o mmu_context.o obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o @@ -22,8 +22,6 @@ ifeq ($(CONFIG_PPC_STD_MMU_64),y) obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o endif -obj-$(CONFIG_PPC_ICSWX) += icswx.o -obj-$(CONFIG_PPC_ICSWX_PID) += icswx_pid.o obj-$(CONFIG_40x) += 40x_mmu.o obj-$(CONFIG_44x) += 44x_mmu.o obj-$(CONFIG_PPC_8xx) += 8xx_mmu.o diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c index b1c144b03fcf..5c4c93dcff19 100644 --- a/arch/powerpc/mm/dump_hashpagetable.c +++ b/arch/powerpc/mm/dump_hashpagetable.c @@ -205,7 +205,7 @@ static void dump_hpte_info(struct pg_state *st, unsigned long ea, u64 v, u64 r, aps_index = calculate_pagesize(st, aps, "actual"); if (aps_index != 2) seq_printf(st->seq, "LP enc: %lx", lp); - seq_puts(st->seq, "\n"); + seq_putc(st->seq, '\n'); } diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c index 44fe4833910f..c9282d27b203 100644 --- a/arch/powerpc/mm/dump_linuxpagetables.c +++ b/arch/powerpc/mm/dump_linuxpagetables.c @@ -350,7 +350,7 @@ static void note_page(struct pg_state *st, unsigned long addr, st->current_flags, pg_level[st->level].num); - seq_puts(st->seq, "\n"); + seq_putc(st->seq, '\n'); } /* diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 4c422632047b..4797d08581ce 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -45,43 +45,39 @@ #include <asm/siginfo.h> #include <asm/debug.h> -#include "icswx.h" - -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs) +static inline bool notify_page_fault(struct pt_regs *regs) { - int ret = 0; + bool ret = false; +#ifdef CONFIG_KPROBES /* kprobe_running() needs smp_processor_id() */ if (!user_mode(regs)) { preempt_disable(); if (kprobe_running() && kprobe_fault_handler(regs, 11)) - ret = 1; + ret = true; preempt_enable(); } +#endif /* CONFIG_KPROBES */ + + if (unlikely(debugger_fault_handler(regs))) + ret = true; return ret; } -#else -static inline int notify_page_fault(struct pt_regs *regs) -{ - return 0; -} -#endif /* * Check whether the instruction at regs->nip is a store using * an update addressing form which will update r1. */ -static int store_updates_sp(struct pt_regs *regs) +static bool store_updates_sp(struct pt_regs *regs) { unsigned int inst; if (get_user(inst, (unsigned int __user *)regs->nip)) - return 0; + return false; /* check for 1 in the rA field */ if (((inst >> 16) & 0x1f) != 1) - return 0; + return false; /* check major opcode */ switch (inst >> 26) { case 37: /* stwu */ @@ -89,7 +85,7 @@ static int store_updates_sp(struct pt_regs *regs) case 45: /* sthu */ case 53: /* stfsu */ case 55: /* stfdu */ - return 1; + return true; case 62: /* std or stdu */ return (inst & 3) == 1; case 31: @@ -101,18 +97,53 @@ static int store_updates_sp(struct pt_regs *regs) case 439: /* sthux */ case 695: /* stfsux */ case 759: /* stfdux */ - return 1; + return true; } } - return 0; + return false; } /* * do_page_fault error handling helpers */ -#define MM_FAULT_RETURN 0 -#define MM_FAULT_CONTINUE -1 -#define MM_FAULT_ERR(sig) (sig) +static int +__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code) +{ + /* + * If we are in kernel mode, bail out with a SEGV, this will + * be caught by the assembly which will restore the non-volatile + * registers before calling bad_page_fault() + */ + if (!user_mode(regs)) + return SIGSEGV; + + _exception(SIGSEGV, regs, si_code, address); + + return 0; +} + +static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long address) +{ + return __bad_area_nosemaphore(regs, address, SEGV_MAPERR); +} + +static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code) +{ + struct mm_struct *mm = current->mm; + + /* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ + up_read(&mm->mmap_sem); + + return __bad_area_nosemaphore(regs, address, si_code); +} + +static noinline int bad_area(struct pt_regs *regs, unsigned long address) +{ + return __bad_area(regs, address, SEGV_MAPERR); +} static int do_sigbus(struct pt_regs *regs, unsigned long address, unsigned int fault) @@ -121,7 +152,7 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address, unsigned int lsb = 0; if (!user_mode(regs)) - return MM_FAULT_ERR(SIGBUS); + return SIGBUS; current->thread.trap_nr = BUS_ADRERR; info.si_signo = SIGBUS; @@ -142,25 +173,17 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address, #endif info.si_addr_lsb = lsb; force_sig_info(SIGBUS, &info, current); - return MM_FAULT_RETURN; + return 0; } static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) { /* - * Pagefault was interrupted by SIGKILL. We have no reason to - * continue the pagefault. + * Kernel page fault interrupted by SIGKILL. We have no reason to + * continue processing. */ - if (fatal_signal_pending(current)) { - /* Coming from kernel, we need to deal with uaccess fixups */ - if (user_mode(regs)) - return MM_FAULT_RETURN; - return MM_FAULT_ERR(SIGKILL); - } - - /* No fault: be happy */ - if (!(fault & VM_FAULT_ERROR)) - return MM_FAULT_CONTINUE; + if (fatal_signal_pending(current) && !user_mode(regs)) + return SIGKILL; /* Out of memory */ if (fault & VM_FAULT_OOM) { @@ -169,19 +192,176 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) * made us unable to handle the page fault gracefully. */ if (!user_mode(regs)) - return MM_FAULT_ERR(SIGKILL); + return SIGSEGV; pagefault_out_of_memory(); - return MM_FAULT_RETURN; + } else { + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) + return do_sigbus(regs, addr, fault); + else if (fault & VM_FAULT_SIGSEGV) + return bad_area_nosemaphore(regs, addr); + else + BUG(); + } + return 0; +} + +/* Is this a bad kernel fault ? */ +static bool bad_kernel_fault(bool is_exec, unsigned long error_code, + unsigned long address) +{ + if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT))) { + printk_ratelimited(KERN_CRIT "kernel tried to execute" + " exec-protected page (%lx) -" + "exploit attempt? (uid: %d)\n", + address, from_kuid(&init_user_ns, + current_uid())); } + return is_exec || (address >= TASK_SIZE); +} - if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) - return do_sigbus(regs, addr, fault); +static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, + struct vm_area_struct *vma, + bool store_update_sp) +{ + /* + * N.B. The POWER/Open ABI allows programs to access up to + * 288 bytes below the stack pointer. + * The kernel signal delivery code writes up to about 1.5kB + * below the stack pointer (r1) before decrementing it. + * The exec code can write slightly over 640kB to the stack + * before setting the user r1. Thus we allow the stack to + * expand to 1MB without further checks. + */ + if (address + 0x100000 < vma->vm_end) { + /* get user regs even if this fault is in kernel mode */ + struct pt_regs *uregs = current->thread.regs; + if (uregs == NULL) + return true; - /* We don't understand the fault code, this is fatal */ - BUG(); - return MM_FAULT_CONTINUE; + /* + * A user-mode access to an address a long way below + * the stack pointer is only valid if the instruction + * is one which would update the stack pointer to the + * address accessed if the instruction completed, + * i.e. either stwu rs,n(r1) or stwux rs,r1,rb + * (or the byte, halfword, float or double forms). + * + * If we don't check this then any write to the area + * between the last mapped region and the stack will + * expand the stack rather than segfaulting. + */ + if (address + 2048 < uregs->gpr[1] && !store_update_sp) + return true; + } + return false; +} + +static bool access_error(bool is_write, bool is_exec, + struct vm_area_struct *vma) +{ + /* + * Allow execution from readable areas if the MMU does not + * provide separate controls over reading and executing. + * + * Note: That code used to not be enabled for 4xx/BookE. + * It is now as I/D cache coherency for these is done at + * set_pte_at() time and I see no reason why the test + * below wouldn't be valid on those processors. This -may- + * break programs compiled with a really old ABI though. + */ + if (is_exec) { + return !(vma->vm_flags & VM_EXEC) && + (cpu_has_feature(CPU_FTR_NOEXECUTE) || + !(vma->vm_flags & (VM_READ | VM_WRITE))); + } + + if (is_write) { + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return true; + return false; + } + + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return true; + + return false; } +#ifdef CONFIG_PPC_SMLPAR +static inline void cmo_account_page_fault(void) +{ + if (firmware_has_feature(FW_FEATURE_CMO)) { + u32 page_ins; + + preempt_disable(); + page_ins = be32_to_cpu(get_lppaca()->page_ins); + page_ins += 1 << PAGE_FACTOR; + get_lppaca()->page_ins = cpu_to_be32(page_ins); + preempt_enable(); + } +} +#else +static inline void cmo_account_page_fault(void) { } +#endif /* CONFIG_PPC_SMLPAR */ + +#ifdef CONFIG_PPC_STD_MMU +static void sanity_check_fault(bool is_write, unsigned long error_code) +{ + /* + * For hash translation mode, we should never get a + * PROTFAULT. Any update to pte to reduce access will result in us + * removing the hash page table entry, thus resulting in a DSISR_NOHPTE + * fault instead of DSISR_PROTFAULT. + * + * A pte update to relax the access will not result in a hash page table + * entry invalidate and hence can result in DSISR_PROTFAULT. + * ptep_set_access_flags() doesn't do a hpte flush. This is why we have + * the special !is_write in the below conditional. + * + * For platforms that doesn't supports coherent icache and do support + * per page noexec bit, we do setup things such that we do the + * sync between D/I cache via fault. But that is handled via low level + * hash fault code (hash_page_do_lazy_icache()) and we should not reach + * here in such case. + * + * For wrong access that can result in PROTFAULT, the above vma->vm_flags + * check should handle those and hence we should fall to the bad_area + * handling correctly. + * + * For embedded with per page exec support that doesn't support coherent + * icache we do get PROTFAULT and we handle that D/I cache sync in + * set_pte_at while taking the noexec/prot fault. Hence this is WARN_ON + * is conditional for server MMU. + * + * For radix, we can get prot fault for autonuma case, because radix + * page table will have them marked noaccess for user. + */ + if (!radix_enabled() && !is_write) + WARN_ON_ONCE(error_code & DSISR_PROTFAULT); +} +#else +static void sanity_check_fault(bool is_write, unsigned long error_code) { } +#endif /* CONFIG_PPC_STD_MMU */ + +/* + * Define the correct "is_write" bit in error_code based + * on the processor family + */ +#if (defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) +#define page_fault_is_write(__err) ((__err) & ESR_DST) +#define page_fault_is_bad(__err) (0) +#else +#define page_fault_is_write(__err) ((__err) & DSISR_ISSTORE) +#if defined(CONFIG_PPC_8xx) +#define page_fault_is_bad(__err) ((__err) & DSISR_NOEXEC_OR_G) +#elif defined(CONFIG_PPC64) +#define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_64S) +#else +#define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_32S) +#endif +#endif + /* * For 600- and 800-family processors, the error_code parameter is DSISR * for a data fault, SRR1 for an instruction fault. For 400-family processors @@ -195,92 +375,56 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) * The return value is 0 if the fault was handled, or the signal * number if this is a kernel fault that can't be handled here. */ -int do_page_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) +static int __do_page_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code) { - enum ctx_state prev_state = exception_enter(); struct vm_area_struct * vma; struct mm_struct *mm = current->mm; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; - int code = SEGV_MAPERR; - int is_write = 0; - int trap = TRAP(regs); - int is_exec = trap == 0x400; + int is_exec = TRAP(regs) == 0x400; int is_user = user_mode(regs); - int fault; - int rc = 0, store_update_sp = 0; + int is_write = page_fault_is_write(error_code); + int fault, major = 0; + bool store_update_sp = false; -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) - /* - * Fortunately the bit assignments in SRR1 for an instruction - * fault and DSISR for a data fault are mostly the same for the - * bits we are interested in. But there are some bits which - * indicate errors in DSISR but can validly be set in SRR1. - */ - if (is_exec) - error_code &= 0x48200000; - else - is_write = error_code & DSISR_ISSTORE; -#else - is_write = error_code & ESR_DST; -#endif /* CONFIG_4xx || CONFIG_BOOKE */ + if (notify_page_fault(regs)) + return 0; -#ifdef CONFIG_PPC_ICSWX - /* - * we need to do this early because this "data storage - * interrupt" does not update the DAR/DEAR so we don't want to - * look at it - */ - if (error_code & ICSWX_DSI_UCT) { - rc = acop_handle_fault(regs, address, error_code); - if (rc) - goto bail; + if (unlikely(page_fault_is_bad(error_code))) { + if (is_user) { + _exception(SIGBUS, regs, BUS_OBJERR, address); + return 0; + } + return SIGBUS; } -#endif /* CONFIG_PPC_ICSWX */ - - if (notify_page_fault(regs)) - goto bail; - if (unlikely(debugger_fault_handler(regs))) - goto bail; + /* Additional sanity check(s) */ + sanity_check_fault(is_write, error_code); /* * The kernel should never take an execute fault nor should it * take a page fault to a kernel address. */ - if (!is_user && (is_exec || (address >= TASK_SIZE))) { - rc = SIGSEGV; - goto bail; - } + if (unlikely(!is_user && bad_kernel_fault(is_exec, error_code, address))) + return SIGSEGV; -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \ - defined(CONFIG_PPC_BOOK3S_64) || defined(CONFIG_PPC_8xx)) - if (error_code & DSISR_DABRMATCH) { - /* breakpoint match */ - do_break(regs, address, error_code); - goto bail; + /* + * If we're in an interrupt, have no user context or are running + * in a region with pagefaults disabled then we must not take the fault + */ + if (unlikely(faulthandler_disabled() || !mm)) { + if (is_user) + printk_ratelimited(KERN_ERR "Page fault in user mode" + " with faulthandler_disabled()=%d" + " mm=%p\n", + faulthandler_disabled(), mm); + return bad_area_nosemaphore(regs, address); } -#endif /* We restore the interrupt state now */ if (!arch_irq_disabled_regs(regs)) local_irq_enable(); - if (faulthandler_disabled() || mm == NULL) { - if (!is_user) { - rc = SIGSEGV; - goto bail; - } - /* faulthandler_disabled() in user mode is really bad, - as is current->mm == NULL. */ - printk(KERN_EMERG "Page fault in user mode with " - "faulthandler_disabled() = %d mm = %p\n", - faulthandler_disabled(), mm); - printk(KERN_EMERG "NIP = %lx MSR = %lx\n", - regs->nip, regs->msr); - die("Weird page fault", regs, SIGSEGV); - } - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* @@ -293,6 +437,10 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, if (is_user) flags |= FAULT_FLAG_USER; + if (is_write) + flags |= FAULT_FLAG_WRITE; + if (is_exec) + flags |= FAULT_FLAG_INSTRUCTION; /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -309,9 +457,9 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * source. If this is invalid we can skip the address space check, * thus avoiding the deadlock. */ - if (!down_read_trylock(&mm->mmap_sem)) { + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if (!is_user && !search_exception_tables(regs->nip)) - goto bad_area_nosemaphore; + return bad_area_nosemaphore(regs, address); retry: down_read(&mm->mmap_sem); @@ -325,122 +473,24 @@ retry: } vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) + if (unlikely(!vma)) + return bad_area(regs, address); + if (likely(vma->vm_start <= address)) goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) + return bad_area(regs, address); - /* - * N.B. The POWER/Open ABI allows programs to access up to - * 288 bytes below the stack pointer. - * The kernel signal delivery code writes up to about 1.5kB - * below the stack pointer (r1) before decrementing it. - * The exec code can write slightly over 640kB to the stack - * before setting the user r1. Thus we allow the stack to - * expand to 1MB without further checks. - */ - if (address + 0x100000 < vma->vm_end) { - /* get user regs even if this fault is in kernel mode */ - struct pt_regs *uregs = current->thread.regs; - if (uregs == NULL) - goto bad_area; + /* The stack is being expanded, check if it's valid */ + if (unlikely(bad_stack_expansion(regs, address, vma, store_update_sp))) + return bad_area(regs, address); - /* - * A user-mode access to an address a long way below - * the stack pointer is only valid if the instruction - * is one which would update the stack pointer to the - * address accessed if the instruction completed, - * i.e. either stwu rs,n(r1) or stwux rs,r1,rb - * (or the byte, halfword, float or double forms). - * - * If we don't check this then any write to the area - * between the last mapped region and the stack will - * expand the stack rather than segfaulting. - */ - if (address + 2048 < uregs->gpr[1] && !store_update_sp) - goto bad_area; - } - if (expand_stack(vma, address)) - goto bad_area; + /* Try to expand it */ + if (unlikely(expand_stack(vma, address))) + return bad_area(regs, address); good_area: - code = SEGV_ACCERR; -#if defined(CONFIG_6xx) - if (error_code & 0x95700000) - /* an error such as lwarx to I/O controller space, - address matching DABR, eciwx, etc. */ - goto bad_area; -#endif /* CONFIG_6xx */ -#if defined(CONFIG_8xx) - /* The MPC8xx seems to always set 0x80000000, which is - * "undefined". Of those that can be set, this is the only - * one which seems bad. - */ - if (error_code & 0x10000000) - /* Guarded storage error. */ - goto bad_area; -#endif /* CONFIG_8xx */ - - if (is_exec) { - /* - * Allow execution from readable areas if the MMU does not - * provide separate controls over reading and executing. - * - * Note: That code used to not be enabled for 4xx/BookE. - * It is now as I/D cache coherency for these is done at - * set_pte_at() time and I see no reason why the test - * below wouldn't be valid on those processors. This -may- - * break programs compiled with a really old ABI though. - */ - if (!(vma->vm_flags & VM_EXEC) && - (cpu_has_feature(CPU_FTR_NOEXECUTE) || - !(vma->vm_flags & (VM_READ | VM_WRITE)))) - goto bad_area; - /* a write */ - } else if (is_write) { - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - flags |= FAULT_FLAG_WRITE; - /* a read */ - } else { - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; - } -#ifdef CONFIG_PPC_STD_MMU - /* - * For hash translation mode, we should never get a - * PROTFAULT. Any update to pte to reduce access will result in us - * removing the hash page table entry, thus resulting in a DSISR_NOHPTE - * fault instead of DSISR_PROTFAULT. - * - * A pte update to relax the access will not result in a hash page table - * entry invalidate and hence can result in DSISR_PROTFAULT. - * ptep_set_access_flags() doesn't do a hpte flush. This is why we have - * the special !is_write in the below conditional. - * - * For platforms that doesn't supports coherent icache and do support - * per page noexec bit, we do setup things such that we do the - * sync between D/I cache via fault. But that is handled via low level - * hash fault code (hash_page_do_lazy_icache()) and we should not reach - * here in such case. - * - * For wrong access that can result in PROTFAULT, the above vma->vm_flags - * check should handle those and hence we should fall to the bad_area - * handling correctly. - * - * For embedded with per page exec support that doesn't support coherent - * icache we do get PROTFAULT and we handle that D/I cache sync in - * set_pte_at while taking the noexec/prot fault. Hence this is WARN_ON - * is conditional for server MMU. - * - * For radix, we can get prot fault for autonuma case, because radix - * page table will have them marked noaccess for user. - */ - if (!radix_enabled() && !is_write) - WARN_ON_ONCE(error_code & DSISR_PROTFAULT); -#endif /* CONFIG_PPC_STD_MMU */ + if (unlikely(access_error(is_write, is_exec, vma))) + return bad_area(regs, address); /* * If for any reason at all we couldn't handle the fault, @@ -448,6 +498,7 @@ good_area: * the fault. */ fault = handle_mm_fault(vma, address, flags); + major |= fault & VM_FAULT_MAJOR; /* * Handle the retry right now, the mmap_sem has been released in that @@ -465,64 +516,39 @@ good_area: if (!fatal_signal_pending(current)) goto retry; } - /* We will enter mm_fault_error() below */ - } else - up_read(¤t->mm->mmap_sem); - - if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { - if (fault & VM_FAULT_SIGSEGV) - goto bad_area_nosemaphore; - rc = mm_fault_error(regs, address, fault); - if (rc >= MM_FAULT_RETURN) - goto bail; - else - rc = 0; + + /* + * User mode? Just return to handle the fatal exception otherwise + * return to bad_page_fault + */ + return is_user ? 0 : SIGBUS; } + up_read(¤t->mm->mmap_sem); + + if (unlikely(fault & VM_FAULT_ERROR)) + return mm_fault_error(regs, address, fault); + /* * Major/minor page fault accounting. */ - if (fault & VM_FAULT_MAJOR) { + if (major) { current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, address); -#ifdef CONFIG_PPC_SMLPAR - if (firmware_has_feature(FW_FEATURE_CMO)) { - u32 page_ins; - - preempt_disable(); - page_ins = be32_to_cpu(get_lppaca()->page_ins); - page_ins += 1 << PAGE_FACTOR; - get_lppaca()->page_ins = cpu_to_be32(page_ins); - preempt_enable(); - } -#endif /* CONFIG_PPC_SMLPAR */ + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); + cmo_account_page_fault(); } else { current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, address); - } - - goto bail; - -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses cause a SIGSEGV */ - if (is_user) { - _exception(SIGSEGV, regs, code, address); - goto bail; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } + return 0; +} +NOKPROBE_SYMBOL(__do_page_fault); - if (is_exec && (error_code & DSISR_PROTFAULT)) - printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected" - " page (%lx) - exploit attempt? (uid: %d)\n", - address, from_kuid(&init_user_ns, current_uid())); - - rc = SIGSEGV; - -bail: +int do_page_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code) +{ + enum ctx_state prev_state = exception_enter(); + int rc = __do_page_fault(regs, address, error_code); exception_exit(prev_state); return rc; } diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index 6f962e5cb5e1..ffbd7c0bda96 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -575,7 +575,6 @@ _GLOBAL(flush_hash_pages) rlwinm r8,r8,0,31,29 /* clear HASHPTE bit */ stwcx. r8,0,r5 /* update the pte */ bne- 33b -EXPORT_SYMBOL(flush_hash_pages) /* Get the address of the primary PTE group in the hash table (r3) */ _GLOBAL(flush_hash_patch_A) @@ -634,6 +633,7 @@ _GLOBAL(flush_hash_patch_B) SYNC_601 isync blr +EXPORT_SYMBOL(flush_hash_pages) /* * Flush an entry from the TLB diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 7a20669c19e7..67ec2e927253 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -61,6 +61,7 @@ #include <asm/tm.h> #include <asm/trace.h> #include <asm/ps3.h> +#include <asm/pte-walk.h> #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) @@ -507,9 +508,9 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node, printk(KERN_INFO "Huge page(16GB) memory: " "addr = 0x%lX size = 0x%lX pages = %d\n", phys_addr, block_size, expected_pages); - if (phys_addr + (16 * GB) <= memblock_end_of_DRAM()) { + if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) { memblock_reserve(phys_addr, block_size * expected_pages); - add_gpage(phys_addr, block_size, expected_pages); + pseries_add_gpage(phys_addr, block_size, expected_pages); } return 0; } @@ -1019,6 +1020,7 @@ void __init hash__early_init_mmu(void) __kernel_virt_size = H_KERN_VIRT_SIZE; __vmalloc_start = H_VMALLOC_START; __vmalloc_end = H_VMALLOC_END; + __kernel_io_start = H_KERN_IO_START; vmemmap = (struct page *)H_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; @@ -1228,7 +1230,6 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, unsigned long vsid; pte_t *ptep; unsigned hugeshift; - const struct cpumask *tmp; int rc, user_region = 0; int psize, ssize; @@ -1280,8 +1281,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, } /* Check CPU locality */ - tmp = cpumask_of(smp_processor_id()); - if (user_region && cpumask_equal(mm_cpumask(mm), tmp)) + if (user_region && mm_is_thread_local(mm)) flags |= HPTE_LOCAL_UPDATE; #ifndef CONFIG_PPC_64K_PAGES @@ -1297,7 +1297,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, #endif /* CONFIG_PPC_64K_PAGES */ /* Get PTE and page size from page tables */ - ptep = __find_linux_pte_or_hugepte(pgdir, ea, &is_thp, &hugeshift); + ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift); if (ptep == NULL || !pte_present(*ptep)) { DBG_LOW(" no PTE !\n"); rc = 1; @@ -1526,7 +1526,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, * THP pages use update_mmu_cache_pmd. We don't do * hash preload there. Hence can ignore THP here */ - ptep = find_linux_pte_or_hugepte(pgdir, ea, NULL, &hugepage_shift); + ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift); if (!ptep) goto out_exit; @@ -1543,7 +1543,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, #endif /* CONFIG_PPC_64K_PAGES */ /* Is that local to this CPU ? */ - if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + if (mm_is_thread_local(mm)) update_flags |= HPTE_LOCAL_UPDATE; /* Hash it in */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index e1bf5ca397fe..1571a498a33f 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -24,6 +24,8 @@ #include <asm/tlb.h> #include <asm/setup.h> #include <asm/hugetlb.h> +#include <asm/pte-walk.h> + #ifdef CONFIG_HUGETLB_PAGE @@ -36,32 +38,15 @@ unsigned int HPAGE_SHIFT; EXPORT_SYMBOL(HPAGE_SHIFT); -/* - * Tracks gpages after the device tree is scanned and before the - * huge_boot_pages list is ready. On non-Freescale implementations, this is - * just used to track 16G pages and so is a single array. FSL-based - * implementations may have more than one gpage size, so we need multiple - * arrays - */ -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) -#define MAX_NUMBER_GPAGES 128 -struct psize_gpages { - u64 gpage_list[MAX_NUMBER_GPAGES]; - unsigned int nr_gpages; -}; -static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT]; -#else -#define MAX_NUMBER_GPAGES 1024 -static u64 gpage_freearray[MAX_NUMBER_GPAGES]; -static unsigned nr_gpages; -#endif - #define hugepd_none(hpd) (hpd_val(hpd) == 0) pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { - /* Only called for hugetlbfs pages, hence can ignore THP */ - return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL); + /* + * Only called for hugetlbfs pages, hence can ignore THP and the + * irq disabled walk. + */ + return __find_linux_pte(mm->pgd, addr, NULL, NULL); } static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, @@ -210,145 +195,20 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz return hugepte_offset(*hpdp, addr, pdshift); } -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) -/* Build list of addresses of gigantic pages. This function is used in early - * boot before the buddy allocator is setup. - */ -void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) -{ - unsigned int idx = shift_to_mmu_psize(__ffs(page_size)); - int i; - - if (addr == 0) - return; - - gpage_freearray[idx].nr_gpages = number_of_pages; - - for (i = 0; i < number_of_pages; i++) { - gpage_freearray[idx].gpage_list[i] = addr; - addr += page_size; - } -} - -/* - * Moves the gigantic page addresses from the temporary list to the - * huge_boot_pages list. - */ -int alloc_bootmem_huge_page(struct hstate *hstate) -{ - struct huge_bootmem_page *m; - int idx = shift_to_mmu_psize(huge_page_shift(hstate)); - int nr_gpages = gpage_freearray[idx].nr_gpages; - - if (nr_gpages == 0) - return 0; - -#ifdef CONFIG_HIGHMEM - /* - * If gpages can be in highmem we can't use the trick of storing the - * data structure in the page; allocate space for this - */ - m = memblock_virt_alloc(sizeof(struct huge_bootmem_page), 0); - m->phys = gpage_freearray[idx].gpage_list[--nr_gpages]; -#else - m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]); -#endif - - list_add(&m->list, &huge_boot_pages); - gpage_freearray[idx].nr_gpages = nr_gpages; - gpage_freearray[idx].gpage_list[nr_gpages] = 0; - m->hstate = hstate; - - return 1; -} +#ifdef CONFIG_PPC_BOOK3S_64 /* - * Scan the command line hugepagesz= options for gigantic pages; store those in - * a list that we use to allocate the memory once all options are parsed. + * Tracks gpages after the device tree is scanned and before the + * huge_boot_pages list is ready on pseries. */ - -unsigned long gpage_npages[MMU_PAGE_COUNT]; - -static int __init do_gpage_early_setup(char *param, char *val, - const char *unused, void *arg) -{ - static phys_addr_t size; - unsigned long npages; - - /* - * The hugepagesz and hugepages cmdline options are interleaved. We - * use the size variable to keep track of whether or not this was done - * properly and skip over instances where it is incorrect. Other - * command-line parsing code will issue warnings, so we don't need to. - * - */ - if ((strcmp(param, "default_hugepagesz") == 0) || - (strcmp(param, "hugepagesz") == 0)) { - size = memparse(val, NULL); - } else if (strcmp(param, "hugepages") == 0) { - if (size != 0) { - if (sscanf(val, "%lu", &npages) <= 0) - npages = 0; - if (npages > MAX_NUMBER_GPAGES) { - pr_warn("MMU: %lu pages requested for page " -#ifdef CONFIG_PHYS_ADDR_T_64BIT - "size %llu KB, limiting to " -#else - "size %u KB, limiting to " -#endif - __stringify(MAX_NUMBER_GPAGES) "\n", - npages, size / 1024); - npages = MAX_NUMBER_GPAGES; - } - gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages; - size = 0; - } - } - return 0; -} - +#define MAX_NUMBER_GPAGES 1024 +__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES]; +__initdata static unsigned nr_gpages; /* - * This function allocates physical space for pages that are larger than the - * buddy allocator can handle. We want to allocate these in highmem because - * the amount of lowmem is limited. This means that this function MUST be - * called before lowmem_end_addr is set up in MMU_init() in order for the lmb - * allocate to grab highmem. - */ -void __init reserve_hugetlb_gpages(void) -{ - static __initdata char cmdline[COMMAND_LINE_SIZE]; - phys_addr_t size, base; - int i; - - strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE); - parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0, - NULL, &do_gpage_early_setup); - - /* - * Walk gpage list in reverse, allocating larger page sizes first. - * Skip over unsupported sizes, or sizes that have 0 gpages allocated. - * When we reach the point in the list where pages are no longer - * considered gpages, we're done. - */ - for (i = MMU_PAGE_COUNT-1; i >= 0; i--) { - if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0) - continue; - else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT)) - break; - - size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i)); - base = memblock_alloc_base(size * gpage_npages[i], size, - MEMBLOCK_ALLOC_ANYWHERE); - add_gpage(base, size, gpage_npages[i]); - } -} - -#else /* !PPC_FSL_BOOK3E */ - -/* Build list of addresses of gigantic pages. This function is used in early + * Build list of addresses of gigantic pages. This function is used in early * boot before the buddy allocator is setup. */ -void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) +void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) { if (!addr) return; @@ -360,10 +220,7 @@ void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) } } -/* Moves the gigantic page addresses from the temporary list to the - * huge_boot_pages list. - */ -int alloc_bootmem_huge_page(struct hstate *hstate) +int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate) { struct huge_bootmem_page *m; if (nr_gpages == 0) @@ -376,6 +233,17 @@ int alloc_bootmem_huge_page(struct hstate *hstate) } #endif + +int __init alloc_bootmem_huge_page(struct hstate *h) +{ + +#ifdef CONFIG_PPC_BOOK3S_64 + if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled()) + return pseries_alloc_bootmem_huge_page(h); +#endif + return __alloc_bootmem_huge_page(h); +} + #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) #define HUGEPD_FREELIST_SIZE \ ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) @@ -407,8 +275,7 @@ static void hugepd_free(struct mmu_gather *tlb, void *hugepte) batchp = &get_cpu_var(hugepd_freelist_cur); if (atomic_read(&tlb->mm->mm_users) < 2 || - cpumask_equal(mm_cpumask(tlb->mm), - cpumask_of(smp_processor_id()))) { + mm_is_thread_local(tlb->mm)) { kmem_cache_free(hugepte_cache, hugepte); put_cpu_var(hugepd_freelist_cur); return; @@ -886,9 +753,8 @@ void flush_dcache_icache_hugepage(struct page *page) * This function need to be called with interrupts disabled. We use this variant * when we have MSR[EE] = 0 but the paca->soft_enabled = 1 */ - -pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, - bool *is_thp, unsigned *shift) +pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, + bool *is_thp, unsigned *hpage_shift) { pgd_t pgd, *pgdp; pud_t pud, *pudp; @@ -897,8 +763,8 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, hugepd_t *hpdp = NULL; unsigned pdshift = PGDIR_SHIFT; - if (shift) - *shift = 0; + if (hpage_shift) + *hpage_shift = 0; if (is_thp) *is_thp = false; @@ -968,16 +834,15 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, ret_pte = hugepte_offset(*hpdp, ea, pdshift); pdshift = hugepd_shift(*hpdp); out: - if (shift) - *shift = pdshift; + if (hpage_shift) + *hpage_shift = pdshift; return ret_pte; } -EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte); +EXPORT_SYMBOL_GPL(__find_linux_pte); int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - unsigned long mask; unsigned long pte_end; struct page *head, *page; pte_t pte; @@ -988,18 +853,10 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, end = pte_end; pte = READ_ONCE(*ptep); - mask = _PAGE_PRESENT | _PAGE_READ; - /* - * On some CPUs like the 8xx, _PAGE_RW hence _PAGE_WRITE is defined - * as 0 and _PAGE_RO has to be set when a page is not writable - */ - if (write) - mask |= _PAGE_WRITE; - else - mask |= _PAGE_RO; - - if ((pte_val(pte) & mask) != mask) + if (!pte_present(pte) || !pte_read(pte)) + return 0; + if (write && !pte_write(pte)) return 0; /* hugepages are never "special" */ diff --git a/arch/powerpc/mm/icswx.c b/arch/powerpc/mm/icswx.c deleted file mode 100644 index 1fa794d7d59f..000000000000 --- a/arch/powerpc/mm/icswx.c +++ /dev/null @@ -1,292 +0,0 @@ -/* - * ICSWX and ACOP Management - * - * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/module.h> -#include <linux/uaccess.h> - -#include "icswx.h" - -/* - * The processor and its L2 cache cause the icswx instruction to - * generate a COP_REQ transaction on PowerBus. The transaction has no - * address, and the processor does not perform an MMU access to - * authenticate the transaction. The command portion of the PowerBus - * COP_REQ transaction includes the LPAR_ID (LPID) and the coprocessor - * Process ID (PID), which the coprocessor compares to the authorized - * LPID and PID held in the coprocessor, to determine if the process - * is authorized to generate the transaction. The data of the COP_REQ - * transaction is 128-byte or less in size and is placed in cacheable - * memory on a 128-byte cache line boundary. - * - * The task to use a coprocessor should use use_cop() to mark the use - * of the Coprocessor Type (CT) and context switching. On a server - * class processor, the PID register is used only for coprocessor - * management + * and so a coprocessor PID is allocated before - * executing icswx + * instruction. Drop_cop() is used to free the - * coprocessor PID. - * - * Example: - * Host Fabric Interface (HFI) is a PowerPC network coprocessor. - * Each HFI have multiple windows. Each HFI window serves as a - * network device sending to and receiving from HFI network. - * HFI immediate send function uses icswx instruction. The immediate - * send function allows small (single cache-line) packets be sent - * without using the regular HFI send FIFO and doorbell, which are - * much slower than immediate send. - * - * For each task intending to use HFI immediate send, the HFI driver - * calls use_cop() to obtain a coprocessor PID for the task. - * The HFI driver then allocate a free HFI window and save the - * coprocessor PID to the HFI window to allow the task to use the - * HFI window. - * - * The HFI driver repeatedly creates immediate send packets and - * issues icswx instruction to send data through the HFI window. - * The HFI compares the coprocessor PID in the CPU PID register - * to the PID held in the HFI window to determine if the transaction - * is allowed. - * - * When the task to release the HFI window, the HFI driver calls - * drop_cop() to release the coprocessor PID. - */ - -void switch_cop(struct mm_struct *next) -{ -#ifdef CONFIG_PPC_ICSWX_PID - mtspr(SPRN_PID, next->context.cop_pid); -#endif - mtspr(SPRN_ACOP, next->context.acop); -} - -/** - * Start using a coprocessor. - * @acop: mask of coprocessor to be used. - * @mm: The mm the coprocessor to associate with. Most likely current mm. - * - * Return a positive PID if successful. Negative errno otherwise. - * The returned PID will be fed to the coprocessor to determine if an - * icswx transaction is authenticated. - */ -int use_cop(unsigned long acop, struct mm_struct *mm) -{ - int ret; - - if (!cpu_has_feature(CPU_FTR_ICSWX)) - return -ENODEV; - - if (!mm || !acop) - return -EINVAL; - - /* The page_table_lock ensures mm_users won't change under us */ - spin_lock(&mm->page_table_lock); - spin_lock(mm->context.cop_lockp); - - ret = get_cop_pid(mm); - if (ret < 0) - goto out; - - /* update acop */ - mm->context.acop |= acop; - - sync_cop(mm); - - /* - * If this is a threaded process then there might be other threads - * running. We need to send an IPI to force them to pick up any - * change in PID and ACOP. - */ - if (atomic_read(&mm->mm_users) > 1) - smp_call_function(sync_cop, mm, 1); - -out: - spin_unlock(mm->context.cop_lockp); - spin_unlock(&mm->page_table_lock); - - return ret; -} -EXPORT_SYMBOL_GPL(use_cop); - -/** - * Stop using a coprocessor. - * @acop: mask of coprocessor to be stopped. - * @mm: The mm the coprocessor associated with. - */ -void drop_cop(unsigned long acop, struct mm_struct *mm) -{ - int free_pid; - - if (!cpu_has_feature(CPU_FTR_ICSWX)) - return; - - if (WARN_ON_ONCE(!mm)) - return; - - /* The page_table_lock ensures mm_users won't change under us */ - spin_lock(&mm->page_table_lock); - spin_lock(mm->context.cop_lockp); - - mm->context.acop &= ~acop; - - free_pid = disable_cop_pid(mm); - sync_cop(mm); - - /* - * If this is a threaded process then there might be other threads - * running. We need to send an IPI to force them to pick up any - * change in PID and ACOP. - */ - if (atomic_read(&mm->mm_users) > 1) - smp_call_function(sync_cop, mm, 1); - - if (free_pid != COP_PID_NONE) - free_cop_pid(free_pid); - - spin_unlock(mm->context.cop_lockp); - spin_unlock(&mm->page_table_lock); -} -EXPORT_SYMBOL_GPL(drop_cop); - -static int acop_use_cop(int ct) -{ - /* There is no alternate policy, yet */ - return -1; -} - -/* - * Get the instruction word at the NIP - */ -static u32 acop_get_inst(struct pt_regs *regs) -{ - u32 inst; - u32 __user *p; - - p = (u32 __user *)regs->nip; - if (!access_ok(VERIFY_READ, p, sizeof(*p))) - return 0; - - if (__get_user(inst, p)) - return 0; - - return inst; -} - -/** - * @regs: registers at time of interrupt - * @address: storage address - * @error_code: Fault code, usually the DSISR or ESR depending on - * processor type - * - * Return 0 if we are able to resolve the data storage fault that - * results from a CT miss in the ACOP register. - */ -int acop_handle_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) -{ - int ct; - u32 inst = 0; - - if (!cpu_has_feature(CPU_FTR_ICSWX)) { - pr_info("No coprocessors available"); - _exception(SIGILL, regs, ILL_ILLOPN, address); - } - - if (!user_mode(regs)) { - /* this could happen if the HV denies the - * kernel access, for now we just die */ - die("ICSWX from kernel failed", regs, SIGSEGV); - } - - /* Some implementations leave us a hint for the CT */ - ct = ICSWX_GET_CT_HINT(error_code); - if (ct < 0) { - /* we have to peek at the instruction word to figure out CT */ - u32 ccw; - u32 rs; - - inst = acop_get_inst(regs); - if (inst == 0) - return -1; - - rs = (inst >> (31 - 10)) & 0x1f; - ccw = regs->gpr[rs]; - ct = (ccw >> 16) & 0x3f; - } - - /* - * We could be here because another thread has enabled acop - * but the ACOP register has yet to be updated. - * - * This should have been taken care of by the IPI to sync all - * the threads (see smp_call_function(sync_cop, mm, 1)), but - * that could take forever if there are a significant amount - * of threads. - * - * Given the number of threads on some of these systems, - * perhaps this is the best way to sync ACOP rather than whack - * every thread with an IPI. - */ - if ((acop_copro_type_bit(ct) & current->active_mm->context.acop) != 0) { - sync_cop(current->active_mm); - return 0; - } - - /* check for alternate policy */ - if (!acop_use_cop(ct)) - return 0; - - /* at this point the CT is unknown to the system */ - pr_warn("%s[%d]: Coprocessor %d is unavailable\n", - current->comm, current->pid, ct); - - /* get inst if we don't already have it */ - if (inst == 0) { - inst = acop_get_inst(regs); - if (inst == 0) - return -1; - } - - /* Check if the instruction is the "record form" */ - if (inst & 1) { - /* - * the instruction is "record" form so we can reject - * using CR0 - */ - regs->ccr &= ~(0xful << 28); - regs->ccr |= ICSWX_RC_NOT_FOUND << 28; - - /* Move on to the next instruction */ - regs->nip += 4; - } else { - /* - * There is no architected mechanism to report a bad - * CT so we could either SIGILL or report nothing. - * Since the non-record version should only bu used - * for "hints" or "don't care" we should probably do - * nothing. However, I could see how some people - * might want an SIGILL so it here if you want it. - */ -#ifdef CONFIG_PPC_ICSWX_USE_SIGILL - _exception(SIGILL, regs, ILL_ILLOPN, address); -#else - regs->nip += 4; -#endif - } - - return 0; -} -EXPORT_SYMBOL_GPL(acop_handle_fault); diff --git a/arch/powerpc/mm/icswx.h b/arch/powerpc/mm/icswx.h deleted file mode 100644 index 6dedc08e62c8..000000000000 --- a/arch/powerpc/mm/icswx.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef _ARCH_POWERPC_MM_ICSWX_H_ -#define _ARCH_POWERPC_MM_ICSWX_H_ - -/* - * ICSWX and ACOP Management - * - * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include <asm/mmu_context.h> - -/* also used to denote that PIDs are not used */ -#define COP_PID_NONE 0 - -static inline void sync_cop(void *arg) -{ - struct mm_struct *mm = arg; - - if (mm == current->active_mm) - switch_cop(current->active_mm); -} - -#ifdef CONFIG_PPC_ICSWX_PID -extern int get_cop_pid(struct mm_struct *mm); -extern int disable_cop_pid(struct mm_struct *mm); -extern void free_cop_pid(int free_pid); -#else -#define get_cop_pid(m) (COP_PID_NONE) -#define disable_cop_pid(m) (COP_PID_NONE) -#define free_cop_pid(p) -#endif - -/* - * These are implementation bits for architected registers. If this - * ever becomes architecture the should be moved to reg.h et. al. - */ -/* UCT is the same bit for Server and Embedded */ -#define ICSWX_DSI_UCT 0x00004000 /* Unavailable Coprocessor Type */ - -#ifdef CONFIG_PPC_BOOK3E -/* Embedded implementation gives us no hints as to what the CT is */ -#define ICSWX_GET_CT_HINT(x) (-1) -#else -/* Server implementation contains the CT value in the DSISR */ -#define ICSWX_DSISR_CTMASK 0x00003f00 -#define ICSWX_GET_CT_HINT(x) (((x) & ICSWX_DSISR_CTMASK) >> 8) -#endif - -#define ICSWX_RC_STARTED 0x8 /* The request has been started */ -#define ICSWX_RC_NOT_IDLE 0x4 /* No coprocessor found idle */ -#define ICSWX_RC_NOT_FOUND 0x2 /* No coprocessor found */ -#define ICSWX_RC_UNDEFINED 0x1 /* Reserved */ - -extern int acop_handle_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code); - -static inline u64 acop_copro_type_bit(unsigned int type) -{ - return 1ULL << (63 - type); -} - -#endif /* !_ARCH_POWERPC_MM_ICSWX_H_ */ diff --git a/arch/powerpc/mm/icswx_pid.c b/arch/powerpc/mm/icswx_pid.c deleted file mode 100644 index 91e30eb7d054..000000000000 --- a/arch/powerpc/mm/icswx_pid.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * ICSWX and ACOP/PID Management - * - * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/idr.h> -#include <linux/module.h> -#include "icswx.h" - -#define COP_PID_MIN (COP_PID_NONE + 1) -#define COP_PID_MAX (0xFFFF) - -static DEFINE_SPINLOCK(mmu_context_acop_lock); -static DEFINE_IDA(cop_ida); - -static int new_cop_pid(struct ida *ida, int min_id, int max_id, - spinlock_t *lock) -{ - int index; - int err; - -again: - if (!ida_pre_get(ida, GFP_KERNEL)) - return -ENOMEM; - - spin_lock(lock); - err = ida_get_new_above(ida, min_id, &index); - spin_unlock(lock); - - if (err == -EAGAIN) - goto again; - else if (err) - return err; - - if (index > max_id) { - spin_lock(lock); - ida_remove(ida, index); - spin_unlock(lock); - return -ENOMEM; - } - - return index; -} - -int get_cop_pid(struct mm_struct *mm) -{ - int pid; - - if (mm->context.cop_pid == COP_PID_NONE) { - pid = new_cop_pid(&cop_ida, COP_PID_MIN, COP_PID_MAX, - &mmu_context_acop_lock); - if (pid >= 0) - mm->context.cop_pid = pid; - } - return mm->context.cop_pid; -} - -int disable_cop_pid(struct mm_struct *mm) -{ - int free_pid = COP_PID_NONE; - - if ((!mm->context.acop) && (mm->context.cop_pid != COP_PID_NONE)) { - free_pid = mm->context.cop_pid; - mm->context.cop_pid = COP_PID_NONE; - } - return free_pid; -} - -void free_cop_pid(int free_pid) -{ - spin_lock(&mmu_context_acop_lock); - ida_remove(&cop_ida, free_pid); - spin_unlock(&mmu_context_acop_lock); -} diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 8a7c38b8d335..6419b33ca309 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -113,6 +113,12 @@ void __init MMU_setup(void) __map_without_bats = 1; __map_without_ltlbs = 1; } +#ifdef CONFIG_STRICT_KERNEL_RWX + if (rodata_enabled) { + __map_without_bats = 1; + __map_without_ltlbs = 1; + } +#endif } /* @@ -132,8 +138,6 @@ void __init MMU_init(void) * Reserve gigantic pages for hugetlb. This MUST occur before * lowmem_end_addr is initialized below. */ - reserve_hugetlb_gpages(); - if (memblock.memory.cnt > 1) { #ifndef CONFIG_WII memblock_enforce_memory_limit(memblock.memory.regions[0].size); diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 5b4c25d12ff3..588a521966ec 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -356,7 +356,7 @@ struct page *realmode_pfn_to_page(unsigned long pfn) } EXPORT_SYMBOL_GPL(realmode_pfn_to_page); -#elif defined(CONFIG_FLATMEM) +#else struct page *realmode_pfn_to_page(unsigned long pfn) { @@ -365,7 +365,7 @@ struct page *realmode_pfn_to_page(unsigned long pfn) } EXPORT_SYMBOL_GPL(realmode_pfn_to_page); -#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */ +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ #ifdef CONFIG_PPC_STD_MMU_64 static bool disable_radix; @@ -381,7 +381,7 @@ early_param("disable_radix", parse_disable_radix); * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do * radix. If not, we clear the radix feature bit so we fall back to hash. */ -static void early_check_vec5(void) +static void __init early_check_vec5(void) { unsigned long root, chosen; int size; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 46b4e67d2372..4362b86ef84c 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -436,7 +436,7 @@ void flush_dcache_icache_page(struct page *page) return; } #endif -#if defined(CONFIG_8xx) || defined(CONFIG_PPC64) +#if defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC64) /* On 8xx there is no need to kmap since highmem is not supported */ __flush_dcache_icache(page_address(page)); #else diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c new file mode 100644 index 000000000000..0f613bc63c50 --- /dev/null +++ b/arch/powerpc/mm/mmu_context.c @@ -0,0 +1,99 @@ +/* + * Common implementation of switch_mm_irqs_off + * + * Copyright IBM Corp. 2017 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/mm.h> +#include <linux/cpu.h> + +#include <asm/mmu_context.h> + +#if defined(CONFIG_PPC32) +static inline void switch_mm_pgdir(struct task_struct *tsk, + struct mm_struct *mm) +{ + /* 32-bit keeps track of the current PGDIR in the thread struct */ + tsk->thread.pgdir = mm->pgd; +} +#elif defined(CONFIG_PPC_BOOK3E_64) +static inline void switch_mm_pgdir(struct task_struct *tsk, + struct mm_struct *mm) +{ + /* 64-bit Book3E keeps track of current PGD in the PACA */ + get_paca()->pgd = mm->pgd; +} +#else +static inline void switch_mm_pgdir(struct task_struct *tsk, + struct mm_struct *mm) { } +#endif + +#ifdef CONFIG_PPC_BOOK3S_64 +static inline void inc_mm_active_cpus(struct mm_struct *mm) +{ + atomic_inc(&mm->context.active_cpus); +} +#else +static inline void inc_mm_active_cpus(struct mm_struct *mm) { } +#endif + +void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + bool new_on_cpu = false; + + /* Mark this context has been used on the new CPU */ + if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) { + cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); + inc_mm_active_cpus(next); + + /* + * This full barrier orders the store to the cpumask above vs + * a subsequent operation which allows this CPU to begin loading + * translations for next. + * + * When using the radix MMU that operation is the load of the + * MMU context id, which is then moved to SPRN_PID. + * + * For the hash MMU it is either the first load from slb_cache + * in switch_slb(), and/or the store of paca->mm_ctx_id in + * copy_mm_to_paca(). + * + * On the read side the barrier is in pte_xchg(), which orders + * the store to the PTE vs the load of mm_cpumask. + */ + smp_mb(); + + new_on_cpu = true; + } + + /* Some subarchs need to track the PGD elsewhere */ + switch_mm_pgdir(tsk, next); + + /* Nothing else to do if we aren't actually switching */ + if (prev == next) + return; + + /* + * We must stop all altivec streams before changing the HW + * context + */ + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + asm volatile ("dssall"); + + if (new_on_cpu) + radix_kvm_prefetch_workaround(next); + + /* + * The actual HW switching method differs between the various + * sub architectures. Out of line for now + */ + switch_mmu_context(prev, next, tsk); +} + diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index a75f63833284..05e15386d4cb 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -25,8 +25,6 @@ #include <asm/mmu_context.h> #include <asm/pgalloc.h> -#include "icswx.h" - static DEFINE_SPINLOCK(mmu_context_lock); static DEFINE_IDA(mmu_context_ida); @@ -165,16 +163,6 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) return index; mm->context.id = index; -#ifdef CONFIG_PPC_ICSWX - mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL); - if (!mm->context.cop_lockp) { - __destroy_context(index); - subpage_prot_free(mm); - mm->context.id = MMU_NO_CONTEXT; - return -ENOMEM; - } - spin_lock_init(mm->context.cop_lockp); -#endif /* CONFIG_PPC_ICSWX */ #ifdef CONFIG_PPC_64K_PAGES mm->context.pte_frag = NULL; @@ -182,6 +170,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) #ifdef CONFIG_SPAPR_TCE_IOMMU mm_iommu_init(mm); #endif + atomic_set(&mm->context.active_cpus, 0); + return 0; } @@ -226,12 +216,6 @@ void destroy_context(struct mm_struct *mm) #ifdef CONFIG_SPAPR_TCE_IOMMU WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list)); #endif -#ifdef CONFIG_PPC_ICSWX - drop_cop(mm->context.acop, mm); - kfree(mm->context.cop_lockp); - mm->context.cop_lockp = NULL; -#endif /* CONFIG_PPC_ICSWX */ - if (radix_enabled()) { /* * Radix doesn't have a valid bit in the process table diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index d46128b22150..57fbc554c785 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -27,7 +27,7 @@ /* * On 40x and 8xx, we directly inline tlbia and tlbivax */ -#if defined(CONFIG_40x) || defined(CONFIG_8xx) +#if defined(CONFIG_40x) || defined(CONFIG_PPC_8xx) static inline void _tlbil_all(void) { asm volatile ("sync; tlbia; isync" : : : "memory"); @@ -38,7 +38,7 @@ static inline void _tlbil_pid(unsigned int pid) } #define _tlbil_pid_noind(pid) _tlbil_pid(pid) -#else /* CONFIG_40x || CONFIG_8xx */ +#else /* CONFIG_40x || CONFIG_PPC_8xx */ extern void _tlbil_all(void); extern void _tlbil_pid(unsigned int pid); #ifdef CONFIG_PPC_BOOK3E @@ -46,12 +46,12 @@ extern void _tlbil_pid_noind(unsigned int pid); #else #define _tlbil_pid_noind(pid) _tlbil_pid(pid) #endif -#endif /* !(CONFIG_40x || CONFIG_8xx) */ +#endif /* !(CONFIG_40x || CONFIG_PPC_8xx) */ /* * On 8xx, we directly inline tlbie, on others, it's extern */ -#ifdef CONFIG_8xx +#ifdef CONFIG_PPC_8xx static inline void _tlbil_va(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind) { @@ -67,7 +67,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, { __tlbil_va(address, pid); } -#endif /* CONFIG_8xx */ +#endif /* CONFIG_PPC_8xx */ #if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_PPC_47x) extern void _tlbivax_bcast(unsigned long address, unsigned int pid, diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index 31eed8fa8e99..3b65917785a5 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -9,6 +9,7 @@ #include <linux/sched.h> #include <linux/mm_types.h> +#include <misc/cxl-base.h> #include <asm/pgalloc.h> #include <asm/tlb.h> @@ -64,6 +65,27 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, trace_hugepage_set_pmd(addr, pmd_val(pmd)); return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); } + +static void do_nothing(void *unused) +{ + +} +/* + * Serialize against find_current_mm_pte which does lock-less + * lookup in page tables with local interrupts disabled. For huge pages + * it casts pmd_t to pte_t. Since format of pte_t is different from + * pmd_t we want to prevent transit from pmd pointing to page table + * to pmd pointing to huge page (and back) while interrupts are disabled. + * We clear pmd to possibly replace it with page table pointer in + * different code paths. So make sure we wait for the parallel + * find_current_mm_pte to finish. + */ +void serialize_against_pte_lookup(struct mm_struct *mm) +{ + smp_mb(); + smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1); +} + /* * We use this to invalidate a pmdp entry before switching from a * hugepte to regular pmd entry. @@ -77,7 +99,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, * This ensures that generic code that rely on IRQ disabling * to prevent a parallel THP split work as expected. */ - kick_all_cpus_sync(); + serialize_against_pte_lookup(vma->vm_mm); } static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 443a2c66a304..ec277913e01b 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -239,7 +239,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres * by sending an IPI to all the cpus and executing a dummy * function there. */ - kick_all_cpus_sync(); + serialize_against_pte_lookup(vma->vm_mm); /* * Now invalidate the hpte entries in the range * covered by pmd. This make sure we take a @@ -329,7 +329,6 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, unsigned int psize; unsigned long vsid; unsigned long flags = 0; - const struct cpumask *tmp; /* get the base page size,vsid and segment size */ #ifdef CONFIG_DEBUG_VM @@ -350,8 +349,7 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, ssize = mmu_kernel_ssize; } - tmp = cpumask_of(smp_processor_id()); - if (cpumask_equal(mm_cpumask(mm), tmp)) + if (mm_is_thread_local(mm)) flags |= HPTE_LOCAL_UPDATE; return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); @@ -380,16 +378,16 @@ pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, */ memset(pgtable, 0, PTE_FRAG_SIZE); /* - * Serialize against find_linux_pte_or_hugepte which does lock-less + * Serialize against find_current_mm_pte variants which does lock-less * lookup in page tables with local interrupts disabled. For huge pages * it casts pmd_t to pte_t. Since format of pte_t is different from * pmd_t we want to prevent transit from pmd pointing to page table * to pmd pointing to huge page (and back) while interrupts are disabled. * We clear pmd to possibly replace it with page table pointer in * different code paths. So make sure we wait for the parallel - * find_linux_pte_or_hugepage to finish. + * find_curren_mm_pte to finish. */ - kick_all_cpus_sync(); + serialize_against_pte_lookup(mm); return old_pmd; } diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 671a45d86c18..39c252b54d16 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -8,10 +8,15 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ + +#define pr_fmt(fmt) "radix-mmu: " fmt + +#include <linux/kernel.h> #include <linux/sched/mm.h> #include <linux/memblock.h> #include <linux/of_fdt.h> #include <linux/mm.h> +#include <linux/string_helpers.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -31,9 +36,13 @@ unsigned int mmu_base_pid; static int native_register_process_table(unsigned long base, unsigned long pg_sz, unsigned long table_size) { - unsigned long patb1 = base | table_size | PATB_GR; + unsigned long patb0, patb1; + + patb0 = be64_to_cpu(partition_tb[0].patb0); + patb1 = base | table_size | PATB_GR; + + mmu_partition_table_set_entry(0, patb0, patb1); - partition_tb->patb1 = cpu_to_be64(patb1); return 0; } @@ -179,10 +188,14 @@ static inline void __meminit print_mapping(unsigned long start, unsigned long end, unsigned long size) { + char buf[10]; + if (end <= start) return; - pr_info("Mapped range 0x%lx - 0x%lx with 0x%lx\n", start, end, size); + string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf)); + + pr_info("Mapped 0x%016lx-0x%016lx with %s pages\n", start, end, buf); } static int __meminit create_physical_mapping(unsigned long start, @@ -526,6 +539,7 @@ void __init radix__early_init_mmu(void) __kernel_virt_size = RADIX_KERN_VIRT_SIZE; __vmalloc_start = RADIX_VMALLOC_START; __vmalloc_end = RADIX_VMALLOC_END; + __kernel_io_start = RADIX_KERN_IO_START; vmemmap = (struct page *)RADIX_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; @@ -836,9 +850,12 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre */ pmd = *pmdp; pmd_clear(pmdp); + /*FIXME!! Verify whether we need this kick below */ - kick_all_cpus_sync(); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + serialize_against_pte_lookup(vma->vm_mm); + + radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); + return pmd; } @@ -897,16 +914,16 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); old_pmd = __pmd(old); /* - * Serialize against find_linux_pte_or_hugepte which does lock-less + * Serialize against find_current_mm_pte which does lock-less * lookup in page tables with local interrupts disabled. For huge pages * it casts pmd_t to pte_t. Since format of pte_t is different from * pmd_t we want to prevent transit from pmd pointing to page table * to pmd pointing to huge page (and back) while interrupts are disabled. * We clear pmd to possibly replace it with page table pointer in * different code paths. So make sure we wait for the parallel - * find_linux_pte_or_hugepage to finish. + * find_current_mm_pte to finish. */ - kick_all_cpus_sync(); + serialize_against_pte_lookup(mm); return old_pmd; } diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index a9e4bfc025bc..65eda1997c3f 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -34,6 +34,7 @@ #include <asm/fixmap.h> #include <asm/io.h> #include <asm/setup.h> +#include <asm/sections.h> #include "mmu_decl.h" @@ -242,7 +243,7 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, int flags) /* * Map in a chunk of physical memory starting at start. */ -void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) +static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) { unsigned long v, s, f; phys_addr_t p; @@ -294,7 +295,7 @@ void __init mapin_ram(void) * Returns true (1) if PTE was found, zero otherwise. The pointer to * the PTE pointer is unmodified if PTE is not found. */ -int +static int get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp) { pgd_t *pgd; @@ -323,9 +324,7 @@ get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp) return(retval); } -#ifdef CONFIG_DEBUG_PAGEALLOC - -static int __change_page_attr(struct page *page, pgprot_t prot) +static int __change_page_attr_noflush(struct page *page, pgprot_t prot) { pte_t *kpte; pmd_t *kpmd; @@ -339,8 +338,6 @@ static int __change_page_attr(struct page *page, pgprot_t prot) if (!get_pteptr(&init_mm, address, &kpte, &kpmd)) return -EINVAL; __set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0); - wmb(); - flush_tlb_page(NULL, address); pte_unmap(kpte); return 0; @@ -349,44 +346,65 @@ static int __change_page_attr(struct page *page, pgprot_t prot) /* * Change the page attributes of an page in the linear mapping. * - * THIS CONFLICTS WITH BAT MAPPINGS, DEBUG USE ONLY + * THIS DOES NOTHING WITH BAT MAPPINGS, DEBUG USE ONLY */ static int change_page_attr(struct page *page, int numpages, pgprot_t prot) { int i, err = 0; unsigned long flags; + struct page *start = page; local_irq_save(flags); for (i = 0; i < numpages; i++, page++) { - err = __change_page_attr(page, prot); + err = __change_page_attr_noflush(page, prot); if (err) break; } + wmb(); + flush_tlb_kernel_range((unsigned long)page_address(start), + (unsigned long)page_address(page)); local_irq_restore(flags); return err; } - -void __kernel_map_pages(struct page *page, int numpages, int enable) +void mark_initmem_nx(void) { - if (PageHighMem(page)) - return; + struct page *page = virt_to_page(_sinittext); + unsigned long numpages = PFN_UP((unsigned long)_einittext) - + PFN_DOWN((unsigned long)_sinittext); - change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); + change_page_attr(page, numpages, PAGE_KERNEL); } -#endif /* CONFIG_DEBUG_PAGEALLOC */ -static int fixmaps; - -void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) +#ifdef CONFIG_STRICT_KERNEL_RWX +void mark_rodata_ro(void) { - unsigned long address = __fix_to_virt(idx); + struct page *page; + unsigned long numpages; + + page = virt_to_page(_stext); + numpages = PFN_UP((unsigned long)_etext) - + PFN_DOWN((unsigned long)_stext); - if (idx >= __end_of_fixed_addresses) { - BUG(); + change_page_attr(page, numpages, PAGE_KERNEL_ROX); + /* + * mark .rodata as read only. Use __init_begin rather than __end_rodata + * to cover NOTES and EXCEPTION_TABLE. + */ + page = virt_to_page(__start_rodata); + numpages = PFN_UP((unsigned long)__init_begin) - + PFN_DOWN((unsigned long)__start_rodata); + + change_page_attr(page, numpages, PAGE_KERNEL_RO); +} +#endif + +#ifdef CONFIG_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (PageHighMem(page)) return; - } - map_kernel_page(address, phys, pgprot_val(flags)); - fixmaps++; + change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); } +#endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 0736e94c7615..ac0717a90ca6 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -104,6 +104,8 @@ unsigned long __vmalloc_start; EXPORT_SYMBOL(__vmalloc_start); unsigned long __vmalloc_end; EXPORT_SYMBOL(__vmalloc_end); +unsigned long __kernel_io_start; +EXPORT_SYMBOL(__kernel_io_start); struct page *vmemmap; EXPORT_SYMBOL(vmemmap); unsigned long __pte_frag_nr; diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index bde378559d01..906a86fe457b 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -121,12 +121,25 @@ slb_miss_kernel_load_vmemmap: 1: #endif /* CONFIG_SPARSEMEM_VMEMMAP */ - /* vmalloc mapping gets the encoding from the PACA as the mapping - * can be demoted from 64K -> 4K dynamically on some machines + /* + * r10 contains the ESID, which is the original faulting EA shifted + * right by 28 bits. We need to compare that with (H_VMALLOC_END >> 28) + * which is 0xd00038000. That can't be used as an immediate, even if we + * ignored the 0xd, so we have to load it into a register, and we only + * have one register free. So we must load all of (H_VMALLOC_END >> 28) + * into a register and compare ESID against that. + */ + lis r11,(H_VMALLOC_END >> 32)@h // r11 = 0xffffffffd0000000 + ori r11,r11,(H_VMALLOC_END >> 32)@l // r11 = 0xffffffffd0003800 + // Rotate left 4, then mask with 0xffffffff0 + rldic r11,r11,4,28 // r11 = 0xd00038000 + cmpld r10,r11 // if r10 >= r11 + bge 5f // goto io_mapping + + /* + * vmalloc mapping gets the encoding from the PACA as the mapping + * can be demoted from 64K -> 4K dynamically on some machines. */ - clrldi r11,r10,48 - cmpldi r11,(H_VMALLOC_SIZE >> 28) - 1 - bgt 5f lhz r11,PACAVMALLOCSLLP(r13) b 6f 5: diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 16ae1bbe13f0..b3e849c4886e 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -54,23 +54,15 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) */ __tlbiel_pid(pid, 0, ric); - if (ric == RIC_FLUSH_ALL) - /* For the remaining sets, just flush the TLB */ - ric = RIC_FLUSH_TLB; + /* For PWC, only one flush is needed */ + if (ric == RIC_FLUSH_PWC) { + asm volatile("ptesync": : :"memory"); + return; + } + /* For the remaining sets, just flush the TLB */ for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_pid(pid, set, ric); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); -} - -static inline void tlbiel_pwc(unsigned long pid) -{ - asm volatile("ptesync": : :"memory"); - - /* For PWC flush, we don't look at set number */ - __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); + __tlbiel_pid(pid, set, RIC_FLUSH_TLB); asm volatile("ptesync": : :"memory"); asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); @@ -146,31 +138,23 @@ void radix__local_flush_tlb_mm(struct mm_struct *mm) preempt_disable(); pid = mm->context.id; if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_ALL); + _tlbiel_pid(pid, RIC_FLUSH_TLB); preempt_enable(); } EXPORT_SYMBOL(radix__local_flush_tlb_mm); -void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) +#ifndef CONFIG_SMP +static void radix__local_flush_all_mm(struct mm_struct *mm) { unsigned long pid; - struct mm_struct *mm = tlb->mm; - /* - * If we are doing a full mm flush, we will do a tlb flush - * with RIC_FLUSH_ALL later. - */ - if (tlb->fullmm) - return; preempt_disable(); - pid = mm->context.id; if (pid != MMU_NO_CONTEXT) - tlbiel_pwc(pid); - + _tlbiel_pid(pid, RIC_FLUSH_ALL); preempt_enable(); } -EXPORT_SYMBOL(radix__local_flush_tlb_pwc); +#endif /* CONFIG_SMP */ void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize) @@ -208,38 +192,35 @@ void radix__flush_tlb_mm(struct mm_struct *mm) goto no_context; if (!mm_is_thread_local(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); + _tlbie_pid(pid, RIC_FLUSH_TLB); else - _tlbiel_pid(pid, RIC_FLUSH_ALL); + _tlbiel_pid(pid, RIC_FLUSH_TLB); no_context: preempt_enable(); } EXPORT_SYMBOL(radix__flush_tlb_mm); -void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) +static void radix__flush_all_mm(struct mm_struct *mm) { unsigned long pid; - struct mm_struct *mm = tlb->mm; - /* - * If we are doing a full mm flush, we will do a tlb flush - * with RIC_FLUSH_ALL later. - */ - if (tlb->fullmm) - return; preempt_disable(); - pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) goto no_context; if (!mm_is_thread_local(mm)) - _tlbie_pid(pid, RIC_FLUSH_PWC); + _tlbie_pid(pid, RIC_FLUSH_ALL); else - tlbiel_pwc(pid); + _tlbiel_pid(pid, RIC_FLUSH_ALL); no_context: preempt_enable(); } + +void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) +{ + tlb->need_flush_all = 1; +} EXPORT_SYMBOL(radix__flush_tlb_pwc); void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, @@ -271,6 +252,8 @@ void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) } EXPORT_SYMBOL(radix__flush_tlb_page); +#else /* CONFIG_SMP */ +#define radix__flush_all_mm radix__local_flush_all_mm #endif /* CONFIG_SMP */ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) @@ -288,6 +271,7 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, { struct mm_struct *mm = vma->vm_mm; + radix__flush_tlb_mm(mm); } EXPORT_SYMBOL(radix__flush_tlb_range); @@ -319,7 +303,10 @@ void radix__tlb_flush(struct mmu_gather *tlb) */ if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all) radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize); - else + else if (tlb->need_flush_all) { + tlb->need_flush_all = 0; + radix__flush_all_mm(mm); + } else radix__flush_tlb_mm(mm); } @@ -364,6 +351,43 @@ err_out: preempt_enable(); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) +{ + int local = mm_is_thread_local(mm); + unsigned long ap = mmu_get_ap(mmu_virtual_psize); + unsigned long pid, end; + + + pid = mm ? mm->context.id : 0; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto no_context; + + /* 4k page size, just blow the world */ + if (PAGE_SIZE == 0x1000) { + radix__flush_all_mm(mm); + return; + } + + /* Otherwise first do the PWC */ + if (local) + _tlbiel_pid(pid, RIC_FLUSH_PWC); + else + _tlbie_pid(pid, RIC_FLUSH_PWC); + + /* Then iterate the pages */ + end = addr + HPAGE_PMD_SIZE; + for (; addr < end; addr += PAGE_SIZE) { + if (local) + _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); + else + _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + } +no_context: + preempt_enable(); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + void radix__flush_tlb_lpid_va(unsigned long lpid, unsigned long gpa, unsigned long page_size) { diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index b5b0fb97b9c0..881ebd53ffc2 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -29,6 +29,8 @@ #include <asm/tlbflush.h> #include <asm/tlb.h> #include <asm/bug.h> +#include <asm/pte-walk.h> + #include <trace/events/thp.h> @@ -138,13 +140,10 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, */ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) { - const struct cpumask *tmp; - int i, local = 0; + int i, local; i = batch->index; - tmp = cpumask_of(smp_processor_id()); - if (cpumask_equal(mm_cpumask(batch->mm), tmp)) - local = 1; + local = mm_is_thread_local(batch->mm); if (i == 1) flush_hash_page(batch->vpn[0], batch->pte[0], batch->psize, batch->ssize, local); @@ -207,8 +206,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, local_irq_save(flags); arch_enter_lazy_mmu_mode(); for (; start < end; start += PAGE_SIZE) { - pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start, &is_thp, - &hugepage_shift); + pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp, + &hugepage_shift); unsigned long pte; if (ptep == NULL) diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S index eabecfcaef7c..048b8e9f4492 100644 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ b/arch/powerpc/mm/tlb_nohash_low.S @@ -60,7 +60,7 @@ _GLOBAL(__tlbil_va) isync 1: blr -#elif defined(CONFIG_8xx) +#elif defined(CONFIG_PPC_8xx) /* * Nothing to do for 8xx, everything is inline diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index 30cf03f53428..47fc6660845d 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -263,6 +263,7 @@ static inline bool is_nearbranch(int offset) #define COND_EQ (CR0_EQ | COND_CMP_TRUE) #define COND_NE (CR0_EQ | COND_CMP_FALSE) #define COND_LT (CR0_LT | COND_CMP_TRUE) +#define COND_LE (CR0_GT | COND_CMP_FALSE) #endif diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 861c5af1c9c4..a66e64b0b251 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -25,11 +25,7 @@ int bpf_jit_enable __read_mostly; static void bpf_jit_fill_ill_insns(void *area, unsigned int size) { - int *p = area; - - /* Fill whole space with trap instructions */ - while (p < (int *)((char *)area + size)) - *p++ = BREAKPOINT_INSTRUCTION; + memset32(area, BREAKPOINT_INSTRUCTION, size/4); } static inline void bpf_flush_icache(void *start, void *end) @@ -795,12 +791,24 @@ emit_clear: case BPF_JMP | BPF_JSGT | BPF_X: true_cond = COND_GT; goto cond_branch; + case BPF_JMP | BPF_JLT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_X: + true_cond = COND_LT; + goto cond_branch; case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JSGE | BPF_K: case BPF_JMP | BPF_JSGE | BPF_X: true_cond = COND_GE; goto cond_branch; + case BPF_JMP | BPF_JLE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_X: + true_cond = COND_LE; + goto cond_branch; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: true_cond = COND_EQ; @@ -817,14 +825,18 @@ emit_clear: cond_branch: switch (code) { case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JLT | BPF_X: case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JLE | BPF_X: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JNE | BPF_X: /* unsigned comparison */ PPC_CMPLD(dst_reg, src_reg); break; case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_X: case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_X: /* signed comparison */ PPC_CMPD(dst_reg, src_reg); break; @@ -834,7 +846,9 @@ cond_branch: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: /* * Need sign-extended load, so only positive * values can be used as imm in cmpldi @@ -849,7 +863,9 @@ cond_branch: } break; case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: /* * signed comparison, so any 16-bit value * can be used in cmpdi diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile index 4d606b99a5cb..3f3a5ce66495 100644 --- a/arch/powerpc/perf/Makefile +++ b/arch/powerpc/perf/Makefile @@ -8,6 +8,7 @@ obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \ isa207-common.o power8-pmu.o power9-pmu.o obj32-$(CONFIG_PPC_PERF_CTRS) += mpc7450-pmu.o +obj-$(CONFIG_PPC_POWERNV) += imc-pmu.o obj-$(CONFIG_FSL_EMB_PERF_EVENT) += core-fsl-emb.o obj-$(CONFIG_FSL_EMB_PERF_EVENT_E500) += e500-pmu.o e6500-pmu.o diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index 0fc26714780a..0af051a1974e 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -22,6 +22,7 @@ #ifdef CONFIG_PPC64 #include "../kernel/ppc32.h" #endif +#include <asm/pte-walk.h> /* @@ -127,7 +128,7 @@ static int read_user_stack_slow(void __user *ptr, void *buf, int nb) return -EFAULT; local_irq_save(flags); - ptep = find_linux_pte_or_hugepte(pgdir, addr, NULL, &shift); + ptep = find_current_mm_pte(pgdir, addr, NULL, &shift); if (!ptep) goto err_out; if (!shift) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 6c2d4168daec..2e3eb7431571 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2039,7 +2039,8 @@ static void record_and_restart(struct perf_event *event, unsigned long val, perf_sample_data_init(&data, ~0ULL, event->hw.last_period); - if (event->attr.sample_type & PERF_SAMPLE_ADDR) + if (event->attr.sample_type & + (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) perf_get_data_addr(regs, &data.addr); if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) { diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c new file mode 100644 index 000000000000..9ccac86f3463 --- /dev/null +++ b/arch/powerpc/perf/imc-pmu.c @@ -0,0 +1,1306 @@ +/* + * In-Memory Collection (IMC) Performance Monitor counter support. + * + * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation. + * (C) 2017 Anju T Sudhakar, IBM Corporation. + * (C) 2017 Hemant K Shaw, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or later version. + */ +#include <linux/perf_event.h> +#include <linux/slab.h> +#include <asm/opal.h> +#include <asm/imc-pmu.h> +#include <asm/cputhreads.h> +#include <asm/smp.h> +#include <linux/string.h> + +/* Nest IMC data structures and variables */ + +/* + * Used to avoid races in counting the nest-pmu units during hotplug + * register and unregister + */ +static DEFINE_MUTEX(nest_init_lock); +static DEFINE_PER_CPU(struct imc_pmu_ref *, local_nest_imc_refc); +static struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS]; +static cpumask_t nest_imc_cpumask; +struct imc_pmu_ref *nest_imc_refc; +static int nest_pmus; + +/* Core IMC data structures and variables */ + +static cpumask_t core_imc_cpumask; +struct imc_pmu_ref *core_imc_refc; +static struct imc_pmu *core_imc_pmu; + +/* Thread IMC data structures and variables */ + +static DEFINE_PER_CPU(u64 *, thread_imc_mem); +static struct imc_pmu *thread_imc_pmu; +static int thread_imc_mem_size; + +struct imc_pmu *imc_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct imc_pmu, pmu); +} + +PMU_FORMAT_ATTR(event, "config:0-40"); +PMU_FORMAT_ATTR(offset, "config:0-31"); +PMU_FORMAT_ATTR(rvalue, "config:32"); +PMU_FORMAT_ATTR(mode, "config:33-40"); +static struct attribute *imc_format_attrs[] = { + &format_attr_event.attr, + &format_attr_offset.attr, + &format_attr_rvalue.attr, + &format_attr_mode.attr, + NULL, +}; + +static struct attribute_group imc_format_group = { + .name = "format", + .attrs = imc_format_attrs, +}; + +/* Get the cpumask printed to a buffer "buf" */ +static ssize_t imc_pmu_cpumask_get_attr(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct imc_pmu *imc_pmu = container_of(pmu, struct imc_pmu, pmu); + cpumask_t *active_mask; + + switch(imc_pmu->domain){ + case IMC_DOMAIN_NEST: + active_mask = &nest_imc_cpumask; + break; + case IMC_DOMAIN_CORE: + active_mask = &core_imc_cpumask; + break; + default: + return 0; + } + + return cpumap_print_to_pagebuf(true, buf, active_mask); +} + +static DEVICE_ATTR(cpumask, S_IRUGO, imc_pmu_cpumask_get_attr, NULL); + +static struct attribute *imc_pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group imc_pmu_cpumask_attr_group = { + .attrs = imc_pmu_cpumask_attrs, +}; + +/* device_str_attr_create : Populate event "name" and string "str" in attribute */ +static struct attribute *device_str_attr_create(const char *name, const char *str) +{ + struct perf_pmu_events_attr *attr; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) + return NULL; + sysfs_attr_init(&attr->attr.attr); + + attr->event_str = str; + attr->attr.attr.name = name; + attr->attr.attr.mode = 0444; + attr->attr.show = perf_event_sysfs_show; + + return &attr->attr.attr; +} + +struct imc_events *imc_parse_event(struct device_node *np, const char *scale, + const char *unit, const char *prefix, u32 base) +{ + struct imc_events *event; + const char *s; + u32 reg; + + event = kzalloc(sizeof(struct imc_events), GFP_KERNEL); + if (!event) + return NULL; + + if (of_property_read_u32(np, "reg", ®)) + goto error; + /* Add the base_reg value to the "reg" */ + event->value = base + reg; + + if (of_property_read_string(np, "event-name", &s)) + goto error; + + event->name = kasprintf(GFP_KERNEL, "%s%s", prefix, s); + if (!event->name) + goto error; + + if (of_property_read_string(np, "scale", &s)) + s = scale; + + if (s) { + event->scale = kstrdup(s, GFP_KERNEL); + if (!event->scale) + goto error; + } + + if (of_property_read_string(np, "unit", &s)) + s = unit; + + if (s) { + event->unit = kstrdup(s, GFP_KERNEL); + if (!event->unit) + goto error; + } + + return event; +error: + kfree(event->unit); + kfree(event->scale); + kfree(event->name); + kfree(event); + + return NULL; +} + +/* + * update_events_in_group: Update the "events" information in an attr_group + * and assign the attr_group to the pmu "pmu". + */ +static int update_events_in_group(struct device_node *node, struct imc_pmu *pmu) +{ + struct attribute_group *attr_group; + struct attribute **attrs, *dev_str; + struct device_node *np, *pmu_events; + struct imc_events *ev; + u32 handle, base_reg; + int i=0, j=0, ct; + const char *prefix, *g_scale, *g_unit; + const char *ev_val_str, *ev_scale_str, *ev_unit_str; + + if (!of_property_read_u32(node, "events", &handle)) + pmu_events = of_find_node_by_phandle(handle); + else + return 0; + + /* Did not find any node with a given phandle */ + if (!pmu_events) + return 0; + + /* Get a count of number of child nodes */ + ct = of_get_child_count(pmu_events); + + /* Get the event prefix */ + if (of_property_read_string(node, "events-prefix", &prefix)) + return 0; + + /* Get a global unit and scale data if available */ + if (of_property_read_string(node, "scale", &g_scale)) + g_scale = NULL; + + if (of_property_read_string(node, "unit", &g_unit)) + g_unit = NULL; + + /* "reg" property gives out the base offset of the counters data */ + of_property_read_u32(node, "reg", &base_reg); + + /* Allocate memory for the events */ + pmu->events = kcalloc(ct, sizeof(struct imc_events), GFP_KERNEL); + if (!pmu->events) + return -ENOMEM; + + ct = 0; + /* Parse the events and update the struct */ + for_each_child_of_node(pmu_events, np) { + ev = imc_parse_event(np, g_scale, g_unit, prefix, base_reg); + if (ev) + pmu->events[ct++] = ev; + } + + /* Allocate memory for attribute group */ + attr_group = kzalloc(sizeof(*attr_group), GFP_KERNEL); + if (!attr_group) + return -ENOMEM; + + /* + * Allocate memory for attributes. + * Since we have count of events for this pmu, we also allocate + * memory for the scale and unit attribute for now. + * "ct" has the total event structs added from the events-parent node. + * So allocate three times the "ct" (this includes event, event_scale and + * event_unit). + */ + attrs = kcalloc(((ct * 3) + 1), sizeof(struct attribute *), GFP_KERNEL); + if (!attrs) { + kfree(attr_group); + kfree(pmu->events); + return -ENOMEM; + } + + attr_group->name = "events"; + attr_group->attrs = attrs; + do { + ev_val_str = kasprintf(GFP_KERNEL, "event=0x%x", pmu->events[i]->value); + dev_str = device_str_attr_create(pmu->events[i]->name, ev_val_str); + if (!dev_str) + continue; + + attrs[j++] = dev_str; + if (pmu->events[i]->scale) { + ev_scale_str = kasprintf(GFP_KERNEL, "%s.scale",pmu->events[i]->name); + dev_str = device_str_attr_create(ev_scale_str, pmu->events[i]->scale); + if (!dev_str) + continue; + + attrs[j++] = dev_str; + } + + if (pmu->events[i]->unit) { + ev_unit_str = kasprintf(GFP_KERNEL, "%s.unit",pmu->events[i]->name); + dev_str = device_str_attr_create(ev_unit_str, pmu->events[i]->unit); + if (!dev_str) + continue; + + attrs[j++] = dev_str; + } + } while (++i < ct); + + /* Save the event attribute */ + pmu->attr_groups[IMC_EVENT_ATTR] = attr_group; + + kfree(pmu->events); + return 0; +} + +/* get_nest_pmu_ref: Return the imc_pmu_ref struct for the given node */ +static struct imc_pmu_ref *get_nest_pmu_ref(int cpu) +{ + return per_cpu(local_nest_imc_refc, cpu); +} + +static void nest_change_cpu_context(int old_cpu, int new_cpu) +{ + struct imc_pmu **pn = per_nest_pmu_arr; + int i; + + if (old_cpu < 0 || new_cpu < 0) + return; + + for (i = 0; *pn && i < IMC_MAX_PMUS; i++, pn++) + perf_pmu_migrate_context(&(*pn)->pmu, old_cpu, new_cpu); +} + +static int ppc_nest_imc_cpu_offline(unsigned int cpu) +{ + int nid, target = -1; + const struct cpumask *l_cpumask; + struct imc_pmu_ref *ref; + + /* + * Check in the designated list for this cpu. Dont bother + * if not one of them. + */ + if (!cpumask_test_and_clear_cpu(cpu, &nest_imc_cpumask)) + return 0; + + /* + * Now that this cpu is one of the designated, + * find a next cpu a) which is online and b) in same chip. + */ + nid = cpu_to_node(cpu); + l_cpumask = cpumask_of_node(nid); + target = cpumask_any_but(l_cpumask, cpu); + + /* + * Update the cpumask with the target cpu and + * migrate the context if needed + */ + if (target >= 0 && target < nr_cpu_ids) { + cpumask_set_cpu(target, &nest_imc_cpumask); + nest_change_cpu_context(cpu, target); + } else { + opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, + get_hard_smp_processor_id(cpu)); + /* + * If this is the last cpu in this chip then, skip the reference + * count mutex lock and make the reference count on this chip zero. + */ + ref = get_nest_pmu_ref(cpu); + if (!ref) + return -EINVAL; + + ref->refc = 0; + } + return 0; +} + +static int ppc_nest_imc_cpu_online(unsigned int cpu) +{ + const struct cpumask *l_cpumask; + static struct cpumask tmp_mask; + int res; + + /* Get the cpumask of this node */ + l_cpumask = cpumask_of_node(cpu_to_node(cpu)); + + /* + * If this is not the first online CPU on this node, then + * just return. + */ + if (cpumask_and(&tmp_mask, l_cpumask, &nest_imc_cpumask)) + return 0; + + /* + * If this is the first online cpu on this node + * disable the nest counters by making an OPAL call. + */ + res = opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, + get_hard_smp_processor_id(cpu)); + if (res) + return res; + + /* Make this CPU the designated target for counter collection */ + cpumask_set_cpu(cpu, &nest_imc_cpumask); + return 0; +} + +static int nest_pmu_cpumask_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, + "perf/powerpc/imc:online", + ppc_nest_imc_cpu_online, + ppc_nest_imc_cpu_offline); +} + +static void nest_imc_counters_release(struct perf_event *event) +{ + int rc, node_id; + struct imc_pmu_ref *ref; + + if (event->cpu < 0) + return; + + node_id = cpu_to_node(event->cpu); + + /* + * See if we need to disable the nest PMU. + * If no events are currently in use, then we have to take a + * mutex to ensure that we don't race with another task doing + * enable or disable the nest counters. + */ + ref = get_nest_pmu_ref(event->cpu); + if (!ref) + return; + + /* Take the mutex lock for this node and then decrement the reference count */ + mutex_lock(&ref->lock); + ref->refc--; + if (ref->refc == 0) { + rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, + get_hard_smp_processor_id(event->cpu)); + if (rc) { + mutex_unlock(&ref->lock); + pr_err("nest-imc: Unable to stop the counters for core %d\n", node_id); + return; + } + } else if (ref->refc < 0) { + WARN(1, "nest-imc: Invalid event reference count\n"); + ref->refc = 0; + } + mutex_unlock(&ref->lock); +} + +static int nest_imc_event_init(struct perf_event *event) +{ + int chip_id, rc, node_id; + u32 l_config, config = event->attr.config; + struct imc_mem_info *pcni; + struct imc_pmu *pmu; + struct imc_pmu_ref *ref; + bool flag = false; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* Sampling not supported */ + if (event->hw.sample_period) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + pmu = imc_event_to_pmu(event); + + /* Sanity check for config (event offset) */ + if ((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size) + return -EINVAL; + + /* + * Nest HW counter memory resides in a per-chip reserve-memory (HOMER). + * Get the base memory addresss for this cpu. + */ + chip_id = topology_physical_package_id(event->cpu); + pcni = pmu->mem_info; + do { + if (pcni->id == chip_id) { + flag = true; + break; + } + pcni++; + } while (pcni); + + if (!flag) + return -ENODEV; + + /* + * Add the event offset to the base address. + */ + l_config = config & IMC_EVENT_OFFSET_MASK; + event->hw.event_base = (u64)pcni->vbase + l_config; + node_id = cpu_to_node(event->cpu); + + /* + * Get the imc_pmu_ref struct for this node. + * Take the mutex lock and then increment the count of nest pmu events + * inited. + */ + ref = get_nest_pmu_ref(event->cpu); + if (!ref) + return -EINVAL; + + mutex_lock(&ref->lock); + if (ref->refc == 0) { + rc = opal_imc_counters_start(OPAL_IMC_COUNTERS_NEST, + get_hard_smp_processor_id(event->cpu)); + if (rc) { + mutex_unlock(&ref->lock); + pr_err("nest-imc: Unable to start the counters for node %d\n", + node_id); + return rc; + } + } + ++ref->refc; + mutex_unlock(&ref->lock); + + event->destroy = nest_imc_counters_release; + return 0; +} + +/* + * core_imc_mem_init : Initializes memory for the current core. + * + * Uses alloc_pages_node() and uses the returned address as an argument to + * an opal call to configure the pdbar. The address sent as an argument is + * converted to physical address before the opal call is made. This is the + * base address at which the core imc counters are populated. + */ +static int core_imc_mem_init(int cpu, int size) +{ + int phys_id, rc = 0, core_id = (cpu / threads_per_core); + struct imc_mem_info *mem_info; + + /* + * alloc_pages_node() will allocate memory for core in the + * local node only. + */ + phys_id = topology_physical_package_id(cpu); + mem_info = &core_imc_pmu->mem_info[core_id]; + mem_info->id = core_id; + + /* We need only vbase for core counters */ + mem_info->vbase = page_address(alloc_pages_node(phys_id, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, + get_order(size))); + if (!mem_info->vbase) + return -ENOMEM; + + /* Init the mutex */ + core_imc_refc[core_id].id = core_id; + mutex_init(&core_imc_refc[core_id].lock); + + rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_CORE, + __pa((void *)mem_info->vbase), + get_hard_smp_processor_id(cpu)); + if (rc) { + free_pages((u64)mem_info->vbase, get_order(size)); + mem_info->vbase = NULL; + } + + return rc; +} + +static bool is_core_imc_mem_inited(int cpu) +{ + struct imc_mem_info *mem_info; + int core_id = (cpu / threads_per_core); + + mem_info = &core_imc_pmu->mem_info[core_id]; + if (!mem_info->vbase) + return false; + + return true; +} + +static int ppc_core_imc_cpu_online(unsigned int cpu) +{ + const struct cpumask *l_cpumask; + static struct cpumask tmp_mask; + int ret = 0; + + /* Get the cpumask for this core */ + l_cpumask = cpu_sibling_mask(cpu); + + /* If a cpu for this core is already set, then, don't do anything */ + if (cpumask_and(&tmp_mask, l_cpumask, &core_imc_cpumask)) + return 0; + + if (!is_core_imc_mem_inited(cpu)) { + ret = core_imc_mem_init(cpu, core_imc_pmu->counter_mem_size); + if (ret) { + pr_info("core_imc memory allocation for cpu %d failed\n", cpu); + return ret; + } + } + + /* set the cpu in the mask */ + cpumask_set_cpu(cpu, &core_imc_cpumask); + return 0; +} + +static int ppc_core_imc_cpu_offline(unsigned int cpu) +{ + unsigned int ncpu, core_id; + struct imc_pmu_ref *ref; + + /* + * clear this cpu out of the mask, if not present in the mask, + * don't bother doing anything. + */ + if (!cpumask_test_and_clear_cpu(cpu, &core_imc_cpumask)) + return 0; + + /* Find any online cpu in that core except the current "cpu" */ + ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu); + + if (ncpu >= 0 && ncpu < nr_cpu_ids) { + cpumask_set_cpu(ncpu, &core_imc_cpumask); + perf_pmu_migrate_context(&core_imc_pmu->pmu, cpu, ncpu); + } else { + /* + * If this is the last cpu in this core then, skip taking refernce + * count mutex lock for this core and directly zero "refc" for + * this core. + */ + opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, + get_hard_smp_processor_id(cpu)); + core_id = cpu / threads_per_core; + ref = &core_imc_refc[core_id]; + if (!ref) + return -EINVAL; + + ref->refc = 0; + } + return 0; +} + +static int core_imc_pmu_cpumask_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE, + "perf/powerpc/imc_core:online", + ppc_core_imc_cpu_online, + ppc_core_imc_cpu_offline); +} + +static void core_imc_counters_release(struct perf_event *event) +{ + int rc, core_id; + struct imc_pmu_ref *ref; + + if (event->cpu < 0) + return; + /* + * See if we need to disable the IMC PMU. + * If no events are currently in use, then we have to take a + * mutex to ensure that we don't race with another task doing + * enable or disable the core counters. + */ + core_id = event->cpu / threads_per_core; + + /* Take the mutex lock and decrement the refernce count for this core */ + ref = &core_imc_refc[core_id]; + if (!ref) + return; + + mutex_lock(&ref->lock); + ref->refc--; + if (ref->refc == 0) { + rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, + get_hard_smp_processor_id(event->cpu)); + if (rc) { + mutex_unlock(&ref->lock); + pr_err("IMC: Unable to stop the counters for core %d\n", core_id); + return; + } + } else if (ref->refc < 0) { + WARN(1, "core-imc: Invalid event reference count\n"); + ref->refc = 0; + } + mutex_unlock(&ref->lock); +} + +static int core_imc_event_init(struct perf_event *event) +{ + int core_id, rc; + u64 config = event->attr.config; + struct imc_mem_info *pcmi; + struct imc_pmu *pmu; + struct imc_pmu_ref *ref; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* Sampling not supported */ + if (event->hw.sample_period) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + event->hw.idx = -1; + pmu = imc_event_to_pmu(event); + + /* Sanity check for config (event offset) */ + if (((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size)) + return -EINVAL; + + if (!is_core_imc_mem_inited(event->cpu)) + return -ENODEV; + + core_id = event->cpu / threads_per_core; + pcmi = &core_imc_pmu->mem_info[core_id]; + if ((!pcmi->vbase)) + return -ENODEV; + + /* Get the core_imc mutex for this core */ + ref = &core_imc_refc[core_id]; + if (!ref) + return -EINVAL; + + /* + * Core pmu units are enabled only when it is used. + * See if this is triggered for the first time. + * If yes, take the mutex lock and enable the core counters. + * If not, just increment the count in core_imc_refc struct. + */ + mutex_lock(&ref->lock); + if (ref->refc == 0) { + rc = opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE, + get_hard_smp_processor_id(event->cpu)); + if (rc) { + mutex_unlock(&ref->lock); + pr_err("core-imc: Unable to start the counters for core %d\n", + core_id); + return rc; + } + } + ++ref->refc; + mutex_unlock(&ref->lock); + + event->hw.event_base = (u64)pcmi->vbase + (config & IMC_EVENT_OFFSET_MASK); + event->destroy = core_imc_counters_release; + return 0; +} + +/* + * Allocates a page of memory for each of the online cpus, and write the + * physical base address of that page to the LDBAR for that cpu. + * + * LDBAR Register Layout: + * + * 0 4 8 12 16 20 24 28 + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | + * | | [ ] [ Counter Address [8:50] + * | * Mode | + * | * PB Scope + * * Enable/Disable + * + * 32 36 40 44 48 52 56 60 + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | + * Counter Address [8:50] ] + * + */ +static int thread_imc_mem_alloc(int cpu_id, int size) +{ + u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id); + int phys_id = topology_physical_package_id(cpu_id); + + if (!local_mem) { + /* + * This case could happen only once at start, since we dont + * free the memory in cpu offline path. + */ + local_mem = page_address(alloc_pages_node(phys_id, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, + get_order(size))); + if (!local_mem) + return -ENOMEM; + + per_cpu(thread_imc_mem, cpu_id) = local_mem; + } + + ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE; + + mtspr(SPRN_LDBAR, ldbar_value); + return 0; +} + +static int ppc_thread_imc_cpu_online(unsigned int cpu) +{ + return thread_imc_mem_alloc(cpu, thread_imc_mem_size); +} + +static int ppc_thread_imc_cpu_offline(unsigned int cpu) +{ + mtspr(SPRN_LDBAR, 0); + return 0; +} + +static int thread_imc_cpu_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE, + "perf/powerpc/imc_thread:online", + ppc_thread_imc_cpu_online, + ppc_thread_imc_cpu_offline); +} + +void thread_imc_pmu_sched_task(struct perf_event_context *ctx, + bool sched_in) +{ + int core_id; + struct imc_pmu_ref *ref; + + if (!is_core_imc_mem_inited(smp_processor_id())) + return; + + core_id = smp_processor_id() / threads_per_core; + /* + * imc pmus are enabled only when it is used. + * See if this is triggered for the first time. + * If yes, take the mutex lock and enable the counters. + * If not, just increment the count in ref count struct. + */ + ref = &core_imc_refc[core_id]; + if (!ref) + return; + + if (sched_in) { + mutex_lock(&ref->lock); + if (ref->refc == 0) { + if (opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE, + get_hard_smp_processor_id(smp_processor_id()))) { + mutex_unlock(&ref->lock); + pr_err("thread-imc: Unable to start the counter\ + for core %d\n", core_id); + return; + } + } + ++ref->refc; + mutex_unlock(&ref->lock); + } else { + mutex_lock(&ref->lock); + ref->refc--; + if (ref->refc == 0) { + if (opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, + get_hard_smp_processor_id(smp_processor_id()))) { + mutex_unlock(&ref->lock); + pr_err("thread-imc: Unable to stop the counters\ + for core %d\n", core_id); + return; + } + } else if (ref->refc < 0) { + ref->refc = 0; + } + mutex_unlock(&ref->lock); + } + + return; +} + +static int thread_imc_event_init(struct perf_event *event) +{ + u32 config = event->attr.config; + struct task_struct *target; + struct imc_pmu *pmu; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* Sampling not supported */ + if (event->hw.sample_period) + return -EINVAL; + + event->hw.idx = -1; + pmu = imc_event_to_pmu(event); + + /* Sanity check for config offset */ + if (((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size)) + return -EINVAL; + + target = event->hw.target; + if (!target) + return -EINVAL; + + event->pmu->task_ctx_nr = perf_sw_context; + return 0; +} + +static bool is_thread_imc_pmu(struct perf_event *event) +{ + if (!strncmp(event->pmu->name, "thread_imc", strlen("thread_imc"))) + return true; + + return false; +} + +static u64 * get_event_base_addr(struct perf_event *event) +{ + u64 addr; + + if (is_thread_imc_pmu(event)) { + addr = (u64)per_cpu(thread_imc_mem, smp_processor_id()); + return (u64 *)(addr + (event->attr.config & IMC_EVENT_OFFSET_MASK)); + } + + return (u64 *)event->hw.event_base; +} + +static void thread_imc_pmu_start_txn(struct pmu *pmu, + unsigned int txn_flags) +{ + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); +} + +static void thread_imc_pmu_cancel_txn(struct pmu *pmu) +{ + perf_pmu_enable(pmu); +} + +static int thread_imc_pmu_commit_txn(struct pmu *pmu) +{ + perf_pmu_enable(pmu); + return 0; +} + +static u64 imc_read_counter(struct perf_event *event) +{ + u64 *addr, data; + + /* + * In-Memory Collection (IMC) counters are free flowing counters. + * So we take a snapshot of the counter value on enable and save it + * to calculate the delta at later stage to present the event counter + * value. + */ + addr = get_event_base_addr(event); + data = be64_to_cpu(READ_ONCE(*addr)); + local64_set(&event->hw.prev_count, data); + + return data; +} + +static void imc_event_update(struct perf_event *event) +{ + u64 counter_prev, counter_new, final_count; + + counter_prev = local64_read(&event->hw.prev_count); + counter_new = imc_read_counter(event); + final_count = counter_new - counter_prev; + + /* Update the delta to the event count */ + local64_add(final_count, &event->count); +} + +static void imc_event_start(struct perf_event *event, int flags) +{ + /* + * In Memory Counters are free flowing counters. HW or the microcode + * keeps adding to the counter offset in memory. To get event + * counter value, we snapshot the value here and we calculate + * delta at later point. + */ + imc_read_counter(event); +} + +static void imc_event_stop(struct perf_event *event, int flags) +{ + /* + * Take a snapshot and calculate the delta and update + * the event counter values. + */ + imc_event_update(event); +} + +static int imc_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + imc_event_start(event, flags); + + return 0; +} + +static int thread_imc_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + imc_event_start(event, flags); + + /* Enable the sched_task to start the engine */ + perf_sched_cb_inc(event->ctx->pmu); + return 0; +} + +static void thread_imc_event_del(struct perf_event *event, int flags) +{ + /* + * Take a snapshot and calculate the delta and update + * the event counter values. + */ + imc_event_update(event); + perf_sched_cb_dec(event->ctx->pmu); +} + +/* update_pmu_ops : Populate the appropriate operations for "pmu" */ +static int update_pmu_ops(struct imc_pmu *pmu) +{ + pmu->pmu.task_ctx_nr = perf_invalid_context; + pmu->pmu.add = imc_event_add; + pmu->pmu.del = imc_event_stop; + pmu->pmu.start = imc_event_start; + pmu->pmu.stop = imc_event_stop; + pmu->pmu.read = imc_event_update; + pmu->pmu.attr_groups = pmu->attr_groups; + pmu->attr_groups[IMC_FORMAT_ATTR] = &imc_format_group; + + switch (pmu->domain) { + case IMC_DOMAIN_NEST: + pmu->pmu.event_init = nest_imc_event_init; + pmu->attr_groups[IMC_CPUMASK_ATTR] = &imc_pmu_cpumask_attr_group; + break; + case IMC_DOMAIN_CORE: + pmu->pmu.event_init = core_imc_event_init; + pmu->attr_groups[IMC_CPUMASK_ATTR] = &imc_pmu_cpumask_attr_group; + break; + case IMC_DOMAIN_THREAD: + pmu->pmu.event_init = thread_imc_event_init; + pmu->pmu.sched_task = thread_imc_pmu_sched_task; + pmu->pmu.add = thread_imc_event_add; + pmu->pmu.del = thread_imc_event_del; + pmu->pmu.start_txn = thread_imc_pmu_start_txn; + pmu->pmu.cancel_txn = thread_imc_pmu_cancel_txn; + pmu->pmu.commit_txn = thread_imc_pmu_commit_txn; + break; + default: + break; + } + + return 0; +} + +/* init_nest_pmu_ref: Initialize the imc_pmu_ref struct for all the nodes */ +static int init_nest_pmu_ref(void) +{ + int nid, i, cpu; + + nest_imc_refc = kcalloc(num_possible_nodes(), sizeof(*nest_imc_refc), + GFP_KERNEL); + + if (!nest_imc_refc) + return -ENOMEM; + + i = 0; + for_each_node(nid) { + /* + * Mutex lock to avoid races while tracking the number of + * sessions using the chip's nest pmu units. + */ + mutex_init(&nest_imc_refc[i].lock); + + /* + * Loop to init the "id" with the node_id. Variable "i" initialized to + * 0 and will be used as index to the array. "i" will not go off the + * end of the array since the "for_each_node" loops for "N_POSSIBLE" + * nodes only. + */ + nest_imc_refc[i++].id = nid; + } + + /* + * Loop to init the per_cpu "local_nest_imc_refc" with the proper + * "nest_imc_refc" index. This makes get_nest_pmu_ref() alot simple. + */ + for_each_possible_cpu(cpu) { + nid = cpu_to_node(cpu); + for (i = 0; i < num_possible_nodes(); i++) { + if (nest_imc_refc[i].id == nid) { + per_cpu(local_nest_imc_refc, cpu) = &nest_imc_refc[i]; + break; + } + } + } + return 0; +} + +static void cleanup_all_core_imc_memory(void) +{ + int i, nr_cores = num_present_cpus() / threads_per_core; + struct imc_mem_info *ptr = core_imc_pmu->mem_info; + int size = core_imc_pmu->counter_mem_size; + + /* mem_info will never be NULL */ + for (i = 0; i < nr_cores; i++) { + if (ptr[i].vbase) + free_pages((u64)ptr->vbase, get_order(size)); + } + + kfree(ptr); + kfree(core_imc_refc); +} + +static void thread_imc_ldbar_disable(void *dummy) +{ + /* + * By Zeroing LDBAR, we disable thread-imc + * updates. + */ + mtspr(SPRN_LDBAR, 0); +} + +void thread_imc_disable(void) +{ + on_each_cpu(thread_imc_ldbar_disable, NULL, 1); +} + +static void cleanup_all_thread_imc_memory(void) +{ + int i, order = get_order(thread_imc_mem_size); + + for_each_online_cpu(i) { + if (per_cpu(thread_imc_mem, i)) + free_pages((u64)per_cpu(thread_imc_mem, i), order); + + } +} + +/* + * Common function to unregister cpu hotplug callback and + * free the memory. + * TODO: Need to handle pmu unregistering, which will be + * done in followup series. + */ +static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) +{ + if (pmu_ptr->domain == IMC_DOMAIN_NEST) { + mutex_lock(&nest_init_lock); + if (nest_pmus == 1) { + cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE); + kfree(nest_imc_refc); + } + + if (nest_pmus > 0) + nest_pmus--; + mutex_unlock(&nest_init_lock); + } + + /* Free core_imc memory */ + if (pmu_ptr->domain == IMC_DOMAIN_CORE) { + cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE); + cleanup_all_core_imc_memory(); + } + + /* Free thread_imc memory */ + if (pmu_ptr->domain == IMC_DOMAIN_THREAD) { + cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE); + cleanup_all_thread_imc_memory(); + } + + /* Only free the attr_groups which are dynamically allocated */ + kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs); + kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]); + kfree(pmu_ptr); + return; +} + + +/* + * imc_mem_init : Function to support memory allocation for core imc. + */ +static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, + int pmu_index) +{ + const char *s; + int nr_cores, cpu, res; + + if (of_property_read_string(parent, "name", &s)) + return -ENODEV; + + switch (pmu_ptr->domain) { + case IMC_DOMAIN_NEST: + /* Update the pmu name */ + pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s_imc", "nest_", s); + if (!pmu_ptr->pmu.name) + return -ENOMEM; + + /* Needed for hotplug/migration */ + per_nest_pmu_arr[pmu_index] = pmu_ptr; + break; + case IMC_DOMAIN_CORE: + /* Update the pmu name */ + pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc"); + if (!pmu_ptr->pmu.name) + return -ENOMEM; + + nr_cores = num_present_cpus() / threads_per_core; + pmu_ptr->mem_info = kcalloc(nr_cores, sizeof(struct imc_mem_info), + GFP_KERNEL); + + if (!pmu_ptr->mem_info) + return -ENOMEM; + + core_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref), + GFP_KERNEL); + + if (!core_imc_refc) + return -ENOMEM; + + core_imc_pmu = pmu_ptr; + break; + case IMC_DOMAIN_THREAD: + /* Update the pmu name */ + pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc"); + if (!pmu_ptr->pmu.name) + return -ENOMEM; + + thread_imc_mem_size = pmu_ptr->counter_mem_size; + for_each_online_cpu(cpu) { + res = thread_imc_mem_alloc(cpu, pmu_ptr->counter_mem_size); + if (res) + return res; + } + + thread_imc_pmu = pmu_ptr; + break; + default: + return -EINVAL; + } + + return 0; +} + +/* + * init_imc_pmu : Setup and register the IMC pmu device. + * + * @parent: Device tree unit node + * @pmu_ptr: memory allocated for this pmu + * @pmu_idx: Count of nest pmc registered + * + * init_imc_pmu() setup pmu cpumask and registers for a cpu hotplug callback. + * Handles failure cases and accordingly frees memory. + */ +int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_idx) +{ + int ret; + + ret = imc_mem_init(pmu_ptr, parent, pmu_idx); + if (ret) + goto err_free; + + switch (pmu_ptr->domain) { + case IMC_DOMAIN_NEST: + /* + * Nest imc pmu need only one cpu per chip, we initialize the + * cpumask for the first nest imc pmu and use the same for the + * rest. To handle the cpuhotplug callback unregister, we track + * the number of nest pmus in "nest_pmus". + */ + mutex_lock(&nest_init_lock); + if (nest_pmus == 0) { + ret = init_nest_pmu_ref(); + if (ret) { + mutex_unlock(&nest_init_lock); + goto err_free; + } + /* Register for cpu hotplug notification. */ + ret = nest_pmu_cpumask_init(); + if (ret) { + mutex_unlock(&nest_init_lock); + goto err_free; + } + } + nest_pmus++; + mutex_unlock(&nest_init_lock); + break; + case IMC_DOMAIN_CORE: + ret = core_imc_pmu_cpumask_init(); + if (ret) { + cleanup_all_core_imc_memory(); + return ret; + } + + break; + case IMC_DOMAIN_THREAD: + ret = thread_imc_cpu_init(); + if (ret) { + cleanup_all_thread_imc_memory(); + return ret; + } + + break; + default: + return -1; /* Unknown domain */ + } + + ret = update_events_in_group(parent, pmu_ptr); + if (ret) + goto err_free; + + ret = update_pmu_ops(pmu_ptr); + if (ret) + goto err_free; + + ret = perf_pmu_register(&pmu_ptr->pmu, pmu_ptr->pmu.name, -1); + if (ret) + goto err_free; + + pr_info("%s performance monitor hardware support registered\n", + pmu_ptr->pmu.name); + + return 0; + +err_free: + imc_common_cpuhp_mem_free(pmu_ptr); + return ret; +} diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c index 3f3aa9a7063a..2efee3f196f5 100644 --- a/arch/powerpc/perf/isa207-common.c +++ b/arch/powerpc/perf/isa207-common.c @@ -99,7 +99,7 @@ static void mmcra_sdar_mode(u64 event, unsigned long *mmcra) else if (!cpu_has_feature(CPU_FTR_POWER9_DD1) && p9_SDAR_MODE(event)) *mmcra |= p9_SDAR_MODE(event) << MMCRA_SDAR_MODE_SHIFT; else - *mmcra |= MMCRA_SDAR_MODE_TLB; + *mmcra |= MMCRA_SDAR_MODE_DCACHE; } else *mmcra |= MMCRA_SDAR_MODE_TLB; } @@ -488,8 +488,8 @@ static int find_alternative(u64 event, const unsigned int ev_alt[][MAX_ALT], int return -1; } -int isa207_get_alternatives(u64 event, u64 alt[], - const unsigned int ev_alt[][MAX_ALT], int size) +int isa207_get_alternatives(u64 event, u64 alt[], int size, unsigned int flags, + const unsigned int ev_alt[][MAX_ALT]) { int i, j, num_alt = 0; u64 alt_event; @@ -505,5 +505,30 @@ int isa207_get_alternatives(u64 event, u64 alt[], } } + if (flags & PPMU_ONLY_COUNT_RUN) { + /* + * We're only counting in RUN state, so PM_CYC is equivalent to + * PM_RUN_CYC and PM_INST_CMPL === PM_RUN_INST_CMPL. + */ + j = num_alt; + for (i = 0; i < num_alt; ++i) { + switch (alt[i]) { + case 0x1e: /* PMC_CYC */ + alt[j++] = 0x600f4; /* PM_RUN_CYC */ + break; + case 0x600f4: + alt[j++] = 0x1e; + break; + case 0x2: /* PM_INST_CMPL */ + alt[j++] = 0x500fa; /* PM_RUN_INST_CMPL */ + break; + case 0x500fa: + alt[j++] = 0x2; + break; + } + } + num_alt = j; + } + return num_alt; } diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h index 8acbe6e802c7..6c737d675792 100644 --- a/arch/powerpc/perf/isa207-common.h +++ b/arch/powerpc/perf/isa207-common.h @@ -247,6 +247,7 @@ #define MMCRA_SDAR_MODE_SHIFT 42 #define MMCRA_SDAR_MODE_TLB (1ull << MMCRA_SDAR_MODE_SHIFT) #define MMCRA_SDAR_MODE_NO_UPDATES ~(0x3ull << MMCRA_SDAR_MODE_SHIFT) +#define MMCRA_SDAR_MODE_DCACHE (2ull << MMCRA_SDAR_MODE_SHIFT) #define MMCRA_IFM_SHIFT 30 #define MMCRA_THR_CTR_MANT_SHIFT 19 #define MMCRA_THR_CTR_MANT_MASK 0x7Ful @@ -287,8 +288,8 @@ int isa207_compute_mmcr(u64 event[], int n_ev, unsigned int hwc[], unsigned long mmcr[], struct perf_event *pevents[]); void isa207_disable_pmc(unsigned int pmc, unsigned long mmcr[]); -int isa207_get_alternatives(u64 event, u64 alt[], - const unsigned int ev_alt[][MAX_ALT], int size); +int isa207_get_alternatives(u64 event, u64 alt[], int size, unsigned int flags, + const unsigned int ev_alt[][MAX_ALT]); void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags, struct pt_regs *regs); void isa207_get_mem_weight(u64 *weight); diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c index 5463516e369b..c9356955cab4 100644 --- a/arch/powerpc/perf/power8-pmu.c +++ b/arch/powerpc/perf/power8-pmu.c @@ -50,34 +50,11 @@ static const unsigned int event_alternatives[][MAX_ALT] = { static int power8_get_alternatives(u64 event, unsigned int flags, u64 alt[]) { - int i, j, num_alt = 0; - - num_alt = isa207_get_alternatives(event, alt, event_alternatives, - (int)ARRAY_SIZE(event_alternatives)); - if (flags & PPMU_ONLY_COUNT_RUN) { - /* - * We're only counting in RUN state, so PM_CYC is equivalent to - * PM_RUN_CYC and PM_INST_CMPL === PM_RUN_INST_CMPL. - */ - j = num_alt; - for (i = 0; i < num_alt; ++i) { - switch (alt[i]) { - case PM_CYC: - alt[j++] = PM_RUN_CYC; - break; - case PM_RUN_CYC: - alt[j++] = PM_CYC; - break; - case PM_INST_CMPL: - alt[j++] = PM_RUN_INST_CMPL; - break; - case PM_RUN_INST_CMPL: - alt[j++] = PM_INST_CMPL; - break; - } - } - num_alt = j; - } + int num_alt = 0; + + num_alt = isa207_get_alternatives(event, alt, + ARRAY_SIZE(event_alternatives), flags, + event_alternatives); return num_alt; } diff --git a/arch/powerpc/perf/power9-events-list.h b/arch/powerpc/perf/power9-events-list.h index 50689180a6c1..e99c6bf4d391 100644 --- a/arch/powerpc/perf/power9-events-list.h +++ b/arch/powerpc/perf/power9-events-list.h @@ -16,13 +16,16 @@ EVENT(PM_CYC, 0x0001e) EVENT(PM_ICT_NOSLOT_CYC, 0x100f8) EVENT(PM_CMPLU_STALL, 0x1e054) EVENT(PM_INST_CMPL, 0x00002) -EVENT(PM_BRU_CMPL, 0x4d05e) +EVENT(PM_BR_CMPL, 0x4d05e) EVENT(PM_BR_MPRED_CMPL, 0x400f6) /* All L1 D cache load references counted at finish, gated by reject */ EVENT(PM_LD_REF_L1, 0x100fc) /* Load Missed L1 */ EVENT(PM_LD_MISS_L1_FIN, 0x2c04e) +EVENT(PM_LD_MISS_L1, 0x3e054) +/* Alternate event code for PM_LD_MISS_L1 */ +EVENT(PM_LD_MISS_L1_ALT, 0x400f0) /* Store Missed L1 */ EVENT(PM_ST_MISS_L1, 0x300f0) /* L1 cache data prefetches */ @@ -62,3 +65,7 @@ EVENT(PM_INST_DISP, 0x200f2) EVENT(PM_INST_DISP_ALT, 0x300f2) /* Alternate Branch event code */ EVENT(PM_BR_CMPL_ALT, 0x10012) +/* Branch event that are not strongly biased */ +EVENT(PM_BR_2PATH, 0x20036) +/* ALternate branch event that are not strongly biased */ +EVENT(PM_BR_2PATH_ALT, 0x40036) diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index 2280cf87ff9c..24b5b5b7a206 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -109,14 +109,17 @@ static const unsigned int power9_event_alternatives[][MAX_ALT] = { { PM_INST_DISP, PM_INST_DISP_ALT }, { PM_RUN_CYC_ALT, PM_RUN_CYC }, { PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL }, + { PM_LD_MISS_L1, PM_LD_MISS_L1_ALT }, + { PM_BR_2PATH, PM_BR_2PATH_ALT }, }; static int power9_get_alternatives(u64 event, unsigned int flags, u64 alt[]) { int num_alt = 0; - num_alt = isa207_get_alternatives(event, alt, power9_event_alternatives, - (int)ARRAY_SIZE(power9_event_alternatives)); + num_alt = isa207_get_alternatives(event, alt, + ARRAY_SIZE(power9_event_alternatives), flags, + power9_event_alternatives); return num_alt; } @@ -125,7 +128,7 @@ GENERIC_EVENT_ATTR(cpu-cycles, PM_CYC); GENERIC_EVENT_ATTR(stalled-cycles-frontend, PM_ICT_NOSLOT_CYC); GENERIC_EVENT_ATTR(stalled-cycles-backend, PM_CMPLU_STALL); GENERIC_EVENT_ATTR(instructions, PM_INST_CMPL); -GENERIC_EVENT_ATTR(branch-instructions, PM_BRU_CMPL); +GENERIC_EVENT_ATTR(branch-instructions, PM_BR_CMPL); GENERIC_EVENT_ATTR(branch-misses, PM_BR_MPRED_CMPL); GENERIC_EVENT_ATTR(cache-references, PM_LD_REF_L1); GENERIC_EVENT_ATTR(cache-misses, PM_LD_MISS_L1_FIN); @@ -143,7 +146,7 @@ CACHE_EVENT_ATTR(LLC-prefetches, PM_L3_PREF_ALL); CACHE_EVENT_ATTR(LLC-store-misses, PM_L2_ST_MISS); CACHE_EVENT_ATTR(LLC-stores, PM_L2_ST); CACHE_EVENT_ATTR(branch-load-misses, PM_BR_MPRED_CMPL); -CACHE_EVENT_ATTR(branch-loads, PM_BRU_CMPL); +CACHE_EVENT_ATTR(branch-loads, PM_BR_CMPL); CACHE_EVENT_ATTR(dTLB-load-misses, PM_DTLB_MISS); CACHE_EVENT_ATTR(iTLB-load-misses, PM_ITLB_MISS); @@ -152,7 +155,7 @@ static struct attribute *power9_events_attr[] = { GENERIC_EVENT_PTR(PM_ICT_NOSLOT_CYC), GENERIC_EVENT_PTR(PM_CMPLU_STALL), GENERIC_EVENT_PTR(PM_INST_CMPL), - GENERIC_EVENT_PTR(PM_BRU_CMPL), + GENERIC_EVENT_PTR(PM_BR_CMPL), GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL), GENERIC_EVENT_PTR(PM_LD_REF_L1), GENERIC_EVENT_PTR(PM_LD_MISS_L1_FIN), @@ -169,7 +172,7 @@ static struct attribute *power9_events_attr[] = { CACHE_EVENT_PTR(PM_L2_ST_MISS), CACHE_EVENT_PTR(PM_L2_ST), CACHE_EVENT_PTR(PM_BR_MPRED_CMPL), - CACHE_EVENT_PTR(PM_BRU_CMPL), + CACHE_EVENT_PTR(PM_BR_CMPL), CACHE_EVENT_PTR(PM_DTLB_MISS), CACHE_EVENT_PTR(PM_ITLB_MISS), NULL @@ -244,7 +247,7 @@ static int power9_generic_events[] = { [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = PM_ICT_NOSLOT_CYC, [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = PM_CMPLU_STALL, [PERF_COUNT_HW_INSTRUCTIONS] = PM_INST_CMPL, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = PM_BRU_CMPL, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = PM_BR_CMPL, [PERF_COUNT_HW_BRANCH_MISSES] = PM_BR_MPRED_CMPL, [PERF_COUNT_HW_CACHE_REFERENCES] = PM_LD_REF_L1, [PERF_COUNT_HW_CACHE_MISSES] = PM_LD_MISS_L1_FIN, @@ -370,7 +373,7 @@ static int power9_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { }, [ C(BPU) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = PM_BRU_CMPL, + [ C(RESULT_ACCESS) ] = PM_BR_CMPL, [ C(RESULT_MISS) ] = PM_BR_MPRED_CMPL, }, [ C(OP_WRITE) ] = { @@ -459,8 +462,8 @@ static int __init init_power9_pmu(void) * Power9 DD1 should use PM_BR_CMPL_ALT event code for * "branches" to provide correct counter value. */ - EVENT_VAR(PM_BRU_CMPL, _g).id = PM_BR_CMPL_ALT; - EVENT_VAR(PM_BRU_CMPL, _c).id = PM_BR_CMPL_ALT; + EVENT_VAR(PM_BR_CMPL, _g).id = PM_BR_CMPL_ALT; + EVENT_VAR(PM_BR_CMPL, _c).id = PM_BR_CMPL_ALT; rc = register_power_pmu(&power9_isa207_pmu); } else { rc = register_power_pmu(&power9_pmu); diff --git a/arch/powerpc/platforms/44x/Makefile b/arch/powerpc/platforms/44x/Makefile index 72b824160660..2c5651992369 100644 --- a/arch/powerpc/platforms/44x/Makefile +++ b/arch/powerpc/platforms/44x/Makefile @@ -1,6 +1,6 @@ -obj-$(CONFIG_44x) += misc_44x.o +obj-y += misc_44x.o machine_check.o ifneq ($(CONFIG_PPC4xx_CPM),y) -obj-$(CONFIG_44x) += idle.o +obj-y += idle.o endif obj-$(CONFIG_PPC44x_SIMPLE) += ppc44x_simple.o obj-$(CONFIG_EBONY) += ebony.o diff --git a/arch/powerpc/platforms/44x/machine_check.c b/arch/powerpc/platforms/44x/machine_check.c new file mode 100644 index 000000000000..034d70d6d335 --- /dev/null +++ b/arch/powerpc/platforms/44x/machine_check.c @@ -0,0 +1,89 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/printk.h> +#include <linux/ptrace.h> + +#include <asm/reg.h> + +int machine_check_440A(struct pt_regs *regs) +{ + unsigned long reason = regs->dsisr; + + printk("Machine check in kernel mode.\n"); + if (reason & ESR_IMCP){ + printk("Instruction Synchronous Machine Check exception\n"); + mtspr(SPRN_ESR, reason & ~ESR_IMCP); + } + else { + u32 mcsr = mfspr(SPRN_MCSR); + if (mcsr & MCSR_IB) + printk("Instruction Read PLB Error\n"); + if (mcsr & MCSR_DRB) + printk("Data Read PLB Error\n"); + if (mcsr & MCSR_DWB) + printk("Data Write PLB Error\n"); + if (mcsr & MCSR_TLBP) + printk("TLB Parity Error\n"); + if (mcsr & MCSR_ICP){ + flush_instruction_cache(); + printk("I-Cache Parity Error\n"); + } + if (mcsr & MCSR_DCSP) + printk("D-Cache Search Parity Error\n"); + if (mcsr & MCSR_DCFP) + printk("D-Cache Flush Parity Error\n"); + if (mcsr & MCSR_IMPE) + printk("Machine Check exception is imprecise\n"); + + /* Clear MCSR */ + mtspr(SPRN_MCSR, mcsr); + } + return 0; +} + +#ifdef CONFIG_PPC_47x +int machine_check_47x(struct pt_regs *regs) +{ + unsigned long reason = regs->dsisr; + u32 mcsr; + + printk(KERN_ERR "Machine check in kernel mode.\n"); + if (reason & ESR_IMCP) { + printk(KERN_ERR "Instruction Synchronous Machine Check exception\n"); + mtspr(SPRN_ESR, reason & ~ESR_IMCP); + return 0; + } + mcsr = mfspr(SPRN_MCSR); + if (mcsr & MCSR_IB) + printk(KERN_ERR "Instruction Read PLB Error\n"); + if (mcsr & MCSR_DRB) + printk(KERN_ERR "Data Read PLB Error\n"); + if (mcsr & MCSR_DWB) + printk(KERN_ERR "Data Write PLB Error\n"); + if (mcsr & MCSR_TLBP) + printk(KERN_ERR "TLB Parity Error\n"); + if (mcsr & MCSR_ICP) { + flush_instruction_cache(); + printk(KERN_ERR "I-Cache Parity Error\n"); + } + if (mcsr & MCSR_DCSP) + printk(KERN_ERR "D-Cache Search Parity Error\n"); + if (mcsr & PPC47x_MCSR_GPR) + printk(KERN_ERR "GPR Parity Error\n"); + if (mcsr & PPC47x_MCSR_FPR) + printk(KERN_ERR "FPR Parity Error\n"); + if (mcsr & PPC47x_MCSR_IPR) + printk(KERN_ERR "Machine Check exception is imprecise\n"); + + /* Clear MCSR */ + mtspr(SPRN_MCSR, mcsr); + + return 0; +} +#endif /* CONFIG_PPC_47x */ diff --git a/arch/powerpc/platforms/4xx/Makefile b/arch/powerpc/platforms/4xx/Makefile new file mode 100644 index 000000000000..9779c32db34e --- /dev/null +++ b/arch/powerpc/platforms/4xx/Makefile @@ -0,0 +1,8 @@ +obj-y += uic.o machine_check.o +obj-$(CONFIG_PPC4xx_OCM) += ocm.o +obj-$(CONFIG_4xx_SOC) += soc.o +obj-$(CONFIG_PCI) += pci.o +obj-$(CONFIG_PPC4xx_HSTA_MSI) += hsta_msi.o +obj-$(CONFIG_PPC4xx_MSI) += msi.o +obj-$(CONFIG_PPC4xx_CPM) += cpm.o +obj-$(CONFIG_PPC4xx_GPIO) += gpio.o diff --git a/arch/powerpc/sysdev/ppc4xx_cpm.c b/arch/powerpc/platforms/4xx/cpm.c index ba95adf81d8d..53ff81ca8a3c 100644 --- a/arch/powerpc/sysdev/ppc4xx_cpm.c +++ b/arch/powerpc/platforms/4xx/cpm.c @@ -240,7 +240,7 @@ static int cpm_suspend_enter(suspend_state_t state) return 0; } -static struct platform_suspend_ops cpm_suspend_ops = { +static const struct platform_suspend_ops cpm_suspend_ops = { .valid = cpm_suspend_valid, .enter = cpm_suspend_enter, }; @@ -278,8 +278,8 @@ static int __init cpm_init(void) dcr_len = dcr_resource_len(np, 0); if (dcr_base == 0 || dcr_len == 0) { - printk(KERN_ERR "cpm: could not parse dcr property for %s\n", - np->full_name); + printk(KERN_ERR "cpm: could not parse dcr property for %pOF\n", + np); ret = -EINVAL; goto node_put; } @@ -287,8 +287,8 @@ static int __init cpm_init(void) cpm.dcr_host = dcr_map(np, dcr_base, dcr_len); if (!DCR_MAP_OK(cpm.dcr_host)) { - printk(KERN_ERR "cpm: failed to map dcr property for %s\n", - np->full_name); + printk(KERN_ERR "cpm: failed to map dcr property for %pOF\n", + np); ret = -EINVAL; goto node_put; } diff --git a/arch/powerpc/sysdev/ppc4xx_gpio.c b/arch/powerpc/platforms/4xx/gpio.c index 5382d04dd872..2238e369cde4 100644 --- a/arch/powerpc/sysdev/ppc4xx_gpio.c +++ b/arch/powerpc/platforms/4xx/gpio.c @@ -198,8 +198,7 @@ static int __init ppc4xx_add_gpiochips(void) goto err; continue; err: - pr_err("%s: registration failed with status %d\n", - np->full_name, ret); + pr_err("%pOF: registration failed with status %d\n", np, ret); kfree(ppc4xx_gc); /* try others anyway */ } diff --git a/arch/powerpc/sysdev/ppc4xx_hsta_msi.c b/arch/powerpc/platforms/4xx/hsta_msi.c index 9926ad67af76..9926ad67af76 100644 --- a/arch/powerpc/sysdev/ppc4xx_hsta_msi.c +++ b/arch/powerpc/platforms/4xx/hsta_msi.c diff --git a/arch/powerpc/platforms/4xx/machine_check.c b/arch/powerpc/platforms/4xx/machine_check.c new file mode 100644 index 000000000000..aa039dfaf82f --- /dev/null +++ b/arch/powerpc/platforms/4xx/machine_check.c @@ -0,0 +1,26 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/printk.h> +#include <linux/ptrace.h> + +#include <asm/reg.h> + +int machine_check_4xx(struct pt_regs *regs) +{ + unsigned long reason = regs->dsisr; + + if (reason & ESR_IMCP) { + printk("Instruction"); + mtspr(SPRN_ESR, reason & ~ESR_IMCP); + } else + printk("Data"); + printk(" machine check in kernel mode.\n"); + + return 0; +} diff --git a/arch/powerpc/sysdev/ppc4xx_msi.c b/arch/powerpc/platforms/4xx/msi.c index 590dab4f47d6..d50417e23add 100644 --- a/arch/powerpc/sysdev/ppc4xx_msi.c +++ b/arch/powerpc/platforms/4xx/msi.c @@ -233,8 +233,7 @@ static int ppc4xx_msi_probe(struct platform_device *dev) /* Get MSI ranges */ err = of_address_to_resource(dev->dev.of_node, 0, &res); if (err) { - dev_err(&dev->dev, "%s resource error!\n", - dev->dev.of_node->full_name); + dev_err(&dev->dev, "%pOF resource error!\n", dev->dev.of_node); goto error_out; } diff --git a/arch/powerpc/sysdev/ppc4xx_ocm.c b/arch/powerpc/platforms/4xx/ocm.c index 85d9e37f5ccb..85d9e37f5ccb 100644 --- a/arch/powerpc/sysdev/ppc4xx_ocm.c +++ b/arch/powerpc/platforms/4xx/ocm.c diff --git a/arch/powerpc/sysdev/ppc4xx_pci.c b/arch/powerpc/platforms/4xx/pci.c index 086aca69ecae..73e6b36bcd51 100644 --- a/arch/powerpc/sysdev/ppc4xx_pci.c +++ b/arch/powerpc/platforms/4xx/pci.c @@ -32,7 +32,7 @@ #include <asm/dcr-regs.h> #include <mm/mmu_decl.h> -#include "ppc4xx_pci.h" +#include "pci.h" static int dma_offset_set; @@ -127,9 +127,9 @@ static int __init ppc4xx_parse_dma_ranges(struct pci_controller *hose, * within 32 bits space */ if (cpu_addr != 0 || pci_addr > 0xffffffff) { - printk(KERN_WARNING "%s: Ignored unsupported dma range" + printk(KERN_WARNING "%pOF: Ignored unsupported dma range" " 0x%016llx...0x%016llx -> 0x%016llx\n", - hose->dn->full_name, + hose->dn, pci_addr, pci_addr + size - 1, cpu_addr); continue; } @@ -152,8 +152,7 @@ static int __init ppc4xx_parse_dma_ranges(struct pci_controller *hose, /* We only support one global DMA offset */ if (dma_offset_set && pci_dram_offset != res->start) { - printk(KERN_ERR "%s: dma-ranges(s) mismatch\n", - hose->dn->full_name); + printk(KERN_ERR "%pOF: dma-ranges(s) mismatch\n", hose->dn); return -ENXIO; } @@ -161,17 +160,16 @@ static int __init ppc4xx_parse_dma_ranges(struct pci_controller *hose, * DMA bounce buffers */ if (size < total_memory) { - printk(KERN_ERR "%s: dma-ranges too small " + printk(KERN_ERR "%pOF: dma-ranges too small " "(size=%llx total_memory=%llx)\n", - hose->dn->full_name, size, (u64)total_memory); + hose->dn, size, (u64)total_memory); return -ENXIO; } /* Check we are a power of 2 size and that base is a multiple of size*/ if ((size & (size - 1)) != 0 || (res->start & (size - 1)) != 0) { - printk(KERN_ERR "%s: dma-ranges unaligned\n", - hose->dn->full_name); + printk(KERN_ERR "%pOF: dma-ranges unaligned\n", hose->dn); return -ENXIO; } @@ -181,8 +179,8 @@ static int __init ppc4xx_parse_dma_ranges(struct pci_controller *hose, if (res->end > 0xffffffff && !(of_device_is_compatible(hose->dn, "ibm,plb-pciex-460sx") || of_device_is_compatible(hose->dn, "ibm,plb-pciex-476fpe"))) { - printk(KERN_ERR "%s: dma-ranges outside of 32 bits space\n", - hose->dn->full_name); + printk(KERN_ERR "%pOF: dma-ranges outside of 32 bits space\n", + hose->dn); return -ENXIO; } out: @@ -233,8 +231,7 @@ static int __init ppc4xx_setup_one_pci_PMM(struct pci_controller *hose, */ if ((plb_addr + size) > 0xffffffffull || !is_power_of_2(size) || size < 0x1000 || (plb_addr & (size - 1)) != 0) { - printk(KERN_WARNING "%s: Resource out of range\n", - hose->dn->full_name); + printk(KERN_WARNING "%pOF: Resource out of range\n", hose->dn); return -1; } ma = (0xffffffffu << ilog2(size)) | 1; @@ -266,8 +263,7 @@ static void __init ppc4xx_configure_pci_PMMs(struct pci_controller *hose, if (!(res->flags & IORESOURCE_MEM)) continue; if (j > 2) { - printk(KERN_WARNING "%s: Too many ranges\n", - hose->dn->full_name); + printk(KERN_WARNING "%pOF: Too many ranges\n", hose->dn); break; } @@ -292,8 +288,8 @@ static void __init ppc4xx_configure_pci_PMMs(struct pci_controller *hose, if (j <= 2 && !found_isa_hole && hose->isa_mem_size) if (ppc4xx_setup_one_pci_PMM(hose, reg, hose->isa_mem_phys, 0, hose->isa_mem_size, 0, j) == 0) - printk(KERN_INFO "%s: Legacy ISA memory support enabled\n", - hose->dn->full_name); + printk(KERN_INFO "%pOF: Legacy ISA memory support enabled\n", + hose->dn); } static void __init ppc4xx_configure_pci_PTMs(struct pci_controller *hose, @@ -333,21 +329,20 @@ static void __init ppc4xx_probe_pci_bridge(struct device_node *np) /* Check if device is enabled */ if (!of_device_is_available(np)) { - printk(KERN_INFO "%s: Port disabled via device-tree\n", - np->full_name); + printk(KERN_INFO "%pOF: Port disabled via device-tree\n", np); return; } /* Fetch config space registers address */ if (of_address_to_resource(np, 0, &rsrc_cfg)) { - printk(KERN_ERR "%s: Can't get PCI config register base !", - np->full_name); + printk(KERN_ERR "%pOF: Can't get PCI config register base !", + np); return; } /* Fetch host bridge internal registers address */ if (of_address_to_resource(np, 3, &rsrc_reg)) { - printk(KERN_ERR "%s: Can't get PCI internal register base !", - np->full_name); + printk(KERN_ERR "%pOF: Can't get PCI internal register base !", + np); return; } @@ -361,7 +356,7 @@ static void __init ppc4xx_probe_pci_bridge(struct device_node *np) /* Map registers */ reg = ioremap(rsrc_reg.start, resource_size(&rsrc_reg)); if (reg == NULL) { - printk(KERN_ERR "%s: Can't map registers !", np->full_name); + printk(KERN_ERR "%pOF: Can't map registers !", np); goto fail; } @@ -423,8 +418,8 @@ static int __init ppc4xx_setup_one_pcix_POM(struct pci_controller *hose, if (!is_power_of_2(size) || size < 0x1000 || (plb_addr & (size - 1)) != 0) { - printk(KERN_WARNING "%s: Resource out of range\n", - hose->dn->full_name); + printk(KERN_WARNING "%pOF: Resource out of range\n", + hose->dn); return -1; } @@ -467,8 +462,7 @@ static void __init ppc4xx_configure_pcix_POMs(struct pci_controller *hose, if (!(res->flags & IORESOURCE_MEM)) continue; if (j > 1) { - printk(KERN_WARNING "%s: Too many ranges\n", - hose->dn->full_name); + printk(KERN_WARNING "%pOF: Too many ranges\n", hose->dn); break; } @@ -493,8 +487,8 @@ static void __init ppc4xx_configure_pcix_POMs(struct pci_controller *hose, if (j <= 1 && !found_isa_hole && hose->isa_mem_size) if (ppc4xx_setup_one_pcix_POM(hose, reg, hose->isa_mem_phys, 0, hose->isa_mem_size, 0, j) == 0) - printk(KERN_INFO "%s: Legacy ISA memory support enabled\n", - hose->dn->full_name); + printk(KERN_INFO "%pOF: Legacy ISA memory support enabled\n", + hose->dn); } static void __init ppc4xx_configure_pcix_PIMs(struct pci_controller *hose, @@ -539,14 +533,14 @@ static void __init ppc4xx_probe_pcix_bridge(struct device_node *np) /* Fetch config space registers address */ if (of_address_to_resource(np, 0, &rsrc_cfg)) { - printk(KERN_ERR "%s:Can't get PCI-X config register base !", - np->full_name); + printk(KERN_ERR "%pOF: Can't get PCI-X config register base !", + np); return; } /* Fetch host bridge internal registers address */ if (of_address_to_resource(np, 3, &rsrc_reg)) { - printk(KERN_ERR "%s: Can't get PCI-X internal register base !", - np->full_name); + printk(KERN_ERR "%pOF: Can't get PCI-X internal register base !", + np); return; } @@ -568,7 +562,7 @@ static void __init ppc4xx_probe_pcix_bridge(struct device_node *np) /* Map registers */ reg = ioremap(rsrc_reg.start, resource_size(&rsrc_reg)); if (reg == NULL) { - printk(KERN_ERR "%s: Can't map registers !", np->full_name); + printk(KERN_ERR "%pOF: Can't map registers !", np); goto fail; } @@ -1246,8 +1240,8 @@ static void __init ppc460sx_pciex_check_link(struct ppc4xx_pciex_port *port) mbase = ioremap(port->cfg_space.start + 0x10000000, 0x1000); if (mbase == NULL) { - printk(KERN_ERR "%s: Can't map internal config space !", - port->node->full_name); + printk(KERN_ERR "%pOF: Can't map internal config space !", + port->node); goto done; } @@ -1389,7 +1383,7 @@ static void __init ppc_476fpe_pciex_check_link(struct ppc4xx_pciex_port *port) port->index); return; } - + while (timeout_ms--) { val = in_le32(mbase + PECFG_TLDLP); @@ -1448,8 +1442,7 @@ static int __init ppc4xx_pciex_check_core_init(struct device_node *np) ppc4xx_pciex_hwops = &ppc_476fpe_pcie_hwops; #endif if (ppc4xx_pciex_hwops == NULL) { - printk(KERN_WARNING "PCIE: unknown host type %s\n", - np->full_name); + printk(KERN_WARNING "PCIE: unknown host type %pOF\n", np); return -ENODEV; } @@ -1730,8 +1723,7 @@ static int __init ppc4xx_setup_one_pciex_POM(struct ppc4xx_pciex_port *port, (index < 2 && size < 0x100000) || (index == 2 && size < 0x100) || (plb_addr & (size - 1)) != 0) { - printk(KERN_WARNING "%s: Resource out of range\n", - hose->dn->full_name); + printk(KERN_WARNING "%pOF: Resource out of range\n", hose->dn); return -1; } @@ -1807,8 +1799,8 @@ static void __init ppc4xx_configure_pciex_POMs(struct ppc4xx_pciex_port *port, if (!(res->flags & IORESOURCE_MEM)) continue; if (j > 1) { - printk(KERN_WARNING "%s: Too many ranges\n", - port->node->full_name); + printk(KERN_WARNING "%pOF: Too many ranges\n", + port->node); break; } @@ -1834,8 +1826,8 @@ static void __init ppc4xx_configure_pciex_POMs(struct ppc4xx_pciex_port *port, if (ppc4xx_setup_one_pciex_POM(port, hose, mbase, hose->isa_mem_phys, 0, hose->isa_mem_size, 0, j) == 0) - printk(KERN_INFO "%s: Legacy ISA memory support enabled\n", - hose->dn->full_name); + printk(KERN_INFO "%pOF: Legacy ISA memory support enabled\n", + hose->dn); /* Configure IO, always 64K starting at 0. We hard wire it to 64K ! * Note also that it -has- to be region index 2 on this HW @@ -1970,8 +1962,8 @@ static void __init ppc4xx_pciex_port_setup_hose(struct ppc4xx_pciex_port *port) (hose->first_busno + 1) * 0x100000, busses * 0x100000); if (cfg_data == NULL) { - printk(KERN_ERR "%s: Can't map external config space !", - port->node->full_name); + printk(KERN_ERR "%pOF: Can't map external config space !", + port->node); goto fail; } hose->cfg_data = cfg_data; @@ -1982,13 +1974,13 @@ static void __init ppc4xx_pciex_port_setup_hose(struct ppc4xx_pciex_port *port) */ mbase = ioremap(port->cfg_space.start + 0x10000000, 0x1000); if (mbase == NULL) { - printk(KERN_ERR "%s: Can't map internal config space !", - port->node->full_name); + printk(KERN_ERR "%pOF: Can't map internal config space !", + port->node); goto fail; } hose->cfg_addr = mbase; - pr_debug("PCIE %s, bus %d..%d\n", port->node->full_name, + pr_debug("PCIE %pOF, bus %d..%d\n", port->node, hose->first_busno, hose->last_busno); pr_debug(" config space mapped at: root @0x%p, other @0x%p\n", hose->cfg_addr, hose->cfg_data); @@ -2100,14 +2092,13 @@ static void __init ppc4xx_probe_pciex_bridge(struct device_node *np) /* Get the port number from the device-tree */ pval = of_get_property(np, "port", NULL); if (pval == NULL) { - printk(KERN_ERR "PCIE: Can't find port number for %s\n", - np->full_name); + printk(KERN_ERR "PCIE: Can't find port number for %pOF\n", np); return; } portno = *pval; if (portno >= ppc4xx_pciex_port_count) { - printk(KERN_ERR "PCIE: port number out of range for %s\n", - np->full_name); + printk(KERN_ERR "PCIE: port number out of range for %pOF\n", + np); return; } port = &ppc4xx_pciex_ports[portno]; @@ -2125,8 +2116,8 @@ static void __init ppc4xx_probe_pciex_bridge(struct device_node *np) if (ppc4xx_pciex_hwops->want_sdr) { pval = of_get_property(np, "sdr-base", NULL); if (pval == NULL) { - printk(KERN_ERR "PCIE: missing sdr-base for %s\n", - np->full_name); + printk(KERN_ERR "PCIE: missing sdr-base for %pOF\n", + np); return; } port->sdr_base = *pval; @@ -2142,29 +2133,26 @@ static void __init ppc4xx_probe_pciex_bridge(struct device_node *np) } else if (!strcmp(val, "pci")) { port->endpoint = 0; } else { - printk(KERN_ERR "PCIE: missing or incorrect device_type for %s\n", - np->full_name); + printk(KERN_ERR "PCIE: missing or incorrect device_type for %pOF\n", + np); return; } /* Fetch config space registers address */ if (of_address_to_resource(np, 0, &port->cfg_space)) { - printk(KERN_ERR "%s: Can't get PCI-E config space !", - np->full_name); + printk(KERN_ERR "%pOF: Can't get PCI-E config space !", np); return; } /* Fetch host bridge internal registers address */ if (of_address_to_resource(np, 1, &port->utl_regs)) { - printk(KERN_ERR "%s: Can't get UTL register base !", - np->full_name); + printk(KERN_ERR "%pOF: Can't get UTL register base !", np); return; } /* Map DCRs */ dcrs = dcr_resource_start(np, 0); if (dcrs == 0) { - printk(KERN_ERR "%s: Can't get DCR register base !", - np->full_name); + printk(KERN_ERR "%pOF: Can't get DCR register base !", np); return; } port->dcrs = dcr_map(np, dcrs, dcr_resource_len(np, 0)); diff --git a/arch/powerpc/sysdev/ppc4xx_pci.h b/arch/powerpc/platforms/4xx/pci.h index bb4821938ab1..bb4821938ab1 100644 --- a/arch/powerpc/sysdev/ppc4xx_pci.h +++ b/arch/powerpc/platforms/4xx/pci.h diff --git a/arch/powerpc/sysdev/ppc4xx_soc.c b/arch/powerpc/platforms/4xx/soc.c index d41134d2f786..5e36508b2a70 100644 --- a/arch/powerpc/sysdev/ppc4xx_soc.c +++ b/arch/powerpc/platforms/4xx/soc.c @@ -90,7 +90,7 @@ static int __init ppc4xx_l2c_probe(void) /* Get l2 cache size */ prop = of_get_property(np, "cache-size", NULL); if (prop == NULL) { - printk(KERN_ERR "%s: Can't get cache-size!\n", np->full_name); + printk(KERN_ERR "%pOF: Can't get cache-size!\n", np); of_node_put(np); return -ENODEV; } @@ -99,8 +99,7 @@ static int __init ppc4xx_l2c_probe(void) /* Map DCRs */ dcrreg = of_get_property(np, "dcr-reg", &len); if (!dcrreg || (len != 4 * sizeof(u32))) { - printk(KERN_ERR "%s: Can't get DCR register base !", - np->full_name); + printk(KERN_ERR "%pOF: Can't get DCR register base !", np); of_node_put(np); return -ENODEV; } diff --git a/arch/powerpc/sysdev/uic.c b/arch/powerpc/platforms/4xx/uic.c index a00949f3e378..8b4dd0da0839 100644 --- a/arch/powerpc/sysdev/uic.c +++ b/arch/powerpc/platforms/4xx/uic.c @@ -243,16 +243,16 @@ static struct uic * __init uic_init_one(struct device_node *node) raw_spin_lock_init(&uic->lock); indexp = of_get_property(node, "cell-index", &len); if (!indexp || (len != sizeof(u32))) { - printk(KERN_ERR "uic: Device node %s has missing or invalid " - "cell-index property\n", node->full_name); + printk(KERN_ERR "uic: Device node %pOF has missing or invalid " + "cell-index property\n", node); return NULL; } uic->index = *indexp; dcrreg = of_get_property(node, "dcr-reg", &len); if (!dcrreg || (len != 2*sizeof(u32))) { - printk(KERN_ERR "uic: Device node %s has missing or invalid " - "dcr-reg property\n", node->full_name); + printk(KERN_ERR "uic: Device node %pOF has missing or invalid " + "dcr-reg property\n", node); return NULL; } uic->dcrbase = *dcrreg; @@ -292,7 +292,7 @@ void __init uic_init_tree(void) * top-level interrupt controller */ primary_uic = uic_init_one(np); if (!primary_uic) - panic("Unable to initialize primary UIC %s\n", np->full_name); + panic("Unable to initialize primary UIC %pOF\n", np); irq_set_default_host(primary_uic->irqhost); of_node_put(np); @@ -306,8 +306,8 @@ void __init uic_init_tree(void) uic = uic_init_one(np); if (! uic) - panic("Unable to initialize a secondary UIC %s\n", - np->full_name); + panic("Unable to initialize a secondary UIC %pOF\n", + np); cascade_virq = irq_of_parse_and_map(np, 0); diff --git a/arch/powerpc/platforms/512x/clock-commonclk.c b/arch/powerpc/platforms/512x/clock-commonclk.c index add5a5374fa0..b3097fe6441b 100644 --- a/arch/powerpc/platforms/512x/clock-commonclk.c +++ b/arch/powerpc/platforms/512x/clock-commonclk.c @@ -363,7 +363,7 @@ static int get_cpmf_mult_x2(void) */ /* applies to the IPS_DIV, and PCI_DIV values */ -static struct clk_div_table divtab_2346[] = { +static const struct clk_div_table divtab_2346[] = { { .val = 2, .div = 2, }, { .val = 3, .div = 3, }, { .val = 4, .div = 4, }, @@ -372,7 +372,7 @@ static struct clk_div_table divtab_2346[] = { }; /* applies to the MBX_DIV, LPC_DIV, and NFC_DIV values */ -static struct clk_div_table divtab_1234[] = { +static const struct clk_div_table divtab_1234[] = { { .val = 1, .div = 1, }, { .val = 2, .div = 2, }, { .val = 3, .div = 3, }, diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c index 6b4f4cb7009a..f99e79ee060e 100644 --- a/arch/powerpc/platforms/512x/mpc512x_shared.c +++ b/arch/powerpc/platforms/512x/mpc512x_shared.c @@ -387,8 +387,8 @@ static unsigned int __init get_fifo_size(struct device_node *np, if (fp) return *fp; - pr_warning("no %s property in %s node, defaulting to %d\n", - prop_name, np->full_name, DEFAULT_FIFO_SIZE); + pr_warning("no %s property in %pOF node, defaulting to %d\n", + prop_name, np, DEFAULT_FIFO_SIZE); return DEFAULT_FIFO_SIZE; } @@ -426,15 +426,15 @@ static void __init mpc512x_psc_fifo_init(void) psc = of_iomap(np, 0); if (!psc) { - pr_err("%s: Can't map %s device\n", - __func__, np->full_name); + pr_err("%s: Can't map %pOF device\n", + __func__, np); continue; } /* FIFO space is 4KiB, check if requested size is available */ if ((fifobase + tx_fifo_size + rx_fifo_size) > 0x1000) { - pr_err("%s: no fifo space available for %s\n", - __func__, np->full_name); + pr_err("%s: no fifo space available for %pOF\n", + __func__, np); iounmap(psc); /* * chances are that another device requests less diff --git a/arch/powerpc/platforms/52xx/efika.c b/arch/powerpc/platforms/52xx/efika.c index 39b49822ace1..1ecbf176d35a 100644 --- a/arch/powerpc/platforms/52xx/efika.c +++ b/arch/powerpc/platforms/52xx/efika.c @@ -99,7 +99,7 @@ static void __init efika_pcisetup(void) bus_range = of_get_property(pcictrl, "bus-range", &len); if (bus_range == NULL || len < 2 * sizeof(int)) { printk(KERN_WARNING EFIKA_PLATFORM_NAME - ": Can't get bus-range for %s\n", pcictrl->full_name); + ": Can't get bus-range for %pOF\n", pcictrl); goto out_put; } @@ -109,14 +109,14 @@ static void __init efika_pcisetup(void) else printk(KERN_INFO EFIKA_PLATFORM_NAME ": PCI buses %d..%d", bus_range[0], bus_range[1]); - printk(" controlled by %s\n", pcictrl->full_name); + printk(" controlled by %pOF\n", pcictrl); printk("\n"); hose = pcibios_alloc_controller(pcictrl); if (!hose) { printk(KERN_WARNING EFIKA_PLATFORM_NAME - ": Can't allocate PCI controller structure for %s\n", - pcictrl->full_name); + ": Can't allocate PCI controller structure for %pOF\n", + pcictrl); goto out_put; } diff --git a/arch/powerpc/platforms/52xx/media5200.c b/arch/powerpc/platforms/52xx/media5200.c index a3227040cc86..1fcab233d2f2 100644 --- a/arch/powerpc/platforms/52xx/media5200.c +++ b/arch/powerpc/platforms/52xx/media5200.c @@ -156,7 +156,7 @@ static void __init media5200_init_irq(void) fpga_np = of_find_compatible_node(NULL, NULL, "fsl,media5200-fpga"); if (!fpga_np) goto out; - pr_debug("%s: found fpga node: %s\n", __func__, fpga_np->full_name); + pr_debug("%s: found fpga node: %pOF\n", __func__, fpga_np); media5200_irq.regs = of_iomap(fpga_np, 0); if (!media5200_irq.regs) diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c index 22645a7c6b8a..9e974b1e1697 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c @@ -226,7 +226,7 @@ static int mpc52xx_gpt_irq_xlate(struct irq_domain *h, struct device_node *ct, dev_dbg(gpt->dev, "%s: flags=%i\n", __func__, intspec[0]); if ((intsize < 1) || (intspec[0] > 3)) { - dev_err(gpt->dev, "bad irq specifier in %s\n", ct->full_name); + dev_err(gpt->dev, "bad irq specifier in %pOF\n", ct); return -EINVAL; } @@ -331,7 +331,7 @@ mpc52xx_gpt_gpio_setup(struct mpc52xx_gpt_priv *gpt, struct device_node *node) if (!of_find_property(node, "gpio-controller", NULL)) return; - gpt->gc.label = kstrdup(node->full_name, GFP_KERNEL); + gpt->gc.label = kasprintf(GFP_KERNEL, "%pOF", node); if (!gpt->gc.label) { dev_err(gpt->dev, "out of memory\n"); return; diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pci.c b/arch/powerpc/platforms/52xx/mpc52xx_pci.c index 00282c2b0cae..af0f79995214 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_pci.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_pci.c @@ -369,19 +369,19 @@ mpc52xx_add_bridge(struct device_node *node) const int *bus_range; struct resource rsrc; - pr_debug("Adding MPC52xx PCI host bridge %s\n", node->full_name); + pr_debug("Adding MPC52xx PCI host bridge %pOF\n", node); pci_add_flags(PCI_REASSIGN_ALL_BUS); if (of_address_to_resource(node, 0, &rsrc) != 0) { - printk(KERN_ERR "Can't get %s resources\n", node->full_name); + printk(KERN_ERR "Can't get %pOF resources\n", node); return -EINVAL; } bus_range = of_get_property(node, "bus-range", &len); if (bus_range == NULL || len < 2 * sizeof(int)) { - printk(KERN_WARNING "Can't get %s bus-range, assume bus 0\n", - node->full_name); + printk(KERN_WARNING "Can't get %pOF bus-range, assume bus 0\n", + node); bus_range = NULL; } diff --git a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c index 63c5ab6489c9..96bb55ca61d3 100644 --- a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c +++ b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c @@ -128,7 +128,7 @@ static int mcu_gpiochip_add(struct mcu *mcu) return -ENODEV; gc->owner = THIS_MODULE; - gc->label = np->full_name; + gc->label = kasprintf(GFP_KERNEL, "%pOF", np); gc->can_sleep = 1; gc->ngpio = MCU_NUM_GPIO; gc->base = -1; @@ -141,6 +141,7 @@ static int mcu_gpiochip_add(struct mcu *mcu) static int mcu_gpiochip_remove(struct mcu *mcu) { + kfree(mcu->gc.label); gpiochip_remove(&mcu->gc); return 0; } diff --git a/arch/powerpc/platforms/83xx/mpc832x_rdb.c b/arch/powerpc/platforms/83xx/mpc832x_rdb.c index 763ffca9628d..a4539c5accb0 100644 --- a/arch/powerpc/platforms/83xx/mpc832x_rdb.c +++ b/arch/powerpc/platforms/83xx/mpc832x_rdb.c @@ -113,7 +113,7 @@ static int __init of_fsl_spi_probe(char *type, char *compatible, u32 sysclk, unreg: platform_device_del(pdev); err: - pr_err("%s: registration failed\n", np->full_name); + pr_err("%pOF: registration failed\n", np); next: i++; } diff --git a/arch/powerpc/platforms/83xx/suspend.c b/arch/powerpc/platforms/83xx/suspend.c index 978b85bb3233..7fa3e197871a 100644 --- a/arch/powerpc/platforms/83xx/suspend.c +++ b/arch/powerpc/platforms/83xx/suspend.c @@ -361,7 +361,7 @@ static int pmc_probe(struct platform_device *ofdev) return -EBUSY; } - pmc_regs = ioremap(res.start, sizeof(struct mpc83xx_pmc)); + pmc_regs = ioremap(res.start, sizeof(*pmc_regs)); if (!pmc_regs) { ret = -ENOMEM; @@ -374,7 +374,7 @@ static int pmc_probe(struct platform_device *ofdev) goto out_pmc; } - clock_regs = ioremap(res.start, sizeof(struct mpc83xx_pmc)); + clock_regs = ioremap(res.start, sizeof(*clock_regs)); if (!clock_regs) { ret = -ENOMEM; diff --git a/arch/powerpc/platforms/85xx/p1022_ds.c b/arch/powerpc/platforms/85xx/p1022_ds.c index 0908abd7e36f..9fb57f78cdbe 100644 --- a/arch/powerpc/platforms/85xx/p1022_ds.c +++ b/arch/powerpc/platforms/85xx/p1022_ds.c @@ -508,8 +508,8 @@ static void __init p1022_ds_setup_arch(void) * allocate one static local variable for each * call to this function. */ - pr_info("p1022ds: disabling %s node", - np2->full_name); + pr_info("p1022ds: disabling %pOF node", + np2); of_update_property(np2, &nor_status); of_node_put(np2); } @@ -524,8 +524,8 @@ static void __init p1022_ds_setup_arch(void) .length = sizeof("disabled"), }; - pr_info("p1022ds: disabling %s node", - np2->full_name); + pr_info("p1022ds: disabling %pOF node", + np2); of_update_property(np2, &nand_status); of_node_put(np2); } diff --git a/arch/powerpc/platforms/85xx/xes_mpc85xx.c b/arch/powerpc/platforms/85xx/xes_mpc85xx.c index cd6ce845f398..77e618dce4a8 100644 --- a/arch/powerpc/platforms/85xx/xes_mpc85xx.c +++ b/arch/powerpc/platforms/85xx/xes_mpc85xx.c @@ -100,8 +100,8 @@ static void xes_mpc85xx_fixups(void) err = of_address_to_resource(np, 0, &r[0]); if (err) { printk(KERN_WARNING "xes_mpc85xx: Could not get " - "resource for device tree node '%s'", - np->full_name); + "resource for device tree node '%pOF'", + np); continue; } diff --git a/arch/powerpc/platforms/8xx/Kconfig b/arch/powerpc/platforms/8xx/Kconfig index 80cbcb0ad9b1..536b0c5d5ce3 100644 --- a/arch/powerpc/platforms/8xx/Kconfig +++ b/arch/powerpc/platforms/8xx/Kconfig @@ -5,7 +5,6 @@ config CPM1 choice prompt "8xx Machine Type" depends on PPC_8xx - depends on 8xx default MPC885ADS config MPC8XXFADS @@ -92,7 +91,7 @@ endmenu # menu "MPC8xx CPM Options" - depends on 8xx + depends on PPC_8xx # This doesn't really belong here, but it is convenient to ask # 8xx specific questions. diff --git a/arch/powerpc/platforms/8xx/Makefile b/arch/powerpc/platforms/8xx/Makefile index 76a81c3350a8..f9af3218bd9c 100644 --- a/arch/powerpc/platforms/8xx/Makefile +++ b/arch/powerpc/platforms/8xx/Makefile @@ -1,7 +1,7 @@ # # Makefile for the PowerPC 8xx linux kernel. # -obj-$(CONFIG_PPC_8xx) += m8xx_setup.o +obj-y += m8xx_setup.o machine_check.o pic.o obj-$(CONFIG_MPC885ADS) += mpc885ads_setup.o obj-$(CONFIG_MPC86XADS) += mpc86xads_setup.o obj-$(CONFIG_PPC_EP88XC) += ep88xc.o diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c b/arch/powerpc/platforms/8xx/m8xx_setup.c index f81069f79a94..1917d69f84df 100644 --- a/arch/powerpc/platforms/8xx/m8xx_setup.c +++ b/arch/powerpc/platforms/8xx/m8xx_setup.c @@ -23,7 +23,7 @@ #include <asm/fs_pd.h> #include <mm/mmu_decl.h> -#include <sysdev/mpc8xx_pic.h> +#include "pic.h" #include "mpc8xx.h" diff --git a/arch/powerpc/platforms/8xx/machine_check.c b/arch/powerpc/platforms/8xx/machine_check.c new file mode 100644 index 000000000000..402016705a39 --- /dev/null +++ b/arch/powerpc/platforms/8xx/machine_check.c @@ -0,0 +1,37 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/printk.h> +#include <linux/ptrace.h> + +#include <asm/reg.h> + +int machine_check_8xx(struct pt_regs *regs) +{ + unsigned long reason = regs->msr; + + pr_err("Machine check in kernel mode.\n"); + pr_err("Caused by (from SRR1=%lx): ", reason); + if (reason & 0x40000000) + pr_err("Fetch error at address %lx\n", regs->nip); + else + pr_err("Data access error at address %lx\n", regs->dar); + +#ifdef CONFIG_PCI + /* the qspan pci read routines can cause machine checks -- Cort + * + * yuck !!! that totally needs to go away ! There are better ways + * to deal with that than having a wart in the mcheck handler. + * -- BenH + */ + bad_page_fault(regs, regs->dar, SIGBUS); + return 1; +#else + return 0; +#endif +} diff --git a/arch/powerpc/sysdev/mpc8xx_pic.c b/arch/powerpc/platforms/8xx/pic.c index 2842f9d63d21..8d5a25d43ef3 100644 --- a/arch/powerpc/sysdev/mpc8xx_pic.c +++ b/arch/powerpc/platforms/8xx/pic.c @@ -9,7 +9,7 @@ #include <asm/io.h> #include <asm/8xx_immap.h> -#include "mpc8xx_pic.h" +#include "pic.h" #define PIC_VEC_SPURRIOUS 15 diff --git a/arch/powerpc/sysdev/mpc8xx_pic.h b/arch/powerpc/platforms/8xx/pic.h index 9fe00eebdc8b..9fe00eebdc8b 100644 --- a/arch/powerpc/sysdev/mpc8xx_pic.h +++ b/arch/powerpc/platforms/8xx/pic.h diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 2f629e0551e9..13663efc1d31 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -32,7 +32,6 @@ config PPC_85xx config PPC_8xx bool "Freescale 8xx" select FSL_SOC - select 8xx select PPC_LIB_RHEAP select SYS_SUPPORTS_HUGETLBFS @@ -149,10 +148,6 @@ config 6xx depends on PPC32 && PPC_BOOK3S select PPC_HAVE_PMU_SUPPORT -# this is temp to handle compat with arch=ppc -config 8xx - bool - config E500 select FSL_EMB_PERFMON select PPC_FSL_BOOK3E @@ -271,44 +266,6 @@ config VSX If in doubt, say Y here. -config PPC_ICSWX - bool "Support for PowerPC icswx coprocessor instruction" - depends on PPC_BOOK3S_64 - default n - ---help--- - - This option enables kernel support for the PowerPC Initiate - Coprocessor Store Word (icswx) coprocessor instruction on POWER7 - and POWER8 processors. POWER9 uses new copy/paste instructions - to invoke the coprocessor. - - This option is only useful if you have a processor that supports - the icswx coprocessor instruction. It does not have any effect - on processors without the icswx coprocessor instruction. - - This option slightly increases kernel memory usage. - - If in doubt, say N here. - -config PPC_ICSWX_PID - bool "icswx requires direct PID management" - depends on PPC_ICSWX - default y - ---help--- - The PID register in server is used explicitly for ICSWX. In - embedded systems PID management is done by the system. - -config PPC_ICSWX_USE_SIGILL - bool "Should a bad CT cause a SIGILL?" - depends on PPC_ICSWX - default n - ---help--- - Should a bad CT used for "non-record form ICSWX" cause an - illegal instruction signal or should it be silent as - architected. - - If in doubt, say N here. - config SPE_POSSIBLE def_bool y depends on E200 || (E500 && !PPC_E500MC) @@ -413,7 +370,7 @@ config NR_CPUS config NOT_COHERENT_CACHE bool - depends on 4xx || 8xx || E200 || PPC_MPC512x || GAMECUBE_COMMON + depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || GAMECUBE_COMMON default n if PPC_47x default y diff --git a/arch/powerpc/platforms/Makefile b/arch/powerpc/platforms/Makefile index 469ef170d218..d7a55ecfaee5 100644 --- a/arch/powerpc/platforms/Makefile +++ b/arch/powerpc/platforms/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_FSL_ULI1575) += fsl_uli1575.o obj-$(CONFIG_PPC_PMAC) += powermac/ obj-$(CONFIG_PPC_CHRP) += chrp/ +obj-$(CONFIG_4xx) += 4xx/ obj-$(CONFIG_40x) += 40x/ obj-$(CONFIG_44x) += 44x/ obj-$(CONFIG_PPC_MPC512x) += 512x/ diff --git a/arch/powerpc/platforms/amigaone/setup.c b/arch/powerpc/platforms/amigaone/setup.c index 45cb9821173c..b9d466cc2b8a 100644 --- a/arch/powerpc/platforms/amigaone/setup.c +++ b/arch/powerpc/platforms/amigaone/setup.c @@ -40,7 +40,7 @@ static int __init amigaone_add_bridge(struct device_node *dev) const int *bus_range; struct pci_controller *hose; - printk(KERN_INFO "Adding PCI host bridge %s\n", dev->full_name); + printk(KERN_INFO "Adding PCI host bridge %pOF\n", dev); cfg_addr = of_get_address(dev, 0, NULL, NULL); cfg_data = of_get_address(dev, 1, NULL, NULL); @@ -49,8 +49,8 @@ static int __init amigaone_add_bridge(struct device_node *dev) bus_range = of_get_property(dev, "bus-range", &len); if ((bus_range == NULL) || (len < 2 * sizeof(int))) - printk(KERN_WARNING "Can't get bus-range for %s, assume" - " bus 0\n", dev->full_name); + printk(KERN_WARNING "Can't get bus-range for %pOF, assume" + " bus 0\n", dev); hose = pcibios_alloc_controller(dev); if (hose == NULL) diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c index 8d3ae2cc52bf..6ea3f248b155 100644 --- a/arch/powerpc/platforms/cell/axon_msi.c +++ b/arch/powerpc/platforms/cell/axon_msi.c @@ -187,8 +187,8 @@ static struct axon_msic *find_msi_translator(struct pci_dev *dev) irq_domain = irq_find_host(dn); if (!irq_domain) { - dev_dbg(&dev->dev, "axon_msi: no irq_domain found for node %s\n", - dn->full_name); + dev_dbg(&dev->dev, "axon_msi: no irq_domain found for node %pOF\n", + dn); goto out_error; } @@ -326,8 +326,8 @@ static void axon_msi_shutdown(struct platform_device *device) struct axon_msic *msic = dev_get_drvdata(&device->dev); u32 tmp; - pr_devel("axon_msi: disabling %s\n", - irq_domain_get_of_node(msic->irq_domain)->full_name); + pr_devel("axon_msi: disabling %pOF\n", + irq_domain_get_of_node(msic->irq_domain)); tmp = dcr_read(msic->dcr_host, MSIC_CTRL_REG); tmp &= ~MSIC_CTRL_ENABLE & ~MSIC_CTRL_IRQ_ENABLE; msic_dcr_write(msic, MSIC_CTRL_REG, tmp); @@ -340,12 +340,12 @@ static int axon_msi_probe(struct platform_device *device) unsigned int virq; int dcr_base, dcr_len; - pr_devel("axon_msi: setting up dn %s\n", dn->full_name); + pr_devel("axon_msi: setting up dn %pOF\n", dn); msic = kzalloc(sizeof(struct axon_msic), GFP_KERNEL); if (!msic) { - printk(KERN_ERR "axon_msi: couldn't allocate msic for %s\n", - dn->full_name); + printk(KERN_ERR "axon_msi: couldn't allocate msic for %pOF\n", + dn); goto out; } @@ -354,30 +354,30 @@ static int axon_msi_probe(struct platform_device *device) if (dcr_base == 0 || dcr_len == 0) { printk(KERN_ERR - "axon_msi: couldn't parse dcr properties on %s\n", - dn->full_name); + "axon_msi: couldn't parse dcr properties on %pOF\n", + dn); goto out_free_msic; } msic->dcr_host = dcr_map(dn, dcr_base, dcr_len); if (!DCR_MAP_OK(msic->dcr_host)) { - printk(KERN_ERR "axon_msi: dcr_map failed for %s\n", - dn->full_name); + printk(KERN_ERR "axon_msi: dcr_map failed for %pOF\n", + dn); goto out_free_msic; } msic->fifo_virt = dma_alloc_coherent(&device->dev, MSIC_FIFO_SIZE_BYTES, &msic->fifo_phys, GFP_KERNEL); if (!msic->fifo_virt) { - printk(KERN_ERR "axon_msi: couldn't allocate fifo for %s\n", - dn->full_name); + printk(KERN_ERR "axon_msi: couldn't allocate fifo for %pOF\n", + dn); goto out_free_msic; } virq = irq_of_parse_and_map(dn, 0); if (!virq) { - printk(KERN_ERR "axon_msi: irq parse and map failed for %s\n", - dn->full_name); + printk(KERN_ERR "axon_msi: irq parse and map failed for %pOF\n", + dn); goto out_free_fifo; } memset(msic->fifo_virt, 0xff, MSIC_FIFO_SIZE_BYTES); @@ -385,8 +385,8 @@ static int axon_msi_probe(struct platform_device *device) /* We rely on being able to stash a virq in a u16, so limit irqs to < 65536 */ msic->irq_domain = irq_domain_add_nomap(dn, 65536, &msic_host_ops, msic); if (!msic->irq_domain) { - printk(KERN_ERR "axon_msi: couldn't allocate irq_domain for %s\n", - dn->full_name); + printk(KERN_ERR "axon_msi: couldn't allocate irq_domain for %pOF\n", + dn); goto out_free_fifo; } @@ -412,7 +412,7 @@ static int axon_msi_probe(struct platform_device *device) axon_msi_debug_setup(dn, msic); - printk(KERN_DEBUG "axon_msi: setup MSIC on %s\n", dn->full_name); + printk(KERN_DEBUG "axon_msi: setup MSIC on %pOF\n", dn); return 0; diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c index 871d38479a25..6fc85e29dc08 100644 --- a/arch/powerpc/platforms/cell/interrupt.c +++ b/arch/powerpc/platforms/cell/interrupt.c @@ -303,8 +303,8 @@ static void __init init_one_iic(unsigned int hw_cpu, unsigned long addr, iic->node = of_node_get(node); out_be64(&iic->regs->prio, 0); - printk(KERN_INFO "IIC for CPU %d target id 0x%x : %s\n", - hw_cpu, iic->target_id, node->full_name); + printk(KERN_INFO "IIC for CPU %d target id 0x%x : %pOF\n", + hw_cpu, iic->target_id, node); } static int __init setup_iic(void) diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 29d4f96ed33e..4b91ad08eefd 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -278,8 +278,8 @@ static int cell_iommu_find_ioc(int nid, unsigned long *base) if (of_node_to_nid(np) != nid) continue; if (of_address_to_resource(np, 0, &r)) { - printk(KERN_ERR "iommu: can't get address for %s\n", - np->full_name); + printk(KERN_ERR "iommu: can't get address for %pOF\n", + np); continue; } *base = r.start; @@ -458,8 +458,8 @@ static inline u32 cell_iommu_get_ioid(struct device_node *np) ioid = of_get_property(np, "ioid", NULL); if (ioid == NULL) { - printk(KERN_WARNING "iommu: missing ioid for %s using 0\n", - np->full_name); + printk(KERN_WARNING "iommu: missing ioid for %pOF using 0\n", + np); return 0; } @@ -559,8 +559,8 @@ static struct iommu_table *cell_get_iommu_table(struct device *dev) */ iommu = cell_iommu_for_node(dev_to_node(dev)); if (iommu == NULL || list_empty(&iommu->windows)) { - dev_err(dev, "iommu: missing iommu for %s (node %d)\n", - of_node_full_name(dev->of_node), dev_to_node(dev)); + dev_err(dev, "iommu: missing iommu for %pOF (node %d)\n", + dev->of_node, dev_to_node(dev)); return NULL; } window = list_entry(iommu->windows.next, struct iommu_window, list); @@ -720,12 +720,12 @@ static struct cbe_iommu * __init cell_iommu_alloc(struct device_node *np) /* Get node ID */ nid = of_node_to_nid(np); if (nid < 0) { - printk(KERN_ERR "iommu: failed to get node for %s\n", - np->full_name); + printk(KERN_ERR "iommu: failed to get node for %pOF\n", + np); return NULL; } - pr_debug("iommu: setting up iommu for node %d (%s)\n", - nid, np->full_name); + pr_debug("iommu: setting up iommu for node %d (%pOF)\n", + nid, np); /* XXX todo: If we can have multiple windows on the same IOMMU, which * isn't the case today, we probably want here to check whether the @@ -736,8 +736,8 @@ static struct cbe_iommu * __init cell_iommu_alloc(struct device_node *np) */ if (cbe_nr_iommus >= NR_IOMMUS) { - printk(KERN_ERR "iommu: too many IOMMUs detected ! (%s)\n", - np->full_name); + printk(KERN_ERR "iommu: too many IOMMUs detected ! (%pOF)\n", + np); return NULL; } diff --git a/arch/powerpc/platforms/cell/ras.c b/arch/powerpc/platforms/cell/ras.c index 460ab392f0e7..2f704afe9af3 100644 --- a/arch/powerpc/platforms/cell/ras.c +++ b/arch/powerpc/platforms/cell/ras.c @@ -196,8 +196,8 @@ static int __init cbe_ptcal_enable(void) for_each_node_by_type(np, "cpu") { const u32 *nid = of_get_property(np, "node-id", NULL); if (!nid) { - printk(KERN_ERR "%s: node %s is missing node-id?\n", - __func__, np->full_name); + printk(KERN_ERR "%s: node %pOF is missing node-id?\n", + __func__, np); continue; } cbe_ptcal_enable_on_node(*nid, order); diff --git a/arch/powerpc/platforms/cell/spider-pci.c b/arch/powerpc/platforms/cell/spider-pci.c index f1f7878893f3..d1e61e273e64 100644 --- a/arch/powerpc/platforms/cell/spider-pci.c +++ b/arch/powerpc/platforms/cell/spider-pci.c @@ -130,8 +130,8 @@ int __init spiderpci_iowa_init(struct iowa_bus *bus, void *data) struct resource r; unsigned long offset = (unsigned long)data; - pr_debug("SPIDERPCI-IOWA:Bus initialize for spider(%s)\n", - np->full_name); + pr_debug("SPIDERPCI-IOWA:Bus initialize for spider(%pOF)\n", + np); priv = kzalloc(sizeof(struct spiderpci_iowa_private), GFP_KERNEL); if (!priv) { diff --git a/arch/powerpc/platforms/cell/spider-pic.c b/arch/powerpc/platforms/cell/spider-pic.c index ff924af00e78..aa44bfc46467 100644 --- a/arch/powerpc/platforms/cell/spider-pic.c +++ b/arch/powerpc/platforms/cell/spider-pic.c @@ -323,8 +323,8 @@ static void __init spider_init_one(struct device_node *of_node, int chip, irq_set_handler_data(virq, pic); irq_set_chained_handler(virq, spider_irq_cascade); - printk(KERN_INFO "spider_pic: node %d, addr: 0x%lx %s\n", - pic->node_id, addr, of_node->full_name); + printk(KERN_INFO "spider_pic: node %d, addr: 0x%lx %pOF\n", + pic->node_id, addr, of_node); /* Enable the interrupt detection enable bit. Do this last! */ out_be32(pic->regs + TIR_DEN, in_be32(pic->regs + TIR_DEN) | 0x1); diff --git a/arch/powerpc/platforms/cell/spu_manage.c b/arch/powerpc/platforms/cell/spu_manage.c index 672d310dcf14..f636ee22b203 100644 --- a/arch/powerpc/platforms/cell/spu_manage.c +++ b/arch/powerpc/platforms/cell/spu_manage.c @@ -191,8 +191,8 @@ static int __init spu_map_interrupts(struct spu *spu, struct device_node *np) goto err; } ret = -EINVAL; - pr_debug(" irq %d no 0x%x on %s\n", i, oirq.args[0], - oirq.np->full_name); + pr_debug(" irq %d no 0x%x on %pOF\n", i, oirq.args[0], + oirq.np); spu->irqs[i] = irq_create_of_mapping(&oirq); if (!spu->irqs[i]) { pr_debug("spu_new: failed to map it !\n"); @@ -243,32 +243,32 @@ static int __init spu_map_device(struct spu *spu) ret = spu_map_resource(spu, 0, (void __iomem**)&spu->local_store, &spu->local_store_phys); if (ret) { - pr_debug("spu_new: failed to map %s resource 0\n", - np->full_name); + pr_debug("spu_new: failed to map %pOF resource 0\n", + np); goto out; } ret = spu_map_resource(spu, 1, (void __iomem**)&spu->problem, &spu->problem_phys); if (ret) { - pr_debug("spu_new: failed to map %s resource 1\n", - np->full_name); + pr_debug("spu_new: failed to map %pOF resource 1\n", + np); goto out_unmap; } ret = spu_map_resource(spu, 2, (void __iomem**)&spu->priv2, NULL); if (ret) { - pr_debug("spu_new: failed to map %s resource 2\n", - np->full_name); + pr_debug("spu_new: failed to map %pOF resource 2\n", + np); goto out_unmap; } if (!firmware_has_feature(FW_FEATURE_LPAR)) ret = spu_map_resource(spu, 3, (void __iomem**)&spu->priv1, NULL); if (ret) { - pr_debug("spu_new: failed to map %s resource 3\n", - np->full_name); + pr_debug("spu_new: failed to map %pOF resource 3\n", + np); goto out_unmap; } - pr_debug("spu_new: %s maps:\n", np->full_name); + pr_debug("spu_new: %pOF maps:\n", np); pr_debug(" local store : 0x%016lx -> 0x%p\n", spu->local_store_phys, spu->local_store); pr_debug(" problem state : 0x%016lx -> 0x%p\n", @@ -316,8 +316,8 @@ static int __init of_create_spu(struct spu *spu, void *data) spu->node = of_node_to_nid(spe); if (spu->node >= MAX_NUMNODES) { - printk(KERN_WARNING "SPE %s on node %d ignored," - " node number too big\n", spe->full_name, spu->node); + printk(KERN_WARNING "SPE %pOF on node %d ignored," + " node number too big\n", spe, spu->node); printk(KERN_WARNING "Check if CONFIG_NUMA is enabled.\n"); ret = -ENODEV; goto out; diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index ae2f740a82f1..5ffcdeb1eb17 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -1749,7 +1749,7 @@ out: static int spufs_mfc_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file_inode(file); - int err = filemap_write_and_wait_range(inode->i_mapping, start, end); + int err = file_write_and_wait_range(file, start, end); if (!err) { inode_lock(inode); err = spufs_mfc_flush(file, NULL); diff --git a/arch/powerpc/platforms/chrp/pci.c b/arch/powerpc/platforms/chrp/pci.c index 1b87e198faa7..27264794f5c0 100644 --- a/arch/powerpc/platforms/chrp/pci.c +++ b/arch/powerpc/platforms/chrp/pci.c @@ -235,14 +235,14 @@ chrp_find_bridges(void) ++index; /* The GG2 bridge on the LongTrail doesn't have an address */ if (of_address_to_resource(dev, 0, &r) && !is_longtrail) { - printk(KERN_WARNING "Can't use %s: no address\n", - dev->full_name); + printk(KERN_WARNING "Can't use %pOF: no address\n", + dev); continue; } bus_range = of_get_property(dev, "bus-range", &len); if (bus_range == NULL || len < 2 * sizeof(int)) { - printk(KERN_WARNING "Can't get bus-range for %s\n", - dev->full_name); + printk(KERN_WARNING "Can't get bus-range for %pOF\n", + dev); continue; } if (bus_range[1] == bus_range[0]) @@ -250,15 +250,15 @@ chrp_find_bridges(void) else printk(KERN_INFO "PCI buses %d..%d", bus_range[0], bus_range[1]); - printk(" controlled by %s", dev->full_name); + printk(" controlled by %pOF", dev); if (!is_longtrail) printk(" at %llx", (unsigned long long)r.start); printk("\n"); hose = pcibios_alloc_controller(dev); if (!hose) { - printk("Can't allocate PCI controller structure for %s\n", - dev->full_name); + printk("Can't allocate PCI controller structure for %pOF\n", + dev); continue; } hose->first_busno = hose->self_busno = bus_range[0]; @@ -297,8 +297,8 @@ chrp_find_bridges(void) } } } else { - printk("No methods for %s (model %s), using RTAS\n", - dev->full_name, model); + printk("No methods for %pOF (model %s), using RTAS\n", + dev, model); hose->ops = &rtas_pci_ops; } diff --git a/arch/powerpc/platforms/chrp/pegasos_eth.c b/arch/powerpc/platforms/chrp/pegasos_eth.c index 2b4dc6abde6c..19760712b39d 100644 --- a/arch/powerpc/platforms/chrp/pegasos_eth.c +++ b/arch/powerpc/platforms/chrp/pegasos_eth.c @@ -63,7 +63,7 @@ static struct platform_device mv643xx_eth_mvmdio_device = { .name = "orion-mdio", .id = -1, .num_resources = ARRAY_SIZE(mv643xx_eth_mvmdio_resources), - .resource = mv643xx_eth_shared_resources, + .resource = mv643xx_eth_mvmdio_resources, }; static struct resource mv643xx_eth_port1_resources[] = { diff --git a/arch/powerpc/platforms/embedded6xx/linkstation.c b/arch/powerpc/platforms/embedded6xx/linkstation.c index f29cf29b11f8..f514d5d28cd4 100644 --- a/arch/powerpc/platforms/embedded6xx/linkstation.c +++ b/arch/powerpc/platforms/embedded6xx/linkstation.c @@ -41,12 +41,12 @@ static int __init linkstation_add_bridge(struct device_node *dev) struct pci_controller *hose; const int *bus_range; - printk("Adding PCI host bridge %s\n", dev->full_name); + printk("Adding PCI host bridge %pOF\n", dev); bus_range = of_get_property(dev, "bus-range", &len); if (bus_range == NULL || len < 2 * sizeof(int)) - printk(KERN_WARNING "Can't get bus-range for %s, assume" - " bus 0\n", dev->full_name); + printk(KERN_WARNING "Can't get bus-range for %pOF, assume" + " bus 0\n", dev); hose = pcibios_alloc_controller(dev); if (hose == NULL) diff --git a/arch/powerpc/platforms/embedded6xx/mvme5100.c b/arch/powerpc/platforms/embedded6xx/mvme5100.c index 8e3590941960..273dfa3f0252 100644 --- a/arch/powerpc/platforms/embedded6xx/mvme5100.c +++ b/arch/powerpc/platforms/embedded6xx/mvme5100.c @@ -115,7 +115,7 @@ static int __init mvme5100_add_bridge(struct device_node *dev) struct pci_controller *hose; unsigned short devid; - pr_info("Adding PCI host bridge %s\n", dev->full_name); + pr_info("Adding PCI host bridge %pOF\n", dev); bus_range = of_get_property(dev, "bus-range", &len); diff --git a/arch/powerpc/platforms/embedded6xx/storcenter.c b/arch/powerpc/platforms/embedded6xx/storcenter.c index 471a50bcd074..ed1914dd34bb 100644 --- a/arch/powerpc/platforms/embedded6xx/storcenter.c +++ b/arch/powerpc/platforms/embedded6xx/storcenter.c @@ -44,7 +44,7 @@ static int __init storcenter_add_bridge(struct device_node *dev) struct pci_controller *hose; const int *bus_range; - printk("Adding PCI host bridge %s\n", dev->full_name); + printk("Adding PCI host bridge %pOF\n", dev); hose = pcibios_alloc_controller(dev); if (hose == NULL) diff --git a/arch/powerpc/platforms/maple/pci.c b/arch/powerpc/platforms/maple/pci.c index 69794d9389c2..e3821379e86f 100644 --- a/arch/powerpc/platforms/maple/pci.c +++ b/arch/powerpc/platforms/maple/pci.c @@ -73,8 +73,8 @@ static void __init fixup_bus_range(struct device_node *bridge) /* Lookup the "bus-range" property for the hose */ prop = of_find_property(bridge, "bus-range", &len); if (prop == NULL || prop->value == NULL || len < 2 * sizeof(int)) { - printk(KERN_WARNING "Can't get bus-range for %s\n", - bridge->full_name); + printk(KERN_WARNING "Can't get bus-range for %pOF\n", + bridge); return; } bus_range = prop->value; @@ -498,12 +498,12 @@ static int __init maple_add_bridge(struct device_node *dev) const int *bus_range; int primary = 1; - DBG("Adding PCI host bridge %s\n", dev->full_name); + DBG("Adding PCI host bridge %pOF\n", dev); bus_range = of_get_property(dev, "bus-range", &len); if (bus_range == NULL || len < 2 * sizeof(int)) { - printk(KERN_WARNING "Can't get bus-range for %s, assume bus 0\n", - dev->full_name); + printk(KERN_WARNING "Can't get bus-range for %pOF, assume bus 0\n", + dev); } hose = pcibios_alloc_controller(dev); diff --git a/arch/powerpc/platforms/pasemi/pci.c b/arch/powerpc/platforms/pasemi/pci.c index 10c4e8fc6ea9..5ff6108f19e9 100644 --- a/arch/powerpc/platforms/pasemi/pci.c +++ b/arch/powerpc/platforms/pasemi/pci.c @@ -193,7 +193,7 @@ static int __init pas_add_bridge(struct device_node *dev) { struct pci_controller *hose; - pr_debug("Adding PCI host bridge %s\n", dev->full_name); + pr_debug("Adding PCI host bridge %pOF\n", dev); hose = pcibios_alloc_controller(dev); if (!hose) diff --git a/arch/powerpc/platforms/powermac/feature.c b/arch/powerpc/platforms/powermac/feature.c index 1e02328c3f2d..9e3f39d36e88 100644 --- a/arch/powerpc/platforms/powermac/feature.c +++ b/arch/powerpc/platforms/powermac/feature.c @@ -2658,25 +2658,25 @@ static void __init probe_one_macio(const char *name, const char *compat, int typ if (i >= MAX_MACIO_CHIPS) { printk(KERN_ERR "pmac_feature: Please increase MAX_MACIO_CHIPS !\n"); - printk(KERN_ERR "pmac_feature: %s skipped\n", node->full_name); + printk(KERN_ERR "pmac_feature: %pOF skipped\n", node); return; } addrp = of_get_pci_address(node, 0, &size, NULL); if (addrp == NULL) { - printk(KERN_ERR "pmac_feature: %s: can't find base !\n", - node->full_name); + printk(KERN_ERR "pmac_feature: %pOF: can't find base !\n", + node); return; } addr = of_translate_address(node, addrp); if (addr == 0) { - printk(KERN_ERR "pmac_feature: %s, can't translate base !\n", - node->full_name); + printk(KERN_ERR "pmac_feature: %pOF, can't translate base !\n", + node); return; } base = ioremap(addr, (unsigned long)size); if (!base) { - printk(KERN_ERR "pmac_feature: %s, can't map mac-io chip !\n", - node->full_name); + printk(KERN_ERR "pmac_feature: %pOF, can't map mac-io chip !\n", + node); return; } if (type == macio_keylargo || type == macio_keylargo2) { diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c index f627c9fd7b48..70183eb3d5c8 100644 --- a/arch/powerpc/platforms/powermac/low_i2c.c +++ b/arch/powerpc/platforms/powermac/low_i2c.c @@ -494,8 +494,8 @@ static struct pmac_i2c_host_kw *__init kw_i2c_host_init(struct device_node *np) host = kzalloc(sizeof(struct pmac_i2c_host_kw), GFP_KERNEL); if (host == NULL) { - printk(KERN_ERR "low_i2c: Can't allocate host for %s\n", - np->full_name); + printk(KERN_ERR "low_i2c: Can't allocate host for %pOF\n", + np); return NULL; } @@ -505,8 +505,8 @@ static struct pmac_i2c_host_kw *__init kw_i2c_host_init(struct device_node *np) */ addrp = of_get_property(np, "AAPL,address", NULL); if (addrp == NULL) { - printk(KERN_ERR "low_i2c: Can't find address for %s\n", - np->full_name); + printk(KERN_ERR "low_i2c: Can't find address for %pOF\n", + np); kfree(host); return NULL; } @@ -538,13 +538,13 @@ static struct pmac_i2c_host_kw *__init kw_i2c_host_init(struct device_node *np) host->irq = irq_of_parse_and_map(np, 0); if (!host->irq) printk(KERN_WARNING - "low_i2c: Failed to map interrupt for %s\n", - np->full_name); + "low_i2c: Failed to map interrupt for %pOF\n", + np); host->base = ioremap((*addrp), 0x1000); if (host->base == NULL) { - printk(KERN_ERR "low_i2c: Can't map registers for %s\n", - np->full_name); + printk(KERN_ERR "low_i2c: Can't map registers for %pOF\n", + np); kfree(host); return NULL; } @@ -560,8 +560,8 @@ static struct pmac_i2c_host_kw *__init kw_i2c_host_init(struct device_node *np) "keywest i2c", host)) host->irq = 0; - printk(KERN_INFO "KeyWest i2c @0x%08x irq %d %s\n", - *addrp, host->irq, np->full_name); + printk(KERN_INFO "KeyWest i2c @0x%08x irq %d %pOF\n", + *addrp, host->irq, np); return host; } @@ -798,7 +798,7 @@ static void __init pmu_i2c_probe(void) if (busnode == NULL) return; - printk(KERN_INFO "PMU i2c %s\n", busnode->full_name); + printk(KERN_INFO "PMU i2c %pOF\n", busnode); /* * We add bus 1 and 2 only for now, bus 0 is "special" @@ -913,7 +913,7 @@ static void __init smu_i2c_probe(void) if (controller == NULL) return; - printk(KERN_INFO "SMU i2c %s\n", controller->full_name); + printk(KERN_INFO "SMU i2c %pOF\n", controller); /* Look for childs, note that they might not be of the right * type as older device trees mix i2c busses and other things @@ -945,8 +945,8 @@ static void __init smu_i2c_probe(void) bus->flags = 0; list_add(&bus->link, &pmac_i2c_busses); - printk(KERN_INFO " channel %x bus %s\n", - bus->channel, busnode->full_name); + printk(KERN_INFO " channel %x bus %pOF\n", + bus->channel, busnode); } } @@ -1129,7 +1129,7 @@ int pmac_i2c_setmode(struct pmac_i2c_bus *bus, int mode) */ if (mode < pmac_i2c_mode_dumb || mode > pmac_i2c_mode_combined) { printk(KERN_ERR "low_i2c: Invalid mode %d requested on" - " bus %s !\n", mode, bus->busnode->full_name); + " bus %pOF !\n", mode, bus->busnode); return -EINVAL; } bus->mode = mode; @@ -1146,8 +1146,8 @@ int pmac_i2c_xfer(struct pmac_i2c_bus *bus, u8 addrdir, int subsize, WARN_ON(!bus->opened); DBG("xfer() chan=%d, addrdir=0x%x, mode=%d, subsize=%d, subaddr=0x%x," - " %d bytes, bus %s\n", bus->channel, addrdir, bus->mode, subsize, - subaddr, len, bus->busnode->full_name); + " %d bytes, bus %pOF\n", bus->channel, addrdir, bus->mode, subsize, + subaddr, len, bus->busnode); rc = bus->xfer(bus, addrdir, subsize, subaddr, data, len); @@ -1241,13 +1241,13 @@ static void* pmac_i2c_do_begin(struct pmf_function *func, struct pmf_args *args) bus = pmac_i2c_find_bus(func->node); if (bus == NULL) { - printk(KERN_ERR "low_i2c: Can't find bus for %s (pfunc)\n", - func->node->full_name); + printk(KERN_ERR "low_i2c: Can't find bus for %pOF (pfunc)\n", + func->node); return NULL; } if (pmac_i2c_open(bus, 0)) { - printk(KERN_ERR "low_i2c: Can't open i2c bus for %s (pfunc)\n", - func->node->full_name); + printk(KERN_ERR "low_i2c: Can't open i2c bus for %pOF (pfunc)\n", + func->node); return NULL; } @@ -1417,7 +1417,7 @@ static struct pmf_handlers pmac_i2c_pfunc_handlers = { static void __init pmac_i2c_dev_create(struct device_node *np, int quirks) { - DBG("dev_create(%s)\n", np->full_name); + DBG("dev_create(%pOF)\n", np); pmf_register_driver(np, &pmac_i2c_pfunc_handlers, (void *)(long)quirks); @@ -1425,20 +1425,20 @@ static void __init pmac_i2c_dev_create(struct device_node *np, int quirks) static void __init pmac_i2c_dev_init(struct device_node *np, int quirks) { - DBG("dev_create(%s)\n", np->full_name); + DBG("dev_create(%pOF)\n", np); pmf_do_functions(np, NULL, 0, PMF_FLAGS_ON_INIT, NULL); } static void pmac_i2c_dev_suspend(struct device_node *np, int quirks) { - DBG("dev_suspend(%s)\n", np->full_name); + DBG("dev_suspend(%pOF)\n", np); pmf_do_functions(np, NULL, 0, PMF_FLAGS_ON_SLEEP, NULL); } static void pmac_i2c_dev_resume(struct device_node *np, int quirks) { - DBG("dev_resume(%s)\n", np->full_name); + DBG("dev_resume(%pOF)\n", np); pmf_do_functions(np, NULL, 0, PMF_FLAGS_ON_WAKE, NULL); } diff --git a/arch/powerpc/platforms/powermac/pci.c b/arch/powerpc/platforms/powermac/pci.c index 6e06c3be2e9a..0b8174a79993 100644 --- a/arch/powerpc/platforms/powermac/pci.c +++ b/arch/powerpc/platforms/powermac/pci.c @@ -783,7 +783,7 @@ static int __init pmac_add_bridge(struct device_node *dev) const int *bus_range; int primary = 1, has_address = 0; - DBG("Adding PCI host bridge %s\n", dev->full_name); + DBG("Adding PCI host bridge %pOF\n", dev); /* Fetch host bridge registers address */ has_address = (of_address_to_resource(dev, 0, &rsrc) == 0); @@ -791,8 +791,8 @@ static int __init pmac_add_bridge(struct device_node *dev) /* Get bus range if any */ bus_range = of_get_property(dev, "bus-range", &len); if (bus_range == NULL || len < 2 * sizeof(int)) { - printk(KERN_WARNING "Can't get bus-range for %s, assume" - " bus 0\n", dev->full_name); + printk(KERN_WARNING "Can't get bus-range for %pOF, assume" + " bus 0\n", dev); } hose = pcibios_alloc_controller(dev); diff --git a/arch/powerpc/platforms/powermac/pfunc_base.c b/arch/powerpc/platforms/powermac/pfunc_base.c index 459138ed4571..860159d46ab8 100644 --- a/arch/powerpc/platforms/powermac/pfunc_base.c +++ b/arch/powerpc/platforms/powermac/pfunc_base.c @@ -54,8 +54,8 @@ static int macio_do_gpio_write(PMF_STD_ARGS, u8 value, u8 mask) raw_spin_lock_irqsave(&feature_lock, flags); tmp = readb(addr); tmp = (tmp & ~mask) | (value & mask); - DBG("Do write 0x%02x to GPIO %s (%p)\n", - tmp, func->node->full_name, addr); + DBG("Do write 0x%02x to GPIO %pOF (%p)\n", + tmp, func->node, addr); writeb(tmp, addr); raw_spin_unlock_irqrestore(&feature_lock, flags); @@ -107,8 +107,8 @@ static void macio_gpio_init_one(struct macio_chip *macio) if (gparent == NULL) return; - DBG("Installing GPIO functions for macio %s\n", - macio->of_node->full_name); + DBG("Installing GPIO functions for macio %pOF\n", + macio->of_node); /* * Ok, got one, we dont need anything special to track them down, so @@ -129,8 +129,8 @@ static void macio_gpio_init_one(struct macio_chip *macio) pmf_register_driver(gp, &macio_gpio_handlers, (void *)offset); } - DBG("Calling initial GPIO functions for macio %s\n", - macio->of_node->full_name); + DBG("Calling initial GPIO functions for macio %pOF\n", + macio->of_node); /* And now we run all the init ones */ for (gp = NULL; (gp = of_get_next_child(gparent, gp)) != NULL;) @@ -267,8 +267,8 @@ static struct pmf_handlers macio_mmio_handlers = { static void macio_mmio_init_one(struct macio_chip *macio) { - DBG("Installing MMIO functions for macio %s\n", - macio->of_node->full_name); + DBG("Installing MMIO functions for macio %pOF\n", + macio->of_node); pmf_register_driver(macio->of_node, &macio_mmio_handlers, macio); } @@ -298,8 +298,8 @@ static void uninorth_install_pfunc(void) { struct device_node *np; - DBG("Installing functions for UniN %s\n", - uninorth_node->full_name); + DBG("Installing functions for UniN %pOF\n", + uninorth_node); /* * Install handlers for the bridge itself @@ -317,8 +317,8 @@ static void uninorth_install_pfunc(void) break; } if (unin_hwclock) { - DBG("Installing functions for UniN clock %s\n", - unin_hwclock->full_name); + DBG("Installing functions for UniN clock %pOF\n", + unin_hwclock); pmf_register_driver(unin_hwclock, &unin_mmio_handlers, NULL); pmf_do_functions(unin_hwclock, NULL, 0, PMF_FLAGS_ON_INIT, NULL); diff --git a/arch/powerpc/platforms/powermac/pfunc_core.c b/arch/powerpc/platforms/powermac/pfunc_core.c index 695e8c4d4224..df3c93bef228 100644 --- a/arch/powerpc/platforms/powermac/pfunc_core.c +++ b/arch/powerpc/platforms/powermac/pfunc_core.c @@ -708,7 +708,7 @@ int pmf_register_driver(struct device_node *np, if (handlers == NULL) return -EINVAL; - DBG("pmf: registering driver for node %s\n", np->full_name); + DBG("pmf: registering driver for node %pOF\n", np); spin_lock_irqsave(&pmf_lock, flags); dev = pmf_find_device(np); @@ -781,7 +781,7 @@ void pmf_unregister_driver(struct device_node *np) struct pmf_device *dev; unsigned long flags; - DBG("pmf: unregistering driver for node %s\n", np->full_name); + DBG("pmf: unregistering driver for node %pOF\n", np); spin_lock_irqsave(&pmf_lock, flags); dev = pmf_find_device(np); @@ -940,7 +940,7 @@ int pmf_call_one(struct pmf_function *func, struct pmf_args *args) void *instdata = NULL; int rc = 0; - DBG(" ** pmf_call_one(%s/%s) **\n", dev->node->full_name, func->name); + DBG(" ** pmf_call_one(%pOF/%s) **\n", dev->node, func->name); if (dev->handlers->begin) instdata = dev->handlers->begin(func, args); diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index f5f9ad7c3398..5e0719b27294 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -364,8 +364,8 @@ static void __init pmac_pic_probe_oldstyle(void) (addr + 0x10); of_node_put(master); - printk(KERN_INFO "irq: Found primary Apple PIC %s for %d irqs\n", - master->full_name, max_real_irqs); + printk(KERN_INFO "irq: Found primary Apple PIC %pOF for %d irqs\n", + master, max_real_irqs); /* Map interrupts of cascaded controller */ if (slave && !of_address_to_resource(slave, 0, &r)) { @@ -378,8 +378,8 @@ static void __init pmac_pic_probe_oldstyle(void) (addr + 0x10); pmac_irq_cascade = irq_of_parse_and_map(slave, 0); - printk(KERN_INFO "irq: Found slave Apple PIC %s for %d irqs" - " cascade: %d\n", slave->full_name, + printk(KERN_INFO "irq: Found slave Apple PIC %pOF for %d irqs" + " cascade: %d\n", slave, max_irqs - max_real_irqs, pmac_irq_cascade); } of_node_put(slave); diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c index 6b4e9d181126..ab668cb72263 100644 --- a/arch/powerpc/platforms/powermac/setup.c +++ b/arch/powerpc/platforms/powermac/setup.c @@ -556,7 +556,7 @@ static int __init check_pmac_serial_console(void) pr_debug(" can't find stdout package %s !\n", name); return -ENODEV; } - pr_debug("stdout is %s\n", prom_stdout->full_name); + pr_debug("stdout is %pOF\n", prom_stdout); name = of_get_property(prom_stdout, "name", NULL); if (!name) { diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 6a6f4ef46b9e..340cbe263b33 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -30,3 +30,25 @@ config OPAL_PRD help This enables the opal-prd driver, a facility to run processor recovery diagnostics on OpenPower machines + +config PPC_MEMTRACE + bool "Enable removal of RAM from kernel mappings for tracing" + depends on PPC_POWERNV && MEMORY_HOTREMOVE + default n + help + Enabling this option allows for the removal of memory (RAM) + from the kernel mappings to be used for hardware tracing. + +config PPC_VAS + bool "IBM Virtual Accelerator Switchboard (VAS)" + depends on PPC_POWERNV && PPC_64K_PAGES + default y + help + This enables support for IBM Virtual Accelerator Switchboard (VAS). + + VAS allows accelerators in co-processors like NX-GZIP and NX-842 + to be accessible to kernel subsystems and user processes. + + VAS adapters are found in POWER9 based systems. + + If unsure, say N. diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index b5d98cb3f482..37d60f7dd86d 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -2,7 +2,7 @@ obj-y += setup.o opal-wrappers.o opal.o opal-async.o idle.o obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o -obj-y += opal-kmsg.o +obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o @@ -12,3 +12,6 @@ obj-$(CONFIG_PPC_SCOM) += opal-xscom.o obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o obj-$(CONFIG_OPAL_PRD) += opal-prd.o +obj-$(CONFIG_PERF_EVENTS) += opal-imc.o +obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o +obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o diff --git a/arch/powerpc/platforms/powernv/copy-paste.h b/arch/powerpc/platforms/powernv/copy-paste.h new file mode 100644 index 000000000000..c9a503623431 --- /dev/null +++ b/arch/powerpc/platforms/powernv/copy-paste.h @@ -0,0 +1,46 @@ +/* + * Copyright 2016-17 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/ppc-opcode.h> + +#define CR0_SHIFT 28 +#define CR0_MASK 0xF +/* + * Copy/paste instructions: + * + * copy RA,RB + * Copy contents of address (RA) + effective_address(RB) + * to internal copy-buffer. + * + * paste RA,RB + * Paste contents of internal copy-buffer to the address + * (RA) + effective_address(RB) + */ +static inline int vas_copy(void *crb, int offset) +{ + asm volatile(PPC_COPY(%0, %1)";" + : + : "b" (offset), "b" (crb) + : "memory"); + + return 0; +} + +static inline int vas_paste(void *paste_address, int offset) +{ + u32 cr; + + cr = 0; + asm volatile(PPC_PASTE(%1, %2)";" + "mfocrf %0, 0x80;" + : "=r" (cr) + : "b" (offset), "b" (paste_address) + : "memory", "cr0"); + + return (cr >> CR0_SHIFT) & CR0_MASK; +} diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 3f48f6df1cf3..8864065eba22 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -113,7 +113,6 @@ static ssize_t pnv_eeh_ei_write(struct file *filp, size_t count, loff_t *ppos) { struct pci_controller *hose = filp->private_data; - struct eeh_dev *edev; struct eeh_pe *pe; int pe_no, type, func; unsigned long addr, mask; @@ -135,13 +134,7 @@ static ssize_t pnv_eeh_ei_write(struct file *filp, return -EINVAL; /* Retrieve PE */ - edev = kzalloc(sizeof(*edev), GFP_KERNEL); - if (!edev) - return -ENOMEM; - edev->phb = hose; - edev->pe_config_addr = pe_no; - pe = eeh_pe_get(edev); - kfree(edev); + pe = eeh_pe_get(hose, pe_no, 0); if (!pe) return -ENODEV; @@ -359,6 +352,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) struct eeh_dev *edev = pdn_to_eeh_dev(pdn); uint32_t pcie_flags; int ret; + int config_addr = (pdn->busno << 8) | (pdn->devfn); /* * When probing the root bridge, which doesn't have any @@ -393,8 +387,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) } } - edev->config_addr = (pdn->busno << 8) | (pdn->devfn); - edev->pe_config_addr = phb->ioda.pe_rmap[edev->config_addr]; + edev->pe_config_addr = phb->ioda.pe_rmap[config_addr]; /* Create PE */ ret = eeh_add_to_parent_pe(edev); @@ -933,7 +926,6 @@ void pnv_pci_reset_secondary_bus(struct pci_dev *dev) static void pnv_eeh_wait_for_pending(struct pci_dn *pdn, const char *type, int pos, u16 mask) { - struct eeh_dev *edev = pdn_to_eeh_dev(pdn); int i, status = 0; /* Wait for Transaction Pending bit to be cleared */ @@ -947,7 +939,7 @@ static void pnv_eeh_wait_for_pending(struct pci_dn *pdn, const char *type, pr_warn("%s: Pending transaction while issuing %sFLR to %04x:%02x:%02x.%01x\n", __func__, type, - edev->phb->global_number, pdn->busno, + pdn->phb->global_number, pdn->busno, PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); } @@ -1381,7 +1373,6 @@ static int pnv_eeh_get_pe(struct pci_controller *hose, struct pnv_phb *phb = hose->private_data; struct pnv_ioda_pe *pnv_pe; struct eeh_pe *dev_pe; - struct eeh_dev edev; /* * If PHB supports compound PE, to fetch @@ -1397,10 +1388,7 @@ static int pnv_eeh_get_pe(struct pci_controller *hose, } /* Find the PE according to PE# */ - memset(&edev, 0, sizeof(struct eeh_dev)); - edev.phb = hose; - edev.pe_config_addr = pe_no; - dev_pe = eeh_pe_get(&edev); + dev_pe = eeh_pe_get(hose, pe_no, 0); if (!dev_pe) return -EEXIST; @@ -1711,6 +1699,7 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn) struct eeh_dev *edev = pdn_to_eeh_dev(pdn); struct pnv_phb *phb; s64 ret; + int config_addr = (pdn->busno << 8) | (pdn->devfn); if (!edev) return -EEXIST; @@ -1725,14 +1714,14 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn) if (edev->physfn) { ret = pnv_eeh_restore_vf_config(pdn); } else { - phb = edev->phb->private_data; + phb = pdn->phb->private_data; ret = opal_pci_reinit(phb->opal_id, - OPAL_REINIT_PCI_DEV, edev->config_addr); + OPAL_REINIT_PCI_DEV, config_addr); } if (ret) { pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n", - __func__, edev->config_addr, ret); + __func__, config_addr, ret); return -EIO; } diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index a553aeea7af6..9f59041a172b 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -69,7 +69,7 @@ static int pnv_save_sprs_for_deep_states(void) * all cpus at boot. Get these reg values of current cpu and use the * same across all cpus. */ - uint64_t lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1; + uint64_t lpcr_val = mfspr(SPRN_LPCR); uint64_t hid0_val = mfspr(SPRN_HID0); uint64_t hid1_val = mfspr(SPRN_HID1); uint64_t hid4_val = mfspr(SPRN_HID4); @@ -388,6 +388,14 @@ void power9_idle(void) } #ifdef CONFIG_HOTPLUG_CPU +static void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) +{ + u64 pir = get_hard_smp_processor_id(cpu); + + mtspr(SPRN_LPCR, lpcr_val); + opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val); +} + /* * pnv_cpu_offline: A function that puts the CPU into the deepest * available platform idle state on a CPU-Offline. @@ -397,6 +405,20 @@ unsigned long pnv_cpu_offline(unsigned int cpu) { unsigned long srr1; u32 idle_states = pnv_get_supported_cpuidle_states(); + u64 lpcr_val; + + /* + * We don't want to take decrementer interrupts while we are + * offline, so clear LPCR:PECE1. We keep PECE2 (and + * LPCR_PECE_HVEE on P9) enabled as to let IPIs in. + * + * If the CPU gets woken up by a special wakeup, ensure that + * the SLW engine sets LPCR with decrementer bit cleared, else + * the CPU will come back to the kernel due to a spurious + * wakeup. + */ + lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1; + pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); __ppc64_runlatch_off(); @@ -428,6 +450,16 @@ unsigned long pnv_cpu_offline(unsigned int cpu) __ppc64_runlatch_on(); + /* + * Re-enable decrementer interrupts in LPCR. + * + * Further, we want stop states to be woken up by decrementer + * for non-hotplug cases. So program the LPCR via stop api as + * well. + */ + lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1; + pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); + return srr1; } #endif diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c new file mode 100644 index 000000000000..de470caf0784 --- /dev/null +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -0,0 +1,282 @@ +/* + * Copyright (C) IBM Corporation, 2014, 2017 + * Anton Blanchard, Rashmica Gupta. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#define pr_fmt(fmt) "memtrace: " fmt + +#include <linux/bitops.h> +#include <linux/string.h> +#include <linux/memblock.h> +#include <linux/init.h> +#include <linux/moduleparam.h> +#include <linux/fs.h> +#include <linux/debugfs.h> +#include <linux/slab.h> +#include <linux/memory.h> +#include <linux/memory_hotplug.h> +#include <asm/machdep.h> +#include <asm/debugfs.h> + +/* This enables us to keep track of the memory removed from each node. */ +struct memtrace_entry { + void *mem; + u64 start; + u64 size; + u32 nid; + struct dentry *dir; + char name[16]; +}; + +static u64 memtrace_size; + +static struct memtrace_entry *memtrace_array; +static unsigned int memtrace_array_nr; + + +static ssize_t memtrace_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct memtrace_entry *ent = filp->private_data; + + return simple_read_from_buffer(ubuf, count, ppos, ent->mem, ent->size); +} + +static bool valid_memtrace_range(struct memtrace_entry *dev, + unsigned long start, unsigned long size) +{ + if ((start >= dev->start) && + ((start + size) <= (dev->start + dev->size))) + return true; + + return false; +} + +static int memtrace_mmap(struct file *filp, struct vm_area_struct *vma) +{ + unsigned long size = vma->vm_end - vma->vm_start; + struct memtrace_entry *dev = filp->private_data; + + if (!valid_memtrace_range(dev, vma->vm_pgoff << PAGE_SHIFT, size)) + return -EINVAL; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + if (remap_pfn_range(vma, vma->vm_start, + vma->vm_pgoff + (dev->start >> PAGE_SHIFT), + size, vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +static const struct file_operations memtrace_fops = { + .llseek = default_llseek, + .read = memtrace_read, + .mmap = memtrace_mmap, + .open = simple_open, +}; + +static void flush_memory_region(u64 base, u64 size) +{ + unsigned long line_size = ppc64_caches.l1d.size; + u64 end = base + size; + u64 addr; + + base = round_down(base, line_size); + end = round_up(end, line_size); + + for (addr = base; addr < end; addr += line_size) + asm volatile("dcbf 0,%0" : "=r" (addr) :: "memory"); +} + +static int check_memblock_online(struct memory_block *mem, void *arg) +{ + if (mem->state != MEM_ONLINE) + return -1; + + return 0; +} + +static int change_memblock_state(struct memory_block *mem, void *arg) +{ + unsigned long state = (unsigned long)arg; + + mem->state = state; + + return 0; +} + +static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages) +{ + u64 end_pfn = start_pfn + nr_pages - 1; + + if (walk_memory_range(start_pfn, end_pfn, NULL, + check_memblock_online)) + return false; + + walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE, + change_memblock_state); + + if (offline_pages(start_pfn, nr_pages)) { + walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE, + change_memblock_state); + return false; + } + + walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE, + change_memblock_state); + + /* RCU grace period? */ + flush_memory_region((u64)__va(start_pfn << PAGE_SHIFT), + nr_pages << PAGE_SHIFT); + + lock_device_hotplug(); + remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT); + unlock_device_hotplug(); + + return true; +} + +static u64 memtrace_alloc_node(u32 nid, u64 size) +{ + u64 start_pfn, end_pfn, nr_pages; + u64 base_pfn; + + if (!NODE_DATA(nid) || !node_spanned_pages(nid)) + return 0; + + start_pfn = node_start_pfn(nid); + end_pfn = node_end_pfn(nid); + nr_pages = size >> PAGE_SHIFT; + + /* Trace memory needs to be aligned to the size */ + end_pfn = round_down(end_pfn - nr_pages, nr_pages); + + for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) { + if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) + return base_pfn << PAGE_SHIFT; + } + + return 0; +} + +static int memtrace_init_regions_runtime(u64 size) +{ + u32 nid; + u64 m; + + memtrace_array = kcalloc(num_online_nodes(), + sizeof(struct memtrace_entry), GFP_KERNEL); + if (!memtrace_array) { + pr_err("Failed to allocate memtrace_array\n"); + return -EINVAL; + } + + for_each_online_node(nid) { + m = memtrace_alloc_node(nid, size); + + /* + * A node might not have any local memory, so warn but + * continue on. + */ + if (!m) { + pr_err("Failed to allocate trace memory on node %d\n", nid); + continue; + } + + pr_info("Allocated trace memory on node %d at 0x%016llx\n", nid, m); + + memtrace_array[memtrace_array_nr].start = m; + memtrace_array[memtrace_array_nr].size = size; + memtrace_array[memtrace_array_nr].nid = nid; + memtrace_array_nr++; + } + + return 0; +} + +static struct dentry *memtrace_debugfs_dir; + +static int memtrace_init_debugfs(void) +{ + int ret = 0; + int i; + + for (i = 0; i < memtrace_array_nr; i++) { + struct dentry *dir; + struct memtrace_entry *ent = &memtrace_array[i]; + + ent->mem = ioremap(ent->start, ent->size); + /* Warn but continue on */ + if (!ent->mem) { + pr_err("Failed to map trace memory at 0x%llx\n", + ent->start); + ret = -1; + continue; + } + + snprintf(ent->name, 16, "%08x", ent->nid); + dir = debugfs_create_dir(ent->name, memtrace_debugfs_dir); + if (!dir) + return -1; + + ent->dir = dir; + debugfs_create_file("trace", 0400, dir, ent, &memtrace_fops); + debugfs_create_x64("start", 0400, dir, &ent->start); + debugfs_create_x64("size", 0400, dir, &ent->size); + } + + return ret; +} + +static int memtrace_enable_set(void *data, u64 val) +{ + if (memtrace_size) + return -EINVAL; + + if (!val) + return -EINVAL; + + /* Make sure size is aligned to a memory block */ + if (val & (memory_block_size_bytes() - 1)) + return -EINVAL; + + if (memtrace_init_regions_runtime(val)) + return -EINVAL; + + if (memtrace_init_debugfs()) + return -EINVAL; + + memtrace_size = val; + + return 0; +} + +static int memtrace_enable_get(void *data, u64 *val) +{ + *val = memtrace_size; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(memtrace_init_fops, memtrace_enable_get, + memtrace_enable_set, "0x%016llx\n"); + +static int memtrace_init(void) +{ + memtrace_debugfs_dir = debugfs_create_dir("memtrace", + powerpc_debugfs_root); + if (!memtrace_debugfs_dir) + return -1; + + debugfs_create_file("enable", 0600, memtrace_debugfs_dir, + NULL, &memtrace_init_fops); + + return 0; +} +machine_device_initcall(powernv, memtrace_init); diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 4c7b8591f737..2cb6cbea4b3b 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -546,6 +546,12 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, unsigned long pid = npu_context->mm->context.id; /* + * Unfortunately the nest mmu does not support flushing specific + * addresses so we have to flush the whole mm. + */ + flush_tlb_mm(npu_context->mm); + + /* * Loop over all the NPUs this process is active on and launch * an invalidate. */ @@ -576,12 +582,6 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, } } - /* - * Unfortunately the nest mmu does not support flushing specific - * addresses so we have to flush the whole mm. - */ - flush_tlb_mm(npu_context->mm); - mmio_invalidate_wait(mmio_atsd_reg, flush); if (flush) /* Wait for the flush to complete */ diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index 83bebeec0fea..cf33769a7b72 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -171,8 +171,8 @@ int __init opal_async_comp_init(void) async = of_get_property(opal_node, "opal-msg-async-num", NULL); if (!async) { - pr_err("%s: %s has no opal-msg-async-num\n", - __func__, opal_node->full_name); + pr_err("%s: %pOF has no opal-msg-async-num\n", + __func__, opal_node); err = -ENOENT; goto out_opal_node; } diff --git a/arch/powerpc/platforms/powernv/opal-flash.c b/arch/powerpc/platforms/powernv/opal-flash.c index 4ec6219287fc..2fa3ac80cb4e 100644 --- a/arch/powerpc/platforms/powernv/opal-flash.c +++ b/arch/powerpc/platforms/powernv/opal-flash.c @@ -520,7 +520,7 @@ out: * update_flash : Flash new firmware image * */ -static struct bin_attribute image_data_attr = { +static const struct bin_attribute image_data_attr = { .attr = {.name = "image", .mode = 0200}, .size = MAX_IMAGE_SIZE, /* Limit image size */ .write = image_data_write, diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c index 88f3c61eec95..d78fed728cdf 100644 --- a/arch/powerpc/platforms/powernv/opal-hmi.c +++ b/arch/powerpc/platforms/powernv/opal-hmi.c @@ -30,6 +30,8 @@ #include <asm/cputable.h> #include <asm/machdep.h> +#include "powernv.h" + static int opal_hmi_handler_nb_init; struct OpalHmiEvtNode { struct list_head list; @@ -267,8 +269,6 @@ static void hmi_event_handler(struct work_struct *work) spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); if (unrecoverable) { - int ret; - /* Pull all HMI events from OPAL before we panic. */ while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) { u32 type; @@ -284,23 +284,7 @@ static void hmi_event_handler(struct work_struct *work) print_hmi_event_info(hmi_evt); } - /* - * Unrecoverable HMI exception. We need to inform BMC/OCC - * about this error so that it can collect relevant data - * for error analysis before rebooting. - */ - ret = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, - "Unrecoverable HMI exception"); - if (ret == OPAL_UNSUPPORTED) { - pr_emerg("Reboot type %d not supported\n", - OPAL_REBOOT_PLATFORM_ERROR); - } - - /* - * Fall through and panic if opal_cec_reboot2() returns - * OPAL_UNSUPPORTED. - */ - panic("Unrecoverable HMI exception"); + pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception"); } } diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c new file mode 100644 index 000000000000..21f6531fae20 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -0,0 +1,226 @@ +/* + * OPAL IMC interface detection driver + * Supported on POWERNV platform + * + * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation. + * (C) 2017 Anju T Sudhakar, IBM Corporation. + * (C) 2017 Hemant K Shaw, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or later version. + */ +#include <linux/kernel.h> +#include <linux/platform_device.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/of_platform.h> +#include <linux/crash_dump.h> +#include <asm/opal.h> +#include <asm/io.h> +#include <asm/imc-pmu.h> +#include <asm/cputhreads.h> + +/* + * imc_get_mem_addr_nest: Function to get nest counter memory region + * for each chip + */ +static int imc_get_mem_addr_nest(struct device_node *node, + struct imc_pmu *pmu_ptr, + u32 offset) +{ + int nr_chips = 0, i; + u64 *base_addr_arr, baddr; + u32 *chipid_arr; + + nr_chips = of_property_count_u32_elems(node, "chip-id"); + if (nr_chips <= 0) + return -ENODEV; + + base_addr_arr = kcalloc(nr_chips, sizeof(u64), GFP_KERNEL); + if (!base_addr_arr) + return -ENOMEM; + + chipid_arr = kcalloc(nr_chips, sizeof(u32), GFP_KERNEL); + if (!chipid_arr) + return -ENOMEM; + + if (of_property_read_u32_array(node, "chip-id", chipid_arr, nr_chips)) + goto error; + + if (of_property_read_u64_array(node, "base-addr", base_addr_arr, + nr_chips)) + goto error; + + pmu_ptr->mem_info = kcalloc(nr_chips, sizeof(struct imc_mem_info), + GFP_KERNEL); + if (!pmu_ptr->mem_info) + goto error; + + for (i = 0; i < nr_chips; i++) { + pmu_ptr->mem_info[i].id = chipid_arr[i]; + baddr = base_addr_arr[i] + offset; + pmu_ptr->mem_info[i].vbase = phys_to_virt(baddr); + } + + pmu_ptr->imc_counter_mmaped = true; + kfree(base_addr_arr); + kfree(chipid_arr); + return 0; + +error: + kfree(pmu_ptr->mem_info); + kfree(base_addr_arr); + kfree(chipid_arr); + return -1; +} + +/* + * imc_pmu_create : Takes the parent device which is the pmu unit, pmu_index + * and domain as the inputs. + * Allocates memory for the struct imc_pmu, sets up its domain, size and offsets + */ +static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain) +{ + int ret = 0; + struct imc_pmu *pmu_ptr; + u32 offset; + + /* memory for pmu */ + pmu_ptr = kzalloc(sizeof(struct imc_pmu), GFP_KERNEL); + if (!pmu_ptr) + return -ENOMEM; + + /* Set the domain */ + pmu_ptr->domain = domain; + + ret = of_property_read_u32(parent, "size", &pmu_ptr->counter_mem_size); + if (ret) { + ret = -EINVAL; + goto free_pmu; + } + + if (!of_property_read_u32(parent, "offset", &offset)) { + if (imc_get_mem_addr_nest(parent, pmu_ptr, offset)) { + ret = -EINVAL; + goto free_pmu; + } + } + + /* Function to register IMC pmu */ + ret = init_imc_pmu(parent, pmu_ptr, pmu_index); + if (ret) + pr_err("IMC PMU %s Register failed\n", pmu_ptr->pmu.name); + + return 0; + +free_pmu: + kfree(pmu_ptr); + return ret; +} + +static void disable_nest_pmu_counters(void) +{ + int nid, cpu; + const struct cpumask *l_cpumask; + + get_online_cpus(); + for_each_online_node(nid) { + l_cpumask = cpumask_of_node(nid); + cpu = cpumask_first(l_cpumask); + opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, + get_hard_smp_processor_id(cpu)); + } + put_online_cpus(); +} + +static void disable_core_pmu_counters(void) +{ + cpumask_t cores_map; + int cpu, rc; + + get_online_cpus(); + /* Disable the IMC Core functions */ + cores_map = cpu_online_cores_map(); + for_each_cpu(cpu, &cores_map) { + rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, + get_hard_smp_processor_id(cpu)); + if (rc) + pr_err("%s: Failed to stop Core (cpu = %d)\n", + __FUNCTION__, cpu); + } + put_online_cpus(); +} + +static int opal_imc_counters_probe(struct platform_device *pdev) +{ + struct device_node *imc_dev = pdev->dev.of_node; + int pmu_count = 0, domain; + u32 type; + + /* + * Check whether this is kdump kernel. If yes, force the engines to + * stop and return. + */ + if (is_kdump_kernel()) { + disable_nest_pmu_counters(); + disable_core_pmu_counters(); + return -ENODEV; + } + + for_each_compatible_node(imc_dev, NULL, IMC_DTB_UNIT_COMPAT) { + if (of_property_read_u32(imc_dev, "type", &type)) { + pr_warn("IMC Device without type property\n"); + continue; + } + + switch (type) { + case IMC_TYPE_CHIP: + domain = IMC_DOMAIN_NEST; + break; + case IMC_TYPE_CORE: + domain =IMC_DOMAIN_CORE; + break; + case IMC_TYPE_THREAD: + domain = IMC_DOMAIN_THREAD; + break; + default: + pr_warn("IMC Unknown Device type \n"); + domain = -1; + break; + } + + if (!imc_pmu_create(imc_dev, pmu_count, domain)) + pmu_count++; + } + + return 0; +} + +static void opal_imc_counters_shutdown(struct platform_device *pdev) +{ + /* + * Function only stops the engines which is bare minimum. + * TODO: Need to handle proper memory cleanup and pmu + * unregister. + */ + disable_nest_pmu_counters(); + disable_core_pmu_counters(); +} + +static const struct of_device_id opal_imc_match[] = { + { .compatible = IMC_DTB_COMPAT }, + {}, +}; + +static struct platform_driver opal_imc_driver = { + .driver = { + .name = "opal-imc-counters", + .of_match_table = opal_imc_match, + }, + .probe = opal_imc_counters_probe, + .shutdown = opal_imc_counters_shutdown, +}; + +builtin_platform_driver(opal_imc_driver); diff --git a/arch/powerpc/platforms/powernv/opal-powercap.c b/arch/powerpc/platforms/powernv/opal-powercap.c new file mode 100644 index 000000000000..badb29bde93f --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-powercap.c @@ -0,0 +1,244 @@ +/* + * PowerNV OPAL Powercap interface + * + * Copyright 2017 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "opal-powercap: " fmt + +#include <linux/of.h> +#include <linux/kobject.h> +#include <linux/slab.h> + +#include <asm/opal.h> + +DEFINE_MUTEX(powercap_mutex); + +static struct kobject *powercap_kobj; + +struct powercap_attr { + u32 handle; + struct kobj_attribute attr; +}; + +static struct pcap { + struct attribute_group pg; + struct powercap_attr *pattrs; +} *pcaps; + +static ssize_t powercap_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct powercap_attr *pcap_attr = container_of(attr, + struct powercap_attr, attr); + struct opal_msg msg; + u32 pcap; + int ret, token; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + pr_devel("Failed to get token\n"); + return token; + } + + ret = mutex_lock_interruptible(&powercap_mutex); + if (ret) + goto out_token; + + ret = opal_get_powercap(pcap_attr->handle, token, (u32 *)__pa(&pcap)); + switch (ret) { + case OPAL_ASYNC_COMPLETION: + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_devel("Failed to wait for the async response\n"); + ret = -EIO; + goto out; + } + ret = opal_error_code(opal_get_async_rc(msg)); + if (!ret) { + ret = sprintf(buf, "%u\n", be32_to_cpu(pcap)); + if (ret < 0) + ret = -EIO; + } + break; + case OPAL_SUCCESS: + ret = sprintf(buf, "%u\n", be32_to_cpu(pcap)); + if (ret < 0) + ret = -EIO; + break; + default: + ret = opal_error_code(ret); + } + +out: + mutex_unlock(&powercap_mutex); +out_token: + opal_async_release_token(token); + return ret; +} + +static ssize_t powercap_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, + size_t count) +{ + struct powercap_attr *pcap_attr = container_of(attr, + struct powercap_attr, attr); + struct opal_msg msg; + u32 pcap; + int ret, token; + + ret = kstrtoint(buf, 0, &pcap); + if (ret) + return ret; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + pr_devel("Failed to get token\n"); + return token; + } + + ret = mutex_lock_interruptible(&powercap_mutex); + if (ret) + goto out_token; + + ret = opal_set_powercap(pcap_attr->handle, token, pcap); + switch (ret) { + case OPAL_ASYNC_COMPLETION: + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_devel("Failed to wait for the async response\n"); + ret = -EIO; + goto out; + } + ret = opal_error_code(opal_get_async_rc(msg)); + if (!ret) + ret = count; + break; + case OPAL_SUCCESS: + ret = count; + break; + default: + ret = opal_error_code(ret); + } + +out: + mutex_unlock(&powercap_mutex); +out_token: + opal_async_release_token(token); + return ret; +} + +static void powercap_add_attr(int handle, const char *name, + struct powercap_attr *attr) +{ + attr->handle = handle; + sysfs_attr_init(&attr->attr.attr); + attr->attr.attr.name = name; + attr->attr.attr.mode = 0444; + attr->attr.show = powercap_show; +} + +void __init opal_powercap_init(void) +{ + struct device_node *powercap, *node; + int i = 0; + + powercap = of_find_compatible_node(NULL, NULL, "ibm,opal-powercap"); + if (!powercap) { + pr_devel("Powercap node not found\n"); + return; + } + + pcaps = kcalloc(of_get_child_count(powercap), sizeof(*pcaps), + GFP_KERNEL); + if (!pcaps) + return; + + powercap_kobj = kobject_create_and_add("powercap", opal_kobj); + if (!powercap_kobj) { + pr_warn("Failed to create powercap kobject\n"); + goto out_pcaps; + } + + i = 0; + for_each_child_of_node(powercap, node) { + u32 cur, min, max; + int j = 0; + bool has_cur = false, has_min = false, has_max = false; + + if (!of_property_read_u32(node, "powercap-min", &min)) { + j++; + has_min = true; + } + + if (!of_property_read_u32(node, "powercap-max", &max)) { + j++; + has_max = true; + } + + if (!of_property_read_u32(node, "powercap-current", &cur)) { + j++; + has_cur = true; + } + + pcaps[i].pattrs = kcalloc(j, sizeof(struct powercap_attr), + GFP_KERNEL); + if (!pcaps[i].pattrs) + goto out_pcaps_pattrs; + + pcaps[i].pg.attrs = kcalloc(j + 1, sizeof(struct attribute *), + GFP_KERNEL); + if (!pcaps[i].pg.attrs) { + kfree(pcaps[i].pattrs); + goto out_pcaps_pattrs; + } + + j = 0; + pcaps[i].pg.name = node->name; + if (has_min) { + powercap_add_attr(min, "powercap-min", + &pcaps[i].pattrs[j]); + pcaps[i].pg.attrs[j] = &pcaps[i].pattrs[j].attr.attr; + j++; + } + + if (has_max) { + powercap_add_attr(max, "powercap-max", + &pcaps[i].pattrs[j]); + pcaps[i].pg.attrs[j] = &pcaps[i].pattrs[j].attr.attr; + j++; + } + + if (has_cur) { + powercap_add_attr(cur, "powercap-current", + &pcaps[i].pattrs[j]); + pcaps[i].pattrs[j].attr.attr.mode |= 0220; + pcaps[i].pattrs[j].attr.store = powercap_store; + pcaps[i].pg.attrs[j] = &pcaps[i].pattrs[j].attr.attr; + j++; + } + + if (sysfs_create_group(powercap_kobj, &pcaps[i].pg)) { + pr_warn("Failed to create powercap attribute group %s\n", + pcaps[i].pg.name); + goto out_pcaps_pattrs; + } + i++; + } + + return; + +out_pcaps_pattrs: + while (--i >= 0) { + kfree(pcaps[i].pattrs); + kfree(pcaps[i].pg.attrs); + } + kobject_put(powercap_kobj); +out_pcaps: + kfree(pcaps); +} diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c index 2d6ee1c5ad85..de4dd09f4a15 100644 --- a/arch/powerpc/platforms/powernv/opal-prd.c +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -241,15 +241,9 @@ static ssize_t opal_prd_write(struct file *file, const char __user *buf, size = be16_to_cpu(hdr.size); - msg = kmalloc(size, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - rc = copy_from_user(msg, buf, size); - if (rc) { - size = -EFAULT; - goto out_free; - } + msg = memdup_user(buf, size); + if (IS_ERR(msg)) + return PTR_ERR(msg); rc = opal_prd_msg(msg); if (rc) { @@ -257,7 +251,6 @@ static ssize_t opal_prd_write(struct file *file, const char __user *buf, size = -EIO; } -out_free: kfree(msg); return size; diff --git a/arch/powerpc/platforms/powernv/opal-psr.c b/arch/powerpc/platforms/powernv/opal-psr.c new file mode 100644 index 000000000000..7313b7fc9071 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-psr.c @@ -0,0 +1,175 @@ +/* + * PowerNV OPAL Power-Shift-Ratio interface + * + * Copyright 2017 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "opal-psr: " fmt + +#include <linux/of.h> +#include <linux/kobject.h> +#include <linux/slab.h> + +#include <asm/opal.h> + +DEFINE_MUTEX(psr_mutex); + +static struct kobject *psr_kobj; + +struct psr_attr { + u32 handle; + struct kobj_attribute attr; +} *psr_attrs; + +static ssize_t psr_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct psr_attr *psr_attr = container_of(attr, struct psr_attr, attr); + struct opal_msg msg; + int psr, ret, token; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + pr_devel("Failed to get token\n"); + return token; + } + + ret = mutex_lock_interruptible(&psr_mutex); + if (ret) + goto out_token; + + ret = opal_get_power_shift_ratio(psr_attr->handle, token, + (u32 *)__pa(&psr)); + switch (ret) { + case OPAL_ASYNC_COMPLETION: + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_devel("Failed to wait for the async response\n"); + ret = -EIO; + goto out; + } + ret = opal_error_code(opal_get_async_rc(msg)); + if (!ret) { + ret = sprintf(buf, "%u\n", be32_to_cpu(psr)); + if (ret < 0) + ret = -EIO; + } + break; + case OPAL_SUCCESS: + ret = sprintf(buf, "%u\n", be32_to_cpu(psr)); + if (ret < 0) + ret = -EIO; + break; + default: + ret = opal_error_code(ret); + } + +out: + mutex_unlock(&psr_mutex); +out_token: + opal_async_release_token(token); + return ret; +} + +static ssize_t psr_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct psr_attr *psr_attr = container_of(attr, struct psr_attr, attr); + struct opal_msg msg; + int psr, ret, token; + + ret = kstrtoint(buf, 0, &psr); + if (ret) + return ret; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + pr_devel("Failed to get token\n"); + return token; + } + + ret = mutex_lock_interruptible(&psr_mutex); + if (ret) + goto out_token; + + ret = opal_set_power_shift_ratio(psr_attr->handle, token, psr); + switch (ret) { + case OPAL_ASYNC_COMPLETION: + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_devel("Failed to wait for the async response\n"); + ret = -EIO; + goto out; + } + ret = opal_error_code(opal_get_async_rc(msg)); + if (!ret) + ret = count; + break; + case OPAL_SUCCESS: + ret = count; + break; + default: + ret = opal_error_code(ret); + } + +out: + mutex_unlock(&psr_mutex); +out_token: + opal_async_release_token(token); + return ret; +} + +void __init opal_psr_init(void) +{ + struct device_node *psr, *node; + int i = 0; + + psr = of_find_compatible_node(NULL, NULL, + "ibm,opal-power-shift-ratio"); + if (!psr) { + pr_devel("Power-shift-ratio node not found\n"); + return; + } + + psr_attrs = kcalloc(of_get_child_count(psr), sizeof(struct psr_attr), + GFP_KERNEL); + if (!psr_attrs) + return; + + psr_kobj = kobject_create_and_add("psr", opal_kobj); + if (!psr_kobj) { + pr_warn("Failed to create psr kobject\n"); + goto out; + } + + for_each_child_of_node(psr, node) { + if (of_property_read_u32(node, "handle", + &psr_attrs[i].handle)) + goto out_kobj; + + sysfs_attr_init(&psr_attrs[i].attr.attr); + if (of_property_read_string(node, "label", + &psr_attrs[i].attr.attr.name)) + goto out_kobj; + psr_attrs[i].attr.attr.mode = 0664; + psr_attrs[i].attr.show = psr_show; + psr_attrs[i].attr.store = psr_store; + if (sysfs_create_file(psr_kobj, &psr_attrs[i].attr.attr)) { + pr_devel("Failed to create psr sysfs file %s\n", + psr_attrs[i].attr.attr.name); + goto out_kobj; + } + i++; + } + + return; +out_kobj: + kobject_put(psr_kobj); +out: + kfree(psr_attrs); +} diff --git a/arch/powerpc/platforms/powernv/opal-sensor-groups.c b/arch/powerpc/platforms/powernv/opal-sensor-groups.c new file mode 100644 index 000000000000..7e5a235ebf76 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-sensor-groups.c @@ -0,0 +1,212 @@ +/* + * PowerNV OPAL Sensor-groups interface + * + * Copyright 2017 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "opal-sensor-groups: " fmt + +#include <linux/of.h> +#include <linux/kobject.h> +#include <linux/slab.h> + +#include <asm/opal.h> + +DEFINE_MUTEX(sg_mutex); + +static struct kobject *sg_kobj; + +struct sg_attr { + u32 handle; + struct kobj_attribute attr; +}; + +static struct sensor_group { + char name[20]; + struct attribute_group sg; + struct sg_attr *sgattrs; +} *sgs; + +static ssize_t sg_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct sg_attr *sattr = container_of(attr, struct sg_attr, attr); + struct opal_msg msg; + u32 data; + int ret, token; + + ret = kstrtoint(buf, 0, &data); + if (ret) + return ret; + + if (data != 1) + return -EINVAL; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + pr_devel("Failed to get token\n"); + return token; + } + + ret = mutex_lock_interruptible(&sg_mutex); + if (ret) + goto out_token; + + ret = opal_sensor_group_clear(sattr->handle, token); + switch (ret) { + case OPAL_ASYNC_COMPLETION: + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_devel("Failed to wait for the async response\n"); + ret = -EIO; + goto out; + } + ret = opal_error_code(opal_get_async_rc(msg)); + if (!ret) + ret = count; + break; + case OPAL_SUCCESS: + ret = count; + break; + default: + ret = opal_error_code(ret); + } + +out: + mutex_unlock(&sg_mutex); +out_token: + opal_async_release_token(token); + return ret; +} + +static struct sg_ops_info { + int opal_no; + const char *attr_name; + ssize_t (*store)(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count); +} ops_info[] = { + { OPAL_SENSOR_GROUP_CLEAR, "clear", sg_store }, +}; + +static void add_attr(int handle, struct sg_attr *attr, int index) +{ + attr->handle = handle; + sysfs_attr_init(&attr->attr.attr); + attr->attr.attr.name = ops_info[index].attr_name; + attr->attr.attr.mode = 0220; + attr->attr.store = ops_info[index].store; +} + +static int add_attr_group(const __be32 *ops, int len, struct sensor_group *sg, + u32 handle) +{ + int i, j; + int count = 0; + + for (i = 0; i < len; i++) + for (j = 0; j < ARRAY_SIZE(ops_info); j++) + if (be32_to_cpu(ops[i]) == ops_info[j].opal_no) { + add_attr(handle, &sg->sgattrs[count], j); + sg->sg.attrs[count] = + &sg->sgattrs[count].attr.attr; + count++; + } + + return sysfs_create_group(sg_kobj, &sg->sg); +} + +static int get_nr_attrs(const __be32 *ops, int len) +{ + int i, j; + int nr_attrs = 0; + + for (i = 0; i < len; i++) + for (j = 0; j < ARRAY_SIZE(ops_info); j++) + if (be32_to_cpu(ops[i]) == ops_info[j].opal_no) + nr_attrs++; + + return nr_attrs; +} + +void __init opal_sensor_groups_init(void) +{ + struct device_node *sg, *node; + int i = 0; + + sg = of_find_compatible_node(NULL, NULL, "ibm,opal-sensor-group"); + if (!sg) { + pr_devel("Sensor groups node not found\n"); + return; + } + + sgs = kcalloc(of_get_child_count(sg), sizeof(*sgs), GFP_KERNEL); + if (!sgs) + return; + + sg_kobj = kobject_create_and_add("sensor_groups", opal_kobj); + if (!sg_kobj) { + pr_warn("Failed to create sensor group kobject\n"); + goto out_sgs; + } + + for_each_child_of_node(sg, node) { + const __be32 *ops; + u32 sgid, len, nr_attrs, chipid; + + ops = of_get_property(node, "ops", &len); + if (!ops) + continue; + + nr_attrs = get_nr_attrs(ops, len); + if (!nr_attrs) + continue; + + sgs[i].sgattrs = kcalloc(nr_attrs, sizeof(struct sg_attr), + GFP_KERNEL); + if (!sgs[i].sgattrs) + goto out_sgs_sgattrs; + + sgs[i].sg.attrs = kcalloc(nr_attrs + 1, + sizeof(struct attribute *), + GFP_KERNEL); + + if (!sgs[i].sg.attrs) { + kfree(sgs[i].sgattrs); + goto out_sgs_sgattrs; + } + + if (of_property_read_u32(node, "sensor-group-id", &sgid)) { + pr_warn("sensor-group-id property not found\n"); + goto out_sgs_sgattrs; + } + + if (!of_property_read_u32(node, "ibm,chip-id", &chipid)) + sprintf(sgs[i].name, "%s%d", node->name, chipid); + else + sprintf(sgs[i].name, "%s", node->name); + + sgs[i].sg.name = sgs[i].name; + if (add_attr_group(ops, len, &sgs[i], sgid)) { + pr_warn("Failed to create sensor attribute group %s\n", + sgs[i].sg.name); + goto out_sgs_sgattrs; + } + i++; + } + + return; + +out_sgs_sgattrs: + while (--i >= 0) { + kfree(sgs[i].sgattrs); + kfree(sgs[i].sg.attrs); + } + kobject_put(sg_kobj); +out_sgs: + kfree(sgs); +} diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 4ca6c26a56d5..8c1ede2d3f7e 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -27,7 +27,7 @@ .globl opal_tracepoint_refcount opal_tracepoint_refcount: - .llong 0 + .8byte 0 .section ".text" @@ -310,3 +310,12 @@ OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); +OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT); +OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START); +OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP); +OPAL_CALL(opal_pci_set_p2p, OPAL_PCI_SET_P2P); +OPAL_CALL(opal_get_powercap, OPAL_GET_POWERCAP); +OPAL_CALL(opal_set_powercap, OPAL_SET_POWERCAP); +OPAL_CALL(opal_get_power_shift_ratio, OPAL_GET_POWER_SHIFT_RATIO); +OPAL_CALL(opal_set_power_shift_ratio, OPAL_SET_POWER_SHIFT_RATIO); +OPAL_CALL(opal_sensor_group_clear, OPAL_SENSOR_GROUP_CLEAR); diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index 28651fb25417..81c0a943dea9 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -36,14 +36,14 @@ static scom_map_t opal_scom_map(struct device_node *dev, u64 reg, u64 count) const __be32 *gcid; if (!of_get_property(dev, "scom-controller", NULL)) { - pr_err("%s: device %s is not a SCOM controller\n", - __func__, dev->full_name); + pr_err("%s: device %pOF is not a SCOM controller\n", + __func__, dev); return SCOM_MAP_INVALID; } gcid = of_get_property(dev, "ibm,chip-id", NULL); if (!gcid) { - pr_err("%s: device %s has no ibm,chip-id\n", - __func__, dev->full_name); + pr_err("%s: device %pOF has no ibm,chip-id\n", + __func__, dev); return SCOM_MAP_INVALID; } m = kmalloc(sizeof(struct opal_scom_map), GFP_KERNEL); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index cad6b57ce494..65c79ecf5a4d 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -16,6 +16,7 @@ #include <linux/of.h> #include <linux/of_fdt.h> #include <linux/of_platform.h> +#include <linux/of_address.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/slab.h> @@ -25,11 +26,17 @@ #include <linux/memblock.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/printk.h> +#include <linux/kmsg_dump.h> +#include <linux/console.h> +#include <linux/sched/debug.h> #include <asm/machdep.h> #include <asm/opal.h> #include <asm/firmware.h> #include <asm/mce.h> +#include <asm/imc-pmu.h> +#include <asm/bug.h> #include "powernv.h" @@ -162,12 +169,9 @@ int __init early_init_dt_scan_recoverable_ranges(unsigned long node, sizeof(struct mcheck_recoverable_range); /* - * Allocate a buffer to hold the MC recoverable ranges. We would be - * accessing them in real mode, hence it needs to be within - * RMO region. + * Allocate a buffer to hold the MC recoverable ranges. */ - mc_recoverable_range =__va(memblock_alloc_base(size, __alignof__(u64), - ppc64_rma_size)); + mc_recoverable_range =__va(memblock_alloc(size, __alignof__(u64))); memset(mc_recoverable_range, 0, size); for (i = 0; i < mc_recoverable_range_len; i++) { @@ -422,24 +426,88 @@ static int opal_recover_mce(struct pt_regs *regs, /* Fatal machine check */ pr_err("Machine check interrupt is fatal\n"); recovered = 0; - } else if ((evt->severity == MCE_SEV_ERROR_SYNC) && - (user_mode(regs) && !is_global_init(current))) { + } + + if (!recovered && evt->severity == MCE_SEV_ERROR_SYNC) { /* - * For now, kill the task if we have received exception when - * in userspace. + * Try to kill processes if we get a synchronous machine check + * (e.g., one caused by execution of this instruction). This + * will devolve into a panic if we try to kill init or are in + * an interrupt etc. * * TODO: Queue up this address for hwpoisioning later. + * TODO: This is not quite right for d-side machine + * checks ->nip is not necessarily the important + * address. */ - _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); - recovered = 1; + if ((user_mode(regs))) { + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); + recovered = 1; + } else if (die_will_crash()) { + /* + * die() would kill the kernel, so better to go via + * the platform reboot code that will log the + * machine check. + */ + recovered = 0; + } else { + die("Machine check", regs, SIGBUS); + recovered = 1; + } } + return recovered; } +void pnv_platform_error_reboot(struct pt_regs *regs, const char *msg) +{ + /* + * This is mostly taken from kernel/panic.c, but tries to do + * relatively minimal work. Don't use delay functions (TB may + * be broken), don't crash dump (need to set a firmware log), + * don't run notifiers. We do want to get some information to + * Linux console. + */ + console_verbose(); + bust_spinlocks(1); + pr_emerg("Hardware platform error: %s\n", msg); + if (regs) + show_regs(regs); + smp_send_stop(); + printk_safe_flush_on_panic(); + kmsg_dump(KMSG_DUMP_PANIC); + bust_spinlocks(0); + debug_locks_off(); + console_flush_on_panic(); + + /* + * Don't bother to shut things down because this will + * xstop the system. + */ + if (opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, msg) + == OPAL_UNSUPPORTED) { + pr_emerg("Reboot type %d not supported for %s\n", + OPAL_REBOOT_PLATFORM_ERROR, msg); + } + + /* + * We reached here. There can be three possibilities: + * 1. We are running on a firmware level that do not support + * opal_cec_reboot2() + * 2. We are running on a firmware level that do not support + * OPAL_REBOOT_PLATFORM_ERROR reboot type. + * 3. We are running on FSP based system that does not need + * opal to trigger checkstop explicitly for error analysis. + * The FSP PRD component would have already got notified + * about this error through other channels. + */ + + ppc_md.restart(NULL); +} + int opal_machine_check(struct pt_regs *regs) { struct machine_check_event evt; - int ret; if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) return 0; @@ -455,43 +523,7 @@ int opal_machine_check(struct pt_regs *regs) if (opal_recover_mce(regs, &evt)) return 1; - /* - * Unrecovered machine check, we are heading to panic path. - * - * We may have hit this MCE in very early stage of kernel - * initialization even before opal-prd has started running. If - * this is the case then this MCE error may go un-noticed or - * un-analyzed if we go down panic path. We need to inform - * BMC/OCC about this error so that they can collect relevant - * data for error analysis before rebooting. - * Use opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR) to do so. - * This function may not return on BMC based system. - */ - ret = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, - "Unrecoverable Machine Check exception"); - if (ret == OPAL_UNSUPPORTED) { - pr_emerg("Reboot type %d not supported\n", - OPAL_REBOOT_PLATFORM_ERROR); - } - - /* - * We reached here. There can be three possibilities: - * 1. We are running on a firmware level that do not support - * opal_cec_reboot2() - * 2. We are running on a firmware level that do not support - * OPAL_REBOOT_PLATFORM_ERROR reboot type. - * 3. We are running on FSP based system that does not need opal - * to trigger checkstop explicitly for error analysis. The FSP - * PRD component would have already got notified about this - * error through other channels. - * - * If hardware marked this as an unrecoverable MCE, we are - * going to panic anyway. Even if it didn't, it's not safe to - * continue at this point, so we should explicitly panic. - */ - - panic("PowerNV Unrecovered Machine Check"); - return 0; + pnv_platform_error_reboot(regs, "Unrecoverable Machine Check exception"); } /* Early hmi handler called in real mode. */ @@ -720,6 +752,15 @@ static void opal_pdev_init(const char *compatible) of_platform_device_create(np, NULL, NULL); } +static void __init opal_imc_init_dev(void) +{ + struct device_node *np; + + np = of_find_compatible_node(NULL, NULL, IMC_DTB_COMPAT); + if (np) + of_platform_device_create(np, NULL, NULL); +} + static int kopald(void *unused) { unsigned long timeout = msecs_to_jiffies(opal_heartbeat) + 1; @@ -793,6 +834,9 @@ static int __init opal_init(void) /* Setup a heatbeat thread if requested by OPAL */ opal_init_heartbeat(); + /* Detect In-Memory Collection counters and create devices*/ + opal_imc_init_dev(); + /* Create leds platform devices */ leds = of_find_node_by_path("/ibm,opal/leds"); if (leds) { @@ -836,6 +880,15 @@ static int __init opal_init(void) /* Initialise OPAL kmsg dumper for flushing console on panic */ opal_kmsg_init(); + /* Initialise OPAL powercap interface */ + opal_powercap_init(); + + /* Initialise OPAL Power-Shifting-Ratio interface */ + opal_psr_init(); + + /* Initialise OPAL sensor groups */ + opal_sensor_groups_init(); + return 0; } machine_subsys_initcall(powernv, opal_init); @@ -952,6 +1005,7 @@ int opal_error_code(int rc) case OPAL_UNSUPPORTED: return -EIO; case OPAL_HARDWARE: return -EIO; case OPAL_INTERNAL_ERROR: return -EIO; + case OPAL_TIMEOUT: return -ETIMEDOUT; default: pr_err("%s: unexpected OPAL error %d\n", __func__, rc); return -EIO; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index b900eb1d5e17..57f9e55f4352 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -444,8 +444,8 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) r = of_get_property(dn, "ibm,opal-m64-window", NULL); if (!r) { - pr_info(" No <ibm,opal-m64-window> on %s\n", - dn->full_name); + pr_info(" No <ibm,opal-m64-window> on %pOF\n", + dn); return; } @@ -1408,7 +1408,6 @@ m64_failed: static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, int num); -static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe) { @@ -2402,7 +2401,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, return 0; } -static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) +void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) { uint16_t window_id = (pe->pe_number << 1 ) + 1; int64_t rc; @@ -3797,8 +3796,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, if (!of_device_is_available(np)) return; - pr_info("Initializing %s PHB (%s)\n", - pnv_phb_names[ioda_type], of_node_full_name(np)); + pr_info("Initializing %s PHB (%pOF)\n", pnv_phb_names[ioda_type], np); prop64 = of_get_property(np, "ibm,opal-phbid", NULL); if (!prop64) { @@ -3813,8 +3811,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, /* Allocate PCI controller */ phb->hose = hose = pcibios_alloc_controller(np); if (!phb->hose) { - pr_err(" Can't allocate PCI controller for %s\n", - np->full_name); + pr_err(" Can't allocate PCI controller for %pOF\n", + np); memblock_free(__pa(phb), sizeof(struct pnv_phb)); return; } @@ -3825,7 +3823,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, hose->first_busno = be32_to_cpu(prop32[0]); hose->last_busno = be32_to_cpu(prop32[1]); } else { - pr_warn(" Broken <bus-range> on %s\n", np->full_name); + pr_warn(" Broken <bus-range> on %pOF\n", np); hose->first_busno = 0; hose->last_busno = 0xff; } @@ -4046,7 +4044,7 @@ void __init pnv_pci_init_ioda_hub(struct device_node *np) const __be64 *prop64; u64 hub_id; - pr_info("Probing IODA IO-Hub %s\n", np->full_name); + pr_info("Probing IODA IO-Hub %pOF\n", np); prop64 = of_get_property(np, "ibm,opal-hubid", NULL); if (!prop64) { diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 7905d179d036..5422f4a6317c 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -37,6 +37,8 @@ #include "powernv.h" #include "pci.h" +static DEFINE_MUTEX(p2p_mutex); + int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id) { struct device_node *parent = np; @@ -1017,6 +1019,79 @@ void pnv_pci_dma_bus_setup(struct pci_bus *bus) } } +int pnv_pci_set_p2p(struct pci_dev *initiator, struct pci_dev *target, u64 desc) +{ + struct pci_controller *hose; + struct pnv_phb *phb_init, *phb_target; + struct pnv_ioda_pe *pe_init; + int rc; + + if (!opal_check_token(OPAL_PCI_SET_P2P)) + return -ENXIO; + + hose = pci_bus_to_host(initiator->bus); + phb_init = hose->private_data; + + hose = pci_bus_to_host(target->bus); + phb_target = hose->private_data; + + pe_init = pnv_ioda_get_pe(initiator); + if (!pe_init) + return -ENODEV; + + /* + * Configuring the initiator's PHB requires to adjust its + * TVE#1 setting. Since the same device can be an initiator + * several times for different target devices, we need to keep + * a reference count to know when we can restore the default + * bypass setting on its TVE#1 when disabling. Opal is not + * tracking PE states, so we add a reference count on the PE + * in linux. + * + * For the target, the configuration is per PHB, so we keep a + * target reference count on the PHB. + */ + mutex_lock(&p2p_mutex); + + if (desc & OPAL_PCI_P2P_ENABLE) { + /* always go to opal to validate the configuration */ + rc = opal_pci_set_p2p(phb_init->opal_id, phb_target->opal_id, + desc, pe_init->pe_number); + + if (rc != OPAL_SUCCESS) { + rc = -EIO; + goto out; + } + + pe_init->p2p_initiator_count++; + phb_target->p2p_target_count++; + } else { + if (!pe_init->p2p_initiator_count || + !phb_target->p2p_target_count) { + rc = -EINVAL; + goto out; + } + + if (--pe_init->p2p_initiator_count == 0) + pnv_pci_ioda2_set_bypass(pe_init, true); + + if (--phb_target->p2p_target_count == 0) { + rc = opal_pci_set_p2p(phb_init->opal_id, + phb_target->opal_id, desc, + pe_init->pe_number); + if (rc != OPAL_SUCCESS) { + rc = -EIO; + goto out; + } + } + } + rc = 0; +out: + mutex_unlock(&p2p_mutex); + return rc; +} +EXPORT_SYMBOL_GPL(pnv_pci_set_p2p); + void pnv_pci_shutdown(void) { struct pci_controller *hose; diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index f16bc403ec03..a95273c524f6 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -78,6 +78,9 @@ struct pnv_ioda_pe { struct pnv_ioda_pe *master; struct list_head slaves; + /* PCI peer-to-peer*/ + int p2p_initiator_count; + /* Link in list of PE#s */ struct list_head list; }; @@ -189,6 +192,7 @@ struct pnv_phb { #ifdef CONFIG_CXL_BASE struct cxl_afu *cxl_afu; #endif + int p2p_target_count; }; extern struct pci_ops pnv_pci_ops; @@ -229,6 +233,7 @@ extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); extern bool pnv_pci_enable_device_hook(struct pci_dev *dev); +extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...); diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index 6dbc0a1da1f6..a159d48573d7 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -7,6 +7,8 @@ extern void pnv_smp_init(void); static inline void pnv_smp_init(void) { } #endif +extern void pnv_platform_error_reboot(struct pt_regs *regs, const char *msg) __noreturn; + struct pci_dev; #ifdef CONFIG_PCI diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c index 1a9d84371a4d..718f50ed22f1 100644 --- a/arch/powerpc/platforms/powernv/rng.c +++ b/arch/powerpc/platforms/powernv/rng.c @@ -16,11 +16,13 @@ #include <linux/slab.h> #include <linux/smp.h> #include <asm/archrandom.h> +#include <asm/cputable.h> #include <asm/io.h> #include <asm/prom.h> #include <asm/machdep.h> #include <asm/smp.h> +#define DARN_ERR 0xFFFFFFFFFFFFFFFFul struct powernv_rng { void __iomem *regs; @@ -67,6 +69,41 @@ int powernv_get_random_real_mode(unsigned long *v) return 1; } +int powernv_get_random_darn(unsigned long *v) +{ + unsigned long val; + + /* Using DARN with L=1 - 64-bit conditioned random number */ + asm volatile(PPC_DARN(%0, 1) : "=r"(val)); + + if (val == DARN_ERR) + return 0; + + *v = val; + + return 1; +} + +static int initialise_darn(void) +{ + unsigned long val; + int i; + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -ENODEV; + + for (i = 0; i < 10; i++) { + if (powernv_get_random_darn(&val)) { + ppc_md.get_random_seed = powernv_get_random_darn; + return 0; + } + } + + pr_warn("Unable to use DARN for get_random_seed()\n"); + + return -EIO; +} + int powernv_get_random_long(unsigned long *v) { struct powernv_rng *rng; @@ -88,7 +125,7 @@ static __init void rng_init_per_cpu(struct powernv_rng *rng, chip_id = of_get_ibm_chip_id(dn); if (chip_id == -1) - pr_warn("No ibm,chip-id found for %s.\n", dn->full_name); + pr_warn("No ibm,chip-id found for %pOF.\n", dn); for_each_possible_cpu(cpu) { if (per_cpu(powernv_rng, cpu) == NULL || @@ -141,8 +178,8 @@ static __init int rng_init(void) for_each_compatible_node(dn, NULL, "ibm,power-rng") { rc = rng_create(dn); if (rc) { - pr_err("Failed creating rng for %s (%d).\n", - dn->full_name, rc); + pr_err("Failed creating rng for %pOF (%d).\n", + dn, rc); continue; } @@ -150,6 +187,8 @@ static __init int rng_init(void) of_platform_device_create(dn, NULL, NULL); } + initialise_darn(); + return 0; } machine_subsys_initcall(powernv, rng_init); diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 40dae96f7e20..c17f81e433f7 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -57,7 +57,7 @@ static void pnv_smp_setup_cpu(int cpu) static int pnv_smp_kick_cpu(int nr) { - unsigned int pcpu = get_hard_smp_processor_id(nr); + unsigned int pcpu; unsigned long start_here = __pa(ppc_function_entry(generic_secondary_smp_init)); long rc; @@ -66,6 +66,7 @@ static int pnv_smp_kick_cpu(int nr) if (nr < 0 || nr >= nr_cpu_ids) return -EINVAL; + pcpu = get_hard_smp_processor_id(nr); /* * If we already started or OPAL is not supported, we just * kick the CPU via the PACA @@ -164,12 +165,6 @@ static void pnv_smp_cpu_kill_self(void) if (cpu_has_feature(CPU_FTR_ARCH_207S)) wmask = SRR1_WAKEMASK_P8; - /* We don't want to take decrementer interrupts while we are offline, - * so clear LPCR:PECE1. We keep PECE2 (and LPCR_PECE_HVEE on P9) - * enabled as to let IPIs in. - */ - mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1); - while (!generic_check_cpu_restart(cpu)) { /* * Clear IPI flag, since we don't handle IPIs while @@ -219,8 +214,6 @@ static void pnv_smp_cpu_kill_self(void) } - /* Re-enable decrementer interrupts */ - mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_PECE1); DBG("CPU%d coming online...\n", cpu); } diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c new file mode 100644 index 000000000000..5aae845b8cd9 --- /dev/null +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -0,0 +1,1134 @@ +/* + * Copyright 2016-17 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "vas: " fmt + +#include <linux/types.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/io.h> +#include <linux/log2.h> +#include <linux/rcupdate.h> +#include <linux/cred.h> + +#include "vas.h" +#include "copy-paste.h" + +/* + * Compute the paste address region for the window @window using the + * ->paste_base_addr and ->paste_win_id_shift we got from device tree. + */ +static void compute_paste_address(struct vas_window *window, u64 *addr, int *len) +{ + int winid; + u64 base, shift; + + base = window->vinst->paste_base_addr; + shift = window->vinst->paste_win_id_shift; + winid = window->winid; + + *addr = base + (winid << shift); + if (len) + *len = PAGE_SIZE; + + pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr); +} + +static inline void get_hvwc_mmio_bar(struct vas_window *window, + u64 *start, int *len) +{ + u64 pbaddr; + + pbaddr = window->vinst->hvwc_bar_start; + *start = pbaddr + window->winid * VAS_HVWC_SIZE; + *len = VAS_HVWC_SIZE; +} + +static inline void get_uwc_mmio_bar(struct vas_window *window, + u64 *start, int *len) +{ + u64 pbaddr; + + pbaddr = window->vinst->uwc_bar_start; + *start = pbaddr + window->winid * VAS_UWC_SIZE; + *len = VAS_UWC_SIZE; +} + +/* + * Map the paste bus address of the given send window into kernel address + * space. Unlike MMIO regions (map_mmio_region() below), paste region must + * be mapped cache-able and is only applicable to send windows. + */ +static void *map_paste_region(struct vas_window *txwin) +{ + int len; + void *map; + char *name; + u64 start; + + name = kasprintf(GFP_KERNEL, "window-v%d-w%d", txwin->vinst->vas_id, + txwin->winid); + if (!name) + goto free_name; + + txwin->paste_addr_name = name; + compute_paste_address(txwin, &start, &len); + + if (!request_mem_region(start, len, name)) { + pr_devel("%s(): request_mem_region(0x%llx, %d) failed\n", + __func__, start, len); + goto free_name; + } + + map = ioremap_cache(start, len); + if (!map) { + pr_devel("%s(): ioremap_cache(0x%llx, %d) failed\n", __func__, + start, len); + goto free_name; + } + + pr_devel("Mapped paste addr 0x%llx to kaddr 0x%p\n", start, map); + return map; + +free_name: + kfree(name); + return ERR_PTR(-ENOMEM); +} + +static void *map_mmio_region(char *name, u64 start, int len) +{ + void *map; + + if (!request_mem_region(start, len, name)) { + pr_devel("%s(): request_mem_region(0x%llx, %d) failed\n", + __func__, start, len); + return NULL; + } + + map = ioremap(start, len); + if (!map) { + pr_devel("%s(): ioremap(0x%llx, %d) failed\n", __func__, start, + len); + return NULL; + } + + return map; +} + +static void unmap_region(void *addr, u64 start, int len) +{ + iounmap(addr); + release_mem_region((phys_addr_t)start, len); +} + +/* + * Unmap the paste address region for a window. + */ +static void unmap_paste_region(struct vas_window *window) +{ + int len; + u64 busaddr_start; + + if (window->paste_kaddr) { + compute_paste_address(window, &busaddr_start, &len); + unmap_region(window->paste_kaddr, busaddr_start, len); + window->paste_kaddr = NULL; + kfree(window->paste_addr_name); + window->paste_addr_name = NULL; + } +} + +/* + * Unmap the MMIO regions for a window. + */ +static void unmap_winctx_mmio_bars(struct vas_window *window) +{ + int len; + u64 busaddr_start; + + if (window->hvwc_map) { + get_hvwc_mmio_bar(window, &busaddr_start, &len); + unmap_region(window->hvwc_map, busaddr_start, len); + window->hvwc_map = NULL; + } + + if (window->uwc_map) { + get_uwc_mmio_bar(window, &busaddr_start, &len); + unmap_region(window->uwc_map, busaddr_start, len); + window->uwc_map = NULL; + } +} + +/* + * Find the Hypervisor Window Context (HVWC) MMIO Base Address Region and the + * OS/User Window Context (UWC) MMIO Base Address Region for the given window. + * Map these bus addresses and save the mapped kernel addresses in @window. + */ +int map_winctx_mmio_bars(struct vas_window *window) +{ + int len; + u64 start; + + get_hvwc_mmio_bar(window, &start, &len); + window->hvwc_map = map_mmio_region("HVWCM_Window", start, len); + + get_uwc_mmio_bar(window, &start, &len); + window->uwc_map = map_mmio_region("UWCM_Window", start, len); + + if (!window->hvwc_map || !window->uwc_map) { + unmap_winctx_mmio_bars(window); + return -1; + } + + return 0; +} + +/* + * Reset all valid registers in the HV and OS/User Window Contexts for + * the window identified by @window. + * + * NOTE: We cannot really use a for loop to reset window context. Not all + * offsets in a window context are valid registers and the valid + * registers are not sequential. And, we can only write to offsets + * with valid registers. + */ +void reset_window_regs(struct vas_window *window) +{ + write_hvwc_reg(window, VREG(LPID), 0ULL); + write_hvwc_reg(window, VREG(PID), 0ULL); + write_hvwc_reg(window, VREG(XLATE_MSR), 0ULL); + write_hvwc_reg(window, VREG(XLATE_LPCR), 0ULL); + write_hvwc_reg(window, VREG(XLATE_CTL), 0ULL); + write_hvwc_reg(window, VREG(AMR), 0ULL); + write_hvwc_reg(window, VREG(SEIDR), 0ULL); + write_hvwc_reg(window, VREG(FAULT_TX_WIN), 0ULL); + write_hvwc_reg(window, VREG(OSU_INTR_SRC_RA), 0ULL); + write_hvwc_reg(window, VREG(HV_INTR_SRC_RA), 0ULL); + write_hvwc_reg(window, VREG(PSWID), 0ULL); + write_hvwc_reg(window, VREG(LFIFO_BAR), 0ULL); + write_hvwc_reg(window, VREG(LDATA_STAMP_CTL), 0ULL); + write_hvwc_reg(window, VREG(LDMA_CACHE_CTL), 0ULL); + write_hvwc_reg(window, VREG(LRFIFO_PUSH), 0ULL); + write_hvwc_reg(window, VREG(CURR_MSG_COUNT), 0ULL); + write_hvwc_reg(window, VREG(LNOTIFY_AFTER_COUNT), 0ULL); + write_hvwc_reg(window, VREG(LRX_WCRED), 0ULL); + write_hvwc_reg(window, VREG(LRX_WCRED_ADDER), 0ULL); + write_hvwc_reg(window, VREG(TX_WCRED), 0ULL); + write_hvwc_reg(window, VREG(TX_WCRED_ADDER), 0ULL); + write_hvwc_reg(window, VREG(LFIFO_SIZE), 0ULL); + write_hvwc_reg(window, VREG(WINCTL), 0ULL); + write_hvwc_reg(window, VREG(WIN_STATUS), 0ULL); + write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), 0ULL); + write_hvwc_reg(window, VREG(TX_RSVD_BUF_COUNT), 0ULL); + write_hvwc_reg(window, VREG(LRFIFO_WIN_PTR), 0ULL); + write_hvwc_reg(window, VREG(LNOTIFY_CTL), 0ULL); + write_hvwc_reg(window, VREG(LNOTIFY_PID), 0ULL); + write_hvwc_reg(window, VREG(LNOTIFY_LPID), 0ULL); + write_hvwc_reg(window, VREG(LNOTIFY_TID), 0ULL); + write_hvwc_reg(window, VREG(LNOTIFY_SCOPE), 0ULL); + write_hvwc_reg(window, VREG(NX_UTIL_ADDER), 0ULL); + + /* Skip read-only registers: NX_UTIL and NX_UTIL_SE */ + + /* + * The send and receive window credit adder registers are also + * accessible from HVWC and have been initialized above. We don't + * need to initialize from the OS/User Window Context, so skip + * following calls: + * + * write_uwc_reg(window, VREG(TX_WCRED_ADDER), 0ULL); + * write_uwc_reg(window, VREG(LRX_WCRED_ADDER), 0ULL); + */ +} + +/* + * Initialize window context registers related to Address Translation. + * These registers are common to send/receive windows although they + * differ for user/kernel windows. As we resolve the TODOs we may + * want to add fields to vas_winctx and move the initialization to + * init_vas_winctx_regs(). + */ +static void init_xlate_regs(struct vas_window *window, bool user_win) +{ + u64 lpcr, val; + + /* + * MSR_TA, MSR_US are false for both kernel and user. + * MSR_DR and MSR_PR are false for kernel. + */ + val = 0ULL; + val = SET_FIELD(VAS_XLATE_MSR_HV, val, 1); + val = SET_FIELD(VAS_XLATE_MSR_SF, val, 1); + if (user_win) { + val = SET_FIELD(VAS_XLATE_MSR_DR, val, 1); + val = SET_FIELD(VAS_XLATE_MSR_PR, val, 1); + } + write_hvwc_reg(window, VREG(XLATE_MSR), val); + + lpcr = mfspr(SPRN_LPCR); + val = 0ULL; + /* + * NOTE: From Section 5.7.8.1 Segment Lookaside Buffer of the + * Power ISA, v3.0B, Page size encoding is 0 = 4KB, 5 = 64KB. + * + * NOTE: From Section 1.3.1, Address Translation Context of the + * Nest MMU Workbook, LPCR_SC should be 0 for Power9. + */ + val = SET_FIELD(VAS_XLATE_LPCR_PAGE_SIZE, val, 5); + val = SET_FIELD(VAS_XLATE_LPCR_ISL, val, lpcr & LPCR_ISL); + val = SET_FIELD(VAS_XLATE_LPCR_TC, val, lpcr & LPCR_TC); + val = SET_FIELD(VAS_XLATE_LPCR_SC, val, 0); + write_hvwc_reg(window, VREG(XLATE_LPCR), val); + + /* + * Section 1.3.1 (Address translation Context) of NMMU workbook. + * 0b00 Hashed Page Table mode + * 0b01 Reserved + * 0b10 Radix on HPT + * 0b11 Radix on Radix + */ + val = 0ULL; + val = SET_FIELD(VAS_XLATE_MODE, val, radix_enabled() ? 3 : 2); + write_hvwc_reg(window, VREG(XLATE_CTL), val); + + /* + * TODO: Can we mfspr(AMR) even for user windows? + */ + val = 0ULL; + val = SET_FIELD(VAS_AMR, val, mfspr(SPRN_AMR)); + write_hvwc_reg(window, VREG(AMR), val); + + val = 0ULL; + val = SET_FIELD(VAS_SEIDR, val, 0); + write_hvwc_reg(window, VREG(SEIDR), val); +} + +/* + * Initialize Reserved Send Buffer Count for the send window. It involves + * writing to the register, reading it back to confirm that the hardware + * has enough buffers to reserve. See section 1.3.1.2.1 of VAS workbook. + * + * Since we can only make a best-effort attempt to fulfill the request, + * we don't return any errors if we cannot. + * + * TODO: Reserved (aka dedicated) send buffers are not supported yet. + */ +static void init_rsvd_tx_buf_count(struct vas_window *txwin, + struct vas_winctx *winctx) +{ + write_hvwc_reg(txwin, VREG(TX_RSVD_BUF_COUNT), 0ULL); +} + +/* + * init_winctx_regs() + * Initialize window context registers for a receive window. + * Except for caching control and marking window open, the registers + * are initialized in the order listed in Section 3.1.4 (Window Context + * Cache Register Details) of the VAS workbook although they don't need + * to be. + * + * Design note: For NX receive windows, NX allocates the FIFO buffer in OPAL + * (so that it can get a large contiguous area) and passes that buffer + * to kernel via device tree. We now write that buffer address to the + * FIFO BAR. Would it make sense to do this all in OPAL? i.e have OPAL + * write the per-chip RX FIFO addresses to the windows during boot-up + * as a one-time task? That could work for NX but what about other + * receivers? Let the receivers tell us the rx-fifo buffers for now. + */ +int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx) +{ + u64 val; + int fifo_size; + + reset_window_regs(window); + + val = 0ULL; + val = SET_FIELD(VAS_LPID, val, winctx->lpid); + write_hvwc_reg(window, VREG(LPID), val); + + val = 0ULL; + val = SET_FIELD(VAS_PID_ID, val, winctx->pidr); + write_hvwc_reg(window, VREG(PID), val); + + init_xlate_regs(window, winctx->user_win); + + val = 0ULL; + val = SET_FIELD(VAS_FAULT_TX_WIN, val, 0); + write_hvwc_reg(window, VREG(FAULT_TX_WIN), val); + + /* In PowerNV, interrupts go to HV. */ + write_hvwc_reg(window, VREG(OSU_INTR_SRC_RA), 0ULL); + + val = 0ULL; + val = SET_FIELD(VAS_HV_INTR_SRC_RA, val, winctx->irq_port); + write_hvwc_reg(window, VREG(HV_INTR_SRC_RA), val); + + val = 0ULL; + val = SET_FIELD(VAS_PSWID_EA_HANDLE, val, winctx->pswid); + write_hvwc_reg(window, VREG(PSWID), val); + + write_hvwc_reg(window, VREG(SPARE1), 0ULL); + write_hvwc_reg(window, VREG(SPARE2), 0ULL); + write_hvwc_reg(window, VREG(SPARE3), 0ULL); + + /* + * NOTE: VAS expects the FIFO address to be copied into the LFIFO_BAR + * register as is - do NOT shift the address into VAS_LFIFO_BAR + * bit fields! Ok to set the page migration select fields - + * VAS ignores the lower 10+ bits in the address anyway, because + * the minimum FIFO size is 1K? + * + * See also: Design note in function header. + */ + val = __pa(winctx->rx_fifo); + val = SET_FIELD(VAS_PAGE_MIGRATION_SELECT, val, 0); + write_hvwc_reg(window, VREG(LFIFO_BAR), val); + + val = 0ULL; + val = SET_FIELD(VAS_LDATA_STAMP, val, winctx->data_stamp); + write_hvwc_reg(window, VREG(LDATA_STAMP_CTL), val); + + val = 0ULL; + val = SET_FIELD(VAS_LDMA_TYPE, val, winctx->dma_type); + val = SET_FIELD(VAS_LDMA_FIFO_DISABLE, val, winctx->fifo_disable); + write_hvwc_reg(window, VREG(LDMA_CACHE_CTL), val); + + write_hvwc_reg(window, VREG(LRFIFO_PUSH), 0ULL); + write_hvwc_reg(window, VREG(CURR_MSG_COUNT), 0ULL); + write_hvwc_reg(window, VREG(LNOTIFY_AFTER_COUNT), 0ULL); + + val = 0ULL; + val = SET_FIELD(VAS_LRX_WCRED, val, winctx->wcreds_max); + write_hvwc_reg(window, VREG(LRX_WCRED), val); + + val = 0ULL; + val = SET_FIELD(VAS_TX_WCRED, val, winctx->wcreds_max); + write_hvwc_reg(window, VREG(TX_WCRED), val); + + write_hvwc_reg(window, VREG(LRX_WCRED_ADDER), 0ULL); + write_hvwc_reg(window, VREG(TX_WCRED_ADDER), 0ULL); + + fifo_size = winctx->rx_fifo_size / 1024; + + val = 0ULL; + val = SET_FIELD(VAS_LFIFO_SIZE, val, ilog2(fifo_size)); + write_hvwc_reg(window, VREG(LFIFO_SIZE), val); + + /* Update window control and caching control registers last so + * we mark the window open only after fully initializing it and + * pushing context to cache. + */ + + write_hvwc_reg(window, VREG(WIN_STATUS), 0ULL); + + init_rsvd_tx_buf_count(window, winctx); + + /* for a send window, point to the matching receive window */ + val = 0ULL; + val = SET_FIELD(VAS_LRX_WIN_ID, val, winctx->rx_win_id); + write_hvwc_reg(window, VREG(LRFIFO_WIN_PTR), val); + + write_hvwc_reg(window, VREG(SPARE4), 0ULL); + + val = 0ULL; + val = SET_FIELD(VAS_NOTIFY_DISABLE, val, winctx->notify_disable); + val = SET_FIELD(VAS_INTR_DISABLE, val, winctx->intr_disable); + val = SET_FIELD(VAS_NOTIFY_EARLY, val, winctx->notify_early); + val = SET_FIELD(VAS_NOTIFY_OSU_INTR, val, winctx->notify_os_intr_reg); + write_hvwc_reg(window, VREG(LNOTIFY_CTL), val); + + val = 0ULL; + val = SET_FIELD(VAS_LNOTIFY_PID, val, winctx->lnotify_pid); + write_hvwc_reg(window, VREG(LNOTIFY_PID), val); + + val = 0ULL; + val = SET_FIELD(VAS_LNOTIFY_LPID, val, winctx->lnotify_lpid); + write_hvwc_reg(window, VREG(LNOTIFY_LPID), val); + + val = 0ULL; + val = SET_FIELD(VAS_LNOTIFY_TID, val, winctx->lnotify_tid); + write_hvwc_reg(window, VREG(LNOTIFY_TID), val); + + val = 0ULL; + val = SET_FIELD(VAS_LNOTIFY_MIN_SCOPE, val, winctx->min_scope); + val = SET_FIELD(VAS_LNOTIFY_MAX_SCOPE, val, winctx->max_scope); + write_hvwc_reg(window, VREG(LNOTIFY_SCOPE), val); + + /* Skip read-only registers NX_UTIL and NX_UTIL_SE */ + + write_hvwc_reg(window, VREG(SPARE5), 0ULL); + write_hvwc_reg(window, VREG(NX_UTIL_ADDER), 0ULL); + write_hvwc_reg(window, VREG(SPARE6), 0ULL); + + /* Finally, push window context to memory and... */ + val = 0ULL; + val = SET_FIELD(VAS_PUSH_TO_MEM, val, 1); + write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), val); + + /* ... mark the window open for business */ + val = 0ULL; + val = SET_FIELD(VAS_WINCTL_REJ_NO_CREDIT, val, winctx->rej_no_credit); + val = SET_FIELD(VAS_WINCTL_PIN, val, winctx->pin_win); + val = SET_FIELD(VAS_WINCTL_TX_WCRED_MODE, val, winctx->tx_wcred_mode); + val = SET_FIELD(VAS_WINCTL_RX_WCRED_MODE, val, winctx->rx_wcred_mode); + val = SET_FIELD(VAS_WINCTL_TX_WORD_MODE, val, winctx->tx_word_mode); + val = SET_FIELD(VAS_WINCTL_RX_WORD_MODE, val, winctx->rx_word_mode); + val = SET_FIELD(VAS_WINCTL_FAULT_WIN, val, winctx->fault_win); + val = SET_FIELD(VAS_WINCTL_NX_WIN, val, winctx->nx_win); + val = SET_FIELD(VAS_WINCTL_OPEN, val, 1); + write_hvwc_reg(window, VREG(WINCTL), val); + + return 0; +} + +static DEFINE_SPINLOCK(vas_ida_lock); + +static void vas_release_window_id(struct ida *ida, int winid) +{ + spin_lock(&vas_ida_lock); + ida_remove(ida, winid); + spin_unlock(&vas_ida_lock); +} + +static int vas_assign_window_id(struct ida *ida) +{ + int rc, winid; + + do { + rc = ida_pre_get(ida, GFP_KERNEL); + if (!rc) + return -EAGAIN; + + spin_lock(&vas_ida_lock); + rc = ida_get_new(ida, &winid); + spin_unlock(&vas_ida_lock); + } while (rc == -EAGAIN); + + if (rc) + return rc; + + if (winid > VAS_WINDOWS_PER_CHIP) { + pr_err("Too many (%d) open windows\n", winid); + vas_release_window_id(ida, winid); + return -EAGAIN; + } + + return winid; +} + +static void vas_window_free(struct vas_window *window) +{ + int winid = window->winid; + struct vas_instance *vinst = window->vinst; + + unmap_winctx_mmio_bars(window); + kfree(window); + + vas_release_window_id(&vinst->ida, winid); +} + +static struct vas_window *vas_window_alloc(struct vas_instance *vinst) +{ + int winid; + struct vas_window *window; + + winid = vas_assign_window_id(&vinst->ida); + if (winid < 0) + return ERR_PTR(winid); + + window = kzalloc(sizeof(*window), GFP_KERNEL); + if (!window) + goto out_free; + + window->vinst = vinst; + window->winid = winid; + + if (map_winctx_mmio_bars(window)) + goto out_free; + + return window; + +out_free: + kfree(window); + vas_release_window_id(&vinst->ida, winid); + return ERR_PTR(-ENOMEM); +} + +static void put_rx_win(struct vas_window *rxwin) +{ + /* Better not be a send window! */ + WARN_ON_ONCE(rxwin->tx_win); + + atomic_dec(&rxwin->num_txwins); +} + +/* + * Get the VAS receive window associated with NX engine identified + * by @cop and if applicable, @pswid. + * + * See also function header of set_vinst_win(). + */ +static struct vas_window *get_vinst_rxwin(struct vas_instance *vinst, + enum vas_cop_type cop, u32 pswid) +{ + struct vas_window *rxwin; + + mutex_lock(&vinst->mutex); + + if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) + rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL); + else + rxwin = ERR_PTR(-EINVAL); + + if (!IS_ERR(rxwin)) + atomic_inc(&rxwin->num_txwins); + + mutex_unlock(&vinst->mutex); + + return rxwin; +} + +/* + * We have two tables of windows in a VAS instance. The first one, + * ->windows[], contains all the windows in the instance and allows + * looking up a window by its id. It is used to look up send windows + * during fault handling and receive windows when pairing user space + * send/receive windows. + * + * The second table, ->rxwin[], contains receive windows that are + * associated with NX engines. This table has VAS_COP_TYPE_MAX + * entries and is used to look up a receive window by its + * coprocessor type. + * + * Here, we save @window in the ->windows[] table. If it is a receive + * window, we also save the window in the ->rxwin[] table. + */ +static void set_vinst_win(struct vas_instance *vinst, + struct vas_window *window) +{ + int id = window->winid; + + mutex_lock(&vinst->mutex); + + /* + * There should only be one receive window for a coprocessor type + * unless its a user (FTW) window. + */ + if (!window->user_win && !window->tx_win) { + WARN_ON_ONCE(vinst->rxwin[window->cop]); + vinst->rxwin[window->cop] = window; + } + + WARN_ON_ONCE(vinst->windows[id] != NULL); + vinst->windows[id] = window; + + mutex_unlock(&vinst->mutex); +} + +/* + * Clear this window from the table(s) of windows for this VAS instance. + * See also function header of set_vinst_win(). + */ +static void clear_vinst_win(struct vas_window *window) +{ + int id = window->winid; + struct vas_instance *vinst = window->vinst; + + mutex_lock(&vinst->mutex); + + if (!window->user_win && !window->tx_win) { + WARN_ON_ONCE(!vinst->rxwin[window->cop]); + vinst->rxwin[window->cop] = NULL; + } + + WARN_ON_ONCE(vinst->windows[id] != window); + vinst->windows[id] = NULL; + + mutex_unlock(&vinst->mutex); +} + +static void init_winctx_for_rxwin(struct vas_window *rxwin, + struct vas_rx_win_attr *rxattr, + struct vas_winctx *winctx) +{ + /* + * We first zero (memset()) all fields and only set non-zero fields. + * Following fields are 0/false but maybe deserve a comment: + * + * ->notify_os_intr_reg In powerNV, send intrs to HV + * ->notify_disable False for NX windows + * ->intr_disable False for Fault Windows + * ->xtra_write False for NX windows + * ->notify_early NA for NX windows + * ->rsvd_txbuf_count NA for Rx windows + * ->lpid, ->pid, ->tid NA for Rx windows + */ + + memset(winctx, 0, sizeof(struct vas_winctx)); + + winctx->rx_fifo = rxattr->rx_fifo; + winctx->rx_fifo_size = rxattr->rx_fifo_size; + winctx->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT; + winctx->pin_win = rxattr->pin_win; + + winctx->nx_win = rxattr->nx_win; + winctx->fault_win = rxattr->fault_win; + winctx->rx_word_mode = rxattr->rx_win_ord_mode; + winctx->tx_word_mode = rxattr->tx_win_ord_mode; + winctx->rx_wcred_mode = rxattr->rx_wcred_mode; + winctx->tx_wcred_mode = rxattr->tx_wcred_mode; + + if (winctx->nx_win) { + winctx->data_stamp = true; + winctx->intr_disable = true; + winctx->pin_win = true; + + WARN_ON_ONCE(winctx->fault_win); + WARN_ON_ONCE(!winctx->rx_word_mode); + WARN_ON_ONCE(!winctx->tx_word_mode); + WARN_ON_ONCE(winctx->notify_after_count); + } else if (winctx->fault_win) { + winctx->notify_disable = true; + } else if (winctx->user_win) { + /* + * Section 1.8.1 Low Latency Core-Core Wake up of + * the VAS workbook: + * + * - disable credit checks ([tr]x_wcred_mode = false) + * - disable FIFO writes + * - enable ASB_Notify, disable interrupt + */ + winctx->fifo_disable = true; + winctx->intr_disable = true; + winctx->rx_fifo = NULL; + } + + winctx->lnotify_lpid = rxattr->lnotify_lpid; + winctx->lnotify_pid = rxattr->lnotify_pid; + winctx->lnotify_tid = rxattr->lnotify_tid; + winctx->pswid = rxattr->pswid; + winctx->dma_type = VAS_DMA_TYPE_INJECT; + winctx->tc_mode = rxattr->tc_mode; + + winctx->min_scope = VAS_SCOPE_LOCAL; + winctx->max_scope = VAS_SCOPE_VECTORED_GROUP; +} + +static bool rx_win_args_valid(enum vas_cop_type cop, + struct vas_rx_win_attr *attr) +{ + dump_rx_win_attr(attr); + + if (cop >= VAS_COP_TYPE_MAX) + return false; + + if (cop != VAS_COP_TYPE_FTW && + attr->rx_fifo_size < VAS_RX_FIFO_SIZE_MIN) + return false; + + if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX) + return false; + + if (attr->nx_win) { + /* cannot be fault or user window if it is nx */ + if (attr->fault_win || attr->user_win) + return false; + /* + * Section 3.1.4.32: NX Windows must not disable notification, + * and must not enable interrupts or early notification. + */ + if (attr->notify_disable || !attr->intr_disable || + attr->notify_early) + return false; + } else if (attr->fault_win) { + /* cannot be both fault and user window */ + if (attr->user_win) + return false; + + /* + * Section 3.1.4.32: Fault windows must disable notification + * but not interrupts. + */ + if (!attr->notify_disable || attr->intr_disable) + return false; + + } else if (attr->user_win) { + /* + * User receive windows are only for fast-thread-wakeup + * (FTW). They don't need a FIFO and must disable interrupts + */ + if (attr->rx_fifo || attr->rx_fifo_size || !attr->intr_disable) + return false; + } else { + /* Rx window must be one of NX or Fault or User window. */ + return false; + } + + return true; +} + +void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, enum vas_cop_type cop) +{ + memset(rxattr, 0, sizeof(*rxattr)); + + if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) { + rxattr->pin_win = true; + rxattr->nx_win = true; + rxattr->fault_win = false; + rxattr->intr_disable = true; + rxattr->rx_wcred_mode = true; + rxattr->tx_wcred_mode = true; + rxattr->rx_win_ord_mode = true; + rxattr->tx_win_ord_mode = true; + } else if (cop == VAS_COP_TYPE_FAULT) { + rxattr->pin_win = true; + rxattr->fault_win = true; + rxattr->notify_disable = true; + rxattr->rx_wcred_mode = true; + rxattr->tx_wcred_mode = true; + rxattr->rx_win_ord_mode = true; + rxattr->tx_win_ord_mode = true; + } else if (cop == VAS_COP_TYPE_FTW) { + rxattr->user_win = true; + rxattr->intr_disable = true; + + /* + * As noted in the VAS Workbook we disable credit checks. + * If we enable credit checks in the future, we must also + * implement a mechanism to return the user credits or new + * paste operations will fail. + */ + } +} +EXPORT_SYMBOL_GPL(vas_init_rx_win_attr); + +struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop, + struct vas_rx_win_attr *rxattr) +{ + struct vas_window *rxwin; + struct vas_winctx winctx; + struct vas_instance *vinst; + + if (!rx_win_args_valid(cop, rxattr)) + return ERR_PTR(-EINVAL); + + vinst = find_vas_instance(vasid); + if (!vinst) { + pr_devel("vasid %d not found!\n", vasid); + return ERR_PTR(-EINVAL); + } + pr_devel("Found instance %d\n", vasid); + + rxwin = vas_window_alloc(vinst); + if (IS_ERR(rxwin)) { + pr_devel("Unable to allocate memory for Rx window\n"); + return rxwin; + } + + rxwin->tx_win = false; + rxwin->nx_win = rxattr->nx_win; + rxwin->user_win = rxattr->user_win; + rxwin->cop = cop; + if (rxattr->user_win) + rxwin->pid = task_pid_vnr(current); + + init_winctx_for_rxwin(rxwin, rxattr, &winctx); + init_winctx_regs(rxwin, &winctx); + + set_vinst_win(vinst, rxwin); + + return rxwin; +} +EXPORT_SYMBOL_GPL(vas_rx_win_open); + +void vas_init_tx_win_attr(struct vas_tx_win_attr *txattr, enum vas_cop_type cop) +{ + memset(txattr, 0, sizeof(*txattr)); + + if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) { + txattr->rej_no_credit = false; + txattr->rx_wcred_mode = true; + txattr->tx_wcred_mode = true; + txattr->rx_win_ord_mode = true; + txattr->tx_win_ord_mode = true; + } else if (cop == VAS_COP_TYPE_FTW) { + txattr->user_win = true; + } +} +EXPORT_SYMBOL_GPL(vas_init_tx_win_attr); + +static void init_winctx_for_txwin(struct vas_window *txwin, + struct vas_tx_win_attr *txattr, + struct vas_winctx *winctx) +{ + /* + * We first zero all fields and only set non-zero ones. Following + * are some fields set to 0/false for the stated reason: + * + * ->notify_os_intr_reg In powernv, send intrs to HV + * ->rsvd_txbuf_count Not supported yet. + * ->notify_disable False for NX windows + * ->xtra_write False for NX windows + * ->notify_early NA for NX windows + * ->lnotify_lpid NA for Tx windows + * ->lnotify_pid NA for Tx windows + * ->lnotify_tid NA for Tx windows + * ->tx_win_cred_mode Ignore for now for NX windows + * ->rx_win_cred_mode Ignore for now for NX windows + */ + memset(winctx, 0, sizeof(struct vas_winctx)); + + winctx->wcreds_max = txattr->wcreds_max ?: VAS_WCREDS_DEFAULT; + + winctx->user_win = txattr->user_win; + winctx->nx_win = txwin->rxwin->nx_win; + winctx->pin_win = txattr->pin_win; + + winctx->rx_wcred_mode = txattr->rx_wcred_mode; + winctx->tx_wcred_mode = txattr->tx_wcred_mode; + winctx->rx_word_mode = txattr->rx_win_ord_mode; + winctx->tx_word_mode = txattr->tx_win_ord_mode; + + if (winctx->nx_win) { + winctx->data_stamp = true; + winctx->intr_disable = true; + } + + winctx->lpid = txattr->lpid; + winctx->pidr = txattr->pidr; + winctx->rx_win_id = txwin->rxwin->winid; + + winctx->dma_type = VAS_DMA_TYPE_INJECT; + winctx->tc_mode = txattr->tc_mode; + winctx->min_scope = VAS_SCOPE_LOCAL; + winctx->max_scope = VAS_SCOPE_VECTORED_GROUP; + + winctx->pswid = 0; +} + +static bool tx_win_args_valid(enum vas_cop_type cop, + struct vas_tx_win_attr *attr) +{ + if (attr->tc_mode != VAS_THRESH_DISABLED) + return false; + + if (cop > VAS_COP_TYPE_MAX) + return false; + + if (attr->user_win && + (cop != VAS_COP_TYPE_FTW || attr->rsvd_txbuf_count)) + return false; + + return true; +} + +struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, + struct vas_tx_win_attr *attr) +{ + int rc; + struct vas_window *txwin; + struct vas_window *rxwin; + struct vas_winctx winctx; + struct vas_instance *vinst; + + if (!tx_win_args_valid(cop, attr)) + return ERR_PTR(-EINVAL); + + vinst = find_vas_instance(vasid); + if (!vinst) { + pr_devel("vasid %d not found!\n", vasid); + return ERR_PTR(-EINVAL); + } + + rxwin = get_vinst_rxwin(vinst, cop, attr->pswid); + if (IS_ERR(rxwin)) { + pr_devel("No RxWin for vasid %d, cop %d\n", vasid, cop); + return rxwin; + } + + txwin = vas_window_alloc(vinst); + if (IS_ERR(txwin)) { + rc = PTR_ERR(txwin); + goto put_rxwin; + } + + txwin->tx_win = 1; + txwin->rxwin = rxwin; + txwin->nx_win = txwin->rxwin->nx_win; + txwin->pid = attr->pid; + txwin->user_win = attr->user_win; + + init_winctx_for_txwin(txwin, attr, &winctx); + + init_winctx_regs(txwin, &winctx); + + /* + * If its a kernel send window, map the window address into the + * kernel's address space. For user windows, user must issue an + * mmap() to map the window into their address space. + * + * NOTE: If kernel ever resubmits a user CRB after handling a page + * fault, we will need to map this into kernel as well. + */ + if (!txwin->user_win) { + txwin->paste_kaddr = map_paste_region(txwin); + if (IS_ERR(txwin->paste_kaddr)) { + rc = PTR_ERR(txwin->paste_kaddr); + goto free_window; + } + } + + set_vinst_win(vinst, txwin); + + return txwin; + +free_window: + vas_window_free(txwin); + +put_rxwin: + put_rx_win(rxwin); + return ERR_PTR(rc); + +} +EXPORT_SYMBOL_GPL(vas_tx_win_open); + +int vas_copy_crb(void *crb, int offset) +{ + return vas_copy(crb, offset); +} +EXPORT_SYMBOL_GPL(vas_copy_crb); + +#define RMA_LSMP_REPORT_ENABLE PPC_BIT(53) +int vas_paste_crb(struct vas_window *txwin, int offset, bool re) +{ + int rc; + void *addr; + uint64_t val; + + /* + * Only NX windows are supported for now and hardware assumes + * report-enable flag is set for NX windows. Ensure software + * complies too. + */ + WARN_ON_ONCE(txwin->nx_win && !re); + + addr = txwin->paste_kaddr; + if (re) { + /* + * Set the REPORT_ENABLE bit (equivalent to writing + * to 1K offset of the paste address) + */ + val = SET_FIELD(RMA_LSMP_REPORT_ENABLE, 0ULL, 1); + addr += val; + } + + /* + * Map the raw CR value from vas_paste() to an error code (there + * is just pass or fail for now though). + */ + rc = vas_paste(addr, offset); + if (rc == 2) + rc = 0; + else + rc = -EINVAL; + + print_fifo_msg_count(txwin); + + return rc; +} +EXPORT_SYMBOL_GPL(vas_paste_crb); + +static void poll_window_busy_state(struct vas_window *window) +{ + int busy; + u64 val; + +retry: + /* + * Poll Window Busy flag + */ + val = read_hvwc_reg(window, VREG(WIN_STATUS)); + busy = GET_FIELD(VAS_WIN_BUSY, val); + if (busy) { + val = 0; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ); + goto retry; + } +} + +static void poll_window_castout(struct vas_window *window) +{ + int cached; + u64 val; + + /* Cast window context out of the cache */ +retry: + val = read_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL)); + cached = GET_FIELD(VAS_WIN_CACHE_STATUS, val); + if (cached) { + val = 0ULL; + val = SET_FIELD(VAS_CASTOUT_REQ, val, 1); + val = SET_FIELD(VAS_PUSH_TO_MEM, val, 0); + write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), val); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ); + goto retry; + } +} + +/* + * Close a window. + * + * See Section 1.12.1 of VAS workbook v1.05 for details on closing window: + * - Disable new paste operations (unmap paste address) + * - Poll for the "Window Busy" bit to be cleared + * - Clear the Open/Enable bit for the Window. + * - Poll for return of window Credits (implies FIFO empty for Rx win?) + * - Unpin and cast window context out of cache + * + * Besides the hardware, kernel has some bookkeeping of course. + */ +int vas_win_close(struct vas_window *window) +{ + u64 val; + + if (!window) + return 0; + + if (!window->tx_win && atomic_read(&window->num_txwins) != 0) { + pr_devel("Attempting to close an active Rx window!\n"); + WARN_ON_ONCE(1); + return -EBUSY; + } + + unmap_paste_region(window); + + clear_vinst_win(window); + + poll_window_busy_state(window); + + /* Unpin window from cache and close it */ + val = read_hvwc_reg(window, VREG(WINCTL)); + val = SET_FIELD(VAS_WINCTL_PIN, val, 0); + val = SET_FIELD(VAS_WINCTL_OPEN, val, 0); + write_hvwc_reg(window, VREG(WINCTL), val); + + poll_window_castout(window); + + /* if send window, drop reference to matching receive window */ + if (window->tx_win) + put_rx_win(window->rxwin); + + vas_window_free(window); + + return 0; +} +EXPORT_SYMBOL_GPL(vas_win_close); diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c new file mode 100644 index 000000000000..565a4878fefa --- /dev/null +++ b/arch/powerpc/platforms/powernv/vas.c @@ -0,0 +1,151 @@ +/* + * Copyright 2016-17 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "vas: " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/platform_device.h> +#include <linux/of_platform.h> +#include <linux/of_address.h> +#include <linux/of.h> + +#include "vas.h" + +static DEFINE_MUTEX(vas_mutex); +static LIST_HEAD(vas_instances); + +static int init_vas_instance(struct platform_device *pdev) +{ + int rc, vasid; + struct resource *res; + struct vas_instance *vinst; + struct device_node *dn = pdev->dev.of_node; + + rc = of_property_read_u32(dn, "ibm,vas-id", &vasid); + if (rc) { + pr_err("No ibm,vas-id property for %s?\n", pdev->name); + return -ENODEV; + } + + if (pdev->num_resources != 4) { + pr_err("Unexpected DT configuration for [%s, %d]\n", + pdev->name, vasid); + return -ENODEV; + } + + vinst = kzalloc(sizeof(*vinst), GFP_KERNEL); + if (!vinst) + return -ENOMEM; + + INIT_LIST_HEAD(&vinst->node); + ida_init(&vinst->ida); + mutex_init(&vinst->mutex); + vinst->vas_id = vasid; + vinst->pdev = pdev; + + res = &pdev->resource[0]; + vinst->hvwc_bar_start = res->start; + + res = &pdev->resource[1]; + vinst->uwc_bar_start = res->start; + + res = &pdev->resource[2]; + vinst->paste_base_addr = res->start; + + res = &pdev->resource[3]; + if (res->end > 62) { + pr_err("Bad 'paste_win_id_shift' in DT, %llx\n", res->end); + goto free_vinst; + } + + vinst->paste_win_id_shift = 63 - res->end; + + pr_devel("Initialized instance [%s, %d], paste_base 0x%llx, " + "paste_win_id_shift 0x%llx\n", pdev->name, vasid, + vinst->paste_base_addr, vinst->paste_win_id_shift); + + mutex_lock(&vas_mutex); + list_add(&vinst->node, &vas_instances); + mutex_unlock(&vas_mutex); + + dev_set_drvdata(&pdev->dev, vinst); + + return 0; + +free_vinst: + kfree(vinst); + return -ENODEV; + +} + +/* + * Although this is read/used multiple times, it is written to only + * during initialization. + */ +struct vas_instance *find_vas_instance(int vasid) +{ + struct list_head *ent; + struct vas_instance *vinst; + + mutex_lock(&vas_mutex); + list_for_each(ent, &vas_instances) { + vinst = list_entry(ent, struct vas_instance, node); + if (vinst->vas_id == vasid) { + mutex_unlock(&vas_mutex); + return vinst; + } + } + mutex_unlock(&vas_mutex); + + pr_devel("Instance %d not found\n", vasid); + return NULL; +} + +static int vas_probe(struct platform_device *pdev) +{ + return init_vas_instance(pdev); +} + +static const struct of_device_id powernv_vas_match[] = { + { .compatible = "ibm,vas",}, + {}, +}; + +static struct platform_driver vas_driver = { + .driver = { + .name = "vas", + .of_match_table = powernv_vas_match, + }, + .probe = vas_probe, +}; + +static int __init vas_init(void) +{ + int found = 0; + struct device_node *dn; + + platform_driver_register(&vas_driver); + + for_each_compatible_node(dn, NULL, "ibm,vas") { + of_platform_device_create(dn, NULL, NULL); + found++; + } + + if (!found) + return -ENODEV; + + pr_devel("Found %d instances\n", found); + + return 0; +} +device_initcall(vas_init); diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h new file mode 100644 index 000000000000..38dee5d50f31 --- /dev/null +++ b/arch/powerpc/platforms/powernv/vas.h @@ -0,0 +1,467 @@ +/* + * Copyright 2016-17 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _VAS_H +#define _VAS_H +#include <linux/atomic.h> +#include <linux/idr.h> +#include <asm/vas.h> +#include <linux/io.h> + +/* + * Overview of Virtual Accelerator Switchboard (VAS). + * + * VAS is a hardware "switchboard" that allows senders and receivers to + * exchange messages with _minimal_ kernel involvment. The receivers are + * typically NX coprocessor engines that perform compression or encryption + * in hardware, but receivers can also be other software threads. + * + * Senders are user/kernel threads that submit compression/encryption or + * other requests to the receivers. Senders must format their messages as + * Coprocessor Request Blocks (CRB)s and submit them using the "copy" and + * "paste" instructions which were introduced in Power9. + * + * A Power node can have (upto?) 8 Power chips. There is one instance of + * VAS in each Power9 chip. Each instance of VAS has 64K windows or ports, + * Senders and receivers must each connect to a separate window before they + * can exchange messages through the switchboard. + * + * Each window is described by two types of window contexts: + * + * Hypervisor Window Context (HVWC) of size VAS_HVWC_SIZE bytes + * + * OS/User Window Context (UWC) of size VAS_UWC_SIZE bytes. + * + * A window context can be viewed as a set of 64-bit registers. The settings + * in these registers configure/control/determine the behavior of the VAS + * hardware when messages are sent/received through the window. The registers + * in the HVWC are configured by the kernel while the registers in the UWC can + * be configured by the kernel or by the user space application that is using + * the window. + * + * The HVWCs for all windows on a specific instance of VAS are in a contiguous + * range of hardware addresses or Base address region (BAR) referred to as the + * HVWC BAR for the instance. Similarly the UWCs for all windows on an instance + * are referred to as the UWC BAR for the instance. + * + * The two BARs for each instance are defined Power9 MMIO Ranges spreadsheet + * and available to the kernel in the VAS node's "reg" property in the device + * tree: + * + * /proc/device-tree/vasm@.../reg + * + * (see vas_probe() for details on the reg property). + * + * The kernel maps the HVWC and UWC BAR regions into the kernel address + * space (hvwc_map and uwc_map). The kernel can then access the window + * contexts of a specific window using: + * + * hvwc = hvwc_map + winid * VAS_HVWC_SIZE. + * uwc = uwc_map + winid * VAS_UWC_SIZE. + * + * where winid is the window index (0..64K). + * + * As mentioned, a window context is used to "configure" a window. Besides + * this configuration address, each _send_ window also has a unique hardware + * "paste" address that is used to submit requests/CRBs (see vas_paste_crb()). + * + * The hardware paste address for a window is computed using the "paste + * base address" and "paste win id shift" reg properties in the VAS device + * tree node using: + * + * paste_addr = paste_base + ((winid << paste_win_id_shift)) + * + * (again, see vas_probe() for ->paste_base_addr and ->paste_win_id_shift). + * + * The kernel maps this hardware address into the sender's address space + * after which they can use the 'paste' instruction (new in Power9) to + * send a message (submit a request aka CRB) to the coprocessor. + * + * NOTE: In the initial version, senders can only in-kernel drivers/threads. + * Support for user space threads will be added in follow-on patches. + * + * TODO: Do we need to map the UWC into user address space so they can return + * credits? Its NA for NX but may be needed for other receive windows. + * + */ + +#define VAS_WINDOWS_PER_CHIP (64 << 10) + +/* + * Hypervisor and OS/USer Window Context sizes + */ +#define VAS_HVWC_SIZE 512 +#define VAS_UWC_SIZE PAGE_SIZE + +/* + * Initial per-process credits. + * Max send window credits: 4K-1 (12-bits in VAS_TX_WCRED) + * Max receive window credits: 64K-1 (16 bits in VAS_LRX_WCRED) + * + * TODO: Needs tuning for per-process credits + */ +#define VAS_WCREDS_MIN 16 +#define VAS_WCREDS_MAX ((64 << 10) - 1) +#define VAS_WCREDS_DEFAULT (1 << 10) + +/* + * VAS Window Context Register Offsets and bitmasks. + * See Section 3.1.4 of VAS Work book + */ +#define VAS_LPID_OFFSET 0x010 +#define VAS_LPID PPC_BITMASK(0, 11) + +#define VAS_PID_OFFSET 0x018 +#define VAS_PID_ID PPC_BITMASK(0, 19) + +#define VAS_XLATE_MSR_OFFSET 0x020 +#define VAS_XLATE_MSR_DR PPC_BIT(0) +#define VAS_XLATE_MSR_TA PPC_BIT(1) +#define VAS_XLATE_MSR_PR PPC_BIT(2) +#define VAS_XLATE_MSR_US PPC_BIT(3) +#define VAS_XLATE_MSR_HV PPC_BIT(4) +#define VAS_XLATE_MSR_SF PPC_BIT(5) + +#define VAS_XLATE_LPCR_OFFSET 0x028 +#define VAS_XLATE_LPCR_PAGE_SIZE PPC_BITMASK(0, 2) +#define VAS_XLATE_LPCR_ISL PPC_BIT(3) +#define VAS_XLATE_LPCR_TC PPC_BIT(4) +#define VAS_XLATE_LPCR_SC PPC_BIT(5) + +#define VAS_XLATE_CTL_OFFSET 0x030 +#define VAS_XLATE_MODE PPC_BITMASK(0, 1) + +#define VAS_AMR_OFFSET 0x040 +#define VAS_AMR PPC_BITMASK(0, 63) + +#define VAS_SEIDR_OFFSET 0x048 +#define VAS_SEIDR PPC_BITMASK(0, 63) + +#define VAS_FAULT_TX_WIN_OFFSET 0x050 +#define VAS_FAULT_TX_WIN PPC_BITMASK(48, 63) + +#define VAS_OSU_INTR_SRC_RA_OFFSET 0x060 +#define VAS_OSU_INTR_SRC_RA PPC_BITMASK(8, 63) + +#define VAS_HV_INTR_SRC_RA_OFFSET 0x070 +#define VAS_HV_INTR_SRC_RA PPC_BITMASK(8, 63) + +#define VAS_PSWID_OFFSET 0x078 +#define VAS_PSWID_EA_HANDLE PPC_BITMASK(0, 31) + +#define VAS_SPARE1_OFFSET 0x080 +#define VAS_SPARE2_OFFSET 0x088 +#define VAS_SPARE3_OFFSET 0x090 +#define VAS_SPARE4_OFFSET 0x130 +#define VAS_SPARE5_OFFSET 0x160 +#define VAS_SPARE6_OFFSET 0x188 + +#define VAS_LFIFO_BAR_OFFSET 0x0A0 +#define VAS_LFIFO_BAR PPC_BITMASK(8, 53) +#define VAS_PAGE_MIGRATION_SELECT PPC_BITMASK(54, 56) + +#define VAS_LDATA_STAMP_CTL_OFFSET 0x0A8 +#define VAS_LDATA_STAMP PPC_BITMASK(0, 1) +#define VAS_XTRA_WRITE PPC_BIT(2) + +#define VAS_LDMA_CACHE_CTL_OFFSET 0x0B0 +#define VAS_LDMA_TYPE PPC_BITMASK(0, 1) +#define VAS_LDMA_FIFO_DISABLE PPC_BIT(2) + +#define VAS_LRFIFO_PUSH_OFFSET 0x0B8 +#define VAS_LRFIFO_PUSH PPC_BITMASK(0, 15) + +#define VAS_CURR_MSG_COUNT_OFFSET 0x0C0 +#define VAS_CURR_MSG_COUNT PPC_BITMASK(0, 7) + +#define VAS_LNOTIFY_AFTER_COUNT_OFFSET 0x0C8 +#define VAS_LNOTIFY_AFTER_COUNT PPC_BITMASK(0, 7) + +#define VAS_LRX_WCRED_OFFSET 0x0E0 +#define VAS_LRX_WCRED PPC_BITMASK(0, 15) + +#define VAS_LRX_WCRED_ADDER_OFFSET 0x190 +#define VAS_LRX_WCRED_ADDER PPC_BITMASK(0, 15) + +#define VAS_TX_WCRED_OFFSET 0x0F0 +#define VAS_TX_WCRED PPC_BITMASK(4, 15) + +#define VAS_TX_WCRED_ADDER_OFFSET 0x1A0 +#define VAS_TX_WCRED_ADDER PPC_BITMASK(4, 15) + +#define VAS_LFIFO_SIZE_OFFSET 0x100 +#define VAS_LFIFO_SIZE PPC_BITMASK(0, 3) + +#define VAS_WINCTL_OFFSET 0x108 +#define VAS_WINCTL_OPEN PPC_BIT(0) +#define VAS_WINCTL_REJ_NO_CREDIT PPC_BIT(1) +#define VAS_WINCTL_PIN PPC_BIT(2) +#define VAS_WINCTL_TX_WCRED_MODE PPC_BIT(3) +#define VAS_WINCTL_RX_WCRED_MODE PPC_BIT(4) +#define VAS_WINCTL_TX_WORD_MODE PPC_BIT(5) +#define VAS_WINCTL_RX_WORD_MODE PPC_BIT(6) +#define VAS_WINCTL_RSVD_TXBUF PPC_BIT(7) +#define VAS_WINCTL_THRESH_CTL PPC_BITMASK(8, 9) +#define VAS_WINCTL_FAULT_WIN PPC_BIT(10) +#define VAS_WINCTL_NX_WIN PPC_BIT(11) + +#define VAS_WIN_STATUS_OFFSET 0x110 +#define VAS_WIN_BUSY PPC_BIT(1) + +#define VAS_WIN_CTX_CACHING_CTL_OFFSET 0x118 +#define VAS_CASTOUT_REQ PPC_BIT(0) +#define VAS_PUSH_TO_MEM PPC_BIT(1) +#define VAS_WIN_CACHE_STATUS PPC_BIT(4) + +#define VAS_TX_RSVD_BUF_COUNT_OFFSET 0x120 +#define VAS_RXVD_BUF_COUNT PPC_BITMASK(58, 63) + +#define VAS_LRFIFO_WIN_PTR_OFFSET 0x128 +#define VAS_LRX_WIN_ID PPC_BITMASK(0, 15) + +/* + * Local Notification Control Register controls what happens in _response_ + * to a paste command and hence applies only to receive windows. + */ +#define VAS_LNOTIFY_CTL_OFFSET 0x138 +#define VAS_NOTIFY_DISABLE PPC_BIT(0) +#define VAS_INTR_DISABLE PPC_BIT(1) +#define VAS_NOTIFY_EARLY PPC_BIT(2) +#define VAS_NOTIFY_OSU_INTR PPC_BIT(3) + +#define VAS_LNOTIFY_PID_OFFSET 0x140 +#define VAS_LNOTIFY_PID PPC_BITMASK(0, 19) + +#define VAS_LNOTIFY_LPID_OFFSET 0x148 +#define VAS_LNOTIFY_LPID PPC_BITMASK(0, 11) + +#define VAS_LNOTIFY_TID_OFFSET 0x150 +#define VAS_LNOTIFY_TID PPC_BITMASK(0, 15) + +#define VAS_LNOTIFY_SCOPE_OFFSET 0x158 +#define VAS_LNOTIFY_MIN_SCOPE PPC_BITMASK(0, 1) +#define VAS_LNOTIFY_MAX_SCOPE PPC_BITMASK(2, 3) + +#define VAS_NX_UTIL_OFFSET 0x1B0 +#define VAS_NX_UTIL PPC_BITMASK(0, 63) + +/* SE: Side effects */ +#define VAS_NX_UTIL_SE_OFFSET 0x1B8 +#define VAS_NX_UTIL_SE PPC_BITMASK(0, 63) + +#define VAS_NX_UTIL_ADDER_OFFSET 0x180 +#define VAS_NX_UTIL_ADDER PPC_BITMASK(32, 63) + +/* + * Local Notify Scope Control Register. (Receive windows only). + */ +enum vas_notify_scope { + VAS_SCOPE_LOCAL, + VAS_SCOPE_GROUP, + VAS_SCOPE_VECTORED_GROUP, + VAS_SCOPE_UNUSED, +}; + +/* + * Local DMA Cache Control Register (Receive windows only). + */ +enum vas_dma_type { + VAS_DMA_TYPE_INJECT, + VAS_DMA_TYPE_WRITE, +}; + +/* + * Local Notify Scope Control Register. (Receive windows only). + * Not applicable to NX receive windows. + */ +enum vas_notify_after_count { + VAS_NOTIFY_AFTER_256 = 0, + VAS_NOTIFY_NONE, + VAS_NOTIFY_AFTER_2 +}; + +/* + * One per instance of VAS. Each instance will have a separate set of + * receive windows, one per coprocessor type. + * + * See also function header of set_vinst_win() for details on ->windows[] + * and ->rxwin[] tables. + */ +struct vas_instance { + int vas_id; + struct ida ida; + struct list_head node; + struct platform_device *pdev; + + u64 hvwc_bar_start; + u64 uwc_bar_start; + u64 paste_base_addr; + u64 paste_win_id_shift; + + struct mutex mutex; + struct vas_window *rxwin[VAS_COP_TYPE_MAX]; + struct vas_window *windows[VAS_WINDOWS_PER_CHIP]; +}; + +/* + * In-kernel state a VAS window. One per window. + */ +struct vas_window { + /* Fields common to send and receive windows */ + struct vas_instance *vinst; + int winid; + bool tx_win; /* True if send window */ + bool nx_win; /* True if NX window */ + bool user_win; /* True if user space window */ + void *hvwc_map; /* HV window context */ + void *uwc_map; /* OS/User window context */ + pid_t pid; /* Linux process id of owner */ + + /* Fields applicable only to send windows */ + void *paste_kaddr; + char *paste_addr_name; + struct vas_window *rxwin; + + /* Feilds applicable only to receive windows */ + enum vas_cop_type cop; + atomic_t num_txwins; +}; + +/* + * Container for the hardware state of a window. One per-window. + * + * A VAS Window context is a 512-byte area in the hardware that contains + * a set of 64-bit registers. Individual bit-fields in these registers + * determine the configuration/operation of the hardware. struct vas_winctx + * is a container for the register fields in the window context. + */ +struct vas_winctx { + void *rx_fifo; + int rx_fifo_size; + int wcreds_max; + int rsvd_txbuf_count; + + bool user_win; + bool nx_win; + bool fault_win; + bool rsvd_txbuf_enable; + bool pin_win; + bool rej_no_credit; + bool tx_wcred_mode; + bool rx_wcred_mode; + bool tx_word_mode; + bool rx_word_mode; + bool data_stamp; + bool xtra_write; + bool notify_disable; + bool intr_disable; + bool fifo_disable; + bool notify_early; + bool notify_os_intr_reg; + + int lpid; + int pidr; /* value from SPRN_PID, not linux pid */ + int lnotify_lpid; + int lnotify_pid; + int lnotify_tid; + u32 pswid; + int rx_win_id; + int fault_win_id; + int tc_mode; + + u64 irq_port; + + enum vas_dma_type dma_type; + enum vas_notify_scope min_scope; + enum vas_notify_scope max_scope; + enum vas_notify_after_count notify_after_count; +}; + +extern struct vas_instance *find_vas_instance(int vasid); + +/* + * VREG(x): + * Expand a register's short name (eg: LPID) into two parameters: + * - the register's short name in string form ("LPID"), and + * - the name of the macro (eg: VAS_LPID_OFFSET), defining the + * register's offset in the window context + */ +#define VREG_SFX(n, s) __stringify(n), VAS_##n##s +#define VREG(r) VREG_SFX(r, _OFFSET) + +#ifdef vas_debug +static inline void dump_rx_win_attr(struct vas_rx_win_attr *attr) +{ + pr_err("fault %d, notify %d, intr %d early %d\n", + attr->fault_win, attr->notify_disable, + attr->intr_disable, attr->notify_early); + + pr_err("rx_fifo_size %d, max value %d\n", + attr->rx_fifo_size, VAS_RX_FIFO_SIZE_MAX); +} + +static inline void vas_log_write(struct vas_window *win, char *name, + void *regptr, u64 val) +{ + if (val) + pr_err("%swin #%d: %s reg %p, val 0x%016llx\n", + win->tx_win ? "Tx" : "Rx", win->winid, name, + regptr, val); +} + +#else /* vas_debug */ + +#define vas_log_write(win, name, reg, val) +#define dump_rx_win_attr(attr) + +#endif /* vas_debug */ + +static inline void write_uwc_reg(struct vas_window *win, char *name, + s32 reg, u64 val) +{ + void *regptr; + + regptr = win->uwc_map + reg; + vas_log_write(win, name, regptr, val); + + out_be64(regptr, val); +} + +static inline void write_hvwc_reg(struct vas_window *win, char *name, + s32 reg, u64 val) +{ + void *regptr; + + regptr = win->hvwc_map + reg; + vas_log_write(win, name, regptr, val); + + out_be64(regptr, val); +} + +static inline u64 read_hvwc_reg(struct vas_window *win, + char *name __maybe_unused, s32 reg) +{ + return in_be64(win->hvwc_map+reg); +} + +#ifdef vas_debug + +static void print_fifo_msg_count(struct vas_window *txwin) +{ + uint64_t read_hvwc_reg(struct vas_window *w, char *n, uint64_t o); + pr_devel("Winid %d, Msg count %llu\n", txwin->winid, + (uint64_t)read_hvwc_reg(txwin, VREG(LRFIFO_PUSH))); +} +#else /* vas_debug */ + +#define print_fifo_msg_count(window) + +#endif /* vas_debug */ + +#endif /* _VAS_H */ diff --git a/arch/powerpc/platforms/ps3/repository.c b/arch/powerpc/platforms/ps3/repository.c index 814a7eaa7769..50dbaf24b1ee 100644 --- a/arch/powerpc/platforms/ps3/repository.c +++ b/arch/powerpc/platforms/ps3/repository.c @@ -170,14 +170,8 @@ int ps3_repository_read_bus_str(unsigned int bus_index, const char *bus_str, int ps3_repository_read_bus_id(unsigned int bus_index, u64 *bus_id) { - int result; - - result = read_node(PS3_LPAR_ID_PME, - make_first_field("bus", bus_index), - make_field("id", 0), - 0, 0, - bus_id, NULL); - return result; + return read_node(PS3_LPAR_ID_PME, make_first_field("bus", bus_index), + make_field("id", 0), 0, 0, bus_id, NULL); } int ps3_repository_read_bus_type(unsigned int bus_index, @@ -224,15 +218,9 @@ int ps3_repository_read_dev_str(unsigned int bus_index, int ps3_repository_read_dev_id(unsigned int bus_index, unsigned int dev_index, u64 *dev_id) { - int result; - - result = read_node(PS3_LPAR_ID_PME, - make_first_field("bus", bus_index), - make_field("dev", dev_index), - make_field("id", 0), - 0, - dev_id, NULL); - return result; + return read_node(PS3_LPAR_ID_PME, make_first_field("bus", bus_index), + make_field("dev", dev_index), make_field("id", 0), 0, + dev_id, NULL); } int ps3_repository_read_dev_type(unsigned int bus_index, diff --git a/arch/powerpc/platforms/ps3/setup.c b/arch/powerpc/platforms/ps3/setup.c index 6244bc849469..9dabea6e1443 100644 --- a/arch/powerpc/platforms/ps3/setup.c +++ b/arch/powerpc/platforms/ps3/setup.c @@ -104,20 +104,6 @@ static void __noreturn ps3_halt(void) ps3_sys_manager_halt(); /* never returns */ } -static void ps3_panic(char *str) -{ - DBG("%s:%d %s\n", __func__, __LINE__, str); - - smp_send_stop(); - printk("\n"); - printk(" System does not reboot automatically.\n"); - printk(" Please press POWER button.\n"); - printk("\n"); - - while(1) - lv1_pause(1); -} - #if defined(CONFIG_FB_PS3) || defined(CONFIG_FB_PS3_MODULE) || \ defined(CONFIG_PS3_FLASH) || defined(CONFIG_PS3_FLASH_MODULE) static void __init prealloc(struct ps3_prealloc *p) @@ -269,7 +255,6 @@ define_machine(ps3) { .probe = ps3_probe, .setup_arch = ps3_setup_arch, .init_IRQ = ps3_init_IRQ, - .panic = ps3_panic, .get_boot_time = ps3_get_boot_time, .set_dabr = ps3_set_dabr, .calibrate_decr = ps3_calibrate_decr, diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 3a6dfd14f64b..71dd69d9ec64 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -7,6 +7,7 @@ config PPC_PSERIES select PCI select PCI_MSI select PPC_XICS + select PPC_XIVE_SPAPR select PPC_ICP_NATIVE select PPC_ICP_HV select PPC_ICS_RTAS diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 39187696ee74..783f36364690 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -254,18 +254,15 @@ cc_error: return first_dn; } -int dlpar_attach_node(struct device_node *dn) +int dlpar_attach_node(struct device_node *dn, struct device_node *parent) { int rc; - dn->parent = pseries_of_derive_parent(dn->full_name); - if (IS_ERR(dn->parent)) - return PTR_ERR(dn->parent); + dn->parent = parent; rc = of_attach_node(dn); if (rc) { - printk(KERN_ERR "Failed to add device node %s\n", - dn->full_name); + printk(KERN_ERR "Failed to add device node %pOF\n", dn); return rc; } diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 1eef46d9cf30..6b812ad990e4 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -247,14 +247,13 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) /* Initialize the fake PE */ memset(&pe, 0, sizeof(struct eeh_pe)); - pe.phb = edev->phb; + pe.phb = pdn->phb; pe.config_addr = (pdn->busno << 16) | (pdn->devfn << 8); /* Enable EEH on the device */ ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE); if (!ret) { /* Retrieve PE address */ - edev->config_addr = (pdn->busno << 16) | (pdn->devfn << 8); edev->pe_config_addr = eeh_ops->get_pe_addr(&pe); pe.addr = edev->pe_config_addr; @@ -279,7 +278,6 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) /* This device doesn't support EEH, but it may have an * EEH parent, in which case we mark it as supported. */ - edev->config_addr = pdn_to_eeh_dev(pdn->parent)->config_addr; edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr; eeh_add_to_parent_pe(edev); } diff --git a/arch/powerpc/platforms/pseries/event_sources.c b/arch/powerpc/platforms/pseries/event_sources.c index 32187dc76730..6eeb0d4bab61 100644 --- a/arch/powerpc/platforms/pseries/event_sources.c +++ b/arch/powerpc/platforms/pseries/event_sources.c @@ -36,8 +36,8 @@ void request_event_sources_irqs(struct device_node *np, virqs[count] = irq_create_of_mapping(&oirq); if (!virqs[count]) { pr_err("event-sources: Unable to allocate " - "interrupt number for %s\n", - np->full_name); + "interrupt number for %pOF\n", + np); WARN_ON(1); } else { count++; @@ -48,7 +48,7 @@ void request_event_sources_irqs(struct device_node *np, for (i = 0; i < count; i++) { if (request_irq(virqs[i], handler, 0, name, NULL)) { pr_err("event-sources: Unable to request interrupt " - "%d for %s\n", virqs[i], np->full_name); + "%d for %pOF\n", virqs[i], np); WARN_ON(1); return; } diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 6afd1efd3633..fc0d8f97c03a 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -34,6 +34,7 @@ #include <asm/machdep.h> #include <asm/vdso_datapage.h> #include <asm/xics.h> +#include <asm/xive.h> #include <asm/plpar_wrappers.h> #include "pseries.h" @@ -109,7 +110,10 @@ static void pseries_mach_cpu_die(void) local_irq_disable(); idle_task_exit(); - xics_teardown_cpu(); + if (xive_enabled()) + xive_teardown_cpu(); + else + xics_teardown_cpu(); if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { set_cpu_current_state(cpu, CPU_STATE_INACTIVE); @@ -174,7 +178,10 @@ static int pseries_cpu_disable(void) boot_cpuid = cpumask_any(cpu_online_mask); /* FIXME: abstract this to not be platform specific later on */ - xics_migrate_irqs_away(); + if (xive_enabled()) + xive_smp_disable_cpu(); + else + xics_migrate_irqs_away(); return 0; } @@ -264,8 +271,8 @@ static int pseries_add_processor(struct device_node *np) /* If we get here, it most likely means that NR_CPUS is * less than the partition's max processors setting. */ - printk(KERN_ERR "Cannot add cpu %s; this system configuration" - " supports %d logical cpus.\n", np->full_name, + printk(KERN_ERR "Cannot add cpu %pOF; this system configuration" + " supports %d logical cpus.\n", np, num_possible_cpus()); goto out_unlock; } @@ -463,7 +470,7 @@ static ssize_t dlpar_cpu_add(u32 drc_index) return -EINVAL; } - rc = dlpar_attach_node(dn); + rc = dlpar_attach_node(dn, parent); if (rc) { saved_rc = rc; pr_warn("Failed to attach node %s, rc: %d, drc index: %x\n", diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index ca9b2f4aaa22..1d48ab424bd9 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -336,7 +336,38 @@ static struct memory_block *lmb_to_memblock(struct of_drconf_cell *lmb) return mem_block; } +static int dlpar_change_lmb_state(struct of_drconf_cell *lmb, bool online) +{ + struct memory_block *mem_block; + int rc; + + mem_block = lmb_to_memblock(lmb); + if (!mem_block) + return -EINVAL; + + if (online && mem_block->dev.offline) + rc = device_online(&mem_block->dev); + else if (!online && !mem_block->dev.offline) + rc = device_offline(&mem_block->dev); + else + rc = 0; + + put_device(&mem_block->dev); + + return rc; +} + +static int dlpar_online_lmb(struct of_drconf_cell *lmb) +{ + return dlpar_change_lmb_state(lmb, true); +} + #ifdef CONFIG_MEMORY_HOTREMOVE +static int dlpar_offline_lmb(struct of_drconf_cell *lmb) +{ + return dlpar_change_lmb_state(lmb, false); +} + static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size) { unsigned long block_sz, start_pfn; @@ -431,19 +462,13 @@ static int dlpar_add_lmb(struct of_drconf_cell *); static int dlpar_remove_lmb(struct of_drconf_cell *lmb) { - struct memory_block *mem_block; unsigned long block_sz; int nid, rc; if (!lmb_is_removable(lmb)) return -EINVAL; - mem_block = lmb_to_memblock(lmb); - if (!mem_block) - return -EINVAL; - - rc = device_offline(&mem_block->dev); - put_device(&mem_block->dev); + rc = dlpar_offline_lmb(lmb); if (rc) return rc; @@ -737,20 +762,6 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index, } #endif /* CONFIG_MEMORY_HOTREMOVE */ -static int dlpar_online_lmb(struct of_drconf_cell *lmb) -{ - struct memory_block *mem_block; - int rc; - - mem_block = lmb_to_memblock(lmb); - if (!mem_block) - return -EINVAL; - - rc = device_online(&mem_block->dev); - put_device(&mem_block->dev); - return rc; -} - static int dlpar_add_lmb(struct of_drconf_cell *lmb) { unsigned long block_sz; @@ -817,6 +828,9 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add, struct property *prop) return -EINVAL; for (i = 0; i < num_lmbs && lmbs_to_add != lmbs_added; i++) { + if (lmbs[i].flags & DRCONF_MEM_ASSIGNED) + continue; + rc = dlpar_acquire_drc(lmbs[i].drc_index); if (rc) continue; @@ -859,6 +873,7 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add, struct property *prop) lmbs[i].base_addr, lmbs[i].drc_index); lmbs[i].reserved = 0; } + rc = 0; } return rc; diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S index 74b5b8e239c8..c511a1743a44 100644 --- a/arch/powerpc/platforms/pseries/hvCall.S +++ b/arch/powerpc/platforms/pseries/hvCall.S @@ -23,7 +23,7 @@ .globl hcall_tracepoint_refcount hcall_tracepoint_refcount: - .llong 0 + .8byte 0 .section ".text" #endif diff --git a/arch/powerpc/platforms/pseries/ibmebus.c b/arch/powerpc/platforms/pseries/ibmebus.c index 52146b1356d2..408a86044133 100644 --- a/arch/powerpc/platforms/pseries/ibmebus.c +++ b/arch/powerpc/platforms/pseries/ibmebus.c @@ -150,8 +150,7 @@ static const struct dma_map_ops ibmebus_dma_ops = { static int ibmebus_match_path(struct device *dev, void *data) { struct device_node *dn = to_platform_device(dev)->dev.of_node; - return (dn->full_name && - (strcasecmp((char *)data, dn->full_name) == 0)); + return (of_find_node_by_path(data) == dn); } static int ibmebus_match_node(struct device *dev, void *data) @@ -395,7 +394,7 @@ static ssize_t devspec_show(struct device *dev, struct platform_device *ofdev; ofdev = to_platform_device(dev); - return sprintf(buf, "%s\n", ofdev->dev.of_node->full_name); + return sprintf(buf, "%pOF\n", ofdev->dev.of_node); } static DEVICE_ATTR_RO(devspec); diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 8374adee27e3..7c181467d0ad 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -511,8 +511,8 @@ static void iommu_table_setparms(struct pci_controller *phb, basep = of_get_property(node, "linux,tce-base", NULL); sizep = of_get_property(node, "linux,tce-size", NULL); if (basep == NULL || sizep == NULL) { - printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %s has " - "missing tce entries !\n", dn->full_name); + printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %pOF has " + "missing tce entries !\n", dn); return; } @@ -587,7 +587,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) dn = pci_bus_to_OF_node(bus); - pr_debug("pci_dma_bus_setup_pSeries: setting up bus %s\n", dn->full_name); + pr_debug("pci_dma_bus_setup_pSeries: setting up bus %pOF\n", dn); if (bus->self) { /* This is not a root bus, any setup will be done for the @@ -701,8 +701,8 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) dn = pci_bus_to_OF_node(bus); - pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %s\n", - dn->full_name); + pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n", + dn); /* Find nearest ibm,dma-window, walking up the device tree */ for (pdn = dn; pdn != NULL; pdn = pdn->parent) { @@ -718,8 +718,8 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) ppci = PCI_DN(pdn); - pr_debug(" parent is %s, iommu_table: 0x%p\n", - pdn->full_name, ppci->table_group); + pr_debug(" parent is %pOF, iommu_table: 0x%p\n", + pdn, ppci->table_group); if (!ppci->table_group) { ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node); @@ -817,28 +817,28 @@ static void remove_ddw(struct device_node *np, bool remove_prop) ret = tce_clearrange_multi_pSeriesLP(0, 1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp); if (ret) - pr_warning("%s failed to clear tces in window.\n", - np->full_name); + pr_warning("%pOF failed to clear tces in window.\n", + np); else - pr_debug("%s successfully cleared tces in window.\n", - np->full_name); + pr_debug("%pOF successfully cleared tces in window.\n", + np); ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn); if (ret) - pr_warning("%s: failed to remove direct window: rtas returned " + pr_warning("%pOF: failed to remove direct window: rtas returned " "%d to ibm,remove-pe-dma-window(%x) %llx\n", - np->full_name, ret, ddw_avail[2], liobn); + np, ret, ddw_avail[2], liobn); else - pr_debug("%s: successfully removed direct window: rtas returned " + pr_debug("%pOF: successfully removed direct window: rtas returned " "%d to ibm,remove-pe-dma-window(%x) %llx\n", - np->full_name, ret, ddw_avail[2], liobn); + np, ret, ddw_avail[2], liobn); delprop: if (remove_prop) ret = of_remove_property(np, win64); if (ret) - pr_warning("%s: failed to remove direct window property: %d\n", - np->full_name, ret); + pr_warning("%pOF: failed to remove direct window property: %d\n", + np, ret); } static u64 find_existing_ddw(struct device_node *pdn) @@ -1004,7 +1004,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) * list. */ list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) { - if (!strcmp(fpdn->pdn->full_name, pdn->full_name)) + if (fpdn->pdn == pdn) goto out_unlock; } @@ -1087,8 +1087,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) ddwprop->tce_shift = cpu_to_be32(page_shift); ddwprop->window_shift = cpu_to_be32(len); - dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %s\n", - create.liobn, dn->full_name); + dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n", + create.liobn, dn); window = kzalloc(sizeof(*window), GFP_KERNEL); if (!window) @@ -1097,15 +1097,15 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, win64->value, tce_setrange_multi_pSeriesLP_walk); if (ret) { - dev_info(&dev->dev, "failed to map direct window for %s: %d\n", - dn->full_name, ret); + dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n", + dn, ret); goto out_free_window; } ret = of_add_property(pdn, win64); if (ret) { - dev_err(&dev->dev, "unable to add dma window property for %s: %d", - pdn->full_name, ret); + dev_err(&dev->dev, "unable to add dma window property for %pOF: %d", + pdn, ret); goto out_free_window; } @@ -1158,7 +1158,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) * already allocated. */ dn = pci_device_to_OF_node(dev); - pr_debug(" node is %s\n", dn->full_name); + pr_debug(" node is %pOF\n", dn); for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group; pdn = pdn->parent) { @@ -1169,11 +1169,11 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) if (!pdn || !PCI_DN(pdn)) { printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: " - "no DMA window found for pci dev=%s dn=%s\n", - pci_name(dev), of_node_full_name(dn)); + "no DMA window found for pci dev=%s dn=%pOF\n", + pci_name(dev), dn); return; } - pr_debug(" parent is %s\n", pdn->full_name); + pr_debug(" parent is %pOF\n", pdn); pci = PCI_DN(pdn); if (!pci->table_group) { @@ -1213,7 +1213,7 @@ static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) /* only attempt to use a new window if 64-bit DMA is requested */ if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) { dn = pci_device_to_OF_node(pdev); - dev_dbg(dev, "node is %s\n", dn->full_name); + dev_dbg(dev, "node is %pOF\n", dn); /* * the device tree might contain the dma-window properties diff --git a/arch/powerpc/platforms/pseries/kexec.c b/arch/powerpc/platforms/pseries/kexec.c index 6681ac97fb18..eeb13429d685 100644 --- a/arch/powerpc/platforms/pseries/kexec.c +++ b/arch/powerpc/platforms/pseries/kexec.c @@ -15,6 +15,7 @@ #include <asm/firmware.h> #include <asm/kexec.h> #include <asm/xics.h> +#include <asm/xive.h> #include <asm/smp.h> #include <asm/plpar_wrappers.h> @@ -51,5 +52,8 @@ void pseries_kexec_cpu_down(int crash_shutdown, int secondary) } } - xics_kexec_teardown_cpu(secondary); + if (xive_enabled()) + xive_kexec_teardown_cpu(secondary); + else + xics_kexec_teardown_cpu(secondary); } diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index 2da4851eff99..210ce632d63e 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -229,7 +229,7 @@ static int add_dt_node(__be32 parent_phandle, __be32 drc_index) if (!dn) return -ENOENT; - rc = dlpar_attach_node(dn); + rc = dlpar_attach_node(dn, parent_dn); if (rc) dlpar_free_cc_nodes(dn); diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 326ef0dd6038..b7496948129e 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -132,19 +132,14 @@ static void rtas_teardown_msi_irqs(struct pci_dev *pdev) static int check_req(struct pci_dev *pdev, int nvec, char *prop_name) { struct device_node *dn; - struct pci_dn *pdn; const __be32 *p; u32 req_msi; - pdn = pci_get_pdn(pdev); - if (!pdn) - return -ENODEV; - - dn = pdn->node; + dn = pci_device_to_OF_node(pdev); p = of_get_property(dn, prop_name, NULL); if (!p) { - pr_debug("rtas_msi: No %s on %s\n", prop_name, dn->full_name); + pr_debug("rtas_msi: No %s on %pOF\n", prop_name, dn); return -ENOENT; } @@ -182,8 +177,8 @@ static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) while (dn) { p = of_get_property(dn, "ibm,pe-total-#msi", NULL); if (p) { - pr_debug("rtas_msi: found prop on dn %s\n", - dn->full_name); + pr_debug("rtas_msi: found prop on dn %pOF\n", + dn); *total = be32_to_cpup(p); return dn; } @@ -197,7 +192,6 @@ static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) static struct device_node *find_pe_dn(struct pci_dev *dev, int *total) { struct device_node *dn; - struct pci_dn *pdn; struct eeh_dev *edev; /* Found our PE and assume 8 at that point. */ @@ -210,8 +204,7 @@ static struct device_node *find_pe_dn(struct pci_dev *dev, int *total) edev = pdn_to_eeh_dev(PCI_DN(dn)); if (edev->pe) edev = list_first_entry(&edev->pe->edevs, struct eeh_dev, list); - pdn = eeh_dev_to_pdn(edev); - dn = pdn ? pdn->node : NULL; + dn = pci_device_to_OF_node(edev->pdev); if (!dn) return NULL; @@ -222,7 +215,7 @@ static struct device_node *find_pe_dn(struct pci_dev *dev, int *total) /* Hardcode of 8 for old firmwares */ *total = 8; - pr_debug("rtas_msi: using PE dn %s\n", dn->full_name); + pr_debug("rtas_msi: using PE dn %pOF\n", dn); return dn; } @@ -242,7 +235,7 @@ static void *count_non_bridge_devices(struct device_node *dn, void *data) const __be32 *p; u32 class; - pr_debug("rtas_msi: counting %s\n", dn->full_name); + pr_debug("rtas_msi: counting %pOF\n", dn); p = of_get_property(dn, "class-code", NULL); class = p ? be32_to_cpup(p) : 0; @@ -300,7 +293,7 @@ static int msi_quota_for_device(struct pci_dev *dev, int request) goto out; } - pr_debug("rtas_msi: found PE %s\n", pe_dn->full_name); + pr_debug("rtas_msi: found PE %pOF\n", pe_dn); memset(&counts, 0, sizeof(struct msi_counts)); diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index 547fd13e4f8e..561917fa54a8 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -38,7 +38,7 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn) { struct pci_controller *phb; - pr_debug("PCI: Initializing new hotplug PHB %s\n", dn->full_name); + pr_debug("PCI: Initializing new hotplug PHB %pOF\n", dn); phb = pcibios_alloc_controller(dn); if (!phb) diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 1361a9db534b..4470a3194311 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -46,7 +46,7 @@ extern void dlpar_free_cc_nodes(struct device_node *); extern void dlpar_free_cc_property(struct property *); extern struct device_node *dlpar_configure_connector(__be32, struct device_node *); -extern int dlpar_attach_node(struct device_node *); +extern int dlpar_attach_node(struct device_node *, struct device_node *); extern int dlpar_detach_node(struct device_node *); extern int dlpar_acquire_drc(u32 drc_index); extern int dlpar_release_drc(u32 drc_index); diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c b/arch/powerpc/platforms/pseries/pseries_energy.c index 164a13d3998a..35c891aabef0 100644 --- a/arch/powerpc/platforms/pseries/pseries_energy.c +++ b/arch/powerpc/platforms/pseries/pseries_energy.c @@ -229,10 +229,9 @@ static int __init pseries_energy_init(void) int cpu, err; struct device *cpu_dev; - if (!firmware_has_feature(FW_FEATURE_BEST_ENERGY)) { - printk(KERN_INFO "Hypercall H_BEST_ENERGY not supported\n"); - return 0; - } + if (!firmware_has_feature(FW_FEATURE_BEST_ENERGY)) + return 0; /* H_BEST_ENERGY hcall not supported */ + /* Create the sysfs files */ err = device_create_file(cpu_subsys.dev_root, &attr_cpu_activate_hint_list); diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index bb70b26334f0..4923ffe230cf 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -379,6 +379,21 @@ static void fwnmi_release_errinfo(void) int pSeries_system_reset_exception(struct pt_regs *regs) { +#ifdef __LITTLE_ENDIAN__ + /* + * Some firmware byteswaps SRR registers and gives incorrect SRR1. Try + * to detect the bad SRR1 pattern here. Flip the NIP back to correct + * endian for reporting purposes. Unfortunately the MSR can't be fixed, + * so clear it. It will be missing MSR_RI so we won't try to recover. + */ + if ((be64_to_cpu(regs->msr) & + (MSR_LE|MSR_RI|MSR_DR|MSR_IR|MSR_ME|MSR_PR| + MSR_ILE|MSR_HV|MSR_SF)) == (MSR_DR|MSR_SF)) { + regs->nip = be64_to_cpu((__be64)regs->nip); + regs->msr = 0; + } +#endif + if (fwnmi_active) { struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs); if (errhdr) { diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c index 011ef2180fe6..296c188fd5ca 100644 --- a/arch/powerpc/platforms/pseries/reconfig.c +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -362,20 +362,13 @@ static int do_update_property(char *buf, size_t bufsize) static ssize_t ofdt_write(struct file *file, const char __user *buf, size_t count, loff_t *off) { - int rv = 0; + int rv; char *kbuf; char *tmp; - if (!(kbuf = kmalloc(count + 1, GFP_KERNEL))) { - rv = -ENOMEM; - goto out; - } - if (copy_from_user(kbuf, buf, count)) { - rv = -EFAULT; - goto out; - } - - kbuf[count] = '\0'; + kbuf = memdup_user_nul(buf, count); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); tmp = strchr(kbuf, ' '); if (!tmp) { diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index b5d86426e97b..5f1beb8367ac 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -57,6 +57,7 @@ #include <asm/nvram.h> #include <asm/pmc.h> #include <asm/xics.h> +#include <asm/xive.h> #include <asm/ppc-pci.h> #include <asm/i8259.h> #include <asm/udbg.h> @@ -176,8 +177,11 @@ static void __init pseries_setup_i8259_cascade(void) static void __init pseries_init_irq(void) { - xics_init(); - pseries_setup_i8259_cascade(); + /* Try using a XIVE if available, otherwise use a XICS */ + if (!xive_spapr_init()) { + xics_init(); + pseries_setup_i8259_cascade(); + } } static void pseries_lpar_enable_pmcs(void) @@ -722,7 +726,6 @@ define_machine(pseries) { .pcibios_fixup = pSeries_final_fixup, .restart = rtas_restart, .halt = rtas_halt, - .panic = rtas_os_term, .get_boot_time = rtas_get_boot_time, .get_rtc_time = rtas_get_rtc_time, .set_rtc_time = rtas_set_rtc_time, diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 24785f63fb40..2e184829e5d4 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -41,6 +41,7 @@ #include <asm/vdso_datapage.h> #include <asm/cputhreads.h> #include <asm/xics.h> +#include <asm/xive.h> #include <asm/dbell.h> #include <asm/plpar_wrappers.h> #include <asm/code-patching.h> @@ -136,7 +137,9 @@ out: static void smp_setup_cpu(int cpu) { - if (cpu != boot_cpuid) + if (xive_enabled()) + xive_smp_setup_cpu(); + else if (cpu != boot_cpuid) xics_setup_cpu(); if (firmware_has_feature(FW_FEATURE_SPLPAR)) @@ -181,6 +184,13 @@ static int smp_pSeries_kick_cpu(int nr) return 0; } +static int pseries_smp_prepare_cpu(int cpu) +{ + if (xive_enabled()) + return xive_smp_prepare_cpu(cpu); + return 0; +} + static void smp_pseries_cause_ipi(int cpu) { /* POWER9 should not use this handler */ @@ -211,7 +221,7 @@ static int pseries_cause_nmi_ipi(int cpu) return 0; } -static __init void pSeries_smp_probe(void) +static __init void pSeries_smp_probe_xics(void) { xics_smp_probe(); @@ -221,11 +231,24 @@ static __init void pSeries_smp_probe(void) smp_ops->cause_ipi = icp_ops->cause_ipi; } +static __init void pSeries_smp_probe(void) +{ + if (xive_enabled()) + /* + * Don't use P9 doorbells when XIVE is enabled. IPIs + * using MMIOs should be faster + */ + xive_smp_probe(); + else + pSeries_smp_probe_xics(); +} + static struct smp_ops_t pseries_smp_ops = { .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */ .cause_ipi = NULL, /* Filled at runtime by pSeries_smp_probe() */ .cause_nmi_ipi = pseries_cause_nmi_ipi, .probe = pSeries_smp_probe, + .prepare_cpu = pseries_smp_prepare_cpu, .kick_cpu = smp_pSeries_kick_cpu, .setup_cpu = smp_setup_cpu, .cpu_bootable = smp_generic_cpu_bootable, diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 8a47f168476b..12277bc9fd9e 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -1357,14 +1357,14 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node) */ parent_node = of_get_parent(of_node); if (parent_node) { - if (!strcmp(parent_node->full_name, "/ibm,platform-facilities")) + if (!strcmp(parent_node->type, "ibm,platform-facilities")) family = PFO; - else if (!strcmp(parent_node->full_name, "/vdevice")) + else if (!strcmp(parent_node->type, "vdevice")) family = VDEVICE; else { - pr_warn("%s: parent(%s) of %s not recognized.\n", + pr_warn("%s: parent(%pOF) of %s not recognized.\n", __func__, - parent_node->full_name, + parent_node, of_node_name); of_node_put(parent_node); return NULL; @@ -1555,7 +1555,7 @@ static ssize_t devspec_show(struct device *dev, { struct device_node *of_node = dev->of_node; - return sprintf(buf, "%s\n", of_node_full_name(of_node)); + return sprintf(buf, "%pOF\n", of_node); } static DEVICE_ATTR_RO(devspec); diff --git a/arch/powerpc/purgatory/trampoline.S b/arch/powerpc/purgatory/trampoline.S index 3696ea6c4826..4aad9dd10ace 100644 --- a/arch/powerpc/purgatory/trampoline.S +++ b/arch/powerpc/purgatory/trampoline.S @@ -67,7 +67,7 @@ master: mr %r16,%r3 /* save dt address in reg16 */ li %r4,20 LWZX_BE %r6,%r3,%r4 /* fetch __be32 version number at byte 20 */ - cmpwi %r0,%r6,2 /* v2 or later? */ + cmpwi %cr0,%r6,2 /* v2 or later? */ blt 1f li %r4,28 STWX_BE %r17,%r3,%r4 /* Store my cpu as __be32 at byte 28 */ @@ -104,13 +104,13 @@ master: .balign 8 .globl kernel kernel: - .llong 0x0 + .8byte 0x0 .size kernel, . - kernel .balign 8 .globl dt_offset dt_offset: - .llong 0x0 + .8byte 0x0 .size dt_offset, . - dt_offset diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile index c0ae11d4f62f..79416fa2e3ba 100644 --- a/arch/powerpc/sysdev/Makefile +++ b/arch/powerpc/sysdev/Makefile @@ -36,25 +36,15 @@ obj-$(CONFIG_AXON_RAM) += axonram.o obj-$(CONFIG_PPC_INDIRECT_PCI) += indirect_pci.o obj-$(CONFIG_PPC_I8259) += i8259.o obj-$(CONFIG_IPIC) += ipic.o -obj-$(CONFIG_4xx) += uic.o -obj-$(CONFIG_PPC4xx_OCM) += ppc4xx_ocm.o -obj-$(CONFIG_4xx_SOC) += ppc4xx_soc.o obj-$(CONFIG_XILINX_VIRTEX) += xilinx_intc.o obj-$(CONFIG_XILINX_PCI) += xilinx_pci.o obj-$(CONFIG_OF_RTC) += of_rtc.o -ifeq ($(CONFIG_PCI),y) -obj-$(CONFIG_4xx) += ppc4xx_pci.o -endif -obj-$(CONFIG_PPC4xx_HSTA_MSI) += ppc4xx_hsta_msi.o -obj-$(CONFIG_PPC4xx_MSI) += ppc4xx_msi.o -obj-$(CONFIG_PPC4xx_CPM) += ppc4xx_cpm.o -obj-$(CONFIG_PPC4xx_GPIO) += ppc4xx_gpio.o obj-$(CONFIG_CPM) += cpm_common.o +obj-$(CONFIG_CPM1) += cpm1.o obj-$(CONFIG_CPM2) += cpm2.o cpm2_pic.o obj-$(CONFIG_QUICC_ENGINE) += cpm_common.o obj-$(CONFIG_PPC_DCR) += dcr.o -obj-$(CONFIG_8xx) += mpc8xx_pic.o cpm1.o obj-$(CONFIG_UCODE_PATCH) += micropatch.o obj-$(CONFIG_PPC_MPC512x) += mpc5xxx_clocks.o diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index 2799706106c6..c60e84e4558d 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -110,7 +110,7 @@ axon_ram_irq_handler(int irq, void *dev) static blk_qc_t axon_ram_make_request(struct request_queue *queue, struct bio *bio) { - struct axon_ram_bank *bank = bio->bi_bdev->bd_disk->private_data; + struct axon_ram_bank *bank = bio->bi_disk->private_data; unsigned long phys_mem, phys_end; void *user_mem; struct bio_vec vec; @@ -188,15 +188,12 @@ static int axon_ram_probe(struct platform_device *device) axon_ram_bank_id++; - dev_info(&device->dev, "Found memory controller on %s\n", - device->dev.of_node->full_name); + dev_info(&device->dev, "Found memory controller on %pOF\n", + device->dev.of_node); - bank = kzalloc(sizeof(struct axon_ram_bank), GFP_KERNEL); - if (bank == NULL) { - dev_err(&device->dev, "Out of memory\n"); - rc = -ENOMEM; - goto failed; - } + bank = kzalloc(sizeof(*bank), GFP_KERNEL); + if (!bank) + return -ENOMEM; device->dev.platform_data = bank; @@ -292,25 +289,22 @@ static int axon_ram_probe(struct platform_device *device) return 0; failed: - if (bank != NULL) { - if (bank->irq_id) - free_irq(bank->irq_id, device); - if (bank->disk != NULL) { - if (bank->disk->major > 0) - unregister_blkdev(bank->disk->major, - bank->disk->disk_name); - if (bank->disk->flags & GENHD_FL_UP) - del_gendisk(bank->disk); - put_disk(bank->disk); - } - kill_dax(bank->dax_dev); - put_dax(bank->dax_dev); - device->dev.platform_data = NULL; - if (bank->io_addr != 0) - iounmap((void __iomem *) bank->io_addr); - kfree(bank); + if (bank->irq_id) + free_irq(bank->irq_id, device); + if (bank->disk != NULL) { + if (bank->disk->major > 0) + unregister_blkdev(bank->disk->major, + bank->disk->disk_name); + if (bank->disk->flags & GENHD_FL_UP) + del_gendisk(bank->disk); + put_disk(bank->disk); } - + kill_dax(bank->dax_dev); + put_dax(bank->dax_dev); + device->dev.platform_data = NULL; + if (bank->io_addr != 0) + iounmap((void __iomem *) bank->io_addr); + kfree(bank); return rc; } diff --git a/arch/powerpc/sysdev/dcr.c b/arch/powerpc/sysdev/dcr.c index 121e26fffd50..d72eda568b7d 100644 --- a/arch/powerpc/sysdev/dcr.c +++ b/arch/powerpc/sysdev/dcr.c @@ -195,8 +195,8 @@ dcr_host_mmio_t dcr_map_mmio(struct device_node *dev, dcr_host_mmio_t ret = { .token = NULL, .stride = 0, .base = dcr_n }; u64 addr; - pr_debug("dcr_map(%s, 0x%x, 0x%x)\n", - dev->full_name, dcr_n, dcr_c); + pr_debug("dcr_map(%pOF, 0x%x, 0x%x)\n", + dev, dcr_n, dcr_c); addr = of_translate_dcr_address(dev, dcr_n, &ret.stride); pr_debug("translates to addr: 0x%llx, stride: 0x%x\n", diff --git a/arch/powerpc/sysdev/fsl_85xx_cache_sram.c b/arch/powerpc/sysdev/fsl_85xx_cache_sram.c index 37a69097e022..00ccf3e4fcb4 100644 --- a/arch/powerpc/sysdev/fsl_85xx_cache_sram.c +++ b/arch/powerpc/sysdev/fsl_85xx_cache_sram.c @@ -101,8 +101,8 @@ int __init instantiate_cache_sram(struct platform_device *dev, if (!request_mem_region(cache_sram->base_phys, cache_sram->size, "fsl_85xx_cache_sram")) { - dev_err(&dev->dev, "%s: request memory failed\n", - dev->dev.of_node->full_name); + dev_err(&dev->dev, "%pOF: request memory failed\n", + dev->dev.of_node); ret = -ENXIO; goto out_free; } @@ -110,16 +110,16 @@ int __init instantiate_cache_sram(struct platform_device *dev, cache_sram->base_virt = ioremap_prot(cache_sram->base_phys, cache_sram->size, _PAGE_COHERENT | PAGE_KERNEL); if (!cache_sram->base_virt) { - dev_err(&dev->dev, "%s: ioremap_prot failed\n", - dev->dev.of_node->full_name); + dev_err(&dev->dev, "%pOF: ioremap_prot failed\n", + dev->dev.of_node); ret = -ENOMEM; goto out_release; } cache_sram->rh = rh_create(sizeof(unsigned int)); if (IS_ERR(cache_sram->rh)) { - dev_err(&dev->dev, "%s: Unable to create remote heap\n", - dev->dev.of_node->full_name); + dev_err(&dev->dev, "%pOF: Unable to create remote heap\n", + dev->dev.of_node); ret = PTR_ERR(cache_sram->rh); goto out_unmap; } diff --git a/arch/powerpc/sysdev/fsl_gtm.c b/arch/powerpc/sysdev/fsl_gtm.c index a6f0b96ce2c9..d902306f4718 100644 --- a/arch/powerpc/sysdev/fsl_gtm.c +++ b/arch/powerpc/sysdev/fsl_gtm.c @@ -388,8 +388,8 @@ static int __init fsl_gtm_init(void) gtm = kzalloc(sizeof(*gtm), GFP_KERNEL); if (!gtm) { - pr_err("%s: unable to allocate memory\n", - np->full_name); + pr_err("%pOF: unable to allocate memory\n", + np); continue; } @@ -397,7 +397,7 @@ static int __init fsl_gtm_init(void) clock = of_get_property(np, "clock-frequency", &size); if (!clock || size != sizeof(*clock)) { - pr_err("%s: no clock-frequency\n", np->full_name); + pr_err("%pOF: no clock-frequency\n", np); goto err; } gtm->clock = *clock; @@ -407,8 +407,8 @@ static int __init fsl_gtm_init(void) irq = irq_of_parse_and_map(np, i); if (!irq) { - pr_err("%s: not enough interrupts specified\n", - np->full_name); + pr_err("%pOF: not enough interrupts specified\n", + np); goto err; } gtm->timers[i].irq = irq; @@ -417,8 +417,8 @@ static int __init fsl_gtm_init(void) gtm->regs = of_iomap(np, 0); if (!gtm->regs) { - pr_err("%s: unable to iomap registers\n", - np->full_name); + pr_err("%pOF: unable to iomap registers\n", + np); goto err; } diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index 8a244828782e..44cbf4c12ea1 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -214,8 +214,8 @@ static int fsl_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) phandle = np->phandle; else { dev_err(&pdev->dev, - "node %s has an invalid fsl,msi phandle %u\n", - hose->dn->full_name, np->phandle); + "node %pOF has an invalid fsl,msi phandle %u\n", + hose->dn, np->phandle); return -EINVAL; } } @@ -438,16 +438,16 @@ static int fsl_of_msi_probe(struct platform_device *dev) if ((features->fsl_pic_ip & FSL_PIC_IP_MASK) != FSL_PIC_IP_VMPIC) { err = of_address_to_resource(dev->dev.of_node, 0, &res); if (err) { - dev_err(&dev->dev, "invalid resource for node %s\n", - dev->dev.of_node->full_name); + dev_err(&dev->dev, "invalid resource for node %pOF\n", + dev->dev.of_node); goto error_out; } msi->msi_regs = ioremap(res.start, resource_size(&res)); if (!msi->msi_regs) { err = -ENOMEM; - dev_err(&dev->dev, "could not map node %s\n", - dev->dev.of_node->full_name); + dev_err(&dev->dev, "could not map node %pOF\n", + dev->dev.of_node); goto error_out; } msi->msiir_offset = @@ -522,8 +522,8 @@ static int fsl_of_msi_probe(struct platform_device *dev) for (irq_index = 0, i = 0; i < len / (2 * sizeof(u32)); i++) { if (p[i * 2] % IRQS_PER_MSI_REG || p[i * 2 + 1] % IRQS_PER_MSI_REG) { - pr_warn("%s: %s: msi available range of %u at %u is not IRQ-aligned\n", - __func__, dev->dev.of_node->full_name, + pr_warn("%s: %pOF: msi available range of %u at %u is not IRQ-aligned\n", + __func__, dev->dev.of_node, p[i * 2 + 1], p[i * 2]); err = -EINVAL; goto error_out; diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index d3a597456b6e..22d98057f773 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -202,7 +202,6 @@ static void setup_pci_atmu(struct pci_controller *hose) u32 pcicsrbar = 0, pcicsrbar_sz; u32 piwar = PIWAR_EN | PIWAR_PF | PIWAR_TGI_LOCAL | PIWAR_READ_SNOOP | PIWAR_WRITE_SNOOP; - const char *name = hose->dn->full_name; const u64 *reg; int len; bool setup_inbound; @@ -290,12 +289,12 @@ static void setup_pci_atmu(struct pci_controller *hose) paddr_lo -= offset; if (paddr_hi == paddr_lo) { - pr_err("%s: No outbound window space\n", name); + pr_err("%pOF: No outbound window space\n", hose->dn); return; } if (paddr_lo == 0) { - pr_err("%s: No space for inbound window\n", name); + pr_err("%pOF: No space for inbound window\n", hose->dn); return; } @@ -313,7 +312,7 @@ static void setup_pci_atmu(struct pci_controller *hose) paddr_lo = min(paddr_lo, (u64)pcicsrbar); - pr_info("%s: PCICSRBAR @ 0x%x\n", name, pcicsrbar); + pr_info("%pOF: PCICSRBAR @ 0x%x\n", hose->dn, pcicsrbar); /* Setup inbound mem window */ mem = memblock_end_of_DRAM(); @@ -336,12 +335,12 @@ static void setup_pci_atmu(struct pci_controller *hose) u64 address = be64_to_cpup(reg); if ((address >= mem) && (address < (mem + PAGE_SIZE))) { - pr_info("%s: extending DDR ATMU to cover MSIIR", name); + pr_info("%pOF: extending DDR ATMU to cover MSIIR", hose->dn); mem += PAGE_SIZE; } else { /* TODO: Create a new ATMU for MSIIR */ - pr_warn("%s: msi-address-64 address of %llx is " - "unsupported\n", name, address); + pr_warn("%pOF: msi-address-64 address of %llx is " + "unsupported\n", hose->dn, address); } } @@ -354,8 +353,8 @@ static void setup_pci_atmu(struct pci_controller *hose) if ((1ull << mem_log) != mem) { mem_log++; if ((1ull << mem_log) > mem) - pr_info("%s: Setting PCI inbound window " - "greater than memory size\n", name); + pr_info("%pOF: Setting PCI inbound window " + "greater than memory size\n", hose->dn); } piwar |= ((mem_log - 1) & PIWAR_SZ_MASK); @@ -402,7 +401,7 @@ static void setup_pci_atmu(struct pci_controller *hose) */ ppc_md.dma_set_mask = fsl_pci_dma_set_mask; - pr_info("%s: Setup 64-bit PCI DMA window\n", name); + pr_info("%pOF: Setup 64-bit PCI DMA window\n", hose->dn); } } else { u64 paddr = 0; @@ -443,18 +442,18 @@ static void setup_pci_atmu(struct pci_controller *hose) #ifdef CONFIG_SWIOTLB ppc_swiotlb_enable = 1; #else - pr_err("%s: ERROR: Memory size exceeds PCI ATMU ability to " + pr_err("%pOF: ERROR: Memory size exceeds PCI ATMU ability to " "map - enable CONFIG_SWIOTLB to avoid dma errors.\n", - name); + hose->dn); #endif /* adjusting outbound windows could reclaim space in mem map */ if (paddr_hi < 0xffffffffull) - pr_warning("%s: WARNING: Outbound window cfg leaves " + pr_warning("%pOF: WARNING: Outbound window cfg leaves " "gaps in memory map. Adjusting the memory map " "could reduce unnecessary bounce buffering.\n", - name); + hose->dn); - pr_info("%s: DMA window size is 0x%llx\n", name, + pr_info("%pOF: DMA window size is 0x%llx\n", hose->dn, (u64)hose->dma_window_size); } } @@ -532,11 +531,11 @@ int fsl_add_bridge(struct platform_device *pdev, int is_primary) dev = pdev->dev.of_node; if (!of_device_is_available(dev)) { - pr_warning("%s: disabled\n", dev->full_name); + pr_warning("%pOF: disabled\n", dev); return -ENODEV; } - pr_debug("Adding PCI host bridge %s\n", dev->full_name); + pr_debug("Adding PCI host bridge %pOF\n", dev); /* Fetch host bridge registers address */ if (of_address_to_resource(dev, 0, &rsrc)) { @@ -547,8 +546,8 @@ int fsl_add_bridge(struct platform_device *pdev, int is_primary) /* Get bus range if any */ bus_range = of_get_property(dev, "bus-range", &len); if (bus_range == NULL || len < 2 * sizeof(int)) - printk(KERN_WARNING "Can't get bus-range for %s, assume" - " bus 0\n", dev->full_name); + printk(KERN_WARNING "Can't get bus-range for %pOF, assume" + " bus 0\n", dev); pci_add_flags(PCI_REASSIGN_ALL_BUS); hose = pcibios_alloc_controller(dev); @@ -809,11 +808,11 @@ int __init mpc83xx_add_bridge(struct device_node *dev) is_mpc83xx_pci = 1; if (!of_device_is_available(dev)) { - pr_warning("%s: disabled by the firmware.\n", - dev->full_name); + pr_warning("%pOF: disabled by the firmware.\n", + dev); return -ENODEV; } - pr_debug("Adding PCI host bridge %s\n", dev->full_name); + pr_debug("Adding PCI host bridge %pOF\n", dev); /* Fetch host bridge registers address */ if (of_address_to_resource(dev, 0, &rsrc_reg)) { @@ -848,8 +847,8 @@ int __init mpc83xx_add_bridge(struct device_node *dev) /* Get bus range if any */ bus_range = of_get_property(dev, "bus-range", &len); if (bus_range == NULL || len < 2 * sizeof(int)) { - printk(KERN_WARNING "Can't get bus-range for %s, assume" - " bus 0\n", dev->full_name); + printk(KERN_WARNING "Can't get bus-range for %pOF, assume" + " bus 0\n", dev); } pci_add_flags(PCI_REASSIGN_ALL_BUS); diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c index 1c41c51f22cb..9234be1e66f5 100644 --- a/arch/powerpc/sysdev/fsl_rio.c +++ b/arch/powerpc/sysdev/fsl_rio.c @@ -450,12 +450,12 @@ int fsl_rio_setup(struct platform_device *dev) rc = of_address_to_resource(dev->dev.of_node, 0, ®s); if (rc) { - dev_err(&dev->dev, "Can't get %s property 'reg'\n", - dev->dev.of_node->full_name); + dev_err(&dev->dev, "Can't get %pOF property 'reg'\n", + dev->dev.of_node); return -EFAULT; } - dev_info(&dev->dev, "Of-device full name %s\n", - dev->dev.of_node->full_name); + dev_info(&dev->dev, "Of-device full name %pOF\n", + dev->dev.of_node); dev_info(&dev->dev, "Regs: %pR\n", ®s); rio_regs_win = ioremap(regs.start, resource_size(®s)); @@ -494,8 +494,8 @@ int fsl_rio_setup(struct platform_device *dev) } rc = of_address_to_resource(rmu_node, 0, &rmu_regs); if (rc) { - dev_err(&dev->dev, "Can't get %s property 'reg'\n", - rmu_node->full_name); + dev_err(&dev->dev, "Can't get %pOF property 'reg'\n", + rmu_node); goto err_rmu; } rmu_regs_win = ioremap(rmu_regs.start, resource_size(&rmu_regs)); @@ -529,8 +529,8 @@ int fsl_rio_setup(struct platform_device *dev) aw = of_n_addr_cells(np); dt_range = of_get_property(np, "reg", &rlen); if (!dt_range) { - pr_err("%s: unable to find 'reg' property\n", - np->full_name); + pr_err("%pOF: unable to find 'reg' property\n", + np); rc = -ENOMEM; goto err_pw; } @@ -557,8 +557,8 @@ int fsl_rio_setup(struct platform_device *dev) aw = of_n_addr_cells(np); dt_range = of_get_property(np, "reg", &rlen); if (!dt_range) { - pr_err("%s: unable to find 'reg' property\n", - np->full_name); + pr_err("%pOF: unable to find 'reg' property\n", + np); rc = -ENOMEM; goto err; } @@ -569,15 +569,15 @@ int fsl_rio_setup(struct platform_device *dev) for_each_child_of_node(dev->dev.of_node, np) { port_index = of_get_property(np, "cell-index", NULL); if (!port_index) { - dev_err(&dev->dev, "Can't get %s property 'cell-index'\n", - np->full_name); + dev_err(&dev->dev, "Can't get %pOF property 'cell-index'\n", + np); continue; } dt_range = of_get_property(np, "ranges", &rlen); if (!dt_range) { - dev_err(&dev->dev, "Can't get %s property 'ranges'\n", - np->full_name); + dev_err(&dev->dev, "Can't get %pOF property 'ranges'\n", + np); continue; } @@ -598,8 +598,8 @@ int fsl_rio_setup(struct platform_device *dev) range_start = of_read_number(dt_range + aw, paw); range_size = of_read_number(dt_range + aw + paw, sw); - dev_info(&dev->dev, "%s: LAW start 0x%016llx, size 0x%016llx.\n", - np->full_name, range_start, range_size); + dev_info(&dev->dev, "%pOF: LAW start 0x%016llx, size 0x%016llx.\n", + np, range_start, range_size); port = kzalloc(sizeof(struct rio_mport), GFP_KERNEL); if (!port) @@ -757,8 +757,8 @@ err_rio_regs: */ static int fsl_of_rio_rpn_probe(struct platform_device *dev) { - printk(KERN_INFO "Setting up RapidIO peer-to-peer network %s\n", - dev->dev.of_node->full_name); + printk(KERN_INFO "Setting up RapidIO peer-to-peer network %pOF\n", + dev->dev.of_node); return fsl_rio_setup(dev); }; diff --git a/arch/powerpc/sysdev/fsl_rmu.c b/arch/powerpc/sysdev/fsl_rmu.c index c1826de4e749..ab7a74c75be8 100644 --- a/arch/powerpc/sysdev/fsl_rmu.c +++ b/arch/powerpc/sysdev/fsl_rmu.c @@ -1074,8 +1074,8 @@ int fsl_rio_setup_rmu(struct rio_mport *mport, struct device_node *node) priv = mport->priv; if (!node) { - dev_warn(priv->dev, "Can't get %s property 'fsl,rmu'\n", - priv->dev->of_node->full_name); + dev_warn(priv->dev, "Can't get %pOF property 'fsl,rmu'\n", + priv->dev->of_node); return -EINVAL; } @@ -1086,8 +1086,8 @@ int fsl_rio_setup_rmu(struct rio_mport *mport, struct device_node *node) aw = of_n_addr_cells(node); msg_addr = of_get_property(node, "reg", &mlen); if (!msg_addr) { - pr_err("%s: unable to find 'reg' property of message-unit\n", - node->full_name); + pr_err("%pOF: unable to find 'reg' property of message-unit\n", + node); kfree(rmu); return -ENOMEM; } @@ -1098,8 +1098,8 @@ int fsl_rio_setup_rmu(struct rio_mport *mport, struct device_node *node) rmu->txirq = irq_of_parse_and_map(node, 0); rmu->rxirq = irq_of_parse_and_map(node, 1); - printk(KERN_INFO "%s: txirq: %d, rxirq %d\n", - node->full_name, rmu->txirq, rmu->rxirq); + printk(KERN_INFO "%pOF: txirq: %d, rxirq %d\n", + node, rmu->txirq, rmu->rxirq); priv->rmm_handle = rmu; diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c index 19101f9cfcfc..1f614fb2be56 100644 --- a/arch/powerpc/sysdev/fsl_soc.c +++ b/arch/powerpc/sysdev/fsl_soc.c @@ -98,7 +98,7 @@ u32 fsl_get_sys_freq(void) } EXPORT_SYMBOL(fsl_get_sys_freq); -#if defined(CONFIG_CPM2) || defined(CONFIG_QUICC_ENGINE) || defined(CONFIG_8xx) +#if defined(CONFIG_CPM) || defined(CONFIG_QUICC_ENGINE) u32 get_brgfreq(void) { diff --git a/arch/powerpc/sysdev/fsl_soc.h b/arch/powerpc/sysdev/fsl_soc.h index d73daa4f0ccf..2640446f8bc4 100644 --- a/arch/powerpc/sysdev/fsl_soc.h +++ b/arch/powerpc/sysdev/fsl_soc.h @@ -7,7 +7,7 @@ struct spi_device; extern phys_addr_t get_immrbase(void); -#if defined(CONFIG_CPM2) || defined(CONFIG_QUICC_ENGINE) || defined(CONFIG_8xx) +#if defined(CONFIG_CPM) || defined(CONFIG_QUICC_ENGINE) extern u32 get_brgfreq(void); extern u32 get_baudrate(void); #else diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index f267ee0afc08..16f1edd78c40 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -315,6 +315,7 @@ static struct ipic_info ipic_info[] = { .prio_mask = 7, }, [48] = { + .ack = IPIC_SEPNR, .mask = IPIC_SEMSR, .prio = IPIC_SMPRR_A, .force = IPIC_SEFCR, diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index b9aac951a90f..ead3e2549ebf 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -1650,8 +1650,8 @@ void __init mpic_init(struct mpic *mpic) if (mpic->flags & MPIC_SECONDARY) { int virq = irq_of_parse_and_map(mpic->node, 0); if (virq) { - printk(KERN_INFO "%s: hooking up to IRQ %d\n", - mpic->node->full_name, virq); + printk(KERN_INFO "%pOF: hooking up to IRQ %d\n", + mpic->node, virq); irq_set_handler_data(virq, mpic); irq_set_chained_handler(virq, &mpic_cascade); } diff --git a/arch/powerpc/sysdev/mpic_msgr.c b/arch/powerpc/sysdev/mpic_msgr.c index db2286be5d9a..eb69a5186243 100644 --- a/arch/powerpc/sysdev/mpic_msgr.c +++ b/arch/powerpc/sysdev/mpic_msgr.c @@ -192,7 +192,7 @@ static int mpic_msgr_probe(struct platform_device *dev) return -ENOMEM; } } - dev_info(&dev->dev, "Of-device full name %s\n", np->full_name); + dev_info(&dev->dev, "Of-device full name %pOF\n", np); /* IO map the message register block. */ of_address_to_resource(np, 0, &rsrc); diff --git a/arch/powerpc/sysdev/mpic_msi.c b/arch/powerpc/sysdev/mpic_msi.c index 1d48a5385905..9ed860aee9c3 100644 --- a/arch/powerpc/sysdev/mpic_msi.c +++ b/arch/powerpc/sysdev/mpic_msi.c @@ -60,7 +60,7 @@ static int mpic_msi_reserve_u3_hwirqs(struct mpic *mpic) np = NULL; while ((np = of_find_all_nodes(np))) { - pr_debug("mpic: mapping hwirqs for %s\n", np->full_name); + pr_debug("mpic: mapping hwirqs for %pOF\n", np); index = 0; while (of_irq_parse_one(np, index++, &oirq) == 0) { diff --git a/arch/powerpc/sysdev/mpic_timer.c b/arch/powerpc/sysdev/mpic_timer.c index 9d9b06217f8b..a418579591be 100644 --- a/arch/powerpc/sysdev/mpic_timer.c +++ b/arch/powerpc/sysdev/mpic_timer.c @@ -466,8 +466,7 @@ static int timer_group_get_irq(struct device_node *np, p = of_get_property(np, "fsl,available-ranges", &len); if (p && len % (2 * sizeof(u32)) != 0) { - pr_err("%s: malformed available-ranges property.\n", - np->full_name); + pr_err("%pOF: malformed available-ranges property.\n", np); return -EINVAL; } @@ -484,8 +483,7 @@ static int timer_group_get_irq(struct device_node *np, for (j = 0; j < count; j++) { irq = irq_of_parse_and_map(np, irq_index); if (!irq) { - pr_err("%s: irq parse and map failed.\n", - np->full_name); + pr_err("%pOF: irq parse and map failed.\n", np); return -EINVAL; } @@ -508,8 +506,7 @@ static void timer_group_init(struct device_node *np) priv = kzalloc(sizeof(struct timer_group_priv), GFP_KERNEL); if (!priv) { - pr_err("%s: cannot allocate memory for group.\n", - np->full_name); + pr_err("%pOF: cannot allocate memory for group.\n", np); return; } @@ -518,29 +515,27 @@ static void timer_group_init(struct device_node *np) priv->regs = of_iomap(np, i++); if (!priv->regs) { - pr_err("%s: cannot ioremap timer register address.\n", - np->full_name); + pr_err("%pOF: cannot ioremap timer register address.\n", np); goto out; } if (priv->flags & FSL_GLOBAL_TIMER) { priv->group_tcr = of_iomap(np, i++); if (!priv->group_tcr) { - pr_err("%s: cannot ioremap tcr address.\n", - np->full_name); + pr_err("%pOF: cannot ioremap tcr address.\n", np); goto out; } } ret = timer_group_get_freq(np, priv); if (ret < 0) { - pr_err("%s: cannot get timer frequency.\n", np->full_name); + pr_err("%pOF: cannot get timer frequency.\n", np); goto out; } ret = timer_group_get_irq(np, priv); if (ret < 0) { - pr_err("%s: cannot get timer irqs.\n", np->full_name); + pr_err("%pOF: cannot get timer irqs.\n", np); goto out; } diff --git a/arch/powerpc/sysdev/msi_bitmap.c b/arch/powerpc/sysdev/msi_bitmap.c index 5ebd3f018295..c4dae27172b3 100644 --- a/arch/powerpc/sysdev/msi_bitmap.c +++ b/arch/powerpc/sysdev/msi_bitmap.c @@ -86,13 +86,13 @@ int msi_bitmap_reserve_dt_hwirqs(struct msi_bitmap *bmp) p = of_get_property(bmp->of_node, "msi-available-ranges", &len); if (!p) { pr_debug("msi_bitmap: no msi-available-ranges property " \ - "found on %s\n", bmp->of_node->full_name); + "found on %pOF\n", bmp->of_node); return 1; } if (len % (2 * sizeof(u32)) != 0) { printk(KERN_WARNING "msi_bitmap: Malformed msi-available-ranges" - " property on %s\n", bmp->of_node->full_name); + " property on %pOF\n", bmp->of_node); return -EINVAL; } diff --git a/arch/powerpc/sysdev/mv64x60_dev.c b/arch/powerpc/sysdev/mv64x60_dev.c index 026bbc3b2c47..185a67e742a6 100644 --- a/arch/powerpc/sysdev/mv64x60_dev.c +++ b/arch/powerpc/sysdev/mv64x60_dev.c @@ -452,8 +452,8 @@ static int __init mv64x60_device_setup(void) err = mv64x60_mpsc_device_setup(np, id++); if (err) printk(KERN_ERR "Failed to initialize MV64x60 " - "serial device %s: error %d.\n", - np->full_name, err); + "serial device %pOF: error %d.\n", + np, err); } id = 0; @@ -463,8 +463,8 @@ static int __init mv64x60_device_setup(void) if (IS_ERR(pdev)) { err = PTR_ERR(pdev); printk(KERN_ERR "Failed to initialize MV64x60 " - "network block %s: error %d.\n", - np->full_name, err); + "network block %pOF: error %d.\n", + np, err); continue; } for_each_child_of_node(np, np2) { @@ -474,9 +474,9 @@ static int __init mv64x60_device_setup(void) err = mv64x60_eth_device_setup(np2, id2++, pdev); if (err) printk(KERN_ERR "Failed to initialize " - "MV64x60 network device %s: " + "MV64x60 network device %pOF: " "error %d.\n", - np2->full_name, err); + np2, err); } } @@ -485,8 +485,8 @@ static int __init mv64x60_device_setup(void) err = mv64x60_i2c_device_setup(np, id++); if (err) printk(KERN_ERR "Failed to initialize MV64x60 I2C " - "bus %s: error %d.\n", - np->full_name, err); + "bus %pOF: error %d.\n", + np, err); } /* support up to one watchdog timer */ @@ -494,8 +494,8 @@ static int __init mv64x60_device_setup(void) if (np) { if ((err = mv64x60_wdt_device_setup(np, id))) printk(KERN_ERR "Failed to initialize MV64x60 " - "Watchdog %s: error %d.\n", - np->full_name, err); + "Watchdog %pOF: error %d.\n", + np, err); of_node_put(np); } diff --git a/arch/powerpc/sysdev/mv64x60_pci.c b/arch/powerpc/sysdev/mv64x60_pci.c index 330d56613c5a..d52b3b81e05f 100644 --- a/arch/powerpc/sysdev/mv64x60_pci.c +++ b/arch/powerpc/sysdev/mv64x60_pci.c @@ -70,7 +70,7 @@ static ssize_t mv64x60_hs_reg_write(struct file *filp, struct kobject *kobj, return count; } -static struct bin_attribute mv64x60_hs_reg_attr = { /* Hotswap register */ +static const struct bin_attribute mv64x60_hs_reg_attr = { /* Hotswap register */ .attr = { .name = "hs_reg", .mode = S_IRUGO | S_IWUSR, @@ -136,8 +136,8 @@ static int __init mv64x60_add_bridge(struct device_node *dev) /* Get bus range if any */ bus_range = of_get_property(dev, "bus-range", &len); if (bus_range == NULL || len < 2 * sizeof(int)) - printk(KERN_WARNING "Can't get bus-range for %s, assume" - " bus 0\n", dev->full_name); + printk(KERN_WARNING "Can't get bus-range for %pOF, assume" + " bus 0\n", dev); hose = pcibios_alloc_controller(dev); if (!hose) diff --git a/arch/powerpc/sysdev/of_rtc.c b/arch/powerpc/sysdev/of_rtc.c index 6f54b54b1328..153fdac4720f 100644 --- a/arch/powerpc/sysdev/of_rtc.c +++ b/arch/powerpc/sysdev/of_rtc.c @@ -38,21 +38,21 @@ void __init of_instantiate_rtc(void) res = kmalloc(sizeof(*res), GFP_KERNEL); if (!res) { printk(KERN_ERR "OF RTC: Out of memory " - "allocating resource structure for %s\n", - node->full_name); + "allocating resource structure for %pOF\n", + node); continue; } err = of_address_to_resource(node, 0, res); if (err) { printk(KERN_ERR "OF RTC: Error " - "translating resources for %s\n", - node->full_name); + "translating resources for %pOF\n", + node); continue; } - printk(KERN_INFO "OF_RTC: %s is a %s @ 0x%llx-0x%llx\n", - node->full_name, plat_name, + printk(KERN_INFO "OF_RTC: %pOF is a %s @ 0x%llx-0x%llx\n", + node, plat_name, (unsigned long long)res->start, (unsigned long long)res->end); platform_device_register_simple(plat_name, -1, res, 1); diff --git a/arch/powerpc/sysdev/scom.c b/arch/powerpc/sysdev/scom.c index 76ea32c1b664..0f6fd5d04d33 100644 --- a/arch/powerpc/sysdev/scom.c +++ b/arch/powerpc/sysdev/scom.c @@ -194,12 +194,13 @@ static int scom_debug_init_one(struct dentry *root, struct device_node *dn, ent->dn = of_node_get(dn); snprintf(ent->name, 16, "%08x", i); - ent->path.data = (void*) dn->full_name; - ent->path.size = strlen(dn->full_name); + ent->path.data = (void*)kasprintf(GFP_KERNEL, "%pOF", dn); + ent->path.size = strlen((char *)ent->path.data); dir = debugfs_create_dir(ent->name, root); if (!dir) { of_node_put(dn); + kfree(ent->path.data); kfree(ent); return -1; } diff --git a/arch/powerpc/sysdev/simple_gpio.c b/arch/powerpc/sysdev/simple_gpio.c index 6afddae2fb47..f02d4576138c 100644 --- a/arch/powerpc/sysdev/simple_gpio.c +++ b/arch/powerpc/sysdev/simple_gpio.c @@ -142,7 +142,6 @@ void __init simple_gpiochip_init(const char *compatible) } continue; err: - pr_err("%s: registration failed, status %d\n", - np->full_name, ret); + pr_err("%pOF: registration failed, status %d\n", np, ret); } } diff --git a/arch/powerpc/sysdev/tsi108_pci.c b/arch/powerpc/sysdev/tsi108_pci.c index 5692dd569b9b..28ff1f53cefc 100644 --- a/arch/powerpc/sysdev/tsi108_pci.c +++ b/arch/powerpc/sysdev/tsi108_pci.c @@ -213,8 +213,8 @@ int __init tsi108_setup_pci(struct device_node *dev, u32 cfg_phys, int primary) /* Get bus range if any */ bus_range = of_get_property(dev, "bus-range", &len); if (bus_range == NULL || len < 2 * sizeof(int)) { - printk(KERN_WARNING "Can't get bus-range for %s, assume" - " bus 0\n", dev->full_name); + printk(KERN_WARNING "Can't get bus-range for %pOF, assume" + " bus 0\n", dev); } hose = pcibios_alloc_controller(dev); diff --git a/arch/powerpc/sysdev/xive/Kconfig b/arch/powerpc/sysdev/xive/Kconfig index 12ccd7373d2f..3e3e25b5e30d 100644 --- a/arch/powerpc/sysdev/xive/Kconfig +++ b/arch/powerpc/sysdev/xive/Kconfig @@ -9,3 +9,8 @@ config PPC_XIVE_NATIVE default n select PPC_XIVE depends on PPC_POWERNV + +config PPC_XIVE_SPAPR + bool + default n + select PPC_XIVE diff --git a/arch/powerpc/sysdev/xive/Makefile b/arch/powerpc/sysdev/xive/Makefile index 3fab303fc169..536d6e5706e3 100644 --- a/arch/powerpc/sysdev/xive/Makefile +++ b/arch/powerpc/sysdev/xive/Makefile @@ -2,3 +2,4 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror obj-y += common.o obj-$(CONFIG_PPC_XIVE_NATIVE) += native.o +obj-$(CONFIG_PPC_XIVE_SPAPR) += spapr.o diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 6595462b1fc8..f387318678b9 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -40,7 +40,8 @@ #undef DEBUG_ALL #ifdef DEBUG_ALL -#define DBG_VERBOSE(fmt...) pr_devel(fmt) +#define DBG_VERBOSE(fmt, ...) pr_devel("cpu %d - " fmt, \ + smp_processor_id(), ## __VA_ARGS__) #else #define DBG_VERBOSE(fmt...) do { } while(0) #endif @@ -190,7 +191,7 @@ static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek) * This is used to perform the magic loads from an ESB * described in xive.h */ -static u8 xive_poke_esb(struct xive_irq_data *xd, u32 offset) +static notrace u8 xive_esb_read(struct xive_irq_data *xd, u32 offset) { u64 val; @@ -198,13 +199,28 @@ static u8 xive_poke_esb(struct xive_irq_data *xd, u32 offset) if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG) offset |= offset << 4; - val = in_be64(xd->eoi_mmio + offset); + if ((xd->flags & XIVE_IRQ_FLAG_H_INT_ESB) && xive_ops->esb_rw) + val = xive_ops->esb_rw(xd->hw_irq, offset, 0, 0); + else + val = in_be64(xd->eoi_mmio + offset); return (u8)val; } +static void xive_esb_write(struct xive_irq_data *xd, u32 offset, u64 data) +{ + /* Handle HW errata */ + if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG) + offset |= offset << 4; + + if ((xd->flags & XIVE_IRQ_FLAG_H_INT_ESB) && xive_ops->esb_rw) + xive_ops->esb_rw(xd->hw_irq, offset, data, 1); + else + out_be64(xd->eoi_mmio + offset, data); +} + #ifdef CONFIG_XMON -static void xive_dump_eq(const char *name, struct xive_q *q) +static notrace void xive_dump_eq(const char *name, struct xive_q *q) { u32 i0, i1, idx; @@ -218,7 +234,7 @@ static void xive_dump_eq(const char *name, struct xive_q *q) q->toggle, i0, i1); } -void xmon_xive_do_dump(int cpu) +notrace void xmon_xive_do_dump(int cpu) { struct xive_cpu *xc = per_cpu(xive_cpu, cpu); @@ -227,7 +243,7 @@ void xmon_xive_do_dump(int cpu) xive_dump_eq("IRQ", &xc->queue[xive_irq_priority]); #ifdef CONFIG_SMP { - u64 val = xive_poke_esb(&xc->ipi_data, XIVE_ESB_GET); + u64 val = xive_esb_read(&xc->ipi_data, XIVE_ESB_GET); xmon_printf(" IPI state: %x:%c%c\n", xc->hw_ipi, val & XIVE_ESB_VAL_P ? 'P' : 'p', val & XIVE_ESB_VAL_P ? 'Q' : 'q'); @@ -297,7 +313,7 @@ void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) { /* If the XIVE supports the new "store EOI facility, use it */ if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) - out_be64(xd->eoi_mmio + XIVE_ESB_STORE_EOI, 0); + xive_esb_write(xd, XIVE_ESB_STORE_EOI, 0); else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) { /* * The FW told us to call it. This happens for some @@ -326,10 +342,10 @@ void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) * properly. */ if (xd->flags & XIVE_IRQ_FLAG_LSI) - in_be64(xd->eoi_mmio); + xive_esb_read(xd, XIVE_ESB_LOAD_EOI); else { - eoi_val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_00); - DBG_VERBOSE("eoi_val=%x\n", offset, eoi_val); + eoi_val = xive_esb_read(xd, XIVE_ESB_SET_PQ_00); + DBG_VERBOSE("eoi_val=%x\n", eoi_val); /* Re-trigger if needed */ if ((eoi_val & XIVE_ESB_VAL_Q) && xd->trig_mmio) @@ -383,12 +399,12 @@ static void xive_do_source_set_mask(struct xive_irq_data *xd, * ESB accordingly on unmask. */ if (mask) { - val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_01); + val = xive_esb_read(xd, XIVE_ESB_SET_PQ_01); xd->saved_p = !!(val & XIVE_ESB_VAL_P); } else if (xd->saved_p) - xive_poke_esb(xd, XIVE_ESB_SET_PQ_10); + xive_esb_read(xd, XIVE_ESB_SET_PQ_10); else - xive_poke_esb(xd, XIVE_ESB_SET_PQ_00); + xive_esb_read(xd, XIVE_ESB_SET_PQ_00); } /* @@ -447,7 +463,7 @@ static int xive_find_target_in_mask(const struct cpumask *mask, int cpu, first, num, i; /* Pick up a starting point CPU in the mask based on fuzz */ - num = cpumask_weight(mask); + num = min_t(int, cpumask_weight(mask), nr_cpu_ids); first = fuzz % num; /* Locate it */ @@ -672,6 +688,10 @@ static int xive_irq_set_affinity(struct irq_data *d, if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) return -EINVAL; + /* Don't do anything if the interrupt isn't started */ + if (!irqd_is_started(d)) + return IRQ_SET_MASK_OK; + /* * If existing target is already in the new mask, and is * online then do nothing. @@ -768,7 +788,7 @@ static int xive_irq_retrigger(struct irq_data *d) * To perform a retrigger, we first set the PQ bits to * 11, then perform an EOI. */ - xive_poke_esb(xd, XIVE_ESB_SET_PQ_11); + xive_esb_read(xd, XIVE_ESB_SET_PQ_11); /* * Note: We pass "0" to the hw_irq argument in order to @@ -803,7 +823,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) irqd_set_forwarded_to_vcpu(d); /* Set it to PQ=10 state to prevent further sends */ - pq = xive_poke_esb(xd, XIVE_ESB_SET_PQ_10); + pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_10); /* No target ? nothing to do */ if (xd->target == XIVE_INVALID_TARGET) { @@ -832,7 +852,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) * for sure the queue slot is no longer in use. */ if (pq & 2) { - pq = xive_poke_esb(xd, XIVE_ESB_SET_PQ_11); + pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_11); xd->saved_p = true; /* @@ -989,6 +1009,9 @@ static void xive_ipi_eoi(struct irq_data *d) { struct xive_cpu *xc = __this_cpu_read(xive_cpu); + DBG_VERBOSE("IPI eoi: irq=%d [0x%lx] (HW IRQ 0x%x) pending=%02x\n", + d->irq, irqd_to_hwirq(d), xc->hw_ipi, xc->pending_prio); + /* Handle possible race with unplug and drop stale IPIs */ if (!xc) return; @@ -1368,6 +1391,19 @@ void xive_flush_interrupt(void) #endif /* CONFIG_SMP */ +void xive_teardown_cpu(void) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + unsigned int cpu = smp_processor_id(); + + /* Set CPPR to 0 to disable flow of interrupts */ + xc->cppr = 0; + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0); + + if (xive_ops->teardown_cpu) + xive_ops->teardown_cpu(cpu, xc); +} + void xive_kexec_teardown_cpu(int secondary) { struct xive_cpu *xc = __this_cpu_read(xive_cpu); @@ -1395,8 +1431,8 @@ void xive_shutdown(void) xive_ops->shutdown(); } -bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset, - u8 max_prio) +bool __init xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset, + u8 max_prio) { xive_tima = area; xive_tima_offset = offset; @@ -1424,6 +1460,22 @@ bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset, return true; } +__be32 *xive_queue_page_alloc(unsigned int cpu, u32 queue_shift) +{ + unsigned int alloc_order; + struct page *pages; + __be32 *qpage; + + alloc_order = xive_alloc_order(queue_shift); + pages = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, alloc_order); + if (!pages) + return ERR_PTR(-ENOMEM); + qpage = (__be32 *)page_address(pages); + memset(qpage, 0, 1 << queue_shift); + + return qpage; +} + static int __init xive_off(char *arg) { xive_cmdline_disabled = true; diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 0f95476b01f6..44f3a25ca630 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -82,6 +82,8 @@ int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) return -ENOMEM; } + data->hw_irq = hw_irq; + if (!data->trig_page) return 0; if (data->trig_page == data->eoi_page) { @@ -202,17 +204,12 @@ EXPORT_SYMBOL_GPL(xive_native_disable_queue); static int xive_native_setup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio) { struct xive_q *q = &xc->queue[prio]; - unsigned int alloc_order; - struct page *pages; __be32 *qpage; - alloc_order = (xive_queue_shift > PAGE_SHIFT) ? - (xive_queue_shift - PAGE_SHIFT) : 0; - pages = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, alloc_order); - if (!pages) - return -ENOMEM; - qpage = (__be32 *)page_address(pages); - memset(qpage, 0, 1 << xive_queue_shift); + qpage = xive_queue_page_alloc(cpu, xive_queue_shift); + if (IS_ERR(qpage)) + return PTR_ERR(qpage); + return xive_native_configure_queue(get_hard_smp_processor_id(cpu), q, prio, qpage, xive_queue_shift, false); } @@ -227,8 +224,7 @@ static void xive_native_cleanup_queue(unsigned int cpu, struct xive_cpu *xc, u8 * from an IPI and iounmap isn't safe */ __xive_native_disable_queue(get_hard_smp_processor_id(cpu), q, prio); - alloc_order = (xive_queue_shift > PAGE_SHIFT) ? - (xive_queue_shift - PAGE_SHIFT) : 0; + alloc_order = xive_alloc_order(xive_queue_shift); free_pages((unsigned long)q->qpage, alloc_order); q->qpage = NULL; } @@ -531,7 +527,7 @@ u32 xive_native_default_eq_shift(void) } EXPORT_SYMBOL_GPL(xive_native_default_eq_shift); -bool xive_native_init(void) +bool __init xive_native_init(void) { struct device_node *np; struct resource r; @@ -551,7 +547,7 @@ bool xive_native_init(void) pr_devel("not found !\n"); return false; } - pr_devel("Found %s\n", np->full_name); + pr_devel("Found %pOF\n", np); /* Resource 1 is HV window */ if (of_address_to_resource(np, 1, &r)) { diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c new file mode 100644 index 000000000000..f24a70bc6855 --- /dev/null +++ b/arch/powerpc/sysdev/xive/spapr.c @@ -0,0 +1,662 @@ +/* + * Copyright 2016,2017 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "xive: " fmt + +#include <linux/types.h> +#include <linux/irq.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/of.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/cpumask.h> +#include <linux/mm.h> + +#include <asm/prom.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/irq.h> +#include <asm/errno.h> +#include <asm/xive.h> +#include <asm/xive-regs.h> +#include <asm/hvcall.h> + +#include "xive-internal.h" + +static u32 xive_queue_shift; + +struct xive_irq_bitmap { + unsigned long *bitmap; + unsigned int base; + unsigned int count; + spinlock_t lock; + struct list_head list; +}; + +static LIST_HEAD(xive_irq_bitmaps); + +static int xive_irq_bitmap_add(int base, int count) +{ + struct xive_irq_bitmap *xibm; + + xibm = kzalloc(sizeof(*xibm), GFP_ATOMIC); + if (!xibm) + return -ENOMEM; + + spin_lock_init(&xibm->lock); + xibm->base = base; + xibm->count = count; + xibm->bitmap = kzalloc(xibm->count, GFP_KERNEL); + list_add(&xibm->list, &xive_irq_bitmaps); + + pr_info("Using IRQ range [%x-%x]", xibm->base, + xibm->base + xibm->count - 1); + return 0; +} + +static int __xive_irq_bitmap_alloc(struct xive_irq_bitmap *xibm) +{ + int irq; + + irq = find_first_zero_bit(xibm->bitmap, xibm->count); + if (irq != xibm->count) { + set_bit(irq, xibm->bitmap); + irq += xibm->base; + } else { + irq = -ENOMEM; + } + + return irq; +} + +static int xive_irq_bitmap_alloc(void) +{ + struct xive_irq_bitmap *xibm; + unsigned long flags; + int irq = -ENOENT; + + list_for_each_entry(xibm, &xive_irq_bitmaps, list) { + spin_lock_irqsave(&xibm->lock, flags); + irq = __xive_irq_bitmap_alloc(xibm); + spin_unlock_irqrestore(&xibm->lock, flags); + if (irq >= 0) + break; + } + return irq; +} + +static void xive_irq_bitmap_free(int irq) +{ + unsigned long flags; + struct xive_irq_bitmap *xibm; + + list_for_each_entry(xibm, &xive_irq_bitmaps, list) { + if ((irq >= xibm->base) && (irq < xibm->base + xibm->count)) { + spin_lock_irqsave(&xibm->lock, flags); + clear_bit(irq - xibm->base, xibm->bitmap); + spin_unlock_irqrestore(&xibm->lock, flags); + break; + } + } +} + +static long plpar_int_get_source_info(unsigned long flags, + unsigned long lisn, + unsigned long *src_flags, + unsigned long *eoi_page, + unsigned long *trig_page, + unsigned long *esb_shift) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + long rc; + + rc = plpar_hcall(H_INT_GET_SOURCE_INFO, retbuf, flags, lisn); + if (rc) { + pr_err("H_INT_GET_SOURCE_INFO lisn=%ld failed %ld\n", lisn, rc); + return rc; + } + + *src_flags = retbuf[0]; + *eoi_page = retbuf[1]; + *trig_page = retbuf[2]; + *esb_shift = retbuf[3]; + + pr_devel("H_INT_GET_SOURCE_INFO flags=%lx eoi=%lx trig=%lx shift=%lx\n", + retbuf[0], retbuf[1], retbuf[2], retbuf[3]); + + return 0; +} + +#define XIVE_SRC_SET_EISN (1ull << (63 - 62)) +#define XIVE_SRC_MASK (1ull << (63 - 63)) /* unused */ + +static long plpar_int_set_source_config(unsigned long flags, + unsigned long lisn, + unsigned long target, + unsigned long prio, + unsigned long sw_irq) +{ + long rc; + + + pr_devel("H_INT_SET_SOURCE_CONFIG flags=%lx lisn=%lx target=%lx prio=%lx sw_irq=%lx\n", + flags, lisn, target, prio, sw_irq); + + + rc = plpar_hcall_norets(H_INT_SET_SOURCE_CONFIG, flags, lisn, + target, prio, sw_irq); + if (rc) { + pr_err("H_INT_SET_SOURCE_CONFIG lisn=%ld target=%lx prio=%lx failed %ld\n", + lisn, target, prio, rc); + return rc; + } + + return 0; +} + +static long plpar_int_get_queue_info(unsigned long flags, + unsigned long target, + unsigned long priority, + unsigned long *esn_page, + unsigned long *esn_size) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + long rc; + + rc = plpar_hcall(H_INT_GET_QUEUE_INFO, retbuf, flags, target, priority); + if (rc) { + pr_err("H_INT_GET_QUEUE_INFO cpu=%ld prio=%ld failed %ld\n", + target, priority, rc); + return rc; + } + + *esn_page = retbuf[0]; + *esn_size = retbuf[1]; + + pr_devel("H_INT_GET_QUEUE_INFO page=%lx size=%lx\n", + retbuf[0], retbuf[1]); + + return 0; +} + +#define XIVE_EQ_ALWAYS_NOTIFY (1ull << (63 - 63)) + +static long plpar_int_set_queue_config(unsigned long flags, + unsigned long target, + unsigned long priority, + unsigned long qpage, + unsigned long qsize) +{ + long rc; + + pr_devel("H_INT_SET_QUEUE_CONFIG flags=%lx target=%lx priority=%lx qpage=%lx qsize=%lx\n", + flags, target, priority, qpage, qsize); + + rc = plpar_hcall_norets(H_INT_SET_QUEUE_CONFIG, flags, target, + priority, qpage, qsize); + if (rc) { + pr_err("H_INT_SET_QUEUE_CONFIG cpu=%ld prio=%ld qpage=%lx returned %ld\n", + target, priority, qpage, rc); + return rc; + } + + return 0; +} + +static long plpar_int_sync(unsigned long flags, unsigned long lisn) +{ + long rc; + + rc = plpar_hcall_norets(H_INT_SYNC, flags, lisn); + if (rc) { + pr_err("H_INT_SYNC lisn=%ld returned %ld\n", lisn, rc); + return rc; + } + + return 0; +} + +#define XIVE_ESB_FLAG_STORE (1ull << (63 - 63)) + +static long plpar_int_esb(unsigned long flags, + unsigned long lisn, + unsigned long offset, + unsigned long in_data, + unsigned long *out_data) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + long rc; + + pr_devel("H_INT_ESB flags=%lx lisn=%lx offset=%lx in=%lx\n", + flags, lisn, offset, in_data); + + rc = plpar_hcall(H_INT_ESB, retbuf, flags, lisn, offset, in_data); + if (rc) { + pr_err("H_INT_ESB lisn=%ld offset=%ld returned %ld\n", + lisn, offset, rc); + return rc; + } + + *out_data = retbuf[0]; + + return 0; +} + +static u64 xive_spapr_esb_rw(u32 lisn, u32 offset, u64 data, bool write) +{ + unsigned long read_data; + long rc; + + rc = plpar_int_esb(write ? XIVE_ESB_FLAG_STORE : 0, + lisn, offset, data, &read_data); + if (rc) + return -1; + + return write ? 0 : read_data; +} + +#define XIVE_SRC_H_INT_ESB (1ull << (63 - 60)) +#define XIVE_SRC_LSI (1ull << (63 - 61)) +#define XIVE_SRC_TRIGGER (1ull << (63 - 62)) +#define XIVE_SRC_STORE_EOI (1ull << (63 - 63)) + +static int xive_spapr_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) +{ + long rc; + unsigned long flags; + unsigned long eoi_page; + unsigned long trig_page; + unsigned long esb_shift; + + memset(data, 0, sizeof(*data)); + + rc = plpar_int_get_source_info(0, hw_irq, &flags, &eoi_page, &trig_page, + &esb_shift); + if (rc) + return -EINVAL; + + if (flags & XIVE_SRC_H_INT_ESB) + data->flags |= XIVE_IRQ_FLAG_H_INT_ESB; + if (flags & XIVE_SRC_STORE_EOI) + data->flags |= XIVE_IRQ_FLAG_STORE_EOI; + if (flags & XIVE_SRC_LSI) + data->flags |= XIVE_IRQ_FLAG_LSI; + data->eoi_page = eoi_page; + data->esb_shift = esb_shift; + data->trig_page = trig_page; + + /* + * No chip-id for the sPAPR backend. This has an impact how we + * pick a target. See xive_pick_irq_target(). + */ + data->src_chip = XIVE_INVALID_CHIP_ID; + + data->eoi_mmio = ioremap(data->eoi_page, 1u << data->esb_shift); + if (!data->eoi_mmio) { + pr_err("Failed to map EOI page for irq 0x%x\n", hw_irq); + return -ENOMEM; + } + + data->hw_irq = hw_irq; + + /* Full function page supports trigger */ + if (flags & XIVE_SRC_TRIGGER) { + data->trig_mmio = data->eoi_mmio; + return 0; + } + + data->trig_mmio = ioremap(data->trig_page, 1u << data->esb_shift); + if (!data->trig_mmio) { + pr_err("Failed to map trigger page for irq 0x%x\n", hw_irq); + return -ENOMEM; + } + return 0; +} + +static int xive_spapr_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq) +{ + long rc; + + rc = plpar_int_set_source_config(XIVE_SRC_SET_EISN, hw_irq, target, + prio, sw_irq); + + return rc == 0 ? 0 : -ENXIO; +} + +/* This can be called multiple time to change a queue configuration */ +static int xive_spapr_configure_queue(u32 target, struct xive_q *q, u8 prio, + __be32 *qpage, u32 order) +{ + s64 rc = 0; + unsigned long esn_page; + unsigned long esn_size; + u64 flags, qpage_phys; + + /* If there's an actual queue page, clean it */ + if (order) { + if (WARN_ON(!qpage)) + return -EINVAL; + qpage_phys = __pa(qpage); + } else { + qpage_phys = 0; + } + + /* Initialize the rest of the fields */ + q->msk = order ? ((1u << (order - 2)) - 1) : 0; + q->idx = 0; + q->toggle = 0; + + rc = plpar_int_get_queue_info(0, target, prio, &esn_page, &esn_size); + if (rc) { + pr_err("Error %lld getting queue info prio %d\n", rc, prio); + rc = -EIO; + goto fail; + } + + /* TODO: add support for the notification page */ + q->eoi_phys = esn_page; + + /* Default is to always notify */ + flags = XIVE_EQ_ALWAYS_NOTIFY; + + /* Configure and enable the queue in HW */ + rc = plpar_int_set_queue_config(flags, target, prio, qpage_phys, order); + if (rc) { + pr_err("Error %lld setting queue for prio %d\n", rc, prio); + rc = -EIO; + } else { + q->qpage = qpage; + } +fail: + return rc; +} + +static int xive_spapr_setup_queue(unsigned int cpu, struct xive_cpu *xc, + u8 prio) +{ + struct xive_q *q = &xc->queue[prio]; + __be32 *qpage; + + qpage = xive_queue_page_alloc(cpu, xive_queue_shift); + if (IS_ERR(qpage)) + return PTR_ERR(qpage); + + return xive_spapr_configure_queue(cpu, q, prio, qpage, + xive_queue_shift); +} + +static void xive_spapr_cleanup_queue(unsigned int cpu, struct xive_cpu *xc, + u8 prio) +{ + struct xive_q *q = &xc->queue[prio]; + unsigned int alloc_order; + long rc; + + rc = plpar_int_set_queue_config(0, cpu, prio, 0, 0); + if (rc) + pr_err("Error %ld setting queue for prio %d\n", rc, prio); + + alloc_order = xive_alloc_order(xive_queue_shift); + free_pages((unsigned long)q->qpage, alloc_order); + q->qpage = NULL; +} + +static bool xive_spapr_match(struct device_node *node) +{ + /* Ignore cascaded controllers for the moment */ + return 1; +} + +#ifdef CONFIG_SMP +static int xive_spapr_get_ipi(unsigned int cpu, struct xive_cpu *xc) +{ + int irq = xive_irq_bitmap_alloc(); + + if (irq < 0) { + pr_err("Failed to allocate IPI on CPU %d\n", cpu); + return -ENXIO; + } + + xc->hw_ipi = irq; + return 0; +} + +static void xive_spapr_put_ipi(unsigned int cpu, struct xive_cpu *xc) +{ + xive_irq_bitmap_free(xc->hw_ipi); +} +#endif /* CONFIG_SMP */ + +static void xive_spapr_shutdown(void) +{ + long rc; + + rc = plpar_hcall_norets(H_INT_RESET, 0); + if (rc) + pr_err("H_INT_RESET failed %ld\n", rc); +} + +/* + * Perform an "ack" cycle on the current thread. Grab the pending + * active priorities and update the CPPR to the most favored one. + */ +static void xive_spapr_update_pending(struct xive_cpu *xc) +{ + u8 nsr, cppr; + u16 ack; + + /* + * Perform the "Acknowledge O/S to Register" cycle. + * + * Let's speedup the access to the TIMA using the raw I/O + * accessor as we don't need the synchronisation routine of + * the higher level ones + */ + ack = be16_to_cpu(__raw_readw(xive_tima + TM_SPC_ACK_OS_REG)); + + /* Synchronize subsequent queue accesses */ + mb(); + + /* + * Grab the CPPR and the "NSR" field which indicates the source + * of the interrupt (if any) + */ + cppr = ack & 0xff; + nsr = ack >> 8; + + if (nsr & TM_QW1_NSR_EO) { + if (cppr == 0xff) + return; + /* Mark the priority pending */ + xc->pending_prio |= 1 << cppr; + + /* + * A new interrupt should never have a CPPR less favored + * than our current one. + */ + if (cppr >= xc->cppr) + pr_err("CPU %d odd ack CPPR, got %d at %d\n", + smp_processor_id(), cppr, xc->cppr); + + /* Update our idea of what the CPPR is */ + xc->cppr = cppr; + } +} + +static void xive_spapr_eoi(u32 hw_irq) +{ + /* Not used */; +} + +static void xive_spapr_setup_cpu(unsigned int cpu, struct xive_cpu *xc) +{ + /* Only some debug on the TIMA settings */ + pr_debug("(HW value: %08x %08x %08x)\n", + in_be32(xive_tima + TM_QW1_OS + TM_WORD0), + in_be32(xive_tima + TM_QW1_OS + TM_WORD1), + in_be32(xive_tima + TM_QW1_OS + TM_WORD2)); +} + +static void xive_spapr_teardown_cpu(unsigned int cpu, struct xive_cpu *xc) +{ + /* Nothing to do */; +} + +static void xive_spapr_sync_source(u32 hw_irq) +{ + /* Specs are unclear on what this is doing */ + plpar_int_sync(0, hw_irq); +} + +static const struct xive_ops xive_spapr_ops = { + .populate_irq_data = xive_spapr_populate_irq_data, + .configure_irq = xive_spapr_configure_irq, + .setup_queue = xive_spapr_setup_queue, + .cleanup_queue = xive_spapr_cleanup_queue, + .match = xive_spapr_match, + .shutdown = xive_spapr_shutdown, + .update_pending = xive_spapr_update_pending, + .eoi = xive_spapr_eoi, + .setup_cpu = xive_spapr_setup_cpu, + .teardown_cpu = xive_spapr_teardown_cpu, + .sync_source = xive_spapr_sync_source, + .esb_rw = xive_spapr_esb_rw, +#ifdef CONFIG_SMP + .get_ipi = xive_spapr_get_ipi, + .put_ipi = xive_spapr_put_ipi, +#endif /* CONFIG_SMP */ + .name = "spapr", +}; + +/* + * get max priority from "/ibm,plat-res-int-priorities" + */ +static bool xive_get_max_prio(u8 *max_prio) +{ + struct device_node *rootdn; + const __be32 *reg; + u32 len; + int prio, found; + + rootdn = of_find_node_by_path("/"); + if (!rootdn) { + pr_err("not root node found !\n"); + return false; + } + + reg = of_get_property(rootdn, "ibm,plat-res-int-priorities", &len); + if (!reg) { + pr_err("Failed to read 'ibm,plat-res-int-priorities' property\n"); + return false; + } + + if (len % (2 * sizeof(u32)) != 0) { + pr_err("invalid 'ibm,plat-res-int-priorities' property\n"); + return false; + } + + /* HW supports priorities in the range [0-7] and 0xFF is a + * wildcard priority used to mask. We scan the ranges reserved + * by the hypervisor to find the lowest priority we can use. + */ + found = 0xFF; + for (prio = 0; prio < 8; prio++) { + int reserved = 0; + int i; + + for (i = 0; i < len / (2 * sizeof(u32)); i++) { + int base = be32_to_cpu(reg[2 * i]); + int range = be32_to_cpu(reg[2 * i + 1]); + + if (prio >= base && prio < base + range) + reserved++; + } + + if (!reserved) + found = prio; + } + + if (found == 0xFF) { + pr_err("no valid priority found in 'ibm,plat-res-int-priorities'\n"); + return false; + } + + *max_prio = found; + return true; +} + +bool __init xive_spapr_init(void) +{ + struct device_node *np; + struct resource r; + void __iomem *tima; + struct property *prop; + u8 max_prio; + u32 val; + u32 len; + const __be32 *reg; + int i; + + if (xive_cmdline_disabled) + return false; + + pr_devel("%s()\n", __func__); + np = of_find_compatible_node(NULL, NULL, "ibm,power-ivpe"); + if (!np) { + pr_devel("not found !\n"); + return false; + } + pr_devel("Found %s\n", np->full_name); + + /* Resource 1 is the OS ring TIMA */ + if (of_address_to_resource(np, 1, &r)) { + pr_err("Failed to get thread mgmnt area resource\n"); + return false; + } + tima = ioremap(r.start, resource_size(&r)); + if (!tima) { + pr_err("Failed to map thread mgmnt area\n"); + return false; + } + + if (!xive_get_max_prio(&max_prio)) + return false; + + /* Feed the IRQ number allocator with the ranges given in the DT */ + reg = of_get_property(np, "ibm,xive-lisn-ranges", &len); + if (!reg) { + pr_err("Failed to read 'ibm,xive-lisn-ranges' property\n"); + return false; + } + + if (len % (2 * sizeof(u32)) != 0) { + pr_err("invalid 'ibm,xive-lisn-ranges' property\n"); + return false; + } + + for (i = 0; i < len / (2 * sizeof(u32)); i++, reg += 2) + xive_irq_bitmap_add(be32_to_cpu(reg[0]), + be32_to_cpu(reg[1])); + + /* Iterate the EQ sizes and pick one */ + of_property_for_each_u32(np, "ibm,xive-eq-sizes", prop, reg, val) { + xive_queue_shift = val; + if (val == PAGE_SHIFT) + break; + } + + /* Initialize XIVE core with our backend */ + if (!xive_core_init(&xive_spapr_ops, tima, TM_QW1_OS, max_prio)) + return false; + + pr_info("Using %dkB queues\n", 1 << (xive_queue_shift - 10)); + return true; +} diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h index d07ef2d29caf..f34abed0c05f 100644 --- a/arch/powerpc/sysdev/xive/xive-internal.h +++ b/arch/powerpc/sysdev/xive/xive-internal.h @@ -47,6 +47,7 @@ struct xive_ops { void (*update_pending)(struct xive_cpu *xc); void (*eoi)(u32 hw_irq); void (*sync_source)(u32 hw_irq); + u64 (*esb_rw)(u32 hw_irq, u32 offset, u64 data, bool write); #ifdef CONFIG_SMP int (*get_ipi)(unsigned int cpu, struct xive_cpu *xc); void (*put_ipi)(unsigned int cpu, struct xive_cpu *xc); @@ -56,6 +57,12 @@ struct xive_ops { bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset, u8 max_prio); +__be32 *xive_queue_page_alloc(unsigned int cpu, u32 queue_shift); + +static inline u32 xive_alloc_order(u32 queue_shift) +{ + return (queue_shift > PAGE_SHIFT) ? (queue_shift - PAGE_SHIFT) : 0; +} extern bool xive_cmdline_disabled; diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 0b2f771593eb..1dd88315cff4 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -5,6 +5,10 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror GCOV_PROFILE := n UBSAN_SANITIZE := n +# Disable ftrace for the entire directory +ORIG_CFLAGS := $(KBUILD_CFLAGS) +KBUILD_CFLAGS = $(subst -mno-sched-epilog,,$(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))) + ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) obj-y += xmon.o nonstdio.o spr_access.o diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 08e367e3e8c3..33351c6704b1 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -89,6 +89,7 @@ static unsigned long nidump = 16; static unsigned long ncsum = 4096; static int termch; static char tmpstr[128]; +static int tracing_enabled; static long bus_error_jmp[JMP_BUF_LEN]; static int catch_memory_errors; @@ -234,6 +235,7 @@ Commands:\n\ "\ dr dump stream of raw bytes\n\ dt dump the tracing buffers (uses printk)\n\ + dtc dump the tracing buffers for current CPU (uses printk)\n\ " #ifdef CONFIG_PPC_POWERNV " dx# dump xive on CPU #\n\ @@ -461,6 +463,9 @@ static int xmon_core(struct pt_regs *regs, int fromipi) local_irq_save(flags); hard_irq_disable(); + tracing_enabled = tracing_is_on(); + tracing_off(); + bp = in_breakpoint_table(regs->nip, &offset); if (bp != NULL) { regs->nip = bp->address + offset; @@ -981,6 +986,8 @@ cmds(struct pt_regs *excp) break; case 'x': case 'X': + if (tracing_enabled) + tracing_on(); return cmd; case EOF: printf(" <no input ...>\n"); @@ -1732,23 +1739,25 @@ static void dump_206_sprs(void) /* Actually some of these pre-date 2.06, but whatevs */ - printf("srr0 = %.16x srr1 = %.16x dsisr = %.8x\n", + printf("srr0 = %.16lx srr1 = %.16lx dsisr = %.8x\n", mfspr(SPRN_SRR0), mfspr(SPRN_SRR1), mfspr(SPRN_DSISR)); - printf("dscr = %.16x ppr = %.16x pir = %.8x\n", + printf("dscr = %.16lx ppr = %.16lx pir = %.8x\n", mfspr(SPRN_DSCR), mfspr(SPRN_PPR), mfspr(SPRN_PIR)); + printf("amr = %.16lx uamor = %.16lx\n", + mfspr(SPRN_AMR), mfspr(SPRN_UAMOR)); if (!(mfmsr() & MSR_HV)) return; - printf("sdr1 = %.16x hdar = %.16x hdsisr = %.8x\n", + printf("sdr1 = %.16lx hdar = %.16lx hdsisr = %.8x\n", mfspr(SPRN_SDR1), mfspr(SPRN_HDAR), mfspr(SPRN_HDSISR)); - printf("hsrr0 = %.16x hsrr1 = %.16x hdec = %.8x\n", + printf("hsrr0 = %.16lx hsrr1 = %.16lx hdec = %.16lx\n", mfspr(SPRN_HSRR0), mfspr(SPRN_HSRR1), mfspr(SPRN_HDEC)); - printf("lpcr = %.16x pcr = %.16x lpidr = %.8x\n", + printf("lpcr = %.16lx pcr = %.16lx lpidr = %.8x\n", mfspr(SPRN_LPCR), mfspr(SPRN_PCR), mfspr(SPRN_LPID)); - printf("hsprg0 = %.16x hsprg1 = %.16x\n", - mfspr(SPRN_HSPRG0), mfspr(SPRN_HSPRG1)); - printf("dabr = %.16x dabrx = %.16x\n", + printf("hsprg0 = %.16lx hsprg1 = %.16lx amor = %.16lx\n", + mfspr(SPRN_HSPRG0), mfspr(SPRN_HSPRG1), mfspr(SPRN_AMOR)); + printf("dabr = %.16lx dabrx = %.16lx\n", mfspr(SPRN_DABR), mfspr(SPRN_DABRX)); #endif } @@ -1761,42 +1770,65 @@ static void dump_207_sprs(void) if (!cpu_has_feature(CPU_FTR_ARCH_207S)) return; - printf("dpdes = %.16x tir = %.16x cir = %.8x\n", + printf("dpdes = %.16lx tir = %.16lx cir = %.8x\n", mfspr(SPRN_DPDES), mfspr(SPRN_TIR), mfspr(SPRN_CIR)); - printf("fscr = %.16x tar = %.16x pspb = %.8x\n", + printf("fscr = %.16lx tar = %.16lx pspb = %.8x\n", mfspr(SPRN_FSCR), mfspr(SPRN_TAR), mfspr(SPRN_PSPB)); msr = mfmsr(); if (msr & MSR_TM) { /* Only if TM has been enabled in the kernel */ - printf("tfhar = %.16x tfiar = %.16x texasr = %.16x\n", + printf("tfhar = %.16lx tfiar = %.16lx texasr = %.16lx\n", mfspr(SPRN_TFHAR), mfspr(SPRN_TFIAR), mfspr(SPRN_TEXASR)); } - printf("mmcr0 = %.16x mmcr1 = %.16x mmcr2 = %.16x\n", + printf("mmcr0 = %.16lx mmcr1 = %.16lx mmcr2 = %.16lx\n", mfspr(SPRN_MMCR0), mfspr(SPRN_MMCR1), mfspr(SPRN_MMCR2)); printf("pmc1 = %.8x pmc2 = %.8x pmc3 = %.8x pmc4 = %.8x\n", mfspr(SPRN_PMC1), mfspr(SPRN_PMC2), mfspr(SPRN_PMC3), mfspr(SPRN_PMC4)); - printf("mmcra = %.16x siar = %.16x pmc5 = %.8x\n", + printf("mmcra = %.16lx siar = %.16lx pmc5 = %.8x\n", mfspr(SPRN_MMCRA), mfspr(SPRN_SIAR), mfspr(SPRN_PMC5)); - printf("sdar = %.16x sier = %.16x pmc6 = %.8x\n", + printf("sdar = %.16lx sier = %.16lx pmc6 = %.8x\n", mfspr(SPRN_SDAR), mfspr(SPRN_SIER), mfspr(SPRN_PMC6)); - printf("ebbhr = %.16x ebbrr = %.16x bescr = %.16x\n", + printf("ebbhr = %.16lx ebbrr = %.16lx bescr = %.16lx\n", mfspr(SPRN_EBBHR), mfspr(SPRN_EBBRR), mfspr(SPRN_BESCR)); + printf("iamr = %.16lx\n", mfspr(SPRN_IAMR)); if (!(msr & MSR_HV)) return; - printf("hfscr = %.16x dhdes = %.16x rpr = %.16x\n", + printf("hfscr = %.16lx dhdes = %.16lx rpr = %.16lx\n", mfspr(SPRN_HFSCR), mfspr(SPRN_DHDES), mfspr(SPRN_RPR)); - printf("dawr = %.16x dawrx = %.16x ciabr = %.16x\n", + printf("dawr = %.16lx dawrx = %.16lx ciabr = %.16lx\n", mfspr(SPRN_DAWR), mfspr(SPRN_DAWRX), mfspr(SPRN_CIABR)); #endif } +static void dump_300_sprs(void) +{ +#ifdef CONFIG_PPC64 + bool hv = mfmsr() & MSR_HV; + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return; + + printf("pidr = %.16lx tidr = %.16lx\n", + mfspr(SPRN_PID), mfspr(SPRN_TIDR)); + printf("asdr = %.16lx psscr = %.16lx\n", + mfspr(SPRN_ASDR), hv ? mfspr(SPRN_PSSCR) + : mfspr(SPRN_PSSCR_PR)); + + if (!hv) + return; + + printf("ptcr = %.16lx\n", + mfspr(SPRN_PTCR)); +#endif +} + static void dump_one_spr(int spr, bool show_unimplemented) { unsigned long val; @@ -1850,6 +1882,7 @@ static void super_regs(void) dump_206_sprs(); dump_207_sprs(); + dump_300_sprs(); return; } @@ -2231,6 +2264,17 @@ static void xmon_rawdump (unsigned long adrs, long ndump) printf("\n"); } +static void dump_tracing(void) +{ + int c; + + c = inchar(); + if (c == 'c') + ftrace_dump(DUMP_ORIG); + else + ftrace_dump(DUMP_ALL); +} + #ifdef CONFIG_PPC64 static void dump_one_paca(int cpu) { @@ -2507,6 +2551,11 @@ dump(void) } #endif + if (c == 't') { + dump_tracing(); + return; + } + if (c == '\n') termch = c; @@ -2525,9 +2574,6 @@ dump(void) dump_log_buf(); } else if (c == 'o') { dump_opal_msglog(); - } else if (c == 't') { - ftrace_dump(DUMP_ALL); - tracing_on(); } else if (c == 'r') { scanhex(&ndump); if (ndump == 0) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 7eeb75d758c1..48af970320cb 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -222,6 +222,10 @@ config HAVE_MARCH_Z13_FEATURES def_bool n select HAVE_MARCH_ZEC12_FEATURES +config HAVE_MARCH_Z14_FEATURES + def_bool n + select HAVE_MARCH_Z13_FEATURES + choice prompt "Processor type" default MARCH_Z196 @@ -282,6 +286,14 @@ config MARCH_Z13 2964 series). The kernel will be slightly faster but will not work on older machines. +config MARCH_Z14 + bool "IBM z14" + select HAVE_MARCH_Z14_FEATURES + help + Select this to enable optimizations for IBM z14 (3906 series). + The kernel will be slightly faster but will not work on older + machines. + endchoice config MARCH_Z900_TUNE @@ -305,6 +317,9 @@ config MARCH_ZEC12_TUNE config MARCH_Z13_TUNE def_bool TUNE_Z13 || MARCH_Z13 && TUNE_DEFAULT +config MARCH_Z14_TUNE + def_bool TUNE_Z14 || MARCH_Z14 && TUNE_DEFAULT + choice prompt "Tune code generation" default TUNE_DEFAULT @@ -343,6 +358,9 @@ config TUNE_ZEC12 config TUNE_Z13 bool "IBM z13" +config TUNE_Z14 + bool "IBM z14" + endchoice config 64BIT diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 54e00526b8df..dac821cfcd43 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -31,7 +31,8 @@ mflags-$(CONFIG_MARCH_Z9_109) := -march=z9-109 mflags-$(CONFIG_MARCH_Z10) := -march=z10 mflags-$(CONFIG_MARCH_Z196) := -march=z196 mflags-$(CONFIG_MARCH_ZEC12) := -march=zEC12 -mflags-$(CONFIG_MARCH_Z13) := -march=z13 +mflags-$(CONFIG_MARCH_Z13) := -march=z13 +mflags-$(CONFIG_MARCH_Z14) := -march=z14 export CC_FLAGS_MARCH := $(mflags-y) @@ -44,7 +45,8 @@ cflags-$(CONFIG_MARCH_Z9_109_TUNE) += -mtune=z9-109 cflags-$(CONFIG_MARCH_Z10_TUNE) += -mtune=z10 cflags-$(CONFIG_MARCH_Z196_TUNE) += -mtune=z196 cflags-$(CONFIG_MARCH_ZEC12_TUNE) += -mtune=zEC12 -cflags-$(CONFIG_MARCH_Z13_TUNE) += -mtune=z13 +cflags-$(CONFIG_MARCH_Z13_TUNE) += -mtune=z13 +cflags-$(CONFIG_MARCH_Z14_TUNE) += -mtune=z14 cflags-y += -Wa,-I$(srctree)/arch/$(ARCH)/include diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index b3c88479feba..6e2c9f7e47fa 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -16,4 +16,5 @@ generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h generic-y += preempt.h generic-y += trace_clock.h +generic-y += unaligned.h generic-y += word-at-a-time.h diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index b9300f8aee10..07a82bc933a7 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -8,11 +8,12 @@ #include <linux/sched/task_stack.h> #include <linux/thread_info.h> -#define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p(typeof(0?(t)0:0ULL), u64)) +#define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p( \ + typeof(0?(__force t)0:0ULL), u64)) #define __SC_DELOUSE(t,v) ({ \ BUILD_BUG_ON(sizeof(t) > 4 && !__TYPE_IS_PTR(t)); \ - (t)(__TYPE_IS_PTR(t) ? ((v) & 0x7fffffff) : (v)); \ + (__force t)(__TYPE_IS_PTR(t) ? ((v) & 0x7fffffff) : (v)); \ }) #define PSW32_MASK_PER 0x40000000UL diff --git a/arch/s390/include/asm/cpcmd.h b/arch/s390/include/asm/cpcmd.h index 3dfadb5d648f..ca2b0624ad46 100644 --- a/arch/s390/include/asm/cpcmd.h +++ b/arch/s390/include/asm/cpcmd.h @@ -10,9 +10,8 @@ /* * the lowlevel function for cpcmd - * the caller of __cpcmd has to ensure that the response buffer is below 2 GB */ -extern int __cpcmd(const char *cmd, char *response, int rlen, int *response_code); +int __cpcmd(const char *cmd, char *response, int rlen, int *response_code); /* * cpcmd is the in-kernel interface for issuing CP commands @@ -25,8 +24,8 @@ extern int __cpcmd(const char *cmd, char *response, int rlen, int *response_code * response_code: return pointer for VM's error code * return value: the size of the response. The caller can check if the buffer * was large enough by comparing the return value and rlen - * NOTE: If the response buffer is not below 2 GB, cpcmd can sleep + * NOTE: If the response buffer is not in real storage, cpcmd can sleep */ -extern int cpcmd(const char *cmd, char *response, int rlen, int *response_code); +int cpcmd(const char *cmd, char *response, int rlen, int *response_code); #endif /* _ASM_S390_CPCMD_H */ diff --git a/arch/s390/include/asm/ebcdic.h b/arch/s390/include/asm/ebcdic.h index c5befc5a3bf5..b71735eab23f 100644 --- a/arch/s390/include/asm/ebcdic.h +++ b/arch/s390/include/asm/ebcdic.h @@ -9,9 +9,7 @@ #ifndef _EBCDIC_H #define _EBCDIC_H -#ifndef _S390_TYPES_H -#include <types.h> -#endif +#include <linux/types.h> extern __u8 _ascebc_500[256]; /* ASCII -> EBCDIC 500 conversion table */ extern __u8 _ebcasc_500[256]; /* EBCDIC 500 -> ASCII conversion table */ diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index c92ed0170be2..65998a1f5d43 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -191,7 +191,7 @@ struct arch_elf_state { } while (0) #define CORE_DUMP_USE_REGSET -#define ELF_EXEC_PAGESIZE 4096 +#define ELF_EXEC_PAGESIZE PAGE_SIZE /* * This is the base location for PIE (ET_DYN with INTERP) loads. On diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h index a4811aa0304d..8f8eec9e1198 100644 --- a/arch/s390/include/asm/futex.h +++ b/arch/s390/include/asm/futex.h @@ -21,17 +21,12 @@ : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ "m" (*uaddr) : "cc"); -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, newval, ret; load_kernel_asce(); - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; pagefault_disable(); switch (op) { @@ -60,17 +55,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) } pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index edb5161df7e2..6810bd757312 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -81,7 +81,7 @@ struct ipl_parameter_block { struct ipl_block_fcp fcp; struct ipl_block_ccw ccw; } ipl_info; -} __attribute__((packed,aligned(4096))); +} __packed __aligned(PAGE_SIZE); /* * IPL validity flags diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 8a5b082797f8..a6870ea6ea8b 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -95,46 +95,46 @@ struct lowcore { __u64 int_clock; /* 0x0310 */ __u64 mcck_clock; /* 0x0318 */ __u64 clock_comparator; /* 0x0320 */ + __u64 boot_clock[2]; /* 0x0328 */ /* Current process. */ - __u64 current_task; /* 0x0328 */ - __u8 pad_0x318[0x320-0x318]; /* 0x0330 */ - __u64 kernel_stack; /* 0x0338 */ + __u64 current_task; /* 0x0338 */ + __u64 kernel_stack; /* 0x0340 */ /* Interrupt, panic and restart stack. */ - __u64 async_stack; /* 0x0340 */ - __u64 panic_stack; /* 0x0348 */ - __u64 restart_stack; /* 0x0350 */ + __u64 async_stack; /* 0x0348 */ + __u64 panic_stack; /* 0x0350 */ + __u64 restart_stack; /* 0x0358 */ /* Restart function and parameter. */ - __u64 restart_fn; /* 0x0358 */ - __u64 restart_data; /* 0x0360 */ - __u64 restart_source; /* 0x0368 */ + __u64 restart_fn; /* 0x0360 */ + __u64 restart_data; /* 0x0368 */ + __u64 restart_source; /* 0x0370 */ /* Address space pointer. */ - __u64 kernel_asce; /* 0x0370 */ - __u64 user_asce; /* 0x0378 */ + __u64 kernel_asce; /* 0x0378 */ + __u64 user_asce; /* 0x0380 */ /* * The lpp and current_pid fields form a * 64-bit value that is set as program * parameter with the LPP instruction. */ - __u32 lpp; /* 0x0380 */ - __u32 current_pid; /* 0x0384 */ + __u32 lpp; /* 0x0388 */ + __u32 current_pid; /* 0x038c */ /* SMP info area */ - __u32 cpu_nr; /* 0x0388 */ - __u32 softirq_pending; /* 0x038c */ - __u64 percpu_offset; /* 0x0390 */ - __u64 vdso_per_cpu_data; /* 0x0398 */ - __u64 machine_flags; /* 0x03a0 */ - __u32 preempt_count; /* 0x03a8 */ - __u8 pad_0x03ac[0x03b0-0x03ac]; /* 0x03ac */ - __u64 gmap; /* 0x03b0 */ - __u32 spinlock_lockval; /* 0x03b8 */ - __u32 fpu_flags; /* 0x03bc */ - __u8 pad_0x03c0[0x0400-0x03c0]; /* 0x03c0 */ + __u32 cpu_nr; /* 0x0390 */ + __u32 softirq_pending; /* 0x0394 */ + __u64 percpu_offset; /* 0x0398 */ + __u64 vdso_per_cpu_data; /* 0x03a0 */ + __u64 machine_flags; /* 0x03a8 */ + __u32 preempt_count; /* 0x03b0 */ + __u8 pad_0x03b4[0x03b8-0x03b4]; /* 0x03b4 */ + __u64 gmap; /* 0x03b8 */ + __u32 spinlock_lockval; /* 0x03c0 */ + __u32 fpu_flags; /* 0x03c4 */ + __u8 pad_0x03c8[0x0400-0x03c8]; /* 0x03c8 */ /* Per cpu primary space access list */ __u32 paste[16]; /* 0x0400 */ diff --git a/arch/s390/include/asm/mman.h b/arch/s390/include/asm/mman.h deleted file mode 100644 index b79813d9cf68..000000000000 --- a/arch/s390/include/asm/mman.h +++ /dev/null @@ -1,11 +0,0 @@ -/* - * S390 version - * - * Derived from "include/asm-i386/mman.h" - */ -#ifndef __S390_MMAN_H__ -#define __S390_MMAN_H__ - -#include <uapi/asm/mman.h> - -#endif /* __S390_MMAN_H__ */ diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 24bc41622a98..72e9ca83a668 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -12,6 +12,7 @@ #include <linux/mm_types.h> #include <asm/tlbflush.h> #include <asm/ctl_reg.h> +#include <asm-generic/mm_hooks.h> static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) @@ -33,7 +34,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.use_cmma = 0; #endif switch (mm->context.asce_limit) { - case 1UL << 42: + case _REGION2_SIZE: /* * forked 3-level task, fall through to set new asce with new * mm->pgd @@ -49,12 +50,12 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION1; break; - case 1UL << 53: + case _REGION1_SIZE: /* forked 4-level task, set new asce with new mm->pgd */ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION2; break; - case 1UL << 31: + case _REGION3_SIZE: /* forked 2-level compat task, set new asce with new mm->pgd */ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; @@ -138,30 +139,4 @@ static inline void activate_mm(struct mm_struct *prev, set_user_asce(next); } -static inline void arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) -{ -} - -static inline void arch_exit_mmap(struct mm_struct *mm) -{ -} - -static inline void arch_unmap(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ -} - -static inline void arch_bprm_mm_init(struct mm_struct *mm, - struct vm_area_struct *vma) -{ -} - -static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, - bool write, bool execute, bool foreign) -{ - /* by default, allow everything */ - return true; -} #endif /* __S390_MMU_CONTEXT_H */ diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index 9d91cf3e427f..c8e211b9a002 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -72,7 +72,7 @@ union mci { u64 ar : 1; /* 33 access register validity */ u64 da : 1; /* 34 delayed access exception */ u64 : 1; /* 35 */ - u64 gs : 1; /* 36 guarded storage registers */ + u64 gs : 1; /* 36 guarded storage registers validity */ u64 : 5; /* 37-41 */ u64 pr : 1; /* 42 tod programmable register validity */ u64 fc : 1; /* 43 fp control register validity */ diff --git a/arch/s390/include/asm/page-states.h b/arch/s390/include/asm/page-states.h index 42267a2fe29e..ca21b28a7b17 100644 --- a/arch/s390/include/asm/page-states.h +++ b/arch/s390/include/asm/page-states.h @@ -13,6 +13,7 @@ #define ESSA_SET_POT_VOLATILE 4 #define ESSA_SET_STABLE_RESIDENT 5 #define ESSA_SET_STABLE_IF_RESIDENT 6 +#define ESSA_SET_STABLE_NODAT 7 #define ESSA_MAX ESSA_SET_STABLE_IF_RESIDENT diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 624deaa44230..5d5c2b3500a4 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -10,10 +10,14 @@ #include <linux/const.h> #include <asm/types.h> +#define _PAGE_SHIFT 12 +#define _PAGE_SIZE (_AC(1, UL) << _PAGE_SHIFT) +#define _PAGE_MASK (~(_PAGE_SIZE - 1)) + /* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT 12 -#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) +#define PAGE_SHIFT _PAGE_SHIFT +#define PAGE_SIZE _PAGE_SIZE +#define PAGE_MASK _PAGE_MASK #define PAGE_DEFAULT_ACC 0 #define PAGE_DEFAULT_KEY (PAGE_DEFAULT_ACC << 4) @@ -133,6 +137,9 @@ static inline int page_reset_referenced(unsigned long addr) struct page; void arch_free_page(struct page *page, int order); void arch_alloc_page(struct page *page, int order); +void arch_set_page_dat(struct page *page, int order); +void arch_set_page_nodat(struct page *page, int order); +int arch_test_page_nodat(struct page *page); void arch_set_page_states(int make_stable); static inline int devmem_is_allowed(unsigned long pfn) @@ -145,16 +152,26 @@ static inline int devmem_is_allowed(unsigned long pfn) #endif /* !__ASSEMBLY__ */ -#define __PAGE_OFFSET 0x0UL -#define PAGE_OFFSET 0x0UL -#define __pa(x) (unsigned long)(x) -#define __va(x) (void *)(unsigned long)(x) -#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define __PAGE_OFFSET 0x0UL +#define PAGE_OFFSET 0x0UL + +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)(unsigned long)(x)) + +#define virt_to_pfn(kaddr) (__pa(kaddr) >> PAGE_SHIFT) #define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT) + +#define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) #define page_to_virt(page) pfn_to_virt(page_to_pfn(page)) +#define phys_to_pfn(kaddr) ((kaddr) >> PAGE_SHIFT) +#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT) + +#define phys_to_page(kaddr) pfn_to_page(phys_to_pfn(kaddr)) +#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) + +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) + #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index bb0ff1bb0c4a..a0d9167519b1 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -15,6 +15,8 @@ #include <linux/gfp.h> #include <linux/mm.h> +#define CRST_ALLOC_ORDER 2 + unsigned long *crst_table_alloc(struct mm_struct *); void crst_table_free(struct mm_struct *, unsigned long *); @@ -42,16 +44,16 @@ static inline void clear_table(unsigned long *s, unsigned long val, size_t n) static inline void crst_table_init(unsigned long *crst, unsigned long entry) { - clear_table(crst, entry, sizeof(unsigned long)*2048); + clear_table(crst, entry, _CRST_TABLE_SIZE); } static inline unsigned long pgd_entry_type(struct mm_struct *mm) { - if (mm->context.asce_limit <= (1UL << 31)) + if (mm->context.asce_limit <= _REGION3_SIZE) return _SEGMENT_ENTRY_EMPTY; - if (mm->context.asce_limit <= (1UL << 42)) + if (mm->context.asce_limit <= _REGION2_SIZE) return _REGION3_ENTRY_EMPTY; - if (mm->context.asce_limit <= (1UL << 53)) + if (mm->context.asce_limit <= _REGION1_SIZE) return _REGION2_ENTRY_EMPTY; return _REGION1_ENTRY_EMPTY; } @@ -119,7 +121,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) if (!table) return NULL; - if (mm->context.asce_limit == (1UL << 31)) { + if (mm->context.asce_limit == _REGION3_SIZE) { /* Forking a compat process with 2 page table levels */ if (!pgtable_pmd_page_ctor(virt_to_page(table))) { crst_table_free(mm, table); @@ -131,7 +133,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - if (mm->context.asce_limit == (1UL << 31)) + if (mm->context.asce_limit == _REGION3_SIZE) pgtable_pmd_page_dtor(virt_to_page(pgd)); crst_table_free(mm, (unsigned long *) pgd); } @@ -158,4 +160,8 @@ static inline void pmd_populate(struct mm_struct *mm, extern void rcu_table_freelist_finish(void); +void vmem_map_init(void); +void *vmem_crst_alloc(unsigned long val); +pte_t *vmem_pte_alloc(void); + #endif /* _S390_PGALLOC_H */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 57057fb1cc07..dce708e061ea 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -11,19 +11,6 @@ #ifndef _ASM_S390_PGTABLE_H #define _ASM_S390_PGTABLE_H -/* - * The Linux memory management assumes a three-level page table setup. - * For s390 64 bit we use up to four of the five levels the hardware - * provides (region first tables are not used). - * - * The "pgd_xxx()" functions are trivial for a folded two-level - * setup: the pgd is never bad, and a pmd always exists (as it's folded - * into the pgd entry) - * - * This file contains the functions and defines necessary to modify and use - * the S390 page table tree. - */ -#ifndef __ASSEMBLY__ #include <linux/sched.h> #include <linux/mm_types.h> #include <linux/page-flags.h> @@ -34,9 +21,6 @@ extern pgd_t swapper_pg_dir[]; extern void paging_init(void); -extern void vmem_map_init(void); -pmd_t *vmem_pmd_alloc(void); -pte_t *vmem_pte_alloc(void); enum { PG_DIRECT_MAP_4K = 0, @@ -77,38 +61,6 @@ extern unsigned long zero_page_mask; #define __HAVE_COLOR_ZERO_PAGE /* TODO: s390 cannot support io_remap_pfn_range... */ -#endif /* !__ASSEMBLY__ */ - -/* - * PMD_SHIFT determines the size of the area a second-level page - * table can map - * PGDIR_SHIFT determines what a third-level page table entry can map - */ -#define PMD_SHIFT 20 -#define PUD_SHIFT 31 -#define P4D_SHIFT 42 -#define PGDIR_SHIFT 53 - -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) -#define PUD_SIZE (1UL << PUD_SHIFT) -#define PUD_MASK (~(PUD_SIZE-1)) -#define P4D_SIZE (1UL << P4D_SHIFT) -#define P4D_MASK (~(P4D_SIZE-1)) -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -/* - * entries per page directory level: the S390 is two-level, so - * we don't really have any PMD directory physically. - * for S390 segment-table entries are combined to one PGD - * that leads to 1024 pte per pgd - */ -#define PTRS_PER_PTE 256 -#define PTRS_PER_PMD 2048 -#define PTRS_PER_PUD 2048 -#define PTRS_PER_P4D 2048 -#define PTRS_PER_PGD 2048 #define FIRST_USER_ADDRESS 0UL @@ -123,7 +75,6 @@ extern unsigned long zero_page_mask; #define pgd_ERROR(e) \ printk("%s:%d: bad pgd %p.\n", __FILE__, __LINE__, (void *) pgd_val(e)) -#ifndef __ASSEMBLY__ /* * The vmalloc and module area will always be on the topmost area of the * kernel mapping. We reserve 128GB (64bit) for vmalloc and modules. @@ -269,7 +220,7 @@ static inline int is_module_addr(void *addr) */ /* Bits in the segment/region table address-space-control-element */ -#define _ASCE_ORIGIN ~0xfffUL/* segment table origin */ +#define _ASCE_ORIGIN ~0xfffUL/* region/segment table origin */ #define _ASCE_PRIVATE_SPACE 0x100 /* private space control */ #define _ASCE_ALT_EVENT 0x80 /* storage alteration event control */ #define _ASCE_SPACE_SWITCH 0x40 /* space switch event */ @@ -320,9 +271,9 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL #define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff0ff33UL #define _SEGMENT_ENTRY_ORIGIN_LARGE ~0xfffffUL /* large page address */ -#define _SEGMENT_ENTRY_ORIGIN ~0x7ffUL/* segment table origin */ -#define _SEGMENT_ENTRY_PROTECT 0x200 /* page protection bit */ -#define _SEGMENT_ENTRY_NOEXEC 0x100 /* region no-execute bit */ +#define _SEGMENT_ENTRY_ORIGIN ~0x7ffUL/* page table origin */ +#define _SEGMENT_ENTRY_PROTECT 0x200 /* segment protection bit */ +#define _SEGMENT_ENTRY_NOEXEC 0x100 /* segment no-execute bit */ #define _SEGMENT_ENTRY_INVALID 0x20 /* invalid segment table entry */ #define _SEGMENT_ENTRY (0) @@ -340,6 +291,54 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_SOFT_DIRTY 0x0000 /* SW segment soft dirty bit */ #endif +#define _CRST_ENTRIES 2048 /* number of region/segment table entries */ +#define _PAGE_ENTRIES 256 /* number of page table entries */ + +#define _CRST_TABLE_SIZE (_CRST_ENTRIES * 8) +#define _PAGE_TABLE_SIZE (_PAGE_ENTRIES * 8) + +#define _REGION1_SHIFT 53 +#define _REGION2_SHIFT 42 +#define _REGION3_SHIFT 31 +#define _SEGMENT_SHIFT 20 + +#define _REGION1_INDEX (0x7ffUL << _REGION1_SHIFT) +#define _REGION2_INDEX (0x7ffUL << _REGION2_SHIFT) +#define _REGION3_INDEX (0x7ffUL << _REGION3_SHIFT) +#define _SEGMENT_INDEX (0x7ffUL << _SEGMENT_SHIFT) +#define _PAGE_INDEX (0xffUL << _PAGE_SHIFT) + +#define _REGION1_SIZE (1UL << _REGION1_SHIFT) +#define _REGION2_SIZE (1UL << _REGION2_SHIFT) +#define _REGION3_SIZE (1UL << _REGION3_SHIFT) +#define _SEGMENT_SIZE (1UL << _SEGMENT_SHIFT) + +#define _REGION1_MASK (~(_REGION1_SIZE - 1)) +#define _REGION2_MASK (~(_REGION2_SIZE - 1)) +#define _REGION3_MASK (~(_REGION3_SIZE - 1)) +#define _SEGMENT_MASK (~(_SEGMENT_SIZE - 1)) + +#define PMD_SHIFT _SEGMENT_SHIFT +#define PUD_SHIFT _REGION3_SHIFT +#define P4D_SHIFT _REGION2_SHIFT +#define PGDIR_SHIFT _REGION1_SHIFT + +#define PMD_SIZE _SEGMENT_SIZE +#define PUD_SIZE _REGION3_SIZE +#define P4D_SIZE _REGION2_SIZE +#define PGDIR_SIZE _REGION1_SIZE + +#define PMD_MASK _SEGMENT_MASK +#define PUD_MASK _REGION3_MASK +#define P4D_MASK _REGION2_MASK +#define PGDIR_MASK _REGION1_MASK + +#define PTRS_PER_PTE _PAGE_ENTRIES +#define PTRS_PER_PMD _CRST_ENTRIES +#define PTRS_PER_PUD _CRST_ENTRIES +#define PTRS_PER_P4D _CRST_ENTRIES +#define PTRS_PER_PGD _CRST_ENTRIES + /* * Segment table and region3 table entry encoding * (R = read-only, I = invalid, y = young bit): @@ -376,6 +375,7 @@ static inline int is_module_addr(void *addr) /* Guest Page State used for virtualization */ #define _PGSTE_GPS_ZERO 0x0000000080000000UL +#define _PGSTE_GPS_NODAT 0x0000000040000000UL #define _PGSTE_GPS_USAGE_MASK 0x0000000003000000UL #define _PGSTE_GPS_USAGE_STABLE 0x0000000000000000UL #define _PGSTE_GPS_USAGE_UNUSED 0x0000000001000000UL @@ -505,7 +505,7 @@ static inline int mm_alloc_pgste(struct mm_struct *mm) * In the case that a guest uses storage keys * faults should no longer be backed by zero pages */ -#define mm_forbids_zeropage mm_use_skey +#define mm_forbids_zeropage mm_has_pgste static inline int mm_use_skey(struct mm_struct *mm) { #ifdef CONFIG_PGSTE @@ -952,15 +952,30 @@ static inline pte_t pte_mkhuge(pte_t pte) #define IPTE_GLOBAL 0 #define IPTE_LOCAL 1 -static inline void __ptep_ipte(unsigned long address, pte_t *ptep, int local) +#define IPTE_NODAT 0x400 +#define IPTE_GUEST_ASCE 0x800 + +static inline void __ptep_ipte(unsigned long address, pte_t *ptep, + unsigned long opt, unsigned long asce, + int local) { unsigned long pto = (unsigned long) ptep; - /* Invalidation + TLB flush for the pte */ + if (__builtin_constant_p(opt) && opt == 0) { + /* Invalidation + TLB flush for the pte */ + asm volatile( + " .insn rrf,0xb2210000,%[r1],%[r2],0,%[m4]" + : "+m" (*ptep) : [r1] "a" (pto), [r2] "a" (address), + [m4] "i" (local)); + return; + } + + /* Invalidate ptes with options + TLB flush of the ptes */ + opt = opt | (asce & _ASCE_ORIGIN); asm volatile( - " .insn rrf,0xb2210000,%[r1],%[r2],0,%[m4]" - : "+m" (*ptep) : [r1] "a" (pto), [r2] "a" (address), - [m4] "i" (local)); + " .insn rrf,0xb2210000,%[r1],%[r2],%[r3],%[m4]" + : [r2] "+a" (address), [r3] "+a" (opt) + : [r1] "a" (pto), [m4] "i" (local) : "memory"); } static inline void __ptep_ipte_range(unsigned long address, int nr, @@ -1341,31 +1356,61 @@ static inline void __pmdp_csp(pmd_t *pmdp) #define IDTE_GLOBAL 0 #define IDTE_LOCAL 1 -static inline void __pmdp_idte(unsigned long address, pmd_t *pmdp, int local) +#define IDTE_PTOA 0x0800 +#define IDTE_NODAT 0x1000 +#define IDTE_GUEST_ASCE 0x2000 + +static inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp, + unsigned long opt, unsigned long asce, + int local) { unsigned long sto; - sto = (unsigned long) pmdp - pmd_index(address) * sizeof(pmd_t); - asm volatile( - " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" - : "+m" (*pmdp) - : [r1] "a" (sto), [r2] "a" ((address & HPAGE_MASK)), - [m4] "i" (local) - : "cc" ); + sto = (unsigned long) pmdp - pmd_index(addr) * sizeof(pmd_t); + if (__builtin_constant_p(opt) && opt == 0) { + /* flush without guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" + : "+m" (*pmdp) + : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK)), + [m4] "i" (local) + : "cc" ); + } else { + /* flush with guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],%[r3],%[m4]" + : "+m" (*pmdp) + : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK) | opt), + [r3] "a" (asce), [m4] "i" (local) + : "cc" ); + } } -static inline void __pudp_idte(unsigned long address, pud_t *pudp, int local) +static inline void __pudp_idte(unsigned long addr, pud_t *pudp, + unsigned long opt, unsigned long asce, + int local) { unsigned long r3o; - r3o = (unsigned long) pudp - pud_index(address) * sizeof(pud_t); + r3o = (unsigned long) pudp - pud_index(addr) * sizeof(pud_t); r3o |= _ASCE_TYPE_REGION3; - asm volatile( - " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" - : "+m" (*pudp) - : [r1] "a" (r3o), [r2] "a" ((address & PUD_MASK)), - [m4] "i" (local) - : "cc"); + if (__builtin_constant_p(opt) && opt == 0) { + /* flush without guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" + : "+m" (*pudp) + : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK)), + [m4] "i" (local) + : "cc"); + } else { + /* flush with guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],%[r3],%[m4]" + : "+m" (*pudp) + : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK) | opt), + [r3] "a" (asce), [m4] "i" (local) + : "cc" ); + } } pmd_t pmdp_xchg_direct(struct mm_struct *, unsigned long, pmd_t *, pmd_t); @@ -1548,8 +1593,6 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#endif /* !__ASSEMBLY__ */ - #define kern_addr_valid(addr) (1) extern int vmem_add_mapping(unsigned long start, unsigned long size); diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h index 998b61cd0e56..eaee69e7c42a 100644 --- a/arch/s390/include/asm/qdio.h +++ b/arch/s390/include/asm/qdio.h @@ -80,7 +80,7 @@ struct qdr { u32 qkey : 4; u32 : 28; struct qdesfmt0 qdf0[126]; -} __attribute__ ((packed, aligned(4096))); +} __packed __aligned(PAGE_SIZE); #define QIB_AC_OUTBOUND_PCI_SUPPORTED 0x40 #define QIB_RFLAGS_ENABLE_QEBSM 0x80 diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index cd78155b1829..490e035b3716 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -29,8 +29,10 @@ #define MACHINE_FLAG_TE _BITUL(11) #define MACHINE_FLAG_TLB_LC _BITUL(12) #define MACHINE_FLAG_VX _BITUL(13) -#define MACHINE_FLAG_NX _BITUL(14) -#define MACHINE_FLAG_GS _BITUL(15) +#define MACHINE_FLAG_TLB_GUEST _BITUL(14) +#define MACHINE_FLAG_NX _BITUL(15) +#define MACHINE_FLAG_GS _BITUL(16) +#define MACHINE_FLAG_SCC _BITUL(17) #define LPP_MAGIC _BITUL(31) #define LPP_PFAULT_PID_MASK _AC(0xffffffff, UL) @@ -68,8 +70,10 @@ extern void detect_memory_memblock(void); #define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE) #define MACHINE_HAS_TLB_LC (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_LC) #define MACHINE_HAS_VX (S390_lowcore.machine_flags & MACHINE_FLAG_VX) +#define MACHINE_HAS_TLB_GUEST (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_GUEST) #define MACHINE_HAS_NX (S390_lowcore.machine_flags & MACHINE_FLAG_NX) #define MACHINE_HAS_GS (S390_lowcore.machine_flags & MACHINE_FLAG_GS) +#define MACHINE_HAS_SCC (S390_lowcore.machine_flags & MACHINE_FLAG_SCC) /* * Console mode. Override with conmode= @@ -104,9 +108,16 @@ extern void pfault_fini(void); #define pfault_fini() do { } while (0) #endif /* CONFIG_PFAULT */ +#ifdef CONFIG_VMCP +void vmcp_cma_reserve(void); +#else +static inline void vmcp_cma_reserve(void) { } +#endif + void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault); -extern void cmma_init(void); +void cmma_init(void); +void cmma_init_nodat(void); extern void (*_machine_restart)(char *command); extern void (*_machine_halt)(void); diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index f7838ecd83c6..8182b521c42f 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -92,17 +92,11 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp) { typecheck(int, lp->lock); asm volatile( - "st %1,%0\n" - : "+Q" (lp->lock) - : "d" (0) - : "cc", "memory"); -} - -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - while (arch_spin_is_locked(lock)) - arch_spin_relax(lock); - smp_acquire__after_ctrl_dep(); +#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES + " .long 0xb2fa0070\n" /* NIAI 7 */ +#endif + " st %1,%0\n" + : "=Q" (lp->lock) : "d" (0) : "cc", "memory"); } /* diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 118535123f34..93f2eb3f277c 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -15,6 +15,8 @@ /* The value of the TOD clock for 1.1.1970. */ #define TOD_UNIX_EPOCH 0x7d91048bca000000ULL +extern u64 clock_comparator_max; + /* Inline functions for clock register access. */ static inline int set_tod_clock(__u64 time) { @@ -126,7 +128,7 @@ static inline unsigned long long local_tick_disable(void) unsigned long long old; old = S390_lowcore.clock_comparator; - S390_lowcore.clock_comparator = -1ULL; + S390_lowcore.clock_comparator = clock_comparator_max; set_clock_comparator(S390_lowcore.clock_comparator); return old; } @@ -174,24 +176,24 @@ static inline cycles_t get_cycles(void) return (cycles_t) get_tod_clock() >> 2; } -int get_phys_clock(unsigned long long *clock); +int get_phys_clock(unsigned long *clock); void init_cpu_timer(void); unsigned long long monotonic_clock(void); -extern u64 sched_clock_base_cc; +extern unsigned char tod_clock_base[16] __aligned(8); /** * get_clock_monotonic - returns current time in clock rate units * * The caller must ensure that preemption is disabled. - * The clock and sched_clock_base get changed via stop_machine. + * The clock and tod_clock_base get changed via stop_machine. * Therefore preemption must be disabled when calling this * function, otherwise the returned value is not guaranteed to * be monotonic. */ static inline unsigned long long get_tod_clock_monotonic(void) { - return get_tod_clock() - sched_clock_base_cc; + return get_tod_clock() - *(unsigned long long *) &tod_clock_base[1]; } /** @@ -218,4 +220,32 @@ static inline unsigned long long tod_to_ns(unsigned long long todval) return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9); } +/** + * tod_after - compare two 64 bit TOD values + * @a: first 64 bit TOD timestamp + * @b: second 64 bit TOD timestamp + * + * Returns: true if a is later than b + */ +static inline int tod_after(unsigned long long a, unsigned long long b) +{ + if (MACHINE_HAS_SCC) + return (long long) a > (long long) b; + return a > b; +} + +/** + * tod_after_eq - compare two 64 bit TOD values + * @a: first 64 bit TOD timestamp + * @b: second 64 bit TOD timestamp + * + * Returns: true if a is later than b + */ +static inline int tod_after_eq(unsigned long long a, unsigned long long b) +{ + if (MACHINE_HAS_SCC) + return (long long) a >= (long long) b; + return a >= b; +} + #endif diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 2eb8ff0d6fca..3a14b864b2e3 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -135,7 +135,7 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, unsigned long address) { - if (tlb->mm->context.asce_limit <= (1UL << 31)) + if (tlb->mm->context.asce_limit <= _REGION3_SIZE) return; pgtable_pmd_page_dtor(virt_to_page(pmd)); tlb_remove_table(tlb, pmd); @@ -151,7 +151,7 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, unsigned long address) { - if (tlb->mm->context.asce_limit <= (1UL << 53)) + if (tlb->mm->context.asce_limit <= _REGION1_SIZE) return; tlb_remove_table(tlb, p4d); } @@ -166,7 +166,7 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, unsigned long address) { - if (tlb->mm->context.asce_limit <= (1UL << 42)) + if (tlb->mm->context.asce_limit <= _REGION2_SIZE) return; tlb_remove_table(tlb, pud); } diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 39846100682a..4d759f8f4bc7 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -20,10 +20,15 @@ static inline void __tlb_flush_local(void) */ static inline void __tlb_flush_idte(unsigned long asce) { + unsigned long opt; + + opt = IDTE_PTOA; + if (MACHINE_HAS_TLB_GUEST) + opt |= IDTE_GUEST_ASCE; /* Global TLB flush for the mm */ asm volatile( " .insn rrf,0xb98e0000,0,%0,%1,0" - : : "a" (2048), "a" (asce) : "cc"); + : : "a" (opt), "a" (asce) : "cc"); } #ifdef CONFIG_SMP diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index fa1bfce10370..5222da162b69 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -77,12 +77,6 @@ static inline const struct cpumask *cpumask_of_node(int node) return &node_to_cpumask_map[node]; } -/* - * Returns the number of the node containing node 'node'. This - * architecture is flat, so it is a pretty simple function! - */ -#define parent_node(node) (node) - #define pcibus_to_node(bus) __pcibus_to_node(bus) #define node_distance(a, b) __node_distance(a, b) diff --git a/arch/s390/include/asm/types.h b/arch/s390/include/asm/types.h deleted file mode 100644 index 6740f4f9781f..000000000000 --- a/arch/s390/include/asm/types.h +++ /dev/null @@ -1,11 +0,0 @@ -/* - * S390 version - * - * Derived from "include/asm-i386/types.h" - */ -#ifndef _S390_TYPES_H -#define _S390_TYPES_H - -#include <uapi/asm/types.h> - -#endif /* _S390_TYPES_H */ diff --git a/arch/s390/include/asm/unaligned.h b/arch/s390/include/asm/unaligned.h deleted file mode 100644 index da9627afe5d8..000000000000 --- a/arch/s390/include/asm/unaligned.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _ASM_S390_UNALIGNED_H -#define _ASM_S390_UNALIGNED_H - -/* - * The S390 can do unaligned accesses itself. - */ -#include <linux/unaligned/access_ok.h> -#include <linux/unaligned/generic.h> - -#define get_unaligned __get_unaligned_be -#define put_unaligned __put_unaligned_be - -#endif /* _ASM_S390_UNALIGNED_H */ diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild index ca62066895e0..098f28778a13 100644 --- a/arch/s390/include/uapi/asm/Kbuild +++ b/arch/s390/include/uapi/asm/Kbuild @@ -9,4 +9,5 @@ generic-y += param.h generic-y += poll.h generic-y += resource.h generic-y += sockios.h +generic-y += swab.h generic-y += termbits.h diff --git a/arch/s390/include/uapi/asm/dasd.h b/arch/s390/include/uapi/asm/dasd.h index 1340311dab77..ab5797cdc1b7 100644 --- a/arch/s390/include/uapi/asm/dasd.h +++ b/arch/s390/include/uapi/asm/dasd.h @@ -72,7 +72,10 @@ typedef struct dasd_information2_t { * 0x02: use diag discipline (diag) * 0x04: set the device initially online (internal use only) * 0x08: enable ERP related logging - * 0x20: give access to raw eckd data + * 0x10: allow I/O to fail on lost paths + * 0x20: allow I/O to fail when a lock was stolen + * 0x40: give access to raw eckd data + * 0x80: enable discard support */ #define DASD_FEATURE_DEFAULT 0x00 #define DASD_FEATURE_READONLY 0x01 @@ -82,6 +85,7 @@ typedef struct dasd_information2_t { #define DASD_FEATURE_FAILFAST 0x10 #define DASD_FEATURE_FAILONSLCK 0x20 #define DASD_FEATURE_USERAW 0x40 +#define DASD_FEATURE_DISCARD 0x80 #define DASD_PARTN_BITS 2 diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index 52a63f4175cb..a56916c83565 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -108,4 +108,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/swab.h b/arch/s390/include/uapi/asm/swab.h deleted file mode 100644 index da3bfe5cc161..000000000000 --- a/arch/s390/include/uapi/asm/swab.h +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef _S390_SWAB_H -#define _S390_SWAB_H - -/* - * S390 version - * Copyright IBM Corp. 1999 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - */ - -#include <linux/types.h> - -#ifndef __s390x__ -# define __SWAB_64_THRU_32__ -#endif - -#ifdef __s390x__ -static inline __u64 __arch_swab64p(const __u64 *x) -{ - __u64 result; - - asm volatile("lrvg %0,%1" : "=d" (result) : "m" (*x)); - return result; -} -#define __arch_swab64p __arch_swab64p - -static inline __u64 __arch_swab64(__u64 x) -{ - __u64 result; - - asm volatile("lrvgr %0,%1" : "=d" (result) : "d" (x)); - return result; -} -#define __arch_swab64 __arch_swab64 - -static inline void __arch_swab64s(__u64 *x) -{ - *x = __arch_swab64p(x); -} -#define __arch_swab64s __arch_swab64s -#endif /* __s390x__ */ - -static inline __u32 __arch_swab32p(const __u32 *x) -{ - __u32 result; - - asm volatile( -#ifndef __s390x__ - " icm %0,8,%O1+3(%R1)\n" - " icm %0,4,%O1+2(%R1)\n" - " icm %0,2,%O1+1(%R1)\n" - " ic %0,%1" - : "=&d" (result) : "Q" (*x) : "cc"); -#else /* __s390x__ */ - " lrv %0,%1" - : "=d" (result) : "m" (*x)); -#endif /* __s390x__ */ - return result; -} -#define __arch_swab32p __arch_swab32p - -#ifdef __s390x__ -static inline __u32 __arch_swab32(__u32 x) -{ - __u32 result; - - asm volatile("lrvr %0,%1" : "=d" (result) : "d" (x)); - return result; -} -#define __arch_swab32 __arch_swab32 -#endif /* __s390x__ */ - -static inline __u16 __arch_swab16p(const __u16 *x) -{ - __u16 result; - - asm volatile( -#ifndef __s390x__ - " icm %0,2,%O1+1(%R1)\n" - " ic %0,%1\n" - : "=&d" (result) : "Q" (*x) : "cc"); -#else /* __s390x__ */ - " lrvh %0,%1" - : "=d" (result) : "m" (*x)); -#endif /* __s390x__ */ - return result; -} -#define __arch_swab16p __arch_swab16p - -#endif /* _S390_SWAB_H */ diff --git a/arch/s390/include/uapi/asm/vmcp.h b/arch/s390/include/uapi/asm/vmcp.h new file mode 100644 index 000000000000..4caf71714a55 --- /dev/null +++ b/arch/s390/include/uapi/asm/vmcp.h @@ -0,0 +1,24 @@ +/* + * Copyright IBM Corp. 2004, 2005 + * Interface implementation for communication with the z/VM control program + * Version 1.0 + * Author(s): Christian Borntraeger <cborntra@de.ibm.com> + * + * + * z/VMs CP offers the possibility to issue commands via the diagnose code 8 + * this driver implements a character device that issues these commands and + * returns the answer of CP. + * + * The idea of this driver is based on cpint from Neale Ferguson + */ + +#ifndef _UAPI_ASM_VMCP_H +#define _UAPI_ASM_VMCP_H + +#include <linux/ioctl.h> + +#define VMCP_GETCODE _IOR(0x10, 1, int) +#define VMCP_SETBUF _IOW(0x10, 2, int) +#define VMCP_GETSIZE _IOR(0x10, 3, int) + +#endif /* _UAPI_ASM_VMCP_H */ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index b65c414b6c0e..3d42f91c95fd 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -158,6 +158,7 @@ int main(void) OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock); OFFSET(__LC_INT_CLOCK, lowcore, int_clock); OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock); + OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock); OFFSET(__LC_CURRENT, lowcore, current_task); OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack); OFFSET(__LC_ASYNC_STACK, lowcore, async_stack); diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c index 9f0e4a2785f7..63bc6603e0ed 100644 --- a/arch/s390/kernel/cpcmd.c +++ b/arch/s390/kernel/cpcmd.c @@ -14,6 +14,7 @@ #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> +#include <linux/mm.h> #include <asm/diag.h> #include <asm/ebcdic.h> #include <asm/cpcmd.h> @@ -28,9 +29,7 @@ static int diag8_noresponse(int cmdlen) register unsigned long reg3 asm ("3") = cmdlen; asm volatile( - " sam31\n" " diag %1,%0,0x8\n" - " sam64\n" : "+d" (reg3) : "d" (reg2) : "cc"); return reg3; } @@ -43,9 +42,7 @@ static int diag8_response(int cmdlen, char *response, int *rlen) register unsigned long reg5 asm ("5") = *rlen; asm volatile( - " sam31\n" " diag %2,%0,0x8\n" - " sam64\n" " brc 8,1f\n" " agr %1,%4\n" "1:\n" @@ -57,7 +54,6 @@ static int diag8_response(int cmdlen, char *response, int *rlen) /* * __cpcmd has some restrictions over cpcmd - * - the response buffer must reside below 2GB (if any) * - __cpcmd is unlocked and therefore not SMP-safe */ int __cpcmd(const char *cmd, char *response, int rlen, int *response_code) @@ -88,13 +84,12 @@ EXPORT_SYMBOL(__cpcmd); int cpcmd(const char *cmd, char *response, int rlen, int *response_code) { + unsigned long flags; char *lowbuf; int len; - unsigned long flags; - if ((virt_to_phys(response) != (unsigned long) response) || - (((unsigned long)response + rlen) >> 31)) { - lowbuf = kmalloc(rlen, GFP_KERNEL | GFP_DMA); + if (is_vmalloc_or_module_addr(response)) { + lowbuf = kmalloc(rlen, GFP_KERNEL); if (!lowbuf) { pr_warn("The cpcmd kernel function failed to allocate a response buffer\n"); return -ENOMEM; diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 86b3e74f569e..1d9e83c401fc 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -866,7 +866,8 @@ static inline void debug_finish_entry(debug_info_t * id, debug_entry_t* active, int level, int exception) { - active->id.stck = get_tod_clock_fast() - sched_clock_base_cc; + active->id.stck = get_tod_clock_fast() - + *(unsigned long long *) &tod_clock_base[1]; active->id.fields.cpuid = smp_processor_id(); active->caller = __builtin_return_address(0); active->id.fields.exception = exception; @@ -1455,15 +1456,15 @@ int debug_dflt_header_fn(debug_info_t * id, struct debug_view *view, int area, debug_entry_t * entry, char *out_buf) { - unsigned long sec, usec; + unsigned long base, sec, usec; char *except_str; unsigned long caller; int rc = 0; unsigned int level; level = entry->id.fields.level; - sec = (entry->id.stck >> 12) + (sched_clock_base_cc >> 12); - sec = sec - (TOD_UNIX_EPOCH >> 12); + base = (*(unsigned long *) &tod_clock_base[0]) >> 4; + sec = (entry->id.stck >> 12) + base - (TOD_UNIX_EPOCH >> 12); usec = do_div(sec, USEC_PER_SEC); if (entry->id.fields.exception) diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index dab78babfab6..2aa545dca4d5 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -76,7 +76,7 @@ void dump_trace(dump_trace_func_t func, void *data, struct task_struct *task, frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); #ifdef CONFIG_CHECK_STACK sp = __dump_trace(func, data, sp, - S390_lowcore.panic_stack + frame_size - 4096, + S390_lowcore.panic_stack + frame_size - PAGE_SIZE, S390_lowcore.panic_stack + frame_size); #endif sp = __dump_trace(func, data, sp, diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 5d20182ee8ae..ca8cd80e8feb 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -53,8 +53,9 @@ static void __init reset_tod_clock(void) if (set_tod_clock(TOD_UNIX_EPOCH) != 0 || store_tod_clock(&time) != 0) disabled_wait(0); - sched_clock_base_cc = TOD_UNIX_EPOCH; - S390_lowcore.last_update_clock = sched_clock_base_cc; + memset(tod_clock_base, 0, 16); + *(__u64 *) &tod_clock_base[1] = TOD_UNIX_EPOCH; + S390_lowcore.last_update_clock = TOD_UNIX_EPOCH; } #ifdef CONFIG_SHARED_KERNEL @@ -165,8 +166,8 @@ static noinline __init void create_kernel_nss(void) } /* re-initialize cputime accounting. */ - sched_clock_base_cc = get_tod_clock(); - S390_lowcore.last_update_clock = sched_clock_base_cc; + get_tod_clock_ext(tod_clock_base); + S390_lowcore.last_update_clock = *(__u64 *) &tod_clock_base[1]; S390_lowcore.last_update_timer = 0x7fffffffffffffffULL; S390_lowcore.user_timer = 0; S390_lowcore.system_timer = 0; @@ -387,6 +388,12 @@ static __init void detect_machine_facilities(void) } if (test_facility(133)) S390_lowcore.machine_flags |= MACHINE_FLAG_GS; + if (test_facility(139) && (tod_clock_base[1] & 0x80)) { + /* Enabled signed clock comparator comparisons */ + S390_lowcore.machine_flags |= MACHINE_FLAG_SCC; + clock_comparator_max = -1ULL >> 1; + __ctl_set_bit(0, 53); + } } static inline void save_vector_registers(void) @@ -413,7 +420,7 @@ static int __init disable_vector_extension(char *str) { S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX; __ctl_clear_bit(0, 17); - return 1; + return 0; } early_param("novx", disable_vector_extension); diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S index eff5b31671d4..8ed753c72d9b 100644 --- a/arch/s390/kernel/head.S +++ b/arch/s390/kernel/head.S @@ -302,7 +302,8 @@ ENTRY(startup_kdump) xc 0xe00(256),0xe00 xc 0xf00(256),0xf00 lctlg %c0,%c15,0x200(%r0) # initialize control registers - stck __LC_LAST_UPDATE_CLOCK + stcke __LC_BOOT_CLOCK + mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1 spt 6f-.LPG0(%r13) mvc __LC_LAST_UPDATE_TIMER(8),6f-.LPG0(%r13) l %r15,.Lstack-.LPG0(%r13) diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 31c91f24e562..0d8f2a858ced 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -21,8 +21,8 @@ ENTRY(startup_continue) xc __LC_LPP+1(7,0),__LC_LPP+1 # clear lpp and current_pid mvi __LC_LPP,0x80 # and set LPP_MAGIC .insn s,0xb2800000,__LC_LPP # load program parameter -0: larl %r1,sched_clock_base_cc - mvc 0(8,%r1),__LC_LAST_UPDATE_CLOCK +0: larl %r1,tod_clock_base + mvc 0(16,%r1),__LC_BOOT_CLOCK larl %r13,.LPG1 # get base lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers lg %r12,.Lparmaddr-.LPG1(%r13) # pointer to parameter area diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 6dca93b29bed..a2fdff0e730b 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -105,7 +105,8 @@ void do_IRQ(struct pt_regs *regs, int irq) old_regs = set_irq_regs(regs); irq_enter(); - if (S390_lowcore.int_clock >= S390_lowcore.clock_comparator) + if (tod_after_eq(S390_lowcore.int_clock, + S390_lowcore.clock_comparator)) /* Serve timer interrupts first. */ clock_comparator_work(); generic_handle_irq(irq); diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S index cfac28330b03..4bdc65636603 100644 --- a/arch/s390/kernel/relocate_kernel.S +++ b/arch/s390/kernel/relocate_kernel.S @@ -7,6 +7,7 @@ */ #include <linux/linkage.h> +#include <asm/page.h> #include <asm/sigp.h> /* @@ -55,8 +56,8 @@ ENTRY(relocate_kernel) .back_pgm: lmg %r0,%r15,gprregs-.base(%r13) .top: - lghi %r7,4096 # load PAGE_SIZE in r7 - lghi %r9,4096 # load PAGE_SIZE in r9 + lghi %r7,PAGE_SIZE # load PAGE_SIZE in r7 + lghi %r9,PAGE_SIZE # load PAGE_SIZE in r9 lg %r5,0(%r2) # read another word for indirection page aghi %r2,8 # increment pointer tml %r5,0x1 # is it a destination page? diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 3d1d808ea8a9..164a1e16b53e 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -305,7 +305,7 @@ static void __init setup_lowcore(void) /* * Setup lowcore for boot cpu */ - BUILD_BUG_ON(sizeof(struct lowcore) != LC_PAGES * 4096); + BUILD_BUG_ON(sizeof(struct lowcore) != LC_PAGES * PAGE_SIZE); lc = memblock_virt_alloc_low(sizeof(*lc), sizeof(*lc)); lc->restart_psw.mask = PSW_KERNEL_BITS; lc->restart_psw.addr = (unsigned long) restart_int_handler; @@ -323,7 +323,7 @@ static void __init setup_lowcore(void) lc->io_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | PSW_MASK_MCHECK; lc->io_new_psw.addr = (unsigned long) io_int_handler; - lc->clock_comparator = -1ULL; + lc->clock_comparator = clock_comparator_max; lc->kernel_stack = ((unsigned long) &init_thread_union) + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); lc->async_stack = (unsigned long) @@ -469,10 +469,10 @@ static void __init setup_memory_end(void) vmalloc_size = VMALLOC_END ?: (128UL << 30) - MODULES_LEN; tmp = (memory_end ?: max_physmem_end) / PAGE_SIZE; tmp = tmp * (sizeof(struct page) + PAGE_SIZE); - if (tmp + vmalloc_size + MODULES_LEN <= (1UL << 42)) - vmax = 1UL << 42; /* 3-level kernel page table */ + if (tmp + vmalloc_size + MODULES_LEN <= _REGION2_SIZE) + vmax = _REGION2_SIZE; /* 3-level kernel page table */ else - vmax = 1UL << 53; /* 4-level kernel page table */ + vmax = _REGION1_SIZE; /* 4-level kernel page table */ /* module area is at the end of the kernel address space. */ MODULES_END = vmax; MODULES_VADDR = MODULES_END - MODULES_LEN; @@ -818,6 +818,9 @@ static int __init setup_hwcaps(void) case 0x2965: strcpy(elf_platform, "z13"); break; + case 0x3906: + strcpy(elf_platform, "z14"); + break; } /* @@ -922,6 +925,7 @@ void __init setup_arch(char **cmdline_p) setup_memory_end(); setup_memory(); dma_contiguous_reserve(memory_end); + vmcp_cma_reserve(); check_initrd(); reserve_crashkernel(); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 1020a11a24e5..1cee6753d47a 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -1181,6 +1181,7 @@ static int __init s390_smp_init(void) rc = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "s390/smp:online", smp_cpu_online, smp_cpu_pre_down); + rc = rc <= 0 ? rc : 0; out: return rc; } diff --git a/arch/s390/kernel/suspend.c b/arch/s390/kernel/suspend.c index 39e2f41b6cf0..c8ea715bfe10 100644 --- a/arch/s390/kernel/suspend.c +++ b/arch/s390/kernel/suspend.c @@ -98,10 +98,16 @@ int page_key_alloc(unsigned long pages) */ void page_key_read(unsigned long *pfn) { + struct page *page; unsigned long addr; - - addr = (unsigned long) page_address(pfn_to_page(*pfn)); - *(unsigned char *) pfn = (unsigned char) page_get_storage_key(addr); + unsigned char key; + + page = pfn_to_page(*pfn); + addr = (unsigned long) page_address(page); + key = (unsigned char) page_get_storage_key(addr) & 0x7f; + if (arch_test_page_nodat(page)) + key |= 0x80; + *(unsigned char *) pfn = key; } /* @@ -126,8 +132,16 @@ void page_key_memorize(unsigned long *pfn) */ void page_key_write(void *address) { - page_set_storage_key((unsigned long) address, - page_key_rp->data[page_key_rx], 0); + struct page *page; + unsigned char key; + + key = page_key_rp->data[page_key_rx]; + page_set_storage_key((unsigned long) address, key & 0x7f, 0); + page = virt_to_page(address); + if (key & 0x80) + arch_set_page_nodat(page, 0); + else + arch_set_page_dat(page, 0); if (++page_key_rx >= PAGE_KEY_DATA_SIZE) return; page_key_rp = page_key_rp->next; diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 192efdfac918..5cbd52169348 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -51,8 +51,15 @@ #include <asm/cio.h> #include "entry.h" -u64 sched_clock_base_cc = -1; /* Force to data section. */ -EXPORT_SYMBOL_GPL(sched_clock_base_cc); +unsigned char tod_clock_base[16] __aligned(8) = { + /* Force to data section. */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; +EXPORT_SYMBOL_GPL(tod_clock_base); + +u64 clock_comparator_max = -1ULL; +EXPORT_SYMBOL_GPL(clock_comparator_max); static DEFINE_PER_CPU(struct clock_event_device, comparators); @@ -75,7 +82,7 @@ void __init time_early_init(void) struct ptff_qui qui; /* Initialize TOD steering parameters */ - tod_steering_end = sched_clock_base_cc; + tod_steering_end = *(unsigned long long *) &tod_clock_base[1]; vdso_data->ts_end = tod_steering_end; if (!test_facility(28)) @@ -111,22 +118,27 @@ unsigned long long monotonic_clock(void) } EXPORT_SYMBOL(monotonic_clock); -static void tod_to_timeval(__u64 todval, struct timespec64 *xt) +static void ext_to_timespec64(unsigned char *clk, struct timespec64 *xt) { - unsigned long long sec; + unsigned long long high, low, rem, sec, nsec; + + /* Split extendnd TOD clock to micro-seconds and sub-micro-seconds */ + high = (*(unsigned long long *) clk) >> 4; + low = (*(unsigned long long *)&clk[7]) << 4; + /* Calculate seconds and nano-seconds */ + sec = high; + rem = do_div(sec, 1000000); + nsec = (((low >> 32) + (rem << 32)) * 1000) >> 32; - sec = todval >> 12; - do_div(sec, 1000000); xt->tv_sec = sec; - todval -= (sec * 1000000) << 12; - xt->tv_nsec = ((todval * 1000) >> 12); + xt->tv_nsec = nsec; } void clock_comparator_work(void) { struct clock_event_device *cd; - S390_lowcore.clock_comparator = -1ULL; + S390_lowcore.clock_comparator = clock_comparator_max; cd = this_cpu_ptr(&comparators); cd->event_handler(cd); } @@ -148,7 +160,7 @@ void init_cpu_timer(void) struct clock_event_device *cd; int cpu; - S390_lowcore.clock_comparator = -1ULL; + S390_lowcore.clock_comparator = clock_comparator_max; set_clock_comparator(S390_lowcore.clock_comparator); cpu = smp_processor_id(); @@ -179,7 +191,7 @@ static void clock_comparator_interrupt(struct ext_code ext_code, unsigned long param64) { inc_irq_stat(IRQEXT_CLK); - if (S390_lowcore.clock_comparator == -1ULL) + if (S390_lowcore.clock_comparator == clock_comparator_max) set_clock_comparator(S390_lowcore.clock_comparator); } @@ -197,18 +209,28 @@ static void stp_reset(void); void read_persistent_clock64(struct timespec64 *ts) { - __u64 clock; + unsigned char clk[STORE_CLOCK_EXT_SIZE]; + __u64 delta; - clock = get_tod_clock() - initial_leap_seconds; - tod_to_timeval(clock - TOD_UNIX_EPOCH, ts); + delta = initial_leap_seconds + TOD_UNIX_EPOCH; + get_tod_clock_ext(clk); + *(__u64 *) &clk[1] -= delta; + if (*(__u64 *) &clk[1] > delta) + clk[0]--; + ext_to_timespec64(clk, ts); } void read_boot_clock64(struct timespec64 *ts) { - __u64 clock; + unsigned char clk[STORE_CLOCK_EXT_SIZE]; + __u64 delta; - clock = sched_clock_base_cc - initial_leap_seconds; - tod_to_timeval(clock - TOD_UNIX_EPOCH, ts); + delta = initial_leap_seconds + TOD_UNIX_EPOCH; + memcpy(clk, tod_clock_base, 16); + *(__u64 *) &clk[1] -= delta; + if (*(__u64 *) &clk[1] > delta) + clk[0]--; + ext_to_timespec64(clk, ts); } static u64 read_tod_clock(struct clocksource *cs) @@ -335,7 +357,7 @@ static unsigned long clock_sync_flags; * source. If the clock mode is local it will return -EOPNOTSUPP and * -EAGAIN if the clock is not in sync with the external reference. */ -int get_phys_clock(unsigned long long *clock) +int get_phys_clock(unsigned long *clock) { atomic_t *sw_ptr; unsigned int sw0, sw1; @@ -406,7 +428,10 @@ static void clock_sync_global(unsigned long long delta) struct ptff_qto qto; /* Fixup the monotonic sched clock. */ - sched_clock_base_cc += delta; + *(unsigned long long *) &tod_clock_base[1] += delta; + if (*(unsigned long long *) &tod_clock_base[1] < delta) + /* Epoch overflow */ + tod_clock_base[0]++; /* Adjust TOD steering parameters. */ vdso_data->tb_update_count++; now = get_tod_clock(); @@ -437,7 +462,7 @@ static void clock_sync_global(unsigned long long delta) static void clock_sync_local(unsigned long long delta) { /* Add the delta to the clock comparator. */ - if (S390_lowcore.clock_comparator != -1ULL) { + if (S390_lowcore.clock_comparator != clock_comparator_max) { S390_lowcore.clock_comparator += delta; set_clock_comparator(S390_lowcore.clock_comparator); } diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index b89d19f6f2ab..eacda05b45d7 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -157,6 +157,8 @@ int vdso_alloc_per_cpu(struct lowcore *lowcore) page_frame = get_zeroed_page(GFP_KERNEL); if (!segment_table || !page_table || !page_frame) goto out; + arch_set_page_dat(virt_to_page(segment_table), SEGMENT_ORDER); + arch_set_page_dat(virt_to_page(page_table), 0); /* Initialize per-cpu vdso data page */ vd = (struct vdso_per_cpu_data *) page_frame; diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S index 8f048c2d6d13..263a7f9eee1e 100644 --- a/arch/s390/kernel/vdso32/vdso32.lds.S +++ b/arch/s390/kernel/vdso32/vdso32.lds.S @@ -2,6 +2,8 @@ * This is the infamous ld script for the 32 bits vdso * library */ + +#include <asm/page.h> #include <asm/vdso.h> OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") @@ -91,7 +93,7 @@ SECTIONS .debug_ranges 0 : { *(.debug_ranges) } .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); PROVIDE(_vdso_data = .); /DISCARD/ : { diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S index f35455d497fe..9e3dbbcc1cfc 100644 --- a/arch/s390/kernel/vdso64/vdso64.lds.S +++ b/arch/s390/kernel/vdso64/vdso64.lds.S @@ -2,6 +2,8 @@ * This is the infamous ld script for the 64 bits vdso * library */ + +#include <asm/page.h> #include <asm/vdso.h> OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") @@ -91,7 +93,7 @@ SECTIONS .debug_ranges 0 : { *(.debug_ranges) } .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); PROVIDE(_vdso_data = .); /DISCARD/ : { diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index ce865bd4f81d..e4d36094aceb 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -27,7 +27,7 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) unsigned long prefix = kvm_s390_get_prefix(vcpu); start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; - end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + 4096; + end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + PAGE_SIZE; vcpu->stat.diagnose_10++; if (start & ~PAGE_MASK || end & ~PAGE_MASK || start >= end @@ -51,9 +51,9 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) */ gmap_discard(vcpu->arch.gmap, start, prefix); if (start <= prefix) - gmap_discard(vcpu->arch.gmap, 0, 4096); - if (end > prefix + 4096) - gmap_discard(vcpu->arch.gmap, 4096, 8192); + gmap_discard(vcpu->arch.gmap, 0, PAGE_SIZE); + if (end > prefix + PAGE_SIZE) + gmap_discard(vcpu->arch.gmap, PAGE_SIZE, 2 * PAGE_SIZE); gmap_discard(vcpu->arch.gmap, prefix + 2 * PAGE_SIZE, end); } return 0; diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 653cae5e1ee1..3cc77391a102 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -629,7 +629,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, iep = ctlreg0.iep && test_kvm_facility(vcpu->kvm, 130); if (asce.r) goto real_address; - ptr = asce.origin * 4096; + ptr = asce.origin * PAGE_SIZE; switch (asce.dt) { case ASCE_TYPE_REGION1: if (vaddr.rfx01 > asce.tl) @@ -674,7 +674,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, return PGM_REGION_SECOND_TRANS; if (edat1) dat_protection |= rfte.p; - ptr = rfte.rto * 4096 + vaddr.rsx * 8; + ptr = rfte.rto * PAGE_SIZE + vaddr.rsx * 8; } /* fallthrough */ case ASCE_TYPE_REGION2: { @@ -692,7 +692,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, return PGM_REGION_THIRD_TRANS; if (edat1) dat_protection |= rste.p; - ptr = rste.rto * 4096 + vaddr.rtx * 8; + ptr = rste.rto * PAGE_SIZE + vaddr.rtx * 8; } /* fallthrough */ case ASCE_TYPE_REGION3: { @@ -720,7 +720,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, return PGM_SEGMENT_TRANSLATION; if (edat1) dat_protection |= rtte.fc0.p; - ptr = rtte.fc0.sto * 4096 + vaddr.sx * 8; + ptr = rtte.fc0.sto * PAGE_SIZE + vaddr.sx * 8; } /* fallthrough */ case ASCE_TYPE_SEGMENT: { @@ -743,7 +743,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, goto absolute_address; } dat_protection |= ste.fc0.p; - ptr = ste.fc0.pto * 2048 + vaddr.px * 8; + ptr = ste.fc0.pto * (PAGE_SIZE / 2) + vaddr.px * 8; } } if (kvm_is_error_gpa(vcpu->kvm, ptr)) @@ -993,7 +993,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, parent = sg->parent; vaddr.addr = saddr; asce.val = sg->orig_asce; - ptr = asce.origin * 4096; + ptr = asce.origin * PAGE_SIZE; if (asce.r) { *fake = 1; ptr = 0; @@ -1029,7 +1029,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, union region1_table_entry rfte; if (*fake) { - ptr += (unsigned long) vaddr.rfx << 53; + ptr += vaddr.rfx * _REGION1_SIZE; rfte.val = ptr; goto shadow_r2t; } @@ -1044,7 +1044,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, return PGM_REGION_SECOND_TRANS; if (sg->edat_level >= 1) *dat_protection |= rfte.p; - ptr = rfte.rto << 12UL; + ptr = rfte.rto * PAGE_SIZE; shadow_r2t: rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); if (rc) @@ -1055,7 +1055,7 @@ shadow_r2t: union region2_table_entry rste; if (*fake) { - ptr += (unsigned long) vaddr.rsx << 42; + ptr += vaddr.rsx * _REGION2_SIZE; rste.val = ptr; goto shadow_r3t; } @@ -1070,7 +1070,7 @@ shadow_r2t: return PGM_REGION_THIRD_TRANS; if (sg->edat_level >= 1) *dat_protection |= rste.p; - ptr = rste.rto << 12UL; + ptr = rste.rto * PAGE_SIZE; shadow_r3t: rste.p |= *dat_protection; rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); @@ -1082,7 +1082,7 @@ shadow_r3t: union region3_table_entry rtte; if (*fake) { - ptr += (unsigned long) vaddr.rtx << 31; + ptr += vaddr.rtx * _REGION3_SIZE; rtte.val = ptr; goto shadow_sgt; } @@ -1098,7 +1098,7 @@ shadow_r3t: if (rtte.fc && sg->edat_level >= 2) { *dat_protection |= rtte.fc0.p; *fake = 1; - ptr = rtte.fc1.rfaa << 31UL; + ptr = rtte.fc1.rfaa * _REGION3_SIZE; rtte.val = ptr; goto shadow_sgt; } @@ -1106,7 +1106,7 @@ shadow_r3t: return PGM_SEGMENT_TRANSLATION; if (sg->edat_level >= 1) *dat_protection |= rtte.fc0.p; - ptr = rtte.fc0.sto << 12UL; + ptr = rtte.fc0.sto * PAGE_SIZE; shadow_sgt: rtte.fc0.p |= *dat_protection; rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); @@ -1118,7 +1118,7 @@ shadow_sgt: union segment_table_entry ste; if (*fake) { - ptr += (unsigned long) vaddr.sx << 20; + ptr += vaddr.sx * _SEGMENT_SIZE; ste.val = ptr; goto shadow_pgt; } @@ -1134,11 +1134,11 @@ shadow_sgt: *dat_protection |= ste.fc0.p; if (ste.fc && sg->edat_level >= 1) { *fake = 1; - ptr = ste.fc1.sfaa << 20UL; + ptr = ste.fc1.sfaa * _SEGMENT_SIZE; ste.val = ptr; goto shadow_pgt; } - ptr = ste.fc0.pto << 11UL; + ptr = ste.fc0.pto * (PAGE_SIZE / 2); shadow_pgt: ste.fc0.p |= *dat_protection; rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); @@ -1187,8 +1187,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, vaddr.addr = saddr; if (fake) { - /* offset in 1MB guest memory block */ - pte.val = pgt + ((unsigned long) vaddr.px << 12UL); + pte.val = pgt + vaddr.px * PAGE_SIZE; goto shadow_page; } if (!rc) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 8a1dac793d6b..785ad028bde6 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -329,7 +329,7 @@ static int handle_sske(struct kvm_vcpu *vcpu) start = kvm_s390_logical_to_effective(vcpu, start); if (m3 & SSKE_MB) { /* start already designates an absolute address */ - end = (start + (1UL << 20)) & ~((1UL << 20) - 1); + end = (start + _SEGMENT_SIZE) & ~(_SEGMENT_SIZE - 1); } else { start = kvm_s390_real_to_abs(vcpu, start); end = start + PAGE_SIZE; @@ -893,10 +893,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) case 0x00000000: /* only 4k frames specify a real address */ start = kvm_s390_real_to_abs(vcpu, start); - end = (start + (1UL << 12)) & ~((1UL << 12) - 1); + end = (start + PAGE_SIZE) & ~(PAGE_SIZE - 1); break; case 0x00001000: - end = (start + (1UL << 20)) & ~((1UL << 20) - 1); + end = (start + _SEGMENT_SIZE) & ~(_SEGMENT_SIZE - 1); break; case 0x00002000: /* only support 2G frame size if EDAT2 is available and we are @@ -904,7 +904,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) if (!test_kvm_facility(vcpu->kvm, 78) || psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_BITS_AMODE_24BIT) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - end = (start + (1UL << 31)) & ~((1UL << 31) - 1); + end = (start + _REGION3_SIZE) & ~(_REGION3_SIZE - 1); break; default: return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 715c19c45d9a..ba8203e4d516 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -1069,7 +1069,7 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - BUILD_BUG_ON(sizeof(struct vsie_page) != 4096); + BUILD_BUG_ON(sizeof(struct vsie_page) != PAGE_SIZE); scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL); /* 512 byte alignment */ diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c index 92e90e40b6fb..7f17555ad4d5 100644 --- a/arch/s390/lib/delay.c +++ b/arch/s390/lib/delay.c @@ -57,7 +57,7 @@ static void __udelay_enabled(unsigned long long usecs) end = get_tod_clock_fast() + (usecs << 12); do { clock_saved = 0; - if (end < S390_lowcore.clock_comparator) { + if (tod_after(S390_lowcore.clock_comparator, end)) { clock_saved = local_tick_disable(); set_clock_comparator(end); } diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index ffb15bd4c593..b12663d653d8 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -32,42 +32,63 @@ static int __init spin_retry_setup(char *str) } __setup("spin_retry=", spin_retry_setup); +static inline int arch_load_niai4(int *lock) +{ + int owner; + + asm volatile( +#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES + " .long 0xb2fa0040\n" /* NIAI 4 */ +#endif + " l %0,%1\n" + : "=d" (owner) : "Q" (*lock) : "memory"); + return owner; +} + +static inline int arch_cmpxchg_niai8(int *lock, int old, int new) +{ + int expected = old; + + asm volatile( +#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES + " .long 0xb2fa0080\n" /* NIAI 8 */ +#endif + " cs %0,%3,%1\n" + : "=d" (old), "=Q" (*lock) + : "0" (old), "d" (new), "Q" (*lock) + : "cc", "memory"); + return expected == old; +} + void arch_spin_lock_wait(arch_spinlock_t *lp) { int cpu = SPINLOCK_LOCKVAL; - int owner, count, first_diag; + int owner, count; + + /* Pass the virtual CPU to the lock holder if it is not running */ + owner = arch_load_niai4(&lp->lock); + if (owner && arch_vcpu_is_preempted(~owner)) + smp_yield_cpu(~owner); - first_diag = 1; + count = spin_retry; while (1) { - owner = ACCESS_ONCE(lp->lock); + owner = arch_load_niai4(&lp->lock); /* Try to get the lock if it is free. */ if (!owner) { - if (__atomic_cmpxchg_bool(&lp->lock, 0, cpu)) + if (arch_cmpxchg_niai8(&lp->lock, 0, cpu)) return; continue; } - /* First iteration: check if the lock owner is running. */ - if (first_diag && arch_vcpu_is_preempted(~owner)) { - smp_yield_cpu(~owner); - first_diag = 0; + if (count-- >= 0) continue; - } - /* Loop for a while on the lock value. */ count = spin_retry; - do { - owner = ACCESS_ONCE(lp->lock); - } while (owner && count-- > 0); - if (!owner) - continue; /* * For multiple layers of hypervisors, e.g. z/VM + LPAR * yield the CPU unconditionally. For LPAR rely on the * sense running status. */ - if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) { + if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) smp_yield_cpu(~owner); - first_diag = 0; - } } } EXPORT_SYMBOL(arch_spin_lock_wait); @@ -75,42 +96,36 @@ EXPORT_SYMBOL(arch_spin_lock_wait); void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags) { int cpu = SPINLOCK_LOCKVAL; - int owner, count, first_diag; + int owner, count; local_irq_restore(flags); - first_diag = 1; + + /* Pass the virtual CPU to the lock holder if it is not running */ + owner = arch_load_niai4(&lp->lock); + if (owner && arch_vcpu_is_preempted(~owner)) + smp_yield_cpu(~owner); + + count = spin_retry; while (1) { - owner = ACCESS_ONCE(lp->lock); + owner = arch_load_niai4(&lp->lock); /* Try to get the lock if it is free. */ if (!owner) { local_irq_disable(); - if (__atomic_cmpxchg_bool(&lp->lock, 0, cpu)) + if (arch_cmpxchg_niai8(&lp->lock, 0, cpu)) return; local_irq_restore(flags); continue; } - /* Check if the lock owner is running. */ - if (first_diag && arch_vcpu_is_preempted(~owner)) { - smp_yield_cpu(~owner); - first_diag = 0; + if (count-- >= 0) continue; - } - /* Loop for a while on the lock value. */ count = spin_retry; - do { - owner = ACCESS_ONCE(lp->lock); - } while (owner && count-- > 0); - if (!owner) - continue; /* * For multiple layers of hypervisors, e.g. z/VM + LPAR * yield the CPU unconditionally. For LPAR rely on the * sense running status. */ - if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) { + if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) smp_yield_cpu(~owner); - first_diag = 0; - } } } EXPORT_SYMBOL(arch_spin_lock_wait_flags); diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index b3bd3f23b8e8..4ea9106417ee 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -15,8 +15,30 @@ #include <asm/mmu_context.h> #include <asm/facility.h> +#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES static DEFINE_STATIC_KEY_FALSE(have_mvcos); +static int __init uaccess_init(void) +{ + if (test_facility(27)) + static_branch_enable(&have_mvcos); + return 0; +} +early_initcall(uaccess_init); + +static inline int copy_with_mvcos(void) +{ + if (static_branch_likely(&have_mvcos)) + return 1; + return 0; +} +#else +static inline int copy_with_mvcos(void) +{ + return 1; +} +#endif + static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr, unsigned long size) { @@ -84,7 +106,7 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr, unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) { - if (static_branch_likely(&have_mvcos)) + if (copy_with_mvcos()) return copy_from_user_mvcos(to, from, n); return copy_from_user_mvcp(to, from, n); } @@ -157,7 +179,7 @@ static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x, unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n) { - if (static_branch_likely(&have_mvcos)) + if (copy_with_mvcos()) return copy_to_user_mvcos(to, from, n); return copy_to_user_mvcs(to, from, n); } @@ -220,7 +242,7 @@ static inline unsigned long copy_in_user_mvc(void __user *to, const void __user unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) { - if (static_branch_likely(&have_mvcos)) + if (copy_with_mvcos()) return copy_in_user_mvcos(to, from, n); return copy_in_user_mvc(to, from, n); } @@ -292,7 +314,7 @@ static inline unsigned long clear_user_xc(void __user *to, unsigned long size) unsigned long __clear_user(void __user *to, unsigned long size) { - if (static_branch_likely(&have_mvcos)) + if (copy_with_mvcos()) return clear_user_mvcos(to, size); return clear_user_xc(to, size); } @@ -349,11 +371,3 @@ long __strncpy_from_user(char *dst, const char __user *src, long size) return done; } EXPORT_SYMBOL(__strncpy_from_user); - -static int __init uaccess_init(void) -{ - if (test_facility(27)) - static_branch_enable(&have_mvcos); - return 0; -} -early_initcall(uaccess_init); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 14f25798b001..bdabb013537b 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -135,7 +135,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) pr_alert("AS:%016lx ", asce); switch (asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: - table = table + ((address >> 53) & 0x7ff); + table += (address & _REGION1_INDEX) >> _REGION1_SHIFT; if (bad_address(table)) goto bad; pr_cont("R1:%016lx ", *table); @@ -144,7 +144,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* fallthrough */ case _ASCE_TYPE_REGION2: - table = table + ((address >> 42) & 0x7ff); + table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; if (bad_address(table)) goto bad; pr_cont("R2:%016lx ", *table); @@ -153,7 +153,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* fallthrough */ case _ASCE_TYPE_REGION3: - table = table + ((address >> 31) & 0x7ff); + table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; if (bad_address(table)) goto bad; pr_cont("R3:%016lx ", *table); @@ -162,7 +162,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* fallthrough */ case _ASCE_TYPE_SEGMENT: - table = table + ((address >> 20) & 0x7ff); + table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; if (bad_address(table)) goto bad; pr_cont("S:%016lx ", *table); @@ -170,7 +170,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) goto out; table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); } - table = table + ((address >> 12) & 0xff); + table += (address & _PAGE_INDEX) >> _PAGE_SHIFT; if (bad_address(table)) goto bad; pr_cont("P:%016lx ", *table); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 4fb3d3cdb370..9e1494e3d849 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -36,16 +36,16 @@ static struct gmap *gmap_alloc(unsigned long limit) unsigned long *table; unsigned long etype, atype; - if (limit < (1UL << 31)) { - limit = (1UL << 31) - 1; + if (limit < _REGION3_SIZE) { + limit = _REGION3_SIZE - 1; atype = _ASCE_TYPE_SEGMENT; etype = _SEGMENT_ENTRY_EMPTY; - } else if (limit < (1UL << 42)) { - limit = (1UL << 42) - 1; + } else if (limit < _REGION2_SIZE) { + limit = _REGION2_SIZE - 1; atype = _ASCE_TYPE_REGION3; etype = _REGION3_ENTRY_EMPTY; - } else if (limit < (1UL << 53)) { - limit = (1UL << 53) - 1; + } else if (limit < _REGION1_SIZE) { + limit = _REGION1_SIZE - 1; atype = _ASCE_TYPE_REGION2; etype = _REGION2_ENTRY_EMPTY; } else { @@ -65,7 +65,7 @@ static struct gmap *gmap_alloc(unsigned long limit) spin_lock_init(&gmap->guest_table_lock); spin_lock_init(&gmap->shadow_lock); atomic_set(&gmap->ref_count, 1); - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) goto out_free; page->index = 0; @@ -186,7 +186,7 @@ static void gmap_free(struct gmap *gmap) gmap_flush_tlb(gmap); /* Free all segment & region tables. */ list_for_each_entry_safe(page, next, &gmap->crst_list, lru) - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); gmap_radix_tree_free(&gmap->guest_to_host); gmap_radix_tree_free(&gmap->host_to_guest); @@ -306,7 +306,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, unsigned long *new; /* since we dont free the gmap table until gmap_free we can unlock */ - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; new = (unsigned long *) page_to_phys(page); @@ -321,7 +321,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, } spin_unlock(&gmap->guest_table_lock); if (page) - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return 0; } @@ -546,30 +546,30 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) /* Create higher level tables in the gmap page table */ table = gmap->table; if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { - table += (gaddr >> 53) & 0x7ff; + table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; if ((*table & _REGION_ENTRY_INVALID) && gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, - gaddr & 0xffe0000000000000UL)) + gaddr & _REGION1_MASK)) return -ENOMEM; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { - table += (gaddr >> 42) & 0x7ff; + table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; if ((*table & _REGION_ENTRY_INVALID) && gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, - gaddr & 0xfffffc0000000000UL)) + gaddr & _REGION2_MASK)) return -ENOMEM; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { - table += (gaddr >> 31) & 0x7ff; + table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; if ((*table & _REGION_ENTRY_INVALID) && gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, - gaddr & 0xffffffff80000000UL)) + gaddr & _REGION3_MASK)) return -ENOMEM; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); } - table += (gaddr >> 20) & 0x7ff; + table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; /* Walk the parent mm page table */ mm = gmap->mm; pgd = pgd_offset(mm, vmaddr); @@ -771,7 +771,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, table = gmap->table; switch (gmap->asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: - table += (gaddr >> 53) & 0x7ff; + table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; if (level == 4) break; if (*table & _REGION_ENTRY_INVALID) @@ -779,7 +779,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* Fallthrough */ case _ASCE_TYPE_REGION2: - table += (gaddr >> 42) & 0x7ff; + table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; if (level == 3) break; if (*table & _REGION_ENTRY_INVALID) @@ -787,7 +787,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* Fallthrough */ case _ASCE_TYPE_REGION3: - table += (gaddr >> 31) & 0x7ff; + table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; if (level == 2) break; if (*table & _REGION_ENTRY_INVALID) @@ -795,13 +795,13 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* Fallthrough */ case _ASCE_TYPE_SEGMENT: - table += (gaddr >> 20) & 0x7ff; + table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; if (level == 1) break; if (*table & _REGION_ENTRY_INVALID) return NULL; table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); - table += (gaddr >> 12) & 0xff; + table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT; } return table; } @@ -1126,7 +1126,7 @@ static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ if (!table || *table & _PAGE_INVALID) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1); + gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1); ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); } @@ -1144,7 +1144,7 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, int i; BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < 256; i++, raddr += 1UL << 12) + for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE) pgt[i] = _PAGE_INVALID; } @@ -1164,8 +1164,8 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1); - sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff)); + gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); + sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); *ste = _SEGMENT_ENTRY_EMPTY; @@ -1193,7 +1193,7 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT; - for (i = 0; i < 2048; i++, raddr += 1UL << 20) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) continue; pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); @@ -1222,8 +1222,8 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */ if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1); - r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff)); + gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); + r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); *r3e = _REGION3_ENTRY_EMPTY; @@ -1231,7 +1231,7 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) /* Free segment table */ page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } /** @@ -1251,7 +1251,7 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) r3t | _ASCE_TYPE_REGION3; - for (i = 0; i < 2048; i++, raddr += 1UL << 31) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) continue; sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); @@ -1260,7 +1260,7 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, /* Free segment table */ page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1280,8 +1280,8 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1); - r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff)); + gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); + r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr); r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); *r2e = _REGION2_ENTRY_EMPTY; @@ -1289,7 +1289,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) /* Free region 3 table */ page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } /** @@ -1309,7 +1309,7 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) r2t | _ASCE_TYPE_REGION2; - for (i = 0; i < 2048; i++, raddr += 1UL << 42) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) continue; r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); @@ -1318,7 +1318,7 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, /* Free region 3 table */ page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1338,8 +1338,8 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1); - r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff)); + gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); + r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); *r1e = _REGION1_ENTRY_EMPTY; @@ -1347,7 +1347,7 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) /* Free region 2 table */ page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } /** @@ -1367,7 +1367,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; - for (i = 0; i < 2048; i++, raddr += 1UL << 53) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) continue; r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); @@ -1378,7 +1378,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, /* Free region 2 table */ page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1535,7 +1535,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, /* protect after insertion, so it will get properly invalidated */ down_read(&parent->mm->mmap_sem); rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, - ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096, + ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, PROT_READ, PGSTE_VSIE_BIT); up_read(&parent->mm->mmap_sem); spin_lock(&parent->shadow_lock); @@ -1578,7 +1578,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; page->index = r2t & _REGION_ENTRY_ORIGIN; @@ -1614,10 +1614,10 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, } spin_unlock(&sg->guest_table_lock); /* Make r2t read-only in parent gmap page table */ - raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1; + raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1; origin = r2t & _REGION_ENTRY_ORIGIN; - offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096; - len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); spin_lock(&sg->guest_table_lock); if (!rc) { @@ -1634,7 +1634,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, return rc; out_free: spin_unlock(&sg->guest_table_lock); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return rc; } EXPORT_SYMBOL_GPL(gmap_shadow_r2t); @@ -1662,7 +1662,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; page->index = r3t & _REGION_ENTRY_ORIGIN; @@ -1697,10 +1697,10 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, } spin_unlock(&sg->guest_table_lock); /* Make r3t read-only in parent gmap page table */ - raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2; + raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2; origin = r3t & _REGION_ENTRY_ORIGIN; - offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096; - len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); spin_lock(&sg->guest_table_lock); if (!rc) { @@ -1717,7 +1717,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, return rc; out_free: spin_unlock(&sg->guest_table_lock); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return rc; } EXPORT_SYMBOL_GPL(gmap_shadow_r3t); @@ -1745,7 +1745,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); /* Allocate a shadow segment table */ - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; page->index = sgt & _REGION_ENTRY_ORIGIN; @@ -1781,10 +1781,10 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, } spin_unlock(&sg->guest_table_lock); /* Make sgt read-only in parent gmap page table */ - raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3; + raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3; origin = sgt & _REGION_ENTRY_ORIGIN; - offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096; - len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); spin_lock(&sg->guest_table_lock); if (!rc) { @@ -1801,7 +1801,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, return rc; out_free: spin_unlock(&sg->guest_table_lock); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return rc; } EXPORT_SYMBOL_GPL(gmap_shadow_sgt); @@ -1902,7 +1902,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, } spin_unlock(&sg->guest_table_lock); /* Make pgt read-only in parent gmap page table (not the pgste) */ - raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT; + raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT; origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ); spin_lock(&sg->guest_table_lock); @@ -2021,7 +2021,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, } /* Check for top level table */ start = sg->orig_asce & _ASCE_ORIGIN; - end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096; + end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE; if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && gaddr < end) { /* The complete shadow table has to go */ @@ -2032,7 +2032,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, return; } /* Remove the page table tree from on specific entry */ - head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12); + head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); gmap_for_each_rmap_safe(rmap, rnext, head) { bits = rmap->raddr & _SHADOW_RMAP_MASK; raddr = rmap->raddr ^ bits; @@ -2076,7 +2076,7 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, struct gmap *gmap, *sg, *next; offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); - offset = offset * (4096 / sizeof(pte_t)); + offset = offset * (PAGE_SIZE / sizeof(pte_t)); rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); @@ -2121,6 +2121,37 @@ static inline void thp_split_mm(struct mm_struct *mm) } /* + * Remove all empty zero pages from the mapping for lazy refaulting + * - This must be called after mm->context.has_pgste is set, to avoid + * future creation of zero pages + * - This must be called after THP was enabled + */ +static int __zap_zero_pages(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + unsigned long addr; + + for (addr = start; addr != end; addr += PAGE_SIZE) { + pte_t *ptep; + spinlock_t *ptl; + + ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (is_zero_pfn(pte_pfn(*ptep))) + ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID)); + pte_unmap_unlock(ptep, ptl); + } + return 0; +} + +static inline void zap_zero_pages(struct mm_struct *mm) +{ + struct mm_walk walk = { .pmd_entry = __zap_zero_pages }; + + walk.mm = mm; + walk_page_range(0, TASK_SIZE, &walk); +} + +/* * switch on pgstes for its userspace process (for kvm) */ int s390_enable_sie(void) @@ -2137,6 +2168,7 @@ int s390_enable_sie(void) mm->context.has_pgste = 1; /* split thp mappings and disable thp for future mappings */ thp_split_mm(mm); + zap_zero_pages(mm); up_write(&mm->mmap_sem); return 0; } @@ -2149,13 +2181,6 @@ EXPORT_SYMBOL_GPL(s390_enable_sie); static int __s390_enable_skey(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { - /* - * Remove all zero page mappings, - * after establishing a policy to forbid zero page mappings - * following faults for that page will get fresh anonymous pages - */ - if (is_zero_pfn(pte_pfn(*pte))) - ptep_xchg_direct(walk->mm, addr, pte, __pte(_PAGE_INVALID)); /* Clear storage key */ ptep_zap_key(walk->mm, addr, pte); return 0; diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 8111694ce55a..3b567838b905 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -26,6 +26,7 @@ #include <linux/poison.h> #include <linux/initrd.h> #include <linux/export.h> +#include <linux/cma.h> #include <linux/gfp.h> #include <linux/memblock.h> #include <asm/processor.h> @@ -84,7 +85,7 @@ void __init paging_init(void) psw_t psw; init_mm.pgd = swapper_pg_dir; - if (VMALLOC_END > (1UL << 42)) { + if (VMALLOC_END > _REGION2_SIZE) { asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; pgd_type = _REGION2_ENTRY_EMPTY; } else { @@ -93,8 +94,7 @@ void __init paging_init(void) } init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; S390_lowcore.kernel_asce = init_mm.context.asce; - clear_table((unsigned long *) init_mm.pgd, pgd_type, - sizeof(unsigned long)*2048); + crst_table_init((unsigned long *) init_mm.pgd, pgd_type); vmem_map_init(); /* enable virtual mapping in kernel mode */ @@ -137,6 +137,8 @@ void __init mem_init(void) free_all_bootmem(); setup_zero_pages(); /* Setup zeroed pages. */ + cmma_init_nodat(); + mem_init_print_info(NULL); } @@ -166,6 +168,58 @@ unsigned long memory_block_size_bytes(void) } #ifdef CONFIG_MEMORY_HOTPLUG + +#ifdef CONFIG_CMA + +/* Prevent memory blocks which contain cma regions from going offline */ + +struct s390_cma_mem_data { + unsigned long start; + unsigned long end; +}; + +static int s390_cma_check_range(struct cma *cma, void *data) +{ + struct s390_cma_mem_data *mem_data; + unsigned long start, end; + + mem_data = data; + start = cma_get_base(cma); + end = start + cma_get_size(cma); + if (end < mem_data->start) + return 0; + if (start >= mem_data->end) + return 0; + return -EBUSY; +} + +static int s390_cma_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct s390_cma_mem_data mem_data; + struct memory_notify *arg; + int rc = 0; + + arg = data; + mem_data.start = arg->start_pfn << PAGE_SHIFT; + mem_data.end = mem_data.start + (arg->nr_pages << PAGE_SHIFT); + if (action == MEM_GOING_OFFLINE) + rc = cma_for_each_area(s390_cma_check_range, &mem_data); + return notifier_from_errno(rc); +} + +static struct notifier_block s390_cma_mem_nb = { + .notifier_call = s390_cma_mem_notifier, +}; + +static int __init s390_cma_mem_init(void) +{ + return register_memory_notifier(&s390_cma_mem_nb); +} +device_initcall(s390_cma_mem_init); + +#endif /* CONFIG_CMA */ + int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = PFN_DOWN(start); diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index 69a7b01ae746..07fa7b8ae233 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -10,9 +10,10 @@ #include <linux/errno.h> #include <linux/types.h> #include <linux/mm.h> +#include <linux/memblock.h> #include <linux/gfp.h> #include <linux/init.h> - +#include <asm/facility.h> #include <asm/page-states.h> static int cmma_flag = 1; @@ -36,14 +37,16 @@ __setup("cmma=", cmma); static inline int cmma_test_essa(void) { register unsigned long tmp asm("0") = 0; - register int rc asm("1") = -EOPNOTSUPP; + register int rc asm("1"); + /* test ESSA_GET_STATE */ asm volatile( - " .insn rrf,0xb9ab0000,%1,%1,0,0\n" + " .insn rrf,0xb9ab0000,%1,%1,%2,0\n" "0: la %0,0\n" "1:\n" EX_TABLE(0b,1b) - : "+&d" (rc), "+&d" (tmp)); + : "=&d" (rc), "+&d" (tmp) + : "i" (ESSA_GET_STATE), "0" (-EOPNOTSUPP)); return rc; } @@ -51,11 +54,26 @@ void __init cmma_init(void) { if (!cmma_flag) return; - if (cmma_test_essa()) + if (cmma_test_essa()) { cmma_flag = 0; + return; + } + if (test_facility(147)) + cmma_flag = 2; } -static inline void set_page_unstable(struct page *page, int order) +static inline unsigned char get_page_state(struct page *page) +{ + unsigned char state; + + asm volatile(" .insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (state) + : "a" (page_to_phys(page)), + "i" (ESSA_GET_STATE)); + return state & 0x3f; +} + +static inline void set_page_unused(struct page *page, int order) { int i, rc; @@ -66,14 +84,18 @@ static inline void set_page_unstable(struct page *page, int order) "i" (ESSA_SET_UNUSED)); } -void arch_free_page(struct page *page, int order) +static inline void set_page_stable_dat(struct page *page, int order) { - if (!cmma_flag) - return; - set_page_unstable(page, order); + int i, rc; + + for (i = 0; i < (1 << order); i++) + asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (rc) + : "a" (page_to_phys(page + i)), + "i" (ESSA_SET_STABLE)); } -static inline void set_page_stable(struct page *page, int order) +static inline void set_page_stable_nodat(struct page *page, int order) { int i, rc; @@ -81,14 +103,154 @@ static inline void set_page_stable(struct page *page, int order) asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" : "=&d" (rc) : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE)); + "i" (ESSA_SET_STABLE_NODAT)); +} + +static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + pmd_t *pmd; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd) || pmd_large(*pmd)) + continue; + page = virt_to_page(pmd_val(*pmd)); + set_bit(PG_arch_1, &page->flags); + } while (pmd++, addr = next, addr != end); +} + +static void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + pud_t *pud; + int i; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none(*pud) || pud_large(*pud)) + continue; + if (!pud_folded(*pud)) { + page = virt_to_page(pud_val(*pud)); + for (i = 0; i < 3; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_pmd(pud, addr, next); + } while (pud++, addr = next, addr != end); +} + +static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + p4d_t *p4d; + int i; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none(*p4d)) + continue; + if (!p4d_folded(*p4d)) { + page = virt_to_page(p4d_val(*p4d)); + for (i = 0; i < 3; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_pud(p4d, addr, next); + } while (p4d++, addr = next, addr != end); +} + +static void mark_kernel_pgd(void) +{ + unsigned long addr, next; + struct page *page; + pgd_t *pgd; + int i; + + addr = 0; + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, MODULES_END); + if (pgd_none(*pgd)) + continue; + if (!pgd_folded(*pgd)) { + page = virt_to_page(pgd_val(*pgd)); + for (i = 0; i < 3; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_p4d(pgd, addr, next); + } while (pgd++, addr = next, addr != MODULES_END); +} + +void __init cmma_init_nodat(void) +{ + struct memblock_region *reg; + struct page *page; + unsigned long start, end, ix; + + if (cmma_flag < 2) + return; + /* Mark pages used in kernel page tables */ + mark_kernel_pgd(); + + /* Set all kernel pages not used for page tables to stable/no-dat */ + for_each_memblock(memory, reg) { + start = memblock_region_memory_base_pfn(reg); + end = memblock_region_memory_end_pfn(reg); + page = pfn_to_page(start); + for (ix = start; ix < end; ix++, page++) { + if (__test_and_clear_bit(PG_arch_1, &page->flags)) + continue; /* skip page table pages */ + if (!list_empty(&page->lru)) + continue; /* skip free pages */ + set_page_stable_nodat(page, 0); + } + } +} + +void arch_free_page(struct page *page, int order) +{ + if (!cmma_flag) + return; + set_page_unused(page, order); } void arch_alloc_page(struct page *page, int order) { if (!cmma_flag) return; - set_page_stable(page, order); + if (cmma_flag < 2) + set_page_stable_dat(page, order); + else + set_page_stable_nodat(page, order); +} + +void arch_set_page_dat(struct page *page, int order) +{ + if (!cmma_flag) + return; + set_page_stable_dat(page, order); +} + +void arch_set_page_nodat(struct page *page, int order) +{ + if (cmma_flag < 2) + return; + set_page_stable_nodat(page, order); +} + +int arch_test_page_nodat(struct page *page) +{ + unsigned char state; + + if (cmma_flag < 2) + return 0; + state = get_page_state(page); + return !!(state & 0x20); } void arch_set_page_states(int make_stable) @@ -108,9 +270,9 @@ void arch_set_page_states(int make_stable) list_for_each(l, &zone->free_area[order].free_list[t]) { page = list_entry(l, struct page, lru); if (make_stable) - set_page_stable(page, order); + set_page_stable_dat(page, 0); else - set_page_unstable(page, order); + set_page_unused(page, order); } } spin_unlock_irqrestore(&zone->lock, flags); diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 180481589246..552f898dfa74 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -7,6 +7,7 @@ #include <asm/cacheflush.h> #include <asm/facility.h> #include <asm/pgtable.h> +#include <asm/pgalloc.h> #include <asm/page.h> #include <asm/set_memory.h> @@ -191,7 +192,7 @@ static int split_pud_page(pud_t *pudp, unsigned long addr) pud_t new; int i, ro, nx; - pm_dir = vmem_pmd_alloc(); + pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); if (!pm_dir) return -ENOMEM; pmd_addr = pud_pfn(*pudp) << PAGE_SHIFT; @@ -328,7 +329,7 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr) return; } for (i = 0; i < nr; i++) { - __ptep_ipte(address, pte, IPTE_GLOBAL); + __ptep_ipte(address, pte, 0, 0, IPTE_GLOBAL); address += PAGE_SIZE; pte++; } diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 18918e394ce4..c5b74dd61197 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -57,6 +57,7 @@ unsigned long *crst_table_alloc(struct mm_struct *mm) if (!page) return NULL; + arch_set_page_dat(page, 2); return (unsigned long *) page_to_phys(page); } @@ -82,7 +83,7 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) int rc, notify; /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ - BUG_ON(mm->context.asce_limit < (1UL << 42)); + BUG_ON(mm->context.asce_limit < _REGION2_SIZE); if (end >= TASK_SIZE_MAX) return -ENOMEM; rc = 0; @@ -95,11 +96,11 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) } spin_lock_bh(&mm->page_table_lock); pgd = (unsigned long *) mm->pgd; - if (mm->context.asce_limit == (1UL << 42)) { + if (mm->context.asce_limit == _REGION2_SIZE) { crst_table_init(table, _REGION2_ENTRY_EMPTY); p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd); mm->pgd = (pgd_t *) table; - mm->context.asce_limit = 1UL << 53; + mm->context.asce_limit = _REGION1_SIZE; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION2; } else { @@ -123,7 +124,7 @@ void crst_table_downgrade(struct mm_struct *mm) pgd_t *pgd; /* downgrade should only happen from 3 to 2 levels (compat only) */ - BUG_ON(mm->context.asce_limit != (1UL << 42)); + BUG_ON(mm->context.asce_limit != _REGION2_SIZE); if (current->active_mm == mm) { clear_user_asce(); @@ -132,7 +133,7 @@ void crst_table_downgrade(struct mm_struct *mm) pgd = mm->pgd; mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); - mm->context.asce_limit = 1UL << 31; + mm->context.asce_limit = _REGION3_SIZE; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; crst_table_free(mm, (unsigned long *) pgd); @@ -214,6 +215,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) __free_page(page); return NULL; } + arch_set_page_dat(page, 0); /* Initialize page table */ table = (unsigned long *) page_to_phys(page); if (mm_alloc_pgste(mm)) { diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 4a1f7366b17a..4198a71b8fdd 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -25,8 +25,49 @@ #include <asm/mmu_context.h> #include <asm/page-states.h> +static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int nodat) +{ + unsigned long opt, asce; + + if (MACHINE_HAS_TLB_GUEST) { + opt = 0; + asce = READ_ONCE(mm->context.gmap_asce); + if (asce == 0UL || nodat) + opt |= IPTE_NODAT; + if (asce != -1UL) { + asce = asce ? : mm->context.asce; + opt |= IPTE_GUEST_ASCE; + } + __ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL); + } else { + __ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL); + } +} + +static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int nodat) +{ + unsigned long opt, asce; + + if (MACHINE_HAS_TLB_GUEST) { + opt = 0; + asce = READ_ONCE(mm->context.gmap_asce); + if (asce == 0UL || nodat) + opt |= IPTE_NODAT; + if (asce != -1UL) { + asce = asce ? : mm->context.asce; + opt |= IPTE_GUEST_ASCE; + } + __ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL); + } else { + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + } +} + static inline pte_t ptep_flush_direct(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + int nodat) { pte_t old; @@ -36,15 +77,16 @@ static inline pte_t ptep_flush_direct(struct mm_struct *mm, atomic_inc(&mm->context.flush_count); if (MACHINE_HAS_TLB_LC && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __ptep_ipte(addr, ptep, IPTE_LOCAL); + ptep_ipte_local(mm, addr, ptep, nodat); else - __ptep_ipte(addr, ptep, IPTE_GLOBAL); + ptep_ipte_global(mm, addr, ptep, nodat); atomic_dec(&mm->context.flush_count); return old; } static inline pte_t ptep_flush_lazy(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + int nodat) { pte_t old; @@ -57,7 +99,7 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, pte_val(*ptep) |= _PAGE_INVALID; mm->context.flush_mm = 1; } else - __ptep_ipte(addr, ptep, IPTE_GLOBAL); + ptep_ipte_global(mm, addr, ptep, nodat); atomic_dec(&mm->context.flush_count); return old; } @@ -229,10 +271,12 @@ pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, { pgste_t pgste; pte_t old; + int nodat; preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); - old = ptep_flush_direct(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_direct(mm, addr, ptep, nodat); old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); preempt_enable(); return old; @@ -244,10 +288,12 @@ pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, { pgste_t pgste; pte_t old; + int nodat; preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); - old = ptep_flush_lazy(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_lazy(mm, addr, ptep, nodat); old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); preempt_enable(); return old; @@ -259,10 +305,12 @@ pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, { pgste_t pgste; pte_t old; + int nodat; preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); - old = ptep_flush_lazy(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_lazy(mm, addr, ptep, nodat); if (mm_has_pgste(mm)) { pgste = pgste_update_all(old, pgste, mm); pgste_set(ptep, pgste); @@ -290,6 +338,28 @@ void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(ptep_modify_prot_commit); +static inline void pmdp_idte_local(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_LOCAL); + else + __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); +} + +static inline void pmdp_idte_global(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); + else + __pmdp_csp(pmdp); +} + static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { @@ -298,16 +368,12 @@ static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, old = *pmdp; if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) return old; - if (!MACHINE_HAS_IDTE) { - __pmdp_csp(pmdp); - return old; - } atomic_inc(&mm->context.flush_count); if (MACHINE_HAS_TLB_LC && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __pmdp_idte(addr, pmdp, IDTE_LOCAL); + pmdp_idte_local(mm, addr, pmdp); else - __pmdp_idte(addr, pmdp, IDTE_GLOBAL); + pmdp_idte_global(mm, addr, pmdp); atomic_dec(&mm->context.flush_count); return old; } @@ -325,10 +391,9 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, cpumask_of(smp_processor_id()))) { pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID; mm->context.flush_mm = 1; - } else if (MACHINE_HAS_IDTE) - __pmdp_idte(addr, pmdp, IDTE_GLOBAL); - else - __pmdp_csp(pmdp); + } else { + pmdp_idte_global(mm, addr, pmdp); + } atomic_dec(&mm->context.flush_count); return old; } @@ -359,28 +424,46 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(pmdp_xchg_lazy); -static inline pud_t pudp_flush_direct(struct mm_struct *mm, - unsigned long addr, pud_t *pudp) +static inline void pudp_idte_local(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) { - pud_t old; + if (MACHINE_HAS_TLB_GUEST) + __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_LOCAL); + else + __pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL); +} - old = *pudp; - if (pud_val(old) & _REGION_ENTRY_INVALID) - return old; - if (!MACHINE_HAS_IDTE) { +static inline void pudp_idte_global(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); + else /* * Invalid bit position is the same for pmd and pud, so we can * re-use _pmd_csp() here */ __pmdp_csp((pmd_t *) pudp); +} + +static inline pud_t pudp_flush_direct(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + pud_t old; + + old = *pudp; + if (pud_val(old) & _REGION_ENTRY_INVALID) return old; - } atomic_inc(&mm->context.flush_count); if (MACHINE_HAS_TLB_LC && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __pudp_idte(addr, pudp, IDTE_LOCAL); + pudp_idte_local(mm, addr, pudp); else - __pudp_idte(addr, pudp, IDTE_GLOBAL); + pudp_idte_global(mm, addr, pudp); atomic_dec(&mm->context.flush_count); return old; } @@ -482,7 +565,7 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, { pte_t entry; pgste_t pgste; - int pte_i, pte_p; + int pte_i, pte_p, nodat; pgste = pgste_get_lock(ptep); entry = *ptep; @@ -495,13 +578,14 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, return -EAGAIN; } /* Change access rights and set pgste bit */ + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); if (prot == PROT_NONE && !pte_i) { - ptep_flush_direct(mm, addr, ptep); + ptep_flush_direct(mm, addr, ptep, nodat); pgste = pgste_update_all(entry, pgste, mm); pte_val(entry) |= _PAGE_INVALID; } if (prot == PROT_READ && !pte_p) { - ptep_flush_direct(mm, addr, ptep); + ptep_flush_direct(mm, addr, ptep, nodat); pte_val(entry) &= ~_PAGE_INVALID; pte_val(entry) |= _PAGE_PROTECT; } @@ -541,10 +625,12 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) { pgste_t pgste; + int nodat; pgste = pgste_get_lock(ptep); /* notifier is called by the caller */ - ptep_flush_direct(mm, saddr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + ptep_flush_direct(mm, saddr, ptep, nodat); /* don't touch the storage key - it belongs to parent pgste */ pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID)); pgste_set_unlock(ptep, pgste); @@ -617,6 +703,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) pte_t *ptep; pte_t pte; bool dirty; + int nodat; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); @@ -645,7 +732,8 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) pte = *ptep; if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { pgste = pgste_pte_notify(mm, addr, ptep, pgste); - __ptep_ipte(addr, ptep, IPTE_GLOBAL); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + ptep_ipte_global(mm, addr, ptep, nodat); if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) pte_val(pte) |= _PAGE_PROTECT; else diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index d8398962a723..c0af0d7b6e5f 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -38,37 +38,14 @@ static void __ref *vmem_alloc_pages(unsigned int order) return (void *) memblock_alloc(size, size); } -static inline p4d_t *vmem_p4d_alloc(void) +void *vmem_crst_alloc(unsigned long val) { - p4d_t *p4d = NULL; + unsigned long *table; - p4d = vmem_alloc_pages(2); - if (!p4d) - return NULL; - clear_table((unsigned long *) p4d, _REGION2_ENTRY_EMPTY, PAGE_SIZE * 4); - return p4d; -} - -static inline pud_t *vmem_pud_alloc(void) -{ - pud_t *pud = NULL; - - pud = vmem_alloc_pages(2); - if (!pud) - return NULL; - clear_table((unsigned long *) pud, _REGION3_ENTRY_EMPTY, PAGE_SIZE * 4); - return pud; -} - -pmd_t *vmem_pmd_alloc(void) -{ - pmd_t *pmd = NULL; - - pmd = vmem_alloc_pages(2); - if (!pmd) - return NULL; - clear_table((unsigned long *) pmd, _SEGMENT_ENTRY_EMPTY, PAGE_SIZE * 4); - return pmd; + table = vmem_alloc_pages(CRST_ALLOC_ORDER); + if (table) + crst_table_init(table, val); + return table; } pte_t __ref *vmem_pte_alloc(void) @@ -114,14 +91,14 @@ static int vmem_add_mem(unsigned long start, unsigned long size) while (address < end) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { - p4_dir = vmem_p4d_alloc(); + p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); if (!p4_dir) goto out; pgd_populate(&init_mm, pg_dir, p4_dir); } p4_dir = p4d_offset(pg_dir, address); if (p4d_none(*p4_dir)) { - pu_dir = vmem_pud_alloc(); + pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); if (!pu_dir) goto out; p4d_populate(&init_mm, p4_dir, pu_dir); @@ -136,7 +113,7 @@ static int vmem_add_mem(unsigned long start, unsigned long size) continue; } if (pud_none(*pu_dir)) { - pm_dir = vmem_pmd_alloc(); + pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); if (!pm_dir) goto out; pud_populate(&init_mm, pu_dir, pm_dir); @@ -253,7 +230,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) for (address = start; address < end;) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { - p4_dir = vmem_p4d_alloc(); + p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); if (!p4_dir) goto out; pgd_populate(&init_mm, pg_dir, p4_dir); @@ -261,7 +238,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) p4_dir = p4d_offset(pg_dir, address); if (p4d_none(*p4_dir)) { - pu_dir = vmem_pud_alloc(); + pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); if (!pu_dir) goto out; p4d_populate(&init_mm, p4_dir, pu_dir); @@ -269,7 +246,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) pu_dir = pud_offset(p4_dir, address); if (pud_none(*pu_dir)) { - pm_dir = vmem_pmd_alloc(); + pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); if (!pm_dir) goto out; pud_populate(&init_mm, pu_dir, pm_dir); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 1803797fc885..8ec88497a28d 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1093,15 +1093,27 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i case BPF_JMP | BPF_JSGT | BPF_K: /* ((s64) dst > (s64) imm) */ mask = 0x2000; /* jh */ goto branch_ks; + case BPF_JMP | BPF_JSLT | BPF_K: /* ((s64) dst < (s64) imm) */ + mask = 0x4000; /* jl */ + goto branch_ks; case BPF_JMP | BPF_JSGE | BPF_K: /* ((s64) dst >= (s64) imm) */ mask = 0xa000; /* jhe */ goto branch_ks; + case BPF_JMP | BPF_JSLE | BPF_K: /* ((s64) dst <= (s64) imm) */ + mask = 0xc000; /* jle */ + goto branch_ks; case BPF_JMP | BPF_JGT | BPF_K: /* (dst_reg > imm) */ mask = 0x2000; /* jh */ goto branch_ku; + case BPF_JMP | BPF_JLT | BPF_K: /* (dst_reg < imm) */ + mask = 0x4000; /* jl */ + goto branch_ku; case BPF_JMP | BPF_JGE | BPF_K: /* (dst_reg >= imm) */ mask = 0xa000; /* jhe */ goto branch_ku; + case BPF_JMP | BPF_JLE | BPF_K: /* (dst_reg <= imm) */ + mask = 0xc000; /* jle */ + goto branch_ku; case BPF_JMP | BPF_JNE | BPF_K: /* (dst_reg != imm) */ mask = 0x7000; /* jne */ goto branch_ku; @@ -1119,15 +1131,27 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i case BPF_JMP | BPF_JSGT | BPF_X: /* ((s64) dst > (s64) src) */ mask = 0x2000; /* jh */ goto branch_xs; + case BPF_JMP | BPF_JSLT | BPF_X: /* ((s64) dst < (s64) src) */ + mask = 0x4000; /* jl */ + goto branch_xs; case BPF_JMP | BPF_JSGE | BPF_X: /* ((s64) dst >= (s64) src) */ mask = 0xa000; /* jhe */ goto branch_xs; + case BPF_JMP | BPF_JSLE | BPF_X: /* ((s64) dst <= (s64) src) */ + mask = 0xc000; /* jle */ + goto branch_xs; case BPF_JMP | BPF_JGT | BPF_X: /* (dst > src) */ mask = 0x2000; /* jh */ goto branch_xu; + case BPF_JMP | BPF_JLT | BPF_X: /* (dst < src) */ + mask = 0x4000; /* jl */ + goto branch_xu; case BPF_JMP | BPF_JGE | BPF_X: /* (dst >= src) */ mask = 0xa000; /* jhe */ goto branch_xu; + case BPF_JMP | BPF_JLE | BPF_X: /* (dst <= src) */ + mask = 0xc000; /* jle */ + goto branch_xu; case BPF_JMP | BPF_JNE | BPF_X: /* (dst != src) */ mask = 0x7000; /* jne */ goto branch_xu; diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index bd534b4d40e3..0ae3936e266f 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -24,6 +24,14 @@ bool zpci_unique_uid; +static void update_uid_checking(bool new) +{ + if (zpci_unique_uid != new) + zpci_dbg(1, "uid checking:%d\n", new); + + zpci_unique_uid = new; +} + static inline void zpci_err_clp(unsigned int rsp, int rc) { struct { @@ -319,7 +327,7 @@ static int clp_list_pci(struct clp_req_rsp_list_pci *rrb, void *data, goto out; } - zpci_unique_uid = rrb->response.uid_checking; + update_uid_checking(rrb->response.uid_checking); WARN_ON_ONCE(rrb->response.entry_size != sizeof(struct clp_fh_list_entry)); diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 025ea20fc4b4..29d72bf8ed2b 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -41,7 +41,7 @@ static struct facility_def facility_defs[] = { 27, /* mvcos */ 32, /* compare and swap and store */ 33, /* compare and swap and store 2 */ - 34, /* general extension facility */ + 34, /* general instructions extension */ 35, /* execute extensions */ #endif #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES @@ -54,6 +54,9 @@ static struct facility_def facility_defs[] = { #ifdef CONFIG_HAVE_MARCH_Z13_FEATURES 53, /* load-and-zero-rightmost-byte, etc. */ #endif +#ifdef CONFIG_HAVE_MARCH_Z14_FEATURES + 58, /* miscellaneous-instruction-extension 2 */ +#endif -1 /* END */ } }, diff --git a/arch/sh/configs/se7751_defconfig b/arch/sh/configs/se7751_defconfig index 75c92fc1876b..56b5e4ce8d4a 100644 --- a/arch/sh/configs/se7751_defconfig +++ b/arch/sh/configs/se7751_defconfig @@ -28,7 +28,6 @@ CONFIG_IP_PNP_RARP=y # CONFIG_INET_LRO is not set # CONFIG_IPV6 is not set CONFIG_NETFILTER=y -CONFIG_NETFILTER_DEBUG=y CONFIG_IP_NF_QUEUE=y CONFIG_MTD=y CONFIG_MTD_PARTITIONS=y diff --git a/arch/sh/include/asm/futex.h b/arch/sh/include/asm/futex.h index d0078747d308..8f8cf941a8cd 100644 --- a/arch/sh/include/asm/futex.h +++ b/arch/sh/include/asm/futex.h @@ -27,21 +27,12 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval); } -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - u32 oparg = (encoded_op << 8) >> 20; - u32 cmparg = (encoded_op << 20) >> 20; u32 oldval, newval, prev; int ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); do { @@ -80,17 +71,8 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = ((int)oldval < (int)cmparg); break; - case FUTEX_OP_CMP_GE: ret = ((int)oldval >= (int)cmparg); break; - case FUTEX_OP_CMP_LE: ret = ((int)oldval <= (int)cmparg); break; - case FUTEX_OP_CMP_GT: ret = ((int)oldval > (int)cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; return ret; } diff --git a/arch/sh/include/asm/spinlock-cas.h b/arch/sh/include/asm/spinlock-cas.h index c46e8cc7b515..5ed7dbbd94ff 100644 --- a/arch/sh/include/asm/spinlock-cas.h +++ b/arch/sh/include/asm/spinlock-cas.h @@ -29,11 +29,6 @@ static inline unsigned __sl_cas(volatile unsigned *p, unsigned old, unsigned new #define arch_spin_is_locked(x) ((x)->lock <= 0) #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, VAL > 0); -} - static inline void arch_spin_lock(arch_spinlock_t *lock) { while (!__sl_cas(&lock->lock, 1, 0)); diff --git a/arch/sh/include/asm/spinlock-llsc.h b/arch/sh/include/asm/spinlock-llsc.h index cec78143fa83..f77263aae760 100644 --- a/arch/sh/include/asm/spinlock-llsc.h +++ b/arch/sh/include/asm/spinlock-llsc.h @@ -21,11 +21,6 @@ #define arch_spin_is_locked(x) ((x)->lock <= 0) #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, VAL > 0); -} - /* * Simple spin lock operations. There are two variants, one clears IRQ's * on the local processor, one does not. diff --git a/arch/sparc/crypto/aes_glue.c b/arch/sparc/crypto/aes_glue.c index c90930de76ba..3cd4f6b198b6 100644 --- a/arch/sparc/crypto/aes_glue.c +++ b/arch/sparc/crypto/aes_glue.c @@ -344,8 +344,7 @@ static void ctr_crypt_final(struct crypto_sparc64_aes_ctx *ctx, ctx->ops->ecb_encrypt(&ctx->key[0], (const u64 *)ctrblk, keystream, AES_BLOCK_SIZE); - crypto_xor((u8 *) keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, (u8 *) keystream, src, nbytes); crypto_inc(ctrblk, AES_BLOCK_SIZE); } diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h index ee3f11c43cda..7643e979e333 100644 --- a/arch/sparc/include/asm/atomic_32.h +++ b/arch/sparc/include/asm/atomic_32.h @@ -29,6 +29,8 @@ int atomic_xchg(atomic_t *, int); int __atomic_add_unless(atomic_t *, int, int); void atomic_set(atomic_t *, int); +#define atomic_set_release(v, i) atomic_set((v), (i)) + #define atomic_read(v) ACCESS_ONCE((v)->counter) #define atomic_add(i, v) ((void)atomic_add_return( (int)(i), (v))) diff --git a/arch/sparc/include/asm/futex_64.h b/arch/sparc/include/asm/futex_64.h index 4e899b0dabf7..1cfd89d92208 100644 --- a/arch/sparc/include/asm/futex_64.h +++ b/arch/sparc/include/asm/futex_64.h @@ -29,22 +29,14 @@ : "r" (uaddr), "r" (oparg), "i" (-EFAULT) \ : "memory") -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tem; - if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) - return -EFAULT; if (unlikely((((unsigned long) uaddr) & 0x3UL))) return -EINVAL; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - pagefault_disable(); switch (op) { @@ -69,17 +61,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h index 8011e79f59c9..67345b2dc408 100644 --- a/arch/sparc/include/asm/spinlock_32.h +++ b/arch/sparc/include/asm/spinlock_32.h @@ -14,11 +14,6 @@ #define arch_spin_is_locked(lock) (*((volatile unsigned char *)(lock)) != 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - static inline void arch_spin_lock(arch_spinlock_t *lock) { __asm__ __volatile__( diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 186fd8199f54..b2f5c50d0947 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -98,6 +98,8 @@ #define SO_PEERGROUPS 0x003d +#define SO_ZEROCOPY 0x003e + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 8799ae9a8788..c340af7b1371 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -128,6 +128,8 @@ static u32 WDISP10(u32 off) #define BA (BRANCH | CONDA) #define BG (BRANCH | CONDG) +#define BL (BRANCH | CONDL) +#define BLE (BRANCH | CONDLE) #define BGU (BRANCH | CONDGU) #define BLEU (BRANCH | CONDLEU) #define BGE (BRANCH | CONDGE) @@ -715,9 +717,15 @@ static int emit_compare_and_branch(const u8 code, const u8 dst, u8 src, case BPF_JGT: br_opcode = BGU; break; + case BPF_JLT: + br_opcode = BLU; + break; case BPF_JGE: br_opcode = BGEU; break; + case BPF_JLE: + br_opcode = BLEU; + break; case BPF_JSET: case BPF_JNE: br_opcode = BNE; @@ -725,9 +733,15 @@ static int emit_compare_and_branch(const u8 code, const u8 dst, u8 src, case BPF_JSGT: br_opcode = BG; break; + case BPF_JSLT: + br_opcode = BL; + break; case BPF_JSGE: br_opcode = BGE; break; + case BPF_JSLE: + br_opcode = BLE; + break; default: /* Make sure we dont leak kernel information to the * user. @@ -746,18 +760,30 @@ static int emit_compare_and_branch(const u8 code, const u8 dst, u8 src, case BPF_JGT: cbcond_opcode = CBCONDGU; break; + case BPF_JLT: + cbcond_opcode = CBCONDLU; + break; case BPF_JGE: cbcond_opcode = CBCONDGEU; break; + case BPF_JLE: + cbcond_opcode = CBCONDLEU; + break; case BPF_JNE: cbcond_opcode = CBCONDNE; break; case BPF_JSGT: cbcond_opcode = CBCONDG; break; + case BPF_JSLT: + cbcond_opcode = CBCONDL; + break; case BPF_JSGE: cbcond_opcode = CBCONDGE; break; + case BPF_JSLE: + cbcond_opcode = CBCONDLE; + break; default: /* Make sure we dont leak kernel information to the * user. @@ -1176,10 +1202,14 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) /* IF (dst COND src) JUMP off */ case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JLT | BPF_X: case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JLE | BPF_X: case BPF_JMP | BPF_JNE | BPF_X: case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_X: case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_X: case BPF_JMP | BPF_JSET | BPF_X: { int err; @@ -1191,10 +1221,14 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) /* IF (dst COND imm) JUMP off */ case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: case BPF_JMP | BPF_JSET | BPF_K: { int err; diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h index a93774255136..53a423e7cb92 100644 --- a/arch/tile/include/asm/atomic_32.h +++ b/arch/tile/include/asm/atomic_32.h @@ -101,6 +101,8 @@ static inline void atomic_set(atomic_t *v, int n) _atomic_xchg(&v->counter, n); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + /* A 64bit atomic type */ typedef struct { diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h index e64a1b75fc38..83c1e639b411 100644 --- a/arch/tile/include/asm/futex.h +++ b/arch/tile/include/asm/futex.h @@ -106,12 +106,9 @@ lock = __atomic_hashed_lock((int __force *)uaddr) #endif -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int uninitialized_var(val), ret; __futex_prolog(); @@ -119,12 +116,6 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) /* The 32-bit futex code makes this assumption, so validate it here. */ BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int)); - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { case FUTEX_OP_SET: @@ -148,30 +139,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) } pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (val == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (val != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (val < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (val >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (val <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (val > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = val; + return ret; } diff --git a/arch/tile/include/asm/spinlock_32.h b/arch/tile/include/asm/spinlock_32.h index b14b1ba5bf9c..cba8ba9b8da6 100644 --- a/arch/tile/include/asm/spinlock_32.h +++ b/arch/tile/include/asm/spinlock_32.h @@ -64,8 +64,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) lock->current_ticket = old_ticket + TICKET_QUANTUM; } -void arch_spin_unlock_wait(arch_spinlock_t *lock); - /* * Read-write spinlocks, allowing multiple readers * but only one writer. diff --git a/arch/tile/include/asm/spinlock_64.h b/arch/tile/include/asm/spinlock_64.h index b9718fb4e74a..9a2c2d605752 100644 --- a/arch/tile/include/asm/spinlock_64.h +++ b/arch/tile/include/asm/spinlock_64.h @@ -58,8 +58,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) __insn_fetchadd4(&lock->lock, 1U << __ARCH_SPIN_CURRENT_SHIFT); } -void arch_spin_unlock_wait(arch_spinlock_t *lock); - void arch_spin_lock_slow(arch_spinlock_t *lock, u32 val); /* Grab the "next" ticket number and bump it atomically. diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c index 076c6cc43113..db9333f2447c 100644 --- a/arch/tile/lib/spinlock_32.c +++ b/arch/tile/lib/spinlock_32.c @@ -62,29 +62,6 @@ int arch_spin_trylock(arch_spinlock_t *lock) } EXPORT_SYMBOL(arch_spin_trylock); -void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - u32 iterations = 0; - int curr = READ_ONCE(lock->current_ticket); - int next = READ_ONCE(lock->next_ticket); - - /* Return immediately if unlocked. */ - if (next == curr) - return; - - /* Wait until the current locker has released the lock. */ - do { - delay_backoff(iterations++); - } while (READ_ONCE(lock->current_ticket) == curr); - - /* - * The TILE architecture doesn't do read speculation; therefore - * a control dependency guarantees a LOAD->{LOAD,STORE} order. - */ - barrier(); -} -EXPORT_SYMBOL(arch_spin_unlock_wait); - /* * The low byte is always reserved to be the marker for a "tns" operation * since the low bit is set to "1" by a tns. The next seven bits are diff --git a/arch/tile/lib/spinlock_64.c b/arch/tile/lib/spinlock_64.c index a4b5b2cbce93..de414c22892f 100644 --- a/arch/tile/lib/spinlock_64.c +++ b/arch/tile/lib/spinlock_64.c @@ -62,28 +62,6 @@ int arch_spin_trylock(arch_spinlock_t *lock) } EXPORT_SYMBOL(arch_spin_trylock); -void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - u32 iterations = 0; - u32 val = READ_ONCE(lock->lock); - u32 curr = arch_spin_current(val); - - /* Return immediately if unlocked. */ - if (arch_spin_next(val) == curr) - return; - - /* Wait until the current locker has released the lock. */ - do { - delay_backoff(iterations++); - } while (arch_spin_current(READ_ONCE(lock->lock)) == curr); - - /* - * The TILE architecture doesn't do read speculation; therefore - * a control dependency guarantees a LOAD->{LOAD,STORE} order. - */ - barrier(); -} -EXPORT_SYMBOL(arch_spin_unlock_wait); /* * If the read lock fails due to a writer, we retry periodically diff --git a/arch/um/include/asm/unwind.h b/arch/um/include/asm/unwind.h new file mode 100644 index 000000000000..7ffa5437b761 --- /dev/null +++ b/arch/um/include/asm/unwind.h @@ -0,0 +1,8 @@ +#ifndef _ASM_UML_UNWIND_H +#define _ASM_UML_UNWIND_H + +static inline void +unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size) {} + +#endif /* _ASM_UML_UNWIND_H */ diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 586b786b3edf..0038a2d10a7a 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -8,10 +8,7 @@ obj-$(CONFIG_KVM) += kvm/ obj-$(CONFIG_XEN) += xen/ # Hyper-V paravirtualization support -obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/ - -# lguest paravirtualization support -obj-$(CONFIG_LGUEST_GUEST) += lguest/ +obj-$(subst m,y,$(CONFIG_HYPERV)) += hyperv/ obj-y += realmode/ obj-y += kernel/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 323cb065be5e..4b278a33ccbb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -55,6 +55,8 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_PMEM_API if X86_64 + # Causing hangs/crashes, see the commit that added this change for details. + select ARCH_HAS_REFCOUNT if BROKEN select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN @@ -73,7 +75,6 @@ config X86 select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH - select ARCH_WANT_FRAME_POINTERS select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_EXTABLE_SORT @@ -158,6 +159,7 @@ config X86 select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS + select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_OPROFILE select HAVE_OPTPROBES @@ -167,8 +169,9 @@ config X86 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER && STACK_VALIDATION + select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION select HAVE_STACK_VALIDATION if X86_64 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK @@ -327,6 +330,7 @@ config FIX_EARLYCON_MEM config PGTABLE_LEVELS int + default 5 if X86_5LEVEL default 4 if X86_64 default 3 if X86_PAE default 2 @@ -425,16 +429,16 @@ config GOLDFISH def_bool y depends on X86_GOLDFISH -config INTEL_RDT_A - bool "Intel Resource Director Technology Allocation support" +config INTEL_RDT + bool "Intel Resource Director Technology support" default n depends on X86 && CPU_SUP_INTEL select KERNFS help - Select to enable resource allocation which is a sub-feature of - Intel Resource Director Technology(RDT). More information about - RDT can be found in the Intel x86 Architecture Software - Developer Manual. + Select to enable resource allocation and monitoring which are + sub-features of Intel Resource Director Technology(RDT). More + information about RDT can be found in the Intel x86 + Architecture Software Developer Manual. Say N if unsure. @@ -778,8 +782,6 @@ config KVM_DEBUG_FS Statistics are displayed in debugfs filesystem. Enabling this option may incur significant overhead. -source "arch/x86/lguest/Kconfig" - config PARAVIRT_TIME_ACCOUNTING bool "Paravirtual steal time accounting" depends on PARAVIRT @@ -1399,6 +1401,24 @@ config X86_PAE has the cost of more pagetable lookup overhead, and also consumes more pagetable space per process. +config X86_5LEVEL + bool "Enable 5-level page tables support" + depends on X86_64 + ---help--- + 5-level paging enables access to larger address space: + upto 128 PiB of virtual address space and 4 PiB of + physical address space. + + It will be supported by future Intel CPUs. + + Note: a kernel with this option enabled can only be booted + on machines that support the feature. + + See Documentation/x86/x86_64/5level-paging.txt for more + information. + + Say N if unsure. + config ARCH_PHYS_ADDR_T_64BIT def_bool y depends on X86_64 || X86_PAE @@ -1416,6 +1436,35 @@ config X86_DIRECT_GBPAGES supports them), so don't confuse the user by printing that we have them enabled. +config ARCH_HAS_MEM_ENCRYPT + def_bool y + +config AMD_MEM_ENCRYPT + bool "AMD Secure Memory Encryption (SME) support" + depends on X86_64 && CPU_SUP_AMD + ---help--- + Say yes to enable support for the encryption of system memory. + This requires an AMD processor that supports Secure Memory + Encryption (SME). + +config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT + bool "Activate AMD Secure Memory Encryption (SME) by default" + default y + depends on AMD_MEM_ENCRYPT + ---help--- + Say yes to have system memory encrypted by default if running on + an AMD processor that supports Secure Memory Encryption (SME). + + If set to Y, then the encryption of system memory can be + deactivated with the mem_encrypt=off command line option. + + If set to N, then the encryption of system memory can be + activated with the mem_encrypt=on command line option. + +config ARCH_USE_MEMREMAP_PROT + def_bool y + depends on AMD_MEM_ENCRYPT + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" @@ -1757,7 +1806,9 @@ config X86_SMAP config X86_INTEL_MPX prompt "Intel MPX (Memory Protection Extensions)" def_bool n - depends on CPU_SUP_INTEL + # Note: only available in 64-bit mode due to VMA flags shortage + depends on CPU_SUP_INTEL && X86_64 + select ARCH_USES_HIGH_VMA_FLAGS ---help--- MPX provides hardware features that can be used in conjunction with compiler-instrumented code to check diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index cd20ca0b4043..71a48a30fc84 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -305,8 +305,6 @@ config DEBUG_ENTRY Some of these sanity checks may slow down kernel entries and exits or otherwise impact performance. - This is currently used to help test NMI code. - If unsure, say N. config DEBUG_NMI_SELFTEST @@ -358,4 +356,61 @@ config PUNIT_ATOM_DEBUG The current power state can be read from /sys/kernel/debug/punit_atom/dev_power_state +choice + prompt "Choose kernel unwinder" + default FRAME_POINTER_UNWINDER + ---help--- + This determines which method will be used for unwinding kernel stack + traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack, + livepatch, lockdep, and more. + +config FRAME_POINTER_UNWINDER + bool "Frame pointer unwinder" + select FRAME_POINTER + ---help--- + This option enables the frame pointer unwinder for unwinding kernel + stack traces. + + The unwinder itself is fast and it uses less RAM than the ORC + unwinder, but the kernel text size will grow by ~3% and the kernel's + overall performance will degrade by roughly 5-10%. + + This option is recommended if you want to use the livepatch + consistency model, as this is currently the only way to get a + reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). + +config ORC_UNWINDER + bool "ORC unwinder" + depends on X86_64 + select STACK_VALIDATION + ---help--- + This option enables the ORC (Oops Rewind Capability) unwinder for + unwinding kernel stack traces. It uses a custom data format which is + a simplified version of the DWARF Call Frame Information standard. + + This unwinder is more accurate across interrupt entry frames than the + frame pointer unwinder. It also enables a 5-10% performance + improvement across the entire kernel compared to frame pointers. + + Enabling this option will increase the kernel's runtime memory usage + by roughly 2-4MB, depending on your kernel config. + +config GUESS_UNWINDER + bool "Guess unwinder" + depends on EXPERT + ---help--- + This option enables the "guess" unwinder for unwinding kernel stack + traces. It scans the stack and reports every kernel text address it + finds. Some of the addresses it reports may be incorrect. + + While this option often produces false positives, it can still be + useful in many cases. Unlike the other unwinders, it has no runtime + overhead. + +endchoice + +config FRAME_POINTER + depends on !ORC_UNWINDER && !GUESS_UNWINDER + bool + endmenu diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1e902f926be3..6276572259c8 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -14,9 +14,11 @@ endif # For gcc stack alignment is specified with -mpreferred-stack-boundary, # clang has the option -mstack-alignment for that purpose. ifneq ($(call cc-option, -mpreferred-stack-boundary=4),) - cc_stack_align_opt := -mpreferred-stack-boundary -else ifneq ($(call cc-option, -mstack-alignment=4),) - cc_stack_align_opt := -mstack-alignment + cc_stack_align4 := -mpreferred-stack-boundary=2 + cc_stack_align8 := -mpreferred-stack-boundary=3 +else ifneq ($(call cc-option, -mstack-alignment=16),) + cc_stack_align4 := -mstack-alignment=4 + cc_stack_align8 := -mstack-alignment=8 endif # How to compile the 16-bit code. Note we always compile for -march=i386; @@ -36,7 +38,7 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector) -REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align_opt)=2) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4)) export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit @@ -76,7 +78,7 @@ ifeq ($(CONFIG_X86_32),y) # Align the stack to the register width instead of using the default # alignment of 16 bytes. This reduces stack usage and the number of # alignment instructions. - KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=2) + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align4)) # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots: @@ -115,7 +117,7 @@ else # default alignment which keep the stack *mis*aligned. # Furthermore an alignment to the register width reduces stack usage # and the number of alignment instructions. - KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=3) + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align8)) # Use -mskip-rax-setup if supported. KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) @@ -232,9 +234,6 @@ KBUILD_CFLAGS += -Wno-sign-compare # KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -KBUILD_CFLAGS += $(mflags-y) -KBUILD_AFLAGS += $(mflags-y) - archscripts: scripts_basic $(Q)$(MAKE) $(build)=arch/x86/tools relocs diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index c3e869eaef0c..e56dbc67e837 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -767,7 +767,7 @@ static efi_status_t setup_e820(struct boot_params *params, m |= (u64)efi->efi_memmap_hi << 32; #endif - d = (efi_memory_desc_t *)(m + (i * efi->efi_memdesc_size)); + d = efi_early_memdesc_ptr(m, efi->efi_memdesc_size, i); switch (d->type) { case EFI_RESERVED_TYPE: case EFI_RUNTIME_SERVICES_CODE: @@ -997,6 +997,9 @@ struct boot_params *efi_main(struct efi_config *c, if (boot_params->secure_boot == efi_secureboot_mode_unset) boot_params->secure_boot = efi_get_secureboot(sys_table); + /* Ask the firmware to clear memory on unclean shutdown */ + efi_enable_reset_attack_mitigation(sys_table); + setup_graphics(boot_params); setup_efi_pci(boot_params); @@ -1058,7 +1061,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = DESC_TYPE_CODE_DATA; desc->dpl = 0; desc->p = 1; - desc->limit = 0xf; + desc->limit1 = 0xf; desc->avl = 0; desc->l = 0; desc->d = SEG_OP_SIZE_32BIT; @@ -1078,7 +1081,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = DESC_TYPE_CODE_DATA; desc->dpl = 0; desc->p = 1; - desc->limit = 0xf; + desc->limit1 = 0xf; desc->avl = 0; if (IS_ENABLED(CONFIG_X86_64)) { desc->l = 1; @@ -1099,7 +1102,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = DESC_TYPE_CODE_DATA; desc->dpl = 0; desc->p = 1; - desc->limit = 0xf; + desc->limit1 = 0xf; desc->avl = 0; desc->l = 0; desc->d = SEG_OP_SIZE_32BIT; @@ -1116,7 +1119,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = 0; desc->dpl = 0; desc->p = 1; - desc->limit = 0x0; + desc->limit1 = 0x0; desc->avl = 0; desc->l = 0; desc->d = 0; diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index d85b9625e836..11c68cf53d4e 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -61,71 +61,6 @@ __HEAD ENTRY(startup_32) -#ifdef CONFIG_EFI_STUB - jmp preferred_addr - - /* - * We don't need the return address, so set up the stack so - * efi_main() can find its arguments. - */ -ENTRY(efi_pe_entry) - add $0x4, %esp - - call 1f -1: popl %esi - subl $1b, %esi - - popl %ecx - movl %ecx, efi32_config(%esi) /* Handle */ - popl %ecx - movl %ecx, efi32_config+8(%esi) /* EFI System table pointer */ - - /* Relocate efi_config->call() */ - leal efi32_config(%esi), %eax - add %esi, 40(%eax) - pushl %eax - - call make_boot_params - cmpl $0, %eax - je fail - movl %esi, BP_code32_start(%eax) - popl %ecx - pushl %eax - pushl %ecx - jmp 2f /* Skip efi_config initialization */ - -ENTRY(efi32_stub_entry) - add $0x4, %esp - popl %ecx - popl %edx - - call 1f -1: popl %esi - subl $1b, %esi - - movl %ecx, efi32_config(%esi) /* Handle */ - movl %edx, efi32_config+8(%esi) /* EFI System table pointer */ - - /* Relocate efi_config->call() */ - leal efi32_config(%esi), %eax - add %esi, 40(%eax) - pushl %eax -2: - call efi_main - cmpl $0, %eax - movl %eax, %esi - jne 2f -fail: - /* EFI init failed, so hang. */ - hlt - jmp fail -2: - movl BP_code32_start(%esi), %eax - leal preferred_addr(%eax), %eax - jmp *%eax - -preferred_addr: -#endif cld /* * Test KEEP_SEGMENTS flag to see if the bootloader is asking @@ -208,6 +143,70 @@ preferred_addr: jmp *%eax ENDPROC(startup_32) +#ifdef CONFIG_EFI_STUB +/* + * We don't need the return address, so set up the stack so efi_main() can find + * its arguments. + */ +ENTRY(efi_pe_entry) + add $0x4, %esp + + call 1f +1: popl %esi + subl $1b, %esi + + popl %ecx + movl %ecx, efi32_config(%esi) /* Handle */ + popl %ecx + movl %ecx, efi32_config+8(%esi) /* EFI System table pointer */ + + /* Relocate efi_config->call() */ + leal efi32_config(%esi), %eax + add %esi, 40(%eax) + pushl %eax + + call make_boot_params + cmpl $0, %eax + je fail + movl %esi, BP_code32_start(%eax) + popl %ecx + pushl %eax + pushl %ecx + jmp 2f /* Skip efi_config initialization */ +ENDPROC(efi_pe_entry) + +ENTRY(efi32_stub_entry) + add $0x4, %esp + popl %ecx + popl %edx + + call 1f +1: popl %esi + subl $1b, %esi + + movl %ecx, efi32_config(%esi) /* Handle */ + movl %edx, efi32_config+8(%esi) /* EFI System table pointer */ + + /* Relocate efi_config->call() */ + leal efi32_config(%esi), %eax + add %esi, 40(%eax) + pushl %eax +2: + call efi_main + cmpl $0, %eax + movl %eax, %esi + jne 2f +fail: + /* EFI init failed, so hang. */ + hlt + jmp fail +2: + movl BP_code32_start(%esi), %eax + leal startup_32(%eax), %eax + jmp *%eax +ENDPROC(efi32_stub_entry) +#endif + .text relocated: diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index fbf4c32d0b62..b4a5d284391c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -243,65 +243,6 @@ ENTRY(startup_64) * that maps our entire kernel(text+data+bss+brk), zero page * and command line. */ -#ifdef CONFIG_EFI_STUB - /* - * The entry point for the PE/COFF executable is efi_pe_entry, so - * only legacy boot loaders will execute this jmp. - */ - jmp preferred_addr - -ENTRY(efi_pe_entry) - movq %rcx, efi64_config(%rip) /* Handle */ - movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */ - - leaq efi64_config(%rip), %rax - movq %rax, efi_config(%rip) - - call 1f -1: popq %rbp - subq $1b, %rbp - - /* - * Relocate efi_config->call(). - */ - addq %rbp, efi64_config+40(%rip) - - movq %rax, %rdi - call make_boot_params - cmpq $0,%rax - je fail - mov %rax, %rsi - leaq startup_32(%rip), %rax - movl %eax, BP_code32_start(%rsi) - jmp 2f /* Skip the relocation */ - -handover_entry: - call 1f -1: popq %rbp - subq $1b, %rbp - - /* - * Relocate efi_config->call(). - */ - movq efi_config(%rip), %rax - addq %rbp, 40(%rax) -2: - movq efi_config(%rip), %rdi - call efi_main - movq %rax,%rsi - cmpq $0,%rax - jne 2f -fail: - /* EFI init failed, so hang. */ - hlt - jmp fail -2: - movl BP_code32_start(%esi), %eax - leaq preferred_addr(%rax), %rax - jmp *%rax - -preferred_addr: -#endif /* Setup data segments. */ xorl %eax, %eax @@ -413,6 +354,59 @@ lvl5: jmp *%rax #ifdef CONFIG_EFI_STUB + +/* The entry point for the PE/COFF executable is efi_pe_entry. */ +ENTRY(efi_pe_entry) + movq %rcx, efi64_config(%rip) /* Handle */ + movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */ + + leaq efi64_config(%rip), %rax + movq %rax, efi_config(%rip) + + call 1f +1: popq %rbp + subq $1b, %rbp + + /* + * Relocate efi_config->call(). + */ + addq %rbp, efi64_config+40(%rip) + + movq %rax, %rdi + call make_boot_params + cmpq $0,%rax + je fail + mov %rax, %rsi + leaq startup_32(%rip), %rax + movl %eax, BP_code32_start(%rsi) + jmp 2f /* Skip the relocation */ + +handover_entry: + call 1f +1: popq %rbp + subq $1b, %rbp + + /* + * Relocate efi_config->call(). + */ + movq efi_config(%rip), %rax + addq %rbp, 40(%rax) +2: + movq efi_config(%rip), %rdi + call efi_main + movq %rax,%rsi + cmpq $0,%rax + jne 2f +fail: + /* EFI init failed, so hang. */ + hlt + jmp fail +2: + movl BP_code32_start(%esi), %eax + leaq startup_64(%rax), %rax + jmp *%rax +ENDPROC(efi_pe_entry) + .org 0x390 ENTRY(efi64_stub_entry) movq %rdi, efi64_config(%rip) /* Handle */ diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 91f27ab970ef..17818ba6906f 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -37,7 +37,9 @@ #include <linux/uts.h> #include <linux/utsname.h> #include <linux/ctype.h> +#include <linux/efi.h> #include <generated/utsrelease.h> +#include <asm/efi.h> /* Macros used by the included decompressor code below. */ #define STATIC @@ -479,35 +481,31 @@ static unsigned long slots_fetch_random(void) return 0; } -static void process_e820_entry(struct boot_e820_entry *entry, +static void process_mem_region(struct mem_vector *entry, unsigned long minimum, unsigned long image_size) { struct mem_vector region, overlap; struct slot_area slot_area; unsigned long start_orig, end; - struct boot_e820_entry cur_entry; - - /* Skip non-RAM entries. */ - if (entry->type != E820_TYPE_RAM) - return; + struct mem_vector cur_entry; /* On 32-bit, ignore entries entirely above our maximum. */ - if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE) + if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE) return; /* Ignore entries entirely below our minimum. */ - if (entry->addr + entry->size < minimum) + if (entry->start + entry->size < minimum) return; /* Ignore entries above memory limit */ - end = min(entry->size + entry->addr, mem_limit); - if (entry->addr >= end) + end = min(entry->size + entry->start, mem_limit); + if (entry->start >= end) return; - cur_entry.addr = entry->addr; - cur_entry.size = end - entry->addr; + cur_entry.start = entry->start; + cur_entry.size = end - entry->start; - region.start = cur_entry.addr; + region.start = cur_entry.start; region.size = cur_entry.size; /* Give up if slot area array is full. */ @@ -521,8 +519,8 @@ static void process_e820_entry(struct boot_e820_entry *entry, /* Potentially raise address to meet alignment needs. */ region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); - /* Did we raise the address above this e820 region? */ - if (region.start > cur_entry.addr + cur_entry.size) + /* Did we raise the address above the passed in memory entry? */ + if (region.start > cur_entry.start + cur_entry.size) return; /* Reduce size by any delta from the original address. */ @@ -562,31 +560,126 @@ static void process_e820_entry(struct boot_e820_entry *entry, } } -static unsigned long find_random_phys_addr(unsigned long minimum, - unsigned long image_size) +#ifdef CONFIG_EFI +/* + * Returns true if mirror region found (and must have been processed + * for slots adding) + */ +static bool +process_efi_entries(unsigned long minimum, unsigned long image_size) { + struct efi_info *e = &boot_params->efi_info; + bool efi_mirror_found = false; + struct mem_vector region; + efi_memory_desc_t *md; + unsigned long pmap; + char *signature; + u32 nr_desc; int i; - unsigned long addr; - /* Check if we had too many memmaps. */ - if (memmap_too_large) { - debug_putstr("Aborted e820 scan (more than 4 memmap= args)!\n"); - return 0; + signature = (char *)&e->efi_loader_signature; + if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) && + strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) + return false; + +#ifdef CONFIG_X86_32 + /* Can't handle data above 4GB at this time */ + if (e->efi_memmap_hi) { + warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n"); + return false; } + pmap = e->efi_memmap; +#else + pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); +#endif - /* Make sure minimum is aligned. */ - minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + nr_desc = e->efi_memmap_size / e->efi_memdesc_size; + for (i = 0; i < nr_desc; i++) { + md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); + if (md->attribute & EFI_MEMORY_MORE_RELIABLE) { + efi_mirror_found = true; + break; + } + } + + for (i = 0; i < nr_desc; i++) { + md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); + + /* + * Here we are more conservative in picking free memory than + * the EFI spec allows: + * + * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also + * free memory and thus available to place the kernel image into, + * but in practice there's firmware where using that memory leads + * to crashes. + * + * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free. + */ + if (md->type != EFI_CONVENTIONAL_MEMORY) + continue; + + if (efi_mirror_found && + !(md->attribute & EFI_MEMORY_MORE_RELIABLE)) + continue; + + region.start = md->phys_addr; + region.size = md->num_pages << EFI_PAGE_SHIFT; + process_mem_region(®ion, minimum, image_size); + if (slot_area_index == MAX_SLOT_AREA) { + debug_putstr("Aborted EFI scan (slot_areas full)!\n"); + break; + } + } + return true; +} +#else +static inline bool +process_efi_entries(unsigned long minimum, unsigned long image_size) +{ + return false; +} +#endif + +static void process_e820_entries(unsigned long minimum, + unsigned long image_size) +{ + int i; + struct mem_vector region; + struct boot_e820_entry *entry; /* Verify potential e820 positions, appending to slots list. */ for (i = 0; i < boot_params->e820_entries; i++) { - process_e820_entry(&boot_params->e820_table[i], minimum, - image_size); + entry = &boot_params->e820_table[i]; + /* Skip non-RAM entries. */ + if (entry->type != E820_TYPE_RAM) + continue; + region.start = entry->addr; + region.size = entry->size; + process_mem_region(®ion, minimum, image_size); if (slot_area_index == MAX_SLOT_AREA) { debug_putstr("Aborted e820 scan (slot_areas full)!\n"); break; } } +} + +static unsigned long find_random_phys_addr(unsigned long minimum, + unsigned long image_size) +{ + /* Check if we had too many memmaps. */ + if (memmap_too_large) { + debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n"); + return 0; + } + + /* Make sure minimum is aligned. */ + minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + + if (process_efi_entries(minimum, image_size)) + return slots_fetch_random(); + process_e820_entries(minimum, image_size); return slots_fetch_random(); } @@ -645,7 +738,7 @@ void choose_random_location(unsigned long input, */ min_addr = min(*output, 512UL << 20); - /* Walk e820 and find a random address. */ + /* Walk available memory entries to find a random address. */ random_addr = find_random_phys_addr(min_addr, output_size); if (!random_addr) { warn("Physical KASLR disabled: no suitable memory region!"); diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 28029be47fbb..f1aa43854bed 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -15,6 +15,13 @@ #define __pa(x) ((unsigned long)(x)) #define __va(x) ((void *)((unsigned long)(x))) +/* + * The pgtable.h and mm/ident_map.c includes make use of the SME related + * information which is not used in the compressed image support. Un-define + * the SME support to avoid any compile and link errors. + */ +#undef CONFIG_AMD_MEM_ENCRYPT + #include "misc.h" /* These actually do the work of building the kernel identity maps. */ diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config index 4b429df40d7a..550cd5012b73 100644 --- a/arch/x86/configs/tiny.config +++ b/arch/x86/configs/tiny.config @@ -1,3 +1,5 @@ CONFIG_NOHIGHMEM=y # CONFIG_HIGHMEM4G is not set # CONFIG_HIGHMEM64G is not set +CONFIG_GUESS_UNWINDER=y +# CONFIG_FRAME_POINTER_UNWINDER is not set diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 4a55cdcdc008..5c15d6b57329 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -475,8 +475,8 @@ static void ctr_crypt_final(struct crypto_aes_ctx *ctx, unsigned int nbytes = walk->nbytes; aesni_enc(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); + crypto_inc(ctrblk, AES_BLOCK_SIZE); } diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 17c05531dfd1..f9eca34301e2 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -271,8 +271,7 @@ static void ctr_crypt_final(struct bf_ctx *ctx, struct blkcipher_walk *walk) unsigned int nbytes = walk->nbytes; blowfish_enc_blk(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, BF_BLOCK_SIZE); } diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 8648158f3916..dbea6020ffe7 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -256,8 +256,7 @@ static void ctr_crypt_final(struct blkcipher_desc *desc, unsigned int nbytes = walk->nbytes; __cast5_encrypt(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, CAST5_BLOCK_SIZE); } diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index d6fc59aaaadf..30c0a37f4882 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -277,8 +277,7 @@ static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx, unsigned int nbytes = walk->nbytes; des3_ede_enc_blk(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE); } diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 9976fcecd17e..af28a8a24366 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -2,7 +2,6 @@ # Makefile for the x86 low level entry code # -OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 05ed3d393da7..640aafebdc00 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -1,4 +1,5 @@ #include <linux/jump_label.h> +#include <asm/unwind_hints.h> /* @@ -112,6 +113,7 @@ For 32-bit we have the following conventions - kernel is built with movq %rdx, 12*8+\offset(%rsp) movq %rsi, 13*8+\offset(%rsp) movq %rdi, 14*8+\offset(%rsp) + UNWIND_HINT_REGS offset=\offset extra=0 .endm .macro SAVE_C_REGS offset=0 SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 @@ -136,6 +138,7 @@ For 32-bit we have the following conventions - kernel is built with movq %r12, 3*8+\offset(%rsp) movq %rbp, 4*8+\offset(%rsp) movq %rbx, 5*8+\offset(%rsp) + UNWIND_HINT_REGS offset=\offset .endm .macro RESTORE_EXTRA_REGS offset=0 @@ -145,6 +148,7 @@ For 32-bit we have the following conventions - kernel is built with movq 3*8+\offset(%rsp), %r12 movq 4*8+\offset(%rsp), %rbp movq 5*8+\offset(%rsp), %rbx + UNWIND_HINT_REGS offset=\offset extra=0 .endm .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 @@ -167,6 +171,7 @@ For 32-bit we have the following conventions - kernel is built with .endif movq 13*8(%rsp), %rsi movq 14*8(%rsp), %rdi + UNWIND_HINT_IRET_REGS offset=16*8 .endm .macro RESTORE_C_REGS RESTORE_C_REGS_HELPER 1,1,1,1,1 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index cdefcfdd9e63..03505ffbe1b6 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -23,6 +23,7 @@ #include <linux/user-return-notifier.h> #include <linux/uprobes.h> #include <linux/livepatch.h> +#include <linux/syscalls.h> #include <asm/desc.h> #include <asm/traps.h> @@ -183,6 +184,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) struct thread_info *ti = current_thread_info(); u32 cached_flags; + addr_limit_user_check(); + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) local_irq_disable(); diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 48ef7bb32c42..8a13d468635a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -673,16 +673,8 @@ ENTRY(name) \ jmp ret_from_intr; \ ENDPROC(name) - -#ifdef CONFIG_TRACING -# define TRACE_BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) -#else -# define TRACE_BUILD_INTERRUPT(name, nr) -#endif - #define BUILD_INTERRUPT(name, nr) \ BUILD_INTERRUPT3(name, nr, smp_##name); \ - TRACE_BUILD_INTERRUPT(name, nr) /* The include is where all of the SMP etc. interrupts come from */ #include <asm/entry_arch.h> @@ -880,25 +872,17 @@ ENTRY(xen_failsafe_callback) ENDPROC(xen_failsafe_callback) BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - xen_evtchn_do_upcall) + xen_evtchn_do_upcall) #endif /* CONFIG_XEN */ #if IS_ENABLED(CONFIG_HYPERV) BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - hyperv_vector_handler) + hyperv_vector_handler) #endif /* CONFIG_HYPERV */ -#ifdef CONFIG_TRACING -ENTRY(trace_page_fault) - ASM_CLAC - pushl $trace_do_page_fault - jmp common_exception -END(trace_page_fault) -#endif - ENTRY(page_fault) ASM_CLAC pushl $do_page_fault diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 6d078b89a5e8..49167258d587 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -36,6 +36,7 @@ #include <asm/smap.h> #include <asm/pgtable_types.h> #include <asm/export.h> +#include <asm/frame.h> #include <linux/err.h> .code64 @@ -43,9 +44,10 @@ #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret64) + UNWIND_HINT_EMPTY swapgs sysretq -ENDPROC(native_usergs_sysret64) +END(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ .macro TRACE_IRQS_IRETQ @@ -134,19 +136,14 @@ ENDPROC(native_usergs_sysret64) */ ENTRY(entry_SYSCALL_64) + UNWIND_HINT_EMPTY /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, * it is too small to ever cause noticeable irq latency. */ - SWAPGS_UNSAFE_STACK - /* - * A hypervisor implementation might want to use a label - * after the swapgs, so that it can do the swapgs - * for the guest and jump here on syscall. - */ -GLOBAL(entry_SYSCALL_64_after_swapgs) + swapgs movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -158,6 +155,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r11 /* pt_regs->flags */ pushq $__USER_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ +GLOBAL(entry_SYSCALL_64_after_hwframe) pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -169,6 +167,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r10 /* pt_regs->r10 */ pushq %r11 /* pt_regs->r11 */ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + UNWIND_HINT_REGS extra=0 /* * If we need to do entry work or if we guess we'll need to do @@ -223,6 +222,7 @@ entry_SYSCALL_64_fastpath: movq EFLAGS(%rsp), %r11 RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp + UNWIND_HINT_EMPTY USERGS_SYSRET64 1: @@ -316,6 +316,7 @@ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp + UNWIND_HINT_EMPTY USERGS_SYSRET64 opportunistic_sysret_failed: @@ -343,6 +344,7 @@ ENTRY(stub_ptregs_64) DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF popq %rax + UNWIND_HINT_REGS extra=0 jmp entry_SYSCALL64_slow_path 1: @@ -351,6 +353,7 @@ END(stub_ptregs_64) .macro ptregs_stub func ENTRY(ptregs_\func) + UNWIND_HINT_FUNC leaq \func(%rip), %rax jmp stub_ptregs_64 END(ptregs_\func) @@ -367,6 +370,7 @@ END(ptregs_\func) * %rsi: next task */ ENTRY(__switch_to_asm) + UNWIND_HINT_FUNC /* * Save callee-saved registers * This must match the order in inactive_task_frame @@ -406,6 +410,7 @@ END(__switch_to_asm) * r12: kernel thread arg */ ENTRY(ret_from_fork) + UNWIND_HINT_EMPTY movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ @@ -413,6 +418,7 @@ ENTRY(ret_from_fork) jnz 1f /* kernel threads are uncommon */ 2: + UNWIND_HINT_REGS movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ @@ -440,13 +446,102 @@ END(ret_from_fork) ENTRY(irq_entries_start) vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + UNWIND_HINT_IRET_REGS pushq $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 jmp common_interrupt .align 8 + vector=vector+1 .endr END(irq_entries_start) +.macro DEBUG_ENTRY_ASSERT_IRQS_OFF +#ifdef CONFIG_DEBUG_ENTRY + pushfq + testl $X86_EFLAGS_IF, (%rsp) + jz .Lokay_\@ + ud2 +.Lokay_\@: + addq $8, %rsp +#endif +.endm + +/* + * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers + * flags and puts old RSP into old_rsp, and leaves all other GPRs alone. + * Requires kernel GSBASE. + * + * The invariant is that, if irq_count != -1, then the IRQ stack is in use. + */ +.macro ENTER_IRQ_STACK regs=1 old_rsp + DEBUG_ENTRY_ASSERT_IRQS_OFF + movq %rsp, \old_rsp + + .if \regs + UNWIND_HINT_REGS base=\old_rsp + .endif + + incl PER_CPU_VAR(irq_count) + jnz .Lirq_stack_push_old_rsp_\@ + + /* + * Right now, if we just incremented irq_count to zero, we've + * claimed the IRQ stack but we haven't switched to it yet. + * + * If anything is added that can interrupt us here without using IST, + * it must be *extremely* careful to limit its stack usage. This + * could include kprobes and a hypothetical future IST-less #DB + * handler. + * + * The OOPS unwinder relies on the word at the top of the IRQ + * stack linking back to the previous RSP for the entire time we're + * on the IRQ stack. For this to work reliably, we need to write + * it before we actually move ourselves to the IRQ stack. + */ + + movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) + movq PER_CPU_VAR(irq_stack_ptr), %rsp + +#ifdef CONFIG_DEBUG_ENTRY + /* + * If the first movq above becomes wrong due to IRQ stack layout + * changes, the only way we'll notice is if we try to unwind right + * here. Assert that we set up the stack right to catch this type + * of bug quickly. + */ + cmpq -8(%rsp), \old_rsp + je .Lirq_stack_okay\@ + ud2 + .Lirq_stack_okay\@: +#endif + +.Lirq_stack_push_old_rsp_\@: + pushq \old_rsp + + .if \regs + UNWIND_HINT_REGS indirect=1 + .endif +.endm + +/* + * Undoes ENTER_IRQ_STACK. + */ +.macro LEAVE_IRQ_STACK regs=1 + DEBUG_ENTRY_ASSERT_IRQS_OFF + /* We need to be off the IRQ stack before decrementing irq_count. */ + popq %rsp + + .if \regs + UNWIND_HINT_REGS + .endif + + /* + * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming + * the irq stack but we're not on it. + */ + + decl PER_CPU_VAR(irq_count) +.endm + /* * Interrupt entry/exit. * @@ -485,17 +580,7 @@ END(irq_entries_start) CALL_enter_from_user_mode 1: - /* - * Save previous stack pointer, optionally switch to interrupt stack. - * irq_count is used to check if a CPU is already on an interrupt stack - * or not. While this is essentially redundant with preempt_count it is - * a little cheaper to use a separate counter in the PDA (short of - * moving irq_enter into assembly, which would be too much work) - */ - movq %rsp, %rdi - incl PER_CPU_VAR(irq_count) - cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp - pushq %rdi + ENTER_IRQ_STACK old_rsp=%rdi /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF @@ -515,10 +600,8 @@ common_interrupt: ret_from_intr: DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - decl PER_CPU_VAR(irq_count) - /* Restore saved previous stack */ - popq %rsp + LEAVE_IRQ_STACK testb $3, CS(%rsp) jz retint_kernel @@ -561,6 +644,7 @@ restore_c_regs_and_iret: INTERRUPT_RETURN ENTRY(native_iret) + UNWIND_HINT_IRET_REGS /* * Are we returning to a stack segment from the LDT? Note: in * 64-bit mode SS:RSP on the exception stack is always valid. @@ -633,6 +717,7 @@ native_irq_return_ldt: orq PER_CPU_VAR(espfix_stack), %rax SWAPGS movq %rax, %rsp + UNWIND_HINT_IRET_REGS offset=8 /* * At this point, we cannot write to the stack any more, but we can @@ -654,6 +739,7 @@ END(common_interrupt) */ .macro apicinterrupt3 num sym do_sym ENTRY(\sym) + UNWIND_HINT_IRET_REGS ASM_CLAC pushq $~(\num) .Lcommon_\sym: @@ -662,31 +748,13 @@ ENTRY(\sym) END(\sym) .endm -#ifdef CONFIG_TRACING -#define trace(sym) trace_##sym -#define smp_trace(sym) smp_trace_##sym - -.macro trace_apicinterrupt num sym -apicinterrupt3 \num trace(\sym) smp_trace(\sym) -.endm -#else -.macro trace_apicinterrupt num sym do_sym -.endm -#endif - /* Make sure APIC interrupt handlers end up in the irqentry section: */ -#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) -# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" -# define POP_SECTION_IRQENTRY .popsection -#else -# define PUSH_SECTION_IRQENTRY -# define POP_SECTION_IRQENTRY -#endif +#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" +#define POP_SECTION_IRQENTRY .popsection .macro apicinterrupt num sym do_sym PUSH_SECTION_IRQENTRY apicinterrupt3 \num \sym \do_sym -trace_apicinterrupt \num \sym POP_SECTION_IRQENTRY .endm @@ -740,13 +808,14 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) + UNWIND_HINT_IRET_REGS offset=8 + /* Sanity check */ .if \shift_ist != -1 && \paranoid == 0 .error "using shift_ist requires paranoid=1" .endif ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME .ifeq \has_error_code pushq $-1 /* ORIG_RAX: no syscall to restart */ @@ -763,6 +832,7 @@ ENTRY(\sym) .else call error_entry .endif + UNWIND_HINT_REGS /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ .if \paranoid @@ -829,17 +899,6 @@ ENTRY(\sym) END(\sym) .endm -#ifdef CONFIG_TRACING -.macro trace_idtentry sym do_sym has_error_code:req -idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#else -.macro trace_idtentry sym do_sym has_error_code:req -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#endif - idtentry divide_error do_divide_error has_error_code=0 idtentry overflow do_overflow has_error_code=0 idtentry bounds do_bounds has_error_code=0 @@ -860,6 +919,7 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 * edi: new selector */ ENTRY(native_load_gs_index) + FRAME_BEGIN pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS @@ -868,8 +928,9 @@ ENTRY(native_load_gs_index) 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE SWAPGS popfq + FRAME_END ret -END(native_load_gs_index) +ENDPROC(native_load_gs_index) EXPORT_SYMBOL(native_load_gs_index) _ASM_EXTABLE(.Lgs_change, bad_gs) @@ -892,17 +953,15 @@ bad_gs: ENTRY(do_softirq_own_stack) pushq %rbp mov %rsp, %rbp - incl PER_CPU_VAR(irq_count) - cmove PER_CPU_VAR(irq_stack_ptr), %rsp - push %rbp /* frame pointer backlink */ + ENTER_IRQ_STACK regs=0 old_rsp=%r11 call __do_softirq + LEAVE_IRQ_STACK regs=0 leaveq - decl PER_CPU_VAR(irq_count) ret -END(do_softirq_own_stack) +ENDPROC(do_softirq_own_stack) #ifdef CONFIG_XEN -idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 +idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 /* * A note on the "critical region" in our callback handler. @@ -923,14 +982,14 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will * see the correct pointer to the pt_regs */ + UNWIND_HINT_FUNC movq %rdi, %rsp /* we don't return, adjust the stack frame */ -11: incl PER_CPU_VAR(irq_count) - movq %rsp, %rbp - cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp - pushq %rbp /* frame pointer backlink */ + UNWIND_HINT_REGS + + ENTER_IRQ_STACK old_rsp=%r10 call xen_evtchn_do_upcall - popq %rsp - decl PER_CPU_VAR(irq_count) + LEAVE_IRQ_STACK + #ifndef CONFIG_PREEMPT call xen_maybe_preempt_hcall #endif @@ -951,6 +1010,7 @@ END(xen_do_hypervisor_callback) * with its current contents: any discrepancy means we in category 1. */ ENTRY(xen_failsafe_callback) + UNWIND_HINT_EMPTY movl %ds, %ecx cmpw %cx, 0x10(%rsp) jne 1f @@ -968,13 +1028,13 @@ ENTRY(xen_failsafe_callback) movq 8(%rsp), %r11 addq $0x30, %rsp pushq $0 /* RIP */ - pushq %r11 - pushq %rcx + UNWIND_HINT_IRET_REGS offset=8 jmp general_protection 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ movq (%rsp), %rcx movq 8(%rsp), %r11 addq $0x30, %rsp + UNWIND_HINT_IRET_REGS pushq $-1 /* orig_ax = -1 => not a system call */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS @@ -998,13 +1058,12 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN -idtentry xen_debug do_debug has_error_code=0 -idtentry xen_int3 do_int3 has_error_code=0 -idtentry xen_stack_segment do_stack_segment has_error_code=1 +idtentry xendebug do_debug has_error_code=0 +idtentry xenint3 do_int3 has_error_code=0 #endif idtentry general_protection do_general_protection has_error_code=1 -trace_idtentry page_fault do_page_fault has_error_code=1 +idtentry page_fault do_page_fault has_error_code=1 #ifdef CONFIG_KVM_GUEST idtentry async_page_fault do_async_page_fault has_error_code=1 @@ -1020,6 +1079,7 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(paranoid_entry) + UNWIND_HINT_FUNC cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1047,6 +1107,7 @@ END(paranoid_entry) * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) + UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF_DEBUG testl %ebx, %ebx /* swapgs needed? */ @@ -1068,6 +1129,7 @@ END(paranoid_exit) * Return: EBX=0: came from user mode; EBX=1: otherwise */ ENTRY(error_entry) + UNWIND_HINT_FUNC cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1152,6 +1214,7 @@ END(error_entry) * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode */ ENTRY(error_exit) + UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF testl %ebx, %ebx @@ -1160,19 +1223,9 @@ ENTRY(error_exit) END(error_exit) /* Runs on exception stack */ +/* XXX: broken on Xen PV */ ENTRY(nmi) - /* - * Fix up the exception frame if we're on Xen. - * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most - * one value to the stack on native, so it may clobber the rdx - * scratch slot, but it won't clobber any of the important - * slots past it. - * - * Xen is a different story, because the Xen frame itself overlaps - * the "NMI executing" variable. - */ - PARAVIRT_ADJUST_EXCEPTION_FRAME - + UNWIND_HINT_IRET_REGS /* * We allow breakpoints in NMIs. If a breakpoint occurs, then * the iretq it performs will take us out of NMI context. @@ -1234,11 +1287,13 @@ ENTRY(nmi) cld movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + UNWIND_HINT_IRET_REGS base=%rdx offset=8 pushq 5*8(%rdx) /* pt_regs->ss */ pushq 4*8(%rdx) /* pt_regs->rsp */ pushq 3*8(%rdx) /* pt_regs->flags */ pushq 2*8(%rdx) /* pt_regs->cs */ pushq 1*8(%rdx) /* pt_regs->rip */ + UNWIND_HINT_IRET_REGS pushq $-1 /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -1255,6 +1310,7 @@ ENTRY(nmi) pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + UNWIND_HINT_REGS ENCODE_FRAME_POINTER /* @@ -1409,6 +1465,7 @@ first_nmi: .rept 5 pushq 11*8(%rsp) .endr + UNWIND_HINT_IRET_REGS /* Everything up to here is safe from nested NMIs */ @@ -1424,6 +1481,7 @@ first_nmi: pushq $__KERNEL_CS /* CS */ pushq $1f /* RIP */ INTERRUPT_RETURN /* continues at repeat_nmi below */ + UNWIND_HINT_IRET_REGS 1: #endif @@ -1473,6 +1531,7 @@ end_repeat_nmi: * exceptions might do. */ call paranoid_entry + UNWIND_HINT_REGS /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp, %rdi @@ -1510,17 +1569,19 @@ nmi_restore: END(nmi) ENTRY(ignore_sysret) + UNWIND_HINT_EMPTY mov $-ENOSYS, %eax sysret END(ignore_sysret) ENTRY(rewind_stack_do_exit) + UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp movq PER_CPU_VAR(cpu_current_top_of_stack), %rax - leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp + leaq -PTREGS_SIZE(%rax), %rsp + UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE call do_exit -1: jmp 1b END(rewind_stack_do_exit) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index e1721dafbcb1..e26c25ca7756 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -183,21 +183,20 @@ ENDPROC(entry_SYSENTER_compat) */ ENTRY(entry_SYSCALL_compat) /* Interrupts are off on entry. */ - SWAPGS_UNSAFE_STACK + swapgs /* Stash user ESP and switch to the kernel stack. */ movl %esp, %r8d movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - /* Zero-extending 32-bit regs, do not remove */ - movl %eax, %eax - /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ pushq %r8 /* pt_regs->sp */ pushq %r11 /* pt_regs->flags */ pushq $__USER32_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ +GLOBAL(entry_SYSCALL_compat_after_hwframe) + movl %eax, %eax /* discard orig_ax high bits */ pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -294,7 +293,6 @@ ENTRY(entry_INT80_compat) /* * Interrupts are off on entry. */ - PARAVIRT_ADJUST_EXCEPTION_FRAME ASM_CLAC /* Do this early to minimize exposure */ SWAPGS @@ -342,8 +340,7 @@ ENTRY(entry_INT80_compat) jmp restore_regs_and_iret END(entry_INT80_compat) - ALIGN -GLOBAL(stub32_clone) +ENTRY(stub32_clone) /* * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). @@ -353,3 +350,4 @@ GLOBAL(stub32_clone) */ xchg %r8, %rcx jmp sys_clone +ENDPROC(stub32_clone) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 726355ce8497..1911310959f8 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -351,7 +351,7 @@ static void vgetcpu_cpu_init(void *arg) * and 8 bits for the node) */ d.limit0 = cpu | ((node & 0xf) << 12); - d.limit = node >> 4; + d.limit1 = node >> 4; d.type = 5; /* RO data, expand down, accessed */ d.dpl = 3; /* Visible to user code */ d.s = 1; /* Not a system segment */ diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index ad44af0dd667..f5cbbba99283 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -400,11 +400,24 @@ static int amd_uncore_cpu_starting(unsigned int cpu) if (amd_uncore_llc) { unsigned int apicid = cpu_data(cpu).apicid; - unsigned int nshared; + unsigned int nshared, subleaf, prev_eax = 0; uncore = *per_cpu_ptr(amd_uncore_llc, cpu); - cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx); - nshared = ((eax >> 14) & 0xfff) + 1; + /* + * Iterate over Cache Topology Definition leaves until no + * more cache descriptions are available. + */ + for (subleaf = 0; subleaf < 5; subleaf++) { + cpuid_count(0x8000001d, subleaf, &eax, &ebx, &ecx, &edx); + + /* EAX[0:4] gives type of cache */ + if (!(eax & 0x1f)) + break; + + prev_eax = eax; + } + nshared = ((prev_eax >> 14) & 0xfff) + 1; + uncore->id = apicid - (apicid % nshared); uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc); @@ -555,7 +568,7 @@ static int __init amd_uncore_init(void) ret = 0; } - if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) { + if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) { amd_uncore_llc = alloc_percpu(struct amd_uncore *); if (!amd_uncore_llc) { ret = -ENOMEM; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 939050169d12..80534d3c2480 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -487,22 +487,28 @@ static inline int precise_br_compat(struct perf_event *event) return m == b; } -int x86_pmu_hw_config(struct perf_event *event) +int x86_pmu_max_precise(void) { - if (event->attr.precise_ip) { - int precise = 0; + int precise = 0; + + /* Support for constant skid */ + if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { + precise++; - /* Support for constant skid */ - if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { + /* Support for IP fixup */ + if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) precise++; - /* Support for IP fixup */ - if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) - precise++; + if (x86_pmu.pebs_prec_dist) + precise++; + } + return precise; +} - if (x86_pmu.pebs_prec_dist) - precise++; - } +int x86_pmu_hw_config(struct perf_event *event) +{ + if (event->attr.precise_ip) { + int precise = x86_pmu_max_precise(); if (event->attr.precise_ip > precise) return -EOPNOTSUPP; @@ -1751,6 +1757,7 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) } static struct attribute_group x86_pmu_attr_group; +static struct attribute_group x86_pmu_caps_group; static int __init init_hw_perf_events(void) { @@ -1799,6 +1806,14 @@ static int __init init_hw_perf_events(void) x86_pmu_format_group.attrs = x86_pmu.format_attrs; + if (x86_pmu.caps_attrs) { + struct attribute **tmp; + + tmp = merge_attr(x86_pmu_caps_group.attrs, x86_pmu.caps_attrs); + if (!WARN_ON(!tmp)) + x86_pmu_caps_group.attrs = tmp; + } + if (x86_pmu.event_attrs) x86_pmu_events_group.attrs = x86_pmu.event_attrs; @@ -2213,10 +2228,30 @@ static struct attribute_group x86_pmu_attr_group = { .attrs = x86_pmu_attrs, }; +static ssize_t max_precise_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise()); +} + +static DEVICE_ATTR_RO(max_precise); + +static struct attribute *x86_pmu_caps_attrs[] = { + &dev_attr_max_precise.attr, + NULL +}; + +static struct attribute_group x86_pmu_caps_group = { + .name = "caps", + .attrs = x86_pmu_caps_attrs, +}; + static const struct attribute_group *x86_pmu_attr_groups[] = { &x86_pmu_attr_group, &x86_pmu_format_group, &x86_pmu_events_group, + &x86_pmu_caps_group, NULL, }; diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index 06c2baa51814..e9d8520a801a 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o cqm.o +obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl-perf.o diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index ddd8d3516bfc..16076eb34699 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -268,7 +268,7 @@ static void bts_event_start(struct perf_event *event, int flags) bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; - event->hw.itrace_started = 1; + perf_event_itrace_started(event); event->hw.state = 0; __bts_event_start(event); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 98b0f0729527..829e89cfcee2 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3415,12 +3415,26 @@ static struct attribute *intel_arch3_formats_attr[] = { &format_attr_any.attr, &format_attr_inv.attr, &format_attr_cmask.attr, + NULL, +}; + +static struct attribute *hsw_format_attr[] = { &format_attr_in_tx.attr, &format_attr_in_tx_cp.attr, + &format_attr_offcore_rsp.attr, + &format_attr_ldlat.attr, + NULL +}; - &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */ - &format_attr_ldlat.attr, /* PEBS load latency */ - NULL, +static struct attribute *nhm_format_attr[] = { + &format_attr_offcore_rsp.attr, + &format_attr_ldlat.attr, + NULL +}; + +static struct attribute *slm_format_attr[] = { + &format_attr_offcore_rsp.attr, + NULL }; static struct attribute *skl_format_attr[] = { @@ -3781,6 +3795,36 @@ done: static DEVICE_ATTR_RW(freeze_on_smi); +static ssize_t branches_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr); +} + +static DEVICE_ATTR_RO(branches); + +static struct attribute *lbr_attrs[] = { + &dev_attr_branches.attr, + NULL +}; + +static char pmu_name_str[30]; + +static ssize_t pmu_name_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", pmu_name_str); +} + +static DEVICE_ATTR_RO(pmu_name); + +static struct attribute *intel_pmu_caps_attrs[] = { + &dev_attr_pmu_name.attr, + NULL +}; + static struct attribute *intel_pmu_attrs[] = { &dev_attr_freeze_on_smi.attr, NULL, @@ -3795,6 +3839,8 @@ __init int intel_pmu_init(void) unsigned int unused; struct extra_reg *er; int version, i; + struct attribute **extra_attr = NULL; + char *name; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { switch (boot_cpu_data.x86) { @@ -3862,6 +3908,7 @@ __init int intel_pmu_init(void) switch (boot_cpu_data.x86_model) { case INTEL_FAM6_CORE_YONAH: pr_cont("Core events, "); + name = "core"; break; case INTEL_FAM6_CORE2_MEROM: @@ -3877,6 +3924,7 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_core2_event_constraints; x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints; pr_cont("Core2 events, "); + name = "core2"; break; case INTEL_FAM6_NEHALEM: @@ -3905,8 +3953,11 @@ __init int intel_pmu_init(void) intel_pmu_pebs_data_source_nhm(); x86_add_quirk(intel_nehalem_quirk); + x86_pmu.pebs_no_tlb = 1; + extra_attr = nhm_format_attr; pr_cont("Nehalem events, "); + name = "nehalem"; break; case INTEL_FAM6_ATOM_PINEVIEW: @@ -3923,6 +3974,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints; x86_pmu.pebs_aliases = intel_pebs_aliases_core2; pr_cont("Atom events, "); + name = "bonnell"; break; case INTEL_FAM6_ATOM_SILVERMONT1: @@ -3940,7 +3992,9 @@ __init int intel_pmu_init(void) x86_pmu.extra_regs = intel_slm_extra_regs; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.cpu_events = slm_events_attrs; + extra_attr = slm_format_attr; pr_cont("Silvermont events, "); + name = "silvermont"; break; case INTEL_FAM6_ATOM_GOLDMONT: @@ -3965,7 +4019,9 @@ __init int intel_pmu_init(void) x86_pmu.lbr_pt_coexist = true; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.cpu_events = glm_events_attrs; + extra_attr = slm_format_attr; pr_cont("Goldmont events, "); + name = "goldmont"; break; case INTEL_FAM6_ATOM_GEMINI_LAKE: @@ -3991,7 +4047,9 @@ __init int intel_pmu_init(void) x86_pmu.cpu_events = glm_events_attrs; /* Goldmont Plus has 4-wide pipeline */ event_attr_td_total_slots_scale_glm.event_str = "4"; + extra_attr = slm_format_attr; pr_cont("Goldmont plus events, "); + name = "goldmont_plus"; break; case INTEL_FAM6_WESTMERE: @@ -4020,7 +4078,9 @@ __init int intel_pmu_init(void) X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); intel_pmu_pebs_data_source_nhm(); + extra_attr = nhm_format_attr; pr_cont("Westmere events, "); + name = "westmere"; break; case INTEL_FAM6_SANDYBRIDGE: @@ -4056,7 +4116,10 @@ __init int intel_pmu_init(void) intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1); + extra_attr = nhm_format_attr; + pr_cont("SandyBridge events, "); + name = "sandybridge"; break; case INTEL_FAM6_IVYBRIDGE: @@ -4090,7 +4153,10 @@ __init int intel_pmu_init(void) intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + extra_attr = nhm_format_attr; + pr_cont("IvyBridge events, "); + name = "ivybridge"; break; @@ -4118,7 +4184,10 @@ __init int intel_pmu_init(void) x86_pmu.get_event_constraints = hsw_get_event_constraints; x86_pmu.cpu_events = hsw_events_attrs; x86_pmu.lbr_double_abort = true; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + hsw_format_attr : nhm_format_attr; pr_cont("Haswell events, "); + name = "haswell"; break; case INTEL_FAM6_BROADWELL_CORE: @@ -4154,7 +4223,10 @@ __init int intel_pmu_init(void) x86_pmu.get_event_constraints = hsw_get_event_constraints; x86_pmu.cpu_events = hsw_events_attrs; x86_pmu.limit_period = bdw_limit_period; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + hsw_format_attr : nhm_format_attr; pr_cont("Broadwell events, "); + name = "broadwell"; break; case INTEL_FAM6_XEON_PHI_KNL: @@ -4172,8 +4244,9 @@ __init int intel_pmu_init(void) /* all extra regs are per-cpu when HT is on */ x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - + extra_attr = slm_format_attr; pr_cont("Knights Landing/Mill events, "); + name = "knights-landing"; break; case INTEL_FAM6_SKYLAKE_MOBILE: @@ -4203,11 +4276,14 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, - skl_format_attr); - WARN_ON(!x86_pmu.format_attrs); + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + hsw_format_attr : nhm_format_attr; + extra_attr = merge_attr(extra_attr, skl_format_attr); x86_pmu.cpu_events = hsw_events_attrs; + intel_pmu_pebs_data_source_skl( + boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X); pr_cont("Skylake events, "); + name = "skylake"; break; default: @@ -4215,6 +4291,7 @@ __init int intel_pmu_init(void) case 1: x86_pmu.event_constraints = intel_v1_event_constraints; pr_cont("generic architected perfmon v1, "); + name = "generic_arch_v1"; break; default: /* @@ -4222,10 +4299,19 @@ __init int intel_pmu_init(void) */ x86_pmu.event_constraints = intel_gen_event_constraints; pr_cont("generic architected perfmon, "); + name = "generic_arch_v2+"; break; } } + snprintf(pmu_name_str, sizeof pmu_name_str, "%s", name); + + if (version >= 2 && extra_attr) { + x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, + extra_attr); + WARN_ON(!x86_pmu.format_attrs); + } + if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); @@ -4272,8 +4358,13 @@ __init int intel_pmu_init(void) x86_pmu.lbr_nr = 0; } - if (x86_pmu.lbr_nr) + x86_pmu.caps_attrs = intel_pmu_caps_attrs; + + if (x86_pmu.lbr_nr) { + x86_pmu.caps_attrs = merge_attr(x86_pmu.caps_attrs, lbr_attrs); pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); + } + /* * Access extra MSR may cause #GP under certain circumstances. * E.g. KVM doesn't support offcore event diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c deleted file mode 100644 index 2521f771f2f5..000000000000 --- a/arch/x86/events/intel/cqm.c +++ /dev/null @@ -1,1766 +0,0 @@ -/* - * Intel Cache Quality-of-Service Monitoring (CQM) support. - * - * Based very, very heavily on work by Peter Zijlstra. - */ - -#include <linux/perf_event.h> -#include <linux/slab.h> -#include <asm/cpu_device_id.h> -#include <asm/intel_rdt_common.h> -#include "../perf_event.h" - -#define MSR_IA32_QM_CTR 0x0c8e -#define MSR_IA32_QM_EVTSEL 0x0c8d - -#define MBM_CNTR_WIDTH 24 -/* - * Guaranteed time in ms as per SDM where MBM counters will not overflow. - */ -#define MBM_CTR_OVERFLOW_TIME 1000 - -static u32 cqm_max_rmid = -1; -static unsigned int cqm_l3_scale; /* supposedly cacheline size */ -static bool cqm_enabled, mbm_enabled; -unsigned int mbm_socket_max; - -/* - * The cached intel_pqr_state is strictly per CPU and can never be - * updated from a remote CPU. Both functions which modify the state - * (intel_cqm_event_start and intel_cqm_event_stop) are called with - * interrupts disabled, which is sufficient for the protection. - */ -DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); -static struct hrtimer *mbm_timers; -/** - * struct sample - mbm event's (local or total) data - * @total_bytes #bytes since we began monitoring - * @prev_msr previous value of MSR - */ -struct sample { - u64 total_bytes; - u64 prev_msr; -}; - -/* - * samples profiled for total memory bandwidth type events - */ -static struct sample *mbm_total; -/* - * samples profiled for local memory bandwidth type events - */ -static struct sample *mbm_local; - -#define pkg_id topology_physical_package_id(smp_processor_id()) -/* - * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array. - * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of - * rmids per socket, an example is given below - * RMID1 of Socket0: vrmid = 1 - * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1 - * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1 - */ -#define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid) -/* - * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. - * Also protects event->hw.cqm_rmid - * - * Hold either for stability, both for modification of ->hw.cqm_rmid. - */ -static DEFINE_MUTEX(cache_mutex); -static DEFINE_RAW_SPINLOCK(cache_lock); - -/* - * Groups of events that have the same target(s), one RMID per group. - */ -static LIST_HEAD(cache_groups); - -/* - * Mask of CPUs for reading CQM values. We only need one per-socket. - */ -static cpumask_t cqm_cpumask; - -#define RMID_VAL_ERROR (1ULL << 63) -#define RMID_VAL_UNAVAIL (1ULL << 62) - -/* - * Event IDs are used to program IA32_QM_EVTSEL before reading event - * counter from IA32_QM_CTR - */ -#define QOS_L3_OCCUP_EVENT_ID 0x01 -#define QOS_MBM_TOTAL_EVENT_ID 0x02 -#define QOS_MBM_LOCAL_EVENT_ID 0x03 - -/* - * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). - * - * This rmid is always free and is guaranteed to have an associated - * near-zero occupancy value, i.e. no cachelines are tagged with this - * RMID, once __intel_cqm_rmid_rotate() returns. - */ -static u32 intel_cqm_rotation_rmid; - -#define INVALID_RMID (-1) - -/* - * Is @rmid valid for programming the hardware? - * - * rmid 0 is reserved by the hardware for all non-monitored tasks, which - * means that we should never come across an rmid with that value. - * Likewise, an rmid value of -1 is used to indicate "no rmid currently - * assigned" and is used as part of the rotation code. - */ -static inline bool __rmid_valid(u32 rmid) -{ - if (!rmid || rmid == INVALID_RMID) - return false; - - return true; -} - -static u64 __rmid_read(u32 rmid) -{ - u64 val; - - /* - * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, - * it just says that to increase confusion. - */ - wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - - /* - * Aside from the ERROR and UNAVAIL bits, assume this thing returns - * the number of cachelines tagged with @rmid. - */ - return val; -} - -enum rmid_recycle_state { - RMID_YOUNG = 0, - RMID_AVAILABLE, - RMID_DIRTY, -}; - -struct cqm_rmid_entry { - u32 rmid; - enum rmid_recycle_state state; - struct list_head list; - unsigned long queue_time; -}; - -/* - * cqm_rmid_free_lru - A least recently used list of RMIDs. - * - * Oldest entry at the head, newest (most recently used) entry at the - * tail. This list is never traversed, it's only used to keep track of - * the lru order. That is, we only pick entries of the head or insert - * them on the tail. - * - * All entries on the list are 'free', and their RMIDs are not currently - * in use. To mark an RMID as in use, remove its entry from the lru - * list. - * - * - * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs. - * - * This list is contains RMIDs that no one is currently using but that - * may have a non-zero occupancy value associated with them. The - * rotation worker moves RMIDs from the limbo list to the free list once - * the occupancy value drops below __intel_cqm_threshold. - * - * Both lists are protected by cache_mutex. - */ -static LIST_HEAD(cqm_rmid_free_lru); -static LIST_HEAD(cqm_rmid_limbo_lru); - -/* - * We use a simple array of pointers so that we can lookup a struct - * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid() - * and __put_rmid() from having to worry about dealing with struct - * cqm_rmid_entry - they just deal with rmids, i.e. integers. - * - * Once this array is initialized it is read-only. No locks are required - * to access it. - * - * All entries for all RMIDs can be looked up in the this array at all - * times. - */ -static struct cqm_rmid_entry **cqm_rmid_ptrs; - -static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - entry = cqm_rmid_ptrs[rmid]; - WARN_ON(entry->rmid != rmid); - - return entry; -} - -/* - * Returns < 0 on fail. - * - * We expect to be called with cache_mutex held. - */ -static u32 __get_rmid(void) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - if (list_empty(&cqm_rmid_free_lru)) - return INVALID_RMID; - - entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list); - list_del(&entry->list); - - return entry->rmid; -} - -static void __put_rmid(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - WARN_ON(!__rmid_valid(rmid)); - entry = __rmid_entry(rmid); - - entry->queue_time = jiffies; - entry->state = RMID_YOUNG; - - list_add_tail(&entry->list, &cqm_rmid_limbo_lru); -} - -static void cqm_cleanup(void) -{ - int i; - - if (!cqm_rmid_ptrs) - return; - - for (i = 0; i < cqm_max_rmid; i++) - kfree(cqm_rmid_ptrs[i]); - - kfree(cqm_rmid_ptrs); - cqm_rmid_ptrs = NULL; - cqm_enabled = false; -} - -static int intel_cqm_setup_rmid_cache(void) -{ - struct cqm_rmid_entry *entry; - unsigned int nr_rmids; - int r = 0; - - nr_rmids = cqm_max_rmid + 1; - cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) * - nr_rmids, GFP_KERNEL); - if (!cqm_rmid_ptrs) - return -ENOMEM; - - for (; r <= cqm_max_rmid; r++) { - struct cqm_rmid_entry *entry; - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - goto fail; - - INIT_LIST_HEAD(&entry->list); - entry->rmid = r; - cqm_rmid_ptrs[r] = entry; - - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - /* - * RMID 0 is special and is always allocated. It's used for all - * tasks that are not monitored. - */ - entry = __rmid_entry(0); - list_del(&entry->list); - - mutex_lock(&cache_mutex); - intel_cqm_rotation_rmid = __get_rmid(); - mutex_unlock(&cache_mutex); - - return 0; - -fail: - cqm_cleanup(); - return -ENOMEM; -} - -/* - * Determine if @a and @b measure the same set of tasks. - * - * If @a and @b measure the same set of tasks then we want to share a - * single RMID. - */ -static bool __match_event(struct perf_event *a, struct perf_event *b) -{ - /* Per-cpu and task events don't mix */ - if ((a->attach_state & PERF_ATTACH_TASK) != - (b->attach_state & PERF_ATTACH_TASK)) - return false; - -#ifdef CONFIG_CGROUP_PERF - if (a->cgrp != b->cgrp) - return false; -#endif - - /* If not task event, we're machine wide */ - if (!(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Events that target same task are placed into the same cache group. - * Mark it as a multi event group, so that we update ->count - * for every event rather than just the group leader later. - */ - if (a->hw.target == b->hw.target) { - b->hw.is_group_event = true; - return true; - } - - /* - * Are we an inherited event? - */ - if (b->parent == a) - return true; - - return false; -} - -#ifdef CONFIG_CGROUP_PERF -static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) -{ - if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target, event->ctx); - - return event->cgrp; -} -#endif - -/* - * Determine if @a's tasks intersect with @b's tasks - * - * There are combinations of events that we explicitly prohibit, - * - * PROHIBITS - * system-wide -> cgroup and task - * cgroup -> system-wide - * -> task in cgroup - * task -> system-wide - * -> task in cgroup - * - * Call this function before allocating an RMID. - */ -static bool __conflict_event(struct perf_event *a, struct perf_event *b) -{ -#ifdef CONFIG_CGROUP_PERF - /* - * We can have any number of cgroups but only one system-wide - * event at a time. - */ - if (a->cgrp && b->cgrp) { - struct perf_cgroup *ac = a->cgrp; - struct perf_cgroup *bc = b->cgrp; - - /* - * This condition should have been caught in - * __match_event() and we should be sharing an RMID. - */ - WARN_ON_ONCE(ac == bc); - - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } - - if (a->cgrp || b->cgrp) { - struct perf_cgroup *ac, *bc; - - /* - * cgroup and system-wide events are mutually exclusive - */ - if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || - (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) - return true; - - /* - * Ensure neither event is part of the other's cgroup - */ - ac = event_to_cgroup(a); - bc = event_to_cgroup(b); - if (ac == bc) - return true; - - /* - * Must have cgroup and non-intersecting task events. - */ - if (!ac || !bc) - return false; - - /* - * We have cgroup and task events, and the task belongs - * to a cgroup. Check for for overlap. - */ - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } -#endif - /* - * If one of them is not a task, same story as above with cgroups. - */ - if (!(a->attach_state & PERF_ATTACH_TASK) || - !(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Must be non-overlapping. - */ - return false; -} - -struct rmid_read { - u32 rmid; - u32 evt_type; - atomic64_t value; -}; - -static void __intel_cqm_event_count(void *info); -static void init_mbm_sample(u32 rmid, u32 evt_type); -static void __intel_mbm_event_count(void *info); - -static bool is_cqm_event(int e) -{ - return (e == QOS_L3_OCCUP_EVENT_ID); -} - -static bool is_mbm_event(int e) -{ - return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); -} - -static void cqm_mask_call(struct rmid_read *rr) -{ - if (is_mbm_event(rr->evt_type)) - on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1); - else - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1); -} - -/* - * Exchange the RMID of a group of events. - */ -static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) -{ - struct perf_event *event; - struct list_head *head = &group->hw.cqm_group_entry; - u32 old_rmid = group->hw.cqm_rmid; - - lockdep_assert_held(&cache_mutex); - - /* - * If our RMID is being deallocated, perform a read now. - */ - if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { - struct rmid_read rr = { - .rmid = old_rmid, - .evt_type = group->attr.config, - .value = ATOMIC64_INIT(0), - }; - - cqm_mask_call(&rr); - local64_set(&group->count, atomic64_read(&rr.value)); - } - - raw_spin_lock_irq(&cache_lock); - - group->hw.cqm_rmid = rmid; - list_for_each_entry(event, head, hw.cqm_group_entry) - event->hw.cqm_rmid = rmid; - - raw_spin_unlock_irq(&cache_lock); - - /* - * If the allocation is for mbm, init the mbm stats. - * Need to check if each event in the group is mbm event - * because there could be multiple type of events in the same group. - */ - if (__rmid_valid(rmid)) { - event = group; - if (is_mbm_event(event->attr.config)) - init_mbm_sample(rmid, event->attr.config); - - list_for_each_entry(event, head, hw.cqm_group_entry) { - if (is_mbm_event(event->attr.config)) - init_mbm_sample(rmid, event->attr.config); - } - } - - return old_rmid; -} - -/* - * If we fail to assign a new RMID for intel_cqm_rotation_rmid because - * cachelines are still tagged with RMIDs in limbo, we progressively - * increment the threshold until we find an RMID in limbo with <= - * __intel_cqm_threshold lines tagged. This is designed to mitigate the - * problem where cachelines tagged with an RMID are not steadily being - * evicted. - * - * On successful rotations we decrease the threshold back towards zero. - * - * __intel_cqm_max_threshold provides an upper bound on the threshold, - * and is measured in bytes because it's exposed to userland. - */ -static unsigned int __intel_cqm_threshold; -static unsigned int __intel_cqm_max_threshold; - -/* - * Test whether an RMID has a zero occupancy value on this cpu. - */ -static void intel_cqm_stable(void *arg) -{ - struct cqm_rmid_entry *entry; - - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - if (entry->state != RMID_AVAILABLE) - break; - - if (__rmid_read(entry->rmid) > __intel_cqm_threshold) - entry->state = RMID_DIRTY; - } -} - -/* - * If we have group events waiting for an RMID that don't conflict with - * events already running, assign @rmid. - */ -static bool intel_cqm_sched_in_event(u32 rmid) -{ - struct perf_event *leader, *event; - - lockdep_assert_held(&cache_mutex); - - leader = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - event = leader; - - list_for_each_entry_continue(event, &cache_groups, - hw.cqm_groups_entry) { - if (__rmid_valid(event->hw.cqm_rmid)) - continue; - - if (__conflict_event(event, leader)) - continue; - - intel_cqm_xchg_rmid(event, rmid); - return true; - } - - return false; -} - -/* - * Initially use this constant for both the limbo queue time and the - * rotation timer interval, pmu::hrtimer_interval_ms. - * - * They don't need to be the same, but the two are related since if you - * rotate faster than you recycle RMIDs, you may run out of available - * RMIDs. - */ -#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */ - -static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; - -/* - * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list - * @nr_available: number of freeable RMIDs on the limbo list - * - * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no - * cachelines are tagged with those RMIDs. After this we can reuse them - * and know that the current set of active RMIDs is stable. - * - * Return %true or %false depending on whether stabilization needs to be - * reattempted. - * - * If we return %true then @nr_available is updated to indicate the - * number of RMIDs on the limbo list that have been queued for the - * minimum queue time (RMID_AVAILABLE), but whose data occupancy values - * are above __intel_cqm_threshold. - */ -static bool intel_cqm_rmid_stabilize(unsigned int *available) -{ - struct cqm_rmid_entry *entry, *tmp; - - lockdep_assert_held(&cache_mutex); - - *available = 0; - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - unsigned long min_queue_time; - unsigned long now = jiffies; - - /* - * We hold RMIDs placed into limbo for a minimum queue - * time. Before the minimum queue time has elapsed we do - * not recycle RMIDs. - * - * The reasoning is that until a sufficient time has - * passed since we stopped using an RMID, any RMID - * placed onto the limbo list will likely still have - * data tagged in the cache, which means we'll probably - * fail to recycle it anyway. - * - * We can save ourselves an expensive IPI by skipping - * any RMIDs that have not been queued for the minimum - * time. - */ - min_queue_time = entry->queue_time + - msecs_to_jiffies(__rmid_queue_time_ms); - - if (time_after(min_queue_time, now)) - break; - - entry->state = RMID_AVAILABLE; - (*available)++; - } - - /* - * Fast return if none of the RMIDs on the limbo list have been - * sitting on the queue for the minimum queue time. - */ - if (!*available) - return false; - - /* - * Test whether an RMID is free for each package. - */ - on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); - - list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { - /* - * Exhausted all RMIDs that have waited min queue time. - */ - if (entry->state == RMID_YOUNG) - break; - - if (entry->state == RMID_DIRTY) - continue; - - list_del(&entry->list); /* remove from limbo */ - - /* - * The rotation RMID gets priority if it's - * currently invalid. In which case, skip adding - * the RMID to the the free lru. - */ - if (!__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_rotation_rmid = entry->rmid; - continue; - } - - /* - * If we have groups waiting for RMIDs, hand - * them one now provided they don't conflict. - */ - if (intel_cqm_sched_in_event(entry->rmid)) - continue; - - /* - * Otherwise place it onto the free list. - */ - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - - return __rmid_valid(intel_cqm_rotation_rmid); -} - -/* - * Pick a victim group and move it to the tail of the group list. - * @next: The first group without an RMID - */ -static void __intel_cqm_pick_and_rotate(struct perf_event *next) -{ - struct perf_event *rotor; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - rotor = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - - /* - * The group at the front of the list should always have a valid - * RMID. If it doesn't then no groups have RMIDs assigned and we - * don't need to rotate the list. - */ - if (next == rotor) - return; - - rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); - __put_rmid(rmid); - - list_rotate_left(&cache_groups); -} - -/* - * Deallocate the RMIDs from any events that conflict with @event, and - * place them on the back of the group list. - */ -static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) -{ - struct perf_event *group, *g; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) { - if (group == event) - continue; - - rmid = group->hw.cqm_rmid; - - /* - * Skip events that don't have a valid RMID. - */ - if (!__rmid_valid(rmid)) - continue; - - /* - * No conflict? No problem! Leave the event alone. - */ - if (!__conflict_event(group, event)) - continue; - - intel_cqm_xchg_rmid(group, INVALID_RMID); - __put_rmid(rmid); - } -} - -/* - * Attempt to rotate the groups and assign new RMIDs. - * - * We rotate for two reasons, - * 1. To handle the scheduling of conflicting events - * 2. To recycle RMIDs - * - * Rotating RMIDs is complicated because the hardware doesn't give us - * any clues. - * - * There's problems with the hardware interface; when you change the - * task:RMID map cachelines retain their 'old' tags, giving a skewed - * picture. In order to work around this, we must always keep one free - * RMID - intel_cqm_rotation_rmid. - * - * Rotation works by taking away an RMID from a group (the old RMID), - * and assigning the free RMID to another group (the new RMID). We must - * then wait for the old RMID to not be used (no cachelines tagged). - * This ensure that all cachelines are tagged with 'active' RMIDs. At - * this point we can start reading values for the new RMID and treat the - * old RMID as the free RMID for the next rotation. - * - * Return %true or %false depending on whether we did any rotating. - */ -static bool __intel_cqm_rmid_rotate(void) -{ - struct perf_event *group, *start = NULL; - unsigned int threshold_limit; - unsigned int nr_needed = 0; - unsigned int nr_available; - bool rotated = false; - - mutex_lock(&cache_mutex); - -again: - /* - * Fast path through this function if there are no groups and no - * RMIDs that need cleaning. - */ - if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) { - if (!__rmid_valid(group->hw.cqm_rmid)) { - if (!start) - start = group; - nr_needed++; - } - } - - /* - * We have some event groups, but they all have RMIDs assigned - * and no RMIDs need cleaning. - */ - if (!nr_needed && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - if (!nr_needed) - goto stabilize; - - /* - * We have more event groups without RMIDs than available RMIDs, - * or we have event groups that conflict with the ones currently - * scheduled. - * - * We force deallocate the rmid of the group at the head of - * cache_groups. The first event group without an RMID then gets - * assigned intel_cqm_rotation_rmid. This ensures we always make - * forward progress. - * - * Rotate the cache_groups list so the previous head is now the - * tail. - */ - __intel_cqm_pick_and_rotate(start); - - /* - * If the rotation is going to succeed, reduce the threshold so - * that we don't needlessly reuse dirty RMIDs. - */ - if (__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); - intel_cqm_rotation_rmid = __get_rmid(); - - intel_cqm_sched_out_conflicting_events(start); - - if (__intel_cqm_threshold) - __intel_cqm_threshold--; - } - - rotated = true; - -stabilize: - /* - * We now need to stablize the RMID we freed above (if any) to - * ensure that the next time we rotate we have an RMID with zero - * occupancy value. - * - * Alternatively, if we didn't need to perform any rotation, - * we'll have a bunch of RMIDs in limbo that need stabilizing. - */ - threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale; - - while (intel_cqm_rmid_stabilize(&nr_available) && - __intel_cqm_threshold < threshold_limit) { - unsigned int steal_limit; - - /* - * Don't spin if nobody is actively waiting for an RMID, - * the rotation worker will be kicked as soon as an - * event needs an RMID anyway. - */ - if (!nr_needed) - break; - - /* Allow max 25% of RMIDs to be in limbo. */ - steal_limit = (cqm_max_rmid + 1) / 4; - - /* - * We failed to stabilize any RMIDs so our rotation - * logic is now stuck. In order to make forward progress - * we have a few options: - * - * 1. rotate ("steal") another RMID - * 2. increase the threshold - * 3. do nothing - * - * We do both of 1. and 2. until we hit the steal limit. - * - * The steal limit prevents all RMIDs ending up on the - * limbo list. This can happen if every RMID has a - * non-zero occupancy above threshold_limit, and the - * occupancy values aren't dropping fast enough. - * - * Note that there is prioritisation at work here - we'd - * rather increase the number of RMIDs on the limbo list - * than increase the threshold, because increasing the - * threshold skews the event data (because we reuse - * dirty RMIDs) - threshold bumps are a last resort. - */ - if (nr_available < steal_limit) - goto again; - - __intel_cqm_threshold++; - } - -out: - mutex_unlock(&cache_mutex); - return rotated; -} - -static void intel_cqm_rmid_rotate(struct work_struct *work); - -static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate); - -static struct pmu intel_cqm_pmu; - -static void intel_cqm_rmid_rotate(struct work_struct *work) -{ - unsigned long delay; - - __intel_cqm_rmid_rotate(); - - delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms); - schedule_delayed_work(&intel_cqm_rmid_work, delay); -} - -static u64 update_sample(unsigned int rmid, u32 evt_type, int first) -{ - struct sample *mbm_current; - u32 vrmid = rmid_2_index(rmid); - u64 val, bytes, shift; - u32 eventid; - - if (evt_type == QOS_MBM_LOCAL_EVENT_ID) { - mbm_current = &mbm_local[vrmid]; - eventid = QOS_MBM_LOCAL_EVENT_ID; - } else { - mbm_current = &mbm_total[vrmid]; - eventid = QOS_MBM_TOTAL_EVENT_ID; - } - - wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return mbm_current->total_bytes; - - if (first) { - mbm_current->prev_msr = val; - mbm_current->total_bytes = 0; - return mbm_current->total_bytes; - } - - /* - * The h/w guarantees that counters will not overflow - * so long as we poll them at least once per second. - */ - shift = 64 - MBM_CNTR_WIDTH; - bytes = (val << shift) - (mbm_current->prev_msr << shift); - bytes >>= shift; - - bytes *= cqm_l3_scale; - - mbm_current->total_bytes += bytes; - mbm_current->prev_msr = val; - - return mbm_current->total_bytes; -} - -static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type) -{ - return update_sample(rmid, evt_type, 0); -} - -static void __intel_mbm_event_init(void *info) -{ - struct rmid_read *rr = info; - - update_sample(rr->rmid, rr->evt_type, 1); -} - -static void init_mbm_sample(u32 rmid, u32 evt_type) -{ - struct rmid_read rr = { - .rmid = rmid, - .evt_type = evt_type, - .value = ATOMIC64_INIT(0), - }; - - /* on each socket, init sample */ - on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1); -} - -/* - * Find a group and setup RMID. - * - * If we're part of a group, we use the group's RMID. - */ -static void intel_cqm_setup_event(struct perf_event *event, - struct perf_event **group) -{ - struct perf_event *iter; - bool conflict = false; - u32 rmid; - - event->hw.is_group_event = false; - list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { - rmid = iter->hw.cqm_rmid; - - if (__match_event(iter, event)) { - /* All tasks in a group share an RMID */ - event->hw.cqm_rmid = rmid; - *group = iter; - if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) - init_mbm_sample(rmid, event->attr.config); - return; - } - - /* - * We only care about conflicts for events that are - * actually scheduled in (and hence have a valid RMID). - */ - if (__conflict_event(iter, event) && __rmid_valid(rmid)) - conflict = true; - } - - if (conflict) - rmid = INVALID_RMID; - else - rmid = __get_rmid(); - - if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) - init_mbm_sample(rmid, event->attr.config); - - event->hw.cqm_rmid = rmid; -} - -static void intel_cqm_event_read(struct perf_event *event) -{ - unsigned long flags; - u32 rmid; - u64 val; - - /* - * Task events are handled by intel_cqm_event_count(). - */ - if (event->cpu == -1) - return; - - raw_spin_lock_irqsave(&cache_lock, flags); - rmid = event->hw.cqm_rmid; - - if (!__rmid_valid(rmid)) - goto out; - - if (is_mbm_event(event->attr.config)) - val = rmid_read_mbm(rmid, event->attr.config); - else - val = __rmid_read(rmid); - - /* - * Ignore this reading on error states and do not update the value. - */ - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - goto out; - - local64_set(&event->count, val); -out: - raw_spin_unlock_irqrestore(&cache_lock, flags); -} - -static void __intel_cqm_event_count(void *info) -{ - struct rmid_read *rr = info; - u64 val; - - val = __rmid_read(rr->rmid); - - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; - - atomic64_add(val, &rr->value); -} - -static inline bool cqm_group_leader(struct perf_event *event) -{ - return !list_empty(&event->hw.cqm_groups_entry); -} - -static void __intel_mbm_event_count(void *info) -{ - struct rmid_read *rr = info; - u64 val; - - val = rmid_read_mbm(rr->rmid, rr->evt_type); - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; - atomic64_add(val, &rr->value); -} - -static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer) -{ - struct perf_event *iter, *iter1; - int ret = HRTIMER_RESTART; - struct list_head *head; - unsigned long flags; - u32 grp_rmid; - - /* - * Need to cache_lock as the timer Event Select MSR reads - * can race with the mbm/cqm count() and mbm_init() reads. - */ - raw_spin_lock_irqsave(&cache_lock, flags); - - if (list_empty(&cache_groups)) { - ret = HRTIMER_NORESTART; - goto out; - } - - list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { - grp_rmid = iter->hw.cqm_rmid; - if (!__rmid_valid(grp_rmid)) - continue; - if (is_mbm_event(iter->attr.config)) - update_sample(grp_rmid, iter->attr.config, 0); - - head = &iter->hw.cqm_group_entry; - if (list_empty(head)) - continue; - list_for_each_entry(iter1, head, hw.cqm_group_entry) { - if (!iter1->hw.is_group_event) - break; - if (is_mbm_event(iter1->attr.config)) - update_sample(iter1->hw.cqm_rmid, - iter1->attr.config, 0); - } - } - - hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME)); -out: - raw_spin_unlock_irqrestore(&cache_lock, flags); - - return ret; -} - -static void __mbm_start_timer(void *info) -{ - hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME), - HRTIMER_MODE_REL_PINNED); -} - -static void __mbm_stop_timer(void *info) -{ - hrtimer_cancel(&mbm_timers[pkg_id]); -} - -static void mbm_start_timers(void) -{ - on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1); -} - -static void mbm_stop_timers(void) -{ - on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1); -} - -static void mbm_hrtimer_init(void) -{ - struct hrtimer *hr; - int i; - - for (i = 0; i < mbm_socket_max; i++) { - hr = &mbm_timers[i]; - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hr->function = mbm_hrtimer_handle; - } -} - -static u64 intel_cqm_event_count(struct perf_event *event) -{ - unsigned long flags; - struct rmid_read rr = { - .evt_type = event->attr.config, - .value = ATOMIC64_INIT(0), - }; - - /* - * We only need to worry about task events. System-wide events - * are handled like usual, i.e. entirely with - * intel_cqm_event_read(). - */ - if (event->cpu != -1) - return __perf_event_count(event); - - /* - * Only the group leader gets to report values except in case of - * multiple events in the same group, we still need to read the - * other events.This stops us - * reporting duplicate values to userspace, and gives us a clear - * rule for which task gets to report the values. - * - * Note that it is impossible to attribute these values to - * specific packages - we forfeit that ability when we create - * task events. - */ - if (!cqm_group_leader(event) && !event->hw.is_group_event) - return 0; - - /* - * Getting up-to-date values requires an SMP IPI which is not - * possible if we're being called in interrupt context. Return - * the cached values instead. - */ - if (unlikely(in_interrupt())) - goto out; - - /* - * Notice that we don't perform the reading of an RMID - * atomically, because we can't hold a spin lock across the - * IPIs. - * - * Speculatively perform the read, since @event might be - * assigned a different (possibly invalid) RMID while we're - * busying performing the IPI calls. It's therefore necessary to - * check @event's RMID afterwards, and if it has changed, - * discard the result of the read. - */ - rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid); - - if (!__rmid_valid(rr.rmid)) - goto out; - - cqm_mask_call(&rr); - - raw_spin_lock_irqsave(&cache_lock, flags); - if (event->hw.cqm_rmid == rr.rmid) - local64_set(&event->count, atomic64_read(&rr.value)); - raw_spin_unlock_irqrestore(&cache_lock, flags); -out: - return __perf_event_count(event); -} - -static void intel_cqm_event_start(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - u32 rmid = event->hw.cqm_rmid; - - if (!(event->hw.cqm_state & PERF_HES_STOPPED)) - return; - - event->hw.cqm_state &= ~PERF_HES_STOPPED; - - if (state->rmid_usecnt++) { - if (!WARN_ON_ONCE(state->rmid != rmid)) - return; - } else { - WARN_ON_ONCE(state->rmid); - } - - state->rmid = rmid; - wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); -} - -static void intel_cqm_event_stop(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - - if (event->hw.cqm_state & PERF_HES_STOPPED) - return; - - event->hw.cqm_state |= PERF_HES_STOPPED; - - intel_cqm_event_read(event); - - if (!--state->rmid_usecnt) { - state->rmid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid); - } else { - WARN_ON_ONCE(!state->rmid); - } -} - -static int intel_cqm_event_add(struct perf_event *event, int mode) -{ - unsigned long flags; - u32 rmid; - - raw_spin_lock_irqsave(&cache_lock, flags); - - event->hw.cqm_state = PERF_HES_STOPPED; - rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid) && (mode & PERF_EF_START)) - intel_cqm_event_start(event, mode); - - raw_spin_unlock_irqrestore(&cache_lock, flags); - - return 0; -} - -static void intel_cqm_event_destroy(struct perf_event *event) -{ - struct perf_event *group_other = NULL; - unsigned long flags; - - mutex_lock(&cache_mutex); - /* - * Hold the cache_lock as mbm timer handlers could be - * scanning the list of events. - */ - raw_spin_lock_irqsave(&cache_lock, flags); - - /* - * If there's another event in this group... - */ - if (!list_empty(&event->hw.cqm_group_entry)) { - group_other = list_first_entry(&event->hw.cqm_group_entry, - struct perf_event, - hw.cqm_group_entry); - list_del(&event->hw.cqm_group_entry); - } - - /* - * And we're the group leader.. - */ - if (cqm_group_leader(event)) { - /* - * If there was a group_other, make that leader, otherwise - * destroy the group and return the RMID. - */ - if (group_other) { - list_replace(&event->hw.cqm_groups_entry, - &group_other->hw.cqm_groups_entry); - } else { - u32 rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid)) - __put_rmid(rmid); - list_del(&event->hw.cqm_groups_entry); - } - } - - raw_spin_unlock_irqrestore(&cache_lock, flags); - - /* - * Stop the mbm overflow timers when the last event is destroyed. - */ - if (mbm_enabled && list_empty(&cache_groups)) - mbm_stop_timers(); - - mutex_unlock(&cache_mutex); -} - -static int intel_cqm_event_init(struct perf_event *event) -{ - struct perf_event *group = NULL; - bool rotate = false; - unsigned long flags; - - if (event->attr.type != intel_cqm_pmu.type) - return -ENOENT; - - if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) || - (event->attr.config > QOS_MBM_LOCAL_EVENT_ID)) - return -EINVAL; - - if ((is_cqm_event(event->attr.config) && !cqm_enabled) || - (is_mbm_event(event->attr.config) && !mbm_enabled)) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - INIT_LIST_HEAD(&event->hw.cqm_group_entry); - INIT_LIST_HEAD(&event->hw.cqm_groups_entry); - - event->destroy = intel_cqm_event_destroy; - - mutex_lock(&cache_mutex); - - /* - * Start the mbm overflow timers when the first event is created. - */ - if (mbm_enabled && list_empty(&cache_groups)) - mbm_start_timers(); - - /* Will also set rmid */ - intel_cqm_setup_event(event, &group); - - /* - * Hold the cache_lock as mbm timer handlers be - * scanning the list of events. - */ - raw_spin_lock_irqsave(&cache_lock, flags); - - if (group) { - list_add_tail(&event->hw.cqm_group_entry, - &group->hw.cqm_group_entry); - } else { - list_add_tail(&event->hw.cqm_groups_entry, - &cache_groups); - - /* - * All RMIDs are either in use or have recently been - * used. Kick the rotation worker to clean/free some. - * - * We only do this for the group leader, rather than for - * every event in a group to save on needless work. - */ - if (!__rmid_valid(event->hw.cqm_rmid)) - rotate = true; - } - - raw_spin_unlock_irqrestore(&cache_lock, flags); - mutex_unlock(&cache_mutex); - - if (rotate) - schedule_delayed_work(&intel_cqm_rmid_work, 0); - - return 0; -} - -EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); -EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); -EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); -EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); -EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); - -EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02"); -EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1"); -EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB"); -EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6"); - -EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03"); -EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1"); -EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB"); -EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6"); - -static struct attribute *intel_cqm_events_attr[] = { - EVENT_PTR(intel_cqm_llc), - EVENT_PTR(intel_cqm_llc_pkg), - EVENT_PTR(intel_cqm_llc_unit), - EVENT_PTR(intel_cqm_llc_scale), - EVENT_PTR(intel_cqm_llc_snapshot), - NULL, -}; - -static struct attribute *intel_mbm_events_attr[] = { - EVENT_PTR(intel_cqm_total_bytes), - EVENT_PTR(intel_cqm_local_bytes), - EVENT_PTR(intel_cqm_total_bytes_pkg), - EVENT_PTR(intel_cqm_local_bytes_pkg), - EVENT_PTR(intel_cqm_total_bytes_unit), - EVENT_PTR(intel_cqm_local_bytes_unit), - EVENT_PTR(intel_cqm_total_bytes_scale), - EVENT_PTR(intel_cqm_local_bytes_scale), - NULL, -}; - -static struct attribute *intel_cmt_mbm_events_attr[] = { - EVENT_PTR(intel_cqm_llc), - EVENT_PTR(intel_cqm_total_bytes), - EVENT_PTR(intel_cqm_local_bytes), - EVENT_PTR(intel_cqm_llc_pkg), - EVENT_PTR(intel_cqm_total_bytes_pkg), - EVENT_PTR(intel_cqm_local_bytes_pkg), - EVENT_PTR(intel_cqm_llc_unit), - EVENT_PTR(intel_cqm_total_bytes_unit), - EVENT_PTR(intel_cqm_local_bytes_unit), - EVENT_PTR(intel_cqm_llc_scale), - EVENT_PTR(intel_cqm_total_bytes_scale), - EVENT_PTR(intel_cqm_local_bytes_scale), - EVENT_PTR(intel_cqm_llc_snapshot), - NULL, -}; - -static struct attribute_group intel_cqm_events_group = { - .name = "events", - .attrs = NULL, -}; - -PMU_FORMAT_ATTR(event, "config:0-7"); -static struct attribute *intel_cqm_formats_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group intel_cqm_format_group = { - .name = "format", - .attrs = intel_cqm_formats_attr, -}; - -static ssize_t -max_recycle_threshold_show(struct device *dev, struct device_attribute *attr, - char *page) -{ - ssize_t rv; - - mutex_lock(&cache_mutex); - rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold); - mutex_unlock(&cache_mutex); - - return rv; -} - -static ssize_t -max_recycle_threshold_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - unsigned int bytes, cachelines; - int ret; - - ret = kstrtouint(buf, 0, &bytes); - if (ret) - return ret; - - mutex_lock(&cache_mutex); - - __intel_cqm_max_threshold = bytes; - cachelines = bytes / cqm_l3_scale; - - /* - * The new maximum takes effect immediately. - */ - if (__intel_cqm_threshold > cachelines) - __intel_cqm_threshold = cachelines; - - mutex_unlock(&cache_mutex); - - return count; -} - -static DEVICE_ATTR_RW(max_recycle_threshold); - -static struct attribute *intel_cqm_attrs[] = { - &dev_attr_max_recycle_threshold.attr, - NULL, -}; - -static const struct attribute_group intel_cqm_group = { - .attrs = intel_cqm_attrs, -}; - -static const struct attribute_group *intel_cqm_attr_groups[] = { - &intel_cqm_events_group, - &intel_cqm_format_group, - &intel_cqm_group, - NULL, -}; - -static struct pmu intel_cqm_pmu = { - .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME, - .attr_groups = intel_cqm_attr_groups, - .task_ctx_nr = perf_sw_context, - .event_init = intel_cqm_event_init, - .add = intel_cqm_event_add, - .del = intel_cqm_event_stop, - .start = intel_cqm_event_start, - .stop = intel_cqm_event_stop, - .read = intel_cqm_event_read, - .count = intel_cqm_event_count, -}; - -static inline void cqm_pick_event_reader(int cpu) -{ - int reader; - - /* First online cpu in package becomes the reader */ - reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu)); - if (reader >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cqm_cpumask); -} - -static int intel_cqm_cpu_starting(unsigned int cpu) -{ - struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); - struct cpuinfo_x86 *c = &cpu_data(cpu); - - state->rmid = 0; - state->closid = 0; - state->rmid_usecnt = 0; - - WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); - WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); - - cqm_pick_event_reader(cpu); - return 0; -} - -static int intel_cqm_cpu_exit(unsigned int cpu) -{ - int target; - - /* Is @cpu the current cqm reader for this package ? */ - if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) - return 0; - - /* Find another online reader in this package */ - target = cpumask_any_but(topology_core_cpumask(cpu), cpu); - - if (target < nr_cpu_ids) - cpumask_set_cpu(target, &cqm_cpumask); - - return 0; -} - -static const struct x86_cpu_id intel_cqm_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, - {} -}; - -static void mbm_cleanup(void) -{ - if (!mbm_enabled) - return; - - kfree(mbm_local); - kfree(mbm_total); - mbm_enabled = false; -} - -static const struct x86_cpu_id intel_mbm_local_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL }, - {} -}; - -static const struct x86_cpu_id intel_mbm_total_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL }, - {} -}; - -static int intel_mbm_init(void) -{ - int ret = 0, array_size, maxid = cqm_max_rmid + 1; - - mbm_socket_max = topology_max_packages(); - array_size = sizeof(struct sample) * maxid * mbm_socket_max; - mbm_local = kmalloc(array_size, GFP_KERNEL); - if (!mbm_local) - return -ENOMEM; - - mbm_total = kmalloc(array_size, GFP_KERNEL); - if (!mbm_total) { - ret = -ENOMEM; - goto out; - } - - array_size = sizeof(struct hrtimer) * mbm_socket_max; - mbm_timers = kmalloc(array_size, GFP_KERNEL); - if (!mbm_timers) { - ret = -ENOMEM; - goto out; - } - mbm_hrtimer_init(); - -out: - if (ret) - mbm_cleanup(); - - return ret; -} - -static int __init intel_cqm_init(void) -{ - char *str = NULL, scale[20]; - int cpu, ret; - - if (x86_match_cpu(intel_cqm_match)) - cqm_enabled = true; - - if (x86_match_cpu(intel_mbm_local_match) && - x86_match_cpu(intel_mbm_total_match)) - mbm_enabled = true; - - if (!cqm_enabled && !mbm_enabled) - return -ENODEV; - - cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; - - /* - * It's possible that not all resources support the same number - * of RMIDs. Instead of making scheduling much more complicated - * (where we have to match a task's RMID to a cpu that supports - * that many RMIDs) just find the minimum RMIDs supported across - * all cpus. - * - * Also, check that the scales match on all cpus. - */ - cpus_read_lock(); - for_each_online_cpu(cpu) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (c->x86_cache_max_rmid < cqm_max_rmid) - cqm_max_rmid = c->x86_cache_max_rmid; - - if (c->x86_cache_occ_scale != cqm_l3_scale) { - pr_err("Multiple LLC scale values, disabling\n"); - ret = -EINVAL; - goto out; - } - } - - /* - * A reasonable upper limit on the max threshold is the number - * of lines tagged per RMID if all RMIDs have the same number of - * lines tagged in the LLC. - * - * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. - */ - __intel_cqm_max_threshold = - boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); - - snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); - str = kstrdup(scale, GFP_KERNEL); - if (!str) { - ret = -ENOMEM; - goto out; - } - - event_attr_intel_cqm_llc_scale.event_str = str; - - ret = intel_cqm_setup_rmid_cache(); - if (ret) - goto out; - - if (mbm_enabled) - ret = intel_mbm_init(); - if (ret && !cqm_enabled) - goto out; - - if (cqm_enabled && mbm_enabled) - intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr; - else if (!cqm_enabled && mbm_enabled) - intel_cqm_events_group.attrs = intel_mbm_events_attr; - else if (cqm_enabled && !mbm_enabled) - intel_cqm_events_group.attrs = intel_cqm_events_attr; - - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); - if (ret) { - pr_err("Intel CQM perf registration failed: %d\n", ret); - goto out; - } - - if (cqm_enabled) - pr_info("Intel CQM monitoring enabled\n"); - if (mbm_enabled) - pr_info("Intel MBM enabled\n"); - - /* - * Setup the hot cpu notifier once we are sure cqm - * is enabled to avoid notifier leak. - */ - cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_STARTING, - "perf/x86/cqm:starting", - intel_cqm_cpu_starting, NULL); - cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_ONLINE, - "perf/x86/cqm:online", - NULL, intel_cqm_cpu_exit); -out: - cpus_read_unlock(); - - if (ret) { - kfree(str); - cqm_cleanup(); - mbm_cleanup(); - } - - return ret; -} -device_initcall(intel_cqm_init); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index a322fed5f8ed..e1965e5ff570 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -49,34 +49,47 @@ union intel_x86_pebs_dse { */ #define P(a, b) PERF_MEM_S(a, b) #define OP_LH (P(OP, LOAD) | P(LVL, HIT)) +#define LEVEL(x) P(LVLNUM, x) +#define REM P(REMOTE, REMOTE) #define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) /* Version for Sandy Bridge and later */ static u64 pebs_data_source[] = { - P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ - OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ - OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ - OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ - OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ - OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ - OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ - OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ - OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */ - OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */ - OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */ - OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ + P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 local */ + OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ + OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ + OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ + OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ + OP_LH | P(LVL, REM_RAM1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ + OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | SNOOP_NONE_MISS, /* 0x0c: L3 miss, excl */ + OP_LH | P(LVL, REM_RAM1) | LEVEL(RAM) | REM | SNOOP_NONE_MISS, /* 0x0d: L3 miss, excl */ + OP_LH | P(LVL, IO) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0e: I/O */ + OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */ }; /* Patch up minor differences in the bits */ void __init intel_pmu_pebs_data_source_nhm(void) { - pebs_data_source[0x05] = OP_LH | P(LVL, L3) | P(SNOOP, HIT); - pebs_data_source[0x06] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); - pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); + pebs_data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT); + pebs_data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); + pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); +} + +void __init intel_pmu_pebs_data_source_skl(bool pmem) +{ + u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4); + + pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT); + pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT); + pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE); + pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD); + pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM); } static u64 precise_store_data(u64 status) @@ -149,8 +162,6 @@ static u64 load_latency_data(u64 status) { union intel_x86_pebs_dse dse; u64 val; - int model = boot_cpu_data.x86_model; - int fam = boot_cpu_data.x86; dse.val = status; @@ -162,8 +173,7 @@ static u64 load_latency_data(u64 status) /* * Nehalem models do not support TLB, Lock infos */ - if (fam == 0x6 && (model == 26 || model == 30 - || model == 31 || model == 46)) { + if (x86_pmu.pebs_no_tlb) { val |= P(TLB, NA) | P(LOCK, NA); return val; } @@ -1175,7 +1185,7 @@ static void setup_pebs_sample_data(struct perf_event *event, else regs->flags &= ~PERF_EFLAGS_EXACT; - if ((sample_type & PERF_SAMPLE_ADDR) && + if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) && x86_pmu.intel_cap.pebs_format >= 1) data->addr = pebs->dla; diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 955457a30197..8a6bbacd17dc 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -109,6 +109,9 @@ enum { X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ X86_BR_CALL_STACK = 1 << 16,/* call stack */ X86_BR_IND_JMP = 1 << 17,/* indirect jump */ + + X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */ + }; #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) @@ -514,6 +517,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].in_tx = 0; cpuc->lbr_entries[i].abort = 0; cpuc->lbr_entries[i].cycles = 0; + cpuc->lbr_entries[i].type = 0; cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; @@ -600,6 +604,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_entries[out].in_tx = in_tx; cpuc->lbr_entries[out].abort = abort; cpuc->lbr_entries[out].cycles = cycles; + cpuc->lbr_entries[out].type = 0; cpuc->lbr_entries[out].reserved = 0; out++; } @@ -677,6 +682,10 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) if (br_type & PERF_SAMPLE_BRANCH_CALL) mask |= X86_BR_CALL | X86_BR_ZERO_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE) + mask |= X86_BR_TYPE_SAVE; + /* * stash actual user request into reg, it may * be used by fixup code for some CPU @@ -930,6 +939,43 @@ static int branch_type(unsigned long from, unsigned long to, int abort) return ret; } +#define X86_BR_TYPE_MAP_MAX 16 + +static int branch_map[X86_BR_TYPE_MAP_MAX] = { + PERF_BR_CALL, /* X86_BR_CALL */ + PERF_BR_RET, /* X86_BR_RET */ + PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ + PERF_BR_SYSRET, /* X86_BR_SYSRET */ + PERF_BR_UNKNOWN, /* X86_BR_INT */ + PERF_BR_UNKNOWN, /* X86_BR_IRET */ + PERF_BR_COND, /* X86_BR_JCC */ + PERF_BR_UNCOND, /* X86_BR_JMP */ + PERF_BR_UNKNOWN, /* X86_BR_IRQ */ + PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_ABORT */ + PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ + PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ + PERF_BR_CALL, /* X86_BR_ZERO_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ + PERF_BR_IND, /* X86_BR_IND_JMP */ +}; + +static int +common_branch_type(int type) +{ + int i; + + type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */ + + if (type) { + i = __ffs(type); + if (i < X86_BR_TYPE_MAP_MAX) + return branch_map[i]; + } + + return PERF_BR_UNKNOWN; +} + /* * implement actual branch filter based on user demand. * Hardware may not exactly satisfy that request, thus @@ -946,7 +992,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) bool compress = false; /* if sampling all branches, then nothing to filter */ - if ((br_sel & X86_BR_ALL) == X86_BR_ALL) + if (((br_sel & X86_BR_ALL) == X86_BR_ALL) && + ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE)) return; for (i = 0; i < cpuc->lbr_stack.nr; i++) { @@ -967,6 +1014,9 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].from = 0; compress = true; } + + if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE) + cpuc->lbr_entries[i].type = common_branch_type(type); } if (!compress) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index ae8324d65e61..81fd41d5a0d9 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -471,8 +471,9 @@ static void pt_config(struct perf_event *event) struct pt *pt = this_cpu_ptr(&pt_ctx); u64 reg; - if (!event->hw.itrace_started) { - event->hw.itrace_started = 1; + /* First round: clear STATUS, in particular the PSB byte counter. */ + if (!event->hw.config) { + perf_event_itrace_started(event); wrmsrl(MSR_IA32_RTIT_STATUS, 0); } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 476aec3a4cab..4196f81ec0e1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -91,7 +91,7 @@ struct amd_nb { (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ - PERF_SAMPLE_TRANSACTION) + PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR) /* * A debug store configuration. @@ -558,6 +558,7 @@ struct x86_pmu { int attr_rdpmc; struct attribute **format_attrs; struct attribute **event_attrs; + struct attribute **caps_attrs; ssize_t (*events_sysfs_show)(char *page, u64 config); struct attribute **cpu_events; @@ -591,7 +592,8 @@ struct x86_pmu { pebs :1, pebs_active :1, pebs_broken :1, - pebs_prec_dist :1; + pebs_prec_dist :1, + pebs_no_tlb :1; int pebs_record_size; int pebs_buffer_size; void (*drain_pebs)(struct pt_regs *regs); @@ -741,6 +743,8 @@ int x86_reserve_hardware(void); void x86_release_hardware(void); +int x86_pmu_max_precise(void); + void hw_perf_lbr_event_destroy(struct perf_event *event); int x86_setup_perfctr(struct perf_event *event); @@ -947,6 +951,8 @@ void intel_pmu_lbr_init_knl(void); void intel_pmu_pebs_data_source_nhm(void); +void intel_pmu_pebs_data_source_skl(bool pmem); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index 171ae09864d7..367a8203cfcf 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1 +1 @@ -obj-y := hv_init.o +obj-y := hv_init.o mmu.o diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 5b882cc0c0e9..1a8eb550c40f 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -26,6 +26,8 @@ #include <linux/mm.h> #include <linux/clockchips.h> #include <linux/hyperv.h> +#include <linux/slab.h> +#include <linux/cpuhotplug.h> #ifdef CONFIG_HYPERV_TSCPAGE @@ -75,10 +77,25 @@ static struct clocksource hyperv_cs_msr = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static void *hypercall_pg; +void *hv_hypercall_pg; +EXPORT_SYMBOL_GPL(hv_hypercall_pg); struct clocksource *hyperv_cs; EXPORT_SYMBOL_GPL(hyperv_cs); +u32 *hv_vp_index; +EXPORT_SYMBOL_GPL(hv_vp_index); + +static int hv_cpu_init(unsigned int cpu) +{ + u64 msr_vp_index; + + hv_get_vp_index(msr_vp_index); + + hv_vp_index[smp_processor_id()] = msr_vp_index; + + return 0; +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -94,6 +111,16 @@ void hyperv_init(void) if (x86_hyper != &x86_hyper_ms_hyperv) return; + /* Allocate percpu VP index */ + hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index), + GFP_KERNEL); + if (!hv_vp_index) + return; + + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", + hv_cpu_init, NULL) < 0) + goto free_vp_index; + /* * Setup the hypercall page and enable hypercalls. * 1. Register the guest ID @@ -102,17 +129,19 @@ void hyperv_init(void) guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); - hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); - if (hypercall_pg == NULL) { + hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); + if (hv_hypercall_pg == NULL) { wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); - return; + goto free_vp_index; } rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 1; - hypercall_msr.guest_physical_address = vmalloc_to_pfn(hypercall_pg); + hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hyper_alloc_mmu(); + /* * Register Hyper-V specific clocksource. */ @@ -148,6 +177,12 @@ register_msr_cs: hyperv_cs = &hyperv_cs_msr; if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); + + return; + +free_vp_index: + kfree(hv_vp_index); + hv_vp_index = NULL; } /* @@ -170,51 +205,6 @@ void hyperv_cleanup(void) } EXPORT_SYMBOL_GPL(hyperv_cleanup); -/* - * hv_do_hypercall- Invoke the specified hypercall - */ -u64 hv_do_hypercall(u64 control, void *input, void *output) -{ - u64 input_address = (input) ? virt_to_phys(input) : 0; - u64 output_address = (output) ? virt_to_phys(output) : 0; -#ifdef CONFIG_X86_64 - u64 hv_status = 0; - - if (!hypercall_pg) - return (u64)ULLONG_MAX; - - __asm__ __volatile__("mov %0, %%r8" : : "r" (output_address) : "r8"); - __asm__ __volatile__("call *%3" : "=a" (hv_status) : - "c" (control), "d" (input_address), - "m" (hypercall_pg)); - - return hv_status; - -#else - - u32 control_hi = control >> 32; - u32 control_lo = control & 0xFFFFFFFF; - u32 hv_status_hi = 1; - u32 hv_status_lo = 1; - u32 input_address_hi = input_address >> 32; - u32 input_address_lo = input_address & 0xFFFFFFFF; - u32 output_address_hi = output_address >> 32; - u32 output_address_lo = output_address & 0xFFFFFFFF; - - if (!hypercall_pg) - return (u64)ULLONG_MAX; - - __asm__ __volatile__ ("call *%8" : "=d"(hv_status_hi), - "=a"(hv_status_lo) : "d" (control_hi), - "a" (control_lo), "b" (input_address_hi), - "c" (input_address_lo), "D"(output_address_hi), - "S"(output_address_lo), "m" (hypercall_pg)); - - return hv_status_lo | ((u64)hv_status_hi << 32); -#endif /* !x86_64 */ -} -EXPORT_SYMBOL_GPL(hv_do_hypercall); - void hyperv_report_panic(struct pt_regs *regs) { static bool panic_reported; diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c new file mode 100644 index 000000000000..39e7f6e50919 --- /dev/null +++ b/arch/x86/hyperv/mmu.c @@ -0,0 +1,272 @@ +#define pr_fmt(fmt) "Hyper-V: " fmt + +#include <linux/hyperv.h> +#include <linux/log2.h> +#include <linux/slab.h> +#include <linux/types.h> + +#include <asm/fpu/api.h> +#include <asm/mshyperv.h> +#include <asm/msr.h> +#include <asm/tlbflush.h> + +#define CREATE_TRACE_POINTS +#include <asm/trace/hyperv.h> + +/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ +struct hv_flush_pcpu { + u64 address_space; + u64 flags; + u64 processor_mask; + u64 gva_list[]; +}; + +/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ +struct hv_flush_pcpu_ex { + u64 address_space; + u64 flags; + struct { + u64 format; + u64 valid_bank_mask; + u64 bank_contents[]; + } hv_vp_set; + u64 gva_list[]; +}; + +/* Each gva in gva_list encodes up to 4096 pages to flush */ +#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE) + +static struct hv_flush_pcpu __percpu *pcpu_flush; + +static struct hv_flush_pcpu_ex __percpu *pcpu_flush_ex; + +/* + * Fills in gva_list starting from offset. Returns the number of items added. + */ +static inline int fill_gva_list(u64 gva_list[], int offset, + unsigned long start, unsigned long end) +{ + int gva_n = offset; + unsigned long cur = start, diff; + + do { + diff = end > cur ? end - cur : 0; + + gva_list[gva_n] = cur & PAGE_MASK; + /* + * Lower 12 bits encode the number of additional + * pages to flush (in addition to the 'cur' page). + */ + if (diff >= HV_TLB_FLUSH_UNIT) + gva_list[gva_n] |= ~PAGE_MASK; + else if (diff) + gva_list[gva_n] |= (diff - 1) >> PAGE_SHIFT; + + cur += HV_TLB_FLUSH_UNIT; + gva_n++; + + } while (cur < end); + + return gva_n - offset; +} + +/* Return the number of banks in the resulting vp_set */ +static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush, + const struct cpumask *cpus) +{ + int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1; + + /* + * Some banks may end up being empty but this is acceptable. + */ + for_each_cpu(cpu, cpus) { + vcpu = hv_cpu_number_to_vp_number(cpu); + vcpu_bank = vcpu / 64; + vcpu_offset = vcpu % 64; + + /* valid_bank_mask can represent up to 64 banks */ + if (vcpu_bank >= 64) + return 0; + + __set_bit(vcpu_offset, (unsigned long *) + &flush->hv_vp_set.bank_contents[vcpu_bank]); + if (vcpu_bank >= nr_bank) + nr_bank = vcpu_bank + 1; + } + flush->hv_vp_set.valid_bank_mask = GENMASK_ULL(nr_bank - 1, 0); + + return nr_bank; +} + +static void hyperv_flush_tlb_others(const struct cpumask *cpus, + const struct flush_tlb_info *info) +{ + int cpu, vcpu, gva_n, max_gvas; + struct hv_flush_pcpu *flush; + u64 status = U64_MAX; + unsigned long flags; + + trace_hyperv_mmu_flush_tlb_others(cpus, info); + + if (!pcpu_flush || !hv_hypercall_pg) + goto do_native; + + if (cpumask_empty(cpus)) + return; + + local_irq_save(flags); + + flush = this_cpu_ptr(pcpu_flush); + + if (info->mm) { + flush->address_space = virt_to_phys(info->mm->pgd); + flush->flags = 0; + } else { + flush->address_space = 0; + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + } + + flush->processor_mask = 0; + if (cpumask_equal(cpus, cpu_present_mask)) { + flush->flags |= HV_FLUSH_ALL_PROCESSORS; + } else { + for_each_cpu(cpu, cpus) { + vcpu = hv_cpu_number_to_vp_number(cpu); + if (vcpu >= 64) + goto do_native; + + __set_bit(vcpu, (unsigned long *) + &flush->processor_mask); + } + } + + /* + * We can flush not more than max_gvas with one hypercall. Flush the + * whole address space if we were asked to do more. + */ + max_gvas = (PAGE_SIZE - sizeof(*flush)) / sizeof(flush->gva_list[0]); + + if (info->end == TLB_FLUSH_ALL) { + flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; + status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, + flush, NULL); + } else if (info->end && + ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) { + status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, + flush, NULL); + } else { + gva_n = fill_gva_list(flush->gva_list, 0, + info->start, info->end); + status = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST, + gva_n, 0, flush, NULL); + } + + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + return; +do_native: + native_flush_tlb_others(cpus, info); +} + +static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus, + const struct flush_tlb_info *info) +{ + int nr_bank = 0, max_gvas, gva_n; + struct hv_flush_pcpu_ex *flush; + u64 status = U64_MAX; + unsigned long flags; + + trace_hyperv_mmu_flush_tlb_others(cpus, info); + + if (!pcpu_flush_ex || !hv_hypercall_pg) + goto do_native; + + if (cpumask_empty(cpus)) + return; + + local_irq_save(flags); + + flush = this_cpu_ptr(pcpu_flush_ex); + + if (info->mm) { + flush->address_space = virt_to_phys(info->mm->pgd); + flush->flags = 0; + } else { + flush->address_space = 0; + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + } + + flush->hv_vp_set.valid_bank_mask = 0; + + if (!cpumask_equal(cpus, cpu_present_mask)) { + flush->hv_vp_set.format = HV_GENERIC_SET_SPARCE_4K; + nr_bank = cpumask_to_vp_set(flush, cpus); + } + + if (!nr_bank) { + flush->hv_vp_set.format = HV_GENERIC_SET_ALL; + flush->flags |= HV_FLUSH_ALL_PROCESSORS; + } + + /* + * We can flush not more than max_gvas with one hypercall. Flush the + * whole address space if we were asked to do more. + */ + max_gvas = + (PAGE_SIZE - sizeof(*flush) - nr_bank * + sizeof(flush->hv_vp_set.bank_contents[0])) / + sizeof(flush->gva_list[0]); + + if (info->end == TLB_FLUSH_ALL) { + flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, + 0, nr_bank + 2, flush, NULL); + } else if (info->end && + ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) { + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, + 0, nr_bank + 2, flush, NULL); + } else { + gva_n = fill_gva_list(flush->gva_list, nr_bank, + info->start, info->end); + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX, + gva_n, nr_bank + 2, flush, NULL); + } + + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + return; +do_native: + native_flush_tlb_others(cpus, info); +} + +void hyperv_setup_mmu_ops(void) +{ + if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED)) + return; + + setup_clear_cpu_cap(X86_FEATURE_PCID); + + if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) { + pr_info("Using hypercall for remote TLB flush\n"); + pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others; + } else { + pr_info("Using ext hypercall for remote TLB flush\n"); + pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others_ex; + } +} + +void hyper_alloc_mmu(void) +{ + if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED)) + return; + + if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) + pcpu_flush = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); + else + pcpu_flush_ex = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); +} diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 724153797209..e0bb46c02857 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -226,7 +226,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, if (ksig->ka.sa.sa_flags & SA_ONSTACK) sp = sigsp(sp, ksig); /* This is the legacy signal stack switching. */ - else if ((regs->ss & 0xffff) != __USER32_DS && + else if (regs->ss != __USER32_DS && !(ksig->ka.sa.sa_flags & SA_RESTORER) && ksig->ka.sa.sa_restorer) sp = (unsigned long) ksig->ka.sa.sa_restorer; diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 2efc768e4362..72d867f6b518 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -150,8 +150,6 @@ static inline void disable_acpi(void) { } extern int x86_acpi_numa_init(void); #endif /* CONFIG_ACPI_NUMA */ -#define acpi_unlazy_tlb(x) leave_mm(x) - #ifdef CONFIG_ACPI_APEI static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) { @@ -162,12 +160,13 @@ static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) * you call efi_mem_attributes() during boot and at runtime, * you could theoretically see different attributes. * - * Since we are yet to see any x86 platforms that require - * anything other than PAGE_KERNEL (some arm64 platforms - * require the equivalent of PAGE_KERNEL_NOCACHE), return that - * until we know differently. + * We are yet to see any x86 platforms that require anything + * other than PAGE_KERNEL (some ARM64 platforms require the + * equivalent of PAGE_KERNEL_NOCACHE). Additionally, if SME + * is active, the ACPI information will not be encrypted, + * so return PAGE_KERNEL_NOENC until we know differently. */ - return PAGE_KERNEL; + return PAGE_KERNEL_NOENC; } #endif diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 7a9df3beb89b..676ee5807d86 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -74,6 +74,9 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) +# define _ASM_EXTABLE_REFCOUNT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) + # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ _ASM_ALIGN ; \ @@ -123,6 +126,9 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) +# define _ASM_EXTABLE_REFCOUNT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) + /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 33380b871463..0874ebda3069 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -197,35 +197,56 @@ static inline int atomic_xchg(atomic_t *v, int new) return xchg(&v->counter, new); } -#define ATOMIC_OP(op) \ -static inline void atomic_##op(int i, atomic_t *v) \ -{ \ - asm volatile(LOCK_PREFIX #op"l %1,%0" \ - : "+m" (v->counter) \ - : "ir" (i) \ - : "memory"); \ +static inline void atomic_and(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "andl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); +} + +static inline int atomic_fetch_and(int i, atomic_t *v) +{ + int val = atomic_read(v); + + do { } while (!atomic_try_cmpxchg(v, &val, val & i)); + + return val; } -#define ATOMIC_FETCH_OP(op, c_op) \ -static inline int atomic_fetch_##op(int i, atomic_t *v) \ -{ \ - int val = atomic_read(v); \ - do { \ - } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \ - return val; \ +static inline void atomic_or(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "orl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); } -#define ATOMIC_OPS(op, c_op) \ - ATOMIC_OP(op) \ - ATOMIC_FETCH_OP(op, c_op) +static inline int atomic_fetch_or(int i, atomic_t *v) +{ + int val = atomic_read(v); -ATOMIC_OPS(and, &) -ATOMIC_OPS(or , |) -ATOMIC_OPS(xor, ^) + do { } while (!atomic_try_cmpxchg(v, &val, val | i)); -#undef ATOMIC_OPS -#undef ATOMIC_FETCH_OP -#undef ATOMIC_OP + return val; +} + +static inline void atomic_xor(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "xorl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); +} + +static inline int atomic_fetch_xor(int i, atomic_t *v) +{ + int val = atomic_read(v); + + do { } while (!atomic_try_cmpxchg(v, &val, val ^ i)); + + return val; +} /** * __atomic_add_unless - add unless the number is already a given value @@ -239,10 +260,12 @@ ATOMIC_OPS(xor, ^) static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) { int c = atomic_read(v); + do { if (unlikely(c == u)) break; } while (!atomic_try_cmpxchg(v, &c, c + a)); + return c; } diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 71d7705fb303..9e206f31ce2a 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -312,37 +312,70 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) #undef alternative_atomic64 #undef __alternative_atomic64 -#define ATOMIC64_OP(op, c_op) \ -static inline void atomic64_##op(long long i, atomic64_t *v) \ -{ \ - long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ - c = old; \ +static inline void atomic64_and(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + c = old; } -#define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long long atomic64_fetch_##op(long long i, atomic64_t *v) \ -{ \ - long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ - c = old; \ - return old; \ +static inline long long atomic64_fetch_and(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + c = old; + + return old; } -ATOMIC64_FETCH_OP(add, +) +static inline void atomic64_or(long long i, atomic64_t *v) +{ + long long old, c = 0; -#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) + while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + c = old; +} + +static inline long long atomic64_fetch_or(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + c = old; + + return old; +} -#define ATOMIC64_OPS(op, c_op) \ - ATOMIC64_OP(op, c_op) \ - ATOMIC64_FETCH_OP(op, c_op) +static inline void atomic64_xor(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + c = old; +} -ATOMIC64_OPS(and, &) -ATOMIC64_OPS(or, |) -ATOMIC64_OPS(xor, ^) +static inline long long atomic64_fetch_xor(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + c = old; + + return old; +} -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP +static inline long long atomic64_fetch_add(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c + i)) != c) + c = old; + + return old; +} + +#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) #endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 6189a433c9a9..5d9de36a2f04 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -177,7 +177,7 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) } #define atomic64_try_cmpxchg atomic64_try_cmpxchg -static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new) +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new) { return try_cmpxchg(&v->counter, old, new); } @@ -198,7 +198,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new) */ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) { - long c = atomic64_read(v); + s64 c = atomic64_read(v); do { if (unlikely(c == u)) return false; @@ -217,7 +217,7 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) */ static inline long atomic64_dec_if_positive(atomic64_t *v) { - long dec, c = atomic64_read(v); + s64 dec, c = atomic64_read(v); do { dec = c - 1; if (unlikely(dec < 0)) @@ -226,34 +226,55 @@ static inline long atomic64_dec_if_positive(atomic64_t *v) return dec; } -#define ATOMIC64_OP(op) \ -static inline void atomic64_##op(long i, atomic64_t *v) \ -{ \ - asm volatile(LOCK_PREFIX #op"q %1,%0" \ - : "+m" (v->counter) \ - : "er" (i) \ - : "memory"); \ +static inline void atomic64_and(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "andq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); } -#define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ -{ \ - long val = atomic64_read(v); \ - do { \ - } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \ - return val; \ +static inline long atomic64_fetch_and(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); + + do { + } while (!atomic64_try_cmpxchg(v, &val, val & i)); + return val; } -#define ATOMIC64_OPS(op, c_op) \ - ATOMIC64_OP(op) \ - ATOMIC64_FETCH_OP(op, c_op) +static inline void atomic64_or(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "orq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); +} -ATOMIC64_OPS(and, &) -ATOMIC64_OPS(or, |) -ATOMIC64_OPS(xor, ^) +static inline long atomic64_fetch_or(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP + do { + } while (!atomic64_try_cmpxchg(v, &val, val | i)); + return val; +} + +static inline void atomic64_xor(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "xorq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); +} + +static inline long atomic64_fetch_xor(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); + + do { + } while (!atomic64_try_cmpxchg(v, &val, val ^ i)); + return val; +} #endif /* _ASM_X86_ATOMIC64_64_H */ diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h index e01f7f7ccb0c..84ae170bc3d0 100644 --- a/arch/x86/include/asm/cmdline.h +++ b/arch/x86/include/asm/cmdline.h @@ -2,5 +2,7 @@ #define _ASM_X86_CMDLINE_H int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); +int cmdline_find_option(const char *cmdline_ptr, const char *option, + char *buffer, int bufsize); #endif /* _ASM_X86_CMDLINE_H */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index d90296d061e8..b5069e802d5c 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -157,7 +157,7 @@ extern void __add_wrong_size(void) #define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \ ({ \ bool success; \ - __typeof__(_ptr) _old = (_pold); \ + __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ switch (size) { \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 5a28e8e55e36..42bbbf0f173d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -177,7 +177,7 @@ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ #define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ #define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ -#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */ #define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ /* @@ -196,6 +196,7 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index d0a21b12dd58..1a2ba368da39 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -5,6 +5,7 @@ #include <asm/ldt.h> #include <asm/mmu.h> #include <asm/fixmap.h> +#include <asm/irq_vectors.h> #include <linux/smp.h> #include <linux/percpu.h> @@ -22,7 +23,7 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in desc->s = 1; desc->dpl = 0x3; desc->p = info->seg_not_present ^ 1; - desc->limit = (info->limit & 0xf0000) >> 16; + desc->limit1 = (info->limit & 0xf0000) >> 16; desc->avl = info->useable; desc->d = info->seg_32bit; desc->g = info->limit_in_pages; @@ -83,33 +84,25 @@ static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu) return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu)); } -#ifdef CONFIG_X86_64 - static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, unsigned dpl, unsigned ist, unsigned seg) { - gate->offset_low = PTR_LOW(func); + gate->offset_low = (u16) func; + gate->bits.p = 1; + gate->bits.dpl = dpl; + gate->bits.zero = 0; + gate->bits.type = type; + gate->offset_middle = (u16) (func >> 16); +#ifdef CONFIG_X86_64 gate->segment = __KERNEL_CS; - gate->ist = ist; - gate->p = 1; - gate->dpl = dpl; - gate->zero0 = 0; - gate->zero1 = 0; - gate->type = type; - gate->offset_middle = PTR_MIDDLE(func); - gate->offset_high = PTR_HIGH(func); -} - + gate->bits.ist = ist; + gate->reserved = 0; + gate->offset_high = (u32) (func >> 32); #else -static inline void pack_gate(gate_desc *gate, unsigned char type, - unsigned long base, unsigned dpl, unsigned flags, - unsigned short seg) -{ - gate->a = (seg << 16) | (base & 0xffff); - gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8); -} - + gate->segment = seg; + gate->bits.ist = 0; #endif +} static inline int desc_empty(const void *ptr) { @@ -173,35 +166,22 @@ native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int memcpy(&gdt[entry], desc, size); } -static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, - unsigned long limit, unsigned char type, - unsigned char flags) -{ - desc->a = ((base & 0xffff) << 16) | (limit & 0xffff); - desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) | - (limit & 0x000f0000) | ((type & 0xff) << 8) | - ((flags & 0xf) << 20); - desc->p = 1; -} - - -static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size) +static inline void set_tssldt_descriptor(void *d, unsigned long addr, + unsigned type, unsigned size) { -#ifdef CONFIG_X86_64 - struct ldttss_desc64 *desc = d; + struct ldttss_desc *desc = d; memset(desc, 0, sizeof(*desc)); - desc->limit0 = size & 0xFFFF; - desc->base0 = PTR_LOW(addr); - desc->base1 = PTR_MIDDLE(addr) & 0xFF; + desc->limit0 = (u16) size; + desc->base0 = (u16) addr; + desc->base1 = (addr >> 16) & 0xFF; desc->type = type; desc->p = 1; desc->limit1 = (size >> 16) & 0xF; - desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; - desc->base3 = PTR_HIGH(addr); -#else - pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); + desc->base2 = (addr >> 24) & 0xFF; +#ifdef CONFIG_X86_64 + desc->base3 = (u32) (addr >> 32); #endif } @@ -401,147 +381,20 @@ static inline void set_desc_base(struct desc_struct *desc, unsigned long base) static inline unsigned long get_desc_limit(const struct desc_struct *desc) { - return desc->limit0 | (desc->limit << 16); + return desc->limit0 | (desc->limit1 << 16); } static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) { desc->limit0 = limit & 0xffff; - desc->limit = (limit >> 16) & 0xf; -} - -#ifdef CONFIG_X86_64 -static inline void set_nmi_gate(int gate, void *addr) -{ - gate_desc s; - - pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS); - write_idt_entry(debug_idt_table, gate, &s); + desc->limit1 = (limit >> 16) & 0xf; } -#endif -#ifdef CONFIG_TRACING -extern struct desc_ptr trace_idt_descr; -extern gate_desc trace_idt_table[]; -static inline void write_trace_idt_entry(int entry, const gate_desc *gate) -{ - write_idt_entry(trace_idt_table, entry, gate); -} +void update_intr_gate(unsigned int n, const void *addr); +void alloc_intr_gate(unsigned int n, const void *addr); -static inline void _trace_set_gate(int gate, unsigned type, void *addr, - unsigned dpl, unsigned ist, unsigned seg) -{ - gate_desc s; - - pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); - /* - * does not need to be atomic because it is only done once at - * setup time - */ - write_trace_idt_entry(gate, &s); -} -#else -static inline void write_trace_idt_entry(int entry, const gate_desc *gate) -{ -} - -#define _trace_set_gate(gate, type, addr, dpl, ist, seg) -#endif - -static inline void _set_gate(int gate, unsigned type, void *addr, - unsigned dpl, unsigned ist, unsigned seg) -{ - gate_desc s; - - pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); - /* - * does not need to be atomic because it is only done once at - * setup time - */ - write_idt_entry(idt_table, gate, &s); - write_trace_idt_entry(gate, &s); -} - -/* - * This needs to use 'idt_table' rather than 'idt', and - * thus use the _nonmapped_ version of the IDT, as the - * Pentium F0 0F bugfix can have resulted in the mapped - * IDT being write-protected. - */ -#define set_intr_gate_notrace(n, addr) \ - do { \ - BUG_ON((unsigned)n > 0xFF); \ - _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \ - __KERNEL_CS); \ - } while (0) - -#define set_intr_gate(n, addr) \ - do { \ - set_intr_gate_notrace(n, addr); \ - _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\ - 0, 0, __KERNEL_CS); \ - } while (0) - -extern int first_system_vector; -/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */ extern unsigned long used_vectors[]; -static inline void alloc_system_vector(int vector) -{ - if (!test_bit(vector, used_vectors)) { - set_bit(vector, used_vectors); - if (first_system_vector > vector) - first_system_vector = vector; - } else { - BUG(); - } -} - -#define alloc_intr_gate(n, addr) \ - do { \ - alloc_system_vector(n); \ - set_intr_gate(n, addr); \ - } while (0) - -/* - * This routine sets up an interrupt gate at directory privilege level 3. - */ -static inline void set_system_intr_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); -} - -static inline void set_system_trap_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS); -} - -static inline void set_trap_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS); -} - -static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3)); -} - -static inline void set_intr_gate_ist(int n, void *addr, unsigned ist) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); -} - -static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); -} - #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u32, debug_idt_ctr); static inline bool is_debug_idt_enabled(void) @@ -567,31 +420,6 @@ static inline void load_debug_idt(void) } #endif -#ifdef CONFIG_TRACING -extern atomic_t trace_idt_ctr; -static inline bool is_trace_idt_enabled(void) -{ - if (atomic_read(&trace_idt_ctr)) - return true; - - return false; -} - -static inline void load_trace_idt(void) -{ - load_idt((const struct desc_ptr *)&trace_idt_descr); -} -#else -static inline bool is_trace_idt_enabled(void) -{ - return false; -} - -static inline void load_trace_idt(void) -{ -} -#endif - /* * The load_current_idt() must be called with interrupts disabled * to avoid races. That way the IDT will always be set back to the expected @@ -603,9 +431,25 @@ static inline void load_current_idt(void) { if (is_debug_idt_enabled()) load_debug_idt(); - else if (is_trace_idt_enabled()) - load_trace_idt(); else load_idt((const struct desc_ptr *)&idt_descr); } + +extern void idt_setup_early_handler(void); +extern void idt_setup_early_traps(void); +extern void idt_setup_traps(void); +extern void idt_setup_apic_and_irq_gates(void); + +#ifdef CONFIG_X86_64 +extern void idt_setup_early_pf(void); +extern void idt_setup_ist_traps(void); +extern void idt_setup_debugidt_traps(void); +#else +static inline void idt_setup_early_pf(void) { } +static inline void idt_setup_ist_traps(void) { } +static inline void idt_setup_debugidt_traps(void) { } +#endif + +extern void idt_invalidate(void *addr); + #endif /* _ASM_X86_DESC_H */ diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 49265345d4d2..346d252029b7 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -11,34 +11,30 @@ #include <linux/types.h> -/* - * FIXME: Accessing the desc_struct through its fields is more elegant, - * and should be the one valid thing to do. However, a lot of open code - * still touches the a and b accessors, and doing this allow us to do it - * incrementally. We keep the signature as a struct, rather than a union, - * so we can get rid of it transparently in the future -- glommer - */ /* 8 byte segment descriptor */ struct desc_struct { - union { - struct { - unsigned int a; - unsigned int b; - }; - struct { - u16 limit0; - u16 base0; - unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1; - unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8; - }; - }; + u16 limit0; + u16 base0; + u16 base1: 8, type: 4, s: 1, dpl: 2, p: 1; + u16 limit1: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8; } __attribute__((packed)); -#define GDT_ENTRY_INIT(flags, base, limit) { { { \ - .a = ((limit) & 0xffff) | (((base) & 0xffff) << 16), \ - .b = (((base) & 0xff0000) >> 16) | (((flags) & 0xf0ff) << 8) | \ - ((limit) & 0xf0000) | ((base) & 0xff000000), \ - } } } +#define GDT_ENTRY_INIT(flags, base, limit) \ + { \ + .limit0 = (u16) (limit), \ + .limit1 = ((limit) >> 16) & 0x0F, \ + .base0 = (u16) (base), \ + .base1 = ((base) >> 16) & 0xFF, \ + .base2 = ((base) >> 24) & 0xFF, \ + .type = (flags & 0x0f), \ + .s = (flags >> 4) & 0x01, \ + .dpl = (flags >> 5) & 0x03, \ + .p = (flags >> 7) & 0x01, \ + .avl = (flags >> 12) & 0x01, \ + .l = (flags >> 13) & 0x01, \ + .d = (flags >> 14) & 0x01, \ + .g = (flags >> 15) & 0x01, \ + } enum { GATE_INTERRUPT = 0xE, @@ -47,49 +43,63 @@ enum { GATE_TASK = 0x5, }; -/* 16byte gate */ -struct gate_struct64 { - u16 offset_low; - u16 segment; - unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1; - u16 offset_middle; - u32 offset_high; - u32 zero1; -} __attribute__((packed)); - -#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF) -#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF) -#define PTR_HIGH(x) ((unsigned long long)(x) >> 32) - enum { DESC_TSS = 0x9, DESC_LDT = 0x2, DESCTYPE_S = 0x10, /* !system */ }; -/* LDT or TSS descriptor in the GDT. 16 bytes. */ -struct ldttss_desc64 { - u16 limit0; - u16 base0; - unsigned base1 : 8, type : 5, dpl : 2, p : 1; - unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; - u32 base3; - u32 zero1; +/* LDT or TSS descriptor in the GDT. */ +struct ldttss_desc { + u16 limit0; + u16 base0; + + u16 base1 : 8, type : 5, dpl : 2, p : 1; + u16 limit1 : 4, zero0 : 3, g : 1, base2 : 8; +#ifdef CONFIG_X86_64 + u32 base3; + u32 zero1; +#endif } __attribute__((packed)); +typedef struct ldttss_desc ldt_desc; +typedef struct ldttss_desc tss_desc; + +struct idt_bits { + u16 ist : 3, + zero : 5, + type : 5, + dpl : 2, + p : 1; +} __attribute__((packed)); + +struct gate_struct { + u16 offset_low; + u16 segment; + struct idt_bits bits; + u16 offset_middle; +#ifdef CONFIG_X86_64 + u32 offset_high; + u32 reserved; +#endif +} __attribute__((packed)); + +typedef struct gate_struct gate_desc; + +static inline unsigned long gate_offset(const gate_desc *g) +{ #ifdef CONFIG_X86_64 -typedef struct gate_struct64 gate_desc; -typedef struct ldttss_desc64 ldt_desc; -typedef struct ldttss_desc64 tss_desc; -#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32)) -#define gate_segment(g) ((g).segment) + return g->offset_low | ((unsigned long)g->offset_middle << 16) | + ((unsigned long) g->offset_high << 32); #else -typedef struct desc_struct gate_desc; -typedef struct desc_struct ldt_desc; -typedef struct desc_struct tss_desc; -#define gate_offset(g) (((g).b & 0xffff0000) | ((g).a & 0x0000ffff)) -#define gate_segment(g) ((g).a >> 16) + return g->offset_low | ((unsigned long)g->offset_middle << 16); #endif +} + +static inline unsigned long gate_segment(const gate_desc *g) +{ + return g->segment; +} struct desc_ptr { unsigned short size; diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 5dff775af7cd..c10c9128f54e 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -21,11 +21,13 @@ # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) # define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) # define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) +# define DISABLE_PCID 0 #else # define DISABLE_VME 0 # define DISABLE_K6_MTRR 0 # define DISABLE_CYRIX_ARR 0 # define DISABLE_CENTAUR_MCR 0 +# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) #endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS @@ -49,7 +51,7 @@ #define DISABLED_MASK1 0 #define DISABLED_MASK2 0 #define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) -#define DISABLED_MASK4 0 +#define DISABLED_MASK4 (DISABLE_PCID) #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 #define DISABLED_MASK7 0 diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 398c79889f5c..1387dafdba2d 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -12,6 +12,7 @@ #include <asm/io.h> #include <asm/swiotlb.h> #include <linux/dma-contiguous.h> +#include <linux/mem_encrypt.h> #ifdef CONFIG_ISA # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) @@ -57,12 +58,12 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { - return paddr; + return __sme_set(paddr); } static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) { - return daddr; + return __sme_clr(daddr); } #endif /* CONFIG_X86_DMA_REMAP */ diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h index 3c69fed215c5..a8e15b04565b 100644 --- a/arch/x86/include/asm/dmi.h +++ b/arch/x86/include/asm/dmi.h @@ -13,9 +13,9 @@ static __always_inline __init void *dmi_alloc(unsigned len) } /* Use early IO mappings for DMI because it's initialized early */ -#define dmi_early_remap early_ioremap -#define dmi_early_unmap early_iounmap -#define dmi_remap ioremap_cache -#define dmi_unmap iounmap +#define dmi_early_remap early_memremap +#define dmi_early_unmap early_memunmap +#define dmi_remap(_x, _l) memremap(_x, _l, MEMREMAP_WB) +#define dmi_unmap(_x) memunmap(_x) #endif /* _ASM_X86_DMI_H */ diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index a504adc661a4..cd266d830e49 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -39,6 +39,8 @@ extern void e820__setup_pci_gap(void); extern void e820__reallocate_tables(void); extern void e820__register_nosave_regions(unsigned long limit_pfn); +extern int e820__get_entry_type(u64 start, u64 end); + /* * Returns true iff the specified range [start,end) is completely contained inside * the ISA region. diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 9aeb91935ce0..04330c8d9af9 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -126,15 +126,15 @@ do { \ pr_reg[4] = regs->di; \ pr_reg[5] = regs->bp; \ pr_reg[6] = regs->ax; \ - pr_reg[7] = regs->ds & 0xffff; \ - pr_reg[8] = regs->es & 0xffff; \ - pr_reg[9] = regs->fs & 0xffff; \ + pr_reg[7] = regs->ds; \ + pr_reg[8] = regs->es; \ + pr_reg[9] = regs->fs; \ pr_reg[11] = regs->orig_ax; \ pr_reg[12] = regs->ip; \ - pr_reg[13] = regs->cs & 0xffff; \ + pr_reg[13] = regs->cs; \ pr_reg[14] = regs->flags; \ pr_reg[15] = regs->sp; \ - pr_reg[16] = regs->ss & 0xffff; \ + pr_reg[16] = regs->ss; \ } while (0); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ @@ -204,6 +204,7 @@ void set_personality_ia32(bool); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ + unsigned long base; \ unsigned v; \ (pr_reg)[0] = (regs)->r15; \ (pr_reg)[1] = (regs)->r14; \ @@ -226,8 +227,8 @@ do { \ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ - (pr_reg)[21] = current->thread.fsbase; \ - (pr_reg)[22] = current->thread.gsbase; \ + rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \ + rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ @@ -304,8 +305,8 @@ static inline int mmap_is_ia32(void) test_thread_flag(TIF_ADDR32)); } -extern unsigned long tasksize_32bit(void); -extern unsigned long tasksize_64bit(void); +extern unsigned long task_size_32bit(void); +extern unsigned long task_size_64bit(int full_addr_space); extern unsigned long get_mmap_base(int is_legacy); #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 07b06955a05d..aa15d1f7e530 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -13,20 +13,14 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) -BUILD_INTERRUPT3(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR, - smp_irq_move_cleanup_interrupt) -BUILD_INTERRUPT3(reboot_interrupt, REBOOT_VECTOR, smp_reboot_interrupt) +BUILD_INTERRUPT(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR) +BUILD_INTERRUPT(reboot_interrupt, REBOOT_VECTOR) #endif -BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) - #ifdef CONFIG_HAVE_KVM -BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR, - smp_kvm_posted_intr_ipi) -BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR, - smp_kvm_posted_intr_wakeup_ipi) -BUILD_INTERRUPT3(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR, - smp_kvm_posted_intr_nested_ipi) +BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR) +BUILD_INTERRUPT(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR) +BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR) #endif /* @@ -41,6 +35,7 @@ BUILD_INTERRUPT3(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR, BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) +BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) #ifdef CONFIG_IRQ_WORK BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR) diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index b65155cc3760..dcd9fb55e679 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -157,6 +157,26 @@ static inline void __set_fixmap(enum fixed_addresses idx, } #endif +/* + * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not + * supported for MMIO addresses, so make sure that the memory encryption + * mask is not part of the page attributes. + */ +#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE + +/* + * Early memremap routines used for in-place encryption. The mappings created + * by these routines are intended to be used as temporary mappings. + */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size); + #include <asm-generic/fixmap.h> #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags) diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index b4c1f5453436..f4dc9b63bdda 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -41,20 +41,11 @@ "+m" (*uaddr), "=&r" (tem) \ : "r" (oparg), "i" (-EFAULT), "1" (0)) -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tem; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -80,30 +71,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index d6dbafbd4207..6dfe366a8804 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -46,26 +46,6 @@ extern asmlinkage void deferred_error_interrupt(void); extern asmlinkage void call_function_interrupt(void); extern asmlinkage void call_function_single_interrupt(void); -#ifdef CONFIG_TRACING -/* Interrupt handlers registered during init_IRQ */ -extern void trace_apic_timer_interrupt(void); -extern void trace_x86_platform_ipi(void); -extern void trace_error_interrupt(void); -extern void trace_irq_work_interrupt(void); -extern void trace_spurious_interrupt(void); -extern void trace_thermal_interrupt(void); -extern void trace_reschedule_interrupt(void); -extern void trace_threshold_interrupt(void); -extern void trace_deferred_error_interrupt(void); -extern void trace_call_function_interrupt(void); -extern void trace_call_function_single_interrupt(void); -#define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt -#define trace_reboot_interrupt reboot_interrupt -#define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi -#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi -#define trace_kvm_posted_intr_nested_ipi kvm_posted_intr_nested_ipi -#endif /* CONFIG_TRACING */ - #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; struct pci_dev; diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 474eb8c66fee..05c4aa00cc86 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -7,6 +7,7 @@ struct x86_mapping_info { unsigned long page_flag; /* page flag for PMD or PUD entry */ unsigned long offset; /* ident mapping offset */ bool direct_gbpages; /* PUD level 1GB page support */ + unsigned long kernpg_flag; /* kernel pagetable flag override */ }; int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h deleted file mode 100644 index 597dc4995678..000000000000 --- a/arch/x86/include/asm/intel_rdt.h +++ /dev/null @@ -1,286 +0,0 @@ -#ifndef _ASM_X86_INTEL_RDT_H -#define _ASM_X86_INTEL_RDT_H - -#ifdef CONFIG_INTEL_RDT_A - -#include <linux/sched.h> -#include <linux/kernfs.h> -#include <linux/jump_label.h> - -#include <asm/intel_rdt_common.h> - -#define IA32_L3_QOS_CFG 0xc81 -#define IA32_L3_CBM_BASE 0xc90 -#define IA32_L2_CBM_BASE 0xd10 -#define IA32_MBA_THRTL_BASE 0xd50 - -#define L3_QOS_CDP_ENABLE 0x01ULL - -/** - * struct rdtgroup - store rdtgroup's data in resctrl file system. - * @kn: kernfs node - * @rdtgroup_list: linked list for all rdtgroups - * @closid: closid for this rdtgroup - * @cpu_mask: CPUs assigned to this rdtgroup - * @flags: status bits - * @waitcount: how many cpus expect to find this - * group when they acquire rdtgroup_mutex - */ -struct rdtgroup { - struct kernfs_node *kn; - struct list_head rdtgroup_list; - int closid; - struct cpumask cpu_mask; - int flags; - atomic_t waitcount; -}; - -/* rdtgroup.flags */ -#define RDT_DELETED 1 - -/* rftype.flags */ -#define RFTYPE_FLAGS_CPUS_LIST 1 - -/* List of all resource groups */ -extern struct list_head rdt_all_groups; - -extern int max_name_width, max_data_width; - -int __init rdtgroup_init(void); - -/** - * struct rftype - describe each file in the resctrl file system - * @name: File name - * @mode: Access mode - * @kf_ops: File operations - * @flags: File specific RFTYPE_FLAGS_* flags - * @seq_show: Show content of the file - * @write: Write to the file - */ -struct rftype { - char *name; - umode_t mode; - struct kernfs_ops *kf_ops; - unsigned long flags; - - int (*seq_show)(struct kernfs_open_file *of, - struct seq_file *sf, void *v); - /* - * write() is the generic write callback which maps directly to - * kernfs write operation and overrides all other operations. - * Maximum write size is determined by ->max_write_len. - */ - ssize_t (*write)(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -}; - -/** - * struct rdt_domain - group of cpus sharing an RDT resource - * @list: all instances of this resource - * @id: unique id for this instance - * @cpu_mask: which cpus share this resource - * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) - * @new_ctrl: new ctrl value to be loaded - * @have_new_ctrl: did user provide new_ctrl for this domain - */ -struct rdt_domain { - struct list_head list; - int id; - struct cpumask cpu_mask; - u32 *ctrl_val; - u32 new_ctrl; - bool have_new_ctrl; -}; - -/** - * struct msr_param - set a range of MSRs from a domain - * @res: The resource to use - * @low: Beginning index from base MSR - * @high: End index - */ -struct msr_param { - struct rdt_resource *res; - int low; - int high; -}; - -/** - * struct rdt_cache - Cache allocation related data - * @cbm_len: Length of the cache bit mask - * @min_cbm_bits: Minimum number of consecutive bits to be set - * @cbm_idx_mult: Multiplier of CBM index - * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: - * closid * cbm_idx_multi + cbm_idx_offset - * in a cache bit mask - */ -struct rdt_cache { - unsigned int cbm_len; - unsigned int min_cbm_bits; - unsigned int cbm_idx_mult; - unsigned int cbm_idx_offset; -}; - -/** - * struct rdt_membw - Memory bandwidth allocation related data - * @max_delay: Max throttle delay. Delay is the hardware - * representation for memory bandwidth. - * @min_bw: Minimum memory bandwidth percentage user can request - * @bw_gran: Granularity at which the memory bandwidth is allocated - * @delay_linear: True if memory B/W delay is in linear scale - * @mb_map: Mapping of memory B/W percentage to memory B/W delay - */ -struct rdt_membw { - u32 max_delay; - u32 min_bw; - u32 bw_gran; - u32 delay_linear; - u32 *mb_map; -}; - -/** - * struct rdt_resource - attributes of an RDT resource - * @enabled: Is this feature enabled on this machine - * @capable: Is this feature available on this machine - * @name: Name to use in "schemata" file - * @num_closid: Number of CLOSIDs available - * @cache_level: Which cache level defines scope of this resource - * @default_ctrl: Specifies default cache cbm or memory B/W percent. - * @msr_base: Base MSR address for CBMs - * @msr_update: Function pointer to update QOS MSRs - * @data_width: Character width of data when displaying - * @domains: All domains for this resource - * @cache: Cache allocation related data - * @info_files: resctrl info files for the resource - * @nr_info_files: Number of info files - * @format_str: Per resource format string to show domain value - * @parse_ctrlval: Per resource function pointer to parse control values - */ -struct rdt_resource { - bool enabled; - bool capable; - char *name; - int num_closid; - int cache_level; - u32 default_ctrl; - unsigned int msr_base; - void (*msr_update) (struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); - int data_width; - struct list_head domains; - struct rdt_cache cache; - struct rdt_membw membw; - struct rftype *info_files; - int nr_info_files; - const char *format_str; - int (*parse_ctrlval) (char *buf, struct rdt_resource *r, - struct rdt_domain *d); -}; - -void rdt_get_cache_infofile(struct rdt_resource *r); -void rdt_get_mba_infofile(struct rdt_resource *r); -int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); - -extern struct mutex rdtgroup_mutex; - -extern struct rdt_resource rdt_resources_all[]; -extern struct rdtgroup rdtgroup_default; -DECLARE_STATIC_KEY_FALSE(rdt_enable_key); - -int __init rdtgroup_init(void); - -enum { - RDT_RESOURCE_L3, - RDT_RESOURCE_L3DATA, - RDT_RESOURCE_L3CODE, - RDT_RESOURCE_L2, - RDT_RESOURCE_MBA, - - /* Must be the last */ - RDT_NUM_RESOURCES, -}; - -#define for_each_capable_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ - if (r->capable) - -#define for_each_enabled_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ - if (r->enabled) - -/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ -union cpuid_0x10_1_eax { - struct { - unsigned int cbm_len:5; - } split; - unsigned int full; -}; - -/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ -union cpuid_0x10_3_eax { - struct { - unsigned int max_delay:12; - } split; - unsigned int full; -}; - -/* CPUID.(EAX=10H, ECX=ResID).EDX */ -union cpuid_0x10_x_edx { - struct { - unsigned int cos_max:16; - } split; - unsigned int full; -}; - -DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); - -void rdt_ctrl_update(void *arg); -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); -void rdtgroup_kn_unlock(struct kernfs_node *kn); -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v); - -/* - * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR - * - * Following considerations are made so that this has minimal impact - * on scheduler hot path: - * - This will stay as no-op unless we are running on an Intel SKU - * which supports resource control and we enable by mounting the - * resctrl file system. - * - Caches the per cpu CLOSid values and does the MSR write only - * when a task with a different CLOSid is scheduled in. - * - * Must be called with preemption disabled. - */ -static inline void intel_rdt_sched_in(void) -{ - if (static_branch_likely(&rdt_enable_key)) { - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - int closid; - - /* - * If this task has a closid assigned, use it. - * Else use the closid assigned to this cpu. - */ - closid = current->closid; - if (closid == 0) - closid = this_cpu_read(cpu_closid); - - if (closid != state->closid) { - state->closid = closid; - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid); - } - } -} - -#else - -static inline void intel_rdt_sched_in(void) {} - -#endif /* CONFIG_INTEL_RDT_A */ -#endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h deleted file mode 100644 index b31081b89407..000000000000 --- a/arch/x86/include/asm/intel_rdt_common.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _ASM_X86_INTEL_RDT_COMMON_H -#define _ASM_X86_INTEL_RDT_COMMON_H - -#define MSR_IA32_PQR_ASSOC 0x0c8f - -/** - * struct intel_pqr_state - State cache for the PQR MSR - * @rmid: The cached Resource Monitoring ID - * @closid: The cached Class Of Service ID - * @rmid_usecnt: The usage counter for rmid - * - * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the - * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always - * contains both parts, so we need to cache them. - * - * The cache also helps to avoid pointless updates if the value does - * not change. - */ -struct intel_pqr_state { - u32 rmid; - u32 closid; - int rmid_usecnt; -}; - -DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); - -#endif /* _ASM_X86_INTEL_RDT_COMMON_H */ diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h new file mode 100644 index 000000000000..b4bbf8b21512 --- /dev/null +++ b/arch/x86/include/asm/intel_rdt_sched.h @@ -0,0 +1,92 @@ +#ifndef _ASM_X86_INTEL_RDT_SCHED_H +#define _ASM_X86_INTEL_RDT_SCHED_H + +#ifdef CONFIG_INTEL_RDT + +#include <linux/sched.h> +#include <linux/jump_label.h> + +#define IA32_PQR_ASSOC 0x0c8f + +/** + * struct intel_pqr_state - State cache for the PQR MSR + * @cur_rmid: The cached Resource Monitoring ID + * @cur_closid: The cached Class Of Service ID + * @default_rmid: The user assigned Resource Monitoring ID + * @default_closid: The user assigned cached Class Of Service ID + * + * The upper 32 bits of IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to IA32_PQR_ASSOC always + * contains both parts, so we need to cache them. This also + * stores the user configured per cpu CLOSID and RMID. + * + * The cache also helps to avoid pointless updates if the value does + * not change. + */ +struct intel_pqr_state { + u32 cur_rmid; + u32 cur_closid; + u32 default_rmid; + u32 default_closid; +}; + +DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); + +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); + +/* + * __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR + * + * Following considerations are made so that this has minimal impact + * on scheduler hot path: + * - This will stay as no-op unless we are running on an Intel SKU + * which supports resource control or monitoring and we enable by + * mounting the resctrl file system. + * - Caches the per cpu CLOSid/RMID values and does the MSR write only + * when a task with a different CLOSid/RMID is scheduled in. + * - We allocate RMIDs/CLOSids globally in order to keep this as + * simple as possible. + * Must be called with preemption disabled. + */ +static void __intel_rdt_sched_in(void) +{ + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + u32 closid = state->default_closid; + u32 rmid = state->default_rmid; + + /* + * If this task has a closid/rmid assigned, use it. + * Else use the closid/rmid assigned to this cpu. + */ + if (static_branch_likely(&rdt_alloc_enable_key)) { + if (current->closid) + closid = current->closid; + } + + if (static_branch_likely(&rdt_mon_enable_key)) { + if (current->rmid) + rmid = current->rmid; + } + + if (closid != state->cur_closid || rmid != state->cur_rmid) { + state->cur_closid = closid; + state->cur_rmid = rmid; + wrmsr(IA32_PQR_ASSOC, rmid, closid); + } +} + +static inline void intel_rdt_sched_in(void) +{ + if (static_branch_likely(&rdt_enable_key)) + __intel_rdt_sched_in(); +} + +#else + +static inline void intel_rdt_sched_in(void) {} + +#endif /* CONFIG_INTEL_RDT */ + +#endif /* _ASM_X86_INTEL_RDT_SCHED_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 48febf07e828..c40a95c33bb8 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -69,6 +69,9 @@ build_mmio_write(__writeb, "b", unsigned char, "q", ) build_mmio_write(__writew, "w", unsigned short, "r", ) build_mmio_write(__writel, "l", unsigned int, "r", ) +#define readb readb +#define readw readw +#define readl readl #define readb_relaxed(a) __readb(a) #define readw_relaxed(a) __readw(a) #define readl_relaxed(a) __readl(a) @@ -76,6 +79,9 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #define __raw_readw __readw #define __raw_readl __readl +#define writeb writeb +#define writew writew +#define writel writel #define writeb_relaxed(v, a) __writeb(v, a) #define writew_relaxed(v, a) __writew(v, a) #define writel_relaxed(v, a) __writel(v, a) @@ -88,13 +94,15 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #ifdef CONFIG_X86_64 build_mmio_read(readq, "q", unsigned long, "=r", :"memory") +build_mmio_read(__readq, "q", unsigned long, "=r", ) build_mmio_write(writeq, "q", unsigned long, "r", :"memory") +build_mmio_write(__writeq, "q", unsigned long, "r", ) -#define readq_relaxed(a) readq(a) -#define writeq_relaxed(v, a) writeq(v, a) +#define readq_relaxed(a) __readq(a) +#define writeq_relaxed(v, a) __writeq(v, a) -#define __raw_readq(a) readq(a) -#define __raw_writeq(val, addr) writeq(val, addr) +#define __raw_readq __readq +#define __raw_writeq __writeq /* Let people know that we have them */ #define readq readq @@ -119,6 +127,7 @@ static inline phys_addr_t virt_to_phys(volatile void *address) { return __pa(address); } +#define virt_to_phys virt_to_phys /** * phys_to_virt - map physical address to virtual @@ -137,6 +146,7 @@ static inline void *phys_to_virt(phys_addr_t address) { return __va(address); } +#define phys_to_virt phys_to_virt /* * Change "struct page" to physical address. @@ -169,11 +179,14 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) * else, you probably want one of the following. */ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +#define ioremap_nocache ioremap_nocache extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); #define ioremap_uc ioremap_uc extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +#define ioremap_cache ioremap_cache extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); +#define ioremap_prot ioremap_prot /** * ioremap - map bus memory into CPU space @@ -193,8 +206,10 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) { return ioremap_nocache(offset, size); } +#define ioremap ioremap extern void iounmap(volatile void __iomem *addr); +#define iounmap iounmap extern void set_iounmap_nonlazy(void); @@ -203,53 +218,6 @@ extern void set_iounmap_nonlazy(void); #include <asm-generic/iomap.h> /* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - -/** - * memset_io Set a range of I/O memory to a constant value - * @addr: The beginning of the I/O-memory range to set - * @val: The value to set the memory to - * @count: The number of bytes to set - * - * Set a range of I/O memory to a given value. - */ -static inline void -memset_io(volatile void __iomem *addr, unsigned char val, size_t count) -{ - memset((void __force *)addr, val, count); -} - -/** - * memcpy_fromio Copy a block of data from I/O memory - * @dst: The (RAM) destination for the copy - * @src: The (I/O memory) source for the data - * @count: The number of bytes to copy - * - * Copy a block of data from I/O memory. - */ -static inline void -memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) -{ - memcpy(dst, (const void __force *)src, count); -} - -/** - * memcpy_toio Copy a block of data into I/O memory - * @dst: The (I/O memory) destination for the copy - * @src: The (RAM) source for the data - * @count: The number of bytes to copy - * - * Copy a block of data to I/O memory. - */ -static inline void -memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) -{ - memcpy((void __force *)dst, src, count); -} - -/* * ISA space is 'always mapped' on a typical x86 system, no need to * explicitly ioremap() it. The fact that the ISA IO space is mapped * to PAGE_OFFSET is pure coincidence - it does not mean ISA values @@ -341,13 +309,38 @@ BUILDIO(b, b, char) BUILDIO(w, w, short) BUILDIO(l, , int) +#define inb inb +#define inw inw +#define inl inl +#define inb_p inb_p +#define inw_p inw_p +#define inl_p inl_p +#define insb insb +#define insw insw +#define insl insl + +#define outb outb +#define outw outw +#define outl outl +#define outb_p outb_p +#define outw_p outw_p +#define outl_p outl_p +#define outsb outsb +#define outsw outsw +#define outsl outsl + extern void *xlate_dev_mem_ptr(phys_addr_t phys); extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); +#define xlate_dev_mem_ptr xlate_dev_mem_ptr +#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr + extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, enum page_cache_mode pcm); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); +#define ioremap_wc ioremap_wc extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); +#define ioremap_wt ioremap_wt extern bool is_early_ioremap_ptep(pte_t *ptep); @@ -365,6 +358,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, #define IO_SPACE_LIMIT 0xffff +#include <asm-generic/io.h> +#undef PCI_IOBASE + #ifdef CONFIG_MTRR extern int __must_check arch_phys_wc_index(int handle); #define arch_phys_wc_index arch_phys_wc_index @@ -381,4 +377,12 @@ extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc #endif +extern bool arch_memremap_can_ram_remap(resource_size_t offset, + unsigned long size, + unsigned long flags); +#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap + +extern bool phys_mem_access_encrypted(unsigned long phys_addr, + unsigned long size); + #endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 668cca540025..9958ceea2fa3 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -42,10 +42,6 @@ extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs); extern __visible unsigned int do_IRQ(struct pt_regs *regs); -/* Interrupt vector management */ -extern DECLARE_BITMAP(used_vectors, NR_VECTORS); -extern int vector_used_by_percpu_irq(unsigned int vector); - extern void init_ISA_irqs(void); #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index f70604125286..ddbb8ea0f5a9 100644 --- a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -3,9 +3,17 @@ #include <asm/cpufeature.h> +#ifdef CONFIG_X86_LOCAL_APIC static inline bool arch_irq_work_has_interrupt(void) { return boot_cpu_has(X86_FEATURE_APIC); } +extern void arch_irq_work_raise(void); +#else +static inline bool arch_irq_work_has_interrupt(void) +{ + return false; +} +#endif #endif /* _ASM_IRQ_WORK_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 70ef205489f0..942c1f444da8 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -147,7 +147,8 @@ unsigned long relocate_kernel(unsigned long indirection_page, unsigned long page_list, unsigned long start_address, - unsigned int preserve_context); + unsigned int preserve_context, + unsigned int sme_active); #endif #define ARCH_HAS_KIMAGE_ARCH @@ -207,6 +208,14 @@ struct kexec_entry64_regs { uint64_t r15; uint64_t rip; }; + +extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, + gfp_t gfp); +#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages + +extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages); +#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages + #endif typedef void crash_vmclear_fn(void); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 92c9032502d8..369e41c23f07 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1079,7 +1079,7 @@ void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask); + u64 acc_track_mask, u64 me_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h deleted file mode 100644 index 73d0c9b92087..000000000000 --- a/arch/x86/include/asm/lguest.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef _ASM_X86_LGUEST_H -#define _ASM_X86_LGUEST_H - -#define GDT_ENTRY_LGUEST_CS 10 -#define GDT_ENTRY_LGUEST_DS 11 -#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) -#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) - -#ifndef __ASSEMBLY__ -#include <asm/desc.h> - -#define GUEST_PL 1 - -/* Page for Switcher text itself, then two pages per cpu */ -#define SWITCHER_TEXT_PAGES (1) -#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids) -#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES) - -/* Where we map the Switcher, in both Host and Guest. */ -extern unsigned long switcher_addr; - -/* Found in switcher.S */ -extern unsigned long default_idt_entries[]; - -/* Declarations for definitions in arch/x86/lguest/head_32.S */ -extern char lguest_noirq_iret[]; -extern const char lgstart_cli[], lgend_cli[]; -extern const char lgstart_pushf[], lgend_pushf[]; - -extern void lguest_iret(void); -extern void lguest_init(void); - -struct lguest_regs { - /* Manually saved part. */ - unsigned long eax, ebx, ecx, edx; - unsigned long esi, edi, ebp; - unsigned long gs; - unsigned long fs, ds, es; - unsigned long trapnum, errcode; - /* Trap pushed part */ - unsigned long eip; - unsigned long cs; - unsigned long eflags; - unsigned long esp; - unsigned long ss; -}; - -/* This is a guest-specific page (mapped ro) into the guest. */ -struct lguest_ro_state { - /* Host information we need to restore when we switch back. */ - u32 host_cr3; - struct desc_ptr host_idt_desc; - struct desc_ptr host_gdt_desc; - u32 host_sp; - - /* Fields which are used when guest is running. */ - struct desc_ptr guest_idt_desc; - struct desc_ptr guest_gdt_desc; - struct x86_hw_tss guest_tss; - struct desc_struct guest_idt[IDT_ENTRIES]; - struct desc_struct guest_gdt[GDT_ENTRIES]; -}; - -struct lg_cpu_arch { - /* The GDT entries copied into lguest_ro_state when running. */ - struct desc_struct gdt[GDT_ENTRIES]; - - /* The IDT entries: some copied into lguest_ro_state when running. */ - struct desc_struct idt[IDT_ENTRIES]; - - /* The address of the last guest-visible pagefault (ie. cr2). */ - unsigned long last_pagefault; -}; - -static inline void lguest_set_ts(void) -{ - u32 cr0; - - cr0 = read_cr0(); - if (!(cr0 & 8)) - write_cr0(cr0 | 8); -} - -/* Full 4G segment descriptors, suitable for CS and DS. */ -#define FULL_EXEC_SEGMENT \ - ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)) -#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff)) - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_X86_LGUEST_H */ diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h deleted file mode 100644 index 6c119cfae218..000000000000 --- a/arch/x86/include/asm/lguest_hcall.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Architecture specific portion of the lguest hypercalls */ -#ifndef _ASM_X86_LGUEST_HCALL_H -#define _ASM_X86_LGUEST_HCALL_H - -#define LHCALL_FLUSH_ASYNC 0 -#define LHCALL_LGUEST_INIT 1 -#define LHCALL_SHUTDOWN 2 -#define LHCALL_NEW_PGTABLE 4 -#define LHCALL_FLUSH_TLB 5 -#define LHCALL_LOAD_IDT_ENTRY 6 -#define LHCALL_SET_STACK 7 -#define LHCALL_SET_CLOCKEVENT 9 -#define LHCALL_HALT 10 -#define LHCALL_SET_PMD 13 -#define LHCALL_SET_PTE 14 -#define LHCALL_SET_PGD 15 -#define LHCALL_LOAD_TLS 16 -#define LHCALL_LOAD_GDT_ENTRY 18 -#define LHCALL_SEND_INTERRUPTS 19 - -#define LGUEST_TRAP_ENTRY 0x1F - -/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */ -#define LGUEST_SHUTDOWN_POWEROFF 1 -#define LGUEST_SHUTDOWN_RESTART 2 - -#ifndef __ASSEMBLY__ -#include <asm/hw_irq.h> - -/*G:030 - * But first, how does our Guest contact the Host to ask for privileged - * operations? There are two ways: the direct way is to make a "hypercall", - * to make requests of the Host Itself. - * - * Our hypercall mechanism uses the highest unused trap code (traps 32 and - * above are used by real hardware interrupts). Seventeen hypercalls are - * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. - * If a return value makes sense, it's returned in %eax. - * - * Grossly invalid calls result in Sudden Death at the hands of the vengeful - * Host, rather than returning failure. This reflects Winston Churchill's - * definition of a gentleman: "someone who is only rude intentionally". - */ -static inline unsigned long -hcall(unsigned long call, - unsigned long arg1, unsigned long arg2, unsigned long arg3, - unsigned long arg4) -{ - /* "int" is the Intel instruction to trigger a trap. */ - asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) - /* The call in %eax (aka "a") might be overwritten */ - : "=a"(call) - /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */ - : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4) - /* "memory" means this might write somewhere in memory. - * This isn't true for all calls, but it's safe to tell - * gcc that it might happen so it doesn't get clever. */ - : "memory"); - return call; -} -/*:*/ - -/* Can't use our min() macro here: needs to be a constant */ -#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) - -#define LHCALL_RING_SIZE 64 -struct hcall_args { - /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ - unsigned long arg0, arg1, arg2, arg3, arg4; -}; - -#endif /* !__ASSEMBLY__ */ -#endif /* _ASM_X86_LGUEST_HCALL_H */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h new file mode 100644 index 000000000000..8e618fcf1f7c --- /dev/null +++ b/arch/x86/include/asm/mem_encrypt.h @@ -0,0 +1,80 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __X86_MEM_ENCRYPT_H__ +#define __X86_MEM_ENCRYPT_H__ + +#ifndef __ASSEMBLY__ + +#include <linux/init.h> + +#include <asm/bootparam.h> + +#ifdef CONFIG_AMD_MEM_ENCRYPT + +extern unsigned long sme_me_mask; + +void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr, + unsigned long decrypted_kernel_vaddr, + unsigned long kernel_len, + unsigned long encryption_wa, + unsigned long encryption_pgd); + +void __init sme_early_encrypt(resource_size_t paddr, + unsigned long size); +void __init sme_early_decrypt(resource_size_t paddr, + unsigned long size); + +void __init sme_map_bootdata(char *real_mode_data); +void __init sme_unmap_bootdata(char *real_mode_data); + +void __init sme_early_init(void); + +void __init sme_encrypt_kernel(void); +void __init sme_enable(struct boot_params *bp); + +/* Architecture __weak replacement functions */ +void __init mem_encrypt_init(void); + +void swiotlb_set_mem_attributes(void *vaddr, unsigned long size); + +#else /* !CONFIG_AMD_MEM_ENCRYPT */ + +#define sme_me_mask 0UL + +static inline void __init sme_early_encrypt(resource_size_t paddr, + unsigned long size) { } +static inline void __init sme_early_decrypt(resource_size_t paddr, + unsigned long size) { } + +static inline void __init sme_map_bootdata(char *real_mode_data) { } +static inline void __init sme_unmap_bootdata(char *real_mode_data) { } + +static inline void __init sme_early_init(void) { } + +static inline void __init sme_encrypt_kernel(void) { } +static inline void __init sme_enable(struct boot_params *bp) { } + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + +/* + * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when + * writing to or comparing values from the cr3 register. Having the + * encryption mask set in cr3 enables the PGD entry to be encrypted and + * avoid special case handling of PGD allocations. + */ +#define __sme_pa(x) (__pa(x) | sme_me_mask) +#define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask) + +#endif /* __ASSEMBLY__ */ + +#endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 79b647a7ebd0..bb8c597c2248 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -3,12 +3,28 @@ #include <linux/spinlock.h> #include <linux/mutex.h> +#include <linux/atomic.h> /* - * The x86 doesn't have a mmu context, but - * we put the segment information here. + * x86 has arch-specific MMU state beyond what lives in mm_struct. */ typedef struct { + /* + * ctx_id uniquely identifies this mm_struct. A ctx_id will never + * be reused, and zero is not a valid ctx_id. + */ + u64 ctx_id; + + /* + * Any code that needs to do any sort of TLB flushing for this + * mm will first make its changes to the page tables, then + * increment tlb_gen, then flush. This lets the low-level + * flushing code keep track of what needs flushing. + * + * This is not used on Xen PV. + */ + atomic64_t tlb_gen; + #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; #endif @@ -37,6 +53,11 @@ typedef struct { #endif } mm_context_t; +#define INIT_MM_CONTEXT(mm) \ + .context = { \ + .ctx_id = 1, \ + } + void leave_mm(int cpu); #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 7a234be7e298..7ae318c340d9 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -12,6 +12,9 @@ #include <asm/tlbflush.h> #include <asm/paravirt.h> #include <asm/mpx.h> + +extern atomic64_t last_mm_ctx_id; + #ifndef CONFIG_PARAVIRT static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) @@ -125,13 +128,18 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); + int cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, mm_cpumask(mm))) + cpumask_clear_cpu(cpu, mm_cpumask(mm)); } static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); + atomic64_set(&mm->context.tlb_gen, 0); + #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { /* pkey 0 is the default and always allocated */ @@ -290,6 +298,9 @@ static inline unsigned long __get_current_cr3_fast(void) { unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); + if (static_cpu_has(X86_FEATURE_PCID)) + cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid); + /* For now, be very restrictive about when this can be called. */ VM_WARN_ON(in_nmi() || preemptible()); diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index e3b7819caeef..9eb7c718aaf8 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -2,6 +2,15 @@ #define _ASM_X86_MODULE_H #include <asm-generic/module.h> +#include <asm/orc_types.h> + +struct mod_arch_specific { +#ifdef CONFIG_ORC_UNWINDER + unsigned int num_orcs; + int *orc_unwind_ip; + struct orc_entry *orc_unwind; +#endif +}; #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index a0d662be4c5b..7d7404756bb4 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -73,6 +73,9 @@ static inline void mpx_mm_init(struct mm_struct *mm) } void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start, unsigned long end); + +unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, + unsigned long flags); #else static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) { @@ -94,6 +97,12 @@ static inline void mpx_notify_unmap(struct mm_struct *mm, unsigned long start, unsigned long end) { } + +static inline unsigned long mpx_unmapped_area_check(unsigned long addr, + unsigned long len, unsigned long flags) +{ + return addr; +} #endif /* CONFIG_X86_INTEL_MPX */ #endif /* _ASM_X86_MPX_H */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 2b58c8c1eeaa..63cc96f064dc 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -3,6 +3,8 @@ #include <linux/types.h> #include <linux/atomic.h> +#include <linux/nmi.h> +#include <asm/io.h> #include <asm/hyperv.h> /* @@ -28,6 +30,8 @@ struct ms_hyperv_info { u32 features; u32 misc_features; u32 hints; + u32 max_vp_index; + u32 max_lp_index; }; extern struct ms_hyperv_info ms_hyperv; @@ -168,12 +172,155 @@ void hv_remove_crash_handler(void); #if IS_ENABLED(CONFIG_HYPERV) extern struct clocksource *hyperv_cs; +extern void *hv_hypercall_pg; + +static inline u64 hv_do_hypercall(u64 control, void *input, void *output) +{ + u64 input_address = input ? virt_to_phys(input) : 0; + u64 output_address = output ? virt_to_phys(output) : 0; + u64 hv_status; + register void *__sp asm(_ASM_SP); + +#ifdef CONFIG_X86_64 + if (!hv_hypercall_pg) + return U64_MAX; + + __asm__ __volatile__("mov %4, %%r8\n" + "call *%5" + : "=a" (hv_status), "+r" (__sp), + "+c" (control), "+d" (input_address) + : "r" (output_address), "m" (hv_hypercall_pg) + : "cc", "memory", "r8", "r9", "r10", "r11"); +#else + u32 input_address_hi = upper_32_bits(input_address); + u32 input_address_lo = lower_32_bits(input_address); + u32 output_address_hi = upper_32_bits(output_address); + u32 output_address_lo = lower_32_bits(output_address); + + if (!hv_hypercall_pg) + return U64_MAX; + + __asm__ __volatile__("call *%7" + : "=A" (hv_status), + "+c" (input_address_lo), "+r" (__sp) + : "A" (control), + "b" (input_address_hi), + "D"(output_address_hi), "S"(output_address_lo), + "m" (hv_hypercall_pg) + : "cc", "memory"); +#endif /* !x86_64 */ + return hv_status; +} + +#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) +#define HV_HYPERCALL_FAST_BIT BIT(16) +#define HV_HYPERCALL_VARHEAD_OFFSET 17 +#define HV_HYPERCALL_REP_COMP_OFFSET 32 +#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) +#define HV_HYPERCALL_REP_START_OFFSET 48 +#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) + +/* Fast hypercall with 8 bytes of input and no output */ +static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) +{ + u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT; + register void *__sp asm(_ASM_SP); + +#ifdef CONFIG_X86_64 + { + __asm__ __volatile__("call *%4" + : "=a" (hv_status), "+r" (__sp), + "+c" (control), "+d" (input1) + : "m" (hv_hypercall_pg) + : "cc", "r8", "r9", "r10", "r11"); + } +#else + { + u32 input1_hi = upper_32_bits(input1); + u32 input1_lo = lower_32_bits(input1); + + __asm__ __volatile__ ("call *%5" + : "=A"(hv_status), + "+c"(input1_lo), + "+r"(__sp) + : "A" (control), + "b" (input1_hi), + "m" (hv_hypercall_pg) + : "cc", "edi", "esi"); + } +#endif + return hv_status; +} + +/* + * Rep hypercalls. Callers of this functions are supposed to ensure that + * rep_count and varhead_size comply with Hyper-V hypercall definition. + */ +static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, + void *input, void *output) +{ + u64 control = code; + u64 status; + u16 rep_comp; + + control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET; + control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET; + + do { + status = hv_do_hypercall(control, input, output); + if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) + return status; + + /* Bits 32-43 of status have 'Reps completed' data. */ + rep_comp = (status & HV_HYPERCALL_REP_COMP_MASK) >> + HV_HYPERCALL_REP_COMP_OFFSET; + + control &= ~HV_HYPERCALL_REP_START_MASK; + control |= (u64)rep_comp << HV_HYPERCALL_REP_START_OFFSET; + + touch_nmi_watchdog(); + } while (rep_comp < rep_count); + + return status; +} + +/* + * Hypervisor's notion of virtual processor ID is different from + * Linux' notion of CPU ID. This information can only be retrieved + * in the context of the calling CPU. Setup a map for easy access + * to this information. + */ +extern u32 *hv_vp_index; + +/** + * hv_cpu_number_to_vp_number() - Map CPU to VP. + * @cpu_number: CPU number in Linux terms + * + * This function returns the mapping between the Linux processor + * number and the hypervisor's virtual processor number, useful + * in making hypercalls and such that talk about specific + * processors. + * + * Return: Virtual processor number in Hyper-V terms + */ +static inline int hv_cpu_number_to_vp_number(int cpu_number) +{ + return hv_vp_index[cpu_number]; +} void hyperv_init(void); +void hyperv_setup_mmu_ops(void); +void hyper_alloc_mmu(void); void hyperv_report_panic(struct pt_regs *regs); bool hv_is_hypercall_page_setup(void); void hyperv_cleanup(void); -#endif +#else /* CONFIG_HYPERV */ +static inline void hyperv_init(void) {} +static inline bool hv_is_hypercall_page_setup(void) { return false; } +static inline void hyperv_cleanup(void) {} +static inline void hyperv_setup_mmu_ops(void) {} +#endif /* CONFIG_HYPERV */ + #ifdef CONFIG_HYPERV_TSCPAGE struct ms_hyperv_tsc_page *hv_get_tsc_page(void); static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 5573c75f8e4c..17f5c12e1afd 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -356,6 +356,8 @@ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_K8_SYSCFG 0xc0010010 +#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT) #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 diff --git a/arch/x86/include/asm/orc_lookup.h b/arch/x86/include/asm/orc_lookup.h new file mode 100644 index 000000000000..91c8d868424d --- /dev/null +++ b/arch/x86/include/asm/orc_lookup.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ +#ifndef _ORC_LOOKUP_H +#define _ORC_LOOKUP_H + +/* + * This is a lookup table for speeding up access to the .orc_unwind table. + * Given an input address offset, the corresponding lookup table entry + * specifies a subset of the .orc_unwind table to search. + * + * Each block represents the end of the previous range and the start of the + * next range. An extra block is added to give the last range an end. + * + * The block size should be a power of 2 to avoid a costly 'div' instruction. + * + * A block size of 256 was chosen because it roughly doubles unwinder + * performance while only adding ~5% to the ORC data footprint. + */ +#define LOOKUP_BLOCK_ORDER 8 +#define LOOKUP_BLOCK_SIZE (1 << LOOKUP_BLOCK_ORDER) + +#ifndef LINKER_SCRIPT + +extern unsigned int orc_lookup[]; +extern unsigned int orc_lookup_end[]; + +#define LOOKUP_START_IP (unsigned long)_stext +#define LOOKUP_STOP_IP (unsigned long)_etext + +#endif /* LINKER_SCRIPT */ + +#endif /* _ORC_LOOKUP_H */ diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h new file mode 100644 index 000000000000..9c9dc579bd7d --- /dev/null +++ b/arch/x86/include/asm/orc_types.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _ORC_TYPES_H +#define _ORC_TYPES_H + +#include <linux/types.h> +#include <linux/compiler.h> + +/* + * The ORC_REG_* registers are base registers which are used to find other + * registers on the stack. + * + * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the + * address of the previous frame: the caller's SP before it called the current + * function. + * + * ORC_REG_UNDEFINED means the corresponding register's value didn't change in + * the current frame. + * + * The most commonly used base registers are SP and BP -- which the previous SP + * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is + * usually based on. + * + * The rest of the base registers are needed for special cases like entry code + * and GCC realigned stacks. + */ +#define ORC_REG_UNDEFINED 0 +#define ORC_REG_PREV_SP 1 +#define ORC_REG_DX 2 +#define ORC_REG_DI 3 +#define ORC_REG_BP 4 +#define ORC_REG_SP 5 +#define ORC_REG_R10 6 +#define ORC_REG_R13 7 +#define ORC_REG_BP_INDIRECT 8 +#define ORC_REG_SP_INDIRECT 9 +#define ORC_REG_MAX 15 + +/* + * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the + * caller's SP right before it made the call). Used for all callable + * functions, i.e. all C code and all callable asm functions. + * + * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points + * to a fully populated pt_regs from a syscall, interrupt, or exception. + * + * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset + * points to the iret return frame. + * + * The UNWIND_HINT macros are used only for the unwind_hint struct. They + * aren't used in struct orc_entry due to size and complexity constraints. + * Objtool converts them to real types when it converts the hints to orc + * entries. + */ +#define ORC_TYPE_CALL 0 +#define ORC_TYPE_REGS 1 +#define ORC_TYPE_REGS_IRET 2 +#define UNWIND_HINT_TYPE_SAVE 3 +#define UNWIND_HINT_TYPE_RESTORE 4 + +#ifndef __ASSEMBLY__ +/* + * This struct is more or less a vastly simplified version of the DWARF Call + * Frame Information standard. It contains only the necessary parts of DWARF + * CFI, simplified for ease of access by the in-kernel unwinder. It tells the + * unwinder how to find the previous SP and BP (and sometimes entry regs) on + * the stack for a given code address. Each instance of the struct corresponds + * to one or more code locations. + */ +struct orc_entry { + s16 sp_offset; + s16 bp_offset; + unsigned sp_reg:4; + unsigned bp_reg:4; + unsigned type:2; +} __packed; + +/* + * This struct is used by asm and inline asm code to manually annotate the + * location of registers on the stack for the ORC unwinder. + * + * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*. + */ +struct unwind_hint { + u32 ip; + s16 sp_offset; + u8 sp_reg; + u8 type; +}; +#endif /* __ASSEMBLY__ */ + +#endif /* _ORC_TYPES_H */ diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index b4a0d43248cf..b50df06ad251 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -51,6 +51,10 @@ static inline void clear_page(void *page) void copy_page(void *to, void *from); +#ifdef CONFIG_X86_MCE +#define arch_unmap_kpfn arch_unmap_kpfn +#endif + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_VSYSCALL_EMULATION diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 7bd0099384ca..b98ed9d14630 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -3,6 +3,7 @@ #include <linux/const.h> #include <linux/types.h> +#include <linux/mem_encrypt.h> /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 @@ -15,7 +16,7 @@ #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) -#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) +#define __PHYSICAL_MASK ((phys_addr_t)(__sme_clr((1ULL << __PHYSICAL_MASK_SHIFT) - 1))) #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) /* Cast *PAGE_MASK to a signed type so that it is sign-extended if diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 9ccac1926587..c25dd22f7c70 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -960,11 +960,6 @@ extern void default_banner(void); #define GET_CR2_INTO_RAX \ call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2) -#define PARAVIRT_ADJUST_EXCEPTION_FRAME \ - PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \ - CLBR_NONE, \ - call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame)) - #define USERGS_SYSRET64 \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ CLBR_NONE, \ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 9ffc36bfe4cd..6b64fc6367f2 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -196,9 +196,6 @@ struct pv_irq_ops { void (*safe_halt)(void); void (*halt)(void); -#ifdef CONFIG_X86_64 - void (*adjust_exception_frame)(void); -#endif } __no_randomize_layout; struct pv_mmu_ops { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 77037b6f1caa..bbeae4a2bd01 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_PGTABLE_H #define _ASM_X86_PGTABLE_H +#include <linux/mem_encrypt.h> #include <asm/page.h> #include <asm/pgtable_types.h> @@ -13,9 +14,18 @@ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ : (prot)) +/* + * Macros to add or remove encryption attribute + */ +#define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot))) +#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot))) + #ifndef __ASSEMBLY__ #include <asm/x86_init.h> +extern pgd_t early_top_pgt[PTRS_PER_PGD]; +int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); void ptdump_walk_pgd_level_checkwx(void); @@ -38,6 +48,8 @@ extern struct list_head pgd_list; extern struct mm_struct *pgd_page_get_mm(struct page *page); +extern pmdval_t early_pmd_flags; + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else /* !CONFIG_PARAVIRT */ @@ -195,6 +207,11 @@ static inline unsigned long p4d_pfn(p4d_t p4d) return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; } +static inline unsigned long pgd_pfn(pgd_t pgd) +{ + return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + static inline int p4d_large(p4d_t p4d) { /* No 512 GiB pages yet */ @@ -704,8 +721,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pmd_page(pmd) \ - pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT) +#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] @@ -773,8 +789,7 @@ static inline unsigned long pud_page_vaddr(pud_t pud) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pud_page(pud) \ - pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT) +#define pud_page(pud) pfn_to_page(pud_pfn(pud)) /* Find an entry in the second-level page table.. */ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) @@ -824,8 +839,7 @@ static inline unsigned long p4d_page_vaddr(p4d_t p4d) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define p4d_page(p4d) \ - pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT) +#define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d)) /* Find an entry in the third-level page table.. */ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) @@ -859,7 +873,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) +#define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) /* to find an entry in a page-table-directory. */ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index bf9638e1ee42..399261ce904c 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -2,6 +2,8 @@ #define _ASM_X86_PGTABLE_DEFS_H #include <linux/const.h> +#include <linux/mem_encrypt.h> + #include <asm/page_types.h> #define FIRST_USER_ADDRESS 0UL @@ -121,10 +123,10 @@ #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ - _PAGE_ACCESSED | _PAGE_DIRTY) -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ - _PAGE_DIRTY) +#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\ + _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \ + _PAGE_ACCESSED | _PAGE_DIRTY) /* * Set of bits not changed in pte_modify. The pte's @@ -159,6 +161,7 @@ enum page_cache_mode { #define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) +#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP)) #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ @@ -187,22 +190,42 @@ enum page_cache_mode { #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) +#define __PAGE_KERNEL_WP (__PAGE_KERNEL | _PAGE_CACHE_WP) #define __PAGE_KERNEL_IO (__PAGE_KERNEL) #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) -#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) -#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) -#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) -#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) -#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) -#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) +#ifndef __ASSEMBLY__ + +#define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ + _PAGE_DIRTY | _PAGE_ENC) + +#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC) +#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC) + +#define __PAGE_KERNEL_NOENC (__PAGE_KERNEL) +#define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP) + +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) +#define PAGE_KERNEL_NOENC __pgprot(__PAGE_KERNEL) +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_EXEC_NOENC __pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC) +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC) +#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC) + +#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) +#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) -#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) -#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +#endif /* __ASSEMBLY__ */ /* xwr */ #define __P000 PAGE_NONE @@ -287,6 +310,11 @@ static inline p4dval_t native_p4d_val(p4d_t p4d) #else #include <asm-generic/pgtable-nop4d.h> +static inline p4d_t native_make_p4d(pudval_t val) +{ + return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) }; +} + static inline p4dval_t native_p4d_val(p4d_t p4d) { return native_pgd_val(p4d.pgd); diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 79aa2f98398d..dc723b64acf0 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -2,6 +2,7 @@ #define _ASM_X86_PROCESSOR_FLAGS_H #include <uapi/asm/processor-flags.h> +#include <linux/mem_encrypt.h> #ifdef CONFIG_VM86 #define X86_VM_MASK X86_EFLAGS_VM @@ -32,16 +33,18 @@ * CR3_ADDR_MASK is the mask used by read_cr3_pa(). */ #ifdef CONFIG_X86_64 -/* Mask off the address space ID bits. */ -#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull -#define CR3_PCID_MASK 0xFFFull +/* Mask off the address space ID and SME encryption bits. */ +#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) +#define CR3_PCID_MASK 0xFFFull +#define CR3_NOFLUSH BIT_ULL(63) #else /* * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save * a tiny bit of code size by setting all the bits. */ -#define CR3_ADDR_MASK 0xFFFFFFFFull -#define CR3_PCID_MASK 0ull +#define CR3_ADDR_MASK 0xFFFFFFFFull +#define CR3_PCID_MASK 0ull +#define CR3_NOFLUSH 0 #endif #endif /* _ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 028245e1c42b..3fa26a61eabc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -22,6 +22,7 @@ struct vm86; #include <asm/nops.h> #include <asm/special_insns.h> #include <asm/fpu/types.h> +#include <asm/unwind_hints.h> #include <linux/personality.h> #include <linux/cache.h> @@ -29,6 +30,7 @@ struct vm86; #include <linux/math64.h> #include <linux/err.h> #include <linux/irqflags.h> +#include <linux/mem_encrypt.h> /* * We handle most unaligned accesses in hardware. On the other hand @@ -239,9 +241,14 @@ static inline unsigned long read_cr3_pa(void) return __read_cr3() & CR3_ADDR_MASK; } +static inline unsigned long native_read_cr3_pa(void) +{ + return __native_read_cr3() & CR3_ADDR_MASK; +} + static inline void load_cr3(pgd_t *pgdir) { - write_cr3(__pa(pgdir)); + write_cr3(__sme_pa(pgdir)); } #ifdef CONFIG_X86_32 @@ -661,7 +668,7 @@ static inline void sync_core(void) * In case NMI unmasking or performance ever becomes a problem, * the next best option appears to be MOV-to-CR2 and an * unconditional jump. That sequence also works on all CPUs, - * but it will fault at CPL3 (i.e. Xen PV and lguest). + * but it will fault at CPL3 (i.e. Xen PV). * * CPUID is the conventional way, but it's nasty: it doesn't * exist on some 486-like CPUs, and it usually exits to a @@ -684,6 +691,7 @@ static inline void sync_core(void) unsigned int tmp; asm volatile ( + UNWIND_HINT_SAVE "mov %%ss, %0\n\t" "pushq %q0\n\t" "pushq %%rsp\n\t" @@ -693,6 +701,7 @@ static inline void sync_core(void) "pushq %q0\n\t" "pushq $1f\n\t" "iretq\n\t" + UNWIND_HINT_RESTORE "1:" : "=&r" (tmp), "+r" (__sp) : : "cc", "memory"); #endif @@ -802,7 +811,9 @@ static inline void spin_lock_prefetch(const void *x) */ #define IA32_PAGE_OFFSET PAGE_OFFSET #define TASK_SIZE PAGE_OFFSET +#define TASK_SIZE_LOW TASK_SIZE #define TASK_SIZE_MAX TASK_SIZE +#define DEFAULT_MAP_WINDOW TASK_SIZE #define STACK_TOP TASK_SIZE #define STACK_TOP_MAX STACK_TOP @@ -842,7 +853,9 @@ static inline void spin_lock_prefetch(const void *x) * particular problem by preventing anything from being mapped * at the maximum canonical address. */ -#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) +#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) + +#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) /* This decides where the kernel will search for a free chunk of vm * space during mmap's. @@ -850,12 +863,14 @@ static inline void spin_lock_prefetch(const void *x) #define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ 0xc0000000 : 0xFFFFe000) +#define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \ + IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW) #define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \ IA32_PAGE_OFFSET : TASK_SIZE_MAX) #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ IA32_PAGE_OFFSET : TASK_SIZE_MAX) -#define STACK_TOP TASK_SIZE +#define STACK_TOP TASK_SIZE_LOW #define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ @@ -876,7 +891,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, * space during mmap's. */ #define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3)) -#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE) +#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) #define KSTK_EIP(task) (task_pt_regs(task)->ip) diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 8d3964fc5f91..b408b1886195 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -24,6 +24,9 @@ void entry_SYSENTER_compat(void); void __end_entry_SYSENTER_compat(void); void entry_SYSCALL_compat(void); void entry_INT80_compat(void); +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) +void xen_entry_INT80_compat(void); +#endif #endif void x86_configure_nx(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 2b5d686ea9f3..91c04c8e67fa 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -9,6 +9,20 @@ #ifdef __i386__ struct pt_regs { + /* + * NB: 32-bit x86 CPUs are inconsistent as what happens in the + * following cases (where %seg represents a segment register): + * + * - pushl %seg: some do a 16-bit write and leave the high + * bits alone + * - movl %seg, [mem]: some do a 16-bit write despite the movl + * - IDT entry: some (e.g. 486) will leave the high bits of CS + * and (if applicable) SS undefined. + * + * Fortunately, x86-32 doesn't read the high bits on POP or IRET, + * so we can just treat all of the segment registers as 16-bit + * values. + */ unsigned long bx; unsigned long cx; unsigned long dx; @@ -16,16 +30,22 @@ struct pt_regs { unsigned long di; unsigned long bp; unsigned long ax; - unsigned long ds; - unsigned long es; - unsigned long fs; - unsigned long gs; + unsigned short ds; + unsigned short __dsh; + unsigned short es; + unsigned short __esh; + unsigned short fs; + unsigned short __fsh; + unsigned short gs; + unsigned short __gsh; unsigned long orig_ax; unsigned long ip; - unsigned long cs; + unsigned short cs; + unsigned short __csh; unsigned long flags; unsigned long sp; - unsigned long ss; + unsigned short ss; + unsigned short __ssh; }; #else /* __i386__ */ @@ -176,6 +196,17 @@ static inline unsigned long regs_get_register(struct pt_regs *regs, if (offset == offsetof(struct pt_regs, sp) && regs->cs == __KERNEL_CS) return kernel_stack_pointer(regs); + + /* The selector fields are 16-bit. */ + if (offset == offsetof(struct pt_regs, cs) || + offset == offsetof(struct pt_regs, ss) || + offset == offsetof(struct pt_regs, ds) || + offset == offsetof(struct pt_regs, es) || + offset == offsetof(struct pt_regs, fs) || + offset == offsetof(struct pt_regs, gs)) { + return *(u16 *)((unsigned long)regs + offset); + + } #endif return *(unsigned long *)((unsigned long)regs + offset); } diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 230e1903acf0..90d91520c13a 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -1,6 +1,15 @@ #ifndef _ARCH_X86_REALMODE_H #define _ARCH_X86_REALMODE_H +/* + * Flag bit definitions for use with the flags field of the trampoline header + * in the CONFIG_X86_64 variant. + */ +#define TH_FLAGS_SME_ACTIVE_BIT 0 +#define TH_FLAGS_SME_ACTIVE BIT(TH_FLAGS_SME_ACTIVE_BIT) + +#ifndef __ASSEMBLY__ + #include <linux/types.h> #include <asm/io.h> @@ -38,6 +47,7 @@ struct trampoline_header { u64 start; u64 efer; u32 cr4; + u32 flags; #endif }; @@ -69,4 +79,6 @@ static inline size_t real_mode_size_needed(void) void set_real_mode_mem(phys_addr_t mem, size_t size); void reserve_real_mode(void); +#endif /* __ASSEMBLY__ */ + #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h new file mode 100644 index 000000000000..ff871210b9f2 --- /dev/null +++ b/arch/x86/include/asm/refcount.h @@ -0,0 +1,109 @@ +#ifndef __ASM_X86_REFCOUNT_H +#define __ASM_X86_REFCOUNT_H +/* + * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from + * PaX/grsecurity. + */ +#include <linux/refcount.h> + +/* + * This is the first portion of the refcount error handling, which lives in + * .text.unlikely, and is jumped to from the CPU flag check (in the + * following macros). This saves the refcount value location into CX for + * the exception handler to use (in mm/extable.c), and then triggers the + * central refcount exception. The fixup address for the exception points + * back to the regular execution flow in .text. + */ +#define _REFCOUNT_EXCEPTION \ + ".pushsection .text.unlikely\n" \ + "111:\tlea %[counter], %%" _ASM_CX "\n" \ + "112:\t" ASM_UD0 "\n" \ + ASM_UNREACHABLE \ + ".popsection\n" \ + "113:\n" \ + _ASM_EXTABLE_REFCOUNT(112b, 113b) + +/* Trigger refcount exception if refcount result is negative. */ +#define REFCOUNT_CHECK_LT_ZERO \ + "js 111f\n\t" \ + _REFCOUNT_EXCEPTION + +/* Trigger refcount exception if refcount result is zero or negative. */ +#define REFCOUNT_CHECK_LE_ZERO \ + "jz 111f\n\t" \ + REFCOUNT_CHECK_LT_ZERO + +/* Trigger refcount exception unconditionally. */ +#define REFCOUNT_ERROR \ + "jmp 111f\n\t" \ + _REFCOUNT_EXCEPTION + +static __always_inline void refcount_add(unsigned int i, refcount_t *r) +{ + asm volatile(LOCK_PREFIX "addl %1,%0\n\t" + REFCOUNT_CHECK_LT_ZERO + : [counter] "+m" (r->refs.counter) + : "ir" (i) + : "cc", "cx"); +} + +static __always_inline void refcount_inc(refcount_t *r) +{ + asm volatile(LOCK_PREFIX "incl %0\n\t" + REFCOUNT_CHECK_LT_ZERO + : [counter] "+m" (r->refs.counter) + : : "cc", "cx"); +} + +static __always_inline void refcount_dec(refcount_t *r) +{ + asm volatile(LOCK_PREFIX "decl %0\n\t" + REFCOUNT_CHECK_LE_ZERO + : [counter] "+m" (r->refs.counter) + : : "cc", "cx"); +} + +static __always_inline __must_check +bool refcount_sub_and_test(unsigned int i, refcount_t *r) +{ + GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO, + r->refs.counter, "er", i, "%0", e); +} + +static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r) +{ + GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO, + r->refs.counter, "%0", e); +} + +static __always_inline __must_check +bool refcount_add_not_zero(unsigned int i, refcount_t *r) +{ + int c, result; + + c = atomic_read(&(r->refs)); + do { + if (unlikely(c == 0)) + return false; + + result = c + i; + + /* Did we try to increment from/to an undesirable state? */ + if (unlikely(c < 0 || c == INT_MAX || result < c)) { + asm volatile(REFCOUNT_ERROR + : : [counter] "m" (r->refs.counter) + : "cc", "cx"); + break; + } + + } while (!atomic_try_cmpxchg(&(r->refs), &c, result)); + + return c != 0; +} + +static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r) +{ + return refcount_add_not_zero(1, r); +} + +#endif diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 661dd305694a..045f99211a99 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -1,45 +1,56 @@ #ifndef _ASM_X86_RMWcc #define _ASM_X86_RMWcc +#define __CLOBBERS_MEM "memory" +#define __CLOBBERS_MEM_CC_CX "memory", "cc", "cx" + #if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO) /* Use asm goto */ -#define __GEN_RMWcc(fullop, var, cc, ...) \ +#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ do { \ asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ - : : "m" (var), ## __VA_ARGS__ \ - : "memory" : cc_label); \ + : : [counter] "m" (var), ## __VA_ARGS__ \ + : clobbers : cc_label); \ return 0; \ cc_label: \ return 1; \ } while (0) -#define GEN_UNARY_RMWcc(op, var, arg0, cc) \ - __GEN_RMWcc(op " " arg0, var, cc) +#define __BINARY_RMWcc_ARG " %1, " -#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ - __GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val)) #else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ /* Use flags output or a set instruction */ -#define __GEN_RMWcc(fullop, var, cc, ...) \ +#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ do { \ bool c; \ asm volatile (fullop ";" CC_SET(cc) \ - : "+m" (var), CC_OUT(cc) (c) \ - : __VA_ARGS__ : "memory"); \ + : [counter] "+m" (var), CC_OUT(cc) (c) \ + : __VA_ARGS__ : clobbers); \ return c; \ } while (0) +#define __BINARY_RMWcc_ARG " %2, " + +#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ + #define GEN_UNARY_RMWcc(op, var, arg0, cc) \ - __GEN_RMWcc(op " " arg0, var, cc) + __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM) + +#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc) \ + __GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc, \ + __CLOBBERS_MEM_CC_CX) #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ - __GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val)) + __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc, \ + __CLOBBERS_MEM, vcon (val)) -#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ +#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc) \ + __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc, \ + __CLOBBERS_MEM_CC_CX, vcon (val)) #endif /* _ASM_X86_RMWcc */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 1549caa098f0..066aaf813141 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -238,9 +238,7 @@ #ifndef __ASSEMBLY__ extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; -#ifdef CONFIG_TRACING -# define trace_early_idt_handler_array early_idt_handler_array -#endif +extern void early_ignore_irq(void); /* * Load a segment. Fall back on loading the zero segment if something goes diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index eaec6c364e42..cd71273ec49d 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -11,6 +11,7 @@ * Executability : eXeutable, NoteXecutable * Read/Write : ReadOnly, ReadWrite * Presence : NotPresent + * Encryption : Encrypted, Decrypted * * Within a category, the attributes are mutually exclusive. * @@ -42,6 +43,8 @@ int set_memory_wt(unsigned long addr, int numpages); int set_memory_wb(unsigned long addr, int numpages); int set_memory_np(unsigned long addr, int numpages); int set_memory_4k(unsigned long addr, int numpages); +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); int set_memory_array_uc(unsigned long *addr, int addrinarray); int set_memory_array_wc(unsigned long *addr, int addrinarray); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index e4585a393965..a65cf544686a 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -39,6 +39,7 @@ static inline void vsmp_init(void) { } #endif void setup_bios_corruption_check(void); +void early_platform_quirks(void); extern unsigned long saved_video_mode; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e00e1bd6e7b3..5161da1a0fa0 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -98,6 +98,7 @@ struct thread_info { #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ #define TIF_X32 30 /* 32-bit native x86-64 binary */ +#define TIF_FSCHECK 31 /* Check FS is USER_DS on return */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -122,6 +123,7 @@ struct thread_info { #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_ADDR32 (1 << TIF_ADDR32) #define _TIF_X32 (1 << TIF_X32) +#define _TIF_FSCHECK (1 << TIF_FSCHECK) /* * work to do in syscall_trace_enter(). Also includes TIF_NOHZ for @@ -137,7 +139,8 @@ struct thread_info { (_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \ _TIF_NEED_RESCHED | _TIF_SINGLESTEP | _TIF_SYSCALL_EMU | \ _TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE | \ - _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT) + _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_FSCHECK) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index c7797307fc2b..79a4ca6a9606 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -15,4 +15,18 @@ #include <asm-generic/tlb.h> +/* + * While x86 architecture in general requires an IPI to perform TLB + * shootdown, enablement code for several hypervisors overrides + * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing + * a hypercall. To keep software pagetable walkers safe in this case we + * switch to RCU based table free (HAVE_RCU_TABLE_FREE). See the comment + * below 'ifdef CONFIG_HAVE_RCU_TABLE_FREE' in include/asm-generic/tlb.h + * for more details. + */ +static inline void __tlb_remove_table(void *table) +{ + free_page_and_swap_cache(table); +} + #endif /* _ASM_X86_TLB_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 50ea3482e1d1..4893abf7f74f 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -57,6 +57,23 @@ static inline void invpcid_flush_all_nonglobals(void) __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); } +static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) +{ + u64 new_tlb_gen; + + /* + * Bump the generation count. This also serves as a full barrier + * that synchronizes with switch_mm(): callers are required to order + * their read of mm_cpumask after their writes to the paging + * structures. + */ + smp_mb__before_atomic(); + new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); + smp_mb__after_atomic(); + + return new_tlb_gen; +} + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else @@ -65,6 +82,17 @@ static inline void invpcid_flush_all_nonglobals(void) #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +/* + * 6 because 6 should be plenty and struct tlb_state will fit in + * two cache lines. + */ +#define TLB_NR_DYN_ASIDS 6 + +struct tlb_context { + u64 ctx_id; + u64 tlb_gen; +}; + struct tlb_state { /* * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts @@ -73,13 +101,35 @@ struct tlb_state { * mode even if we've already switched back to swapper_pg_dir. */ struct mm_struct *loaded_mm; - int state; + u16 loaded_mm_asid; + u16 next_asid; /* * Access to this CR4 shadow and to H/W CR4 is protected by * disabling interrupts when modifying either one. */ unsigned long cr4; + + /* + * This is a list of all contexts that might exist in the TLB. + * There is one per ASID that we use, and the ASID (what the + * CPU calls PCID) is the index into ctxts. + * + * For each context, ctx_id indicates which mm the TLB's user + * entries came from. As an invariant, the TLB will never + * contain entries that are out-of-date as when that mm reached + * the tlb_gen in the list. + * + * To be clear, this means that it's legal for the TLB code to + * flush the TLB without updating tlb_gen. This can happen + * (for now, at least) due to paravirt remote flushes. + * + * NB: context 0 is a bit special, since it's also used by + * various bits of init code. This is fine -- code that + * isn't aware of PCID will end up harmlessly flushing + * context 0. + */ + struct tlb_context ctxs[TLB_NR_DYN_ASIDS]; }; DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); @@ -148,6 +198,8 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) cr4_set_bits(mask); } +extern void initialize_tlbstate_and_flush(void); + static inline void __native_flush_tlb(void) { /* @@ -207,6 +259,14 @@ static inline void __flush_tlb_all(void) __flush_tlb_global(); else __flush_tlb(); + + /* + * Note: if we somehow had PCID but not PGE, then this wouldn't work -- + * we'd end up flushing kernel translations for the current ASID but + * we might fail to flush kernel translations for other cached ASIDs. + * + * To avoid this issue, we force PCID off if PGE is off. + */ } static inline void __flush_tlb_one(unsigned long addr) @@ -231,9 +291,26 @@ static inline void __flush_tlb_one(unsigned long addr) * and page-granular flushes are available only on i486 and up. */ struct flush_tlb_info { - struct mm_struct *mm; - unsigned long start; - unsigned long end; + /* + * We support several kinds of flushes. + * + * - Fully flush a single mm. .mm will be set, .end will be + * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to + * which the IPI sender is trying to catch us up. + * + * - Partially flush a single mm. .mm will be set, .start and + * .end will indicate the range, and .new_tlb_gen will be set + * such that the changes between generation .new_tlb_gen-1 and + * .new_tlb_gen are entirely contained in the indicated range. + * + * - Fully flush all mms whose tlb_gens have been updated. .mm + * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen + * will be zero. + */ + struct mm_struct *mm; + unsigned long start; + unsigned long end; + u64 new_tlb_gen; }; #define local_flush_tlb() __flush_tlb() @@ -256,12 +333,10 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info); -#define TLBSTATE_OK 1 -#define TLBSTATE_LAZY 2 - static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, struct mm_struct *mm) { + inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); } diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 6358a85e2270..c1d2a9892352 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -75,12 +75,6 @@ static inline const struct cpumask *cpumask_of_node(int node) extern void setup_node_to_cpumask_map(void); -/* - * Returns the number of the node containing Node 'node'. This - * architecture is flat, so it is a pretty simple function! - */ -#define parent_node(node) (node) - #define pcibus_to_node(bus) __pcibus_to_node(bus) extern int __node_distance(int, int); diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h new file mode 100644 index 000000000000..57c8da027d99 --- /dev/null +++ b/arch/x86/include/asm/trace/common.h @@ -0,0 +1,16 @@ +#ifndef _ASM_TRACE_COMMON_H +#define _ASM_TRACE_COMMON_H + +#ifdef CONFIG_TRACING +DECLARE_STATIC_KEY_FALSE(trace_pagefault_key); +#define trace_pagefault_enabled() \ + static_branch_unlikely(&trace_pagefault_key) +DECLARE_STATIC_KEY_FALSE(trace_resched_ipi_key); +#define trace_resched_ipi_enabled() \ + static_branch_unlikely(&trace_resched_ipi_key) +#else +static inline bool trace_pagefault_enabled(void) { return false; } +static inline bool trace_resched_ipi_enabled(void) { return false; } +#endif + +#endif diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h index 2422b14c50a7..5665bf205b8d 100644 --- a/arch/x86/include/asm/trace/exceptions.h +++ b/arch/x86/include/asm/trace/exceptions.h @@ -5,9 +5,10 @@ #define _TRACE_PAGE_FAULT_H #include <linux/tracepoint.h> +#include <asm/trace/common.h> -extern int trace_irq_vector_regfunc(void); -extern void trace_irq_vector_unregfunc(void); +extern int trace_pagefault_reg(void); +extern void trace_pagefault_unreg(void); DECLARE_EVENT_CLASS(x86_exceptions, @@ -37,8 +38,7 @@ DEFINE_EVENT_FN(x86_exceptions, name, \ TP_PROTO(unsigned long address, struct pt_regs *regs, \ unsigned long error_code), \ TP_ARGS(address, regs, error_code), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); + trace_pagefault_reg, trace_pagefault_unreg); DEFINE_PAGE_FAULT_EVENT(page_fault_user); DEFINE_PAGE_FAULT_EVENT(page_fault_kernel); diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h new file mode 100644 index 000000000000..4253bca99989 --- /dev/null +++ b/arch/x86/include/asm/trace/hyperv.h @@ -0,0 +1,40 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hyperv + +#if !defined(_TRACE_HYPERV_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HYPERV_H + +#include <linux/tracepoint.h> + +#if IS_ENABLED(CONFIG_HYPERV) + +TRACE_EVENT(hyperv_mmu_flush_tlb_others, + TP_PROTO(const struct cpumask *cpus, + const struct flush_tlb_info *info), + TP_ARGS(cpus, info), + TP_STRUCT__entry( + __field(unsigned int, ncpus) + __field(struct mm_struct *, mm) + __field(unsigned long, addr) + __field(unsigned long, end) + ), + TP_fast_assign(__entry->ncpus = cpumask_weight(cpus); + __entry->mm = info->mm; + __entry->addr = info->start; + __entry->end = info->end; + ), + TP_printk("ncpus %d mm %p addr %lx, end %lx", + __entry->ncpus, __entry->mm, + __entry->addr, __entry->end) + ); + +#endif /* CONFIG_HYPERV */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH asm/trace/ +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE hyperv +#endif /* _TRACE_HYPERV_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 32dd6a9e343c..1599d394c8c1 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -5,9 +5,12 @@ #define _TRACE_IRQ_VECTORS_H #include <linux/tracepoint.h> +#include <asm/trace/common.h> -extern int trace_irq_vector_regfunc(void); -extern void trace_irq_vector_unregfunc(void); +#ifdef CONFIG_X86_LOCAL_APIC + +extern int trace_resched_ipi_reg(void); +extern void trace_resched_ipi_unreg(void); DECLARE_EVENT_CLASS(x86_irq_vector, @@ -28,15 +31,22 @@ DECLARE_EVENT_CLASS(x86_irq_vector, #define DEFINE_IRQ_VECTOR_EVENT(name) \ DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \ TP_PROTO(int vector), \ + TP_ARGS(vector), NULL, NULL); \ +DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ + TP_PROTO(int vector), \ + TP_ARGS(vector), NULL, NULL); + +#define DEFINE_RESCHED_IPI_EVENT(name) \ +DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \ + TP_PROTO(int vector), \ TP_ARGS(vector), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); \ + trace_resched_ipi_reg, \ + trace_resched_ipi_unreg); \ DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ TP_PROTO(int vector), \ TP_ARGS(vector), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); - + trace_resched_ipi_reg, \ + trace_resched_ipi_unreg); /* * local_timer - called when entering/exiting a local timer interrupt @@ -45,11 +55,6 @@ DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ DEFINE_IRQ_VECTOR_EVENT(local_timer); /* - * reschedule - called when entering/exiting a reschedule vector handler - */ -DEFINE_IRQ_VECTOR_EVENT(reschedule); - -/* * spurious_apic - called when entering/exiting a spurious apic vector handler */ DEFINE_IRQ_VECTOR_EVENT(spurious_apic); @@ -65,6 +70,7 @@ DEFINE_IRQ_VECTOR_EVENT(error_apic); */ DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi); +#ifdef CONFIG_IRQ_WORK /* * irq_work - called when entering/exiting a irq work interrupt * vector handler @@ -81,6 +87,18 @@ DEFINE_IRQ_VECTOR_EVENT(irq_work); * 4) goto 1 */ TRACE_EVENT_PERF_PERM(irq_work_exit, is_sampling_event(p_event) ? -EPERM : 0); +#endif + +/* + * The ifdef is required because that tracepoint macro hell emits tracepoint + * code in files which include this header even if the tracepoint is not + * enabled. Brilliant stuff that. + */ +#ifdef CONFIG_SMP +/* + * reschedule - called when entering/exiting a reschedule vector handler + */ +DEFINE_RESCHED_IPI_EVENT(reschedule); /* * call_function - called when entering/exiting a call function interrupt @@ -93,24 +111,33 @@ DEFINE_IRQ_VECTOR_EVENT(call_function); * single interrupt vector handler */ DEFINE_IRQ_VECTOR_EVENT(call_function_single); +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD /* * threshold_apic - called when entering/exiting a threshold apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(threshold_apic); +#endif +#ifdef CONFIG_X86_MCE_AMD /* * deferred_error_apic - called when entering/exiting a deferred apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic); +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR /* * thermal_apic - called when entering/exiting a thermal apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(thermal_apic); +#endif + +#endif /* CONFIG_X86_LOCAL_APIC */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 01fd0a7f48cd..5545f6459bf5 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -13,9 +13,6 @@ asmlinkage void divide_error(void); asmlinkage void debug(void); asmlinkage void nmi(void); asmlinkage void int3(void); -asmlinkage void xen_debug(void); -asmlinkage void xen_int3(void); -asmlinkage void xen_stack_segment(void); asmlinkage void overflow(void); asmlinkage void bounds(void); asmlinkage void invalid_op(void); @@ -38,22 +35,29 @@ asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ asmlinkage void simd_coprocessor_error(void); -#ifdef CONFIG_TRACING -asmlinkage void trace_page_fault(void); -#define trace_stack_segment stack_segment -#define trace_divide_error divide_error -#define trace_bounds bounds -#define trace_invalid_op invalid_op -#define trace_device_not_available device_not_available -#define trace_coprocessor_segment_overrun coprocessor_segment_overrun -#define trace_invalid_TSS invalid_TSS -#define trace_segment_not_present segment_not_present -#define trace_general_protection general_protection -#define trace_spurious_interrupt_bug spurious_interrupt_bug -#define trace_coprocessor_error coprocessor_error -#define trace_alignment_check alignment_check -#define trace_simd_coprocessor_error simd_coprocessor_error -#define trace_async_page_fault async_page_fault +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) +asmlinkage void xen_divide_error(void); +asmlinkage void xen_xendebug(void); +asmlinkage void xen_xenint3(void); +asmlinkage void xen_nmi(void); +asmlinkage void xen_overflow(void); +asmlinkage void xen_bounds(void); +asmlinkage void xen_invalid_op(void); +asmlinkage void xen_device_not_available(void); +asmlinkage void xen_double_fault(void); +asmlinkage void xen_coprocessor_segment_overrun(void); +asmlinkage void xen_invalid_TSS(void); +asmlinkage void xen_segment_not_present(void); +asmlinkage void xen_stack_segment(void); +asmlinkage void xen_general_protection(void); +asmlinkage void xen_page_fault(void); +asmlinkage void xen_spurious_interrupt_bug(void); +asmlinkage void xen_coprocessor_error(void); +asmlinkage void xen_alignment_check(void); +#ifdef CONFIG_X86_MCE +asmlinkage void xen_machine_check(void); +#endif /* CONFIG_X86_MCE */ +asmlinkage void xen_simd_coprocessor_error(void); #endif dotraplinkage void do_divide_error(struct pt_regs *, long); @@ -74,14 +78,6 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *); #endif dotraplinkage void do_general_protection(struct pt_regs *, long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); -#ifdef CONFIG_TRACING -dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long); -#else -static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error) -{ - do_page_fault(regs, error); -} -#endif dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); dotraplinkage void do_coprocessor_error(struct pt_regs *, long); dotraplinkage void do_alignment_check(struct pt_regs *, long); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 30269dafec47..184eb9894dae 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -26,7 +26,12 @@ #define get_ds() (KERNEL_DS) #define get_fs() (current->thread.addr_limit) -#define set_fs(x) (current->thread.addr_limit = (x)) +static inline void set_fs(mm_segment_t fs) +{ + current->thread.addr_limit = fs; + /* On user-mode return, check fs is correct */ + set_thread_flag(TIF_FSCHECK); +} #define segment_eq(a, b) ((a).seg == (b).seg) diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index e6676495b125..e9f793e2df7a 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -12,11 +12,14 @@ struct unwind_state { struct task_struct *task; int graph_idx; bool error; -#ifdef CONFIG_FRAME_POINTER +#if defined(CONFIG_ORC_UNWINDER) + bool signal, full_regs; + unsigned long sp, bp, ip; + struct pt_regs *regs; +#elif defined(CONFIG_FRAME_POINTER_UNWINDER) bool got_irq; - unsigned long *bp, *orig_sp; + unsigned long *bp, *orig_sp, ip; struct pt_regs *regs; - unsigned long ip; #else unsigned long *sp; #endif @@ -24,41 +27,30 @@ struct unwind_state { void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame); - bool unwind_next_frame(struct unwind_state *state); - unsigned long unwind_get_return_address(struct unwind_state *state); +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); static inline bool unwind_done(struct unwind_state *state) { return state->stack_info.type == STACK_TYPE_UNKNOWN; } -static inline -void unwind_start(struct unwind_state *state, struct task_struct *task, - struct pt_regs *regs, unsigned long *first_frame) -{ - first_frame = first_frame ? : get_stack_pointer(task, regs); - - __unwind_start(state, task, regs, first_frame); -} - static inline bool unwind_error(struct unwind_state *state) { return state->error; } -#ifdef CONFIG_FRAME_POINTER - static inline -unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +void unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) { - if (unwind_done(state)) - return NULL; + first_frame = first_frame ? : get_stack_pointer(task, regs); - return state->regs ? &state->regs->ip : state->bp + 1; + __unwind_start(state, task, regs, first_frame); } +#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER) static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { if (unwind_done(state)) @@ -66,20 +58,46 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) return state->regs; } - -#else /* !CONFIG_FRAME_POINTER */ - -static inline -unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +#else +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { return NULL; } +#endif -static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +#ifdef CONFIG_ORC_UNWINDER +void unwind_init(void); +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size); +#else +static inline void unwind_init(void) {} +static inline +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size) {} +#endif + +/* + * This disables KASAN checking when reading a value from another task's stack, + * since the other task could be running on another CPU and could have poisoned + * the stack in the meantime. + */ +#define READ_ONCE_TASK_STACK(task, x) \ +({ \ + unsigned long val; \ + if (task == current) \ + val = READ_ONCE(x); \ + else \ + val = READ_ONCE_NOCHECK(x); \ + val; \ +}) + +static inline bool task_on_another_cpu(struct task_struct *task) { - return NULL; +#ifdef CONFIG_SMP + return task != current && task->on_cpu; +#else + return false; +#endif } -#endif /* CONFIG_FRAME_POINTER */ - #endif /* _ASM_X86_UNWIND_H */ diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h new file mode 100644 index 000000000000..bae46fc6b9de --- /dev/null +++ b/arch/x86/include/asm/unwind_hints.h @@ -0,0 +1,105 @@ +#ifndef _ASM_X86_UNWIND_HINTS_H +#define _ASM_X86_UNWIND_HINTS_H + +#include "orc_types.h" + +#ifdef __ASSEMBLY__ + +/* + * In asm, there are two kinds of code: normal C-type callable functions and + * the rest. The normal callable functions can be called by other code, and + * don't do anything unusual with the stack. Such normal callable functions + * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this + * category. In this case, no special debugging annotations are needed because + * objtool can automatically generate the ORC data for the ORC unwinder to read + * at runtime. + * + * Anything which doesn't fall into the above category, such as syscall and + * interrupt handlers, tends to not be called directly by other functions, and + * often does unusual non-C-function-type things with the stack pointer. Such + * code needs to be annotated such that objtool can understand it. The + * following CFI hint macros are for this type of code. + * + * These macros provide hints to objtool about the state of the stack at each + * instruction. Objtool starts from the hints and follows the code flow, + * making automatic CFI adjustments when it sees pushes and pops, filling out + * the debuginfo as necessary. It will also warn if it sees any + * inconsistencies. + */ +.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL +#ifdef CONFIG_STACK_VALIDATION +.Lunwind_hint_ip_\@: + .pushsection .discard.unwind_hints + /* struct unwind_hint */ + .long .Lunwind_hint_ip_\@ - . + .short \sp_offset + .byte \sp_reg + .byte \type + .popsection +#endif +.endm + +.macro UNWIND_HINT_EMPTY + UNWIND_HINT sp_reg=ORC_REG_UNDEFINED +.endm + +.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0 + .if \base == %rsp + .if \indirect + .set sp_reg, ORC_REG_SP_INDIRECT + .else + .set sp_reg, ORC_REG_SP + .endif + .elseif \base == %rbp + .set sp_reg, ORC_REG_BP + .elseif \base == %rdi + .set sp_reg, ORC_REG_DI + .elseif \base == %rdx + .set sp_reg, ORC_REG_DX + .elseif \base == %r10 + .set sp_reg, ORC_REG_R10 + .else + .error "UNWIND_HINT_REGS: bad base register" + .endif + + .set sp_offset, \offset + + .if \iret + .set type, ORC_TYPE_REGS_IRET + .elseif \extra == 0 + .set type, ORC_TYPE_REGS_IRET + .set sp_offset, \offset + (16*8) + .else + .set type, ORC_TYPE_REGS + .endif + + UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type +.endm + +.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 + UNWIND_HINT_REGS base=\base offset=\offset iret=1 +.endm + +.macro UNWIND_HINT_FUNC sp_offset=8 + UNWIND_HINT sp_offset=\sp_offset +.endm + +#else /* !__ASSEMBLY__ */ + +#define UNWIND_HINT(sp_reg, sp_offset, type) \ + "987: \n\t" \ + ".pushsection .discard.unwind_hints\n\t" \ + /* struct unwind_hint */ \ + ".long 987b - .\n\t" \ + ".short " __stringify(sp_offset) "\n\t" \ + ".byte " __stringify(sp_reg) "\n\t" \ + ".byte " __stringify(type) "\n\t" \ + ".popsection\n\t" + +#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE) + +#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE) + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_UNWIND_HINTS_H */ diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h index c4b9dc2f67c5..9f42beefc67a 100644 --- a/arch/x86/include/asm/vga.h +++ b/arch/x86/include/asm/vga.h @@ -7,12 +7,24 @@ #ifndef _ASM_X86_VGA_H #define _ASM_X86_VGA_H +#include <asm/set_memory.h> + /* * On the PC, we can just recalculate addresses and then * access the videoram directly without any black magic. + * To support memory encryption however, we need to access + * the videoram as decrypted memory. */ -#define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x) +#define VGA_MAP_MEM(x, s) \ +({ \ + unsigned long start = (unsigned long)phys_to_virt(x); \ + \ + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) \ + set_memory_decrypted(start, (s) >> PAGE_SHIFT); \ + \ + start; \ +}) #define vga_readb(x) (*(x)) #define vga_writeb(x, y) (*(y) = (x)) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 11071fcd630e..9606688caa4b 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -552,6 +552,8 @@ static inline void MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, struct desc_struct desc) { + u32 *p = (u32 *) &desc; + mcl->op = __HYPERVISOR_update_descriptor; if (sizeof(maddr) == sizeof(long)) { mcl->args[0] = maddr; @@ -559,8 +561,8 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, } else { mcl->args[0] = maddr; mcl->args[1] = maddr >> 32; - mcl->args[2] = desc.a; - mcl->args[3] = desc.b; + mcl->args[2] = *p++; + mcl->args[3] = *p; } trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 2 : 4); diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 8417ef7c3885..07b6531813c4 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -158,9 +158,6 @@ static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn) unsigned long pfn; int ret; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return mfn; - if (unlikely(mfn >= machine_to_phys_nr)) return ~0; @@ -317,8 +314,6 @@ static inline pte_t __pte_ma(pteval_t x) #define p4d_val_ma(x) ((x).p4d) #endif -void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); - xmaddr_t arbitrary_virt_to_machine(void *address); unsigned long arbitrary_virt_to_mfn(void *vaddr); void make_lowmem_page_readonly(void *vaddr); diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index ddef37b16af2..66b8f93333d1 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -201,7 +201,7 @@ struct boot_params { * * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow. - * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest + * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, * which start at asm startup_xen() entry point and later jump to the C * xen_start_kernel() entry point. Both domU and dom0 type of guests are diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 127ddadee1a5..7032f4d8dff3 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -149,6 +149,9 @@ */ #define HV_X64_DEPRECATING_AEOI_RECOMMENDED (1 << 9) +/* Recommend using the newer ExProcessorMasks interface */ +#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11) + /* * HV_VP_SET available */ @@ -242,7 +245,11 @@ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) /* Declare the various hypercall operations. */ +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 #define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 #define HVCALL_POST_MESSAGE 0x005c #define HVCALL_SIGNAL_EVENT 0x005d @@ -259,6 +266,16 @@ #define HV_PROCESSOR_POWER_STATE_C2 2 #define HV_PROCESSOR_POWER_STATE_C3 3 +#define HV_FLUSH_ALL_PROCESSORS BIT(0) +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) + +enum HV_GENERIC_SET_FORMAT { + HV_GENERIC_SET_SPARCE_4K, + HV_GENERIC_SET_ALL, +}; + /* hypercall status code */ #define HV_STATUS_SUCCESS 0 #define HV_STATUS_INVALID_HYPERCALL_CODE 2 diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index 39bca7fac087..3be08f07695c 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -3,9 +3,6 @@ #define MAP_32BIT 0x40 /* only give out 32bit addresses */ -#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) -#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) - #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS /* * Take the 4 protection key bits out of the vma->vm_flags diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a01892bdd61a..fd0a7895b63f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -42,7 +42,7 @@ CFLAGS_irq.o := -I$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o obj-$(CONFIG_COMPAT) += signal_compat.o -obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o +obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o @@ -111,6 +111,7 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o +obj-$(CONFIG_EISA) += eisa.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o @@ -126,11 +127,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o -ifdef CONFIG_FRAME_POINTER -obj-y += unwind_frame.o -else -obj-y += unwind_guess.o -endif +obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o +obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o +obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o ### # 64 bit specific files diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7491e73d9253..f8ae286c1502 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -115,24 +115,24 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = { #define ACPI_INVALID_GSI INT_MIN /* - * This is just a simple wrapper around early_ioremap(), + * This is just a simple wrapper around early_memremap(), * with sanity checks for phys == 0 and size == 0. */ -char *__init __acpi_map_table(unsigned long phys, unsigned long size) +void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size) { if (!phys || !size) return NULL; - return early_ioremap(phys, size); + return early_memremap(phys, size); } -void __init __acpi_unmap_table(char *map, unsigned long size) +void __init __acpi_unmap_table(void __iomem *map, unsigned long size) { if (!map || !size) return; - early_iounmap(map, size); + early_memunmap(map, size); } #ifdef CONFIG_X86_LOCAL_APIC @@ -199,8 +199,10 @@ static int __init acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_madt_local_x2apic *processor = NULL; +#ifdef CONFIG_X86_X2APIC int apic_id; u8 enabled; +#endif processor = (struct acpi_madt_local_x2apic *)header; @@ -209,9 +211,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) acpi_table_print_madt_entry(header); +#ifdef CONFIG_X86_X2APIC apic_id = processor->local_apic_id; enabled = processor->lapic_flags & ACPI_MADT_ENABLED; -#ifdef CONFIG_X86_X2APIC + /* * We need to register disabled CPU as well to permit * counting disabled CPUs. This allows us to size @@ -1083,7 +1086,7 @@ static void __init mp_config_acpi_legacy_irqs(void) mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; #endif set_bit(MP_ISA_BUS, mp_bus_not_pci); - pr_debug("Bus #%d is ISA\n", MP_ISA_BUS); + pr_debug("Bus #%d is ISA (nIRQs: %d)\n", MP_ISA_BUS, nr_legacy_irqs()); /* * Use the default configuration for the IRQs 0-15. Unless diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 32e14d137416..3344d3382e91 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -742,7 +742,16 @@ static void *bp_int3_handler, *bp_int3_addr; int poke_int3_handler(struct pt_regs *regs) { - /* bp_patching_in_progress */ + /* + * Having observed our INT3 instruction, we now must observe + * bp_patching_in_progress. + * + * in_progress = TRUE INT3 + * WMB RMB + * write INT3 if (in_progress) + * + * Idem for bp_int3_handler. + */ smp_rmb(); if (likely(!bp_patching_in_progress)) @@ -788,9 +797,8 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) bp_int3_addr = (u8 *)addr + sizeof(int3); bp_patching_in_progress = true; /* - * Corresponding read barrier in int3 notifier for - * making sure the in_progress flags is correctly ordered wrt. - * patching + * Corresponding read barrier in int3 notifier for making sure the + * in_progress and handler are correctly ordered wrt. patching. */ smp_wmb(); @@ -815,9 +823,11 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) text_poke(addr, opcode, sizeof(int3)); on_each_cpu(do_sync_core, NULL, 1); - + /* + * sync_core() implies an smp_mb() and orders this store against + * the writing of the new instruction. + */ bp_patching_in_progress = false; - smp_wmb(); return addr; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 98b3dd8cf2bf..7834f73efbf1 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -177,8 +177,6 @@ static int disable_apic_timer __initdata; int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); -int first_system_vector = FIRST_SYSTEM_VECTOR; - /* * Debug level, exported for io_apic.c */ @@ -599,9 +597,13 @@ static const struct x86_cpu_id deadline_match[] = { static void apic_check_deadline_errata(void) { - const struct x86_cpu_id *m = x86_match_cpu(deadline_match); + const struct x86_cpu_id *m; u32 rev; + if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return; + + m = x86_match_cpu(deadline_match); if (!m) return; @@ -990,8 +992,7 @@ void setup_secondary_APIC_clock(void) */ static void local_apic_timer_interrupt(void) { - int cpu = smp_processor_id(); - struct clock_event_device *evt = &per_cpu(lapic_events, cpu); + struct clock_event_device *evt = this_cpu_ptr(&lapic_events); /* * Normally we should not be here till LAPIC has been initialized but @@ -1005,7 +1006,8 @@ static void local_apic_timer_interrupt(void) * spurious. */ if (!evt->event_handler) { - pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu); + pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", + smp_processor_id()); /* Switch it off */ lapic_timer_shutdown(evt); return; @@ -1040,25 +1042,6 @@ __visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) * interrupt lock, which is the WrongThing (tm) to do. */ entering_ack_irq(); - local_apic_timer_interrupt(); - exiting_irq(); - - set_irq_regs(old_regs); -} - -__visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - * - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - entering_ack_irq(); trace_local_timer_entry(LOCAL_TIMER_VECTOR); local_apic_timer_interrupt(); trace_local_timer_exit(LOCAL_TIMER_VECTOR); @@ -1920,10 +1903,14 @@ void __init register_lapic_address(unsigned long address) /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -static void __smp_spurious_interrupt(u8 vector) +__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) { + u8 vector = ~regs->orig_ax; u32 v; + entering_irq(); + trace_spurious_apic_entry(vector); + /* * Check if this really is a spurious interrupt and ACK it * if it is a vectored one. Just in case... @@ -1938,22 +1925,7 @@ static void __smp_spurious_interrupt(u8 vector) /* see sw-dev-man vol 3, chapter 7.4.13.5 */ pr_info("spurious APIC interrupt through vector %02x on CPU#%d, " "should never happen.\n", vector, smp_processor_id()); -} -__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) -{ - entering_irq(); - __smp_spurious_interrupt(~regs->orig_ax); - exiting_irq(); -} - -__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) -{ - u8 vector = ~regs->orig_ax; - - entering_irq(); - trace_spurious_apic_entry(vector); - __smp_spurious_interrupt(vector); trace_spurious_apic_exit(vector); exiting_irq(); } @@ -1961,10 +1933,8 @@ __visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) /* * This interrupt should never happen with our APIC/SMP architecture */ -static void __smp_error_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) { - u32 v; - u32 i = 0; static const char * const error_interrupt_reason[] = { "Send CS error", /* APIC Error Bit 0 */ "Receive CS error", /* APIC Error Bit 1 */ @@ -1975,6 +1945,10 @@ static void __smp_error_interrupt(struct pt_regs *regs) "Received illegal vector", /* APIC Error Bit 6 */ "Illegal register address", /* APIC Error Bit 7 */ }; + u32 v, i = 0; + + entering_irq(); + trace_error_apic_entry(ERROR_APIC_VECTOR); /* First tickle the hardware, only then report what went on. -- REW */ if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */ @@ -1996,20 +1970,6 @@ static void __smp_error_interrupt(struct pt_regs *regs) apic_printk(APIC_DEBUG, KERN_CONT "\n"); -} - -__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) -{ - entering_irq(); - __smp_error_interrupt(regs); - exiting_irq(); -} - -__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs) -{ - entering_irq(); - trace_error_apic_entry(ERROR_APIC_VECTOR); - __smp_error_interrupt(regs); trace_error_apic_exit(ERROR_APIC_VECTOR); exiting_irq(); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 237e9c2341c7..70e48aa6af98 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1243,7 +1243,7 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) entry.vector, entry.irr, entry.delivery_status); if (ir_entry->format) printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", - buf, (ir_entry->index << 15) | ir_entry->index, + buf, (ir_entry->index2 << 15) | ir_entry->index, ir_entry->zero); else printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index b3af457ed667..88c214e75a6b 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -166,7 +166,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d, offset = current_offset; next: vector += 16; - if (vector >= first_system_vector) { + if (vector >= FIRST_SYSTEM_VECTOR) { offset = (offset + 1) % 16; vector = FIRST_EXTERNAL_VECTOR + offset; } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 880aa093268d..710edab9e644 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -4,9 +4,6 @@ #include <asm/ucontext.h> -#include <linux/lguest.h> -#include "../../../drivers/lguest/lg.h" - #define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include <asm/syscalls_32.h> @@ -62,23 +59,6 @@ void foo(void) OFFSET(stack_canary_offset, stack_canary, canary); #endif -#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) - BLANK(); - OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); - OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); - - BLANK(); - OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); - OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); - OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); - OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp); - OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc); - OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc); - OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt); - OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum); - OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); - OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); -#endif BLANK(); DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); DEFINE(NR_syscalls, sizeof(syscalls)); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 99332f550c48..cf42206926af 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -20,7 +20,6 @@ static char syscalls_ia32[] = { int main(void) { #ifdef CONFIG_PARAVIRT - OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); BLANK(); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cdf82492b770..e17942c131c8 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o +obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 3b9e220621f8..9862e2cd6d93 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -297,13 +297,29 @@ static int nearby_node(int apicid) } #endif +#ifdef CONFIG_SMP +/* + * Fix up cpu_core_id for pre-F17h systems to be in the + * [0 .. cores_per_node - 1] range. Not really needed but + * kept so as not to break existing setups. + */ +static void legacy_fixup_core_id(struct cpuinfo_x86 *c) +{ + u32 cus_per_node; + + if (c->x86 >= 0x17) + return; + + cus_per_node = c->x86_max_cores / nodes_per_socket; + c->cpu_core_id %= cus_per_node; +} + /* * Fixup core topology information for * (1) AMD multi-node processors * Assumption: Number of cores in each internal node is the same. * (2) AMD processors supporting compute units */ -#ifdef CONFIG_SMP static void amd_get_topology(struct cpuinfo_x86 *c) { u8 node_id; @@ -354,15 +370,9 @@ static void amd_get_topology(struct cpuinfo_x86 *c) } else return; - /* fixup multi-node processor information */ if (nodes_per_socket > 1) { - u32 cus_per_node; - set_cpu_cap(c, X86_FEATURE_AMD_DCM); - cus_per_node = c->x86_max_cores / nodes_per_socket; - - /* core id has to be in the [0 .. cores_per_node - 1] range */ - c->cpu_core_id %= cus_per_node; + legacy_fixup_core_id(c); } } #endif @@ -548,8 +558,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) static void early_init_amd(struct cpuinfo_x86 *c) { + u32 dummy; + early_init_amd_mc(c); + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); + /* * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate * with P/T states and does not stop in deep C-states @@ -612,6 +626,27 @@ static void early_init_amd(struct cpuinfo_x86 *c) */ if (cpu_has_amd_erratum(c, amd_erratum_400)) set_cpu_bug(c, X86_BUG_AMD_E400); + + /* + * BIOS support is required for SME. If BIOS has enabled SME then + * adjust x86_phys_bits by the SME physical address space reduction + * value. If BIOS has not enabled SME then don't advertise the + * feature (set in scattered.c). Also, since the SME support requires + * long mode, don't advertise the feature under CONFIG_X86_32. + */ + if (cpu_has(c, X86_FEATURE_SME)) { + u64 msr; + + /* Check if SME is enabled */ + rdmsrl(MSR_K8_SYSCFG, msr); + if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) { + c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f; + if (IS_ENABLED(CONFIG_X86_32)) + clear_cpu_cap(c, X86_FEATURE_SME); + } else { + clear_cpu_cap(c, X86_FEATURE_SME); + } + } } static void init_amd_k8(struct cpuinfo_x86 *c) @@ -730,8 +765,6 @@ static void init_amd_bd(struct cpuinfo_x86 *c) static void init_amd(struct cpuinfo_x86 *c) { - u32 dummy; - early_init_amd(c); /* @@ -793,8 +826,6 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 > 0x11) set_cpu_cap(c, X86_FEATURE_ARAT); - rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); - /* 3DNow or LM implies PREFETCHW */ if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0af86d9242da..db684880d74a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -21,6 +21,14 @@ void __init check_bugs(void) { +#ifdef CONFIG_X86_32 + /* + * Regardless of whether PCID is enumerated, the SDM says + * that it can't be enabled in 32-bit mode. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); +#endif + identify_boot_cpu(); if (!IS_ENABLED(CONFIG_SMP)) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c8b39870f33e..fb1d3358a4af 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -168,6 +168,24 @@ static int __init x86_mpx_setup(char *s) } __setup("nompx", x86_mpx_setup); +#ifdef CONFIG_X86_64 +static int __init x86_pcid_setup(char *s) +{ + /* require an exact match without trailing characters */ + if (strlen(s)) + return 0; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_PCID)) + return 1; + + setup_clear_cpu_cap(X86_FEATURE_PCID); + pr_info("nopcid: PCID feature disabled\n"); + return 1; +} +__setup("nopcid", x86_pcid_setup); +#endif + static int __init x86_noinvpcid_setup(char *s) { /* noinvpcid doesn't accept parameters */ @@ -311,6 +329,38 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) } } +static void setup_pcid(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_PCID)) { + if (cpu_has(c, X86_FEATURE_PGE)) { + /* + * We'd like to use cr4_set_bits_and_update_boot(), + * but we can't. CR4.PCIDE is special and can only + * be set in long mode, and the early CPU init code + * doesn't know this and would try to restore CR4.PCIDE + * prior to entering long mode. + * + * Instead, we rely on the fact that hotplug, resume, + * etc all fully restore CR4 before they write anything + * that could have nonzero PCID bits to CR3. CR4.PCIDE + * has no effect on the page tables themselves, so we + * don't need it to be restored early. + */ + cr4_set_bits(X86_CR4_PCIDE); + } else { + /* + * flush_tlb_all(), as currently implemented, won't + * work if PCID is on but PGE is not. Since that + * combination doesn't exist on real hardware, there's + * no reason to try to fully support it, but it's + * polite to avoid corrupting data if we're on + * an improperly configured VM. + */ + clear_cpu_cap(c, X86_FEATURE_PCID); + } + } +} + /* * Protection Keys are not available in 32-bit mode. */ @@ -1125,6 +1175,9 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_smep(c); setup_smap(c); + /* Set up PCID */ + setup_pcid(c); + /* * The vendor-specific functions might have changed features. * Now we do "generic changes." @@ -1289,15 +1342,6 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 -struct desc_ptr idt_descr __ro_after_init = { - .size = NR_VECTORS * 16 - 1, - .address = (unsigned long) idt_table, -}; -const struct desc_ptr debug_idt_descr = { - .size = NR_VECTORS * 16 - 1, - .address = (unsigned long) debug_idt_table, -}; - DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE) __visible; @@ -1552,6 +1596,7 @@ void cpu_init(void) mmgrab(&init_mm); me->active_mm = &init_mm; BUG_ON(me->mm); + initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, me); load_sp0(t, ¤t->thread); @@ -1606,6 +1651,7 @@ void cpu_init(void) mmgrab(&init_mm); curr->active_mm = &init_mm; BUG_ON(curr->mm); + initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, curr); load_sp0(t, thread); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index c55fb2cb2acc..24f749324c0f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -811,7 +811,24 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, struct cacheinfo *this_leaf; int i, sibling; - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + /* + * For L3, always use the pre-calculated cpu_llc_shared_mask + * to derive shared_cpu_map. + */ + if (index == 3) { + for_each_cpu(i, cpu_llc_shared_mask(cpu)) { + this_cpu_ci = get_cpu_cacheinfo(i); + if (!this_cpu_ci->info_list) + continue; + this_leaf = this_cpu_ci->info_list + index; + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { + if (!cpu_online(sibling)) + continue; + cpumask_set_cpu(sibling, + &this_leaf->shared_cpu_map); + } + } + } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { unsigned int apicid, nshared, first, last; this_leaf = this_cpu_ci->info_list + index; @@ -839,19 +856,6 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, &this_leaf->shared_cpu_map); } } - } else if (index == 3) { - for_each_cpu(i, cpu_llc_shared_mask(cpu)) { - this_cpu_ci = get_cpu_cacheinfo(i); - if (!this_cpu_ci->info_list) - continue; - this_leaf = this_cpu_ci->info_list + index; - for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { - if (!cpu_online(sibling)) - continue; - cpumask_set_cpu(sibling, - &this_leaf->shared_cpu_map); - } - } } else return 0; diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 5b366462f579..cd5fc61ba450 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -30,7 +30,8 @@ #include <linux/cpuhotplug.h> #include <asm/intel-family.h> -#include <asm/intel_rdt.h> +#include <asm/intel_rdt_sched.h> +#include "intel_rdt.h" #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 @@ -38,7 +39,13 @@ /* Mutex to protect rdtgroup access. */ DEFINE_MUTEX(rdtgroup_mutex); -DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); +/* + * The cached intel_pqr_state is strictly per CPU and can never be + * updated from a remote CPU. Functions which modify the state + * are called with interrupts disabled and no preemption, which + * is sufficient for the protection. + */ +DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); /* * Used to store the max resource name width and max resource data width @@ -46,6 +53,12 @@ DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); */ int max_name_width, max_data_width; +/* + * Global boolean for rdt_alloc which is true if any + * resource allocation is enabled. + */ +bool rdt_alloc_capable; + static void mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); static void @@ -54,7 +67,9 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) struct rdt_resource rdt_resources_all[] = { + [RDT_RESOURCE_L3] = { + .rid = RDT_RESOURCE_L3, .name = "L3", .domains = domain_init(RDT_RESOURCE_L3), .msr_base = IA32_L3_CBM_BASE, @@ -67,8 +82,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L3DATA] = { + .rid = RDT_RESOURCE_L3DATA, .name = "L3DATA", .domains = domain_init(RDT_RESOURCE_L3DATA), .msr_base = IA32_L3_CBM_BASE, @@ -81,8 +99,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L3CODE] = { + .rid = RDT_RESOURCE_L3CODE, .name = "L3CODE", .domains = domain_init(RDT_RESOURCE_L3CODE), .msr_base = IA32_L3_CBM_BASE, @@ -95,8 +116,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L2] = { + .rid = RDT_RESOURCE_L2, .name = "L2", .domains = domain_init(RDT_RESOURCE_L2), .msr_base = IA32_L2_CBM_BASE, @@ -109,8 +133,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_MBA] = { + .rid = RDT_RESOURCE_MBA, .name = "MB", .domains = domain_init(RDT_RESOURCE_MBA), .msr_base = IA32_MBA_THRTL_BASE, @@ -118,6 +145,7 @@ struct rdt_resource rdt_resources_all[] = { .cache_level = 3, .parse_ctrlval = parse_bw, .format_str = "%d=%*d", + .fflags = RFTYPE_RES_MB, }, }; @@ -144,33 +172,28 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) * is always 20 on hsw server parts. The minimum cache bitmask length * allowed for HSW server is always 2 bits. Hardcode all of them. */ -static inline bool cache_alloc_hsw_probe(void) +static inline void cache_alloc_hsw_probe(void) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; - u32 l, h, max_cbm = BIT_MASK(20) - 1; - - if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) - return false; - rdmsr(IA32_L3_CBM_BASE, l, h); + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + u32 l, h, max_cbm = BIT_MASK(20) - 1; - /* If all the bits were set in MSR, return success */ - if (l != max_cbm) - return false; + if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) + return; + rdmsr(IA32_L3_CBM_BASE, l, h); - r->num_closid = 4; - r->default_ctrl = max_cbm; - r->cache.cbm_len = 20; - r->cache.min_cbm_bits = 2; - r->capable = true; - r->enabled = true; + /* If all the bits were set in MSR, return success */ + if (l != max_cbm) + return; - return true; - } + r->num_closid = 4; + r->default_ctrl = max_cbm; + r->cache.cbm_len = 20; + r->cache.shareable_bits = 0xc0000; + r->cache.min_cbm_bits = 2; + r->alloc_capable = true; + r->alloc_enabled = true; - return false; + rdt_alloc_capable = true; } /* @@ -213,15 +236,14 @@ static bool rdt_get_mem_config(struct rdt_resource *r) return false; } r->data_width = 3; - rdt_get_mba_infofile(r); - r->capable = true; - r->enabled = true; + r->alloc_capable = true; + r->alloc_enabled = true; return true; } -static void rdt_get_cache_config(int idx, struct rdt_resource *r) +static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) { union cpuid_0x10_1_eax eax; union cpuid_0x10_x_edx edx; @@ -231,10 +253,10 @@ static void rdt_get_cache_config(int idx, struct rdt_resource *r) r->num_closid = edx.split.cos_max + 1; r->cache.cbm_len = eax.split.cbm_len + 1; r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->cache.shareable_bits = ebx & r->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; - rdt_get_cache_infofile(r); - r->capable = true; - r->enabled = true; + r->alloc_capable = true; + r->alloc_enabled = true; } static void rdt_get_cdp_l3_config(int type) @@ -246,12 +268,12 @@ static void rdt_get_cdp_l3_config(int type) r->cache.cbm_len = r_l3->cache.cbm_len; r->default_ctrl = r_l3->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; - r->capable = true; + r->alloc_capable = true; /* * By default, CDP is disabled. CDP can be enabled by mount parameter * "cdp" during resctrl file system mount time. */ - r->enabled = false; + r->alloc_enabled = false; } static int get_cache_id(int cpu, int level) @@ -300,6 +322,19 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); } +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) +{ + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->cpu_mask)) + return d; + } + + return NULL; +} + void rdt_ctrl_update(void *arg) { struct msr_param *m = arg; @@ -307,12 +342,10 @@ void rdt_ctrl_update(void *arg) int cpu = smp_processor_id(); struct rdt_domain *d; - list_for_each_entry(d, &r->domains, list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - r->msr_update(d, m, r); - return; - } + d = get_domain_from_cpu(cpu, r); + if (d) { + r->msr_update(d, m, r); + return; } pr_warn_once("cpu %d not found in any domain for resource %s\n", cpu, r->name); @@ -326,8 +359,8 @@ void rdt_ctrl_update(void *arg) * caller, return the first domain whose id is bigger than the input id. * The domain list is sorted by id in ascending order. */ -static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, - struct list_head **pos) +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos) { struct rdt_domain *d; struct list_head *l; @@ -377,6 +410,44 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) return 0; } +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +{ + size_t tsize; + + if (is_llc_occupancy_enabled()) { + d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid), + sizeof(unsigned long), + GFP_KERNEL); + if (!d->rmid_busy_llc) + return -ENOMEM; + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); + } + if (is_mbm_total_enabled()) { + tsize = sizeof(*d->mbm_total); + d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_total) { + kfree(d->rmid_busy_llc); + return -ENOMEM; + } + } + if (is_mbm_local_enabled()) { + tsize = sizeof(*d->mbm_local); + d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_local) { + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + return -ENOMEM; + } + } + + if (is_mbm_enabled()) { + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + } + + return 0; +} + /* * domain_add_cpu - Add a cpu to a resource's domain list. * @@ -412,14 +483,26 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; d->id = id; + cpumask_set_cpu(cpu, &d->cpu_mask); - if (domain_setup_ctrlval(r, d)) { + if (r->alloc_capable && domain_setup_ctrlval(r, d)) { + kfree(d); + return; + } + + if (r->mon_capable && domain_setup_mon_state(r, d)) { kfree(d); return; } - cpumask_set_cpu(cpu, &d->cpu_mask); list_add_tail(&d->list, add_pos); + + /* + * If resctrl is mounted, add + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + mkdir_mondata_subdir_allrdtgrp(r, d); } static void domain_remove_cpu(int cpu, struct rdt_resource *r) @@ -435,19 +518,58 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { + /* + * If resctrl is mounted, remove all the + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + rmdir_mondata_subdir_allrdtgrp(r, d->id); kfree(d->ctrl_val); + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); list_del(&d->list); + if (is_mbm_enabled()) + cancel_delayed_work(&d->mbm_over); + if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + /* + * When a package is going down, forcefully + * decrement rmid->ebusy. There is no way to know + * that the L3 was flushed and hence may lead to + * incorrect counts in rare scenarios, but leaving + * the RMID as busy creates RMID leaks if the + * package never comes back. + */ + __check_limbo(d, true); + cancel_delayed_work(&d->cqm_limbo); + } + kfree(d); + return; + } + + if (r == &rdt_resources_all[RDT_RESOURCE_L3]) { + if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d, 0); + } + if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && + has_busy_rmid(r, d)) { + cancel_delayed_work(&d->cqm_limbo); + cqm_setup_limbo_handler(d, 0); + } } } -static void clear_closid(int cpu) +static void clear_closid_rmid(int cpu) { struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - per_cpu(cpu_closid, cpu) = 0; - state->closid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); + state->default_closid = 0; + state->default_rmid = 0; + state->cur_closid = 0; + state->cur_rmid = 0; + wrmsr(IA32_PQR_ASSOC, 0, 0); } static int intel_rdt_online_cpu(unsigned int cpu) @@ -459,12 +581,23 @@ static int intel_rdt_online_cpu(unsigned int cpu) domain_add_cpu(cpu, r); /* The cpu is set in default rdtgroup after online. */ cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); - clear_closid(cpu); + clear_closid_rmid(cpu); mutex_unlock(&rdtgroup_mutex); return 0; } +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) { + break; + } + } +} + static int intel_rdt_offline_cpu(unsigned int cpu) { struct rdtgroup *rdtgrp; @@ -474,10 +607,12 @@ static int intel_rdt_offline_cpu(unsigned int cpu) for_each_capable_rdt_resource(r) domain_remove_cpu(cpu, r); list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); break; + } } - clear_closid(cpu); + clear_closid_rmid(cpu); mutex_unlock(&rdtgroup_mutex); return 0; @@ -492,7 +627,7 @@ static __init void rdt_init_padding(void) struct rdt_resource *r; int cl; - for_each_capable_rdt_resource(r) { + for_each_alloc_capable_rdt_resource(r) { cl = strlen(r->name); if (cl > max_name_width) max_name_width = cl; @@ -502,38 +637,153 @@ static __init void rdt_init_padding(void) } } -static __init bool get_rdt_resources(void) +enum { + RDT_FLAG_CMT, + RDT_FLAG_MBM_TOTAL, + RDT_FLAG_MBM_LOCAL, + RDT_FLAG_L3_CAT, + RDT_FLAG_L3_CDP, + RDT_FLAG_L2_CAT, + RDT_FLAG_MBA, +}; + +#define RDT_OPT(idx, n, f) \ +[idx] = { \ + .name = n, \ + .flag = f \ +} + +struct rdt_options { + char *name; + int flag; + bool force_off, force_on; +}; + +static struct rdt_options rdt_options[] __initdata = { + RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), + RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), + RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), + RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3), + RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3), + RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2), + RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), +}; +#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) + +static int __init set_rdt_options(char *str) +{ + struct rdt_options *o; + bool force_off; + char *tok; + + if (*str == '=') + str++; + while ((tok = strsep(&str, ",")) != NULL) { + force_off = *tok == '!'; + if (force_off) + tok++; + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (strcmp(tok, o->name) == 0) { + if (force_off) + o->force_off = true; + else + o->force_on = true; + break; + } + } + } + return 1; +} +__setup("rdt", set_rdt_options); + +static bool __init rdt_cpu_has(int flag) +{ + bool ret = boot_cpu_has(flag); + struct rdt_options *o; + + if (!ret) + return ret; + + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (flag == o->flag) { + if (o->force_off) + ret = false; + if (o->force_on) + ret = true; + break; + } + } + return ret; +} + +static __init bool get_rdt_alloc_resources(void) { bool ret = false; - if (cache_alloc_hsw_probe()) + if (rdt_alloc_capable) return true; if (!boot_cpu_has(X86_FEATURE_RDT_A)) return false; - if (boot_cpu_has(X86_FEATURE_CAT_L3)) { - rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); - if (boot_cpu_has(X86_FEATURE_CDP_L3)) { + if (rdt_cpu_has(X86_FEATURE_CAT_L3)) { + rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]); + if (rdt_cpu_has(X86_FEATURE_CDP_L3)) { rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); } ret = true; } - if (boot_cpu_has(X86_FEATURE_CAT_L2)) { + if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { /* CPUID 0x10.2 fields are same format at 0x10.1 */ - rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); + rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]); ret = true; } - if (boot_cpu_has(X86_FEATURE_MBA)) { + if (rdt_cpu_has(X86_FEATURE_MBA)) { if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA])) ret = true; } - return ret; } +static __init bool get_rdt_mon_resources(void) +{ + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) + rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + + if (!rdt_mon_features) + return false; + + return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]); +} + +static __init void rdt_quirks(void) +{ + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_HASWELL_X: + if (!rdt_options[RDT_FLAG_L3_CAT].force_off) + cache_alloc_hsw_probe(); + break; + case INTEL_FAM6_SKYLAKE_X: + if (boot_cpu_data.x86_mask <= 4) + set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); + } +} + +static __init bool get_rdt_resources(void) +{ + rdt_quirks(); + rdt_alloc_capable = get_rdt_alloc_resources(); + rdt_mon_capable = get_rdt_mon_resources(); + + return (rdt_mon_capable || rdt_alloc_capable); +} + static int __init intel_rdt_late_init(void) { struct rdt_resource *r; @@ -556,9 +806,12 @@ static int __init intel_rdt_late_init(void) return ret; } - for_each_capable_rdt_resource(r) + for_each_alloc_capable_rdt_resource(r) pr_info("Intel RDT %s allocation detected\n", r->name); + for_each_mon_capable_rdt_resource(r) + pr_info("Intel RDT %s monitoring detected\n", r->name); + return 0; } diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h new file mode 100644 index 000000000000..ebaddaeef023 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -0,0 +1,440 @@ +#ifndef _ASM_X86_INTEL_RDT_H +#define _ASM_X86_INTEL_RDT_H + +#include <linux/sched.h> +#include <linux/kernfs.h> +#include <linux/jump_label.h> + +#define IA32_L3_QOS_CFG 0xc81 +#define IA32_L3_CBM_BASE 0xc90 +#define IA32_L2_CBM_BASE 0xd10 +#define IA32_MBA_THRTL_BASE 0xd50 + +#define L3_QOS_CDP_ENABLE 0x01ULL + +/* + * Event IDs are used to program IA32_QM_EVTSEL before reading event + * counter from IA32_QM_CTR + */ +#define QOS_L3_OCCUP_EVENT_ID 0x01 +#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02 +#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 + +#define CQM_LIMBOCHECK_INTERVAL 1000 + +#define MBM_CNTR_WIDTH 24 +#define MBM_OVERFLOW_INTERVAL 1000 + +#define RMID_VAL_ERROR BIT_ULL(63) +#define RMID_VAL_UNAVAIL BIT_ULL(62) + +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); + +/** + * struct mon_evt - Entry in the event list of a resource + * @evtid: event id + * @name: name of the event + */ +struct mon_evt { + u32 evtid; + char *name; + struct list_head list; +}; + +/** + * struct mon_data_bits - Monitoring details for each event file + * @rid: Resource id associated with the event file. + * @evtid: Event id associated with the event file + * @domid: The domain to which the event file belongs + */ +union mon_data_bits { + void *priv; + struct { + unsigned int rid : 10; + unsigned int evtid : 8; + unsigned int domid : 14; + } u; +}; + +struct rmid_read { + struct rdtgroup *rgrp; + struct rdt_domain *d; + int evtid; + bool first; + u64 val; +}; + +extern unsigned int intel_cqm_threshold; +extern bool rdt_alloc_capable; +extern bool rdt_mon_capable; +extern unsigned int rdt_mon_features; + +enum rdt_group_type { + RDTCTRL_GROUP = 0, + RDTMON_GROUP, + RDT_NUM_GROUP, +}; + +/** + * struct mongroup - store mon group's data in resctrl fs. + * @mon_data_kn kernlfs node for the mon_data directory + * @parent: parent rdtgrp + * @crdtgrp_list: child rdtgroup node list + * @rmid: rmid for this rdtgroup + */ +struct mongroup { + struct kernfs_node *mon_data_kn; + struct rdtgroup *parent; + struct list_head crdtgrp_list; + u32 rmid; +}; + +/** + * struct rdtgroup - store rdtgroup's data in resctrl file system. + * @kn: kernfs node + * @rdtgroup_list: linked list for all rdtgroups + * @closid: closid for this rdtgroup + * @cpu_mask: CPUs assigned to this rdtgroup + * @flags: status bits + * @waitcount: how many cpus expect to find this + * group when they acquire rdtgroup_mutex + * @type: indicates type of this rdtgroup - either + * monitor only or ctrl_mon group + * @mon: mongroup related data + */ +struct rdtgroup { + struct kernfs_node *kn; + struct list_head rdtgroup_list; + u32 closid; + struct cpumask cpu_mask; + int flags; + atomic_t waitcount; + enum rdt_group_type type; + struct mongroup mon; +}; + +/* rdtgroup.flags */ +#define RDT_DELETED 1 + +/* rftype.flags */ +#define RFTYPE_FLAGS_CPUS_LIST 1 + +/* + * Define the file type flags for base and info directories. + */ +#define RFTYPE_INFO BIT(0) +#define RFTYPE_BASE BIT(1) +#define RF_CTRLSHIFT 4 +#define RF_MONSHIFT 5 +#define RFTYPE_CTRL BIT(RF_CTRLSHIFT) +#define RFTYPE_MON BIT(RF_MONSHIFT) +#define RFTYPE_RES_CACHE BIT(8) +#define RFTYPE_RES_MB BIT(9) +#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) +#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) +#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) + +/* List of all resource groups */ +extern struct list_head rdt_all_groups; + +extern int max_name_width, max_data_width; + +int __init rdtgroup_init(void); + +/** + * struct rftype - describe each file in the resctrl file system + * @name: File name + * @mode: Access mode + * @kf_ops: File operations + * @flags: File specific RFTYPE_FLAGS_* flags + * @fflags: File specific RF_* or RFTYPE_* flags + * @seq_show: Show content of the file + * @write: Write to the file + */ +struct rftype { + char *name; + umode_t mode; + struct kernfs_ops *kf_ops; + unsigned long flags; + unsigned long fflags; + + int (*seq_show)(struct kernfs_open_file *of, + struct seq_file *sf, void *v); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. + */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +}; + +/** + * struct mbm_state - status for each MBM counter in each domain + * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) + * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it + */ +struct mbm_state { + u64 chunks; + u64 prev_msr; +}; + +/** + * struct rdt_domain - group of cpus sharing an RDT resource + * @list: all instances of this resource + * @id: unique id for this instance + * @cpu_mask: which cpus share this resource + * @rmid_busy_llc: + * bitmap of which limbo RMIDs are above threshold + * @mbm_total: saved state for MBM total bandwidth + * @mbm_local: saved state for MBM local bandwidth + * @mbm_over: worker to periodically read MBM h/w counters + * @cqm_limbo: worker to periodically read CQM h/w counters + * @mbm_work_cpu: + * worker cpu for MBM h/w counters + * @cqm_work_cpu: + * worker cpu for CQM h/w counters + * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) + * @new_ctrl: new ctrl value to be loaded + * @have_new_ctrl: did user provide new_ctrl for this domain + */ +struct rdt_domain { + struct list_head list; + int id; + struct cpumask cpu_mask; + unsigned long *rmid_busy_llc; + struct mbm_state *mbm_total; + struct mbm_state *mbm_local; + struct delayed_work mbm_over; + struct delayed_work cqm_limbo; + int mbm_work_cpu; + int cqm_work_cpu; + u32 *ctrl_val; + u32 new_ctrl; + bool have_new_ctrl; +}; + +/** + * struct msr_param - set a range of MSRs from a domain + * @res: The resource to use + * @low: Beginning index from base MSR + * @high: End index + */ +struct msr_param { + struct rdt_resource *res; + int low; + int high; +}; + +/** + * struct rdt_cache - Cache allocation related data + * @cbm_len: Length of the cache bit mask + * @min_cbm_bits: Minimum number of consecutive bits to be set + * @cbm_idx_mult: Multiplier of CBM index + * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: + * closid * cbm_idx_multi + cbm_idx_offset + * in a cache bit mask + * @shareable_bits: Bitmask of shareable resource with other + * executing entities + */ +struct rdt_cache { + unsigned int cbm_len; + unsigned int min_cbm_bits; + unsigned int cbm_idx_mult; + unsigned int cbm_idx_offset; + unsigned int shareable_bits; +}; + +/** + * struct rdt_membw - Memory bandwidth allocation related data + * @max_delay: Max throttle delay. Delay is the hardware + * representation for memory bandwidth. + * @min_bw: Minimum memory bandwidth percentage user can request + * @bw_gran: Granularity at which the memory bandwidth is allocated + * @delay_linear: True if memory B/W delay is in linear scale + * @mb_map: Mapping of memory B/W percentage to memory B/W delay + */ +struct rdt_membw { + u32 max_delay; + u32 min_bw; + u32 bw_gran; + u32 delay_linear; + u32 *mb_map; +}; + +static inline bool is_llc_occupancy_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); +} + +static inline bool is_mbm_total_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); +} + +static inline bool is_mbm_local_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); +} + +static inline bool is_mbm_enabled(void) +{ + return (is_mbm_total_enabled() || is_mbm_local_enabled()); +} + +static inline bool is_mbm_event(int e) +{ + return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && + e <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +/** + * struct rdt_resource - attributes of an RDT resource + * @rid: The index of the resource + * @alloc_enabled: Is allocation enabled on this machine + * @mon_enabled: Is monitoring enabled for this feature + * @alloc_capable: Is allocation available on this machine + * @mon_capable: Is monitor feature available on this machine + * @name: Name to use in "schemata" file + * @num_closid: Number of CLOSIDs available + * @cache_level: Which cache level defines scope of this resource + * @default_ctrl: Specifies default cache cbm or memory B/W percent. + * @msr_base: Base MSR address for CBMs + * @msr_update: Function pointer to update QOS MSRs + * @data_width: Character width of data when displaying + * @domains: All domains for this resource + * @cache: Cache allocation related data + * @format_str: Per resource format string to show domain value + * @parse_ctrlval: Per resource function pointer to parse control values + * @evt_list: List of monitoring events + * @num_rmid: Number of RMIDs available + * @mon_scale: cqm counter * mon_scale = occupancy in bytes + * @fflags: flags to choose base and info files + */ +struct rdt_resource { + int rid; + bool alloc_enabled; + bool mon_enabled; + bool alloc_capable; + bool mon_capable; + char *name; + int num_closid; + int cache_level; + u32 default_ctrl; + unsigned int msr_base; + void (*msr_update) (struct rdt_domain *d, struct msr_param *m, + struct rdt_resource *r); + int data_width; + struct list_head domains; + struct rdt_cache cache; + struct rdt_membw membw; + const char *format_str; + int (*parse_ctrlval) (char *buf, struct rdt_resource *r, + struct rdt_domain *d); + struct list_head evt_list; + int num_rmid; + unsigned int mon_scale; + unsigned long fflags; +}; + +int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); +int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); + +extern struct mutex rdtgroup_mutex; + +extern struct rdt_resource rdt_resources_all[]; +extern struct rdtgroup rdtgroup_default; +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); + +int __init rdtgroup_init(void); + +enum { + RDT_RESOURCE_L3, + RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE, + RDT_RESOURCE_L2, + RDT_RESOURCE_MBA, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + +#define for_each_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_capable || r->mon_capable) + +#define for_each_alloc_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_capable) + +#define for_each_mon_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->mon_capable) + +#define for_each_alloc_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_enabled) + +#define for_each_mon_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->mon_enabled) + +/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ +union cpuid_0x10_1_eax { + struct { + unsigned int cbm_len:5; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ +union cpuid_0x10_3_eax { + struct { + unsigned int max_delay:12; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID).EDX */ +union cpuid_0x10_x_edx { + struct { + unsigned int cos_max:16; + } split; + unsigned int full; +}; + +void rdt_ctrl_update(void *arg); +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); +void rdtgroup_kn_unlock(struct kernfs_node *kn); +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos); +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); +int alloc_rmid(void); +void free_rmid(u32 rmid); +int rdt_get_mon_l3_config(struct rdt_resource *r); +void mon_event_count(void *info); +int rdtgroup_mondata_show(struct seq_file *m, void *arg); +void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + unsigned int dom_id); +void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d); +void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, + struct rdtgroup *rdtgrp, int evtid, int first); +void mbm_setup_overflow_handler(struct rdt_domain *dom, + unsigned long delay_ms); +void mbm_handle_overflow(struct work_struct *work); +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); +void cqm_handle_limbo(struct work_struct *work); +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); +void __check_limbo(struct rdt_domain *d, bool force_free); + +#endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 406d7a6532f9..f6ea94f8954a 100644 --- a/arch/x86/kernel/cpu/intel_rdt_schemata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -26,7 +26,7 @@ #include <linux/kernfs.h> #include <linux/seq_file.h> #include <linux/slab.h> -#include <asm/intel_rdt.h> +#include "intel_rdt.h" /* * Check whether MBA bandwidth percentage value is correct. The value is @@ -192,7 +192,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, int closid) { struct rdt_resource *r; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { if (!strcmp(resname, r->name) && closid < r->num_closid) return parse_line(tok, r); } @@ -221,7 +221,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, closid = rdtgrp->closid; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { list_for_each_entry(dom, &r->domains, list) dom->have_new_ctrl = false; } @@ -237,7 +237,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, goto out; } - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { ret = update_domains(r, closid); if (ret) goto out; @@ -269,12 +269,13 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, { struct rdtgroup *rdtgrp; struct rdt_resource *r; - int closid, ret = 0; + int ret = 0; + u32 closid; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (rdtgrp) { closid = rdtgrp->closid; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { if (closid < r->num_closid) show_doms(s, r, closid); } @@ -284,3 +285,57 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, rdtgroup_kn_unlock(of->kn); return ret; } + +void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, + struct rdtgroup *rdtgrp, int evtid, int first) +{ + /* + * setup the parameters to send to the IPI to read the data. + */ + rr->rgrp = rdtgrp; + rr->evtid = evtid; + rr->d = d; + rr->val = 0; + rr->first = first; + + smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); +} + +int rdtgroup_mondata_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + u32 resid, evtid, domid; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + union mon_data_bits md; + struct rdt_domain *d; + struct rmid_read rr; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + md.priv = of->kn->priv; + resid = md.u.rid; + domid = md.u.domid; + evtid = md.u.evtid; + + r = &rdt_resources_all[resid]; + d = rdt_find_domain(r, domid, NULL); + if (!d) { + ret = -ENOENT; + goto out; + } + + mon_event_read(&rr, d, rdtgrp, evtid, false); + + if (rr.val & RMID_VAL_ERROR) + seq_puts(m, "Error\n"); + else if (rr.val & RMID_VAL_UNAVAIL) + seq_puts(m, "Unavailable\n"); + else + seq_printf(m, "%llu\n", rr.val * r->mon_scale); + +out: + rdtgroup_kn_unlock(of->kn); + return ret; +} diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c new file mode 100644 index 000000000000..30827510094b --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -0,0 +1,499 @@ +/* + * Resource Director Technology(RDT) + * - Monitoring code + * + * Copyright (C) 2017 Intel Corporation + * + * Author: + * Vikas Shivappa <vikas.shivappa@intel.com> + * + * This replaces the cqm.c based on perf but we reuse a lot of + * code and datastructures originally from Peter Zijlstra and Matt Fleming. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <asm/cpu_device_id.h> +#include "intel_rdt.h" + +#define MSR_IA32_QM_CTR 0x0c8e +#define MSR_IA32_QM_EVTSEL 0x0c8d + +struct rmid_entry { + u32 rmid; + int busy; + struct list_head list; +}; + +/** + * @rmid_free_lru A least recently used list of free RMIDs + * These RMIDs are guaranteed to have an occupancy less than the + * threshold occupancy + */ +static LIST_HEAD(rmid_free_lru); + +/** + * @rmid_limbo_count count of currently unused but (potentially) + * dirty RMIDs. + * This counts RMIDs that no one is currently using but that + * may have a occupancy value > intel_cqm_threshold. User can change + * the threshold occupancy value. + */ +unsigned int rmid_limbo_count; + +/** + * @rmid_entry - The entry in the limbo and free lists. + */ +static struct rmid_entry *rmid_ptrs; + +/* + * Global boolean for rdt_monitor which is true if any + * resource monitoring is enabled. + */ +bool rdt_mon_capable; + +/* + * Global to indicate which monitoring events are enabled. + */ +unsigned int rdt_mon_features; + +/* + * This is the threshold cache occupancy at which we will consider an + * RMID available for re-allocation. + */ +unsigned int intel_cqm_threshold; + +static inline struct rmid_entry *__rmid_entry(u32 rmid) +{ + struct rmid_entry *entry; + + entry = &rmid_ptrs[rmid]; + WARN_ON(entry->rmid != rmid); + + return entry; +} + +static u64 __rmid_read(u32 rmid, u32 eventid) +{ + u64 val; + + /* + * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured + * with a valid event code for supported resource type and the bits + * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, + * IA32_QM_CTR.data (bits 61:0) reports the monitored data. + * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) + * are error bits. + */ + wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); + rdmsrl(MSR_IA32_QM_CTR, val); + + return val; +} + +static bool rmid_dirty(struct rmid_entry *entry) +{ + u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + + return val >= intel_cqm_threshold; +} + +/* + * Check the RMIDs that are marked as busy for this domain. If the + * reported LLC occupancy is below the threshold clear the busy bit and + * decrement the count. If the busy count gets to zero on an RMID, we + * free the RMID + */ +void __check_limbo(struct rdt_domain *d, bool force_free) +{ + struct rmid_entry *entry; + struct rdt_resource *r; + u32 crmid = 1, nrmid; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + /* + * Skip RMID 0 and start from RMID 1 and check all the RMIDs that + * are marked as busy for occupancy < threshold. If the occupancy + * is less than the threshold decrement the busy counter of the + * RMID and move it to the free list when the counter reaches 0. + */ + for (;;) { + nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); + if (nrmid >= r->num_rmid) + break; + + entry = __rmid_entry(nrmid); + if (force_free || !rmid_dirty(entry)) { + clear_bit(entry->rmid, d->rmid_busy_llc); + if (!--entry->busy) { + rmid_limbo_count--; + list_add_tail(&entry->list, &rmid_free_lru); + } + } + crmid = nrmid + 1; + } +} + +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) +{ + return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; +} + +/* + * As of now the RMIDs allocation is global. + * However we keep track of which packages the RMIDs + * are used to optimize the limbo list management. + */ +int alloc_rmid(void) +{ + struct rmid_entry *entry; + + lockdep_assert_held(&rdtgroup_mutex); + + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? -EBUSY : -ENOSPC; + + entry = list_first_entry(&rmid_free_lru, + struct rmid_entry, list); + list_del(&entry->list); + + return entry->rmid; +} + +static void add_rmid_to_limbo(struct rmid_entry *entry) +{ + struct rdt_resource *r; + struct rdt_domain *d; + int cpu; + u64 val; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + entry->busy = 0; + cpu = get_cpu(); + list_for_each_entry(d, &r->domains, list) { + if (cpumask_test_cpu(cpu, &d->cpu_mask)) { + val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + if (val <= intel_cqm_threshold) + continue; + } + + /* + * For the first limbo RMID in the domain, + * setup up the limbo worker. + */ + if (!has_busy_rmid(r, d)) + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); + set_bit(entry->rmid, d->rmid_busy_llc); + entry->busy++; + } + put_cpu(); + + if (entry->busy) + rmid_limbo_count++; + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +void free_rmid(u32 rmid) +{ + struct rmid_entry *entry; + + if (!rmid) + return; + + lockdep_assert_held(&rdtgroup_mutex); + + entry = __rmid_entry(rmid); + + if (is_llc_occupancy_enabled()) + add_rmid_to_limbo(entry); + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +static int __mon_event_count(u32 rmid, struct rmid_read *rr) +{ + u64 chunks, shift, tval; + struct mbm_state *m; + + tval = __rmid_read(rmid, rr->evtid); + if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { + rr->val = tval; + return -EINVAL; + } + switch (rr->evtid) { + case QOS_L3_OCCUP_EVENT_ID: + rr->val += tval; + return 0; + case QOS_L3_MBM_TOTAL_EVENT_ID: + m = &rr->d->mbm_total[rmid]; + break; + case QOS_L3_MBM_LOCAL_EVENT_ID: + m = &rr->d->mbm_local[rmid]; + break; + default: + /* + * Code would never reach here because + * an invalid event id would fail the __rmid_read. + */ + return -EINVAL; + } + + if (rr->first) { + m->prev_msr = tval; + m->chunks = 0; + return 0; + } + + shift = 64 - MBM_CNTR_WIDTH; + chunks = (tval << shift) - (m->prev_msr << shift); + chunks >>= shift; + m->chunks += chunks; + m->prev_msr = tval; + + rr->val += m->chunks; + return 0; +} + +/* + * This is called via IPI to read the CQM/MBM counters + * on a domain. + */ +void mon_event_count(void *info) +{ + struct rdtgroup *rdtgrp, *entry; + struct rmid_read *rr = info; + struct list_head *head; + + rdtgrp = rr->rgrp; + + if (__mon_event_count(rdtgrp->mon.rmid, rr)) + return; + + /* + * For Ctrl groups read data from child monitor groups. + */ + head = &rdtgrp->mon.crdtgrp_list; + + if (rdtgrp->type == RDTCTRL_GROUP) { + list_for_each_entry(entry, head, mon.crdtgrp_list) { + if (__mon_event_count(entry->mon.rmid, rr)) + return; + } + } +} + +static void mbm_update(struct rdt_domain *d, int rmid) +{ + struct rmid_read rr; + + rr.first = false; + rr.d = d; + + /* + * This is protected from concurrent reads from user + * as both the user and we hold the global mutex. + */ + if (is_mbm_total_enabled()) { + rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; + __mon_event_count(rmid, &rr); + } + if (is_mbm_local_enabled()) { + rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; + __mon_event_count(rmid, &rr); + } +} + +/* + * Handler to scan the limbo list and move the RMIDs + * to free list whose occupancy < threshold_occupancy. + */ +void cqm_handle_limbo(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); + int cpu = smp_processor_id(); + struct rdt_resource *r; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + d = get_domain_from_cpu(cpu, r); + + if (!d) { + pr_warn_once("Failure to get domain for limbo worker\n"); + goto out_unlock; + } + + __check_limbo(d, false); + + if (has_busy_rmid(r, d)) + schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + struct rdt_resource *r; + int cpu; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + cpu = cpumask_any(&dom->cpu_mask); + dom->cqm_work_cpu = cpu; + + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); +} + +void mbm_handle_overflow(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); + struct rdtgroup *prgrp, *crgrp; + int cpu = smp_processor_id(); + struct list_head *head; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + if (!static_branch_likely(&rdt_enable_key)) + goto out_unlock; + + d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); + if (!d) + goto out_unlock; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + mbm_update(d, prgrp->mon.rmid); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) + mbm_update(d, crgrp->mon.rmid); + } + + schedule_delayed_work_on(cpu, &d->mbm_over, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + int cpu; + + if (!static_branch_likely(&rdt_enable_key)) + return; + cpu = cpumask_any(&dom->cpu_mask); + dom->mbm_work_cpu = cpu; + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); +} + +static int dom_data_init(struct rdt_resource *r) +{ + struct rmid_entry *entry = NULL; + int i, nr_rmids; + + nr_rmids = r->num_rmid; + rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); + if (!rmid_ptrs) + return -ENOMEM; + + for (i = 0; i < nr_rmids; i++) { + entry = &rmid_ptrs[i]; + INIT_LIST_HEAD(&entry->list); + + entry->rmid = i; + list_add_tail(&entry->list, &rmid_free_lru); + } + + /* + * RMID 0 is special and is always allocated. It's used for all + * tasks that are not monitored. + */ + entry = __rmid_entry(0); + list_del(&entry->list); + + return 0; +} + +static struct mon_evt llc_occupancy_event = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, +}; + +static struct mon_evt mbm_total_event = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, +}; + +static struct mon_evt mbm_local_event = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, +}; + +/* + * Initialize the event list for the resource. + * + * Note that MBM events are also part of RDT_RESOURCE_L3 resource + * because as per the SDM the total and local memory bandwidth + * are enumerated as part of L3 monitoring. + */ +static void l3_mon_evt_init(struct rdt_resource *r) +{ + INIT_LIST_HEAD(&r->evt_list); + + if (is_llc_occupancy_enabled()) + list_add_tail(&llc_occupancy_event.list, &r->evt_list); + if (is_mbm_total_enabled()) + list_add_tail(&mbm_total_event.list, &r->evt_list); + if (is_mbm_local_enabled()) + list_add_tail(&mbm_local_event.list, &r->evt_list); +} + +int rdt_get_mon_l3_config(struct rdt_resource *r) +{ + int ret; + + r->mon_scale = boot_cpu_data.x86_cache_occ_scale; + r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + + /* + * A reasonable upper limit on the max threshold is the number + * of lines tagged per RMID if all RMIDs have the same number of + * lines tagged in the LLC. + * + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. + */ + intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid; + + /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ + intel_cqm_threshold /= r->mon_scale; + + ret = dom_data_init(r); + if (ret) + return ret; + + l3_mon_evt_init(r); + + r->mon_capable = true; + r->mon_enabled = true; + + return 0; +} diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 9257bd9dc664..a869d4a073c5 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -32,17 +32,25 @@ #include <uapi/linux/magic.h> -#include <asm/intel_rdt.h> -#include <asm/intel_rdt_common.h> +#include <asm/intel_rdt_sched.h> +#include "intel_rdt.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); -struct kernfs_root *rdt_root; +DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); +DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); /* Kernel fs node for "info" directory under root */ static struct kernfs_node *kn_info; +/* Kernel fs node for "mon_groups" directory under root */ +static struct kernfs_node *kn_mongrp; + +/* Kernel fs node for "mon_data" directory under root */ +static struct kernfs_node *kn_mondata; + /* * Trivial allocator for CLOSIDs. Since h/w only supports a small number, * we can keep a bitmap of free CLOSIDs in a single integer. @@ -66,7 +74,7 @@ static void closid_init(void) int rdt_min_closid = 32; /* Compute rdt_min_closid across all resources */ - for_each_enabled_rdt_resource(r) + for_each_alloc_enabled_rdt_resource(r) rdt_min_closid = min(rdt_min_closid, r->num_closid); closid_free_map = BIT_MASK(rdt_min_closid) - 1; @@ -75,9 +83,9 @@ static void closid_init(void) closid_free_map &= ~1; } -int closid_alloc(void) +static int closid_alloc(void) { - int closid = ffs(closid_free_map); + u32 closid = ffs(closid_free_map); if (closid == 0) return -ENOSPC; @@ -125,28 +133,6 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) return 0; } -static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts, - int len) -{ - struct rftype *rft; - int ret; - - lockdep_assert_held(&rdtgroup_mutex); - - for (rft = rfts; rft < rfts + len; rft++) { - ret = rdtgroup_add_file(kn, rft); - if (ret) - goto error; - } - - return 0; -error: - pr_warn("Failed to add %s, err=%d\n", rft->name, ret); - while (--rft >= rfts) - kernfs_remove_by_name(kn, rft->name); - return ret; -} - static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; @@ -174,6 +160,11 @@ static struct kernfs_ops rdtgroup_kf_single_ops = { .seq_show = rdtgroup_seqfile_show, }; +static struct kernfs_ops kf_mondata_ops = { + .atomic_write_len = PAGE_SIZE, + .seq_show = rdtgroup_mondata_show, +}; + static bool is_cpu_list(struct kernfs_open_file *of) { struct rftype *rft = of->kn->priv; @@ -203,13 +194,18 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, /* * This is safe against intel_rdt_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call - * from rdt_update_closid() is proteced against __switch_to() because + * from update_closid_rmid() is proteced against __switch_to() because * preemption is disabled. */ -static void rdt_update_cpu_closid(void *closid) +static void update_cpu_closid_rmid(void *info) { - if (closid) - this_cpu_write(cpu_closid, *(int *)closid); + struct rdtgroup *r = info; + + if (r) { + this_cpu_write(pqr_state.default_closid, r->closid); + this_cpu_write(pqr_state.default_rmid, r->mon.rmid); + } + /* * We cannot unconditionally write the MSR because the current * executing task might have its own closid selected. Just reuse @@ -221,28 +217,128 @@ static void rdt_update_cpu_closid(void *closid) /* * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, * - * Per task closids must have been set up before calling this function. - * - * The per cpu closids are updated with the smp function call, when @closid - * is not NULL. If @closid is NULL then all affected percpu closids must - * have been set up before calling this function. + * Per task closids/rmids must have been set up before calling this function. */ static void -rdt_update_closid(const struct cpumask *cpu_mask, int *closid) +update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) { int cpu = get_cpu(); if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_update_cpu_closid(closid); - smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1); + update_cpu_closid_rmid(r); + smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1); put_cpu(); } +static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; + struct list_head *head; + + /* Check whether cpus belong to parent ctrl group */ + cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); + if (cpumask_weight(tmpmask)) + return -EINVAL; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Give any dropped cpus to parent rdtgroup */ + cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); + update_closid_rmid(tmpmask, prgrp); + } + + /* + * If we added cpus, remove them from previous group that owned them + * and update per-cpu rmid + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + if (crgrp == rdtgrp) + continue; + cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, + tmpmask); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + return 0; +} + +static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) +{ + struct rdtgroup *crgrp; + + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); + /* update the child mon group masks as well*/ + list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) + cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); +} + +static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask, cpumask_var_t tmpmask1) +{ + struct rdtgroup *r, *crgrp; + struct list_head *head; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Can't drop from default group */ + if (rdtgrp == &rdtgroup_default) + return -EINVAL; + + /* Give any dropped cpus to rdtgroup_default */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, tmpmask); + update_closid_rmid(tmpmask, &rdtgroup_default); + } + + /* + * If we added cpus, remove them from previous group and + * the prev group's child groups that owned them + * and update per-cpu closid/rmid. + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { + if (r == rdtgrp) + continue; + cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); + if (cpumask_weight(tmpmask1)) + cpumask_rdtgrp_clear(r, tmpmask1); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + /* + * Clear child mon group masks since there is a new parent mask + * now and update the rmid for the cpus the child lost. + */ + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); + update_closid_rmid(tmpmask, rdtgrp); + cpumask_clear(&crgrp->cpu_mask); + } + + return 0; +} + static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - cpumask_var_t tmpmask, newmask; - struct rdtgroup *rdtgrp, *r; + cpumask_var_t tmpmask, newmask, tmpmask1; + struct rdtgroup *rdtgrp; int ret; if (!buf) @@ -254,6 +350,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, free_cpumask_var(tmpmask); return -ENOMEM; } + if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + return -ENOMEM; + } rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -276,41 +377,18 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, goto unlock; } - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { - /* Can't drop from default group */ - if (rdtgrp == &rdtgroup_default) { - ret = -EINVAL; - goto unlock; - } - /* Give any dropped cpus to rdtgroup_default */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, tmpmask); - rdt_update_closid(tmpmask, &rdtgroup_default.closid); - } - - /* - * If we added cpus, remove them from previous group that owned them - * and update per-cpu closid - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { - list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { - if (r == rdtgrp) - continue; - cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); - } - rdt_update_closid(tmpmask, &rdtgrp->closid); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); + if (rdtgrp->type == RDTCTRL_GROUP) + ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); + else if (rdtgrp->type == RDTMON_GROUP) + ret = cpus_mon_write(rdtgrp, newmask, tmpmask); + else + ret = -EINVAL; unlock: rdtgroup_kn_unlock(of->kn); free_cpumask_var(tmpmask); free_cpumask_var(newmask); + free_cpumask_var(tmpmask1); return ret ?: nbytes; } @@ -336,6 +414,7 @@ static void move_myself(struct callback_head *head) if (atomic_dec_and_test(&rdtgrp->waitcount) && (rdtgrp->flags & RDT_DELETED)) { current->closid = 0; + current->rmid = 0; kfree(rdtgrp); } @@ -374,7 +453,20 @@ static int __rdtgroup_move_task(struct task_struct *tsk, atomic_dec(&rdtgrp->waitcount); kfree(callback); } else { - tsk->closid = rdtgrp->closid; + /* + * For ctrl_mon groups move both closid and rmid. + * For monitor groups, can move the tasks only from + * their parent CTRL group. + */ + if (rdtgrp->type == RDTCTRL_GROUP) { + tsk->closid = rdtgrp->closid; + tsk->rmid = rdtgrp->mon.rmid; + } else if (rdtgrp->type == RDTMON_GROUP) { + if (rdtgrp->mon.parent->closid == tsk->closid) + tsk->rmid = rdtgrp->mon.rmid; + else + ret = -EINVAL; + } } return ret; } @@ -454,7 +546,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) rcu_read_lock(); for_each_process_thread(p, t) { - if (t->closid == r->closid) + if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || + (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) seq_printf(s, "%d\n", t->pid); } rcu_read_unlock(); @@ -476,39 +569,6 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, return ret; } -/* Files in each rdtgroup */ -static struct rftype rdtgroup_base_files[] = { - { - .name = "cpus", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - }, - { - .name = "cpus_list", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .flags = RFTYPE_FLAGS_CPUS_LIST, - }, - { - .name = "tasks", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_tasks_write, - .seq_show = rdtgroup_tasks_show, - }, - { - .name = "schemata", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_schemata_write, - .seq_show = rdtgroup_schemata_show, - }, -}; - static int rdt_num_closids_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -536,6 +596,15 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, return 0; } +static int rdt_shareable_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%x\n", r->cache.shareable_bits); + return 0; +} + static int rdt_min_bw_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -545,6 +614,28 @@ static int rdt_min_bw_show(struct kernfs_open_file *of, return 0; } +static int rdt_num_rmids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->num_rmid); + + return 0; +} + +static int rdt_mon_features_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + struct mon_evt *mevt; + + list_for_each_entry(mevt, &r->evt_list, list) + seq_printf(seq, "%s\n", mevt->name); + + return 0; +} + static int rdt_bw_gran_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -563,74 +654,200 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, return 0; } +static int max_threshold_occ_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale); + + return 0; +} + +static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdt_resource *r = of->kn->parent->priv; + unsigned int bytes; + int ret; + + ret = kstrtouint(buf, 0, &bytes); + if (ret) + return ret; + + if (bytes > (boot_cpu_data.x86_cache_size * 1024)) + return -EINVAL; + + intel_cqm_threshold = bytes / r->mon_scale; + + return nbytes; +} + /* rdtgroup information files for one cache resource. */ -static struct rftype res_cache_info_files[] = { +static struct rftype res_common_files[] = { { .name = "num_closids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_num_closids_show, + .fflags = RF_CTRL_INFO, + }, + { + .name = "mon_features", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_mon_features_show, + .fflags = RF_MON_INFO, + }, + { + .name = "num_rmids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_rmids_show, + .fflags = RF_MON_INFO, }, { .name = "cbm_mask", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_default_ctrl_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_cbm_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_cbm_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, -}; - -/* rdtgroup information files for memory bandwidth. */ -static struct rftype res_mba_info_files[] = { { - .name = "num_closids", + .name = "shareable_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_closids_show, + .seq_show = rdt_shareable_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_bandwidth", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_bw_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "bandwidth_gran", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_bw_gran_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "delay_linear", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_delay_linear_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + }, + { + .name = "max_threshold_occupancy", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = max_threshold_occ_write, + .seq_show = max_threshold_occ_show, + .fflags = RF_MON_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "cpus", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "cpus_list", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .flags = RFTYPE_FLAGS_CPUS_LIST, + .fflags = RFTYPE_BASE, + }, + { + .name = "tasks", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_tasks_write, + .seq_show = rdtgroup_tasks_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "schemata", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_schemata_write, + .seq_show = rdtgroup_schemata_show, + .fflags = RF_CTRL_BASE, }, }; -void rdt_get_mba_infofile(struct rdt_resource *r) +static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) { - r->info_files = res_mba_info_files; - r->nr_info_files = ARRAY_SIZE(res_mba_info_files); + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + lockdep_assert_held(&rdtgroup_mutex); + + for (rft = rfts; rft < rfts + len; rft++) { + if ((fflags & rft->fflags) == rft->fflags) { + ret = rdtgroup_add_file(kn, rft); + if (ret) + goto error; + } + } + + return 0; +error: + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); + while (--rft >= rfts) { + if ((fflags & rft->fflags) == rft->fflags) + kernfs_remove_by_name(kn, rft->name); + } + return ret; } -void rdt_get_cache_infofile(struct rdt_resource *r) +static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, + unsigned long fflags) { - r->info_files = res_cache_info_files; - r->nr_info_files = ARRAY_SIZE(res_cache_info_files); + struct kernfs_node *kn_subdir; + int ret; + + kn_subdir = kernfs_create_dir(kn_info, name, + kn_info->mode, r); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + kernfs_get(kn_subdir); + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + ret = rdtgroup_add_files(kn_subdir, fflags); + if (!ret) + kernfs_activate(kn_subdir); + + return ret; } static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) { - struct kernfs_node *kn_subdir; - struct rftype *res_info_files; struct rdt_resource *r; - int ret, len; + unsigned long fflags; + char name[32]; + int ret; /* create the directory */ kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); @@ -638,25 +855,19 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) return PTR_ERR(kn_info); kernfs_get(kn_info); - for_each_enabled_rdt_resource(r) { - kn_subdir = kernfs_create_dir(kn_info, r->name, - kn_info->mode, r); - if (IS_ERR(kn_subdir)) { - ret = PTR_ERR(kn_subdir); - goto out_destroy; - } - kernfs_get(kn_subdir); - ret = rdtgroup_kn_set_ugid(kn_subdir); + for_each_alloc_enabled_rdt_resource(r) { + fflags = r->fflags | RF_CTRL_INFO; + ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags); if (ret) goto out_destroy; + } - res_info_files = r->info_files; - len = r->nr_info_files; - - ret = rdtgroup_add_files(kn_subdir, res_info_files, len); + for_each_mon_enabled_rdt_resource(r) { + fflags = r->fflags | RF_MON_INFO; + sprintf(name, "%s_MON", r->name); + ret = rdtgroup_mkdir_info_resdir(r, name, fflags); if (ret) goto out_destroy; - kernfs_activate(kn_subdir); } /* @@ -678,6 +889,39 @@ out_destroy: return ret; } +static int +mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, + char *name, struct kernfs_node **dest_kn) +{ + struct kernfs_node *kn; + int ret; + + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + if (dest_kn) + *dest_kn = kn; + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that @rdtgrp->kn is always accessible. + */ + kernfs_get(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + kernfs_activate(kn); + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} static void l3_qos_cfg_update(void *arg) { bool *enable = arg; @@ -718,14 +962,15 @@ static int cdp_enable(void) struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; int ret; - if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable) + if (!r_l3->alloc_capable || !r_l3data->alloc_capable || + !r_l3code->alloc_capable) return -EINVAL; ret = set_l3_qos_cfg(r_l3, true); if (!ret) { - r_l3->enabled = false; - r_l3data->enabled = true; - r_l3code->enabled = true; + r_l3->alloc_enabled = false; + r_l3data->alloc_enabled = true; + r_l3code->alloc_enabled = true; } return ret; } @@ -734,11 +979,11 @@ static void cdp_disable(void) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; - r->enabled = r->capable; + r->alloc_enabled = r->alloc_capable; - if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) { - rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false; - rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false; + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) { + rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false; + rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false; set_l3_qos_cfg(r, false); } } @@ -823,10 +1068,16 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) } } +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **mon_data_kn); + static struct dentry *rdt_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + struct rdt_domain *dom; + struct rdt_resource *r; struct dentry *dentry; int ret; @@ -853,15 +1104,54 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, goto out_cdp; } + if (rdt_mon_capable) { + ret = mongroup_create_dir(rdtgroup_default.kn, + NULL, "mon_groups", + &kn_mongrp); + if (ret) { + dentry = ERR_PTR(ret); + goto out_info; + } + kernfs_get(kn_mongrp); + + ret = mkdir_mondata_all(rdtgroup_default.kn, + &rdtgroup_default, &kn_mondata); + if (ret) { + dentry = ERR_PTR(ret); + goto out_mongrp; + } + kernfs_get(kn_mondata); + rdtgroup_default.mon.mon_data_kn = kn_mondata; + } + dentry = kernfs_mount(fs_type, flags, rdt_root, RDTGROUP_SUPER_MAGIC, NULL); if (IS_ERR(dentry)) - goto out_destroy; + goto out_mondata; + + if (rdt_alloc_capable) + static_branch_enable(&rdt_alloc_enable_key); + if (rdt_mon_capable) + static_branch_enable(&rdt_mon_enable_key); + + if (rdt_alloc_capable || rdt_mon_capable) + static_branch_enable(&rdt_enable_key); + + if (is_mbm_enabled()) { + r = &rdt_resources_all[RDT_RESOURCE_L3]; + list_for_each_entry(dom, &r->domains, list) + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); + } - static_branch_enable(&rdt_enable_key); goto out; -out_destroy: +out_mondata: + if (rdt_mon_capable) + kernfs_remove(kn_mondata); +out_mongrp: + if (rdt_mon_capable) + kernfs_remove(kn_mongrp); +out_info: kernfs_remove(kn_info); out_cdp: cdp_disable(); @@ -909,6 +1199,18 @@ static int reset_all_ctrls(struct rdt_resource *r) return 0; } +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_alloc_capable && + (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_mon_capable && + (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); +} + /* * Move tasks from one to the other group. If @from is NULL, then all tasks * in the systems are moved unconditionally (used for teardown). @@ -924,8 +1226,11 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, read_lock(&tasklist_lock); for_each_process_thread(p, t) { - if (!from || t->closid == from->closid) { + if (!from || is_closid_match(t, from) || + is_rmid_match(t, from)) { t->closid = to->closid; + t->rmid = to->mon.rmid; + #ifdef CONFIG_SMP /* * This is safe on x86 w/o barriers as the ordering @@ -944,6 +1249,19 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, read_unlock(&tasklist_lock); } +static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) +{ + struct rdtgroup *sentry, *stmp; + struct list_head *head; + + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + free_rmid(sentry->mon.rmid); + list_del(&sentry->mon.crdtgrp_list); + kfree(sentry); + } +} + /* * Forcibly remove all of subdirectories under root. */ @@ -955,6 +1273,9 @@ static void rmdir_all_sub(void) rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { + /* Free any child rmids */ + free_all_child_rdtgrp(rdtgrp); + /* Remove each rdtgroup other than root */ if (rdtgrp == &rdtgroup_default) continue; @@ -967,16 +1288,20 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + free_rmid(rdtgrp->mon.rmid); + kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->rdtgroup_list); kfree(rdtgrp); } /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ get_online_cpus(); - rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid); + update_closid_rmid(cpu_online_mask, &rdtgroup_default); put_online_cpus(); kernfs_remove(kn_info); + kernfs_remove(kn_mongrp); + kernfs_remove(kn_mondata); } static void rdt_kill_sb(struct super_block *sb) @@ -986,10 +1311,12 @@ static void rdt_kill_sb(struct super_block *sb) mutex_lock(&rdtgroup_mutex); /*Put everything back to default values. */ - for_each_enabled_rdt_resource(r) + for_each_alloc_enabled_rdt_resource(r) reset_all_ctrls(r); cdp_disable(); rmdir_all_sub(); + static_branch_disable(&rdt_alloc_enable_key); + static_branch_disable(&rdt_mon_enable_key); static_branch_disable(&rdt_enable_key); kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); @@ -1001,46 +1328,223 @@ static struct file_system_type rdt_fs_type = { .kill_sb = rdt_kill_sb, }; -static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +static int mon_addfile(struct kernfs_node *parent_kn, const char *name, + void *priv) { - struct rdtgroup *parent, *rdtgrp; struct kernfs_node *kn; - int ret, closid; + int ret = 0; - /* Only allow mkdir in the root directory */ - if (parent_kn != rdtgroup_default.kn) - return -EPERM; + kn = __kernfs_create_file(parent_kn, name, 0444, 0, + &kf_mondata_ops, priv, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); - /* Do not accept '\n' to avoid unparsable situation. */ - if (strchr(name, '\n')) - return -EINVAL; + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } - parent = rdtgroup_kn_lock_live(parent_kn); - if (!parent) { - ret = -ENODEV; - goto out_unlock; + return ret; +} + +/* + * Remove all subdirectories of mon_data of ctrl_mon groups + * and monitor groups with given domain id. + */ +void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id) +{ + struct rdtgroup *prgrp, *crgrp; + char name[32]; + + if (!r->mon_enabled) + return; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + sprintf(name, "mon_%s_%02d", r->name, dom_id); + kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); } +} - ret = closid_alloc(); - if (ret < 0) +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, + struct rdt_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + union mon_data_bits priv; + struct kernfs_node *kn; + struct mon_evt *mevt; + struct rmid_read rr; + char name[32]; + int ret; + + sprintf(name, "mon_%s_%02d", r->name, d->id); + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that kn is always accessible. + */ + kernfs_get(kn); + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + if (WARN_ON(list_empty(&r->evt_list))) { + ret = -EPERM; + goto out_destroy; + } + + priv.u.rid = r->rid; + priv.u.domid = d->id; + list_for_each_entry(mevt, &r->evt_list, list) { + priv.u.evtid = mevt->evtid; + ret = mon_addfile(kn, mevt->name, priv.priv); + if (ret) + goto out_destroy; + + if (is_mbm_event(mevt->evtid)) + mon_event_read(&rr, d, prgrp, mevt->evtid, true); + } + kernfs_activate(kn); + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +/* + * Add all subdirectories of mon_data for "ctrl_mon" groups + * and "monitor" groups with given domain id. + */ +void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d) +{ + struct kernfs_node *parent_kn; + struct rdtgroup *prgrp, *crgrp; + struct list_head *head; + + if (!r->mon_enabled) + return; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + parent_kn = prgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, prgrp); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + parent_kn = crgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, crgrp); + } + } +} + +static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, + struct rdt_resource *r, + struct rdtgroup *prgrp) +{ + struct rdt_domain *dom; + int ret; + + list_for_each_entry(dom, &r->domains, list) { + ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); + if (ret) + return ret; + } + + return 0; +} + +/* + * This creates a directory mon_data which contains the monitored data. + * + * mon_data has one directory for each domain whic are named + * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data + * with L3 domain looks as below: + * ./mon_data: + * mon_L3_00 + * mon_L3_01 + * mon_L3_02 + * ... + * + * Each domain directory has one file per event: + * ./mon_L3_00/: + * llc_occupancy + * + */ +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **dest_kn) +{ + struct rdt_resource *r; + struct kernfs_node *kn; + int ret; + + /* + * Create the mon_data directory first. + */ + ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn); + if (ret) + return ret; + + if (dest_kn) + *dest_kn = kn; + + /* + * Create the subdirectories for each domain. Note that all events + * in a domain like L3 are grouped into a resource whose domain is L3 + */ + for_each_mon_enabled_rdt_resource(r) { + ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); + if (ret) + goto out_destroy; + } + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, umode_t mode, + enum rdt_group_type rtype, struct rdtgroup **r) +{ + struct rdtgroup *prdtgrp, *rdtgrp; + struct kernfs_node *kn; + uint files = 0; + int ret; + + prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); + if (!prdtgrp) { + ret = -ENODEV; goto out_unlock; - closid = ret; + } /* allocate the rdtgroup. */ rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); if (!rdtgrp) { ret = -ENOSPC; - goto out_closid_free; + goto out_unlock; } - rdtgrp->closid = closid; - list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + *r = rdtgrp; + rdtgrp->mon.parent = prdtgrp; + rdtgrp->type = rtype; + INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); /* kernfs creates the directory for rdtgrp */ - kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp); + kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); if (IS_ERR(kn)) { ret = PTR_ERR(kn); - goto out_cancel_ref; + goto out_free_rgrp; } rdtgrp->kn = kn; @@ -1056,43 +1560,211 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - ret = rdtgroup_add_files(kn, rdtgroup_base_files, - ARRAY_SIZE(rdtgroup_base_files)); + files = RFTYPE_BASE | RFTYPE_CTRL; + files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); + ret = rdtgroup_add_files(kn, files); if (ret) goto out_destroy; + if (rdt_mon_capable) { + ret = alloc_rmid(); + if (ret < 0) + goto out_destroy; + rdtgrp->mon.rmid = ret; + + ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); + if (ret) + goto out_idfree; + } kernfs_activate(kn); - ret = 0; - goto out_unlock; + /* + * The caller unlocks the prgrp_kn upon success. + */ + return 0; +out_idfree: + free_rmid(rdtgrp->mon.rmid); out_destroy: kernfs_remove(rdtgrp->kn); -out_cancel_ref: - list_del(&rdtgrp->rdtgroup_list); +out_free_rgrp: kfree(rdtgrp); -out_closid_free: - closid_free(closid); out_unlock: - rdtgroup_kn_unlock(parent_kn); + rdtgroup_kn_unlock(prgrp_kn); return ret; } -static int rdtgroup_rmdir(struct kernfs_node *kn) +static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) +{ + kernfs_remove(rgrp->kn); + free_rmid(rgrp->mon.rmid); + kfree(rgrp); +} + +/* + * Create a monitor group under "mon_groups" directory of a control + * and monitor group(ctrl_mon). This is a resource group + * to monitor a subset of tasks and cpus in its parent ctrl_mon group. + */ +static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, + umode_t mode) +{ + struct rdtgroup *rdtgrp, *prgrp; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP, + &rdtgrp); + if (ret) + return ret; + + prgrp = rdtgrp->mon.parent; + rdtgrp->closid = prgrp->closid; + + /* + * Add the rdtgrp to the list of rdtgrps the parent + * ctrl_mon group has to track. + */ + list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); + + rdtgroup_kn_unlock(prgrp_kn); + return ret; +} + +/* + * These are rdtgroups created under the root directory. Can be used + * to allocate and monitor resources. + */ +static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, umode_t mode) { - int ret, cpu, closid = rdtgroup_default.closid; struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; + struct kernfs_node *kn; + u32 closid; + int ret; - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; + ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP, + &rdtgrp); + if (ret) + return ret; - rdtgrp = rdtgroup_kn_lock_live(kn); - if (!rdtgrp) { - ret = -EPERM; - goto out; + kn = rdtgrp->kn; + ret = closid_alloc(); + if (ret < 0) + goto out_common_fail; + closid = ret; + + rdtgrp->closid = closid; + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + + if (rdt_mon_capable) { + /* + * Create an empty mon_groups directory to hold the subset + * of tasks and cpus to monitor. + */ + ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); + if (ret) + goto out_id_free; } + goto out_unlock; + +out_id_free: + closid_free(closid); + list_del(&rdtgrp->rdtgroup_list); +out_common_fail: + mkdir_rdt_prepare_clean(rdtgrp); +out_unlock: + rdtgroup_kn_unlock(prgrp_kn); + return ret; +} + +/* + * We allow creating mon groups only with in a directory called "mon_groups" + * which is present in every ctrl_mon group. Check if this is a valid + * "mon_groups" directory. + * + * 1. The directory should be named "mon_groups". + * 2. The mon group itself should "not" be named "mon_groups". + * This makes sure "mon_groups" directory always has a ctrl_mon group + * as parent. + */ +static bool is_mon_groups(struct kernfs_node *kn, const char *name) +{ + return (!strcmp(kn->name, "mon_groups") && + strcmp(name, "mon_groups")); +} + +static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + /* Do not accept '\n' to avoid unparsable situation. */ + if (strchr(name, '\n')) + return -EINVAL; + + /* + * If the parent directory is the root directory and RDT + * allocation is supported, add a control and monitoring + * subdirectory + */ + if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) + return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode); + + /* + * If RDT monitoring is supported and the parent directory is a valid + * "mon_groups" directory, add a monitoring subdirectory. + */ + if (rdt_mon_capable && is_mon_groups(parent_kn, name)) + return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode); + + return -EPERM; +} + +static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + int cpu; + + /* Give any tasks back to the parent group */ + rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); + + /* Update per cpu rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) + per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid; + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. + */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + update_closid_rmid(tmpmask, NULL); + + rdtgrp->flags = RDT_DELETED; + free_rmid(rdtgrp->mon.rmid); + + /* + * Remove the rdtgrp from the parent ctrl_mon group's list + */ + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_del(&rdtgrp->mon.crdtgrp_list); + + /* + * one extra hold on this, will drop when we kfree(rdtgrp) + * in rdtgroup_kn_unlock() + */ + kernfs_get(kn); + kernfs_remove(rdtgrp->kn); + + return 0; +} + +static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + cpumask_var_t tmpmask) +{ + int cpu; + /* Give any tasks back to the default group */ rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); @@ -1100,18 +1772,28 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - /* Update per cpu closid of the moved CPUs first */ - for_each_cpu(cpu, &rdtgrp->cpu_mask) - per_cpu(cpu_closid, cpu) = closid; + /* Update per cpu closid and rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) { + per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid; + per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid; + } + /* * Update the MSR on moved CPUs and CPUs which have moved * task running on them. */ cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - rdt_update_closid(tmpmask, NULL); + update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; closid_free(rdtgrp->closid); + free_rmid(rdtgrp->mon.rmid); + + /* + * Free all the child monitor group rmids. + */ + free_all_child_rdtgrp(rdtgrp); + list_del(&rdtgrp->rdtgroup_list); /* @@ -1120,7 +1802,41 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) */ kernfs_get(kn); kernfs_remove(rdtgrp->kn); - ret = 0; + + return 0; +} + +static int rdtgroup_rmdir(struct kernfs_node *kn) +{ + struct kernfs_node *parent_kn = kn->parent; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret = 0; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + rdtgrp = rdtgroup_kn_lock_live(kn); + if (!rdtgrp) { + ret = -EPERM; + goto out; + } + + /* + * If the rdtgroup is a ctrl_mon group and parent directory + * is the root directory, remove the ctrl_mon group. + * + * If the rdtgroup is a mon group and parent directory + * is a valid "mon_groups" directory, remove the mon group. + */ + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) + ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); + else if (rdtgrp->type == RDTMON_GROUP && + is_mon_groups(parent_kn, kn->name)) + ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); + else + ret = -EPERM; + out: rdtgroup_kn_unlock(kn); free_cpumask_var(tmpmask); @@ -1129,7 +1845,7 @@ out: static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) { - if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) seq_puts(seq, ",cdp"); return 0; } @@ -1153,10 +1869,13 @@ static int __init rdtgroup_setup_root(void) mutex_lock(&rdtgroup_mutex); rdtgroup_default.closid = 0; + rdtgroup_default.mon.rmid = 0; + rdtgroup_default.type = RDTCTRL_GROUP; + INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files, - ARRAY_SIZE(rdtgroup_base_files)); + ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE); if (ret) { kernfs_destroy_root(rdt_root); goto out; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 6dde0497efc7..3b413065c613 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -51,6 +51,7 @@ #include <asm/mce.h> #include <asm/msr.h> #include <asm/reboot.h> +#include <asm/set_memory.h> #include "mce-internal.h" @@ -1051,6 +1052,48 @@ static int do_memory_failure(struct mce *m) return ret; } +#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE) + +void arch_unmap_kpfn(unsigned long pfn) +{ + unsigned long decoy_addr; + + /* + * Unmap this page from the kernel 1:1 mappings to make sure + * we don't log more errors because of speculative access to + * the page. + * We would like to just call: + * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1); + * but doing that would radically increase the odds of a + * speculative access to the posion page because we'd have + * the virtual address of the kernel 1:1 mapping sitting + * around in registers. + * Instead we get tricky. We create a non-canonical address + * that looks just like the one we want, but has bit 63 flipped. + * This relies on set_memory_np() not checking whether we passed + * a legal address. + */ + +/* + * Build time check to see if we have a spare virtual bit. Don't want + * to leave this until run time because most developers don't have a + * system that can exercise this code path. This will only become a + * problem if/when we move beyond 5-level page tables. + * + * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD) + */ +#if PGDIR_SHIFT + 9 < 63 + decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); +#else +#error "no unused virtual bit available" +#endif + + if (set_memory_np(decoy_addr, 1)) + pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); + +} +#endif + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 9e314bcf67cc..40e28ed77fbf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -201,8 +201,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu) wrmsr(smca_config, low, high); } - /* Collect bank_info using CPU 0 for now. */ - if (cpu) + /* Return early if this bank was already initialized. */ + if (smca_banks[bank].hwid) return; if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { @@ -216,11 +216,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu) for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { s_hwid = &smca_hwid_mcatypes[i]; if (hwid_mcatype == s_hwid->hwid_mcatype) { - - WARN(smca_banks[bank].hwid, - "Bank %s already initialized!\n", - smca_get_name(s_hwid->bank_type)); - smca_banks[bank].hwid = s_hwid; smca_banks[bank].id = low; smca_banks[bank].sysfs_id = s_hwid->count++; @@ -776,24 +771,12 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) mce_log(&m); } -static inline void __smp_deferred_error_interrupt(void) -{ - inc_irq_stat(irq_deferred_error_count); - deferred_error_int_vector(); -} - asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void) { entering_irq(); - __smp_deferred_error_interrupt(); - exiting_ack_irq(); -} - -asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void) -{ - entering_irq(); trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); - __smp_deferred_error_interrupt(); + inc_irq_stat(irq_deferred_error_count); + deferred_error_int_vector(); trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); exiting_ack_irq(); } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index f7370abd33c6..2da67b70ba98 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -390,26 +390,12 @@ static void unexpected_thermal_interrupt(void) static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; -static inline void __smp_thermal_interrupt(void) -{ - inc_irq_stat(irq_thermal_count); - smp_thermal_vector(); -} - -asmlinkage __visible void __irq_entry -smp_thermal_interrupt(struct pt_regs *regs) -{ - entering_irq(); - __smp_thermal_interrupt(); - exiting_ack_irq(); -} - -asmlinkage __visible void __irq_entry -smp_trace_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *r) { entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); - __smp_thermal_interrupt(); + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); trace_thermal_apic_exit(THERMAL_APIC_VECTOR); exiting_ack_irq(); } diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index bb0e75eed10a..5e7249e42f8f 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -17,24 +17,12 @@ static void default_threshold_interrupt(void) void (*mce_threshold_vector)(void) = default_threshold_interrupt; -static inline void __smp_threshold_interrupt(void) -{ - inc_irq_stat(irq_threshold_count); - mce_threshold_vector(); -} - asmlinkage __visible void __irq_entry smp_threshold_interrupt(void) { entering_irq(); - __smp_threshold_interrupt(); - exiting_ack_irq(); -} - -asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void) -{ - entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); - __smp_threshold_interrupt(); + inc_irq_stat(irq_threshold_count); + mce_threshold_vector(); trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); exiting_ack_irq(); } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 21b185793c80..c6daec4bdba5 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -400,9 +400,12 @@ static void update_cache(struct ucode_patch *new_patch) list_for_each_entry(p, µcode_cache, plist) { if (p->equiv_cpu == new_patch->equiv_cpu) { - if (p->patch_id >= new_patch->patch_id) + if (p->patch_id >= new_patch->patch_id) { /* we already have the latest patch */ + kfree(new_patch->data); + kfree(new_patch); return; + } list_replace(&p->plist, &new_patch->plist); kfree(p->data); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 59edbe9d4ccb..8f7a9bbad514 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -146,18 +146,18 @@ static bool microcode_matches(struct microcode_header_intel *mc_header, return false; } -static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size) +static struct ucode_patch *memdup_patch(void *data, unsigned int size) { struct ucode_patch *p; p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL); if (!p) - return ERR_PTR(-ENOMEM); + return NULL; p->data = kmemdup(data, size, GFP_KERNEL); if (!p->data) { kfree(p); - return ERR_PTR(-ENOMEM); + return NULL; } return p; @@ -183,8 +183,8 @@ static void save_microcode_patch(void *data, unsigned int size) if (mc_hdr->rev <= mc_saved_hdr->rev) continue; - p = __alloc_microcode_buf(data, size); - if (IS_ERR(p)) + p = memdup_patch(data, size); + if (!p) pr_err("Error allocating buffer %p\n", data); else list_replace(&iter->plist, &p->plist); @@ -196,24 +196,25 @@ static void save_microcode_patch(void *data, unsigned int size) * newly found. */ if (!prev_found) { - p = __alloc_microcode_buf(data, size); - if (IS_ERR(p)) + p = memdup_patch(data, size); + if (!p) pr_err("Error allocating buffer for %p\n", data); else list_add_tail(&p->plist, µcode_cache); } + if (!p) + return; + /* * Save for early loading. On 32-bit, that needs to be a physical * address as the APs are running from physical addresses, before * paging has been enabled. */ - if (p) { - if (IS_ENABLED(CONFIG_X86_32)) - intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data); - else - intel_ucode_patch = p->data; - } + if (IS_ENABLED(CONFIG_X86_32)) + intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data); + else + intel_ucode_patch = p->data; } static int microcode_sanity_check(void *mc, int print_err) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 70e717fccdd6..3b3f713e15e5 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -59,13 +59,8 @@ void hyperv_vector_handler(struct pt_regs *regs) void hv_setup_vmbus_irq(void (*handler)(void)) { vmbus_handler = handler; - /* - * Setup the IDT for hypervisor callback. Prevent reallocation - * at module reload. - */ - if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, - hyperv_callback_vector); + /* Setup the IDT for hypervisor callback */ + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); } void hv_remove_vmbus_irq(void) @@ -184,9 +179,15 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); - pr_info("HyperV: features 0x%x, hints 0x%x\n", + pr_info("Hyper-V: features 0x%x, hints 0x%x\n", ms_hyperv.features, ms_hyperv.hints); + ms_hyperv.max_vp_index = cpuid_eax(HVCPUID_IMPLEMENTATION_LIMITS); + ms_hyperv.max_lp_index = cpuid_ebx(HVCPUID_IMPLEMENTATION_LIMITS); + + pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n", + ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); + /* * Extract host information. */ @@ -219,7 +220,7 @@ static void __init ms_hyperv_init_platform(void) rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); lapic_timer_frequency = hv_lapic_frequency; - pr_info("HyperV: LAPIC Timer Frequency: %#x\n", + pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n", lapic_timer_frequency); } @@ -249,11 +250,12 @@ static void __init ms_hyperv_init_platform(void) * Setup the hook to get control post apic initialization. */ x86_platform.apic_post_init = hyperv_init; + hyperv_setup_mmu_ops(); #endif } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { - .name = "Microsoft HyperV", + .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, .init_platform = ms_hyperv_init_platform, }; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 23c23508c012..05459ad3db46 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -31,6 +31,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, + { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index dbce3cca94cb..f13b4c00a5de 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -94,6 +94,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (stack_name) printk("%s <%s>\n", log_lvl, stack_name); + if (regs && on_stack(&stack_info, regs, sizeof(*regs))) + __show_regs(regs, 0); + /* * Scan the stack, printing any text addresses we find. At the * same time, follow proper stack frames with the unwinder. @@ -118,10 +121,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * Don't print regs->ip again if it was already printed * by __show_regs() below. */ - if (regs && stack == ®s->ip) { - unwind_next_frame(&state); - continue; - } + if (regs && stack == ®s->ip) + goto next; if (stack == ret_addr_p) reliable = 1; @@ -144,6 +145,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (!reliable) continue; +next: /* * Get the next frame from the unwinder. No need to * check for an error: if anything goes wrong, the rest @@ -153,7 +155,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, /* if the frame has entry regs, print them */ regs = unwind_get_entry_regs(&state); - if (regs) + if (regs && on_stack(&stack_info, regs, sizeof(*regs))) __show_regs(regs, 0); } @@ -265,7 +267,7 @@ int __die(const char *str, struct pt_regs *regs, long err) #ifdef CONFIG_X86_32 if (user_mode(regs)) { sp = regs->sp; - ss = regs->ss & 0xffff; + ss = regs->ss; } else { sp = kernel_stack_pointer(regs); savesegment(ss, ss); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e5f0b40e66d2..4f0481474903 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -37,7 +37,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack < begin || stack > end) + if (stack <= begin || stack > end) return false; info->type = STACK_TYPE_IRQ; @@ -62,7 +62,7 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack < begin || stack > end) + if (stack <= begin || stack > end) return false; info->type = STACK_TYPE_SOFTIRQ; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 3e1471d57487..225af4184f06 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -55,7 +55,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) begin = end - (exception_stack_sizes[k] / sizeof(long)); regs = (struct pt_regs *)end - 1; - if (stack < begin || stack >= end) + if (stack <= begin || stack >= end) continue; info->type = STACK_TYPE_EXCEPTION + k; @@ -78,7 +78,7 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack < begin || stack > end) + if (stack <= begin || stack > end) return false; info->type = STACK_TYPE_IRQ; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 532da61d605c..71c11ad5643e 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -96,7 +96,8 @@ EXPORT_SYMBOL_GPL(e820__mapped_any); * Note: this function only works correctly once the E820 table is sorted and * not-overlapping (at least for the range specified), which is the case normally. */ -bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) +static struct e820_entry *__e820__mapped_all(u64 start, u64 end, + enum e820_type type) { int i; @@ -122,9 +123,28 @@ bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) * coverage of the desired range exists: */ if (start >= end) - return 1; + return entry; } - return 0; + + return NULL; +} + +/* + * This function checks if the entire range <start,end> is mapped with type. + */ +bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) +{ + return __e820__mapped_all(start, end, type); +} + +/* + * This function returns the type associated with the range <start,end>. + */ +int e820__get_entry_type(u64 start, u64 end) +{ + struct e820_entry *entry = __e820__mapped_all(start, end, 0); + + return entry ? entry->type : -EINVAL; } /* diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index d907c3d8633f..927abeaf63e2 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -12,10 +12,10 @@ #include <linux/pci.h> #include <linux/acpi.h> #include <linux/delay.h> -#include <linux/dmi.h> #include <linux/pci_ids.h> #include <linux/bcma/bcma.h> #include <linux/bcma/bcma_regs.h> +#include <linux/platform_data/x86/apple.h> #include <drm/i915_drm.h> #include <asm/pci-direct.h> #include <asm/dma.h> @@ -527,6 +527,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_BXT_IDS(&gen9_early_ops), INTEL_KBL_IDS(&gen9_early_ops), INTEL_GLK_IDS(&gen9_early_ops), + INTEL_CNL_IDS(&gen9_early_ops), }; static void __init @@ -593,7 +594,7 @@ static void __init apple_airport_reset(int bus, int slot, int func) u64 addr; int i; - if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc.")) + if (!x86_apple_machine) return; /* Card may have been put into PCI_D3hot by grub quirk */ diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c new file mode 100644 index 000000000000..f260e452e4f8 --- /dev/null +++ b/arch/x86/kernel/eisa.c @@ -0,0 +1,19 @@ +/* + * EISA specific code + * + * This file is licensed under the GPL V2 + */ +#include <linux/ioport.h> +#include <linux/eisa.h> +#include <linux/io.h> + +static __init int eisa_bus_probe(void) +{ + void __iomem *p = ioremap(0x0FFFD9, 4); + + if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) + EISA_bus = 1; + iounmap(p); + return 0; +} +subsys_initcall(eisa_bus_probe); diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 6b91e2eb8d3f..9c4e7ba6870c 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -195,7 +195,7 @@ void init_espfix_ap(int cpu) pte_p = pte_offset_kernel(&pmd, addr); stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); - pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); + pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask)); for (n = 0; n < ESPFIX_PTE_CLONES; n++) set_pte(&pte_p[n*PTE_STRIDE], pte); diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 538ec012b371..cf2ce063f65a 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <linux/memblock.h> +#include <asm/desc.h> #include <asm/setup.h> #include <asm/sections.h> #include <asm/e820/api.h> @@ -30,6 +31,9 @@ static void __init i386_default_early_setup(void) asmlinkage __visible void __init i386_start_kernel(void) { cr4_init_shadow(); + + idt_setup_early_handler(); + sanitize_boot_params(&boot_params); x86_early_init_platform_quirks(); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 9ba79543d9ee..bab4fa579450 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -14,6 +14,7 @@ #include <linux/start_kernel.h> #include <linux/io.h> #include <linux/memblock.h> +#include <linux/mem_encrypt.h> #include <asm/processor.h> #include <asm/proto.h> @@ -33,7 +34,6 @@ /* * Manage page tables very early on. */ -extern pgd_t early_top_pgt[PTRS_PER_PGD]; extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); @@ -45,9 +45,11 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr) return ptr - (void *)_text + (void *)physaddr; } -void __head __startup_64(unsigned long physaddr) +unsigned long __head __startup_64(unsigned long physaddr, + struct boot_params *bp) { unsigned long load_delta, *p; + unsigned long pgtable_flags; pgdval_t *pgd; p4dval_t *p4d; pudval_t *pud; @@ -69,6 +71,12 @@ void __head __startup_64(unsigned long physaddr) if (load_delta & ~PMD_PAGE_MASK) for (;;); + /* Activate Secure Memory Encryption (SME) if supported and enabled */ + sme_enable(bp); + + /* Include the SME encryption mask in the fixup value */ + load_delta += sme_get_me_mask(); + /* Fixup the physical addresses in the page table */ pgd = fixup_pointer(&early_top_pgt, physaddr); @@ -92,31 +100,35 @@ void __head __startup_64(unsigned long physaddr) * creates a bunch of nonsense entries but that is fine -- * it avoids problems around wraparound. */ + next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr); pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); + pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE; - pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE; + pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; + pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D; - p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; - p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; + p4d[i + 0] = (pgdval_t)pud + pgtable_flags; + p4d[i + 1] = (pgdval_t)pud + pgtable_flags; } else { i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; - pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; + pgd[i + 0] = (pgdval_t)pud + pgtable_flags; + pgd[i + 1] = (pgdval_t)pud + pgtable_flags; } i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD; - pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE; - pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE; + pud[i + 0] = (pudval_t)pmd + pgtable_flags; + pud[i + 1] = (pudval_t)pmd + pgtable_flags; pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; + pmd_entry += sme_get_me_mask(); pmd_entry += physaddr; for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { @@ -137,9 +149,30 @@ void __head __startup_64(unsigned long physaddr) pmd[i] += load_delta; } - /* Fixup phys_base */ + /* + * Fixup phys_base - remove the memory encryption mask to obtain + * the true physical address. + */ p = fixup_pointer(&phys_base, physaddr); - *p += load_delta; + *p += load_delta - sme_get_me_mask(); + + /* Encrypt the kernel (if SME is active) */ + sme_encrypt_kernel(); + + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + +unsigned long __startup_secondary_64(void) +{ + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); } /* Wipe all early page tables except for the kernel symbol map */ @@ -147,17 +180,17 @@ static void __init reset_early_page_tables(void) { memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); next_early_pgt = 0; - write_cr3(__pa_nodebug(early_top_pgt)); + write_cr3(__sme_pa_nodebug(early_top_pgt)); } /* Create a new PMD entry */ -int __init early_make_pgtable(unsigned long address) +int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) { unsigned long physaddr = address - __PAGE_OFFSET; pgdval_t pgd, *pgd_p; p4dval_t p4d, *p4d_p; pudval_t pud, *pud_p; - pmdval_t pmd, *pmd_p; + pmdval_t *pmd_p; /* Invalid address or early pgt is done ? */ if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) @@ -216,12 +249,21 @@ again: memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } - pmd = (physaddr & PMD_MASK) + early_pmd_flags; pmd_p[pmd_index(address)] = pmd; return 0; } +int __init early_make_pgtable(unsigned long address) +{ + unsigned long physaddr = address - __PAGE_OFFSET; + pmdval_t pmd; + + pmd = (physaddr & PMD_MASK) + early_pmd_flags; + + return __early_make_pgtable(address, pmd); +} + /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. */ static void __init clear_bss(void) @@ -244,6 +286,12 @@ static void __init copy_bootdata(char *real_mode_data) char * command_line; unsigned long cmd_line_ptr; + /* + * If SME is active, this will create decrypted mappings of the + * boot data in advance of the copy operations. + */ + sme_map_bootdata(real_mode_data); + memcpy(&boot_params, real_mode_data, sizeof boot_params); sanitize_boot_params(&boot_params); cmd_line_ptr = get_cmd_line_ptr(); @@ -251,12 +299,18 @@ static void __init copy_bootdata(char *real_mode_data) command_line = __va(cmd_line_ptr); memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); } + + /* + * The old boot data is no longer needed and won't be reserved, + * freeing up that memory for use by the system. If SME is active, + * we need to remove the mappings that were created so that the + * memory doesn't remain mapped as decrypted. + */ + sme_unmap_bootdata(real_mode_data); } asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) { - int i; - /* * Build-time sanity checks on the kernel image and module * area mappings. (these are purely build-time and produce no code) @@ -280,11 +334,16 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_page(init_top_pgt); + /* + * SME support may update early_pmd_flags to include the memory + * encryption mask, so it needs to be called before anything + * that may generate a page fault. + */ + sme_early_init(); + kasan_early_init(); - for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) - set_intr_gate(i, early_idt_handler_array[i]); - load_idt((const struct desc_ptr *)&idt_descr); + idt_setup_early_handler(); copy_bootdata(__va(real_mode_data)); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 1f85ee8f9439..9ed3074d0d27 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -155,7 +155,6 @@ ENTRY(startup_32) jmp *%eax .Lbad_subarch: -WEAK(lguest_entry) WEAK(xen_entry) /* Unknown implementation; there's really nothing we can do at this point. */ @@ -165,7 +164,6 @@ WEAK(xen_entry) subarch_entries: .long .Ldefault_entry /* normal x86/PC */ - .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ .long .Ldefault_entry /* Moorestown MID */ num_subarch_entries = (. - subarch_entries) / 4 @@ -347,7 +345,6 @@ ENTRY(startup_32_smp) movl %eax,%cr0 lgdt early_gdt_descr - lidt idt_descr ljmp $(__KERNEL_CS),$1f 1: movl $(__KERNEL_DS),%eax # reload all the segment registers movl %eax,%ss # after changing gdt. @@ -380,37 +377,6 @@ ENDPROC(startup_32_smp) */ __INIT setup_once: - /* - * Set up a idt with 256 interrupt gates that push zero if there - * is no error code and then jump to early_idt_handler_common. - * It doesn't actually load the idt - that needs to be done on - * each CPU. Interrupts are enabled elsewhere, when we can be - * relatively sure everything is ok. - */ - - movl $idt_table,%edi - movl $early_idt_handler_array,%eax - movl $NUM_EXCEPTION_VECTORS,%ecx -1: - movl %eax,(%edi) - movl %eax,4(%edi) - /* interrupt gate, dpl=0, present */ - movl $(0x8E000000 + __KERNEL_CS),2(%edi) - addl $EARLY_IDT_HANDLER_SIZE,%eax - addl $8,%edi - loop 1b - - movl $256 - NUM_EXCEPTION_VECTORS,%ecx - movl $ignore_int,%edx - movl $(__KERNEL_CS << 16),%eax - movw %dx,%ax /* selector = 0x0010 = cs */ - movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ -2: - movl %eax,(%edi) - movl %edx,4(%edi) - addl $8,%edi - loop 2b - #ifdef CONFIG_CC_STACKPROTECTOR /* * Configure the stack canary. The linker can't handle this by @@ -457,12 +423,9 @@ early_idt_handler_common: /* The vector number is in pt_regs->gs */ cld - pushl %fs /* pt_regs->fs */ - movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ - pushl %es /* pt_regs->es */ - movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ - pushl %ds /* pt_regs->ds */ - movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ + pushl %fs /* pt_regs->fs (__fsh varies by model) */ + pushl %es /* pt_regs->es (__esh varies by model) */ + pushl %ds /* pt_regs->ds (__dsh varies by model) */ pushl %eax /* pt_regs->ax */ pushl %ebp /* pt_regs->bp */ pushl %edi /* pt_regs->di */ @@ -479,9 +442,8 @@ early_idt_handler_common: /* Load the vector number into EDX */ movl PT_GS(%esp), %edx - /* Load GS into pt_regs->gs and clear high bits */ + /* Load GS into pt_regs->gs (and maybe clobber __gsh) */ movw %gs, PT_GS(%esp) - movw $0, PT_GS+2(%esp) movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */ call early_fixup_exception @@ -493,18 +455,17 @@ early_idt_handler_common: popl %edi /* pt_regs->di */ popl %ebp /* pt_regs->bp */ popl %eax /* pt_regs->ax */ - popl %ds /* pt_regs->ds */ - popl %es /* pt_regs->es */ - popl %fs /* pt_regs->fs */ - popl %gs /* pt_regs->gs */ + popl %ds /* pt_regs->ds (always ignores __dsh) */ + popl %es /* pt_regs->es (always ignores __esh) */ + popl %fs /* pt_regs->fs (always ignores __fsh) */ + popl %gs /* pt_regs->gs (always ignores __gsh) */ decl %ss:early_recursion_flag addl $4, %esp /* pop pt_regs->orig_ax */ iret ENDPROC(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ - ALIGN -ignore_int: +ENTRY(early_ignore_irq) cld #ifdef CONFIG_PRINTK pushl %eax @@ -539,7 +500,8 @@ ignore_int: hlt_loop: hlt jmp hlt_loop -ENDPROC(ignore_int) +ENDPROC(early_ignore_irq) + __INITDATA .align 4 GLOBAL(early_recursion_flag) @@ -628,7 +590,6 @@ int_msg: .data .globl boot_gdt_descr -.globl idt_descr ALIGN # early boot GDT descriptor (must use 1:1 address mapping) @@ -637,11 +598,6 @@ boot_gdt_descr: .word __BOOT_DS+7 .long boot_gdt - __PAGE_OFFSET - .word 0 # 32-bit align idt_desc.address -idt_descr: - .word IDT_ENTRIES*8-1 # idt contains 256 entries - .long idt_table - # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address ENTRY(early_gdt_descr) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 6225550883df..513cbb012ecc 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -73,12 +73,19 @@ startup_64: /* Sanitize CPU configuration */ call verify_cpu + /* + * Perform pagetable fixups. Additionally, if SME is active, encrypt + * the kernel and retrieve the modifier (SME encryption mask if SME + * is active) to be added to the initial pgdir entry that will be + * programmed into CR3. + */ leaq _text(%rip), %rdi pushq %rsi call __startup_64 popq %rsi - movq $(early_top_pgt - __START_KERNEL_map), %rax + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) /* @@ -98,7 +105,16 @@ ENTRY(secondary_startup_64) /* Sanitize CPU configuration */ call verify_cpu - movq $(init_top_pgt - __START_KERNEL_map), %rax + /* + * Retrieve the modifier (SME encryption mask if SME is active) to be + * added to the initial pgdir entry that will be programmed into CR3. + */ + pushq %rsi + call __startup_secondary_64 + popq %rsi + + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(init_top_pgt - __START_KERNEL_map), %rax 1: /* Enable PAE mode, PGE and LA57 */ @@ -335,9 +351,9 @@ GLOBAL(name) NEXT_PAGE(early_top_pgt) .fill 511,8,0 #ifdef CONFIG_X86_5LEVEL - .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #else - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif NEXT_PAGE(early_dynamic_pgts) @@ -350,15 +366,15 @@ NEXT_PAGE(init_top_pgt) .fill 512,8,0 #else NEXT_PAGE(init_top_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC NEXT_PAGE(level3_ident_pgt) - .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .fill 511, 8, 0 NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. @@ -370,14 +386,14 @@ NEXT_PAGE(level2_ident_pgt) #ifdef CONFIG_X86_5LEVEL NEXT_PAGE(level4_kernel_pgt) .fill 511,8,0 - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ - .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE - .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC NEXT_PAGE(level2_kernel_pgt) /* @@ -395,7 +411,7 @@ NEXT_PAGE(level2_kernel_pgt) NEXT_PAGE(level2_fixmap_pgt) .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ .fill 5,8,0 diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c new file mode 100644 index 000000000000..6107ee1cb8d5 --- /dev/null +++ b/arch/x86/kernel/idt.c @@ -0,0 +1,371 @@ +/* + * Interrupt descriptor table related code + * + * This file is licensed under the GPL V2 + */ +#include <linux/interrupt.h> + +#include <asm/traps.h> +#include <asm/proto.h> +#include <asm/desc.h> + +struct idt_data { + unsigned int vector; + unsigned int segment; + struct idt_bits bits; + const void *addr; +}; + +#define DPL0 0x0 +#define DPL3 0x3 + +#define DEFAULT_STACK 0 + +#define G(_vector, _addr, _ist, _type, _dpl, _segment) \ + { \ + .vector = _vector, \ + .bits.ist = _ist, \ + .bits.type = _type, \ + .bits.dpl = _dpl, \ + .bits.p = 1, \ + .addr = _addr, \ + .segment = _segment, \ + } + +/* Interrupt gate */ +#define INTG(_vector, _addr) \ + G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL0, __KERNEL_CS) + +/* System interrupt gate */ +#define SYSG(_vector, _addr) \ + G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) + +/* Interrupt gate with interrupt stack */ +#define ISTG(_vector, _addr, _ist) \ + G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS) + +/* System interrupt gate with interrupt stack */ +#define SISTG(_vector, _addr, _ist) \ + G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS) + +/* Task gate */ +#define TSKG(_vector, _gdt) \ + G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3) + +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initdata struct idt_data early_idts[] = { + INTG(X86_TRAP_DB, debug), + SYSG(X86_TRAP_BP, int3), +#ifdef CONFIG_X86_32 + INTG(X86_TRAP_PF, page_fault), +#endif +}; + +/* + * The default IDT entries which are set up in trap_init() before + * cpu_init() is invoked. Interrupt stacks cannot be used at that point and + * the traps which use them are reinitialized with IST after cpu_init() has + * set up TSS. + */ +static const __initdata struct idt_data def_idts[] = { + INTG(X86_TRAP_DE, divide_error), + INTG(X86_TRAP_NMI, nmi), + INTG(X86_TRAP_BR, bounds), + INTG(X86_TRAP_UD, invalid_op), + INTG(X86_TRAP_NM, device_not_available), + INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun), + INTG(X86_TRAP_TS, invalid_TSS), + INTG(X86_TRAP_NP, segment_not_present), + INTG(X86_TRAP_SS, stack_segment), + INTG(X86_TRAP_GP, general_protection), + INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug), + INTG(X86_TRAP_MF, coprocessor_error), + INTG(X86_TRAP_AC, alignment_check), + INTG(X86_TRAP_XF, simd_coprocessor_error), + +#ifdef CONFIG_X86_32 + TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS), +#else + INTG(X86_TRAP_DF, double_fault), +#endif + INTG(X86_TRAP_DB, debug), + INTG(X86_TRAP_NMI, nmi), + INTG(X86_TRAP_BP, int3), + +#ifdef CONFIG_X86_MCE + INTG(X86_TRAP_MC, &machine_check), +#endif + + SYSG(X86_TRAP_OF, overflow), +#if defined(CONFIG_IA32_EMULATION) + SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat), +#elif defined(CONFIG_X86_32) + SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32), +#endif +}; + +/* + * The APIC and SMP idt entries + */ +static const __initdata struct idt_data apic_idts[] = { +#ifdef CONFIG_SMP + INTG(RESCHEDULE_VECTOR, reschedule_interrupt), + INTG(CALL_FUNCTION_VECTOR, call_function_interrupt), + INTG(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt), + INTG(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt), + INTG(REBOOT_VECTOR, reboot_interrupt), +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR + INTG(THERMAL_APIC_VECTOR, thermal_interrupt), +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + INTG(THRESHOLD_APIC_VECTOR, threshold_interrupt), +#endif + +#ifdef CONFIG_X86_MCE_AMD + INTG(DEFERRED_ERROR_VECTOR, deferred_error_interrupt), +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + INTG(LOCAL_TIMER_VECTOR, apic_timer_interrupt), + INTG(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi), +# ifdef CONFIG_HAVE_KVM + INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), + INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), + INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), +# endif +# ifdef CONFIG_IRQ_WORK + INTG(IRQ_WORK_VECTOR, irq_work_interrupt), +# endif + INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt), + INTG(ERROR_APIC_VECTOR, error_interrupt), +#endif +}; + +#ifdef CONFIG_X86_64 +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initdata struct idt_data early_pf_idts[] = { + INTG(X86_TRAP_PF, page_fault), +}; + +/* + * Override for the debug_idt. Same as the default, but with interrupt + * stack set to DEFAULT_STACK (0). Required for NMI trap handling. + */ +static const __initdata struct idt_data dbg_idts[] = { + INTG(X86_TRAP_DB, debug), + INTG(X86_TRAP_BP, int3), +}; +#endif + +/* Must be page-aligned because the real IDT is used in a fixmap. */ +gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; + +struct desc_ptr idt_descr __ro_after_init = { + .size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1, + .address = (unsigned long) idt_table, +}; + +#ifdef CONFIG_X86_64 +/* No need to be aligned, but done to keep all IDTs defined the same way. */ +gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; + +/* + * The exceptions which use Interrupt stacks. They are setup after + * cpu_init() when the TSS has been initialized. + */ +static const __initdata struct idt_data ist_idts[] = { + ISTG(X86_TRAP_DB, debug, DEBUG_STACK), + ISTG(X86_TRAP_NMI, nmi, NMI_STACK), + SISTG(X86_TRAP_BP, int3, DEBUG_STACK), + ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK), +#ifdef CONFIG_X86_MCE + ISTG(X86_TRAP_MC, &machine_check, MCE_STACK), +#endif +}; + +/* + * Override for the debug_idt. Same as the default, but with interrupt + * stack set to DEFAULT_STACK (0). Required for NMI trap handling. + */ +const struct desc_ptr debug_idt_descr = { + .size = IDT_ENTRIES * 16 - 1, + .address = (unsigned long) debug_idt_table, +}; +#endif + +static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) +{ + unsigned long addr = (unsigned long) d->addr; + + gate->offset_low = (u16) addr; + gate->segment = (u16) d->segment; + gate->bits = d->bits; + gate->offset_middle = (u16) (addr >> 16); +#ifdef CONFIG_X86_64 + gate->offset_high = (u32) (addr >> 32); + gate->reserved = 0; +#endif +} + +static void +idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys) +{ + gate_desc desc; + + for (; size > 0; t++, size--) { + idt_init_desc(&desc, t); + write_idt_entry(idt, t->vector, &desc); + if (sys) + set_bit(t->vector, used_vectors); + } +} + +static void set_intr_gate(unsigned int n, const void *addr) +{ + struct idt_data data; + + BUG_ON(n > 0xFF); + + memset(&data, 0, sizeof(data)); + data.vector = n; + data.addr = addr; + data.segment = __KERNEL_CS; + data.bits.type = GATE_INTERRUPT; + data.bits.p = 1; + + idt_setup_from_table(idt_table, &data, 1, false); +} + +/** + * idt_setup_early_traps - Initialize the idt table with early traps + * + * On X8664 these traps do not use interrupt stacks as they can't work + * before cpu_init() is invoked and sets up TSS. The IST variants are + * installed after that. + */ +void __init idt_setup_early_traps(void) +{ + idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts), + true); + load_idt(&idt_descr); +} + +/** + * idt_setup_traps - Initialize the idt table with default traps + */ +void __init idt_setup_traps(void) +{ + idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true); +} + +#ifdef CONFIG_X86_64 +/** + * idt_setup_early_pf - Initialize the idt table with early pagefault handler + * + * On X8664 this does not use interrupt stacks as they can't work before + * cpu_init() is invoked and sets up TSS. The IST variant is installed + * after that. + * + * FIXME: Why is 32bit and 64bit installing the PF handler at different + * places in the early setup code? + */ +void __init idt_setup_early_pf(void) +{ + idt_setup_from_table(idt_table, early_pf_idts, + ARRAY_SIZE(early_pf_idts), true); +} + +/** + * idt_setup_ist_traps - Initialize the idt table with traps using IST + */ +void __init idt_setup_ist_traps(void) +{ + idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true); +} + +/** + * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps + */ +void __init idt_setup_debugidt_traps(void) +{ + memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16); + + idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts), false); +} +#endif + +/** + * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates + */ +void __init idt_setup_apic_and_irq_gates(void) +{ + int i = FIRST_EXTERNAL_VECTOR; + void *entry; + + idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true); + + for_each_clear_bit_from(i, used_vectors, FIRST_SYSTEM_VECTOR) { + entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); + set_intr_gate(i, entry); + } + + for_each_clear_bit_from(i, used_vectors, NR_VECTORS) { +#ifdef CONFIG_X86_LOCAL_APIC + set_bit(i, used_vectors); + set_intr_gate(i, spurious_interrupt); +#else + entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); + set_intr_gate(i, entry); +#endif + } +} + +/** + * idt_setup_early_handler - Initializes the idt table with early handlers + */ +void __init idt_setup_early_handler(void) +{ + int i; + + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) + set_intr_gate(i, early_idt_handler_array[i]); +#ifdef CONFIG_X86_32 + for ( ; i < NR_VECTORS; i++) + set_intr_gate(i, early_ignore_irq); +#endif + load_idt(&idt_descr); +} + +/** + * idt_invalidate - Invalidate interrupt descriptor table + * @addr: The virtual address of the 'invalid' IDT + */ +void idt_invalidate(void *addr) +{ + struct desc_ptr idt = { .address = (unsigned long) addr, .size = 0 }; + + load_idt(&idt); +} + +void __init update_intr_gate(unsigned int n, const void *addr) +{ + if (WARN_ON_ONCE(!test_bit(n, used_vectors))) + return; + set_intr_gate(n, addr); +} + +void alloc_intr_gate(unsigned int n, const void *addr) +{ + BUG_ON(n < FIRST_SYSTEM_VECTOR); + if (!test_and_set_bit(n, used_vectors)) + set_intr_gate(n, addr); +} diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 4ed0aba8dbc8..52089c043160 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -29,9 +29,6 @@ EXPORT_PER_CPU_SYMBOL(irq_regs); atomic_t irq_err_count; -/* Function pointer for generic interrupt vector handling */ -void (*x86_platform_ipi_callback)(void) = NULL; - /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. @@ -87,13 +84,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); seq_puts(p, " APIC ICR read retries\n"); -#endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); seq_puts(p, " Platform interrupts\n"); } +#endif #ifdef CONFIG_SMP seq_printf(p, "%*s: ", prec, "RES"); for_each_online_cpu(j) @@ -183,9 +180,9 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_perf_irqs; sum += irq_stats(cpu)->apic_irq_work_irqs; sum += irq_stats(cpu)->icr_read_retry_count; -#endif if (x86_platform_ipi_callback) sum += irq_stats(cpu)->x86_platform_ipis; +#endif #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; sum += irq_stats(cpu)->irq_call_count; @@ -259,26 +256,26 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) return 1; } +#ifdef CONFIG_X86_LOCAL_APIC +/* Function pointer for generic interrupt vector handling */ +void (*x86_platform_ipi_callback)(void) = NULL; /* * Handler for X86_PLATFORM_IPI_VECTOR. */ -void __smp_x86_platform_ipi(void) -{ - inc_irq_stat(x86_platform_ipis); - - if (x86_platform_ipi_callback) - x86_platform_ipi_callback(); -} - __visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); entering_ack_irq(); - __smp_x86_platform_ipi(); + trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); + inc_irq_stat(x86_platform_ipis); + if (x86_platform_ipi_callback) + x86_platform_ipi_callback(); + trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); exiting_irq(); set_irq_regs(old_regs); } +#endif #ifdef CONFIG_HAVE_KVM static void dummy_handler(void) {} @@ -334,19 +331,6 @@ __visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs) } #endif -__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - entering_ack_irq(); - trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); - __smp_x86_platform_ipi(); - trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); - exiting_irq(); - set_irq_regs(old_regs); -} - -EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); #ifdef CONFIG_HOTPLUG_CPU @@ -431,7 +415,7 @@ int check_irq_vectors_for_cpu_disable(void) * this w/o holding vector_lock. */ for (vector = FIRST_EXTERNAL_VECTOR; - vector < first_system_vector; vector++) { + vector < FIRST_SYSTEM_VECTOR; vector++) { if (!test_bit(vector, used_vectors) && IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) { if (++count == this_count) diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 275487872be2..70dee056f92b 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -11,35 +11,23 @@ #include <asm/trace/irq_vectors.h> #include <linux/interrupt.h> -static inline void __smp_irq_work_interrupt(void) -{ - inc_irq_stat(apic_irq_work_irqs); - irq_work_run(); -} - +#ifdef CONFIG_X86_LOCAL_APIC __visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); - __smp_irq_work_interrupt(); - exiting_irq(); -} - -__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs) -{ - ipi_entering_ack_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); - __smp_irq_work_interrupt(); + inc_irq_stat(apic_irq_work_irqs); + irq_work_run(); trace_irq_work_exit(IRQ_WORK_VECTOR); exiting_irq(); } void arch_irq_work_raise(void) { -#ifdef CONFIG_X86_LOCAL_APIC if (!arch_irq_work_has_interrupt()) return; apic->send_IPI_self(IRQ_WORK_VECTOR); apic_wait_icr_idle(); -#endif } +#endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index c7fd18526c3e..1add9e08e83e 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -55,18 +55,6 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [0 ... NR_VECTORS - 1] = VECTOR_UNUSED, }; -int vector_used_by_percpu_irq(unsigned int vector) -{ - int cpu; - - for_each_online_cpu(cpu) { - if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) - return 1; - } - - return 0; -} - void __init init_ISA_irqs(void) { struct irq_chip *chip = legacy_pic->chip; @@ -99,100 +87,12 @@ void __init init_IRQ(void) x86_init.irqs.intr_init(); } -static void __init smp_intr_init(void) -{ -#ifdef CONFIG_SMP - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for generic single function call */ - alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); - set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); - - /* IPI used for rebooting/stopping */ - alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt); -#endif /* CONFIG_SMP */ -} - -static void __init apic_intr_init(void) -{ - smp_intr_init(); - -#ifdef CONFIG_X86_THERMAL_VECTOR - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif -#ifdef CONFIG_X86_MCE_THRESHOLD - alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); -#endif - -#ifdef CONFIG_X86_MCE_AMD - alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt); -#endif - -#ifdef CONFIG_X86_LOCAL_APIC - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI for X86 platform specific use */ - alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); -#ifdef CONFIG_HAVE_KVM - /* IPI for KVM to deliver posted interrupt */ - alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi); - /* IPI for KVM to deliver interrupt to wake up tasks */ - alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi); - /* IPI for KVM to deliver nested posted interrupt */ - alloc_intr_gate(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi); -#endif - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - - /* IRQ work interrupts: */ -# ifdef CONFIG_IRQ_WORK - alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt); -# endif - -#endif -} - void __init native_init_IRQ(void) { - int i; - /* Execute any quirks before the call gates are initialised: */ x86_init.irqs.pre_vector_init(); - apic_intr_init(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - i = FIRST_EXTERNAL_VECTOR; -#ifndef CONFIG_X86_LOCAL_APIC -#define first_system_vector NR_VECTORS -#endif - for_each_clear_bit_from(i, used_vectors, first_system_vector) { - /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ - set_intr_gate(i, irq_entries_start + - 8 * (i - FIRST_EXTERNAL_VECTOR)); - } -#ifdef CONFIG_X86_LOCAL_APIC - for_each_clear_bit_from(i, used_vectors, NR_VECTORS) - set_intr_gate(i, spurious_interrupt); -#endif + idt_setup_apic_and_irq_gates(); if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 38b64587b31b..fd6f8fbbe6f2 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -33,7 +33,6 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf, struct setup_data_node *node = file->private_data; unsigned long remain; loff_t pos = *ppos; - struct page *pg; void *p; u64 pa; @@ -47,18 +46,13 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf, count = node->len - pos; pa = node->paddr + sizeof(struct setup_data) + pos; - pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT); - if (PageHighMem(pg)) { - p = ioremap_cache(pa, count); - if (!p) - return -ENXIO; - } else - p = __va(pa); + p = memremap(pa, count, MEMREMAP_WB); + if (!p) + return -ENOMEM; remain = copy_to_user(user_buf, p, count); - if (PageHighMem(pg)) - iounmap(p); + memunmap(p); if (remain) return -EFAULT; @@ -109,7 +103,6 @@ static int __init create_setup_data_nodes(struct dentry *parent) struct setup_data *data; int error; struct dentry *d; - struct page *pg; u64 pa_data; int no = 0; @@ -126,16 +119,12 @@ static int __init create_setup_data_nodes(struct dentry *parent) goto err_dir; } - pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); - if (PageHighMem(pg)) { - data = ioremap_cache(pa_data, sizeof(*data)); - if (!data) { - kfree(node); - error = -ENXIO; - goto err_dir; - } - } else - data = __va(pa_data); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); + if (!data) { + kfree(node); + error = -ENOMEM; + goto err_dir; + } node->paddr = pa_data; node->type = data->type; @@ -143,8 +132,7 @@ static int __init create_setup_data_nodes(struct dentry *parent) error = create_setup_data_node(d, no, node); pa_data = data->next; - if (PageHighMem(pg)) - iounmap(data); + memunmap(data); if (error) goto err_dir; no++; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 69ea0bc1cfa3..4f98aad38237 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -39,6 +39,7 @@ #include <asm/insn.h> #include <asm/debugreg.h> #include <asm/set_memory.h> +#include <asm/sections.h> #include "common.h" @@ -251,10 +252,12 @@ static int can_optimize(unsigned long paddr) /* * Do not optimize in the entry code due to the unstable - * stack handling. + * stack handling and registers setup. */ - if ((paddr >= (unsigned long)__entry_text_start) && - (paddr < (unsigned long)__entry_text_end)) + if (((paddr >= (unsigned long)__entry_text_start) && + (paddr < (unsigned long)__entry_text_end)) || + ((paddr >= (unsigned long)__irqentry_text_start) && + (paddr < (unsigned long)__irqentry_text_end))) return 0; /* Check there is enough space for a relative jump. */ diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 06e1ff5562c0..4b0592ca9e47 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -16,8 +16,8 @@ #include <linux/stat.h> #include <linux/slab.h> #include <linux/mm.h> +#include <linux/io.h> -#include <asm/io.h> #include <asm/setup.h> static ssize_t version_show(struct kobject *kobj, @@ -79,12 +79,12 @@ static int get_setup_data_paddr(int nr, u64 *paddr) *paddr = pa_data; return 0; } - data = ioremap_cache(pa_data, sizeof(*data)); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; pa_data = data->next; - iounmap(data); + memunmap(data); i++; } return -EINVAL; @@ -97,17 +97,17 @@ static int __init get_setup_data_size(int nr, size_t *size) u64 pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = ioremap_cache(pa_data, sizeof(*data)); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; if (nr == i) { *size = data->len; - iounmap(data); + memunmap(data); return 0; } pa_data = data->next; - iounmap(data); + memunmap(data); i++; } return -EINVAL; @@ -127,12 +127,12 @@ static ssize_t type_show(struct kobject *kobj, ret = get_setup_data_paddr(nr, &paddr); if (ret) return ret; - data = ioremap_cache(paddr, sizeof(*data)); + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; ret = sprintf(buf, "0x%x\n", data->type); - iounmap(data); + memunmap(data); return ret; } @@ -154,7 +154,7 @@ static ssize_t setup_data_data_read(struct file *fp, ret = get_setup_data_paddr(nr, &paddr); if (ret) return ret; - data = ioremap_cache(paddr, sizeof(*data)); + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; @@ -170,15 +170,15 @@ static ssize_t setup_data_data_read(struct file *fp, goto out; ret = count; - p = ioremap_cache(paddr + sizeof(*data), data->len); + p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB); if (!p) { ret = -ENOMEM; goto out; } memcpy(buf, p + off, count); - iounmap(p); + memunmap(p); out: - iounmap(data); + memunmap(data); return ret; } @@ -250,13 +250,13 @@ static int __init get_setup_data_total_num(u64 pa_data, int *nr) *nr = 0; while (pa_data) { *nr += 1; - data = ioremap_cache(pa_data, sizeof(*data)); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) { ret = -ENOMEM; goto out; } pa_data = data->next; - iounmap(data); + memunmap(data); } out: diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d04e30e3c0ff..874827b0d7ca 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -263,7 +263,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) switch (kvm_read_and_reset_pf_reason()) { default: - trace_do_page_fault(regs, error_code); + do_page_fault(regs, error_code); break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ @@ -455,7 +455,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu) static void __init kvm_apf_trap_init(void) { - set_intr_gate(14, async_page_fault); + update_intr_gate(X86_TRAP_PF, async_page_fault); } void __init kvm_guest_init(void) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index a870910c8565..f0e64db18ac8 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -21,6 +21,25 @@ #include <asm/mmu_context.h> #include <asm/syscalls.h> +static void refresh_ldt_segments(void) +{ +#ifdef CONFIG_X86_64 + unsigned short sel; + + /* + * Make sure that the cached DS and ES descriptors match the updated + * LDT. + */ + savesegment(ds, sel); + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) + loadsegment(ds, sel); + + savesegment(es, sel); + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) + loadsegment(es, sel); +#endif +} + /* context.lock is held for us, so we don't need any locking. */ static void flush_ldt(void *__mm) { @@ -32,6 +51,8 @@ static void flush_ldt(void *__mm) pc = &mm->context; set_ldt(pc->ldt->entries, pc->ldt->nr_entries); + + refresh_ldt_segments(); } /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 8c53c5d7a1bc..00bc751c861c 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -26,18 +26,6 @@ #include <asm/set_memory.h> #include <asm/debugreg.h> -static void set_idt(void *newidt, __u16 limit) -{ - struct desc_ptr curidt; - - /* ia32 supports unaliged loads & stores */ - curidt.size = limit; - curidt.address = (unsigned long)newidt; - - load_idt(&curidt); -} - - static void set_gdt(void *newgdt, __u16 limit) { struct desc_ptr curgdt; @@ -245,7 +233,7 @@ void machine_kexec(struct kimage *image) * If you want to load them you must set up your own idt & gdt. */ set_gdt(phys_to_virt(0), 0); - set_idt(phys_to_virt(0), 0); + idt_invalidate(phys_to_virt(0)); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index cb0a30473c23..1f790cf9d38f 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -87,7 +87,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); } pte = pte_offset_kernel(pmd, vaddr); - set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC)); return 0; err: free_transition_pgtable(image); @@ -115,6 +115,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) .alloc_pgt_page = alloc_pgt_page, .context = image, .page_flag = __PAGE_KERNEL_LARGE_EXEC, + .kernpg_flag = _KERNPG_TABLE_NOENC, }; unsigned long mstart, mend; pgd_t *level4p; @@ -334,7 +335,8 @@ void machine_kexec(struct kimage *image) image->start = relocate_kernel((unsigned long)image->head, (unsigned long)page_list, image->start, - image->preserve_context); + image->preserve_context, + sme_active()); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) @@ -602,3 +604,22 @@ void arch_kexec_unprotect_crashkres(void) { kexec_mark_crashkres(false); } + +int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) +{ + /* + * If SME is active we need to be sure that kexec pages are + * not encrypted because when we boot to the new kernel the + * pages won't be accessed encrypted (initially). + */ + return set_memory_decrypted((unsigned long)vaddr, pages); +} + +void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) +{ + /* + * If SME is active we need to reset the pages back to being + * an encrypted mapping before freeing them. + */ + set_memory_encrypted((unsigned long)vaddr, pages); +} diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index f67bd3205df7..62e7d70aadd5 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -35,6 +35,7 @@ #include <asm/page.h> #include <asm/pgtable.h> #include <asm/setup.h> +#include <asm/unwind.h> #if 0 #define DEBUGP(fmt, ...) \ @@ -213,7 +214,7 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL; + *para = NULL, *orc = NULL, *orc_ip = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -225,6 +226,10 @@ int module_finalize(const Elf_Ehdr *hdr, locks = s; if (!strcmp(".parainstructions", secstrings + s->sh_name)) para = s; + if (!strcmp(".orc_unwind", secstrings + s->sh_name)) + orc = s; + if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) + orc_ip = s; } if (alt) { @@ -248,6 +253,10 @@ int module_finalize(const Elf_Ehdr *hdr, /* make jump label nops */ jump_label_apply_nops(me); + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + return 0; } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 0d904d759ff1..5cbb3177ed17 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -429,16 +429,16 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) } } -static struct mpf_intel *mpf_found; +static unsigned long mpf_base; static unsigned long __init get_mpc_size(unsigned long physptr) { struct mpc_table *mpc; unsigned long size; - mpc = early_ioremap(physptr, PAGE_SIZE); + mpc = early_memremap(physptr, PAGE_SIZE); size = mpc->length; - early_iounmap(mpc, PAGE_SIZE); + early_memunmap(mpc, PAGE_SIZE); apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); return size; @@ -450,7 +450,8 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) unsigned long size; size = get_mpc_size(mpf->physptr); - mpc = early_ioremap(mpf->physptr, size); + mpc = early_memremap(mpf->physptr, size); + /* * Read the physical hardware table. Anything here will * override the defaults. @@ -461,10 +462,10 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) #endif pr_err("BIOS bug, MP table errors detected!...\n"); pr_cont("... disabling SMP support. (tell your hw vendor)\n"); - early_iounmap(mpc, size); + early_memunmap(mpc, size); return -1; } - early_iounmap(mpc, size); + early_memunmap(mpc, size); if (early) return -1; @@ -497,12 +498,12 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) */ void __init default_get_smp_config(unsigned int early) { - struct mpf_intel *mpf = mpf_found; + struct mpf_intel *mpf; if (!smp_found_config) return; - if (!mpf) + if (!mpf_base) return; if (acpi_lapic && early) @@ -515,6 +516,12 @@ void __init default_get_smp_config(unsigned int early) if (acpi_lapic && acpi_ioapic) return; + mpf = early_memremap(mpf_base, sizeof(*mpf)); + if (!mpf) { + pr_err("MPTABLE: error mapping MP table\n"); + return; + } + pr_info("Intel MultiProcessor Specification v1.%d\n", mpf->specification); #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) @@ -529,7 +536,7 @@ void __init default_get_smp_config(unsigned int early) /* * Now see if we need to read further. */ - if (mpf->feature1 != 0) { + if (mpf->feature1) { if (early) { /* * local APIC has default address @@ -542,8 +549,10 @@ void __init default_get_smp_config(unsigned int early) construct_default_ISA_mptable(mpf->feature1); } else if (mpf->physptr) { - if (check_physptr(mpf, early)) + if (check_physptr(mpf, early)) { + early_memunmap(mpf, sizeof(*mpf)); return; + } } else BUG(); @@ -552,6 +561,8 @@ void __init default_get_smp_config(unsigned int early) /* * Only use the first configuration found. */ + + early_memunmap(mpf, sizeof(*mpf)); } static void __init smp_reserve_memory(struct mpf_intel *mpf) @@ -561,15 +572,16 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf) static int __init smp_scan_config(unsigned long base, unsigned long length) { - unsigned int *bp = phys_to_virt(base); + unsigned int *bp; struct mpf_intel *mpf; - unsigned long mem; + int ret = 0; apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", base, base + length - 1); BUILD_BUG_ON(sizeof(*mpf) != 16); while (length > 0) { + bp = early_memremap(base, length); mpf = (struct mpf_intel *)bp; if ((*bp == SMP_MAGIC_IDENT) && (mpf->length == 1) && @@ -579,24 +591,26 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) #ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 1; #endif - mpf_found = mpf; + mpf_base = base; - pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n", - (unsigned long long) virt_to_phys(mpf), - (unsigned long long) virt_to_phys(mpf) + - sizeof(*mpf) - 1, mpf); + pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n", + base, base + sizeof(*mpf) - 1, mpf); - mem = virt_to_phys(mpf); - memblock_reserve(mem, sizeof(*mpf)); + memblock_reserve(base, sizeof(*mpf)); if (mpf->physptr) smp_reserve_memory(mpf); - return 1; + ret = 1; } - bp += 4; + early_memunmap(bp, length); + + if (ret) + break; + + base += 16; length -= 16; } - return 0; + return ret; } void __init default_find_smp_config(void) @@ -838,29 +852,40 @@ static int __init update_mp_table(void) char oem[10]; struct mpf_intel *mpf; struct mpc_table *mpc, *mpc_new; + unsigned long size; if (!enable_update_mptable) return 0; - mpf = mpf_found; - if (!mpf) + if (!mpf_base) return 0; + mpf = early_memremap(mpf_base, sizeof(*mpf)); + if (!mpf) { + pr_err("MPTABLE: mpf early_memremap() failed\n"); + return 0; + } + /* * Now see if we need to go further. */ - if (mpf->feature1 != 0) - return 0; + if (mpf->feature1) + goto do_unmap_mpf; if (!mpf->physptr) - return 0; + goto do_unmap_mpf; - mpc = phys_to_virt(mpf->physptr); + size = get_mpc_size(mpf->physptr); + mpc = early_memremap(mpf->physptr, size); + if (!mpc) { + pr_err("MPTABLE: mpc early_memremap() failed\n"); + goto do_unmap_mpf; + } if (!smp_check_mpc(mpc, oem, str)) - return 0; + goto do_unmap_mpc; - pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf)); + pr_info("mpf: %llx\n", (u64)mpf_base); pr_info("physptr: %x\n", mpf->physptr); if (mpc_new_phys && mpc->length > mpc_new_length) { @@ -878,21 +903,32 @@ static int __init update_mp_table(void) new = mpf_checksum((unsigned char *)mpc, mpc->length); if (old == new) { pr_info("mpc is readonly, please try alloc_mptable instead\n"); - return 0; + goto do_unmap_mpc; } pr_info("use in-position replacing\n"); } else { + mpc_new = early_memremap(mpc_new_phys, mpc_new_length); + if (!mpc_new) { + pr_err("MPTABLE: new mpc early_memremap() failed\n"); + goto do_unmap_mpc; + } mpf->physptr = mpc_new_phys; - mpc_new = phys_to_virt(mpc_new_phys); memcpy(mpc_new, mpc, mpc->length); + early_memunmap(mpc, size); mpc = mpc_new; + size = mpc_new_length; /* check if we can modify that */ if (mpc_new_phys - mpf->physptr) { struct mpf_intel *mpf_new; /* steal 16 bytes from [0, 1k) */ + mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new)); + if (!mpf_new) { + pr_err("MPTABLE: new mpf early_memremap() failed\n"); + goto do_unmap_mpc; + } pr_info("mpf new: %x\n", 0x400 - 16); - mpf_new = phys_to_virt(0x400 - 16); memcpy(mpf_new, mpf, 16); + early_memunmap(mpf, sizeof(*mpf)); mpf = mpf_new; mpf->physptr = mpc_new_phys; } @@ -909,6 +945,12 @@ static int __init update_mp_table(void) */ replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length); +do_unmap_mpc: + early_memunmap(mpc, size); + +do_unmap_mpf: + early_memunmap(mpf, sizeof(*mpf)); + return 0; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 446c8aa09b9b..35aafc95e4b8 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -39,26 +39,26 @@ #include <trace/events/nmi.h> struct nmi_desc { - spinlock_t lock; + raw_spinlock_t lock; struct list_head head; }; static struct nmi_desc nmi_desc[NMI_MAX] = { { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), .head = LIST_HEAD_INIT(nmi_desc[0].head), }, { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), .head = LIST_HEAD_INIT(nmi_desc[1].head), }, { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), .head = LIST_HEAD_INIT(nmi_desc[2].head), }, { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), .head = LIST_HEAD_INIT(nmi_desc[3].head), }, @@ -163,7 +163,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) init_irq_work(&action->irq_work, nmi_max_handler); - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); /* * Indicate if there are multiple registrations on the @@ -181,7 +181,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) else list_add_tail_rcu(&action->list, &desc->head); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } EXPORT_SYMBOL(__register_nmi_handler); @@ -192,7 +192,7 @@ void unregister_nmi_handler(unsigned int type, const char *name) struct nmiaction *n; unsigned long flags; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); list_for_each_entry_rcu(n, &desc->head, list) { /* @@ -207,7 +207,7 @@ void unregister_nmi_handler(unsigned int type, const char *name) } } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); synchronize_rcu(); } EXPORT_SYMBOL_GPL(unregister_nmi_handler); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index bc0a849589bb..a14df9eecfed 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -319,9 +319,6 @@ __visible struct pv_irq_ops pv_irq_ops = { .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), .safe_halt = native_safe_halt, .halt = native_halt, -#ifdef CONFIG_X86_64 - .adjust_exception_frame = paravirt_nop, -#endif }; __visible struct pv_cpu_ops pv_cpu_ops = { diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 5e16d3f29594..0accc2404b92 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -93,9 +93,12 @@ again: if (gfpflags_allow_blocking(flag)) { page = dma_alloc_from_contiguous(dev, count, get_order(size), flag); - if (page && page_to_phys(page) + size > dma_mask) { - dma_release_from_contiguous(dev, page, count); - page = NULL; + if (page) { + addr = phys_to_dma(dev, page_to_phys(page)); + if (addr + size > dma_mask) { + dma_release_from_contiguous(dev, page, count); + page = NULL; + } } } /* fallback */ @@ -104,7 +107,7 @@ again: if (!page) return NULL; - addr = page_to_phys(page); + addr = phys_to_dma(dev, page_to_phys(page)); if (addr + size > dma_mask) { __free_pages(page, get_order(size)); diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index a6d404087fe3..4fc3cb60ea11 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -32,7 +32,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, unsigned long attrs) { - dma_addr_t bus = page_to_phys(page) + offset; + dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset; WARN_ON(size == 0); if (!check_addr("map_single", dev, bus, size)) return NOMMU_MAPPING_ERROR; diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 1e23577e17cf..677077510e30 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -6,12 +6,14 @@ #include <linux/swiotlb.h> #include <linux/bootmem.h> #include <linux/dma-mapping.h> +#include <linux/mem_encrypt.h> #include <asm/iommu.h> #include <asm/swiotlb.h> #include <asm/dma.h> #include <asm/xen/swiotlb-xen.h> #include <asm/iommu_table.h> + int swiotlb __read_mostly; void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, @@ -79,8 +81,8 @@ IOMMU_INIT_FINISH(pci_swiotlb_detect_override, pci_swiotlb_late_init); /* - * if 4GB or more detected (and iommu=off not set) return 1 - * and set swiotlb to 1. + * If 4GB or more detected (and iommu=off not set) or if SME is active + * then set swiotlb to 1 and return 1. */ int __init pci_swiotlb_detect_4gb(void) { @@ -89,6 +91,15 @@ int __init pci_swiotlb_detect_4gb(void) if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) swiotlb = 1; #endif + + /* + * If SME is active then swiotlb will be set to 1 so that bounce + * buffers are allocated and used for devices that do not support + * the addressing range required for the encryption mask. + */ + if (sme_active()) + swiotlb = 1; + return swiotlb; } IOMMU_INIT(pci_swiotlb_detect_4gb, diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 91271122f0df..502a77d0adb0 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -16,7 +16,6 @@ void __init x86_early_init_platform_quirks(void) x86_platform.legacy.reserve_bios_regions = 1; break; case X86_SUBARCH_XEN: - case X86_SUBARCH_LGUEST: x86_platform.legacy.devices.pnpbios = 0; x86_platform.legacy.rtc = 0; break; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3ca198080ea9..bd6b85fac666 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -355,6 +355,7 @@ bool xen_set_default_idle(void) return ret; } #endif + void stop_this_cpu(void *dummy) { local_irq_disable(); @@ -365,8 +366,20 @@ void stop_this_cpu(void *dummy) disable_local_APIC(); mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); - for (;;) - halt(); + for (;;) { + /* + * Use wbinvd followed by hlt to stop the processor. This + * provides support for kexec on a processor that supports + * SME. With kexec, going from SME inactive to SME active + * requires clearing cache entries so that addresses without + * the encryption bit set don't corrupt the same physical + * address that has the encryption bit set when caches are + * flushed. To achieve this a wbinvd is performed followed by + * a hlt. Even if the processor is not in the kexec/SME + * scenario this only adds a wbinvd to a halting processor. + */ + asm volatile("wbinvd; hlt" : : : "memory"); + } } /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c6d6dc5f8bb2..11966251cd42 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -56,7 +56,7 @@ #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/vm86.h> -#include <asm/intel_rdt.h> +#include <asm/intel_rdt_sched.h> #include <asm/proto.h> void __show_regs(struct pt_regs *regs, int all) @@ -68,7 +68,7 @@ void __show_regs(struct pt_regs *regs, int all) if (user_mode(regs)) { sp = regs->sp; - ss = regs->ss & 0xffff; + ss = regs->ss; gs = get_user_gs(regs); } else { sp = kernel_stack_pointer(regs); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c3169be4c596..302e7b2572d1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> #include <asm/vdso.h> -#include <asm/intel_rdt.h> +#include <asm/intel_rdt_sched.h> #include <asm/unistd.h> #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ @@ -69,8 +69,7 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int fsindex, gsindex; unsigned int ds, cs, es; - printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff, - (void *)regs->ip); + printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip); printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, regs->sp, regs->flags); if (regs->orig_ax != -1) @@ -149,6 +148,123 @@ void release_thread(struct task_struct *dead_task) } } +enum which_selector { + FS, + GS +}; + +/* + * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are + * not available. The goal is to be reasonably fast on non-FSGSBASE systems. + * It's forcibly inlined because it'll generate better code and this function + * is hot. + */ +static __always_inline void save_base_legacy(struct task_struct *prev_p, + unsigned short selector, + enum which_selector which) +{ + if (likely(selector == 0)) { + /* + * On Intel (without X86_BUG_NULL_SEG), the segment base could + * be the pre-existing saved base or it could be zero. On AMD + * (with X86_BUG_NULL_SEG), the segment base could be almost + * anything. + * + * This branch is very hot (it's hit twice on almost every + * context switch between 64-bit programs), and avoiding + * the RDMSR helps a lot, so we just assume that whatever + * value is already saved is correct. This matches historical + * Linux behavior, so it won't break existing applications. + * + * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we + * report that the base is zero, it needs to actually be zero: + * see the corresponding logic in load_seg_legacy. + */ + } else { + /* + * If the selector is 1, 2, or 3, then the base is zero on + * !X86_BUG_NULL_SEG CPUs and could be anything on + * X86_BUG_NULL_SEG CPUs. In the latter case, Linux + * has never attempted to preserve the base across context + * switches. + * + * If selector > 3, then it refers to a real segment, and + * saving the base isn't necessary. + */ + if (which == FS) + prev_p->thread.fsbase = 0; + else + prev_p->thread.gsbase = 0; + } +} + +static __always_inline void save_fsgs(struct task_struct *task) +{ + savesegment(fs, task->thread.fsindex); + savesegment(gs, task->thread.gsindex); + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); +} + +static __always_inline void loadseg(enum which_selector which, + unsigned short sel) +{ + if (which == FS) + loadsegment(fs, sel); + else + load_gs_index(sel); +} + +static __always_inline void load_seg_legacy(unsigned short prev_index, + unsigned long prev_base, + unsigned short next_index, + unsigned long next_base, + enum which_selector which) +{ + if (likely(next_index <= 3)) { + /* + * The next task is using 64-bit TLS, is not using this + * segment at all, or is having fun with arcane CPU features. + */ + if (next_base == 0) { + /* + * Nasty case: on AMD CPUs, we need to forcibly zero + * the base. + */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + loadseg(which, __USER_DS); + loadseg(which, next_index); + } else { + /* + * We could try to exhaustively detect cases + * under which we can skip the segment load, + * but there's really only one case that matters + * for performance: if both the previous and + * next states are fully zeroed, we can skip + * the load. + * + * (This assumes that prev_base == 0 has no + * false positives. This is the case on + * Intel-style CPUs.) + */ + if (likely(prev_index | next_index | prev_base)) + loadseg(which, next_index); + } + } else { + if (prev_index != next_index) + loadseg(which, next_index); + wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, + next_base); + } + } else { + /* + * The next task is using a real segment. Loading the selector + * is sufficient. + */ + loadseg(which, next_index); + } +} + int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { @@ -229,10 +345,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, unsigned int _cs, unsigned int _ss, unsigned int _ds) { + WARN_ON_ONCE(regs != current_pt_regs()); + + if (static_cpu_has(X86_BUG_NULL_SEG)) { + /* Loading zero below won't clear the base. */ + loadsegment(fs, __USER_DS); + load_gs_index(__USER_DS); + } + loadsegment(fs, 0); loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); + regs->ip = new_ip; regs->sp = new_sp; regs->cs = _cs; @@ -277,7 +402,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - unsigned prev_fsindex, prev_gsindex; + + WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && + this_cpu_read(irq_count) != -1); switch_fpu_prepare(prev_fpu, cpu); @@ -286,8 +413,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * * (e.g. xen_load_tls()) */ - savesegment(fs, prev_fsindex); - savesegment(gs, prev_gsindex); + save_fsgs(prev_p); /* * Load TLS before restoring any segments so that segment loads @@ -326,108 +452,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); - /* - * Switch FS and GS. - * - * These are even more complicated than DS and ES: they have - * 64-bit bases are that controlled by arch_prctl. The bases - * don't necessarily match the selectors, as user code can do - * any number of things to cause them to be inconsistent. - * - * We don't promise to preserve the bases if the selectors are - * nonzero. We also don't promise to preserve the base if the - * selector is zero and the base doesn't match whatever was - * most recently passed to ARCH_SET_FS/GS. (If/when the - * FSGSBASE instructions are enabled, we'll need to offer - * stronger guarantees.) - * - * As an invariant, - * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is - * impossible. - */ - if (next->fsindex) { - /* Loading a nonzero value into FS sets the index and base. */ - loadsegment(fs, next->fsindex); - } else { - if (next->fsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_fsindex) - loadsegment(fs, 0); - wrmsrl(MSR_FS_BASE, next->fsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - */ - loadsegment(fs, __USER_DS); - loadsegment(fs, 0); - } else { - /* - * If the previous index is zero and ARCH_SET_FS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->fsbase || prev_fsindex) - loadsegment(fs, 0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_fsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. - */ - if (prev_fsindex) - prev->fsbase = 0; - prev->fsindex = prev_fsindex; - - if (next->gsindex) { - /* Loading a nonzero value into GS sets the index and base. */ - load_gs_index(next->gsindex); - } else { - if (next->gsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_gsindex) - load_gs_index(0); - wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - * - * This contains a pointless SWAPGS pair. - * Fixing it would involve an explicit check - * for Xen or a new pvop. - */ - load_gs_index(__USER_DS); - load_gs_index(0); - } else { - /* - * If the previous index is zero and ARCH_SET_GS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->gsbase || prev_gsindex) - load_gs_index(0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_gsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. - */ - if (prev_gsindex) - prev->gsbase = 0; - prev->gsindex = prev_gsindex; + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); switch_fpu_finish(next_fpu, cpu); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 0bee04d41bed..eaa591cfd98b 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -1,6 +1,7 @@ /* * This file contains work-arounds for x86 and x86_64 platform bugs. */ +#include <linux/dmi.h> #include <linux/pci.h> #include <linux/irq.h> @@ -656,3 +657,12 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); #endif #endif + +bool x86_apple_machine; +EXPORT_SYMBOL(x86_apple_machine); + +void __init early_platform_quirks(void) +{ + x86_apple_machine = dmi_match(DMI_SYS_VENDOR, "Apple Inc.") || + dmi_match(DMI_SYS_VENDOR, "Apple Computer, Inc."); +} diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a56bf6051f4e..54984b142641 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -38,8 +38,6 @@ void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); -static const struct desc_ptr no_idt = {}; - /* * This is set if we need to go through the 'emergency' path. * When machine_emergency_restart() is called, we may be on @@ -638,7 +636,7 @@ static void native_machine_emergency_restart(void) break; case BOOT_TRIPLE: - load_idt(&no_idt); + idt_invalidate(NULL); __asm__ __volatile__("int3"); /* We're probably dead after this, but... */ diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 98111b38ebfd..307d3bac5f04 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -47,6 +47,7 @@ relocate_kernel: * %rsi page_list * %rdx start address * %rcx preserve_context + * %r8 sme_active */ /* Save the CPU context, used for jumping back */ @@ -71,6 +72,9 @@ relocate_kernel: pushq $0 popfq + /* Save SME active flag */ + movq %r8, %r12 + /* * get physical address of control page now * this is impossible after page table switch @@ -132,6 +136,16 @@ identity_mapped: /* Flush the TLB (needed?) */ movq %r9, %cr3 + /* + * If SME is active, there could be old encrypted cache line + * entries that will conflict with the now unencrypted memory + * used by kexec. Flush the caches before copying the kernel. + */ + testq %r12, %r12 + jz 1f + wbinvd +1: + movq %rcx, %r11 call swap_pages diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3486d0498800..d84afb0a322d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -69,6 +69,7 @@ #include <linux/crash_dump.h> #include <linux/tboot.h> #include <linux/jiffies.h> +#include <linux/mem_encrypt.h> #include <linux/usb/xhci-dbgp.h> #include <video/edid.h> @@ -115,6 +116,7 @@ #include <asm/microcode.h> #include <asm/mmu_context.h> #include <asm/kaslr.h> +#include <asm/unwind.h> /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -374,6 +376,14 @@ static void __init reserve_initrd(void) !ramdisk_image || !ramdisk_size) return; /* No initrd provided by bootloader */ + /* + * If SME is active, this memory will be marked encrypted by the + * kernel when it is accessed (including relocation). However, the + * ramdisk image was loaded decrypted by the bootloader, so make + * sure that it is encrypted before accessing it. + */ + sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image); + initrd_start = 0; mapped_size = memblock_mem_size(max_pfn_mapped); @@ -890,7 +900,7 @@ void __init setup_arch(char **cmdline_p) */ olpc_ofw_detect(); - early_trap_init(); + idt_setup_early_traps(); early_cpu_init(); early_ioremap_init(); @@ -1161,7 +1171,7 @@ void __init setup_arch(char **cmdline_p) init_mem_mapping(); - early_trap_pf_init(); + idt_setup_early_pf(); /* * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features) @@ -1206,6 +1216,8 @@ void __init setup_arch(char **cmdline_p) io_delay_init(); + early_platform_quirks(); + /* * Parse the ACPI tables for possible boot-time SMP configuration. */ @@ -1310,6 +1322,8 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_apply_memmap_quirks(); #endif + + unwind_init(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 10edd1e69a68..6e8fcb6f7e1e 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -155,13 +155,10 @@ static void __init pcpup_populate_pte(unsigned long addr) static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 - struct desc_struct gdt; + struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu), + 0xFFFFF); - pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, - 0x2 | DESCTYPE_S, 0x8); - gdt.s = 1; - write_gdt_entry(get_cpu_gdt_rw(cpu), - GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S); #endif } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cc30a74e4adb..e04442345fc0 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -256,7 +256,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, sp = current->sas_ss_sp + current->sas_ss_size; } else if (IS_ENABLED(CONFIG_X86_32) && !onsigstack && - (regs->ss & 0xffff) != __USER_DS && + regs->ss != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { /* This is the legacy signal stack switching. */ diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d798c0da451c..5c574dff4c1a 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -254,84 +254,45 @@ finish: } /* - * Reschedule call back. + * Reschedule call back. KVM uses this interrupt to force a cpu out of + * guest mode */ -static inline void __smp_reschedule_interrupt(void) -{ - inc_irq_stat(irq_resched_count); - scheduler_ipi(); -} - __visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); - __smp_reschedule_interrupt(); - /* - * KVM uses this interrupt to force a cpu out of guest mode - */ -} - -__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs) -{ - /* - * Need to call irq_enter() before calling the trace point. - * __smp_reschedule_interrupt() calls irq_enter/exit() too (in - * scheduler_ipi(). This is OK, since those functions are allowed - * to nest. - */ - ipi_entering_ack_irq(); - trace_reschedule_entry(RESCHEDULE_VECTOR); - __smp_reschedule_interrupt(); - trace_reschedule_exit(RESCHEDULE_VECTOR); - exiting_irq(); - /* - * KVM uses this interrupt to force a cpu out of guest mode - */ -} + inc_irq_stat(irq_resched_count); -static inline void __smp_call_function_interrupt(void) -{ - generic_smp_call_function_interrupt(); - inc_irq_stat(irq_call_count); + if (trace_resched_ipi_enabled()) { + /* + * scheduler_ipi() might call irq_enter() as well, but + * nested calls are fine. + */ + irq_enter(); + trace_reschedule_entry(RESCHEDULE_VECTOR); + scheduler_ipi(); + trace_reschedule_exit(RESCHEDULE_VECTOR); + irq_exit(); + return; + } + scheduler_ipi(); } __visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); - __smp_call_function_interrupt(); - exiting_irq(); -} - -__visible void __irq_entry -smp_trace_call_function_interrupt(struct pt_regs *regs) -{ - ipi_entering_ack_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); - __smp_call_function_interrupt(); - trace_call_function_exit(CALL_FUNCTION_VECTOR); - exiting_irq(); -} - -static inline void __smp_call_function_single_interrupt(void) -{ - generic_smp_call_function_single_interrupt(); inc_irq_stat(irq_call_count); -} - -__visible void __irq_entry -smp_call_function_single_interrupt(struct pt_regs *regs) -{ - ipi_entering_ack_irq(); - __smp_call_function_single_interrupt(); + generic_smp_call_function_interrupt(); + trace_call_function_exit(CALL_FUNCTION_VECTOR); exiting_irq(); } -__visible void __irq_entry -smp_trace_call_function_single_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_call_function_single_interrupt(struct pt_regs *r) { ipi_entering_ack_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); - __smp_call_function_single_interrupt(); + inc_irq_stat(irq_call_count); + generic_smp_call_function_single_interrupt(); trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); exiting_irq(); } diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 5f25cfbd952e..5ee663836c08 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -13,7 +13,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re unsigned long addr, seg; addr = regs->ip; - seg = regs->cs & 0xffff; + seg = regs->cs; if (v8086_mode(regs)) { addr = (addr & 0xffff) + (seg << 4); return addr; diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 213ddf3e937d..73e4d28112f8 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -21,6 +21,7 @@ #include <asm/compat.h> #include <asm/ia32.h> #include <asm/syscalls.h> +#include <asm/mpx.h> /* * Align a virtual address to avoid aliasing in the I$ on AMD F15h. @@ -100,8 +101,8 @@ out: return error; } -static void find_start_end(unsigned long flags, unsigned long *begin, - unsigned long *end) +static void find_start_end(unsigned long addr, unsigned long flags, + unsigned long *begin, unsigned long *end) { if (!in_compat_syscall() && (flags & MAP_32BIT)) { /* This is usually used needed to map code in small @@ -120,7 +121,10 @@ static void find_start_end(unsigned long flags, unsigned long *begin, } *begin = get_mmap_base(1); - *end = in_compat_syscall() ? tasksize_32bit() : tasksize_64bit(); + if (in_compat_syscall()) + *end = task_size_32bit(); + else + *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); } unsigned long @@ -132,10 +136,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, struct vm_unmapped_area_info info; unsigned long begin, end; + addr = mpx_unmapped_area_check(addr, len, flags); + if (IS_ERR_VALUE(addr)) + return addr; + if (flags & MAP_FIXED) return addr; - find_start_end(flags, &begin, &end); + find_start_end(addr, flags, &begin, &end); if (len > end) return -ENOMEM; @@ -171,6 +179,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, unsigned long addr = addr0; struct vm_unmapped_area_info info; + addr = mpx_unmapped_area_check(addr, len, flags); + if (IS_ERR_VALUE(addr)) + return addr; + /* requested length too big for entire address space */ if (len > TASK_SIZE) return -ENOMEM; @@ -195,6 +207,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = get_mmap_base(0); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + * + * !in_compat_syscall() check to avoid high addresses for x32. + */ + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; + info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; if (filp) { diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index dcd699baea1b..a106b9719c58 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -93,7 +93,7 @@ static void set_tls_desc(struct task_struct *p, int idx, while (n-- > 0) { if (LDT_empty(info) || LDT_zero(info)) { - desc->a = desc->b = 0; + memset(desc, 0, sizeof(*desc)); } else { fill_ldt(desc, info); diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 15515132bf0d..c6636d1f60b9 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -4,57 +4,38 @@ * Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com> * */ -#include <asm/hw_irq.h> -#include <asm/desc.h> +#include <linux/jump_label.h> #include <linux/atomic.h> -atomic_t trace_idt_ctr = ATOMIC_INIT(0); -struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, - (unsigned long) trace_idt_table }; - -/* No need to be aligned, but done to keep all IDTs defined the same way. */ -gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; +#include <asm/hw_irq.h> +#include <asm/desc.h> -static int trace_irq_vector_refcount; -static DEFINE_MUTEX(irq_vector_mutex); +DEFINE_STATIC_KEY_FALSE(trace_pagefault_key); -static void set_trace_idt_ctr(int val) +int trace_pagefault_reg(void) { - atomic_set(&trace_idt_ctr, val); - /* Ensure the trace_idt_ctr is set before sending IPI */ - wmb(); + static_branch_inc(&trace_pagefault_key); + return 0; } -static void switch_idt(void *arg) +void trace_pagefault_unreg(void) { - unsigned long flags; - - local_irq_save(flags); - load_current_idt(); - local_irq_restore(flags); + static_branch_dec(&trace_pagefault_key); } -int trace_irq_vector_regfunc(void) +#ifdef CONFIG_SMP + +DEFINE_STATIC_KEY_FALSE(trace_resched_ipi_key); + +int trace_resched_ipi_reg(void) { - mutex_lock(&irq_vector_mutex); - if (!trace_irq_vector_refcount) { - set_trace_idt_ctr(1); - smp_call_function(switch_idt, NULL, 0); - switch_idt(NULL); - } - trace_irq_vector_refcount++; - mutex_unlock(&irq_vector_mutex); + static_branch_inc(&trace_resched_ipi_key); return 0; } -void trace_irq_vector_unregfunc(void) +void trace_resched_ipi_unreg(void) { - mutex_lock(&irq_vector_mutex); - trace_irq_vector_refcount--; - if (!trace_irq_vector_refcount) { - set_trace_idt_ctr(0); - smp_call_function(switch_idt, NULL, 0); - switch_idt(NULL); - } - mutex_unlock(&irq_vector_mutex); + static_branch_dec(&trace_resched_ipi_key); } + +#endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bf54309b85da..34ea3651362e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -38,11 +38,6 @@ #include <linux/smp.h> #include <linux/io.h> -#ifdef CONFIG_EISA -#include <linux/ioport.h> -#include <linux/eisa.h> -#endif - #if defined(CONFIG_EDAC) #include <linux/edac.h> #endif @@ -70,20 +65,13 @@ #include <asm/x86_init.h> #include <asm/pgalloc.h> #include <asm/proto.h> - -/* No need to be aligned, but done to keep all IDTs defined the same way. */ -gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss; #else #include <asm/processor-flags.h> #include <asm/setup.h> #include <asm/proto.h> #endif -/* Must be page-aligned because the real IDT is used in a fixmap. */ -gate_desc idt_table[NR_VECTORS] __page_aligned_bss; - DECLARE_BITMAP(used_vectors, NR_VECTORS); -EXPORT_SYMBOL_GPL(used_vectors); static inline void cond_local_irq_enable(struct pt_regs *regs) { @@ -935,87 +923,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) } #endif -/* Set of traps needed for early debugging. */ -void __init early_trap_init(void) -{ - /* - * Don't use IST to set DEBUG_STACK as it doesn't work until TSS - * is ready in cpu_init() <-- trap_init(). Before trap_init(), - * CPU runs at ring 0 so it is impossible to hit an invalid - * stack. Using the original stack works well enough at this - * early stage. DEBUG_STACK will be equipped after cpu_init() in - * trap_init(). - * - * We don't need to set trace_idt_table like set_intr_gate(), - * since we don't have trace_debug and it will be reset to - * 'debug' in trap_init() by set_intr_gate_ist(). - */ - set_intr_gate_notrace(X86_TRAP_DB, debug); - /* int3 can be called from all */ - set_system_intr_gate(X86_TRAP_BP, &int3); -#ifdef CONFIG_X86_32 - set_intr_gate(X86_TRAP_PF, page_fault); -#endif - load_idt(&idt_descr); -} - -void __init early_trap_pf_init(void) -{ -#ifdef CONFIG_X86_64 - set_intr_gate(X86_TRAP_PF, page_fault); -#endif -} - void __init trap_init(void) { - int i; - -#ifdef CONFIG_EISA - void __iomem *p = early_ioremap(0x0FFFD9, 4); - - if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) - EISA_bus = 1; - early_iounmap(p, 4); -#endif - - set_intr_gate(X86_TRAP_DE, divide_error); - set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK); - /* int4 can be called from all */ - set_system_intr_gate(X86_TRAP_OF, &overflow); - set_intr_gate(X86_TRAP_BR, bounds); - set_intr_gate(X86_TRAP_UD, invalid_op); - set_intr_gate(X86_TRAP_NM, device_not_available); -#ifdef CONFIG_X86_32 - set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS); -#else - set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK); -#endif - set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun); - set_intr_gate(X86_TRAP_TS, invalid_TSS); - set_intr_gate(X86_TRAP_NP, segment_not_present); - set_intr_gate(X86_TRAP_SS, stack_segment); - set_intr_gate(X86_TRAP_GP, general_protection); - set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug); - set_intr_gate(X86_TRAP_MF, coprocessor_error); - set_intr_gate(X86_TRAP_AC, alignment_check); -#ifdef CONFIG_X86_MCE - set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK); -#endif - set_intr_gate(X86_TRAP_XF, simd_coprocessor_error); - - /* Reserve all the builtin and the syscall vector: */ - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) - set_bit(i, used_vectors); - -#ifdef CONFIG_IA32_EMULATION - set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat); - set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#endif - -#ifdef CONFIG_X86_32 - set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); - set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#endif + idt_setup_traps(); /* * Set the IDT descriptor to a fixed read-only location, so that the @@ -1030,20 +940,9 @@ void __init trap_init(void) */ cpu_init(); - /* - * X86_TRAP_DB and X86_TRAP_BP have been set - * in early_trap_init(). However, ITS works only after - * cpu_init() loads TSS. See comments in early_trap_init(). - */ - set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); - /* int3 can be called from all */ - set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); + idt_setup_ist_traps(); x86_init.irqs.trap_init(); -#ifdef CONFIG_X86_64 - memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16); - set_nmi_gate(X86_TRAP_DB, &debug); - set_nmi_gate(X86_TRAP_BP, &int3); -#endif + idt_setup_debugidt_traps(); } diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index b9389d72b2f7..d145a0b1f529 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -10,20 +10,22 @@ #define FRAME_HEADER_SIZE (sizeof(long) * 2) -/* - * This disables KASAN checking when reading a value from another task's stack, - * since the other task could be running on another CPU and could have poisoned - * the stack in the meantime. - */ -#define READ_ONCE_TASK_STACK(task, x) \ -({ \ - unsigned long val; \ - if (task == current) \ - val = READ_ONCE(x); \ - else \ - val = READ_ONCE_NOCHECK(x); \ - val; \ -}) +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + return state->regs ? &state->regs->ip : state->bp + 1; +} static void unwind_dump(struct unwind_state *state) { @@ -66,15 +68,6 @@ static void unwind_dump(struct unwind_state *state) } } -unsigned long unwind_get_return_address(struct unwind_state *state) -{ - if (unwind_done(state)) - return 0; - - return __kernel_text_address(state->ip) ? state->ip : 0; -} -EXPORT_SYMBOL_GPL(unwind_get_return_address); - static size_t regs_size(struct pt_regs *regs) { /* x86_32 regs from kernel mode are two words shorter: */ @@ -91,10 +84,8 @@ static bool in_entry_code(unsigned long ip) if (addr >= __entry_text_start && addr < __entry_text_end) return true; -#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) if (addr >= __irqentry_text_start && addr < __irqentry_text_end) return true; -#endif return false; } diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c index 039f36738e49..4f0e17b90463 100644 --- a/arch/x86/kernel/unwind_guess.c +++ b/arch/x86/kernel/unwind_guess.c @@ -19,6 +19,11 @@ unsigned long unwind_get_return_address(struct unwind_state *state) } EXPORT_SYMBOL_GPL(unwind_get_return_address); +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + return NULL; +} + bool unwind_next_frame(struct unwind_state *state) { struct stack_info *info = &state->stack_info; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c new file mode 100644 index 000000000000..570b70d3f604 --- /dev/null +++ b/arch/x86/kernel/unwind_orc.c @@ -0,0 +1,582 @@ +#include <linux/module.h> +#include <linux/sort.h> +#include <asm/ptrace.h> +#include <asm/stacktrace.h> +#include <asm/unwind.h> +#include <asm/orc_types.h> +#include <asm/orc_lookup.h> +#include <asm/sections.h> + +#define orc_warn(fmt, ...) \ + printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__) + +extern int __start_orc_unwind_ip[]; +extern int __stop_orc_unwind_ip[]; +extern struct orc_entry __start_orc_unwind[]; +extern struct orc_entry __stop_orc_unwind[]; + +static DEFINE_MUTEX(sort_mutex); +int *cur_orc_ip_table = __start_orc_unwind_ip; +struct orc_entry *cur_orc_table = __start_orc_unwind; + +unsigned int lookup_num_blocks; +bool orc_init; + +static inline unsigned long orc_ip(const int *ip) +{ + return (unsigned long)ip + *ip; +} + +static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table, + unsigned int num_entries, unsigned long ip) +{ + int *first = ip_table; + int *last = ip_table + num_entries - 1; + int *mid = first, *found = first; + + if (!num_entries) + return NULL; + + /* + * Do a binary range search to find the rightmost duplicate of a given + * starting address. Some entries are section terminators which are + * "weak" entries for ensuring there are no gaps. They should be + * ignored when they conflict with a real entry. + */ + while (first <= last) { + mid = first + ((last - first) / 2); + + if (orc_ip(mid) <= ip) { + found = mid; + first = mid + 1; + } else + last = mid - 1; + } + + return u_table + (found - ip_table); +} + +#ifdef CONFIG_MODULES +static struct orc_entry *orc_module_find(unsigned long ip) +{ + struct module *mod; + + mod = __module_address(ip); + if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip) + return NULL; + return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind, + mod->arch.num_orcs, ip); +} +#else +static struct orc_entry *orc_module_find(unsigned long ip) +{ + return NULL; +} +#endif + +static struct orc_entry *orc_find(unsigned long ip) +{ + if (!orc_init) + return NULL; + + /* For non-init vmlinux addresses, use the fast lookup table: */ + if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) { + unsigned int idx, start, stop; + + idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE; + + if (unlikely((idx >= lookup_num_blocks-1))) { + orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%lx\n", + idx, lookup_num_blocks, ip); + return NULL; + } + + start = orc_lookup[idx]; + stop = orc_lookup[idx + 1] + 1; + + if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) || + (__start_orc_unwind + stop > __stop_orc_unwind))) { + orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%lx\n", + idx, lookup_num_blocks, start, stop, ip); + return NULL; + } + + return __orc_find(__start_orc_unwind_ip + start, + __start_orc_unwind + start, stop - start, ip); + } + + /* vmlinux .init slow lookup: */ + if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext) + return __orc_find(__start_orc_unwind_ip, __start_orc_unwind, + __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); + + /* Module lookup: */ + return orc_module_find(ip); +} + +static void orc_sort_swap(void *_a, void *_b, int size) +{ + struct orc_entry *orc_a, *orc_b; + struct orc_entry orc_tmp; + int *a = _a, *b = _b, tmp; + int delta = _b - _a; + + /* Swap the .orc_unwind_ip entries: */ + tmp = *a; + *a = *b + delta; + *b = tmp - delta; + + /* Swap the corresponding .orc_unwind entries: */ + orc_a = cur_orc_table + (a - cur_orc_ip_table); + orc_b = cur_orc_table + (b - cur_orc_ip_table); + orc_tmp = *orc_a; + *orc_a = *orc_b; + *orc_b = orc_tmp; +} + +static int orc_sort_cmp(const void *_a, const void *_b) +{ + struct orc_entry *orc_a; + const int *a = _a, *b = _b; + unsigned long a_val = orc_ip(a); + unsigned long b_val = orc_ip(b); + + if (a_val > b_val) + return 1; + if (a_val < b_val) + return -1; + + /* + * The "weak" section terminator entries need to always be on the left + * to ensure the lookup code skips them in favor of real entries. + * These terminator entries exist to handle any gaps created by + * whitelisted .o files which didn't get objtool generation. + */ + orc_a = cur_orc_table + (a - cur_orc_ip_table); + return orc_a->sp_reg == ORC_REG_UNDEFINED ? -1 : 1; +} + +#ifdef CONFIG_MODULES +void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size, + void *_orc, size_t orc_size) +{ + int *orc_ip = _orc_ip; + struct orc_entry *orc = _orc; + unsigned int num_entries = orc_ip_size / sizeof(int); + + WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(*orc) != 0 || + num_entries != orc_size / sizeof(*orc)); + + /* + * The 'cur_orc_*' globals allow the orc_sort_swap() callback to + * associate an .orc_unwind_ip table entry with its corresponding + * .orc_unwind entry so they can both be swapped. + */ + mutex_lock(&sort_mutex); + cur_orc_ip_table = orc_ip; + cur_orc_table = orc; + sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap); + mutex_unlock(&sort_mutex); + + mod->arch.orc_unwind_ip = orc_ip; + mod->arch.orc_unwind = orc; + mod->arch.num_orcs = num_entries; +} +#endif + +void __init unwind_init(void) +{ + size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip; + size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind; + size_t num_entries = orc_ip_size / sizeof(int); + struct orc_entry *orc; + int i; + + if (!num_entries || orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(struct orc_entry) != 0 || + num_entries != orc_size / sizeof(struct orc_entry)) { + orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n"); + return; + } + + /* Sort the .orc_unwind and .orc_unwind_ip tables: */ + sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp, + orc_sort_swap); + + /* Initialize the fast lookup table: */ + lookup_num_blocks = orc_lookup_end - orc_lookup; + for (i = 0; i < lookup_num_blocks-1; i++) { + orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, + num_entries, + LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i)); + if (!orc) { + orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); + return; + } + + orc_lookup[i] = orc - __start_orc_unwind; + } + + /* Initialize the ending block: */ + orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries, + LOOKUP_STOP_IP); + if (!orc) { + orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); + return; + } + orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind; + + orc_init = true; +} + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + if (state->regs) + return &state->regs->ip; + + if (state->sp) + return (unsigned long *)state->sp - 1; + + return NULL; +} + +static bool stack_access_ok(struct unwind_state *state, unsigned long addr, + size_t len) +{ + struct stack_info *info = &state->stack_info; + + /* + * If the address isn't on the current stack, switch to the next one. + * + * We may have to traverse multiple stacks to deal with the possibility + * that info->next_sp could point to an empty stack and the address + * could be on a subsequent stack. + */ + while (!on_stack(info, (void *)addr, len)) + if (get_stack_info(info->next_sp, state->task, info, + &state->stack_mask)) + return false; + + return true; +} + +static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, + unsigned long *val) +{ + if (!stack_access_ok(state, addr, sizeof(long))) + return false; + + *val = READ_ONCE_TASK_STACK(state->task, *(unsigned long *)addr); + return true; +} + +#define REGS_SIZE (sizeof(struct pt_regs)) +#define SP_OFFSET (offsetof(struct pt_regs, sp)) +#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) +#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) + +static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, + unsigned long *ip, unsigned long *sp, bool full) +{ + size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; + size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET; + struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); + + if (IS_ENABLED(CONFIG_X86_64)) { + if (!stack_access_ok(state, addr, regs_size)) + return false; + + *ip = regs->ip; + *sp = regs->sp; + + return true; + } + + if (!stack_access_ok(state, addr, sp_offset)) + return false; + + *ip = regs->ip; + + if (user_mode(regs)) { + if (!stack_access_ok(state, addr + sp_offset, + REGS_SIZE - SP_OFFSET)) + return false; + + *sp = regs->sp; + } else + *sp = (unsigned long)®s->sp; + + return true; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; + enum stack_type prev_type = state->stack_info.type; + struct orc_entry *orc; + struct pt_regs *ptregs; + bool indirect = false; + + if (unwind_done(state)) + return false; + + /* Don't let modules unload while we're reading their ORC data. */ + preempt_disable(); + + /* Have we reached the end? */ + if (state->regs && user_mode(state->regs)) + goto done; + + /* + * Find the orc_entry associated with the text address. + * + * Decrement call return addresses by one so they work for sibling + * calls and calls to noreturn functions. + */ + orc = orc_find(state->signal ? state->ip : state->ip - 1); + if (!orc || orc->sp_reg == ORC_REG_UNDEFINED) + goto done; + orig_ip = state->ip; + + /* Find the previous frame's stack: */ + switch (orc->sp_reg) { + case ORC_REG_SP: + sp = state->sp + orc->sp_offset; + break; + + case ORC_REG_BP: + sp = state->bp + orc->sp_offset; + break; + + case ORC_REG_SP_INDIRECT: + sp = state->sp + orc->sp_offset; + indirect = true; + break; + + case ORC_REG_BP_INDIRECT: + sp = state->bp + orc->sp_offset; + indirect = true; + break; + + case ORC_REG_R10: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg R10 at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->r10; + break; + + case ORC_REG_R13: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg R13 at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->r13; + break; + + case ORC_REG_DI: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg DI at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->di; + break; + + case ORC_REG_DX: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg DX at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->dx; + break; + + default: + orc_warn("unknown SP base reg %d for ip %p\n", + orc->sp_reg, (void *)state->ip); + goto done; + } + + if (indirect) { + if (!deref_stack_reg(state, sp, &sp)) + goto done; + } + + /* Find IP, SP and possibly regs: */ + switch (orc->type) { + case ORC_TYPE_CALL: + ip_p = sp - sizeof(long); + + if (!deref_stack_reg(state, ip_p, &state->ip)) + goto done; + + state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, + state->ip, (void *)ip_p); + + state->sp = sp; + state->regs = NULL; + state->signal = false; + break; + + case ORC_TYPE_REGS: + if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { + orc_warn("can't dereference registers at %p for ip %p\n", + (void *)sp, (void *)orig_ip); + goto done; + } + + state->regs = (struct pt_regs *)sp; + state->full_regs = true; + state->signal = true; + break; + + case ORC_TYPE_REGS_IRET: + if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { + orc_warn("can't dereference iret registers at %p for ip %p\n", + (void *)sp, (void *)orig_ip); + goto done; + } + + ptregs = container_of((void *)sp, struct pt_regs, ip); + if ((unsigned long)ptregs >= prev_sp && + on_stack(&state->stack_info, ptregs, REGS_SIZE)) { + state->regs = ptregs; + state->full_regs = false; + } else + state->regs = NULL; + + state->signal = true; + break; + + default: + orc_warn("unknown .orc_unwind entry type %d\n", orc->type); + break; + } + + /* Find BP: */ + switch (orc->bp_reg) { + case ORC_REG_UNDEFINED: + if (state->regs && state->full_regs) + state->bp = state->regs->bp; + break; + + case ORC_REG_PREV_SP: + if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp)) + goto done; + break; + + case ORC_REG_BP: + if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp)) + goto done; + break; + + default: + orc_warn("unknown BP base reg %d for ip %p\n", + orc->bp_reg, (void *)orig_ip); + goto done; + } + + /* Prevent a recursive loop due to bad ORC data: */ + if (state->stack_info.type == prev_type && + on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) && + state->sp <= prev_sp) { + orc_warn("stack going in the wrong direction? ip=%p\n", + (void *)orig_ip); + goto done; + } + + preempt_enable(); + return true; + +done: + preempt_enable(); + state->stack_info.type = STACK_TYPE_UNKNOWN; + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + state->task = task; + + /* + * Refuse to unwind the stack of a task while it's executing on another + * CPU. This check is racy, but that's ok: the unwinder has other + * checks to prevent it from going off the rails. + */ + if (task_on_another_cpu(task)) + goto done; + + if (regs) { + if (user_mode(regs)) + goto done; + + state->ip = regs->ip; + state->sp = kernel_stack_pointer(regs); + state->bp = regs->bp; + state->regs = regs; + state->full_regs = true; + state->signal = true; + + } else if (task == current) { + asm volatile("lea (%%rip), %0\n\t" + "mov %%rsp, %1\n\t" + "mov %%rbp, %2\n\t" + : "=r" (state->ip), "=r" (state->sp), + "=r" (state->bp)); + + } else { + struct inactive_task_frame *frame = (void *)task->thread.sp; + + state->sp = task->thread.sp; + state->bp = READ_ONCE_NOCHECK(frame->bp); + state->ip = READ_ONCE_NOCHECK(frame->ret_addr); + } + + if (get_stack_info((unsigned long *)state->sp, state->task, + &state->stack_info, &state->stack_mask)) + return; + + /* + * The caller can provide the address of the first frame directly + * (first_frame) or indirectly (regs->sp) to indicate which stack frame + * to start unwinding at. Skip ahead until we reach it. + */ + + /* When starting from regs, skip the regs frame: */ + if (regs) { + unwind_next_frame(state); + return; + } + + /* Otherwise, skip ahead to the user-specified starting frame: */ + while (!unwind_done(state) && + (!on_stack(&state->stack_info, first_frame, sizeof(long)) || + state->sp <= (unsigned long)first_frame)) + unwind_next_frame(state); + + return; + +done: + state->stack_info.type = STACK_TYPE_UNKNOWN; + return; +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index c8a3b61be0aa..f05f00acac89 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -24,6 +24,7 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/page_types.h> +#include <asm/orc_lookup.h> #include <asm/cache.h> #include <asm/boot.h> @@ -148,6 +149,8 @@ SECTIONS BUG_TABLE + ORC_UNWIND_TABLE + . = ALIGN(PAGE_SIZE); __vvar_page = .; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 2688c7dc5323..3ea624452f93 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -89,6 +89,5 @@ config KVM_MMU_AUDIT # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig -source drivers/lguest/Kconfig endif # VIRTUALIZATION diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9b1dd114956a..04d750813c9d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -108,7 +108,7 @@ module_param(dbg, bool, 0644); (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))) #define PT64_DIR_BASE_ADDR_MASK \ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) #define PT64_LVL_ADDR_MASK(level) \ @@ -126,7 +126,7 @@ module_param(dbg, bool, 0644); * PT32_LEVEL_BITS))) - 1)) #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ - | shadow_x_mask | shadow_nx_mask) + | shadow_x_mask | shadow_nx_mask | shadow_me_mask) #define ACC_EXEC_MASK 1 #define ACC_WRITE_MASK PT_WRITABLE_MASK @@ -186,6 +186,7 @@ static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static u64 __read_mostly shadow_mmio_value; static u64 __read_mostly shadow_present_mask; +static u64 __read_mostly shadow_me_mask; /* * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. @@ -349,7 +350,7 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) */ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask) + u64 acc_track_mask, u64 me_mask) { BUG_ON(!dirty_mask != !accessed_mask); BUG_ON(!accessed_mask && !acc_track_mask); @@ -362,6 +363,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, shadow_x_mask = x_mask; shadow_present_mask = p_mask; shadow_acc_track_mask = acc_track_mask; + shadow_me_mask = me_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -2433,7 +2435,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask; + shadow_user_mask | shadow_x_mask | shadow_me_mask; if (sp_ad_disabled(sp)) spte |= shadow_acc_track_value; @@ -2745,6 +2747,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, pte_access &= ~ACC_WRITE_MASK; spte |= (u64)pfn << PAGE_SHIFT; + spte |= shadow_me_mask; if (pte_access & ACC_WRITE_MASK) { @@ -4106,16 +4109,28 @@ void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { bool uses_nx = context->nx || context->base_role.smep_andnot_wp; + struct rsvd_bits_validate *shadow_zero_check; + int i; /* * Passing "true" to the last argument is okay; it adds a check * on bit 8 of the SPTEs which KVM doesn't use anyway. */ - __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, + shadow_zero_check = &context->shadow_zero_check; + __reset_rsvds_bits_mask(vcpu, shadow_zero_check, boot_cpu_data.x86_phys_bits, context->shadow_root_level, uses_nx, guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), true); + + if (!shadow_me_mask) + return; + + for (i = context->shadow_root_level; --i >= 0;) { + shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; + shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; + } + } EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); @@ -4133,17 +4148,29 @@ static void reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { + struct rsvd_bits_validate *shadow_zero_check; + int i; + + shadow_zero_check = &context->shadow_zero_check; + if (boot_cpu_is_amd()) - __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, + __reset_rsvds_bits_mask(vcpu, shadow_zero_check, boot_cpu_data.x86_phys_bits, context->shadow_root_level, false, boot_cpu_has(X86_FEATURE_GBPAGES), true, true); else - __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, + __reset_rsvds_bits_mask_ept(shadow_zero_check, boot_cpu_data.x86_phys_bits, false); + if (!shadow_me_mask) + return; + + for (i = context->shadow_root_level; --i >= 0;) { + shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; + shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; + } } /* diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index af256b786a70..8dbd8dbc83eb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1167,9 +1167,9 @@ static void avic_init_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb; struct kvm_arch *vm_data = &svm->vcpu.kvm->arch; - phys_addr_t bpa = page_to_phys(svm->avic_backing_page); - phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page); - phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page); + phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); + phys_addr_t lpa = __sme_set(page_to_phys(vm_data->avic_logical_id_table_page)); + phys_addr_t ppa = __sme_set(page_to_phys(vm_data->avic_physical_id_table_page)); vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; @@ -1232,8 +1232,8 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_MWAIT); } - control->iopm_base_pa = iopm_base; - control->msrpm_base_pa = __pa(svm->msrpm); + control->iopm_base_pa = __sme_set(iopm_base); + control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); control->int_ctl = V_INTR_MASKING_MASK; init_seg(&save->es); @@ -1377,9 +1377,9 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return -EINVAL; new_entry = READ_ONCE(*entry); - new_entry = (page_to_phys(svm->avic_backing_page) & - AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | - AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; + new_entry = __sme_set((page_to_phys(svm->avic_backing_page) & + AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | + AVIC_PHYSICAL_ID_ENTRY_VALID_MASK); WRITE_ONCE(*entry, new_entry); svm->avic_physical_id_cache = entry; @@ -1647,7 +1647,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb = page_address(page); clear_page(svm->vmcb); - svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; + svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT); svm->asid_generation = 0; init_vmcb(svm); @@ -1675,7 +1675,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); + __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); @@ -2330,7 +2330,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) u64 pdpte; int ret; - ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte, + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte, offset_in_page(cr3) + index * 8, 8); if (ret) return 0; @@ -2342,7 +2342,7 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->control.nested_cr3 = root; + svm->vmcb->control.nested_cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_NPT); svm_flush_tlb(vcpu); } @@ -2873,7 +2873,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) svm->nested.msrpm[p] = svm->msrpm[p] | value; } - svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); + svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm)); return true; } @@ -4506,7 +4506,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, irq.vector); *svm = to_svm(vcpu); - vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page); + vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page)); vcpu_info->vector = irq.vector; return 0; @@ -4557,7 +4557,8 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, struct amd_iommu_pi_data pi; /* Try to enable guest_mode in IRTE */ - pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK; + pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & + AVIC_HPA_MASK); pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id, svm->vcpu.vcpu_id); pi.is_guest_mode = true; @@ -5006,7 +5007,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.cr3 = root; + svm->vmcb->save.cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_CR); svm_flush_tlb(vcpu); } @@ -5015,7 +5016,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->control.nested_cr3 = root; + svm->vmcb->control.nested_cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_NPT); /* Also sync guest cr3 here in case we live migrate */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c6ef2940119b..70b90c0810d0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6556,7 +6556,7 @@ void vmx_enable_tdp(void) enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK, cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, - VMX_EPT_RWX_MASK); + VMX_EPT_RWX_MASK, 0ull); ept_set_mmio_spte_mask(); kvm_enable_tdp(); @@ -8779,7 +8779,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) vector = exit_intr_info & INTR_INFO_VECTOR_MASK; desc = (gate_desc *)vmx->host_idt_base + vector; - entry = gate_offset(*desc); + entry = gate_offset(desc); asm volatile( #ifdef CONFIG_X86_64 "mov %%" _ASM_SP ", %[sp]\n\t" diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 272320eb328c..ef5102f80497 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -54,6 +54,7 @@ #include <linux/kvm_irqfd.h> #include <linux/irqbypass.h> #include <linux/sched/stat.h> +#include <linux/mem_encrypt.h> #include <trace/events/kvm.h> @@ -6125,7 +6126,7 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, - PT_PRESENT_MASK, 0); + PT_PRESENT_MASK, 0, sme_me_mask); kvm_timer_init(); perf_register_guest_info_callbacks(&kvm_guest_cbs); diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig deleted file mode 100644 index 08f41caada45..000000000000 --- a/arch/x86/lguest/Kconfig +++ /dev/null @@ -1,14 +0,0 @@ -config LGUEST_GUEST - bool "Lguest guest support" - depends on X86_32 && PARAVIRT && PCI - select TTY - select VIRTUALIZATION - select VIRTIO - select VIRTIO_CONSOLE - help - Lguest is a tiny in-kernel hypervisor. Selecting this will - allow your kernel to boot under lguest. This option will increase - your kernel size by about 10k. If in doubt, say N. - - If you say Y here, make sure you say Y (or M) to the virtio block - and net drivers which lguest needs. diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile deleted file mode 100644 index 8f38d577a2fa..000000000000 --- a/arch/x86/lguest/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -obj-y := head_32.o boot.o -CFLAGS_boot.o := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c deleted file mode 100644 index 99472698c931..000000000000 --- a/arch/x86/lguest/boot.c +++ /dev/null @@ -1,1558 +0,0 @@ -/*P:010 - * A hypervisor allows multiple Operating Systems to run on a single machine. - * To quote David Wheeler: "Any problem in computer science can be solved with - * another layer of indirection." - * - * We keep things simple in two ways. First, we start with a normal Linux - * kernel and insert a module (lg.ko) which allows us to run other Linux - * kernels the same way we'd run processes. We call the first kernel the Host, - * and the others the Guests. The program which sets up and configures Guests - * (such as the example in tools/lguest/lguest.c) is called the Launcher. - * - * Secondly, we only run specially modified Guests, not normal kernels: setting - * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows - * how to be a Guest at boot time. This means that you can use the same kernel - * you boot normally (ie. as a Host) as a Guest. - * - * These Guests know that they cannot do privileged operations, such as disable - * interrupts, and that they have to ask the Host to do such things explicitly. - * This file consists of all the replacements for such low-level native - * hardware operations: these special Guest versions call the Host. - * - * So how does the kernel know it's a Guest? We'll see that later, but let's - * just say that we end up here where we replace the native functions various - * "paravirt" structures with our Guest versions, then boot like normal. -:*/ - -/* - * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ -#include <linux/kernel.h> -#include <linux/start_kernel.h> -#include <linux/string.h> -#include <linux/console.h> -#include <linux/screen_info.h> -#include <linux/irq.h> -#include <linux/interrupt.h> -#include <linux/clocksource.h> -#include <linux/clockchips.h> -#include <linux/lguest.h> -#include <linux/lguest_launcher.h> -#include <linux/virtio_console.h> -#include <linux/pm.h> -#include <linux/export.h> -#include <linux/pci.h> -#include <linux/virtio_pci.h> -#include <asm/acpi.h> -#include <asm/apic.h> -#include <asm/lguest.h> -#include <asm/paravirt.h> -#include <asm/param.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/desc.h> -#include <asm/setup.h> -#include <asm/e820/api.h> -#include <asm/mce.h> -#include <asm/io.h> -#include <asm/fpu/api.h> -#include <asm/stackprotector.h> -#include <asm/reboot.h> /* for struct machine_ops */ -#include <asm/kvm_para.h> -#include <asm/pci_x86.h> -#include <asm/pci-direct.h> - -/*G:010 - * Welcome to the Guest! - * - * The Guest in our tale is a simple creature: identical to the Host but - * behaving in simplified but equivalent ways. In particular, the Guest is the - * same kernel as the Host (or at least, built from the same source code). -:*/ - -struct lguest_data lguest_data = { - .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, - .noirq_iret = (u32)lguest_noirq_iret, - .kernel_address = PAGE_OFFSET, - .blocked_interrupts = { 1 }, /* Block timer interrupts */ - .syscall_vec = IA32_SYSCALL_VECTOR, -}; - -/*G:037 - * async_hcall() is pretty simple: I'm quite proud of it really. We have a - * ring buffer of stored hypercalls which the Host will run though next time we - * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall - * arguments, and a "hcall_status" word which is 0 if the call is ready to go, - * and 255 once the Host has finished with it. - * - * If we come around to a slot which hasn't been finished, then the table is - * full and we just make the hypercall directly. This has the nice side - * effect of causing the Host to run all the stored calls in the ring buffer - * which empties it for next time! - */ -static void async_hcall(unsigned long call, unsigned long arg1, - unsigned long arg2, unsigned long arg3, - unsigned long arg4) -{ - /* Note: This code assumes we're uniprocessor. */ - static unsigned int next_call; - unsigned long flags; - - /* - * Disable interrupts if not already disabled: we don't want an - * interrupt handler making a hypercall while we're already doing - * one! - */ - local_irq_save(flags); - if (lguest_data.hcall_status[next_call] != 0xFF) { - /* Table full, so do normal hcall which will flush table. */ - hcall(call, arg1, arg2, arg3, arg4); - } else { - lguest_data.hcalls[next_call].arg0 = call; - lguest_data.hcalls[next_call].arg1 = arg1; - lguest_data.hcalls[next_call].arg2 = arg2; - lguest_data.hcalls[next_call].arg3 = arg3; - lguest_data.hcalls[next_call].arg4 = arg4; - /* Arguments must all be written before we mark it to go */ - wmb(); - lguest_data.hcall_status[next_call] = 0; - if (++next_call == LHCALL_RING_SIZE) - next_call = 0; - } - local_irq_restore(flags); -} - -/*G:035 - * Notice the lazy_hcall() above, rather than hcall(). This is our first real - * optimization trick! - * - * When lazy_mode is set, it means we're allowed to defer all hypercalls and do - * them as a batch when lazy_mode is eventually turned off. Because hypercalls - * are reasonably expensive, batching them up makes sense. For example, a - * large munmap might update dozens of page table entries: that code calls - * paravirt_enter_lazy_mmu(), does the dozen updates, then calls - * lguest_leave_lazy_mode(). - * - * So, when we're in lazy mode, we call async_hcall() to store the call for - * future processing: - */ -static void lazy_hcall1(unsigned long call, unsigned long arg1) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, 0, 0, 0); - else - async_hcall(call, arg1, 0, 0, 0); -} - -/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ -static void lazy_hcall2(unsigned long call, - unsigned long arg1, - unsigned long arg2) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, 0, 0); - else - async_hcall(call, arg1, arg2, 0, 0); -} - -static void lazy_hcall3(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, arg3, 0); - else - async_hcall(call, arg1, arg2, arg3, 0); -} - -#ifdef CONFIG_X86_PAE -static void lazy_hcall4(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, arg3, arg4); - else - async_hcall(call, arg1, arg2, arg3, arg4); -} -#endif - -/*G:036 - * When lazy mode is turned off, we issue the do-nothing hypercall to - * flush any stored calls, and call the generic helper to reset the - * per-cpu lazy mode variable. - */ -static void lguest_leave_lazy_mmu_mode(void) -{ - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); - paravirt_leave_lazy_mmu(); -} - -/* - * We also catch the end of context switch; we enter lazy mode for much of - * that too, so again we need to flush here. - * - * (Technically, this is lazy CPU mode, and normally we're in lazy MMU - * mode, but unlike Xen, lguest doesn't care about the difference). - */ -static void lguest_end_context_switch(struct task_struct *next) -{ - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); - paravirt_end_context_switch(next); -} - -/*G:032 - * After that diversion we return to our first native-instruction - * replacements: four functions for interrupt control. - * - * The simplest way of implementing these would be to have "turn interrupts - * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow: - * these are by far the most commonly called functions of those we override. - * - * So instead we keep an "irq_enabled" field inside our "struct lguest_data", - * which the Guest can update with a single instruction. The Host knows to - * check there before it tries to deliver an interrupt. - */ - -/* - * save_flags() is expected to return the processor state (ie. "flags"). The - * flags word contains all kind of stuff, but in practice Linux only cares - * about the interrupt flag. Our "save_flags()" just returns that. - */ -asmlinkage __visible unsigned long lguest_save_fl(void) -{ - return lguest_data.irq_enabled; -} - -/* Interrupts go off... */ -asmlinkage __visible void lguest_irq_disable(void) -{ - lguest_data.irq_enabled = 0; -} - -/* - * Let's pause a moment. Remember how I said these are called so often? - * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to - * break some rules. In particular, these functions are assumed to save their - * own registers if they need to: normal C functions assume they can trash the - * eax register. To use normal C functions, we use - * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the - * C function, then restores it. - */ -PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl); -PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable); -/*:*/ - -/* These are in head_32.S */ -extern void lg_irq_enable(void); -extern void lg_restore_fl(unsigned long flags); - -/*M:003 - * We could be more efficient in our checking of outstanding interrupts, rather - * than using a branch. One way would be to put the "irq_enabled" field in a - * page by itself, and have the Host write-protect it when an interrupt comes - * in when irqs are disabled. There will then be a page fault as soon as - * interrupts are re-enabled. - * - * A better method is to implement soft interrupt disable generally for x86: - * instead of disabling interrupts, we set a flag. If an interrupt does come - * in, we then disable them for real. This is uncommon, so we could simply use - * a hypercall for interrupt control and not worry about efficiency. -:*/ - -/*G:034 - * The Interrupt Descriptor Table (IDT). - * - * The IDT tells the processor what to do when an interrupt comes in. Each - * entry in the table is a 64-bit descriptor: this holds the privilege level, - * address of the handler, and... well, who cares? The Guest just asks the - * Host to make the change anyway, because the Host controls the real IDT. - */ -static void lguest_write_idt_entry(gate_desc *dt, - int entrynum, const gate_desc *g) -{ - /* - * The gate_desc structure is 8 bytes long: we hand it to the Host in - * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors - * around like this; typesafety wasn't a big concern in Linux's early - * years. - */ - u32 *desc = (u32 *)g; - /* Keep the local copy up to date. */ - native_write_idt_entry(dt, entrynum, g); - /* Tell Host about this new entry. */ - hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0); -} - -/* - * Changing to a different IDT is very rare: we keep the IDT up-to-date every - * time it is written, so we can simply loop through all entries and tell the - * Host about them. - */ -static void lguest_load_idt(const struct desc_ptr *desc) -{ - unsigned int i; - struct desc_struct *idt = (void *)desc->address; - - for (i = 0; i < (desc->size+1)/8; i++) - hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0); -} - -/* - * The Global Descriptor Table. - * - * The Intel architecture defines another table, called the Global Descriptor - * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt" - * instruction, and then several other instructions refer to entries in the - * table. There are three entries which the Switcher needs, so the Host simply - * controls the entire thing and the Guest asks it to make changes using the - * LOAD_GDT hypercall. - * - * This is the exactly like the IDT code. - */ -static void lguest_load_gdt(const struct desc_ptr *desc) -{ - unsigned int i; - struct desc_struct *gdt = (void *)desc->address; - - for (i = 0; i < (desc->size+1)/8; i++) - hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0); -} - -/* - * For a single GDT entry which changes, we simply change our copy and - * then tell the host about it. - */ -static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, - const void *desc, int type) -{ - native_write_gdt_entry(dt, entrynum, desc, type); - /* Tell Host about this new entry. */ - hcall(LHCALL_LOAD_GDT_ENTRY, entrynum, - dt[entrynum].a, dt[entrynum].b, 0); -} - -/* - * There are three "thread local storage" GDT entries which change - * on every context switch (these three entries are how glibc implements - * __thread variables). As an optimization, we have a hypercall - * specifically for this case. - * - * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall - * which took a range of entries? - */ -static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) -{ - /* - * There's one problem which normal hardware doesn't have: the Host - * can't handle us removing entries we're currently using. So we clear - * the GS register here: if it's needed it'll be reloaded anyway. - */ - lazy_load_gs(0); - lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); -} - -/*G:038 - * That's enough excitement for now, back to ploughing through each of the - * different pv_ops structures (we're about 1/3 of the way through). - * - * This is the Local Descriptor Table, another weird Intel thingy. Linux only - * uses this for some strange applications like Wine. We don't do anything - * here, so they'll get an informative and friendly Segmentation Fault. - */ -static void lguest_set_ldt(const void *addr, unsigned entries) -{ -} - -/* - * This loads a GDT entry into the "Task Register": that entry points to a - * structure called the Task State Segment. Some comments scattered though the - * kernel code indicate that this used for task switching in ages past, along - * with blood sacrifice and astrology. - * - * Now there's nothing interesting in here that we don't get told elsewhere. - * But the native version uses the "ltr" instruction, which makes the Host - * complain to the Guest about a Segmentation Fault and it'll oops. So we - * override the native version with a do-nothing version. - */ -static void lguest_load_tr_desc(void) -{ -} - -/* - * The "cpuid" instruction is a way of querying both the CPU identity - * (manufacturer, model, etc) and its features. It was introduced before the - * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. - * As you might imagine, after a decade and a half this treatment, it is now a - * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. - * - * This instruction even it has its own Wikipedia entry. The Wikipedia entry - * has been translated into 6 languages. I am not making this up! - * - * We could get funky here and identify ourselves as "GenuineLguest", but - * instead we just use the real "cpuid" instruction. Then I pretty much turned - * off feature bits until the Guest booted. (Don't say that: you'll damage - * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is - * hardly future proof.) No one's listening! They don't like you anyway, - * parenthetic weirdo! - * - * Replacing the cpuid so we can turn features off is great for the kernel, but - * anyone (including userspace) can just use the raw "cpuid" instruction and - * the Host won't even notice since it isn't privileged. So we try not to get - * too worked up about it. - */ -static void lguest_cpuid(unsigned int *ax, unsigned int *bx, - unsigned int *cx, unsigned int *dx) -{ - int function = *ax; - - native_cpuid(ax, bx, cx, dx); - switch (function) { - /* - * CPUID 0 gives the highest legal CPUID number (and the ID string). - * We futureproof our code a little by sticking to known CPUID values. - */ - case 0: - if (*ax > 5) - *ax = 5; - break; - - /* - * CPUID 1 is a basic feature request. - * - * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 - * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. - */ - case 1: - *cx &= 0x00002201; - *dx &= 0x07808151; - /* - * The Host can do a nice optimization if it knows that the - * kernel mappings (addresses above 0xC0000000 or whatever - * PAGE_OFFSET is set to) haven't changed. But Linux calls - * flush_tlb_user() for both user and kernel mappings unless - * the Page Global Enable (PGE) feature bit is set. - */ - *dx |= 0x00002000; - /* - * We also lie, and say we're family id 5. 6 or greater - * leads to a rdmsr in early_init_intel which we can't handle. - * Family ID is returned as bits 8-12 in ax. - */ - *ax &= 0xFFFFF0FF; - *ax |= 0x00000500; - break; - - /* - * This is used to detect if we're running under KVM. We might be, - * but that's a Host matter, not us. So say we're not. - */ - case KVM_CPUID_SIGNATURE: - *bx = *cx = *dx = 0; - break; - - /* - * 0x80000000 returns the highest Extended Function, so we futureproof - * like we do above by limiting it to known fields. - */ - case 0x80000000: - if (*ax > 0x80000008) - *ax = 0x80000008; - break; - - /* - * PAE systems can mark pages as non-executable. Linux calls this the - * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced - * Virus Protection). We just switch it off here, since we don't - * support it. - */ - case 0x80000001: - *dx &= ~(1 << 20); - break; - } -} - -/* - * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. - * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother - * it. The Host needs to know when the Guest wants to change them, so we have - * a whole series of functions like read_cr0() and write_cr0(). - * - * We start with cr0. cr0 allows you to turn on and off all kinds of basic - * features, but the only cr0 bit that Linux ever used at runtime was the - * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8) - * - * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if - * the floating point unit is used. Which allows us to restore FPU state - * lazily after a task switch if we wanted to, but wouldn't a name like - * "FPUTRAP bit" be a little less cryptic? - * - * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore - * cr0. - */ -static void lguest_write_cr0(unsigned long val) -{ -} - -static unsigned long lguest_read_cr0(void) -{ - return 0; -} - -/* - * cr2 is the virtual address of the last page fault, which the Guest only ever - * reads. The Host kindly writes this into our "struct lguest_data", so we - * just read it out of there. - */ -static unsigned long lguest_read_cr2(void) -{ - return lguest_data.cr2; -} - -/* See lguest_set_pte() below. */ -static bool cr3_changed = false; -static unsigned long current_cr3; - -/* - * cr3 is the current toplevel pagetable page: the principle is the same as - * cr0. Keep a local copy, and tell the Host when it changes. - */ -static void lguest_write_cr3(unsigned long cr3) -{ - lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); - current_cr3 = cr3; - - /* These two page tables are simple, linear, and used during boot */ - if (cr3 != __pa_symbol(swapper_pg_dir) && - cr3 != __pa_symbol(initial_page_table)) - cr3_changed = true; -} - -static unsigned long lguest_read_cr3(void) -{ - return current_cr3; -} - -/* cr4 is used to enable and disable PGE, but we don't care. */ -static unsigned long lguest_read_cr4(void) -{ - return 0; -} - -static void lguest_write_cr4(unsigned long val) -{ -} - -/* - * Page Table Handling. - * - * Now would be a good time to take a rest and grab a coffee or similarly - * relaxing stimulant. The easy parts are behind us, and the trek gradually - * winds uphill from here. - * - * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU - * maps virtual addresses to physical addresses using "page tables". We could - * use one huge index of 1 million entries: each address is 4 bytes, so that's - * 1024 pages just to hold the page tables. But since most virtual addresses - * are unused, we use a two level index which saves space. The cr3 register - * contains the physical address of the top level "page directory" page, which - * contains physical addresses of up to 1024 second-level pages. Each of these - * second level pages contains up to 1024 physical addresses of actual pages, - * or Page Table Entries (PTEs). - * - * Here's a diagram, where arrows indicate physical addresses: - * - * cr3 ---> +---------+ - * | --------->+---------+ - * | | | PADDR1 | - * Mid-level | | PADDR2 | - * (PMD) page | | | - * | | Lower-level | - * | | (PTE) page | - * | | | | - * .... .... - * - * So to convert a virtual address to a physical address, we look up the top - * level, which points us to the second level, which gives us the physical - * address of that page. If the top level entry was not present, or the second - * level entry was not present, then the virtual address is invalid (we - * say "the page was not mapped"). - * - * Put another way, a 32-bit virtual address is divided up like so: - * - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>| - * Index into top Index into second Offset within page - * page directory page pagetable page - * - * Now, unfortunately, this isn't the whole story: Intel added Physical Address - * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). - * These are held in 64-bit page table entries, so we can now only fit 512 - * entries in a page, and the neat three-level tree breaks down. - * - * The result is a four level page table: - * - * cr3 --> [ 4 Upper ] - * [ Level ] - * [ Entries ] - * [(PUD Page)]---> +---------+ - * | --------->+---------+ - * | | | PADDR1 | - * Mid-level | | PADDR2 | - * (PMD) page | | | - * | | Lower-level | - * | | (PTE) page | - * | | | | - * .... .... - * - * - * And the virtual address is decoded as: - * - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| - * Index into Index into mid Index into lower Offset within page - * top entries directory page pagetable page - * - * It's too hard to switch between these two formats at runtime, so Linux only - * supports one or the other depending on whether CONFIG_X86_PAE is set. Many - * distributions turn it on, and not just for people with silly amounts of - * memory: the larger PTE entries allow room for the NX bit, which lets the - * kernel disable execution of pages and increase security. - * - * This was a problem for lguest, which couldn't run on these distributions; - * then Matias Zabaljauregui figured it all out and implemented it, and only a - * handful of puppies were crushed in the process! - * - * Back to our point: the kernel spends a lot of time changing both the - * top-level page directory and lower-level pagetable pages. The Guest doesn't - * know physical addresses, so while it maintains these page tables exactly - * like normal, it also needs to keep the Host informed whenever it makes a - * change: the Host will create the real page tables based on the Guests'. - */ - -/* - * The Guest calls this after it has set a second-level entry (pte), ie. to map - * a page into a process' address space. We tell the Host the toplevel and - * address this corresponds to. The Guest uses one pagetable per process, so - * we need to tell the Host which one we're changing (mm->pgd). - */ -static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ -#ifdef CONFIG_X86_PAE - /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ - lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, - ptep->pte_low, ptep->pte_high); -#else - lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); -#endif -} - -/* This is the "set and update" combo-meal-deal version. */ -static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) -{ - native_set_pte(ptep, pteval); - lguest_pte_update(mm, addr, ptep); -} - -/* - * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd - * to set a middle-level entry when PAE is activated. - * - * Again, we set the entry then tell the Host which page we changed, - * and the index of the entry we changed. - */ -#ifdef CONFIG_X86_PAE -static void lguest_set_pud(pud_t *pudp, pud_t pudval) -{ - native_set_pud(pudp, pudval); - - /* 32 bytes aligned pdpt address and the index. */ - lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0, - (__pa(pudp) & 0x1F) / sizeof(pud_t)); -} - -static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); -} -#else - -/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */ -static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); -} -#endif - -/* - * There are a couple of legacy places where the kernel sets a PTE, but we - * don't know the top level any more. This is useless for us, since we don't - * know which pagetable is changing or what address, so we just tell the Host - * to forget all of them. Fortunately, this is very rare. - * - * ... except in early boot when the kernel sets up the initial pagetables, - * which makes booting astonishingly slow: 48 seconds! So we don't even tell - * the Host anything changed until we've done the first real page table switch, - * which brings boot back to 4.3 seconds. - */ -static void lguest_set_pte(pte_t *ptep, pte_t pteval) -{ - native_set_pte(ptep, pteval); - if (cr3_changed) - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -#ifdef CONFIG_X86_PAE -/* - * With 64-bit PTE values, we need to be careful setting them: if we set 32 - * bits at a time, the hardware could see a weird half-set entry. These - * versions ensure we update all 64 bits at once. - */ -static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) -{ - native_set_pte_atomic(ptep, pte); - if (cr3_changed) - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - native_pte_clear(mm, addr, ptep); - lguest_pte_update(mm, addr, ptep); -} - -static void lguest_pmd_clear(pmd_t *pmdp) -{ - lguest_set_pmd(pmdp, __pmd(0)); -} -#endif - -/* - * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on - * native page table operations. On native hardware you can set a new page - * table entry whenever you want, but if you want to remove one you have to do - * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). - * - * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only - * called when a valid entry is written, not when it's removed (ie. marked not - * present). Instead, this is where we come when the Guest wants to remove a - * page table entry: we tell the Host to set that entry to 0 (ie. the present - * bit is zero). - */ -static void lguest_flush_tlb_single(unsigned long addr) -{ - /* Simply set it to zero: if it was not, it will fault back in. */ - lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0); -} - -/* - * This is what happens after the Guest has removed a large number of entries. - * This tells the Host that any of the page table entries for userspace might - * have changed, ie. virtual addresses below PAGE_OFFSET. - */ -static void lguest_flush_tlb_user(void) -{ - lazy_hcall1(LHCALL_FLUSH_TLB, 0); -} - -/* - * This is called when the kernel page tables have changed. That's not very - * common (unless the Guest is using highmem, which makes the Guest extremely - * slow), so it's worth separating this from the user flushing above. - */ -static void lguest_flush_tlb_kernel(void) -{ - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -/* - * The Unadvanced Programmable Interrupt Controller. - * - * This is an attempt to implement the simplest possible interrupt controller. - * I spent some time looking though routines like set_irq_chip_and_handler, - * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and - * I *think* this is as simple as it gets. - * - * We can tell the Host what interrupts we want blocked ready for using the - * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as - * simple as setting a bit. We don't actually "ack" interrupts as such, we - * just mask and unmask them. I wonder if we should be cleverer? - */ -static void disable_lguest_irq(struct irq_data *data) -{ - set_bit(data->irq, lguest_data.blocked_interrupts); -} - -static void enable_lguest_irq(struct irq_data *data) -{ - clear_bit(data->irq, lguest_data.blocked_interrupts); -} - -/* This structure describes the lguest IRQ controller. */ -static struct irq_chip lguest_irq_controller = { - .name = "lguest", - .irq_mask = disable_lguest_irq, - .irq_mask_ack = disable_lguest_irq, - .irq_unmask = enable_lguest_irq, -}; - -/* - * Interrupt descriptors are allocated as-needed, but low-numbered ones are - * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it - * tells us the irq is already used: other errors (ie. ENOMEM) we take - * seriously. - */ -static int lguest_setup_irq(unsigned int irq) -{ - struct irq_desc *desc; - int err; - - /* Returns -ve error or vector number. */ - err = irq_alloc_desc_at(irq, 0); - if (err < 0 && err != -EEXIST) - return err; - - /* - * Tell the Linux infrastructure that the interrupt is - * controlled by our level-based lguest interrupt controller. - */ - irq_set_chip_and_handler_name(irq, &lguest_irq_controller, - handle_level_irq, "level"); - - /* Some systems map "vectors" to interrupts weirdly. Not us! */ - desc = irq_to_desc(irq); - __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc); - return 0; -} - -static int lguest_enable_irq(struct pci_dev *dev) -{ - int err; - u8 line = 0; - - /* We literally use the PCI interrupt line as the irq number. */ - pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line); - err = lguest_setup_irq(line); - if (!err) - dev->irq = line; - return err; -} - -/* We don't do hotplug PCI, so this shouldn't be called. */ -static void lguest_disable_irq(struct pci_dev *dev) -{ - WARN_ON(1); -} - -/* - * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware - * interrupt (except 128, which is used for system calls). - */ -static void __init lguest_init_IRQ(void) -{ - unsigned int i; - - for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) { - if (i != IA32_SYSCALL_VECTOR) - set_intr_gate(i, irq_entries_start + - 8 * (i - FIRST_EXTERNAL_VECTOR)); - } - - /* - * This call is required to set up for 4k stacks, where we have - * separate stacks for hard and soft interrupts. - */ - irq_ctx_init(smp_processor_id()); -} - -/* - * Time. - * - * It would be far better for everyone if the Guest had its own clock, but - * until then the Host gives us the time on every interrupt. - */ -static void lguest_get_wallclock(struct timespec *now) -{ - *now = lguest_data.time; -} - -/* - * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us - * what speed it runs at, or 0 if it's unusable as a reliable clock source. - * This matches what we want here: if we return 0 from this function, the x86 - * TSC clock will give up and not register itself. - */ -static unsigned long lguest_tsc_khz(void) -{ - return lguest_data.tsc_khz; -} - -/* - * If we can't use the TSC, the kernel falls back to our lower-priority - * "lguest_clock", where we read the time value given to us by the Host. - */ -static u64 lguest_clock_read(struct clocksource *cs) -{ - unsigned long sec, nsec; - - /* - * Since the time is in two parts (seconds and nanoseconds), we risk - * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, - * and getting 99 and 0. As Linux tends to come apart under the stress - * of time travel, we must be careful: - */ - do { - /* First we read the seconds part. */ - sec = lguest_data.time.tv_sec; - /* - * This read memory barrier tells the compiler and the CPU that - * this can't be reordered: we have to complete the above - * before going on. - */ - rmb(); - /* Now we read the nanoseconds part. */ - nsec = lguest_data.time.tv_nsec; - /* Make sure we've done that. */ - rmb(); - /* Now if the seconds part has changed, try again. */ - } while (unlikely(lguest_data.time.tv_sec != sec)); - - /* Our lguest clock is in real nanoseconds. */ - return sec*1000000000ULL + nsec; -} - -/* This is the fallback clocksource: lower priority than the TSC clocksource. */ -static struct clocksource lguest_clock = { - .name = "lguest", - .rating = 200, - .read = lguest_clock_read, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -/* - * We also need a "struct clock_event_device": Linux asks us to set it to go - * off some time in the future. Actually, James Morris figured all this out, I - * just applied the patch. - */ -static int lguest_clockevent_set_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - /* FIXME: I don't think this can ever happen, but James tells me he had - * to put this code in. Maybe we should remove it now. Anyone? */ - if (delta < LG_CLOCK_MIN_DELTA) { - if (printk_ratelimit()) - printk(KERN_DEBUG "%s: small delta %lu ns\n", - __func__, delta); - return -ETIME; - } - - /* Please wake us this far in the future. */ - hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0); - return 0; -} - -static int lguest_clockevent_shutdown(struct clock_event_device *evt) -{ - /* A 0 argument shuts the clock down. */ - hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0); - return 0; -} - -/* This describes our primitive timer chip. */ -static struct clock_event_device lguest_clockevent = { - .name = "lguest", - .features = CLOCK_EVT_FEAT_ONESHOT, - .set_next_event = lguest_clockevent_set_next_event, - .set_state_shutdown = lguest_clockevent_shutdown, - .rating = INT_MAX, - .mult = 1, - .shift = 0, - .min_delta_ns = LG_CLOCK_MIN_DELTA, - .min_delta_ticks = LG_CLOCK_MIN_DELTA, - .max_delta_ns = LG_CLOCK_MAX_DELTA, - .max_delta_ticks = LG_CLOCK_MAX_DELTA, -}; - -/* - * This is the Guest timer interrupt handler (hardware interrupt 0). We just - * call the clockevent infrastructure and it does whatever needs doing. - */ -static void lguest_time_irq(struct irq_desc *desc) -{ - unsigned long flags; - - /* Don't interrupt us while this is running. */ - local_irq_save(flags); - lguest_clockevent.event_handler(&lguest_clockevent); - local_irq_restore(flags); -} - -/* - * At some point in the boot process, we get asked to set up our timing - * infrastructure. The kernel doesn't expect timer interrupts before this, but - * we cleverly initialized the "blocked_interrupts" field of "struct - * lguest_data" so that timer interrupts were blocked until now. - */ -static void lguest_time_init(void) -{ - /* Set up the timer interrupt (0) to go to our simple timer routine */ - if (lguest_setup_irq(0) != 0) - panic("Could not set up timer irq"); - irq_set_handler(0, lguest_time_irq); - - clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); - - /* We can't set cpumask in the initializer: damn C limitations! Set it - * here and register our timer device. */ - lguest_clockevent.cpumask = cpumask_of(0); - clockevents_register_device(&lguest_clockevent); - - /* Finally, we unblock the timer interrupt. */ - clear_bit(0, lguest_data.blocked_interrupts); -} - -/* - * Miscellaneous bits and pieces. - * - * Here is an oddball collection of functions which the Guest needs for things - * to work. They're pretty simple. - */ - -/* - * The Guest needs to tell the Host what stack it expects traps to use. For - * native hardware, this is part of the Task State Segment mentioned above in - * lguest_load_tr_desc(), but to help hypervisors there's this special call. - * - * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data - * segment), the privilege level (we're privilege level 1, the Host is 0 and - * will not tolerate us trying to use that), the stack pointer, and the number - * of pages in the stack. - */ -static void lguest_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) -{ - lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, - THREAD_SIZE / PAGE_SIZE); - tss->x86_tss.sp0 = thread->sp0; -} - -/* Let's just say, I wouldn't do debugging under a Guest. */ -static unsigned long lguest_get_debugreg(int regno) -{ - /* FIXME: Implement */ - return 0; -} - -static void lguest_set_debugreg(int regno, unsigned long value) -{ - /* FIXME: Implement */ -} - -/* - * There are times when the kernel wants to make sure that no memory writes are - * caught in the cache (that they've all reached real hardware devices). This - * doesn't matter for the Guest which has virtual hardware. - * - * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush - * (clflush) instruction is available and the kernel uses that. Otherwise, it - * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction. - * Unlike clflush, wbinvd can only be run at privilege level 0. So we can - * ignore clflush, but replace wbinvd. - */ -static void lguest_wbinvd(void) -{ -} - -/* - * If the Guest expects to have an Advanced Programmable Interrupt Controller, - * we play dumb by ignoring writes and returning 0 for reads. So it's no - * longer Programmable nor Controlling anything, and I don't think 8 lines of - * code qualifies for Advanced. It will also never interrupt anything. It - * does, however, allow us to get through the Linux boot code. - */ -#ifdef CONFIG_X86_LOCAL_APIC -static void lguest_apic_write(u32 reg, u32 v) -{ -} - -static u32 lguest_apic_read(u32 reg) -{ - return 0; -} - -static u64 lguest_apic_icr_read(void) -{ - return 0; -} - -static void lguest_apic_icr_write(u32 low, u32 id) -{ - /* Warn to see if there's any stray references */ - WARN_ON(1); -} - -static void lguest_apic_wait_icr_idle(void) -{ - return; -} - -static u32 lguest_apic_safe_wait_icr_idle(void) -{ - return 0; -} - -static void set_lguest_basic_apic_ops(void) -{ - apic->read = lguest_apic_read; - apic->write = lguest_apic_write; - apic->icr_read = lguest_apic_icr_read; - apic->icr_write = lguest_apic_icr_write; - apic->wait_icr_idle = lguest_apic_wait_icr_idle; - apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle; -}; -#endif - -/* STOP! Until an interrupt comes in. */ -static void lguest_safe_halt(void) -{ - hcall(LHCALL_HALT, 0, 0, 0, 0); -} - -/* - * The SHUTDOWN hypercall takes a string to describe what's happening, and - * an argument which says whether this to restart (reboot) the Guest or not. - * - * Note that the Host always prefers that the Guest speak in physical addresses - * rather than virtual addresses, so we use __pa() here. - */ -static void lguest_power_off(void) -{ - hcall(LHCALL_SHUTDOWN, __pa("Power down"), - LGUEST_SHUTDOWN_POWEROFF, 0, 0); -} - -/* - * Panicing. - * - * Don't. But if you did, this is what happens. - */ -static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) -{ - hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0); - /* The hcall won't return, but to keep gcc happy, we're "done". */ - return NOTIFY_DONE; -} - -static struct notifier_block paniced = { - .notifier_call = lguest_panic -}; - -/* Setting up memory is fairly easy. */ -static __init char *lguest_memory_setup(void) -{ - /* - * The Linux bootloader header contains an "e820" memory map: the - * Launcher populated the first entry with our memory limit. - */ - e820__range_add(boot_params.e820_table[0].addr, - boot_params.e820_table[0].size, - boot_params.e820_table[0].type); - - /* This string is for the boot messages. */ - return "LGUEST"; -} - -/* Offset within PCI config space of BAR access capability. */ -static int console_cfg_offset = 0; -static int console_access_cap; - -/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */ -static void set_cfg_window(u32 cfg_offset, u32 off) -{ - write_pci_config_byte(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, bar), - 0); - write_pci_config(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, length), - 4); - write_pci_config(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, offset), - off); -} - -static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val) -{ - /* - * We could set this up once, then leave it; nothing else in the * - * kernel should touch these registers. But if it went wrong, that - * would be a horrible bug to find. - */ - set_cfg_window(cfg_offset, off); - write_pci_config(0, 1, 0, - cfg_offset + sizeof(struct virtio_pci_cap), val); -} - -static void probe_pci_console(void) -{ - u8 cap, common_cap = 0, device_cap = 0; - u32 device_len; - - /* Avoid recursive printk into here. */ - console_cfg_offset = -1; - - if (!early_pci_allowed()) { - printk(KERN_ERR "lguest: early PCI access not allowed!\n"); - return; - } - - /* We expect a console PCI device at BUS0, slot 1. */ - if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) { - printk(KERN_ERR "lguest: PCI device is %#x!\n", - read_pci_config(0, 1, 0, 0)); - return; - } - - /* Find the capabilities we need (must be in bar0) */ - cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST); - while (cap) { - u8 vndr = read_pci_config_byte(0, 1, 0, cap); - if (vndr == PCI_CAP_ID_VNDR) { - u8 type, bar; - - type = read_pci_config_byte(0, 1, 0, - cap + offsetof(struct virtio_pci_cap, cfg_type)); - bar = read_pci_config_byte(0, 1, 0, - cap + offsetof(struct virtio_pci_cap, bar)); - - switch (type) { - case VIRTIO_PCI_CAP_DEVICE_CFG: - if (bar == 0) - device_cap = cap; - break; - case VIRTIO_PCI_CAP_PCI_CFG: - console_access_cap = cap; - break; - } - } - cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT); - } - if (!device_cap || !console_access_cap) { - printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n", - common_cap, device_cap, console_access_cap); - return; - } - - /* - * Note that we can't check features, until we've set the DRIVER - * status bit. We don't want to do that until we have a real driver, - * so we just check that the device-specific config has room for - * emerg_wr. If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE - * it should ignore the access. - */ - device_len = read_pci_config(0, 1, 0, - device_cap + offsetof(struct virtio_pci_cap, length)); - if (device_len < (offsetof(struct virtio_console_config, emerg_wr) - + sizeof(u32))) { - printk(KERN_ERR "lguest: console missing emerg_wr field\n"); - return; - } - - console_cfg_offset = read_pci_config(0, 1, 0, - device_cap + offsetof(struct virtio_pci_cap, offset)); - printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n"); -} - -/* - * We will eventually use the virtio console device to produce console output, - * but before that is set up we use the virtio PCI console's backdoor mmio - * access and the "emergency" write facility (which is legal even before the - * device is configured). - */ -static __init int early_put_chars(u32 vtermno, const char *buf, int count) -{ - /* If we couldn't find PCI console, forget it. */ - if (console_cfg_offset < 0) - return count; - - if (unlikely(!console_cfg_offset)) { - probe_pci_console(); - if (console_cfg_offset < 0) - return count; - } - - write_bar_via_cfg(console_access_cap, - console_cfg_offset - + offsetof(struct virtio_console_config, emerg_wr), - buf[0]); - return 1; -} - -/* - * Rebooting also tells the Host we're finished, but the RESTART flag tells the - * Launcher to reboot us. - */ -static void lguest_restart(char *reason) -{ - hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0); -} - -/*G:050 - * Patching (Powerfully Placating Performance Pedants) - * - * We have already seen that pv_ops structures let us replace simple native - * instructions with calls to the appropriate back end all throughout the - * kernel. This allows the same kernel to run as a Guest and as a native - * kernel, but it's slow because of all the indirect branches. - * - * Remember that David Wheeler quote about "Any problem in computer science can - * be solved with another layer of indirection"? The rest of that quote is - * "... But that usually will create another problem." This is the first of - * those problems. - * - * Our current solution is to allow the paravirt back end to optionally patch - * over the indirect calls to replace them with something more efficient. We - * patch two of the simplest of the most commonly called functions: disable - * interrupts and save interrupts. We usually have 6 or 10 bytes to patch - * into: the Guest versions of these operations are small enough that we can - * fit comfortably. - * - * First we need assembly templates of each of the patchable Guest operations, - * and these are in head_32.S. - */ - -/*G:060 We construct a table from the assembler templates: */ -static const struct lguest_insns -{ - const char *start, *end; -} lguest_insns[] = { - [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, - [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, -}; - -/* - * Now our patch routine is fairly simple (based on the native one in - * paravirt.c). If we have a replacement, we copy it in and return how much of - * the available space we used. - */ -static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, - unsigned long addr, unsigned len) -{ - unsigned int insn_len; - - /* Don't do anything special if we don't have a replacement */ - if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) - return paravirt_patch_default(type, clobber, ibuf, addr, len); - - insn_len = lguest_insns[type].end - lguest_insns[type].start; - - /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ - if (len < insn_len) - return paravirt_patch_default(type, clobber, ibuf, addr, len); - - /* Copy in our instructions. */ - memcpy(ibuf, lguest_insns[type].start, insn_len); - return insn_len; -} - -/*G:029 - * Once we get to lguest_init(), we know we're a Guest. The various - * pv_ops structures in the kernel provide points for (almost) every routine we - * have to override to avoid privileged instructions. - */ -__init void lguest_init(void) -{ - /* We're under lguest. */ - pv_info.name = "lguest"; - /* We're running at privilege level 1, not 0 as normal. */ - pv_info.kernel_rpl = 1; - /* Everyone except Xen runs with this set. */ - pv_info.shared_kernel_pmd = 1; - - /* - * We set up all the lguest overrides for sensitive operations. These - * are detailed with the operations themselves. - */ - - /* Interrupt-related operations */ - pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl); - pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); - pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable); - pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); - pv_irq_ops.safe_halt = lguest_safe_halt; - - /* Setup operations */ - pv_init_ops.patch = lguest_patch; - - /* Intercepts of various CPU instructions */ - pv_cpu_ops.load_gdt = lguest_load_gdt; - pv_cpu_ops.cpuid = lguest_cpuid; - pv_cpu_ops.load_idt = lguest_load_idt; - pv_cpu_ops.iret = lguest_iret; - pv_cpu_ops.load_sp0 = lguest_load_sp0; - pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; - pv_cpu_ops.set_ldt = lguest_set_ldt; - pv_cpu_ops.load_tls = lguest_load_tls; - pv_cpu_ops.get_debugreg = lguest_get_debugreg; - pv_cpu_ops.set_debugreg = lguest_set_debugreg; - pv_cpu_ops.read_cr0 = lguest_read_cr0; - pv_cpu_ops.write_cr0 = lguest_write_cr0; - pv_cpu_ops.read_cr4 = lguest_read_cr4; - pv_cpu_ops.write_cr4 = lguest_write_cr4; - pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; - pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; - pv_cpu_ops.wbinvd = lguest_wbinvd; - pv_cpu_ops.start_context_switch = paravirt_start_context_switch; - pv_cpu_ops.end_context_switch = lguest_end_context_switch; - - /* Pagetable management */ - pv_mmu_ops.write_cr3 = lguest_write_cr3; - pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; - pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; - pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; - pv_mmu_ops.set_pte = lguest_set_pte; - pv_mmu_ops.set_pte_at = lguest_set_pte_at; - pv_mmu_ops.set_pmd = lguest_set_pmd; -#ifdef CONFIG_X86_PAE - pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic; - pv_mmu_ops.pte_clear = lguest_pte_clear; - pv_mmu_ops.pmd_clear = lguest_pmd_clear; - pv_mmu_ops.set_pud = lguest_set_pud; -#endif - pv_mmu_ops.read_cr2 = lguest_read_cr2; - pv_mmu_ops.read_cr3 = lguest_read_cr3; - pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; - pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; - pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu; - pv_mmu_ops.pte_update = lguest_pte_update; - -#ifdef CONFIG_X86_LOCAL_APIC - /* APIC read/write intercepts */ - set_lguest_basic_apic_ops(); -#endif - - x86_init.resources.memory_setup = lguest_memory_setup; - x86_init.irqs.intr_init = lguest_init_IRQ; - x86_init.timers.timer_init = lguest_time_init; - x86_platform.calibrate_tsc = lguest_tsc_khz; - x86_platform.get_wallclock = lguest_get_wallclock; - - /* - * Now is a good time to look at the implementations of these functions - * before returning to the rest of lguest_init(). - */ - - /*G:070 - * Now we've seen all the paravirt_ops, we return to - * lguest_init() where the rest of the fairly chaotic boot setup - * occurs. - */ - - /* - * The stack protector is a weird thing where gcc places a canary - * value on the stack and then checks it on return. This file is - * compiled with -fno-stack-protector it, so we got this far without - * problems. The value of the canary is kept at offset 20 from the - * %gs register, so we need to set that up before calling C functions - * in other files. - */ - setup_stack_canary_segment(0); - - /* - * We could just call load_stack_canary_segment(), but we might as well - * call switch_to_new_gdt() which loads the whole table and sets up the - * per-cpu segment descriptor register %fs as well. - */ - switch_to_new_gdt(0); - - /* - * The Host<->Guest Switcher lives at the top of our address space, and - * the Host told us how big it is when we made LGUEST_INIT hypercall: - * it put the answer in lguest_data.reserve_mem - */ - reserve_top_address(lguest_data.reserve_mem); - - /* Hook in our special panic hypercall code. */ - atomic_notifier_chain_register(&panic_notifier_list, &paniced); - - /* - * This is messy CPU setup stuff which the native boot code does before - * start_kernel, so we have to do, too: - */ - cpu_detect(&new_cpu_data); - /* head.S usually sets up the first capability word, so do it here. */ - new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); - - /* Math is always hard! */ - set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); - - /* We don't have features. We have puppies! Puppies! */ -#ifdef CONFIG_X86_MCE - mca_cfg.disabled = true; -#endif -#ifdef CONFIG_ACPI - acpi_disabled = 1; -#endif - - /* - * We set the preferred console to "hvc". This is the "hypervisor - * virtual console" driver written by the PowerPC people, which we also - * adapted for lguest's use. - */ - add_preferred_console("hvc", 0, NULL); - - /* Register our very early console. */ - virtio_cons_early_init(early_put_chars); - - /* Don't let ACPI try to control our PCI interrupts. */ - disable_acpi(); - - /* We control them ourselves, by overriding these two hooks. */ - pcibios_enable_irq = lguest_enable_irq; - pcibios_disable_irq = lguest_disable_irq; - - /* - * Last of all, we set the power management poweroff hook to point to - * the Guest routine to power off, and the reboot hook to our restart - * routine. - */ - pm_power_off = lguest_power_off; - machine_ops.restart = lguest_restart; - - /* - * Now we're set up, call i386_start_kernel() in head32.c and we proceed - * to boot as normal. It never returns. - */ - i386_start_kernel(); -} -/* - * This marks the end of stage II of our journey, The Guest. - * - * It is now time for us to explore the layer of virtual drivers and complete - * our understanding of the Guest in "make Drivers". - */ diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S deleted file mode 100644 index d5ae63f5ec5d..000000000000 --- a/arch/x86/lguest/head_32.S +++ /dev/null @@ -1,192 +0,0 @@ -#include <linux/linkage.h> -#include <linux/lguest.h> -#include <asm/lguest_hcall.h> -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> -#include <asm/processor-flags.h> - -/*G:020 - - * Our story starts with the bzImage: booting starts at startup_32 in - * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real - * kernel in place and then jumps into it: startup_32 in - * arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi - * register, which is created by the bootloader (the Launcher in our case). - * - * The startup_32 function does very little: it clears the uninitialized global - * C variables which we expect to be zero (ie. BSS) and then copies the boot - * header and kernel command line somewhere safe, and populates some initial - * page tables. Finally it checks the 'hardware_subarch' field. This was - * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's - * assigned number), then it calls us here. - * - * WARNING: be very careful here! We're running at addresses equal to physical - * addresses (around 0), not above PAGE_OFFSET as most code expects - * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any - * data without remembering to subtract __PAGE_OFFSET! - * - * The .section line puts this code in .init.text so it will be discarded after - * boot. - */ -.section .init.text, "ax", @progbits -ENTRY(lguest_entry) - /* - * We make the "initialization" hypercall now to tell the Host where - * our lguest_data struct is. - */ - movl $LHCALL_LGUEST_INIT, %eax - movl $lguest_data - __PAGE_OFFSET, %ebx - int $LGUEST_TRAP_ENTRY - - /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */ - movl $LHCALL_NEW_PGTABLE, %eax - movl $(initial_page_table - __PAGE_OFFSET), %ebx - int $LGUEST_TRAP_ENTRY - - /* Set up the initial stack so we can run C code. */ - movl $(init_thread_union+THREAD_SIZE),%esp - - /* Jumps are relative: we're running __PAGE_OFFSET too low. */ - jmp lguest_init+__PAGE_OFFSET - -/*G:055 - * We create a macro which puts the assembler code between lgstart_ and lgend_ - * markers. These templates are put in the .text section: they can't be - * discarded after boot as we may need to patch modules, too. - */ -.text -#define LGUEST_PATCH(name, insns...) \ - lgstart_##name: insns; lgend_##name:; \ - .globl lgstart_##name; .globl lgend_##name - -LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) -LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) - -/*G:033 - * But using those wrappers is inefficient (we'll see why that doesn't matter - * for save_fl and irq_disable later). If we write our routines carefully in - * assembler, we can avoid clobbering any registers and avoid jumping through - * the wrapper functions. - * - * I skipped over our first piece of assembler, but this one is worth studying - * in a bit more detail so I'll describe in easy stages. First, the routine to - * enable interrupts: - */ -ENTRY(lg_irq_enable) - /* - * The reverse of irq_disable, this sets lguest_data.irq_enabled to - * X86_EFLAGS_IF (ie. "Interrupts enabled"). - */ - movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled - /* - * But now we need to check if the Host wants to know: there might have - * been interrupts waiting to be delivered, in which case it will have - * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we - * jump to send_interrupts, otherwise we're done. - */ - cmpl $0, lguest_data+LGUEST_DATA_irq_pending - jnz send_interrupts - /* - * One cool thing about x86 is that you can do many things without using - * a register. In this case, the normal path hasn't needed to save or - * restore any registers at all! - */ - ret -send_interrupts: - /* - * OK, now we need a register: eax is used for the hypercall number, - * which is LHCALL_SEND_INTERRUPTS. - * - * We used not to bother with this pending detection at all, which was - * much simpler. Sooner or later the Host would realize it had to - * send us an interrupt. But that turns out to make performance 7 - * times worse on a simple tcp benchmark. So now we do this the hard - * way. - */ - pushl %eax - movl $LHCALL_SEND_INTERRUPTS, %eax - /* This is the actual hypercall trap. */ - int $LGUEST_TRAP_ENTRY - /* Put eax back the way we found it. */ - popl %eax - ret - -/* - * Finally, the "popf" or "restore flags" routine. The %eax register holds the - * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're - * enabling interrupts again, if it's 0 we're leaving them off. - */ -ENTRY(lg_restore_fl) - /* This is just "lguest_data.irq_enabled = flags;" */ - movl %eax, lguest_data+LGUEST_DATA_irq_enabled - /* - * Now, if the %eax value has enabled interrupts and - * lguest_data.irq_pending is set, we want to tell the Host so it can - * deliver any outstanding interrupts. Fortunately, both values will - * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" - * instruction will AND them together for us. If both are set, we - * jump to send_interrupts. - */ - testl lguest_data+LGUEST_DATA_irq_pending, %eax - jnz send_interrupts - /* Again, the normal path has used no extra registers. Clever, huh? */ - ret -/*:*/ - -/* These demark the EIP where host should never deliver interrupts. */ -.global lguest_noirq_iret - -/*M:004 - * When the Host reflects a trap or injects an interrupt into the Guest, it - * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, - * so the Guest iret logic does the right thing when restoring it. However, - * when the Host sets the Guest up for direct traps, such as system calls, the - * processor is the one to push eflags onto the stack, and the interrupt bit - * will be 1 (in reality, interrupts are always enabled in the Guest). - * - * This turns out to be harmless: the only trap which should happen under Linux - * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc - * regions), which has to be reflected through the Host anyway. If another - * trap *does* go off when interrupts are disabled, the Guest will panic, and - * we'll never get to this iret! -:*/ - -/*G:045 - * There is one final paravirt_op that the Guest implements, and glancing at it - * you can see why I left it to last. It's *cool*! It's in *assembler*! - * - * The "iret" instruction is used to return from an interrupt or trap. The - * stack looks like this: - * old address - * old code segment & privilege level - * old processor flags ("eflags") - * - * The "iret" instruction pops those values off the stack and restores them all - * at once. The only problem is that eflags includes the Interrupt Flag which - * the Guest can't change: the CPU will simply ignore it when we do an "iret". - * So we have to copy eflags from the stack to lguest_data.irq_enabled before - * we do the "iret". - * - * There are two problems with this: firstly, we can't clobber any registers - * and secondly, the whole thing needs to be atomic. The first problem - * is solved by using "push memory"/"pop memory" instruction pair for copying. - * - * The second is harder: copying eflags to lguest_data.irq_enabled will turn - * interrupts on before we're finished, so we could be interrupted before we - * return to userspace or wherever. Our solution to this is to tell the - * Host that it is *never* to interrupt us there, even if interrupts seem to be - * enabled. (It's not necessary to protect pop instruction, since - * data gets updated only after it completes, so we only need to protect - * one instruction, iret). - */ -ENTRY(lguest_iret) - pushl 2*4(%esp) - /* - * Note the %ss: segment prefix here. Normal data accesses use the - * "ds" segment, but that will have already been restored for whatever - * we're returning to (such as userspace): we can't trust it. The %ss: - * prefix makes sure we use the stack segment, which is still valid. - */ - popl %ss:lguest_data+LGUEST_DATA_irq_enabled -lguest_noirq_iret: - iret diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 5cc78bf57232..3261abb21ef4 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size, return 0; /* Buffer overrun */ } +/* + * Find a non-boolean option (i.e. option=argument). In accordance with + * standard Linux practice, if this option is repeated, this returns the + * last instance on the command line. + * + * @cmdline: the cmdline string + * @max_cmdline_size: the maximum size of cmdline + * @option: option string to look for + * @buffer: memory buffer to return the option argument + * @bufsize: size of the supplied memory buffer + * + * Returns the length of the argument (regardless of if it was + * truncated to fit in the buffer), or -1 on not found. + */ +static int +__cmdline_find_option(const char *cmdline, int max_cmdline_size, + const char *option, char *buffer, int bufsize) +{ + char c; + int pos = 0, len = -1; + const char *opptr = NULL; + char *bufptr = buffer; + enum { + st_wordstart = 0, /* Start of word/after whitespace */ + st_wordcmp, /* Comparing this word */ + st_wordskip, /* Miscompare, skip */ + st_bufcpy, /* Copying this to buffer */ + } state = st_wordstart; + + if (!cmdline) + return -1; /* No command line */ + + /* + * This 'pos' check ensures we do not overrun + * a non-NULL-terminated 'cmdline' + */ + while (pos++ < max_cmdline_size) { + c = *(char *)cmdline++; + if (!c) + break; + + switch (state) { + case st_wordstart: + if (myisspace(c)) + break; + + state = st_wordcmp; + opptr = option; + /* fall through */ + + case st_wordcmp: + if ((c == '=') && !*opptr) { + /* + * We matched all the way to the end of the + * option we were looking for, prepare to + * copy the argument. + */ + len = 0; + bufptr = buffer; + state = st_bufcpy; + break; + } else if (c == *opptr++) { + /* + * We are currently matching, so continue + * to the next character on the cmdline. + */ + break; + } + state = st_wordskip; + /* fall through */ + + case st_wordskip: + if (myisspace(c)) + state = st_wordstart; + break; + + case st_bufcpy: + if (myisspace(c)) { + state = st_wordstart; + } else { + /* + * Increment len, but don't overrun the + * supplied buffer and leave room for the + * NULL terminator. + */ + if (++len < bufsize) + *bufptr++ = c; + } + break; + } + } + + if (bufsize) + *bufptr = '\0'; + + return len; +} + int cmdline_find_option_bool(const char *cmdline, const char *option) { return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); } + +int cmdline_find_option(const char *cmdline, const char *option, char *buffer, + int bufsize) +{ + return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, + buffer, bufsize); +} diff --git a/arch/x86/math-emu/div_Xsig.S b/arch/x86/math-emu/div_Xsig.S index f77ba3058b31..066996dba6a2 100644 --- a/arch/x86/math-emu/div_Xsig.S +++ b/arch/x86/math-emu/div_Xsig.S @@ -363,3 +363,4 @@ L_bugged_2: pop %ebx jmp L_exit #endif /* PARANOID */ +ENDPROC(div_Xsig) diff --git a/arch/x86/math-emu/div_small.S b/arch/x86/math-emu/div_small.S index 47099628fa4c..2c71527bd917 100644 --- a/arch/x86/math-emu/div_small.S +++ b/arch/x86/math-emu/div_small.S @@ -44,4 +44,4 @@ ENTRY(FPU_div_small) leave ret - +ENDPROC(FPU_div_small) diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 0203baefb5c0..d4a7df2205b8 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -147,7 +147,7 @@ void math_emulate(struct math_emu_info *info) } code_descriptor = FPU_get_ldt_descriptor(FPU_CS); - if (SEG_D_SIZE(code_descriptor)) { + if (code_descriptor.d) { /* The above test may be wrong, the book is not clear */ /* Segmented 32 bit protected mode */ addr_modes.default_mode = SEG32; @@ -155,11 +155,10 @@ void math_emulate(struct math_emu_info *info) /* 16 bit protected mode */ addr_modes.default_mode = PM16; } - FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor); - code_limit = code_base - + (SEG_LIMIT(code_descriptor) + - 1) * SEG_GRANULARITY(code_descriptor) - - 1; + FPU_EIP += code_base = seg_get_base(&code_descriptor); + code_limit = seg_get_limit(&code_descriptor) + 1; + code_limit *= seg_get_granularity(&code_descriptor); + code_limit += code_base - 1; if (code_limit < code_base) code_limit = 0xffffffff; } diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index a179254a5122..699f329f1d40 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -34,17 +34,43 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg) return ret; } -#define SEG_D_SIZE(x) ((x).b & (3 << 21)) -#define SEG_G_BIT(x) ((x).b & (1 << 23)) -#define SEG_GRANULARITY(x) (((x).b & (1 << 23)) ? 4096 : 1) -#define SEG_286_MODE(x) ((x).b & ( 0xff000000 | 0xf0000 | (1 << 23))) -#define SEG_BASE_ADDR(s) (((s).b & 0xff000000) \ - | (((s).b & 0xff) << 16) | ((s).a >> 16)) -#define SEG_LIMIT(s) (((s).b & 0xff0000) | ((s).a & 0xffff)) -#define SEG_EXECUTE_ONLY(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 11)) -#define SEG_WRITE_PERM(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 9)) -#define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \ - == (1 << 10)) +#define SEG_TYPE_WRITABLE (1U << 1) +#define SEG_TYPE_EXPANDS_DOWN (1U << 2) +#define SEG_TYPE_EXECUTE (1U << 3) +#define SEG_TYPE_EXPAND_MASK (SEG_TYPE_EXPANDS_DOWN | SEG_TYPE_EXECUTE) +#define SEG_TYPE_EXECUTE_MASK (SEG_TYPE_WRITABLE | SEG_TYPE_EXECUTE) + +static inline unsigned long seg_get_base(struct desc_struct *d) +{ + unsigned long base = (unsigned long)d->base2 << 24; + + return base | ((unsigned long)d->base1 << 16) | d->base0; +} + +static inline unsigned long seg_get_limit(struct desc_struct *d) +{ + return ((unsigned long)d->limit1 << 16) | d->limit0; +} + +static inline unsigned long seg_get_granularity(struct desc_struct *d) +{ + return d->g ? 4096 : 1; +} + +static inline bool seg_expands_down(struct desc_struct *d) +{ + return (d->type & SEG_TYPE_EXPAND_MASK) == SEG_TYPE_EXPANDS_DOWN; +} + +static inline bool seg_execute_only(struct desc_struct *d) +{ + return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_EXECUTE; +} + +static inline bool seg_writable(struct desc_struct *d) +{ + return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE; +} #define I387 (¤t->thread.fpu.state) #define FPU_info (I387->soft.info) diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index b8ef9f9d2ffc..c48967c6a0e2 100644 --- a/arch/x86/math-emu/get_address.c +++ b/arch/x86/math-emu/get_address.c @@ -159,17 +159,18 @@ static long pm_address(u_char FPU_modrm, u_char segment, } descriptor = FPU_get_ldt_descriptor(addr->selector); - base_address = SEG_BASE_ADDR(descriptor); + base_address = seg_get_base(&descriptor); address = base_address + offset; - limit = base_address - + (SEG_LIMIT(descriptor) + 1) * SEG_GRANULARITY(descriptor) - 1; + limit = seg_get_limit(&descriptor) + 1; + limit *= seg_get_granularity(&descriptor); + limit += base_address - 1; if (limit < base_address) limit = 0xffffffff; - if (SEG_EXPAND_DOWN(descriptor)) { - if (SEG_G_BIT(descriptor)) + if (seg_expands_down(&descriptor)) { + if (descriptor.g) { seg_top = 0xffffffff; - else { + } else { seg_top = base_address + (1 << 20); if (seg_top < base_address) seg_top = 0xffffffff; @@ -182,8 +183,8 @@ static long pm_address(u_char FPU_modrm, u_char segment, (address > limit) || (address < base_address) ? 0 : ((limit - address) >= 254 ? 255 : limit - address + 1); } - if (SEG_EXECUTE_ONLY(descriptor) || - (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT))) { + if (seg_execute_only(&descriptor) || + (!seg_writable(&descriptor) && (FPU_modrm & FPU_WRITE_BIT))) { access_limit = 0; } return address; diff --git a/arch/x86/math-emu/mul_Xsig.S b/arch/x86/math-emu/mul_Xsig.S index 717785a53eb4..22e0631bb85a 100644 --- a/arch/x86/math-emu/mul_Xsig.S +++ b/arch/x86/math-emu/mul_Xsig.S @@ -62,6 +62,7 @@ ENTRY(mul32_Xsig) popl %esi leave ret +ENDPROC(mul32_Xsig) ENTRY(mul64_Xsig) @@ -114,6 +115,7 @@ ENTRY(mul64_Xsig) popl %esi leave ret +ENDPROC(mul64_Xsig) @@ -173,4 +175,4 @@ ENTRY(mul_Xsig_Xsig) popl %esi leave ret - +ENDPROC(mul_Xsig_Xsig) diff --git a/arch/x86/math-emu/polynom_Xsig.S b/arch/x86/math-emu/polynom_Xsig.S index 17315c89ff3d..a9aaf414135d 100644 --- a/arch/x86/math-emu/polynom_Xsig.S +++ b/arch/x86/math-emu/polynom_Xsig.S @@ -133,3 +133,4 @@ L_accum_done: popl %esi leave ret +ENDPROC(polynomial_Xsig) diff --git a/arch/x86/math-emu/reg_norm.S b/arch/x86/math-emu/reg_norm.S index 8b6352efceef..53ac1a343c69 100644 --- a/arch/x86/math-emu/reg_norm.S +++ b/arch/x86/math-emu/reg_norm.S @@ -94,6 +94,7 @@ L_overflow: call arith_overflow pop %ebx jmp L_exit +ENDPROC(FPU_normalize) @@ -145,3 +146,4 @@ L_exit_nuo_zero: popl %ebx leave ret +ENDPROC(FPU_normalize_nuo) diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S index d1d4e48b4f67..41af5b208d88 100644 --- a/arch/x86/math-emu/reg_round.S +++ b/arch/x86/math-emu/reg_round.S @@ -706,3 +706,5 @@ L_exception_exit: mov $-1,%eax jmp fpu_reg_round_special_exit #endif /* PARANOID */ + +ENDPROC(FPU_round) diff --git a/arch/x86/math-emu/reg_u_add.S b/arch/x86/math-emu/reg_u_add.S index 47c4c2434d85..3b1bc5e9b2f6 100644 --- a/arch/x86/math-emu/reg_u_add.S +++ b/arch/x86/math-emu/reg_u_add.S @@ -165,3 +165,4 @@ L_exit: leave ret #endif /* PARANOID */ +ENDPROC(FPU_u_add) diff --git a/arch/x86/math-emu/reg_u_div.S b/arch/x86/math-emu/reg_u_div.S index cc00654b6f9a..796eb5ab921b 100644 --- a/arch/x86/math-emu/reg_u_div.S +++ b/arch/x86/math-emu/reg_u_div.S @@ -469,3 +469,5 @@ L_exit: leave ret #endif /* PARANOID */ + +ENDPROC(FPU_u_div) diff --git a/arch/x86/math-emu/reg_u_mul.S b/arch/x86/math-emu/reg_u_mul.S index 973f12af97df..6196f68cf3c1 100644 --- a/arch/x86/math-emu/reg_u_mul.S +++ b/arch/x86/math-emu/reg_u_mul.S @@ -146,3 +146,4 @@ L_exit: ret #endif /* PARANOID */ +ENDPROC(FPU_u_mul) diff --git a/arch/x86/math-emu/reg_u_sub.S b/arch/x86/math-emu/reg_u_sub.S index 1b6c24801d22..d115b900919a 100644 --- a/arch/x86/math-emu/reg_u_sub.S +++ b/arch/x86/math-emu/reg_u_sub.S @@ -270,3 +270,4 @@ L_exit: popl %esi leave ret +ENDPROC(FPU_u_sub) diff --git a/arch/x86/math-emu/round_Xsig.S b/arch/x86/math-emu/round_Xsig.S index bbe0e87718e4..87c99749a495 100644 --- a/arch/x86/math-emu/round_Xsig.S +++ b/arch/x86/math-emu/round_Xsig.S @@ -78,7 +78,7 @@ L_exit: popl %ebx leave ret - +ENDPROC(round_Xsig) @@ -138,4 +138,4 @@ L_n_exit: popl %ebx leave ret - +ENDPROC(norm_Xsig) diff --git a/arch/x86/math-emu/shr_Xsig.S b/arch/x86/math-emu/shr_Xsig.S index 31cdd118e918..c8552edeec75 100644 --- a/arch/x86/math-emu/shr_Xsig.S +++ b/arch/x86/math-emu/shr_Xsig.S @@ -85,3 +85,4 @@ L_more_than_95: popl %esi leave ret +ENDPROC(shr_Xsig) diff --git a/arch/x86/math-emu/wm_shrx.S b/arch/x86/math-emu/wm_shrx.S index 518428317985..340dd6897f85 100644 --- a/arch/x86/math-emu/wm_shrx.S +++ b/arch/x86/math-emu/wm_shrx.S @@ -92,6 +92,7 @@ L_more_than_95: popl %esi leave ret +ENDPROC(FPU_shrx) /*---------------------------------------------------------------------------+ @@ -202,3 +203,4 @@ Ls_more_than_95: popl %esi leave ret +ENDPROC(FPU_shrxs) diff --git a/arch/x86/math-emu/wm_sqrt.S b/arch/x86/math-emu/wm_sqrt.S index d258f59564e1..695afae38fdf 100644 --- a/arch/x86/math-emu/wm_sqrt.S +++ b/arch/x86/math-emu/wm_sqrt.S @@ -468,3 +468,4 @@ sqrt_more_prec_large: /* Our estimate is too large */ movl $0x7fffff00,%eax jmp sqrt_round_result +ENDPROC(wm_sqrt) diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 0fbdcb64f9f8..72bf8c01c6e3 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -39,3 +39,5 @@ obj-$(CONFIG_X86_INTEL_MPX) += mpx.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 0470826d2bdc..5e3ac6fe6c9e 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -13,12 +13,12 @@ */ #include <linux/debugfs.h> +#include <linux/kasan.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/seq_file.h> -#include <asm/kasan.h> #include <asm/pgtable.h> /* @@ -138,7 +138,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) { pgprotval_t pr = pgprot_val(prot); static const char * const level_name[] = - { "cr3", "pgd", "pud", "pmd", "pte" }; + { "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; if (!pgprot_val(prot)) { /* Not present */ @@ -162,12 +162,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) pt_dump_cont_printf(m, dmsg, " "); /* Bit 7 has a different meaning on level 3 vs 4 */ - if (level <= 3 && pr & _PAGE_PSE) + if (level <= 4 && pr & _PAGE_PSE) pt_dump_cont_printf(m, dmsg, "PSE "); else pt_dump_cont_printf(m, dmsg, " "); - if ((level == 4 && pr & _PAGE_PAT) || - ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) + if ((level == 5 && pr & _PAGE_PAT) || + ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE)) pt_dump_cont_printf(m, dmsg, "PAT "); else pt_dump_cont_printf(m, dmsg, " "); @@ -188,11 +188,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) */ static unsigned long normalize_addr(unsigned long u) { -#ifdef CONFIG_X86_64 - return (signed long)(u << 16) >> 16; -#else - return u; -#endif + int shift; + if (!IS_ENABLED(CONFIG_X86_64)) + return u; + + shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); + return (signed long)(u << shift) >> shift; } /* @@ -297,32 +298,62 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, for (i = 0; i < PTRS_PER_PTE; i++) { prot = pte_flags(*start); st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); - note_page(m, st, __pgprot(prot), 4); + note_page(m, st, __pgprot(prot), 5); start++; } } +#ifdef CONFIG_KASAN + +/* + * This is an optimization for KASAN=y case. Since all kasan page tables + * eventually point to the kasan_zero_page we could call note_page() + * right away without walking through lower level page tables. This saves + * us dozens of seconds (minutes for 5-level config) while checking for + * W+X mapping or reading kernel_page_tables debugfs file. + */ +static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, + void *pt) +{ + if (__pa(pt) == __pa(kasan_zero_pmd) || +#ifdef CONFIG_X86_5LEVEL + __pa(pt) == __pa(kasan_zero_p4d) || +#endif + __pa(pt) == __pa(kasan_zero_pud)) { + pgprotval_t prot = pte_flags(kasan_zero_pte[0]); + note_page(m, st, __pgprot(prot), 5); + return true; + } + return false; +} +#else +static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, + void *pt) +{ + return false; +} +#endif #if PTRS_PER_PMD > 1 static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) { int i; - pmd_t *start; + pmd_t *start, *pmd_start; pgprotval_t prot; - start = (pmd_t *)pud_page_vaddr(addr); + pmd_start = start = (pmd_t *)pud_page_vaddr(addr); for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { if (pmd_large(*start) || !pmd_present(*start)) { prot = pmd_flags(*start); - note_page(m, st, __pgprot(prot), 3); - } else { + note_page(m, st, __pgprot(prot), 4); + } else if (!kasan_page_table(m, st, pmd_start)) { walk_pte_level(m, st, *start, P + i * PMD_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 3); + note_page(m, st, __pgprot(0), 4); start++; } } @@ -335,39 +366,27 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, #if PTRS_PER_PUD > 1 -/* - * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y - * KASAN fills page tables with the same values. Since there is no - * point in checking page table more than once we just skip repeated - * entries. This saves us dozens of seconds during boot. - */ -static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx) -{ - return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud)); -} - static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P) { int i; - pud_t *start; + pud_t *start, *pud_start; pgprotval_t prot; pud_t *prev_pud = NULL; - start = (pud_t *)p4d_page_vaddr(addr); + pud_start = start = (pud_t *)p4d_page_vaddr(addr); for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); - if (!pud_none(*start) && - !pud_already_checked(prev_pud, start, st->check_wx)) { + if (!pud_none(*start)) { if (pud_large(*start) || !pud_present(*start)) { prot = pud_flags(*start); - note_page(m, st, __pgprot(prot), 2); - } else { + note_page(m, st, __pgprot(prot), 3); + } else if (!kasan_page_table(m, st, pud_start)) { walk_pmd_level(m, st, *start, P + i * PUD_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 2); + note_page(m, st, __pgprot(0), 3); prev_pud = start; start++; @@ -385,10 +404,10 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) { int i; - p4d_t *start; + p4d_t *start, *p4d_start; pgprotval_t prot; - start = (p4d_t *)pgd_page_vaddr(addr); + p4d_start = start = (p4d_t *)pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_P4D; i++) { st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); @@ -396,7 +415,7 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, if (p4d_large(*start) || !p4d_present(*start)) { prot = p4d_flags(*start); note_page(m, st, __pgprot(prot), 2); - } else { + } else if (!kasan_page_table(m, st, p4d_start)) { walk_pud_level(m, st, *start, P + i * P4D_LEVEL_MULT); } diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 0ea8afcb929c..c076f710de4c 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -36,6 +36,48 @@ bool ex_handler_fault(const struct exception_table_entry *fixup, } EXPORT_SYMBOL_GPL(ex_handler_fault); +/* + * Handler for UD0 exception following a failed test against the + * result of a refcount inc/dec/add/sub. + */ +bool ex_handler_refcount(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + /* First unconditionally saturate the refcount. */ + *(int *)regs->cx = INT_MIN / 2; + + /* + * Strictly speaking, this reports the fixup destination, not + * the fault location, and not the actually overflowing + * instruction, which is the instruction before the "js", but + * since that instruction could be a variety of lengths, just + * report the location after the overflow, which should be close + * enough for finding the overflow, as it's at least back in + * the function, having returned from .text.unlikely. + */ + regs->ip = ex_fixup_addr(fixup); + + /* + * This function has been called because either a negative refcount + * value was seen by any of the refcount functions, or a zero + * refcount value was seen by refcount_dec(). + * + * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result + * wrapped around) will be set. Additionally, seeing the refcount + * reach 0 will set ZF (Zero Flag: result was zero). In each of + * these cases we want a report, since it's a boundary condition. + * + */ + if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) { + bool zero = regs->flags & X86_EFLAGS_ZF; + + refcount_error_report(regs, zero ? "hit zero" : "overflow"); + } + + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_refcount); + bool ex_handler_ext(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { @@ -142,7 +184,7 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) * undefined. I'm not sure which CPUs do this, but at least * the 486 DX works this way. */ - if ((regs->cs & 0xFFFF) != __KERNEL_CS) + if (regs->cs != __KERNEL_CS) goto fail; /* diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2a1fa10c6a98..b836a7274e12 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -396,14 +396,18 @@ static void dump_pagetable(unsigned long address) pte_t *pte; #ifdef CONFIG_X86_PAE - printk("*pdpt = %016Lx ", pgd_val(*pgd)); + pr_info("*pdpt = %016Lx ", pgd_val(*pgd)); if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; +#define pr_pde pr_cont +#else +#define pr_pde pr_info #endif p4d = p4d_offset(pgd, address); pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); - printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); + pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); +#undef pr_pde /* * We must not directly access the pte in the highpte @@ -415,9 +419,9 @@ static void dump_pagetable(unsigned long address) goto out; pte = pte_offset_kernel(pmd, address); - printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); + pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); out: - printk("\n"); + pr_cont("\n"); } #else /* CONFIG_X86_64: */ @@ -565,7 +569,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(pgd)) goto bad; - printk("PGD %lx ", pgd_val(*pgd)); + pr_info("PGD %lx ", pgd_val(*pgd)); if (!pgd_present(*pgd)) goto out; @@ -574,7 +578,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(p4d)) goto bad; - printk("P4D %lx ", p4d_val(*p4d)); + pr_cont("P4D %lx ", p4d_val(*p4d)); if (!p4d_present(*p4d) || p4d_large(*p4d)) goto out; @@ -582,7 +586,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(pud)) goto bad; - printk("PUD %lx ", pud_val(*pud)); + pr_cont("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_large(*pud)) goto out; @@ -590,7 +594,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(pmd)) goto bad; - printk("PMD %lx ", pmd_val(*pmd)); + pr_cont("PMD %lx ", pmd_val(*pmd)); if (!pmd_present(*pmd) || pmd_large(*pmd)) goto out; @@ -598,12 +602,12 @@ static void dump_pagetable(unsigned long address) if (bad_address(pte)) goto bad; - printk("PTE %lx", pte_val(*pte)); + pr_cont("PTE %lx", pte_val(*pte)); out: - printk("\n"); + pr_cont("\n"); return; bad: - printk("BAD\n"); + pr_info("BAD\n"); } #endif /* CONFIG_X86_64 */ @@ -1254,10 +1258,6 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. - * - * This function must have noinline because both callers - * {,trace_}do_page_fault() have notrace on. Having this an actual function - * guarantees there's a function trace entry. */ static noinline void __do_page_fault(struct pt_regs *regs, unsigned long error_code, @@ -1490,27 +1490,6 @@ good_area: } NOKPROBE_SYMBOL(__do_page_fault); -dotraplinkage void notrace -do_page_fault(struct pt_regs *regs, unsigned long error_code) -{ - unsigned long address = read_cr2(); /* Get the faulting address */ - enum ctx_state prev_state; - - /* - * We must have this function tagged with __kprobes, notrace and call - * read_cr2() before calling anything else. To avoid calling any kind - * of tracing machinery before we've observed the CR2 value. - * - * exception_{enter,exit}() contain all sorts of tracepoints. - */ - - prev_state = exception_enter(); - __do_page_fault(regs, error_code, address); - exception_exit(prev_state); -} -NOKPROBE_SYMBOL(do_page_fault); - -#ifdef CONFIG_TRACING static nokprobe_inline void trace_page_fault_entries(unsigned long address, struct pt_regs *regs, unsigned long error_code) @@ -1521,22 +1500,24 @@ trace_page_fault_entries(unsigned long address, struct pt_regs *regs, trace_page_fault_kernel(address, regs, error_code); } +/* + * We must have this function blacklisted from kprobes, tagged with notrace + * and call read_cr2() before calling anything else. To avoid calling any + * kind of tracing machinery before we've observed the CR2 value. + * + * exception_{enter,exit}() contains all sorts of tracepoints. + */ dotraplinkage void notrace -trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) +do_page_fault(struct pt_regs *regs, unsigned long error_code) { - /* - * The exception_enter and tracepoint processing could - * trigger another page faults (user space callchain - * reading) and destroy the original cr2 value, so read - * the faulting address now. - */ - unsigned long address = read_cr2(); + unsigned long address = read_cr2(); /* Get the faulting address */ enum ctx_state prev_state; prev_state = exception_enter(); - trace_page_fault_entries(address, regs, error_code); + if (trace_pagefault_enabled()) + trace_page_fault_entries(address, regs, error_code); + __do_page_fault(regs, error_code, address); exception_exit(prev_state); } -NOKPROBE_SYMBOL(trace_do_page_fault); -#endif /* CONFIG_TRACING */ +NOKPROBE_SYMBOL(do_page_fault); diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 2824607df108..6d06cf33e3de 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -18,6 +18,7 @@ #include <asm/tlbflush.h> #include <asm/pgalloc.h> #include <asm/elf.h> +#include <asm/mpx.h> #if 0 /* This is just for testing */ struct page * @@ -85,25 +86,38 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, info.flags = 0; info.length = len; info.low_limit = get_mmap_base(1); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + */ info.high_limit = in_compat_syscall() ? - tasksize_32bit() : tasksize_64bit(); + task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW); + info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); } static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, - unsigned long addr0, unsigned long len, + unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct vm_unmapped_area_info info; - unsigned long addr; info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = get_mmap_base(0); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + */ + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); @@ -118,7 +132,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE; + info.high_limit = TASK_SIZE_LOW; addr = vm_unmapped_area(&info); } @@ -135,6 +149,11 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (len & ~huge_page_mask(h)) return -EINVAL; + + addr = mpx_unmapped_area_check(addr, len, flags); + if (IS_ERR_VALUE(addr)) + return addr; + if (len > TASK_SIZE) return -ENOMEM; diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index adab1595f4bd..31cea988fa36 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -51,7 +51,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, if (!pmd) return -ENOMEM; ident_pmd_init(info, pmd, addr, next); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag)); } return 0; @@ -79,7 +79,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, if (!pud) return -ENOMEM; ident_pud_init(info, pud, addr, next); - set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag)); } return 0; @@ -93,6 +93,10 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, unsigned long next; int result; + /* Set the default pagetable flags if not supplied */ + if (!info->kernpg_flag) + info->kernpg_flag = _KERNPG_TABLE; + for (; addr < end; addr = next) { pgd_t *pgd = pgd_page + pgd_index(addr); p4d_t *p4d; @@ -116,14 +120,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, if (result) return result; if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag)); } else { /* * With p4d folded, pgd is equal to p4d. * The pgd entry has to point to the pud page table in this case. */ pud_t *pud = pud_offset(p4d, 0); - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag)); } } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bf3f1065d6ad..7777ccc0e9f9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -815,7 +815,7 @@ void __init zone_sizes_init(void) DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { .loaded_mm = &init_mm, - .state = 0, + .next_asid = 1, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; EXPORT_SYMBOL_GPL(cpu_tlbstate); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 4c1b5fd0c7ad..34f0e1847dd6 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -13,6 +13,8 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mmiotrace.h> +#include <linux/mem_encrypt.h> +#include <linux/efi.h> #include <asm/set_memory.h> #include <asm/e820/api.h> @@ -21,6 +23,7 @@ #include <asm/tlbflush.h> #include <asm/pgalloc.h> #include <asm/pat.h> +#include <asm/setup.h> #include "physaddr.h" @@ -106,12 +109,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, } /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (is_ISA_range(phys_addr, last_addr)) - return (__force void __iomem *)phys_to_virt(phys_addr); - - /* * Don't allow anybody to remap normal RAM that we're using.. */ pfn = phys_addr >> PAGE_SHIFT; @@ -340,13 +337,17 @@ void iounmap(volatile void __iomem *addr) return; /* - * __ioremap special-cases the PCI/ISA range by not instantiating a - * vm_area and by simply returning an address into the kernel mapping - * of ISA space. So handle that here. + * The PCI/ISA range special-casing was removed from __ioremap() + * so this check, in theory, can be removed. However, there are + * cases where iounmap() is called for addresses not obtained via + * ioremap() (vga16fb for example). Add a warning so that these + * cases can be caught and fixed. */ if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) && - (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) + (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) { + WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n"); return; + } addr = (volatile void __iomem *) (PAGE_MASK & (unsigned long __force)addr); @@ -399,12 +400,10 @@ void *xlate_dev_mem_ptr(phys_addr_t phys) unsigned long offset = phys & ~PAGE_MASK; void *vaddr; - /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ - if (page_is_ram(start >> PAGE_SHIFT)) - return __va(phys); + /* memremap() maps if RAM, otherwise falls back to ioremap() */ + vaddr = memremap(start, PAGE_SIZE, MEMREMAP_WB); - vaddr = ioremap_cache(start, PAGE_SIZE); - /* Only add the offset on success and return NULL if the ioremap() failed: */ + /* Only add the offset on success and return NULL if memremap() failed */ if (vaddr) vaddr += offset; @@ -413,11 +412,263 @@ void *xlate_dev_mem_ptr(phys_addr_t phys) void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) { - if (page_is_ram(phys >> PAGE_SHIFT)) - return; + memunmap((void *)((unsigned long)addr & PAGE_MASK)); +} + +/* + * Examine the physical address to determine if it is an area of memory + * that should be mapped decrypted. If the memory is not part of the + * kernel usable area it was accessed and created decrypted, so these + * areas should be mapped decrypted. And since the encryption key can + * change across reboots, persistent memory should also be mapped + * decrypted. + */ +static bool memremap_should_map_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + int is_pmem; + + /* + * Check if the address is part of a persistent memory region. + * This check covers areas added by E820, EFI and ACPI. + */ + is_pmem = region_intersects(phys_addr, size, IORESOURCE_MEM, + IORES_DESC_PERSISTENT_MEMORY); + if (is_pmem != REGION_DISJOINT) + return true; + + /* + * Check if the non-volatile attribute is set for an EFI + * reserved area. + */ + if (efi_enabled(EFI_BOOT)) { + switch (efi_mem_type(phys_addr)) { + case EFI_RESERVED_TYPE: + if (efi_mem_attributes(phys_addr) & EFI_MEMORY_NV) + return true; + break; + default: + break; + } + } + + /* Check if the address is outside kernel usable area */ + switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) { + case E820_TYPE_RESERVED: + case E820_TYPE_ACPI: + case E820_TYPE_NVS: + case E820_TYPE_UNUSABLE: + case E820_TYPE_PRAM: + return true; + default: + break; + } + + return false; +} + +/* + * Examine the physical address to determine if it is EFI data. Check + * it against the boot params structure and EFI tables and memory types. + */ +static bool memremap_is_efi_data(resource_size_t phys_addr, + unsigned long size) +{ + u64 paddr; + + /* Check if the address is part of EFI boot/runtime data */ + if (!efi_enabled(EFI_BOOT)) + return false; + + paddr = boot_params.efi_info.efi_memmap_hi; + paddr <<= 32; + paddr |= boot_params.efi_info.efi_memmap; + if (phys_addr == paddr) + return true; + + paddr = boot_params.efi_info.efi_systab_hi; + paddr <<= 32; + paddr |= boot_params.efi_info.efi_systab; + if (phys_addr == paddr) + return true; + + if (efi_is_table_address(phys_addr)) + return true; + + switch (efi_mem_type(phys_addr)) { + case EFI_BOOT_SERVICES_DATA: + case EFI_RUNTIME_SERVICES_DATA: + return true; + default: + break; + } + + return false; +} + +/* + * Examine the physical address to determine if it is boot data by checking + * it against the boot params setup_data chain. + */ +static bool memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + struct setup_data *data; + u64 paddr, paddr_next; + + paddr = boot_params.hdr.setup_data; + while (paddr) { + unsigned int len; + + if (phys_addr == paddr) + return true; + + data = memremap(paddr, sizeof(*data), + MEMREMAP_WB | MEMREMAP_DEC); + + paddr_next = data->next; + len = data->len; + + memunmap(data); + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) + return true; + + paddr = paddr_next; + } + + return false; +} + +/* + * Examine the physical address to determine if it is boot data by checking + * it against the boot params setup_data chain (early boot version). + */ +static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + struct setup_data *data; + u64 paddr, paddr_next; + + paddr = boot_params.hdr.setup_data; + while (paddr) { + unsigned int len; + + if (phys_addr == paddr) + return true; + + data = early_memremap_decrypted(paddr, sizeof(*data)); + + paddr_next = data->next; + len = data->len; + + early_memunmap(data, sizeof(*data)); + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) + return true; + + paddr = paddr_next; + } + + return false; +} + +/* + * Architecture function to determine if RAM remap is allowed. By default, a + * RAM remap will map the data as encrypted. Determine if a RAM remap should + * not be done so that the data will be mapped decrypted. + */ +bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, + unsigned long flags) +{ + if (!sme_active()) + return true; + + if (flags & MEMREMAP_ENC) + return true; + + if (flags & MEMREMAP_DEC) + return false; + + if (memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size) || + memremap_should_map_decrypted(phys_addr, size)) + return false; + + return true; +} + +/* + * Architecture override of __weak function to adjust the protection attributes + * used when remapping memory. By default, early_memremap() will map the data + * as encrypted. Determine if an encrypted mapping should not be done and set + * the appropriate protection attributes. + */ +pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, + unsigned long size, + pgprot_t prot) +{ + if (!sme_active()) + return prot; + + if (early_memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size) || + memremap_should_map_decrypted(phys_addr, size)) + prot = pgprot_decrypted(prot); + else + prot = pgprot_encrypted(prot); + + return prot; +} + +bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size) +{ + return arch_memremap_can_ram_remap(phys_addr, size, 0); +} + +#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT +/* Remap memory with encryption */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC); +} + +/* + * Remap memory with encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; + + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP); +} + +/* Remap memory without encryption */ +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC); +} + +/* + * Remap memory without encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; - iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP); } +#endif /* CONFIG_ARCH_USE_MEMREMAP_PROT */ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 02c9d7553409..bc84b73684b7 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -11,8 +11,8 @@ #include <asm/e820/types.h> #include <asm/tlbflush.h> #include <asm/sections.h> +#include <asm/pgtable.h> -extern pgd_t early_top_pgt[PTRS_PER_PGD]; extern struct range pfn_mapped[E820_MAX_ENTRIES]; static int __init map_range(struct range *range) @@ -87,7 +87,7 @@ static struct notifier_block kasan_die_notifier = { void __init kasan_early_init(void) { int i; - pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; + pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC; pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; @@ -153,7 +153,7 @@ void __init kasan_init(void) */ memset(kasan_zero_page, 0, PAGE_SIZE); for (i = 0; i < PTRS_PER_PTE; i++) { - pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC); set_pte(&kasan_zero_pte[i], pte); } /* Flush TLBs again to be sure that write protection applied. */ diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c new file mode 100644 index 000000000000..0fbd09269757 --- /dev/null +++ b/arch/x86/mm/mem_encrypt.c @@ -0,0 +1,593 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/dma-mapping.h> +#include <linux/swiotlb.h> +#include <linux/mem_encrypt.h> + +#include <asm/tlbflush.h> +#include <asm/fixmap.h> +#include <asm/setup.h> +#include <asm/bootparam.h> +#include <asm/set_memory.h> +#include <asm/cacheflush.h> +#include <asm/sections.h> +#include <asm/processor-flags.h> +#include <asm/msr.h> +#include <asm/cmdline.h> + +static char sme_cmdline_arg[] __initdata = "mem_encrypt"; +static char sme_cmdline_on[] __initdata = "on"; +static char sme_cmdline_off[] __initdata = "off"; + +/* + * Since SME related variables are set early in the boot process they must + * reside in the .data section so as not to be zeroed out when the .bss + * section is later cleared. + */ +unsigned long sme_me_mask __section(.data) = 0; +EXPORT_SYMBOL_GPL(sme_me_mask); + +/* Buffer used for early in-place encryption by BSP, no locking needed */ +static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); + +/* + * This routine does not change the underlying encryption setting of the + * page(s) that map this memory. It assumes that eventually the memory is + * meant to be accessed as either encrypted or decrypted but the contents + * are currently not in the desired state. + * + * This routine follows the steps outlined in the AMD64 Architecture + * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place. + */ +static void __init __sme_early_enc_dec(resource_size_t paddr, + unsigned long size, bool enc) +{ + void *src, *dst; + size_t len; + + if (!sme_me_mask) + return; + + local_flush_tlb(); + wbinvd(); + + /* + * There are limited number of early mapping slots, so map (at most) + * one page at time. + */ + while (size) { + len = min_t(size_t, sizeof(sme_early_buffer), size); + + /* + * Create mappings for the current and desired format of + * the memory. Use a write-protected mapping for the source. + */ + src = enc ? early_memremap_decrypted_wp(paddr, len) : + early_memremap_encrypted_wp(paddr, len); + + dst = enc ? early_memremap_encrypted(paddr, len) : + early_memremap_decrypted(paddr, len); + + /* + * If a mapping can't be obtained to perform the operation, + * then eventual access of that area in the desired mode + * will cause a crash. + */ + BUG_ON(!src || !dst); + + /* + * Use a temporary buffer, of cache-line multiple size, to + * avoid data corruption as documented in the APM. + */ + memcpy(sme_early_buffer, src, len); + memcpy(dst, sme_early_buffer, len); + + early_memunmap(dst, len); + early_memunmap(src, len); + + paddr += len; + size -= len; + } +} + +void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, true); +} + +void __init sme_early_decrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, false); +} + +static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size, + bool map) +{ + unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET; + pmdval_t pmd_flags, pmd; + + /* Use early_pmd_flags but remove the encryption mask */ + pmd_flags = __sme_clr(early_pmd_flags); + + do { + pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0; + __early_make_pgtable((unsigned long)vaddr, pmd); + + vaddr += PMD_SIZE; + paddr += PMD_SIZE; + size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE; + } while (size); + + __native_flush_tlb(); +} + +void __init sme_unmap_bootdata(char *real_mode_data) +{ + struct boot_params *boot_data; + unsigned long cmdline_paddr; + + if (!sme_active()) + return; + + /* Get the command line address before unmapping the real_mode_data */ + boot_data = (struct boot_params *)real_mode_data; + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); + + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false); + + if (!cmdline_paddr) + return; + + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false); +} + +void __init sme_map_bootdata(char *real_mode_data) +{ + struct boot_params *boot_data; + unsigned long cmdline_paddr; + + if (!sme_active()) + return; + + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true); + + /* Get the command line address after mapping the real_mode_data */ + boot_data = (struct boot_params *)real_mode_data; + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); + + if (!cmdline_paddr) + return; + + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true); +} + +void __init sme_early_init(void) +{ + unsigned int i; + + if (!sme_me_mask) + return; + + early_pmd_flags = __sme_set(early_pmd_flags); + + __supported_pte_mask = __sme_set(__supported_pte_mask); + + /* Update the protection map with memory encryption mask */ + for (i = 0; i < ARRAY_SIZE(protection_map); i++) + protection_map[i] = pgprot_encrypted(protection_map[i]); +} + +/* Architecture __weak replacement functions */ +void __init mem_encrypt_init(void) +{ + if (!sme_me_mask) + return; + + /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ + swiotlb_update_mem_attributes(); + + pr_info("AMD Secure Memory Encryption (SME) active\n"); +} + +void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) +{ + WARN(PAGE_ALIGN(size) != size, + "size is not page-aligned (%#lx)\n", size); + + /* Make the SWIOTLB buffer area decrypted */ + set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT); +} + +static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start, + unsigned long end) +{ + unsigned long pgd_start, pgd_end, pgd_size; + pgd_t *pgd_p; + + pgd_start = start & PGDIR_MASK; + pgd_end = end & PGDIR_MASK; + + pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1); + pgd_size *= sizeof(pgd_t); + + pgd_p = pgd_base + pgd_index(start); + + memset(pgd_p, 0, pgd_size); +} + +#define PGD_FLAGS _KERNPG_TABLE_NOENC +#define P4D_FLAGS _KERNPG_TABLE_NOENC +#define PUD_FLAGS _KERNPG_TABLE_NOENC +#define PMD_FLAGS (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) + +static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area, + unsigned long vaddr, pmdval_t pmd_val) +{ + pgd_t *pgd_p; + p4d_t *p4d_p; + pud_t *pud_p; + pmd_t *pmd_p; + + pgd_p = pgd_base + pgd_index(vaddr); + if (native_pgd_val(*pgd_p)) { + if (IS_ENABLED(CONFIG_X86_5LEVEL)) + p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); + else + pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); + } else { + pgd_t pgd; + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_p = pgtable_area; + memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); + pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; + + pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS); + } else { + pud_p = pgtable_area; + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); + pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; + + pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS); + } + native_set_pgd(pgd_p, pgd); + } + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_p += p4d_index(vaddr); + if (native_p4d_val(*p4d_p)) { + pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK); + } else { + p4d_t p4d; + + pud_p = pgtable_area; + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); + pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; + + p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS); + native_set_p4d(p4d_p, p4d); + } + } + + pud_p += pud_index(vaddr); + if (native_pud_val(*pud_p)) { + if (native_pud_val(*pud_p) & _PAGE_PSE) + goto out; + + pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK); + } else { + pud_t pud; + + pmd_p = pgtable_area; + memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); + pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; + + pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS); + native_set_pud(pud_p, pud); + } + + pmd_p += pmd_index(vaddr); + if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) + native_set_pmd(pmd_p, native_make_pmd(pmd_val)); + +out: + return pgtable_area; +} + +static unsigned long __init sme_pgtable_calc(unsigned long len) +{ + unsigned long p4d_size, pud_size, pmd_size; + unsigned long total; + + /* + * Perform a relatively simplistic calculation of the pagetable + * entries that are needed. That mappings will be covered by 2MB + * PMD entries so we can conservatively calculate the required + * number of P4D, PUD and PMD structures needed to perform the + * mappings. Incrementing the count for each covers the case where + * the addresses cross entries. + */ + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; + p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; + pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } else { + p4d_size = 0; + pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } + pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1; + pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; + + total = p4d_size + pud_size + pmd_size; + + /* + * Now calculate the added pagetable structures needed to populate + * the new pagetables. + */ + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; + p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; + pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } else { + p4d_size = 0; + pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } + pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE; + pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; + + total += p4d_size + pud_size + pmd_size; + + return total; +} + +void __init sme_encrypt_kernel(void) +{ + unsigned long workarea_start, workarea_end, workarea_len; + unsigned long execute_start, execute_end, execute_len; + unsigned long kernel_start, kernel_end, kernel_len; + unsigned long pgtable_area_len; + unsigned long paddr, pmd_flags; + unsigned long decrypted_base; + void *pgtable_area; + pgd_t *pgd; + + if (!sme_active()) + return; + + /* + * Prepare for encrypting the kernel by building new pagetables with + * the necessary attributes needed to encrypt the kernel in place. + * + * One range of virtual addresses will map the memory occupied + * by the kernel as encrypted. + * + * Another range of virtual addresses will map the memory occupied + * by the kernel as decrypted and write-protected. + * + * The use of write-protect attribute will prevent any of the + * memory from being cached. + */ + + /* Physical addresses gives us the identity mapped virtual addresses */ + kernel_start = __pa_symbol(_text); + kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); + kernel_len = kernel_end - kernel_start; + + /* Set the encryption workarea to be immediately after the kernel */ + workarea_start = kernel_end; + + /* + * Calculate required number of workarea bytes needed: + * executable encryption area size: + * stack page (PAGE_SIZE) + * encryption routine page (PAGE_SIZE) + * intermediate copy buffer (PMD_PAGE_SIZE) + * pagetable structures for the encryption of the kernel + * pagetable structures for workarea (in case not currently mapped) + */ + execute_start = workarea_start; + execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE; + execute_len = execute_end - execute_start; + + /* + * One PGD for both encrypted and decrypted mappings and a set of + * PUDs and PMDs for each of the encrypted and decrypted mappings. + */ + pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; + pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; + + /* PUDs and PMDs needed in the current pagetables for the workarea */ + pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); + + /* + * The total workarea includes the executable encryption area and + * the pagetable area. + */ + workarea_len = execute_len + pgtable_area_len; + workarea_end = workarea_start + workarea_len; + + /* + * Set the address to the start of where newly created pagetable + * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable + * structures are created when the workarea is added to the current + * pagetables and when the new encrypted and decrypted kernel + * mappings are populated. + */ + pgtable_area = (void *)execute_end; + + /* + * Make sure the current pagetable structure has entries for + * addressing the workarea. + */ + pgd = (pgd_t *)native_read_cr3_pa(); + paddr = workarea_start; + while (paddr < workarea_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr, + paddr + PMD_FLAGS); + + paddr += PMD_PAGE_SIZE; + } + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); + + /* + * A new pagetable structure is being built to allow for the kernel + * to be encrypted. It starts with an empty PGD that will then be + * populated with new PUDs and PMDs as the encrypted and decrypted + * kernel mappings are created. + */ + pgd = pgtable_area; + memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD); + pgtable_area += sizeof(*pgd) * PTRS_PER_PGD; + + /* Add encrypted kernel (identity) mappings */ + pmd_flags = PMD_FLAGS | _PAGE_ENC; + paddr = kernel_start; + while (paddr < kernel_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr, + paddr + pmd_flags); + + paddr += PMD_PAGE_SIZE; + } + + /* + * A different PGD index/entry must be used to get different + * pagetable entries for the decrypted mapping. Choose the next + * PGD index and convert it to a virtual address to be used as + * the base of the mapping. + */ + decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); + decrypted_base <<= PGDIR_SHIFT; + + /* Add decrypted, write-protected kernel (non-identity) mappings */ + pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT); + paddr = kernel_start; + while (paddr < kernel_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr + decrypted_base, + paddr + pmd_flags); + + paddr += PMD_PAGE_SIZE; + } + + /* Add decrypted workarea mappings to both kernel mappings */ + paddr = workarea_start; + while (paddr < workarea_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr, + paddr + PMD_FLAGS); + + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr + decrypted_base, + paddr + PMD_FLAGS); + + paddr += PMD_PAGE_SIZE; + } + + /* Perform the encryption */ + sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, + kernel_len, workarea_start, (unsigned long)pgd); + + /* + * At this point we are running encrypted. Remove the mappings for + * the decrypted areas - all that is needed for this is to remove + * the PGD entry/entries. + */ + sme_clear_pgd(pgd, kernel_start + decrypted_base, + kernel_end + decrypted_base); + + sme_clear_pgd(pgd, workarea_start + decrypted_base, + workarea_end + decrypted_base); + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); +} + +void __init __nostackprotector sme_enable(struct boot_params *bp) +{ + const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; + unsigned int eax, ebx, ecx, edx; + bool active_by_default; + unsigned long me_mask; + char buffer[16]; + u64 msr; + + /* Check for the SME support leaf */ + eax = 0x80000000; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax < 0x8000001f) + return; + + /* + * Check for the SME feature: + * CPUID Fn8000_001F[EAX] - Bit 0 + * Secure Memory Encryption support + * CPUID Fn8000_001F[EBX] - Bits 5:0 + * Pagetable bit position used to indicate encryption + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (!(eax & 1)) + return; + + me_mask = 1UL << (ebx & 0x3f); + + /* Check if SME is enabled */ + msr = __rdmsr(MSR_K8_SYSCFG); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + return; + + /* + * Fixups have not been applied to phys_base yet and we're running + * identity mapped, so we must obtain the address to the SME command + * line argument data using rip-relative addressing. + */ + asm ("lea sme_cmdline_arg(%%rip), %0" + : "=r" (cmdline_arg) + : "p" (sme_cmdline_arg)); + asm ("lea sme_cmdline_on(%%rip), %0" + : "=r" (cmdline_on) + : "p" (sme_cmdline_on)); + asm ("lea sme_cmdline_off(%%rip), %0" + : "=r" (cmdline_off) + : "p" (sme_cmdline_off)); + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) + active_by_default = true; + else + active_by_default = false; + + cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | + ((u64)bp->ext_cmd_line_ptr << 32)); + + cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)); + + if (!strncmp(buffer, cmdline_on, sizeof(buffer))) + sme_me_mask = me_mask; + else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) + sme_me_mask = 0; + else + sme_me_mask = active_by_default ? me_mask : 0; +} diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S new file mode 100644 index 000000000000..730e6d541df1 --- /dev/null +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -0,0 +1,149 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/pgtable.h> +#include <asm/page.h> +#include <asm/processor-flags.h> +#include <asm/msr-index.h> + + .text + .code64 +ENTRY(sme_encrypt_execute) + + /* + * Entry parameters: + * RDI - virtual address for the encrypted kernel mapping + * RSI - virtual address for the decrypted kernel mapping + * RDX - length of kernel + * RCX - virtual address of the encryption workarea, including: + * - stack page (PAGE_SIZE) + * - encryption routine page (PAGE_SIZE) + * - intermediate copy buffer (PMD_PAGE_SIZE) + * R8 - physcial address of the pagetables to use for encryption + */ + + push %rbp + movq %rsp, %rbp /* RBP now has original stack pointer */ + + /* Set up a one page stack in the non-encrypted memory area */ + movq %rcx, %rax /* Workarea stack page */ + leaq PAGE_SIZE(%rax), %rsp /* Set new stack pointer */ + addq $PAGE_SIZE, %rax /* Workarea encryption routine */ + + push %r12 + movq %rdi, %r10 /* Encrypted kernel */ + movq %rsi, %r11 /* Decrypted kernel */ + movq %rdx, %r12 /* Kernel length */ + + /* Copy encryption routine into the workarea */ + movq %rax, %rdi /* Workarea encryption routine */ + leaq __enc_copy(%rip), %rsi /* Encryption routine */ + movq $(.L__enc_copy_end - __enc_copy), %rcx /* Encryption routine length */ + rep movsb + + /* Setup registers for call */ + movq %r10, %rdi /* Encrypted kernel */ + movq %r11, %rsi /* Decrypted kernel */ + movq %r8, %rdx /* Pagetables used for encryption */ + movq %r12, %rcx /* Kernel length */ + movq %rax, %r8 /* Workarea encryption routine */ + addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */ + + call *%rax /* Call the encryption routine */ + + pop %r12 + + movq %rbp, %rsp /* Restore original stack pointer */ + pop %rbp + + ret +ENDPROC(sme_encrypt_execute) + +ENTRY(__enc_copy) +/* + * Routine used to encrypt kernel. + * This routine must be run outside of the kernel proper since + * the kernel will be encrypted during the process. So this + * routine is defined here and then copied to an area outside + * of the kernel where it will remain and run decrypted + * during execution. + * + * On entry the registers must be: + * RDI - virtual address for the encrypted kernel mapping + * RSI - virtual address for the decrypted kernel mapping + * RDX - address of the pagetables to use for encryption + * RCX - length of kernel + * R8 - intermediate copy buffer + * + * RAX - points to this routine + * + * The kernel will be encrypted by copying from the non-encrypted + * kernel space to an intermediate buffer and then copying from the + * intermediate buffer back to the encrypted kernel space. The physical + * addresses of the two kernel space mappings are the same which + * results in the kernel being encrypted "in place". + */ + /* Enable the new page tables */ + mov %rdx, %cr3 + + /* Flush any global TLBs */ + mov %cr4, %rdx + andq $~X86_CR4_PGE, %rdx + mov %rdx, %cr4 + orq $X86_CR4_PGE, %rdx + mov %rdx, %cr4 + + /* Set the PAT register PA5 entry to write-protect */ + push %rcx + movl $MSR_IA32_CR_PAT, %ecx + rdmsr + push %rdx /* Save original PAT value */ + andl $0xffff00ff, %edx /* Clear PA5 */ + orl $0x00000500, %edx /* Set PA5 to WP */ + wrmsr + pop %rdx /* RDX contains original PAT value */ + pop %rcx + + movq %rcx, %r9 /* Save kernel length */ + movq %rdi, %r10 /* Save encrypted kernel address */ + movq %rsi, %r11 /* Save decrypted kernel address */ + + wbinvd /* Invalidate any cache entries */ + + /* Copy/encrypt 2MB at a time */ +1: + movq %r11, %rsi /* Source - decrypted kernel */ + movq %r8, %rdi /* Dest - intermediate copy buffer */ + movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ + rep movsb + + movq %r8, %rsi /* Source - intermediate copy buffer */ + movq %r10, %rdi /* Dest - encrypted kernel */ + movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ + rep movsb + + addq $PMD_PAGE_SIZE, %r11 + addq $PMD_PAGE_SIZE, %r10 + subq $PMD_PAGE_SIZE, %r9 /* Kernel length decrement */ + jnz 1b /* Kernel length not zero? */ + + /* Restore PAT register */ + push %rdx /* Save original PAT value */ + movl $MSR_IA32_CR_PAT, %ecx + rdmsr + pop %rdx /* Restore original PAT value */ + wrmsr + + ret +.L__enc_copy_end: +ENDPROC(__enc_copy) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index a88cfbfbd078..a99679826846 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -37,21 +37,21 @@ struct va_alignment __read_mostly va_align = { .flags = -1, }; -unsigned long tasksize_32bit(void) +unsigned long task_size_32bit(void) { return IA32_PAGE_OFFSET; } -unsigned long tasksize_64bit(void) +unsigned long task_size_64bit(int full_addr_space) { - return TASK_SIZE_MAX; + return full_addr_space ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW; } static unsigned long stack_maxrandom_size(unsigned long task_size) { unsigned long max = 0; if (current->flags & PF_RANDOMIZE) { - max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit()); + max = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit()); max <<= PAGE_SHIFT; } @@ -141,7 +141,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->get_unmapped_area = arch_get_unmapped_area_topdown; arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, - arch_rnd(mmap64_rnd_bits), tasksize_64bit()); + arch_rnd(mmap64_rnd_bits), task_size_64bit(0)); #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES /* @@ -151,7 +151,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * mmap_base, the compat syscall uses mmap_compat_base. */ arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, - arch_rnd(mmap32_rnd_bits), tasksize_32bit()); + arch_rnd(mmap32_rnd_bits), task_size_32bit()); #endif } diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 1c34b767c84c..9ceaa955d2ba 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -355,10 +355,19 @@ int mpx_enable_management(void) */ bd_base = mpx_get_bounds_dir(); down_write(&mm->mmap_sem); + + /* MPX doesn't support addresses above 47 bits yet. */ + if (find_vma(mm, DEFAULT_MAP_WINDOW)) { + pr_warn_once("%s (%d): MPX cannot handle addresses " + "above 47-bits. Disabling.", + current->comm, current->pid); + ret = -ENXIO; + goto out; + } mm->context.bd_addr = bd_base; if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR) ret = -ENXIO; - +out: up_write(&mm->mmap_sem); return ret; } @@ -1030,3 +1039,25 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, if (ret) force_sig(SIGSEGV, current); } + +/* MPX cannot handle addresses above 47 bits yet. */ +unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, + unsigned long flags) +{ + if (!kernel_managing_mpx_tables(current->mm)) + return addr; + if (addr + len <= DEFAULT_MAP_WINDOW) + return addr; + if (flags & MAP_FIXED) + return -ENOMEM; + + /* + * Requested len is larger than the whole area we're allowed to map in. + * Resetting hinting address wouldn't do much good -- fail early. + */ + if (len > DEFAULT_MAP_WINDOW) + return -ENOMEM; + + /* Look for unmap area within DEFAULT_MAP_WINDOW */ + return 0; +} diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index a8f90ce3dedf..d805162e6045 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -75,13 +75,15 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, /* * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr - * to max_addr. The return value is the number of nodes allocated. + * to max_addr. + * + * Returns zero on success or negative on error. */ static int __init split_nodes_interleave(struct numa_meminfo *ei, struct numa_meminfo *pi, u64 addr, u64 max_addr, int nr_nodes) { - nodemask_t physnode_mask = NODE_MASK_NONE; + nodemask_t physnode_mask = numa_nodes_parsed; u64 size; int big; int nid = 0; @@ -116,9 +118,6 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, return -1; } - for (i = 0; i < pi->nr_blks; i++) - node_set(pi->blk[i].nid, physnode_mask); - /* * Continue to fill physical nodes with fake nodes until there is no * memory left on any of them. @@ -200,13 +199,15 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) /* * Sets up fake nodes of `size' interleaved over physical nodes ranging from - * `addr' to `max_addr'. The return value is the number of nodes allocated. + * `addr' to `max_addr'. + * + * Returns zero on success or negative on error. */ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, struct numa_meminfo *pi, u64 addr, u64 max_addr, u64 size) { - nodemask_t physnode_mask = NODE_MASK_NONE; + nodemask_t physnode_mask = numa_nodes_parsed; u64 min_size; int nid = 0; int i, ret; @@ -231,9 +232,6 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, } size &= FAKE_NODE_MIN_HASH_MASK; - for (i = 0; i < pi->nr_blks; i++) - node_set(pi->blk[i].nid, physnode_mask); - /* * Fill physical nodes with fake nodes of size until there is no memory * left on any of them. @@ -280,6 +278,22 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, return 0; } +int __init setup_emu2phys_nid(int *dfl_phys_nid) +{ + int i, max_emu_nid = 0; + + *dfl_phys_nid = NUMA_NO_NODE; + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { + if (emu_nid_to_phys[i] != NUMA_NO_NODE) { + max_emu_nid = i; + if (*dfl_phys_nid == NUMA_NO_NODE) + *dfl_phys_nid = emu_nid_to_phys[i]; + } + } + + return max_emu_nid; +} + /** * numa_emulation - Emulate NUMA nodes * @numa_meminfo: NUMA configuration to massage @@ -376,23 +390,18 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) * Determine the max emulated nid and the default phys nid to use * for unmapped nodes. */ - max_emu_nid = 0; - dfl_phys_nid = NUMA_NO_NODE; - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { - if (emu_nid_to_phys[i] != NUMA_NO_NODE) { - max_emu_nid = i; - if (dfl_phys_nid == NUMA_NO_NODE) - dfl_phys_nid = emu_nid_to_phys[i]; - } - } - if (dfl_phys_nid == NUMA_NO_NODE) { - pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); - goto no_emu; - } + max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); /* commit */ *numa_meminfo = ei; + /* Make sure numa_nodes_parsed only contains emulated nodes */ + nodes_clear(numa_nodes_parsed); + for (i = 0; i < ARRAY_SIZE(ei.blk); i++) + if (ei.blk[i].start != ei.blk[i].end && + ei.blk[i].nid != NUMA_NO_NODE) + node_set(ei.blk[i].nid, numa_nodes_parsed); + /* * Transform __apicid_to_node table to use emulated nids by * reverse-mapping phys_nid. The maps should always exist but fall diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 757b0bcdf712..dfb7d657cf43 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1775,6 +1775,70 @@ int set_memory_4k(unsigned long addr, int numpages) __pgprot(0), 1, 0, NULL); } +static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) +{ + struct cpa_data cpa; + unsigned long start; + int ret; + + /* Nothing to do if the SME is not active */ + if (!sme_active()) + return 0; + + /* Should not be working on unaligned addresses */ + if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) + addr &= PAGE_MASK; + + start = addr; + + memset(&cpa, 0, sizeof(cpa)); + cpa.vaddr = &addr; + cpa.numpages = numpages; + cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0); + cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC); + cpa.pgd = init_mm.pgd; + + /* Must avoid aliasing mappings in the highmem code */ + kmap_flush_unused(); + vm_unmap_aliases(); + + /* + * Before changing the encryption attribute, we need to flush caches. + */ + if (static_cpu_has(X86_FEATURE_CLFLUSH)) + cpa_flush_range(start, numpages, 1); + else + cpa_flush_all(1); + + ret = __change_page_attr_set_clr(&cpa, 1); + + /* + * After changing the encryption attribute, we need to flush TLBs + * again in case any speculative TLB caching occurred (but no need + * to flush caches again). We could just use cpa_flush_all(), but + * in case TLB flushing gets optimized in the cpa_flush_range() + * path use the same logic as above. + */ + if (static_cpu_has(X86_FEATURE_CLFLUSH)) + cpa_flush_range(start, numpages, 0); + else + cpa_flush_all(0); + + return ret; +} + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + return __set_memory_enc_dec(addr, numpages, true); +} +EXPORT_SYMBOL_GPL(set_memory_encrypted); + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + return __set_memory_enc_dec(addr, numpages, false); +} +EXPORT_SYMBOL_GPL(set_memory_decrypted); + int set_pages_uc(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); @@ -2020,6 +2084,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, if (!(page_flags & _PAGE_RW)) cpa.mask_clr = __pgprot(_PAGE_RW); + if (!(page_flags & _PAGE_ENC)) + cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); + cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); retval = __change_page_attr_set_clr(&cpa, 0); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 45979502f64b..fe7d57a8fb60 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -293,7 +293,7 @@ void init_cache_modes(void) * pat_init - Initialize PAT MSR and PAT table * * This function initializes PAT MSR and PAT table with an OS-defined value - * to enable additional cache attributes, WC and WT. + * to enable additional cache attributes, WC, WT and WP. * * This function must be called on all CPUs using the specific sequence of * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this @@ -352,7 +352,7 @@ void pat_init(void) * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS * 011 3 UC : _PAGE_CACHE_MODE_UC * 100 4 WB : Reserved - * 101 5 WC : Reserved + * 101 5 WP : _PAGE_CACHE_MODE_WP * 110 6 UC-: Reserved * 111 7 WT : _PAGE_CACHE_MODE_WT * @@ -360,7 +360,7 @@ void pat_init(void) * corresponding types in the presence of PAT errata. */ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | - PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT); + PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT); } if (!boot_cpu_done) { @@ -744,6 +744,9 @@ EXPORT_SYMBOL(arch_io_free_memtype_wc); pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { + if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size)) + vma_prot = pgprot_decrypted(vma_prot); + return vma_prot; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 508a708eb9a6..218834a3e9ad 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -56,7 +56,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); - tlb_remove_page(tlb, pte); + tlb_remove_table(tlb, pte); } #if CONFIG_PGTABLE_LEVELS > 2 @@ -72,21 +72,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) tlb->need_flush_all = 1; #endif pgtable_pmd_page_dtor(page); - tlb_remove_page(tlb, page); + tlb_remove_table(tlb, page); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(pud)); + tlb_remove_table(tlb, virt_to_page(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(p4d)); + tlb_remove_table(tlb, virt_to_page(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 014d07a80053..dbbcfd59726a 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,6 +28,42 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ +atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); + +static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, + u16 *new_asid, bool *need_flush) +{ + u16 asid; + + if (!static_cpu_has(X86_FEATURE_PCID)) { + *new_asid = 0; + *need_flush = true; + return; + } + + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { + if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != + next->context.ctx_id) + continue; + + *new_asid = asid; + *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < + next_tlb_gen); + return; + } + + /* + * We don't currently own an ASID slot on this CPU. + * Allocate a slot. + */ + *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; + if (*new_asid >= TLB_NR_DYN_ASIDS) { + *new_asid = 0; + this_cpu_write(cpu_tlbstate.next_asid, 1); + } + *need_flush = true; +} + void leave_mm(int cpu) { struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); @@ -43,12 +79,11 @@ void leave_mm(int cpu) if (loaded_mm == &init_mm) return; - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - BUG(); + /* Warn if we're not lazy. */ + WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))); switch_mm(NULL, &init_mm, NULL); } -EXPORT_SYMBOL_GPL(leave_mm); void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) @@ -63,115 +98,263 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { - unsigned cpu = smp_processor_id(); struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); + u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + unsigned cpu = smp_processor_id(); + u64 next_tlb_gen; /* - * NB: The scheduler will call us with prev == next when - * switching from lazy TLB mode to normal mode if active_mm - * isn't changing. When this happens, there is no guarantee - * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next. + * NB: The scheduler will call us with prev == next when switching + * from lazy TLB mode to normal mode if active_mm isn't changing. + * When this happens, we don't assume that CR3 (and hence + * cpu_tlbstate.loaded_mm) matches next. * * NB: leave_mm() calls us with prev == NULL and tsk == NULL. */ - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + /* We don't want flush_tlb_func_* to run concurrently with us. */ + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) + WARN_ON_ONCE(!irqs_disabled()); + + /* + * Verify that CR3 is what we think it is. This will catch + * hypothetical buggy code that directly switches to swapper_pg_dir + * without going through leave_mm() / switch_mm_irqs_off() or that + * does something like write_cr3(read_cr3_pa()). + */ + VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid)); if (real_prev == next) { + VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != + next->context.ctx_id); + + if (cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * There's nothing to do: we weren't lazy, and we + * aren't changing our mm. We don't need to flush + * anything, nor do we need to update CR3, CR4, or + * LDTR. + */ + return; + } + + /* Resume remote flushes and then read tlb_gen. */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + + if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) < + next_tlb_gen) { + /* + * Ideally, we'd have a flush_tlb() variant that + * takes the known CR3 value as input. This would + * be faster on Xen PV and on hypothetical CPUs + * on which INVPCID is fast. + */ + this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, + next_tlb_gen); + write_cr3(__sme_pa(next->pgd) | prev_asid); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, + TLB_FLUSH_ALL); + } + /* - * There's nothing to do: we always keep the per-mm control - * regs in sync with cpu_tlbstate.loaded_mm. Just - * sanity-check mm_cpumask. + * We just exited lazy mode, which means that CR4 and/or LDTR + * may be stale. (Changes to the required CR4 and LDTR states + * are not reflected in tlb_gen.) */ - if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next)))) - cpumask_set_cpu(cpu, mm_cpumask(next)); - return; - } + } else { + u16 new_asid; + bool need_flush; + + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + /* + * If our current stack is in vmalloc space and isn't + * mapped in the new pgd, we'll double-fault. Forcibly + * map it. + */ + unsigned int index = pgd_index(current_stack_pointer()); + pgd_t *pgd = next->pgd + index; + + if (unlikely(pgd_none(*pgd))) + set_pgd(pgd, init_mm.pgd[index]); + } + + /* Stop remote flushes for the previous mm */ + if (cpumask_test_cpu(cpu, mm_cpumask(real_prev))) + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + + VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); - if (IS_ENABLED(CONFIG_VMAP_STACK)) { /* - * If our current stack is in vmalloc space and isn't - * mapped in the new pgd, we'll double-fault. Forcibly - * map it. + * Start remote flushes and then read tlb_gen. */ - unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); - - pgd_t *pgd = next->pgd + stack_pgd_index; + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + + choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + write_cr3(__sme_pa(next->pgd) | new_asid); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, + TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ + write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); + } - if (unlikely(pgd_none(*pgd))) - set_pgd(pgd, init_mm.pgd[stack_pgd_index]); + this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); } - this_cpu_write(cpu_tlbstate.loaded_mm, next); + load_mm_cr4(next); + switch_ldt(real_prev, next); +} - WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); - cpumask_set_cpu(cpu, mm_cpumask(next)); +/* + * Call this when reinitializing a CPU. It fixes the following potential + * problems: + * + * - The ASID changed from what cpu_tlbstate thinks it is (most likely + * because the CPU was taken down and came back up with CR3's PCID + * bits clear. CPU hotplug can do this. + * + * - The TLB contains junk in slots corresponding to inactive ASIDs. + * + * - The CPU went so far out to lunch that it may have missed a TLB + * flush. + */ +void initialize_tlbstate_and_flush(void) +{ + int i; + struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen); + unsigned long cr3 = __read_cr3(); - /* - * Re-load page tables. - * - * This logic has an ordering constraint: - * - * CPU 0: Write to a PTE for 'next' - * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. - * CPU 1: set bit 1 in next's mm_cpumask - * CPU 1: load from the PTE that CPU 0 writes (implicit) - * - * We need to prevent an outcome in which CPU 1 observes - * the new PTE value and CPU 0 observes bit 1 clear in - * mm_cpumask. (If that occurs, then the IPI will never - * be sent, and CPU 0's TLB will contain a stale entry.) - * - * The bad outcome can occur if either CPU's load is - * reordered before that CPU's store, so both CPUs must - * execute full barriers to prevent this from happening. - * - * Thus, switch_mm needs a full barrier between the - * store to mm_cpumask and any operation that could load - * from next->pgd. TLB fills are special and can happen - * due to instruction fetches or for no reason at all, - * and neither LOCK nor MFENCE orders them. - * Fortunately, load_cr3() is serializing and gives the - * ordering guarantee we need. - */ - load_cr3(next->pgd); + /* Assert that CR3 already references the right mm. */ + WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd)); /* - * This gets called via leave_mm() in the idle path where RCU - * functions differently. Tracing normally uses RCU, so we have to - * call the tracepoint specially here. + * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization + * doesn't work like other CR4 bits because it can only be set from + * long mode.) */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + WARN_ON(boot_cpu_has(X86_CR4_PCIDE) && + !(cr4_read_shadow() & X86_CR4_PCIDE)); - /* Stop flush ipis for the previous mm */ - WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && - real_prev != &init_mm); - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + /* Force ASID 0 and force a TLB flush. */ + write_cr3(cr3 & ~CR3_PCID_MASK); - /* Load per-mm CR4 and LDTR state */ - load_mm_cr4(next); - switch_ldt(real_prev, next); + /* Reinitialize tlbstate. */ + this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); + this_cpu_write(cpu_tlbstate.next_asid, 1); + this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); + + for (i = 1; i < TLB_NR_DYN_ASIDS; i++) + this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0); } +/* + * flush_tlb_func_common()'s memory ordering requirement is that any + * TLB fills that happen after we flush the TLB are ordered after we + * read active_mm's tlb_gen. We don't need any explicit barriers + * because all x86 flush operations are serializing and the + * atomic64_read operation won't be reordered by the compiler. + */ static void flush_tlb_func_common(const struct flush_tlb_info *f, bool local, enum tlb_flush_reason reason) { + /* + * We have three different tlb_gen values in here. They are: + * + * - mm_tlb_gen: the latest generation. + * - local_tlb_gen: the generation that this CPU has already caught + * up to. + * - f->new_tlb_gen: the generation that the requester of the flush + * wants us to catch up to. + */ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); + u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); + /* This code cannot presently handle being reentered. */ VM_WARN_ON(!irqs_disabled()); - if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) { - leave_mm(smp_processor_id()); + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != + loaded_mm->context.ctx_id); + + if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { + /* + * We're in lazy mode -- don't flush. We can get here on + * remote flushes due to races and on local flushes if a + * kernel thread coincidentally flushes the mm it's lazily + * still using. + */ return; } - if (f->end == TLB_FLUSH_ALL) { - local_flush_tlb(); - if (local) - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - trace_tlb_flush(reason, TLB_FLUSH_ALL); - } else { + if (unlikely(local_tlb_gen == mm_tlb_gen)) { + /* + * There's nothing to do: we're already up to date. This can + * happen if two concurrent flushes happen -- the first flush to + * be handled can catch us all the way up, leaving no work for + * the second flush. + */ + trace_tlb_flush(reason, 0); + return; + } + + WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); + WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); + + /* + * If we get to this point, we know that our TLB is out of date. + * This does not strictly imply that we need to flush (it's + * possible that f->new_tlb_gen <= local_tlb_gen), but we're + * going to need to flush in the very near future, so we might + * as well get it over with. + * + * The only question is whether to do a full or partial flush. + * + * We do a partial flush if requested and two extra conditions + * are met: + * + * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that + * we've always done all needed flushes to catch up to + * local_tlb_gen. If, for example, local_tlb_gen == 2 and + * f->new_tlb_gen == 3, then we know that the flush needed to bring + * us up to date for tlb_gen 3 is the partial flush we're + * processing. + * + * As an example of why this check is needed, suppose that there + * are two concurrent flushes. The first is a full flush that + * changes context.tlb_gen from 1 to 2. The second is a partial + * flush that changes context.tlb_gen from 2 to 3. If they get + * processed on this CPU in reverse order, we'll see + * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. + * If we were to use __flush_tlb_single() and set local_tlb_gen to + * 3, we'd be break the invariant: we'd update local_tlb_gen above + * 1 without the full flush that's needed for tlb_gen 2. + * + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation. + * Partial TLB flushes are not all that much cheaper than full TLB + * flushes, so it seems unlikely that it would be a performance win + * to do a partial flush if that won't bring our TLB fully up to + * date. By doing a full flush instead, we can increase + * local_tlb_gen all the way to mm_tlb_gen and we can probably + * avoid another flush in the very near future. + */ + if (f->end != TLB_FLUSH_ALL && + f->new_tlb_gen == local_tlb_gen + 1 && + f->new_tlb_gen == mm_tlb_gen) { + /* Partial flush */ unsigned long addr; unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; + addr = f->start; while (addr < f->end) { __flush_tlb_single(addr); @@ -180,7 +363,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, if (local) count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); trace_tlb_flush(reason, nr_pages); + } else { + /* Full flush. */ + local_flush_tlb(); + if (local) + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + trace_tlb_flush(reason, TLB_FLUSH_ALL); } + + /* Both paths above update our state to mm_tlb_gen. */ + this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); } static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) @@ -214,6 +406,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (info->end - info->start) >> PAGE_SHIFT); if (is_uv_system()) { + /* + * This whole special case is confused. UV has a "Broadcast + * Assist Unit", which seems to be a fancy way to send IPIs. + * Back when x86 used an explicit TLB flush IPI, UV was + * optimized to use its own mechanism. These days, x86 uses + * smp_call_function_many(), but UV still uses a manual IPI, + * and that IPI's action is out of date -- it does a manual + * flush instead of calling flush_tlb_func_remote(). This + * means that the percpu tlb_gen variables won't be updated + * and we'll do pointless flushes on future context switches. + * + * Rather than hooking native_flush_tlb_others() here, I think + * that UV should be updated so that smp_call_function_many(), + * etc, are optimal on UV. + */ unsigned int cpu; cpu = smp_processor_id(); @@ -250,8 +457,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, cpu = get_cpu(); - /* Synchronize with switch_mm. */ - smp_mb(); + /* This is also a barrier that synchronizes with switch_mm(). */ + info.new_tlb_gen = inc_mm_tlb_gen(mm); /* Should we flush just the requested range? */ if ((end != TLB_FLUSH_ALL) && @@ -273,6 +480,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), &info); + put_cpu(); } @@ -281,8 +489,6 @@ static void do_flush_tlb_all(void *info) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) - leave_mm(smp_processor_id()); } void flush_tlb_all(void) @@ -335,6 +541,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) flush_tlb_others(&batch->cpumask, &info); + cpumask_clear(&batch->cpumask); put_cpu(); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e1324f280e06..8c9573660d51 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -94,7 +94,9 @@ static int bpf_size_to_x86_bytes(int bpf_size) #define X86_JNE 0x75 #define X86_JBE 0x76 #define X86_JA 0x77 +#define X86_JL 0x7C #define X86_JGE 0x7D +#define X86_JLE 0x7E #define X86_JG 0x7F static void bpf_flush_icache(void *start, void *end) @@ -285,7 +287,7 @@ static void emit_bpf_tail_call(u8 **pprog) EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ offsetof(struct bpf_array, map.max_entries)); EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ -#define OFFSET1 47 /* number of bytes to jump */ +#define OFFSET1 43 /* number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; @@ -294,21 +296,20 @@ static void emit_bpf_tail_call(u8 **pprog) */ EMIT2_off32(0x8B, 0x85, 36); /* mov eax, dword ptr [rbp + 36] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 36 +#define OFFSET2 32 EMIT2(X86_JA, OFFSET2); /* ja out */ label2 = cnt; EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, 36); /* mov dword ptr [rbp + 36], eax */ /* prog = array->ptrs[index]; */ - EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */ + EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */ offsetof(struct bpf_array, ptrs)); - EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */ /* if (prog == NULL) * goto out; */ - EMIT4(0x48, 0x83, 0xF8, 0x00); /* cmp rax, 0 */ + EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ #define OFFSET3 10 EMIT2(X86_JE, OFFSET3); /* je out */ label3 = cnt; @@ -888,9 +889,13 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JNE | BPF_X: case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JLT | BPF_X: case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JLE | BPF_X: case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_X: case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_X: /* cmp dst_reg, src_reg */ EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x39, add_2reg(0xC0, dst_reg, src_reg)); @@ -911,9 +916,13 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: /* cmp dst_reg, imm8/32 */ EMIT1(add_1mod(0x48, dst_reg)); @@ -935,18 +944,34 @@ emit_cond_jmp: /* convert BPF opcode to x86 */ /* GT is unsigned '>', JA in x86 */ jmp_cond = X86_JA; break; + case BPF_JLT: + /* LT is unsigned '<', JB in x86 */ + jmp_cond = X86_JB; + break; case BPF_JGE: /* GE is unsigned '>=', JAE in x86 */ jmp_cond = X86_JAE; break; + case BPF_JLE: + /* LE is unsigned '<=', JBE in x86 */ + jmp_cond = X86_JBE; + break; case BPF_JSGT: /* signed '>', GT in x86 */ jmp_cond = X86_JG; break; + case BPF_JSLT: + /* signed '<', LT in x86 */ + jmp_cond = X86_JL; + break; case BPF_JSGE: /* signed '>=', GE in x86 */ jmp_cond = X86_JGE; break; + case BPF_JSLE: + /* signed '<=', LE in x86 */ + jmp_cond = X86_JLE; + break; default: /* to silence gcc warning */ return -EFAULT; } diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index dbe2132b0ed4..7a5350d08cef 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -674,7 +674,7 @@ int pcibios_add_device(struct pci_dev *dev) pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = ioremap(pa_data, sizeof(*rom)); + data = memremap(pa_data, sizeof(*rom), MEMREMAP_WB); if (!data) return -ENOMEM; @@ -693,7 +693,7 @@ int pcibios_add_device(struct pci_dev *dev) } } pa_data = data->next; - iounmap(data); + memunmap(data); } set_dma_domain_ops(dev); set_dev_domain_options(dev); diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 5a18aedcb341..b901ece278dd 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -215,16 +215,23 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) struct irq_alloc_info info; int polarity; int ret; + u8 gsi; if (dev->irq_managed && dev->irq > 0) return 0; + ret = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi); + if (ret < 0) { + dev_warn(&dev->dev, "Failed to read interrupt line: %d\n", ret); + return ret; + } + switch (intel_mid_identify_cpu()) { case INTEL_MID_CPU_CHIP_TANGIER: polarity = IOAPIC_POL_HIGH; /* Special treatment for IRQ0 */ - if (dev->irq == 0) { + if (gsi == 0) { /* * Skip HS UART common registers device since it has * IRQ0 assigned and not used by the kernel. @@ -253,10 +260,11 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to * IOAPIC RTE entries, so we just enable RTE for the device. */ - ret = mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info); + ret = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); if (ret < 0) return ret; + dev->irq = ret; dev->irq_managed = 1; return 0; diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f084d8718ac4..928b6dceeca0 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -1032,25 +1032,6 @@ void __init efi_enter_virtual_mode(void) efi_dump_pagetable(); } -/* - * Convenience functions to obtain memory types and attributes - */ -u32 efi_mem_type(unsigned long phys_addr) -{ - efi_memory_desc_t *md; - - if (!efi_enabled(EFI_MEMMAP)) - return 0; - - for_each_efi_memory_desc(md) { - if ((md->phys_addr <= phys_addr) && - (phys_addr < (md->phys_addr + - (md->num_pages << EFI_PAGE_SHIFT)))) - return md->type; - } - return 0; -} - static int __init arch_parse_efi_cmdline(char *str) { if (!str) { diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 9bf72f5bfedb..12e83888e5b9 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -327,7 +327,7 @@ virt_to_phys_or_null_size(void *va, unsigned long size) int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { - unsigned long pfn, text; + unsigned long pfn, text, pf; struct page *page; unsigned npages; pgd_t *pgd; @@ -335,7 +335,12 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) if (efi_enabled(EFI_OLD_MEMMAP)) return 0; - efi_scratch.efi_pgt = (pgd_t *)__pa(efi_pgd); + /* + * Since the PGD is encrypted, set the encryption mask so that when + * this value is loaded into cr3 the PGD will be decrypted during + * the pagetable walk. + */ + efi_scratch.efi_pgt = (pgd_t *)__sme_pa(efi_pgd); pgd = efi_pgd; /* @@ -345,7 +350,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * phys_efi_set_virtual_address_map(). */ pfn = pa_memmap >> PAGE_SHIFT; - if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, _PAGE_NX | _PAGE_RW)) { + pf = _PAGE_NX | _PAGE_RW | _PAGE_ENC; + if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, pf)) { pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap); return 1; } @@ -388,7 +394,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) text = __pa(_text); pfn = text >> PAGE_SHIFT; - if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, _PAGE_RW)) { + pf = _PAGE_RW | _PAGE_ENC; + if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, pf)) { pr_err("Failed to map kernel text 1:1\n"); return 1; } diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c index 5a0483e7bf66..dc036e511f48 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c @@ -60,7 +60,7 @@ static int __init tng_bt_sfi_setup(struct bt_sfi_data *ddata) return 0; } -static struct bt_sfi_data tng_bt_sfi_data __initdata = { +static const struct bt_sfi_data tng_bt_sfi_data __initdata = { .setup = tng_bt_sfi_setup, }; diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c index 9e304e2ea4f5..4f5fa65a1011 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c @@ -30,13 +30,13 @@ static int tangier_probe(struct platform_device *pdev) { struct irq_alloc_info info; struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data; - int gsi, irq; + int gsi = TANGIER_EXT_TIMER0_MSI; + int irq; if (!pdata) return -EINVAL; /* IOAPIC builds identity mapping between GSI and IRQ on MID */ - gsi = pdata->irq; ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0); irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); if (irq < 0) { @@ -44,11 +44,11 @@ static int tangier_probe(struct platform_device *pdev) return irq; } + pdata->irq = irq; return 0; } static struct intel_mid_wdt_pdata tangier_pdata = { - .irq = TANGIER_EXT_TIMER0_MSI, .probe = tangier_probe, }; diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 12a272582cdc..86676cec99a1 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -183,6 +183,7 @@ void __init x86_intel_mid_early_setup(void) x86_init.timers.timer_init = intel_mid_time_init; x86_init.timers.setup_percpu_clockev = x86_init_noop; + x86_init.timers.wallclock_init = intel_mid_rtc_init; x86_init.irqs.pre_vector_init = x86_init_noop; @@ -191,7 +192,6 @@ void __init x86_intel_mid_early_setup(void) x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; x86_platform.calibrate_tsc = intel_mid_calibrate_tsc; - x86_init.timers.wallclock_init = intel_mid_rtc_init; x86_platform.get_nmi_reason = intel_mid_get_nmi_reason; x86_init.pci.init = intel_mid_pci_init; diff --git a/arch/x86/platform/intel-mid/pwr.c b/arch/x86/platform/intel-mid/pwr.c index ef03852ea6e8..49ec5b94c71f 100644 --- a/arch/x86/platform/intel-mid/pwr.c +++ b/arch/x86/platform/intel-mid/pwr.c @@ -444,7 +444,7 @@ static int mid_set_initial_state(struct mid_pwr *pwr, const u32 *states) static int pnw_set_initial_state(struct mid_pwr *pwr) { /* On Penwell SRAM must stay powered on */ - const u32 states[] = { + static const u32 states[] = { 0xf00fffff, /* PM_SSC(0) */ 0xffffffff, /* PM_SSC(1) */ 0xffffffff, /* PM_SSC(2) */ @@ -455,7 +455,7 @@ static int pnw_set_initial_state(struct mid_pwr *pwr) static int tng_set_initial_state(struct mid_pwr *pwr) { - const u32 states[] = { + static const u32 states[] = { 0xffffffff, /* PM_SSC(0) */ 0xffffffff, /* PM_SSC(1) */ 0xffffffff, /* PM_SSC(2) */ diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 78459a6d455a..4d68d59f457d 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -181,6 +181,7 @@ static void fix_processor_context(void) #endif load_TR_desc(); /* This does ltr */ load_mm_ldt(current->active_mm); /* This does lldt */ + initialize_tlbstate_and_flush(); fpu__resume_cpu(); diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index cd4be19c36dc..1f71980fc5e0 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -1,6 +1,7 @@ #include <linux/io.h> #include <linux/slab.h> #include <linux/memblock.h> +#include <linux/mem_encrypt.h> #include <asm/set_memory.h> #include <asm/pgtable.h> @@ -59,6 +60,13 @@ static void __init setup_real_mode(void) base = (unsigned char *)real_mode_header; + /* + * If SME is active, the trampoline area will need to be in + * decrypted memory in order to bring up other processors + * successfully. + */ + set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT); + memcpy(base, real_mode_blob, size); phys_base = __pa(base); @@ -100,6 +108,10 @@ static void __init setup_real_mode(void) trampoline_cr4_features = &trampoline_header->cr4; *trampoline_cr4_features = mmu_cr4_features; + trampoline_header->flags = 0; + if (sme_active()) + trampoline_header->flags |= TH_FLAGS_SME_ACTIVE; + trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); trampoline_pgd[0] = trampoline_pgd_entry.pgd; trampoline_pgd[511] = init_top_pgt[511].pgd; diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index dac7b20d2f9d..614fd7064d0a 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -30,6 +30,7 @@ #include <asm/msr.h> #include <asm/segment.h> #include <asm/processor-flags.h> +#include <asm/realmode.h> #include "realmode.h" .text @@ -92,6 +93,28 @@ ENTRY(startup_32) movl %edx, %fs movl %edx, %gs + /* + * Check for memory encryption support. This is a safety net in + * case BIOS hasn't done the necessary step of setting the bit in + * the MSR for this AP. If SME is active and we've gotten this far + * then it is safe for us to set the MSR bit and continue. If we + * don't we'll eventually crash trying to execute encrypted + * instructions. + */ + bt $TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags + jnc .Ldone + movl $MSR_K8_SYSCFG, %ecx + rdmsr + bts $MSR_K8_SYSCFG_MEM_ENCRYPT_BIT, %eax + jc .Ldone + + /* + * Memory encryption is enabled but the SME enable bit for this + * CPU has has not been set. It is safe to set it, so do so. + */ + wrmsr +.Ldone: + movl pa_tr_cr4, %eax movl %eax, %cr4 # Enable PAE mode @@ -147,6 +170,7 @@ GLOBAL(trampoline_header) tr_start: .space 8 GLOBAL(tr_efer) .space 8 GLOBAL(tr_cr4) .space 4 + GLOBAL(tr_flags) .space 4 END(trampoline_header) #include "trampoline_common.S" diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 027987638e98..1ecd419811a2 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -17,6 +17,9 @@ config XEN_PV bool "Xen PV guest support" default y depends on XEN + # XEN_PV is not ready to work with 5-level paging. + # Changes to hypervisor are also required. + depends on !X86_5LEVEL select XEN_HAVE_PVMMU select XEN_HAVE_VPMU help @@ -75,4 +78,6 @@ config XEN_DEBUG_FS config XEN_PVH bool "Support for running as a PVH guest" depends on XEN && XEN_PVHVM && ACPI + # Pre-built page tables are not ready to handle 5-level paging. + depends on !X86_5LEVEL def_bool n diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 811e4ddb3f37..ae2a2e2d6362 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -263,6 +263,13 @@ static void __init xen_init_capabilities(void) setup_clear_cpu_cap(X86_FEATURE_MTRR); setup_clear_cpu_cap(X86_FEATURE_ACC); setup_clear_cpu_cap(X86_FEATURE_X2APIC); + setup_clear_cpu_cap(X86_FEATURE_SME); + + /* + * Xen PV would need some work to support PCID: CR3 handling as well + * as xen_flush_tlb_others() would need updating. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); if (!xen_initial_domain()) setup_clear_cpu_cap(X86_FEATURE_ACPI); @@ -494,7 +501,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) static inline bool desc_equal(const struct desc_struct *d1, const struct desc_struct *d2) { - return d1->a == d2->a && d1->b == d2->b; + return !memcmp(d1, d2, sizeof(*d1)); } static void load_TLS_descriptor(struct thread_struct *t, @@ -579,59 +586,91 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, preempt_enable(); } +#ifdef CONFIG_X86_64 +struct trap_array_entry { + void (*orig)(void); + void (*xen)(void); + bool ist_okay; +}; + +static struct trap_array_entry trap_array[] = { + { debug, xen_xendebug, true }, + { int3, xen_xenint3, true }, + { double_fault, xen_double_fault, true }, +#ifdef CONFIG_X86_MCE + { machine_check, xen_machine_check, true }, +#endif + { nmi, xen_nmi, true }, + { overflow, xen_overflow, false }, +#ifdef CONFIG_IA32_EMULATION + { entry_INT80_compat, xen_entry_INT80_compat, false }, +#endif + { page_fault, xen_page_fault, false }, + { divide_error, xen_divide_error, false }, + { bounds, xen_bounds, false }, + { invalid_op, xen_invalid_op, false }, + { device_not_available, xen_device_not_available, false }, + { coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false }, + { invalid_TSS, xen_invalid_TSS, false }, + { segment_not_present, xen_segment_not_present, false }, + { stack_segment, xen_stack_segment, false }, + { general_protection, xen_general_protection, false }, + { spurious_interrupt_bug, xen_spurious_interrupt_bug, false }, + { coprocessor_error, xen_coprocessor_error, false }, + { alignment_check, xen_alignment_check, false }, + { simd_coprocessor_error, xen_simd_coprocessor_error, false }, +}; + +static bool get_trap_addr(void **addr, unsigned int ist) +{ + unsigned int nr; + bool ist_okay = false; + + /* + * Replace trap handler addresses by Xen specific ones. + * Check for known traps using IST and whitelist them. + * The debugger ones are the only ones we care about. + * Xen will handle faults like double_fault, * so we should never see + * them. Warn if there's an unexpected IST-using fault handler. + */ + for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) { + struct trap_array_entry *entry = trap_array + nr; + + if (*addr == entry->orig) { + *addr = entry->xen; + ist_okay = entry->ist_okay; + break; + } + } + + if (WARN_ON(ist != 0 && !ist_okay)) + return false; + + return true; +} +#endif + static int cvt_gate_to_trap(int vector, const gate_desc *val, struct trap_info *info) { unsigned long addr; - if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) + if (val->bits.type != GATE_TRAP && val->bits.type != GATE_INTERRUPT) return 0; info->vector = vector; - addr = gate_offset(*val); + addr = gate_offset(val); #ifdef CONFIG_X86_64 - /* - * Look for known traps using IST, and substitute them - * appropriately. The debugger ones are the only ones we care - * about. Xen will handle faults like double_fault, - * so we should never see them. Warn if - * there's an unexpected IST-using fault handler. - */ - if (addr == (unsigned long)debug) - addr = (unsigned long)xen_debug; - else if (addr == (unsigned long)int3) - addr = (unsigned long)xen_int3; - else if (addr == (unsigned long)stack_segment) - addr = (unsigned long)xen_stack_segment; - else if (addr == (unsigned long)double_fault) { - /* Don't need to handle these */ + if (!get_trap_addr((void **)&addr, val->bits.ist)) return 0; -#ifdef CONFIG_X86_MCE - } else if (addr == (unsigned long)machine_check) { - /* - * when xen hypervisor inject vMCE to guest, - * use native mce handler to handle it - */ - ; -#endif - } else if (addr == (unsigned long)nmi) - /* - * Use the native version as well. - */ - ; - else { - /* Some other trap using IST? */ - if (WARN_ON(val->ist != 0)) - return 0; - } #endif /* CONFIG_X86_64 */ info->address = addr; - info->cs = gate_segment(*val); - info->flags = val->dpl; + info->cs = gate_segment(val); + info->flags = val->bits.dpl; /* interrupt gates clear IF */ - if (val->type == GATE_INTERRUPT) + if (val->bits.type == GATE_INTERRUPT) info->flags |= 1 << 2; return 1; @@ -981,59 +1020,6 @@ void __ref xen_setup_vcpu_info_placement(void) } } -static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, - unsigned long addr, unsigned len) -{ - char *start, *end, *reloc; - unsigned ret; - - start = end = reloc = NULL; - -#define SITE(op, x) \ - case PARAVIRT_PATCH(op.x): \ - if (xen_have_vcpu_info_placement) { \ - start = (char *)xen_##x##_direct; \ - end = xen_##x##_direct_end; \ - reloc = xen_##x##_direct_reloc; \ - } \ - goto patch_site - - switch (type) { - SITE(pv_irq_ops, irq_enable); - SITE(pv_irq_ops, irq_disable); - SITE(pv_irq_ops, save_fl); - SITE(pv_irq_ops, restore_fl); -#undef SITE - - patch_site: - if (start == NULL || (end-start) > len) - goto default_patch; - - ret = paravirt_patch_insns(insnbuf, len, start, end); - - /* Note: because reloc is assigned from something that - appears to be an array, gcc assumes it's non-null, - but doesn't know its relationship with start and - end. */ - if (reloc > start && reloc < end) { - int reloc_off = reloc - start; - long *relocp = (long *)(insnbuf + reloc_off); - long delta = start - (char *)addr; - - *relocp += delta; - } - break; - - default_patch: - default: - ret = paravirt_patch_default(type, clobbers, insnbuf, - addr, len); - break; - } - - return ret; -} - static const struct pv_info xen_info __initconst = { .shared_kernel_pmd = 0, @@ -1043,10 +1029,6 @@ static const struct pv_info xen_info __initconst = { .name = "Xen", }; -static const struct pv_init_ops xen_init_ops __initconst = { - .patch = xen_patch, -}; - static const struct pv_cpu_ops xen_cpu_ops __initconst = { .cpuid = xen_cpuid, @@ -1244,7 +1226,7 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - pv_init_ops = xen_init_ops; + pv_init_ops.patch = paravirt_patch_default; pv_cpu_ops = xen_cpu_ops; x86_platform.get_nmi_reason = xen_get_nmi_reason; diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 33e92955e09d..d4eff5676cfa 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -123,9 +123,6 @@ static const struct pv_irq_ops xen_irq_ops __initconst = { .safe_halt = xen_safe_halt, .halt = xen_halt, -#ifdef CONFIG_X86_64 - .adjust_exception_frame = xen_adjust_exception_frame, -#endif }; void __init xen_init_irq_ops(void) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3be06f3caf3c..3e15345abfe7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -84,7 +84,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, else rmd->mfn++; - rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; + rmd->mmu_update->ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; rmd->mmu_update->val = pte_val_ma(pte); rmd->mmu_update++; diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index cab28cf2cffb..6b983b300666 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -162,26 +162,6 @@ static bool xen_page_pinned(void *ptr) return PagePinned(page); } -void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) -{ - struct multicall_space mcs; - struct mmu_update *u; - - trace_xen_mmu_set_domain_pte(ptep, pteval, domid); - - mcs = xen_mc_entry(sizeof(*u)); - u = mcs.args; - - /* ptep might be kmapped when using 32-bit HIGHPTE */ - u->ptr = virt_to_machine(ptep).maddr; - u->val = pte_val_ma(pteval); - - MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); - - xen_mc_issue(PARAVIRT_LAZY_MMU); -} -EXPORT_SYMBOL_GPL(xen_set_domain_pte); - static void xen_extend_mmu_update(const struct mmu_update *update) { struct multicall_space mcs; @@ -1005,14 +985,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm) /* Get the "official" set of cpus referring to our pagetable. */ if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { for_each_online_cpu(cpu) { - if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) - && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) + if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) continue; smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1); } return; } - cpumask_copy(mask, mm_cpumask(mm)); /* * It's possible that a vcpu may have a stale reference to our @@ -1021,6 +999,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm) * look at its actual current cr3 value, and force it to flush * if needed. */ + cpumask_clear(mask); for_each_online_cpu(cpu) { if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) cpumask_set_cpu(cpu, mask); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 276da636dd39..6083ba462f35 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -212,8 +212,7 @@ void __ref xen_build_mfn_list_list(void) unsigned int level, topidx, mididx; unsigned long *mid_mfn_p; - if (xen_feature(XENFEAT_auto_translated_physmap) || - xen_start_info->flags & SIF_VIRT_P2M_4TOOLS) + if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS) return; /* Pre-initialize p2m_top_mfn to be completely missing */ @@ -269,9 +268,6 @@ void __ref xen_build_mfn_list_list(void) void xen_setup_mfn_list_list(void) { - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS) @@ -291,9 +287,6 @@ void __init xen_build_dynamic_phys_to_machine(void) { unsigned long pfn; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - xen_p2m_addr = (unsigned long *)xen_start_info->mfn_list; xen_p2m_size = ALIGN(xen_start_info->nr_pages, P2M_PER_PAGE); @@ -540,9 +533,6 @@ int xen_alloc_p2m_entry(unsigned long pfn) unsigned long addr = (unsigned long)(xen_p2m_addr + pfn); unsigned long p2m_pfn; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return 0; - ptep = lookup_address(addr, &level); BUG_ON(!ptep || level != PG_LEVEL_4K); pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); @@ -640,9 +630,6 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, if (unlikely(pfn_s >= xen_p2m_size)) return 0; - if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) - return pfn_e - pfn_s; - if (pfn_s > pfn_e) return 0; @@ -660,10 +647,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) pte_t *ptep; unsigned int level; - /* don't track P2M changes in autotranslate guests */ - if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) - return true; - if (unlikely(pfn >= xen_p2m_size)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; @@ -711,9 +694,6 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, int i, ret = 0; pte_t *pte; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return 0; - if (kmap_ops) { ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, kmap_ops, count); @@ -756,9 +736,6 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, { int i, ret = 0; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return 0; - for (i = 0; i < count; i++) { unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i])); unsigned long pfn = page_to_pfn(pages[i]); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index c81046323ebc..ac55c02f98e9 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -340,8 +340,6 @@ static void __init xen_do_set_identity_and_remap_chunk( WARN_ON(size == 0); - BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); - mfn_save = virt_to_mfn(buf); for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn; @@ -1024,8 +1022,7 @@ void __init xen_pvmmu_arch_setup(void) void __init xen_arch_setup(void) { xen_panic_handler_init(); - if (!xen_feature(XENFEAT_auto_translated_physmap)) - xen_pvmmu_arch_setup(); + xen_pvmmu_arch_setup(); #ifdef CONFIG_ACPI if (!(xen_start_info->flags & SIF_INITDOMAIN)) { diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index eff224df813f..dcd31fa39b5d 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -1,14 +1,8 @@ /* - * Asm versions of Xen pv-ops, suitable for either direct use or - * inlining. The inline versions are the same as the direct-use - * versions, with the pre- and post-amble chopped off. - * - * This code is encoded for size rather than absolute efficiency, with - * a view to being able to inline as much as possible. + * Asm versions of Xen pv-ops, suitable for direct use. * * We only bother with direct forms (ie, vcpu in percpu data) of the - * operations here; the indirect forms are better handled in C, since - * they're generally too large to inline anyway. + * operations here; the indirect forms are better handled in C. */ #include <asm/asm-offsets.h> @@ -16,7 +10,7 @@ #include <asm/processor-flags.h> #include <asm/frame.h> -#include "xen-asm.h" +#include <linux/linkage.h> /* * Enable events. This clears the event mask and tests the pending @@ -38,13 +32,11 @@ ENTRY(xen_irq_enable_direct) testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending jz 1f -2: call check_events + call check_events 1: -ENDPATCH(xen_irq_enable_direct) FRAME_END ret ENDPROC(xen_irq_enable_direct) - RELOC(xen_irq_enable_direct, 2b+1) /* @@ -53,10 +45,8 @@ ENDPATCH(xen_irq_enable_direct) */ ENTRY(xen_irq_disable_direct) movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask -ENDPATCH(xen_irq_disable_direct) ret - ENDPROC(xen_irq_disable_direct) - RELOC(xen_irq_disable_direct, 0) +ENDPROC(xen_irq_disable_direct) /* * (xen_)save_fl is used to get the current interrupt enable status. @@ -71,10 +61,8 @@ ENTRY(xen_save_fl_direct) testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask setz %ah addb %ah, %ah -ENDPATCH(xen_save_fl_direct) ret ENDPROC(xen_save_fl_direct) - RELOC(xen_save_fl_direct, 0) /* @@ -101,13 +89,11 @@ ENTRY(xen_restore_fl_direct) /* check for unmasked and pending */ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending jnz 1f -2: call check_events + call check_events 1: -ENDPATCH(xen_restore_fl_direct) FRAME_END ret ENDPROC(xen_restore_fl_direct) - RELOC(xen_restore_fl_direct, 2b+1) /* diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h deleted file mode 100644 index 465276467a47..000000000000 --- a/arch/x86/xen/xen-asm.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _XEN_XEN_ASM_H -#define _XEN_XEN_ASM_H - -#include <linux/linkage.h> - -#define RELOC(x, v) .globl x##_reloc; x##_reloc=v -#define ENDPATCH(x) .globl x##_end; x##_end=. - -/* Pseudo-flag used for virtual NMI, which we don't implement yet */ -#define XEN_EFLAGS_NMI 0x80000000 - -#endif diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index feb6d40a0860..1200e262a116 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -1,14 +1,8 @@ /* - * Asm versions of Xen pv-ops, suitable for either direct use or - * inlining. The inline versions are the same as the direct-use - * versions, with the pre- and post-amble chopped off. - * - * This code is encoded for size rather than absolute efficiency, with - * a view to being able to inline as much as possible. + * Asm versions of Xen pv-ops, suitable for direct use. * * We only bother with direct forms (ie, vcpu in pda) of the - * operations here; the indirect forms are better handled in C, since - * they're generally too large to inline anyway. + * operations here; the indirect forms are better handled in C. */ #include <asm/thread_info.h> @@ -18,21 +12,10 @@ #include <xen/interface/xen.h> -#include "xen-asm.h" +#include <linux/linkage.h> -/* - * Force an event check by making a hypercall, but preserve regs - * before making the call. - */ -check_events: - push %eax - push %ecx - push %edx - call xen_force_evtchn_callback - pop %edx - pop %ecx - pop %eax - ret +/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +#define XEN_EFLAGS_NMI 0x80000000 /* * This is run where a normal iret would be run, with the same stack setup: diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index c3df43141e70..dae2cc33afb5 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -1,14 +1,8 @@ /* - * Asm versions of Xen pv-ops, suitable for either direct use or - * inlining. The inline versions are the same as the direct-use - * versions, with the pre- and post-amble chopped off. - * - * This code is encoded for size rather than absolute efficiency, with - * a view to being able to inline as much as possible. + * Asm versions of Xen pv-ops, suitable for direct use. * * We only bother with direct forms (ie, vcpu in pda) of the - * operations here; the indirect forms are better handled in C, since - * they're generally too large to inline anyway. + * operations here; the indirect forms are better handled in C. */ #include <asm/errno.h> @@ -20,13 +14,44 @@ #include <xen/interface/xen.h> -#include "xen-asm.h" +#include <linux/linkage.h> + +.macro xen_pv_trap name +ENTRY(xen_\name) + pop %rcx + pop %r11 + jmp \name +END(xen_\name) +.endm -ENTRY(xen_adjust_exception_frame) - mov 8+0(%rsp), %rcx - mov 8+8(%rsp), %r11 - ret $16 -ENDPROC(xen_adjust_exception_frame) +xen_pv_trap divide_error +xen_pv_trap debug +xen_pv_trap xendebug +xen_pv_trap int3 +xen_pv_trap xenint3 +xen_pv_trap nmi +xen_pv_trap overflow +xen_pv_trap bounds +xen_pv_trap invalid_op +xen_pv_trap device_not_available +xen_pv_trap double_fault +xen_pv_trap coprocessor_segment_overrun +xen_pv_trap invalid_TSS +xen_pv_trap segment_not_present +xen_pv_trap stack_segment +xen_pv_trap general_protection +xen_pv_trap page_fault +xen_pv_trap spurious_interrupt_bug +xen_pv_trap coprocessor_error +xen_pv_trap alignment_check +#ifdef CONFIG_X86_MCE +xen_pv_trap machine_check +#endif /* CONFIG_X86_MCE */ +xen_pv_trap simd_coprocessor_error +#ifdef CONFIG_IA32_EMULATION +xen_pv_trap entry_INT80_compat +#endif +xen_pv_trap hypervisor_callback hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 /* @@ -46,9 +71,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 */ ENTRY(xen_iret) pushq $0 -1: jmp hypercall_iret -ENDPATCH(xen_iret) -RELOC(xen_iret, 1b+1) + jmp hypercall_iret ENTRY(xen_sysret64) /* @@ -65,9 +88,7 @@ ENTRY(xen_sysret64) pushq %rcx pushq $VGCF_in_syscall -1: jmp hypercall_iret -ENDPATCH(xen_sysret64) -RELOC(xen_sysret64, 1b+1) + jmp hypercall_iret /* * Xen handles syscall callbacks much like ordinary exceptions, which @@ -82,34 +103,47 @@ RELOC(xen_sysret64, 1b+1) * rip * r11 * rsp->rcx - * - * In all the entrypoints, we undo all that to make it look like a - * CPU-generated syscall/sysenter and jump to the normal entrypoint. */ -.macro undo_xen_syscall - mov 0*8(%rsp), %rcx - mov 1*8(%rsp), %r11 - mov 5*8(%rsp), %rsp -.endm - /* Normal 64-bit system call target */ ENTRY(xen_syscall_target) - undo_xen_syscall - jmp entry_SYSCALL_64_after_swapgs + popq %rcx + popq %r11 + + /* + * Neither Xen nor the kernel really knows what the old SS and + * CS were. The kernel expects __USER_DS and __USER_CS, so + * report those values even though Xen will guess its own values. + */ + movq $__USER_DS, 4*8(%rsp) + movq $__USER_CS, 1*8(%rsp) + + jmp entry_SYSCALL_64_after_hwframe ENDPROC(xen_syscall_target) #ifdef CONFIG_IA32_EMULATION /* 32-bit compat syscall target */ ENTRY(xen_syscall32_target) - undo_xen_syscall - jmp entry_SYSCALL_compat + popq %rcx + popq %r11 + + /* + * Neither Xen nor the kernel really knows what the old SS and + * CS were. The kernel expects __USER32_DS and __USER32_CS, so + * report those values even though Xen will guess its own values. + */ + movq $__USER32_DS, 4*8(%rsp) + movq $__USER32_CS, 1*8(%rsp) + + jmp entry_SYSCALL_compat_after_hwframe ENDPROC(xen_syscall32_target) /* 32-bit compat sysenter target */ ENTRY(xen_sysenter_target) - undo_xen_syscall + mov 0*8(%rsp), %rcx + mov 1*8(%rsp), %r11 + mov 5*8(%rsp), %rsp jmp entry_SYSENTER_compat ENDPROC(xen_sysenter_target) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 72a8e6adebe6..a7525e95d53f 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -58,7 +58,7 @@ ENTRY(hypercall_page) #else ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) /* Map the p2m table to a 512GB-aligned user address. */ - ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE) + ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad (PUD_SIZE * PTRS_PER_PUD)) #endif #ifdef CONFIG_XEN_PV ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 0d5004477db6..c8a6d224f7ed 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -129,23 +129,15 @@ static inline void __init xen_efi_init(void) } #endif -/* Declare an asm function, along with symbols needed to make it - inlineable */ -#define DECL_ASM(ret, name, ...) \ - __visible ret name(__VA_ARGS__); \ - extern char name##_end[] __visible; \ - extern char name##_reloc[] __visible - -DECL_ASM(void, xen_irq_enable_direct, void); -DECL_ASM(void, xen_irq_disable_direct, void); -DECL_ASM(unsigned long, xen_save_fl_direct, void); -DECL_ASM(void, xen_restore_fl_direct, unsigned long); +__visible void xen_irq_enable_direct(void); +__visible void xen_irq_disable_direct(void); +__visible unsigned long xen_save_fl_direct(void); +__visible void xen_restore_fl_direct(unsigned long); /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); __visible void xen_sysret32(void); __visible void xen_sysret64(void); -__visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); diff --git a/arch/xtensa/include/asm/futex.h b/arch/xtensa/include/asm/futex.h index b39531babec0..eaaf1ebcc7a4 100644 --- a/arch/xtensa/include/asm/futex.h +++ b/arch/xtensa/include/asm/futex.h @@ -44,18 +44,10 @@ : "r" (uaddr), "I" (-EFAULT), "r" (oparg) \ : "memory") -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; #if !XCHAL_HAVE_S32C1I return -ENOSYS; @@ -89,19 +81,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (ret) - return ret; + if (!ret) + *oval = oldval; - switch (cmp) { - case FUTEX_OP_CMP_EQ: return (oldval == cmparg); - case FUTEX_OP_CMP_NE: return (oldval != cmparg); - case FUTEX_OP_CMP_LT: return (oldval < cmparg); - case FUTEX_OP_CMP_GE: return (oldval >= cmparg); - case FUTEX_OP_CMP_LE: return (oldval <= cmparg); - case FUTEX_OP_CMP_GT: return (oldval > cmparg); - } - - return -ENOSYS; + return ret; } static inline int diff --git a/arch/xtensa/include/asm/spinlock.h b/arch/xtensa/include/asm/spinlock.h index a36221cf6363..3bb49681ee24 100644 --- a/arch/xtensa/include/asm/spinlock.h +++ b/arch/xtensa/include/asm/spinlock.h @@ -33,11 +33,6 @@ #define arch_spin_is_locked(x) ((x)->slock != 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, !VAL); -} - #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) static inline void arch_spin_lock(arch_spinlock_t *lock) diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 24365b30aae9..b15b278aa314 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -103,20 +103,12 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 -/* - * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. - * This gives us 6 bits, which is enough until someone invents 128 bit address - * spaces. - * - * Assume these are all power of twos. - * When 0 use the default page size. - */ -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f - #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index 3eed2761c149..220059999e74 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -113,4 +113,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _XTENSA_SOCKET_H */ diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c index 33bfa5270d95..08175df7a69e 100644 --- a/arch/xtensa/kernel/setup.c +++ b/arch/xtensa/kernel/setup.c @@ -273,8 +273,8 @@ void __init init_arch(bp_tag_t *bp_start) * Initialize system. Setup memory and reserve regions. */ -extern char _end; -extern char _stext; +extern char _end[]; +extern char _stext[]; extern char _WindowVectors_text_start; extern char _WindowVectors_text_end; extern char _DebugInterruptVector_literal_start; @@ -333,7 +333,7 @@ void __init setup_arch(char **cmdline_p) } #endif - mem_reserve(__pa(&_stext), __pa(&_end)); + mem_reserve(__pa(_stext), __pa(_end)); #ifdef CONFIG_VECTORS_OFFSET mem_reserve(__pa(&_WindowVectors_text_start), |