diff options
Diffstat (limited to 'arch')
523 files changed, 12865 insertions, 9454 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 21d0089117fe..2520ca5b42eb 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -931,6 +931,18 @@ config STRICT_MODULE_RWX config ARCH_WANT_RELAX_ORDER bool +config ARCH_HAS_REFCOUNT + bool + help + An architecture selects this when it has implemented refcount_t + using open coded assembly primitives that provide an optimized + refcount_t implementation, possibly at the expense of some full + refcount state checks of CONFIG_REFCOUNT_FULL=y. + + The refcount overflow check behavior, however, must be retained. + Catching overflows is the primary security concern for protecting + against bugs in reference counts. + config REFCOUNT_FULL bool "Perform full reference count validation at the expense of speed" help diff --git a/arch/alpha/defconfig b/arch/alpha/defconfig index 539e8b5a6cbd..f4ec420d7f2d 100644 --- a/arch/alpha/defconfig +++ b/arch/alpha/defconfig @@ -19,7 +19,6 @@ CONFIG_INET_AH=m CONFIG_INET_ESP=m # CONFIG_IPV6 is not set CONFIG_NETFILTER=y -CONFIG_IP_NF_QUEUE=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_FILTER=m CONFIG_VLAN_8021Q=m @@ -57,7 +56,6 @@ CONFIG_SERIAL_8250_CONSOLE=y CONFIG_RTC=y CONFIG_EXT2_FS=y CONFIG_REISERFS_FS=m -CONFIG_AUTOFS_FS=m CONFIG_ISO9660_FS=y CONFIG_MSDOS_FS=y CONFIG_VFAT_FS=y diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index d103db5af5ff..5b974ab8425c 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -3,6 +3,7 @@ generic-y += clkdev.h generic-y += exec.h generic-y += export.h +generic-y += fb.h generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h diff --git a/arch/alpha/include/asm/asm-prototypes.h b/arch/alpha/include/asm/asm-prototypes.h new file mode 100644 index 000000000000..d12c68ea340b --- /dev/null +++ b/arch/alpha/include/asm/asm-prototypes.h @@ -0,0 +1,18 @@ +#include <linux/spinlock.h> + +#include <asm/checksum.h> +#include <asm/console.h> +#include <asm/page.h> +#include <asm/string.h> +#include <asm/uaccess.h> + +#include <asm-generic/asm-prototypes.h> + +extern void __divl(void); +extern void __reml(void); +extern void __divq(void); +extern void __remq(void); +extern void __divlu(void); +extern void __remlu(void); +extern void __divqu(void); +extern void __remqu(void); diff --git a/arch/alpha/include/asm/core_marvel.h b/arch/alpha/include/asm/core_marvel.h index dad300fa14ce..8dcf9dbda618 100644 --- a/arch/alpha/include/asm/core_marvel.h +++ b/arch/alpha/include/asm/core_marvel.h @@ -312,7 +312,7 @@ struct io7 { io7_port7_csrs *csrs; struct io7_port ports[IO7_NUM_PORTS]; - spinlock_t irq_lock; + raw_spinlock_t irq_lock; }; #ifndef __EXTERN_INLINE diff --git a/arch/alpha/include/asm/fb.h b/arch/alpha/include/asm/fb.h deleted file mode 100644 index fa9bbb96b2b3..000000000000 --- a/arch/alpha/include/asm/fb.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _ASM_FB_H_ -#define _ASM_FB_H_ -#include <linux/device.h> - -/* Caching is off in the I/O space quadrant by design. */ -#define fb_pgprotect(...) do {} while (0) - -static inline int fb_is_primary_device(struct fb_info *info) -{ - return 0; -} - -#endif /* _ASM_FB_H_ */ diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h index fb01dfb760c2..05a70edd57b6 100644 --- a/arch/alpha/include/asm/futex.h +++ b/arch/alpha/include/asm/futex.h @@ -25,18 +25,10 @@ : "r" (uaddr), "r"(oparg) \ : "memory") -static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -62,17 +54,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/alpha/include/asm/spinlock.h b/arch/alpha/include/asm/spinlock.h index a40b9fc0c6c3..718ac0b64adf 100644 --- a/arch/alpha/include/asm/spinlock.h +++ b/arch/alpha/include/asm/spinlock.h @@ -16,11 +16,6 @@ #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) #define arch_spin_is_locked(x) ((x)->lock != 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - static inline int arch_spin_value_unlocked(arch_spinlock_t lock) { return lock.lock == 0; diff --git a/arch/alpha/include/uapi/asm/unistd.h b/arch/alpha/include/uapi/asm/unistd.h index a2945fea6c86..53de540e39a7 100644 --- a/arch/alpha/include/uapi/asm/unistd.h +++ b/arch/alpha/include/uapi/asm/unistd.h @@ -366,11 +366,6 @@ #define __NR_epoll_create 407 #define __NR_epoll_ctl 408 #define __NR_epoll_wait 409 -/* Feb 2007: These three sys_epoll defines shouldn't be here but culling - * them would break userspace apps ... we'll kill them off in 2010 :) */ -#define __NR_sys_epoll_create __NR_epoll_create -#define __NR_sys_epoll_ctl __NR_epoll_ctl -#define __NR_sys_epoll_wait __NR_epoll_wait #define __NR_remap_file_pages 410 #define __NR_set_tid_address 411 #define __NR_restart_syscall 412 diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c index 03ff832b1cb4..b10c316475dd 100644 --- a/arch/alpha/kernel/core_marvel.c +++ b/arch/alpha/kernel/core_marvel.c @@ -118,7 +118,7 @@ alloc_io7(unsigned int pe) io7 = alloc_bootmem(sizeof(*io7)); io7->pe = pe; - spin_lock_init(&io7->irq_lock); + raw_spin_lock_init(&io7->irq_lock); for (h = 0; h < 4; h++) { io7->ports[h].io7 = io7; diff --git a/arch/alpha/kernel/pci-noop.c b/arch/alpha/kernel/pci-noop.c index ffbdb3fb672f..676bab6e3123 100644 --- a/arch/alpha/kernel/pci-noop.c +++ b/arch/alpha/kernel/pci-noop.c @@ -42,11 +42,7 @@ alloc_pci_controller(void) struct resource * __init alloc_resource(void) { - struct resource *res; - - res = alloc_bootmem(sizeof(*res)); - - return res; + return alloc_bootmem(sizeof(struct resource)); } asmlinkage long diff --git a/arch/alpha/kernel/pci-sysfs.c b/arch/alpha/kernel/pci-sysfs.c index 92c0d460815b..cbecd527c696 100644 --- a/arch/alpha/kernel/pci-sysfs.c +++ b/arch/alpha/kernel/pci-sysfs.c @@ -38,7 +38,7 @@ static int __pci_mmap_fits(struct pci_dev *pdev, int num, unsigned long nr, start, size; int shift = sparse ? 5 : 0; - nr = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + nr = vma_pages(vma); start = vma->vm_pgoff; size = ((pci_resource_len(pdev, num) - 1) >> (PAGE_SHIFT - shift)) + 1; @@ -64,8 +64,7 @@ static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, struct vm_area_struct *vma, int sparse) { - struct pci_dev *pdev = to_pci_dev(container_of(kobj, - struct device, kobj)); + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); struct resource *res = attr->private; enum pci_mmap_state mmap_type; struct pci_bus_region bar; @@ -255,7 +254,7 @@ static int __legacy_mmap_fits(struct pci_controller *hose, { unsigned long nr, start, size; - nr = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + nr = vma_pages(vma); start = vma->vm_pgoff; size = ((res_size - 1) >> PAGE_SHIFT) + 1; diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c index 5f387ee5b5c5..8322df174bbf 100644 --- a/arch/alpha/kernel/pci.c +++ b/arch/alpha/kernel/pci.c @@ -379,11 +379,7 @@ alloc_pci_controller(void) struct resource * __init alloc_resource(void) { - struct resource *res; - - res = alloc_bootmem(sizeof(*res)); - - return res; + return alloc_bootmem(sizeof(struct resource)); } diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 491e6a604e82..249229ab4942 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -1094,8 +1094,9 @@ get_sysnames(unsigned long type, unsigned long variation, unsigned long cpu, default: /* default to variation "0" for now */ break; case ST_DEC_EB164: - if (member < ARRAY_SIZE(eb164_indices)) - *variation_name = eb164_names[eb164_indices[member]]; + if (member >= ARRAY_SIZE(eb164_indices)) + break; + *variation_name = eb164_names[eb164_indices[member]]; /* PC164 may show as EB164 variation, but with EV56 CPU, so, since no true EB164 had anything but EV5... */ if (eb164_indices[member] == 0 && cpu == EV56_CPU) diff --git a/arch/alpha/kernel/smc37c669.c b/arch/alpha/kernel/smc37c669.c index c803fc76ae4f..4dbd4e415041 100644 --- a/arch/alpha/kernel/smc37c669.c +++ b/arch/alpha/kernel/smc37c669.c @@ -2007,11 +2007,8 @@ static void __init SMC37c669_config_mode( static unsigned char __init SMC37c669_read_config( unsigned char index ) { - unsigned char data; - - wb( &SMC37c669->index_port, index ); - data = rb( &SMC37c669->data_port ); - return data; + wb(&SMC37c669->index_port, index); + return rb(&SMC37c669->data_port); } /* diff --git a/arch/alpha/kernel/sys_marvel.c b/arch/alpha/kernel/sys_marvel.c index 24e41bd7d3c9..3e533920371f 100644 --- a/arch/alpha/kernel/sys_marvel.c +++ b/arch/alpha/kernel/sys_marvel.c @@ -115,11 +115,11 @@ io7_enable_irq(struct irq_data *d) return; } - spin_lock(&io7->irq_lock); + raw_spin_lock(&io7->irq_lock); *ctl |= 1UL << 24; mb(); *ctl; - spin_unlock(&io7->irq_lock); + raw_spin_unlock(&io7->irq_lock); } static void @@ -136,11 +136,11 @@ io7_disable_irq(struct irq_data *d) return; } - spin_lock(&io7->irq_lock); + raw_spin_lock(&io7->irq_lock); *ctl &= ~(1UL << 24); mb(); *ctl; - spin_unlock(&io7->irq_lock); + raw_spin_unlock(&io7->irq_lock); } static void @@ -263,7 +263,7 @@ init_io7_irqs(struct io7 *io7, */ printk(" Interrupts reported to CPU at PE %u\n", boot_cpuid); - spin_lock(&io7->irq_lock); + raw_spin_lock(&io7->irq_lock); /* set up the error irqs */ io7_redirect_irq(io7, &io7->csrs->HLT_CTL.csr, boot_cpuid); @@ -295,7 +295,7 @@ init_io7_irqs(struct io7 *io7, for (i = 0; i < 16; ++i) init_one_io7_msi(io7, i, boot_cpuid); - spin_unlock(&io7->irq_lock); + raw_spin_unlock(&io7->irq_lock); } static void __init diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index 65bb102d985b..ddb89a18cf26 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -193,8 +193,10 @@ die_if_kernel(char * str, struct pt_regs *regs, long err, unsigned long *r9_15) static long dummy_emul(void) { return 0; } long (*alpha_fp_emul_imprecise)(struct pt_regs *regs, unsigned long writemask) = (void *)dummy_emul; +EXPORT_SYMBOL_GPL(alpha_fp_emul_imprecise); long (*alpha_fp_emul) (unsigned long pc) = (void *)dummy_emul; +EXPORT_SYMBOL_GPL(alpha_fp_emul); #else long alpha_fp_emul_imprecise(struct pt_regs *regs, unsigned long writemask); long alpha_fp_emul (unsigned long pc); diff --git a/arch/alpha/math-emu/math.c b/arch/alpha/math-emu/math.c index d17d705f6545..1c2d456da7f2 100644 --- a/arch/alpha/math-emu/math.c +++ b/arch/alpha/math-emu/math.c @@ -53,6 +53,7 @@ extern void alpha_write_fp_reg_s (unsigned long reg, unsigned long val); #ifdef MODULE MODULE_DESCRIPTION("FP Software completion module"); +MODULE_LICENSE("GPL v2"); extern long (*alpha_fp_emul_imprecise)(struct pt_regs *, unsigned long); extern long (*alpha_fp_emul) (unsigned long pc); diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index 54b54da6384c..11859287c52a 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -123,6 +123,8 @@ static inline void atomic_set(atomic_t *v, int i) atomic_ops_unlock(flags); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + #endif /* diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index 11e1b1f3acda..eb887dd13e74 100644 --- a/arch/arc/include/asm/futex.h +++ b/arch/arc/include/asm/futex.h @@ -73,20 +73,11 @@ #endif -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - #ifndef CONFIG_ARC_HAS_LLSC preempt_disable(); /* to guarantee atomic r-m-w of futex op */ #endif @@ -118,30 +109,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) preempt_enable(); #endif - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/arc/include/asm/spinlock.h b/arch/arc/include/asm/spinlock.h index 233d5ffe6ec7..a325e6a36523 100644 --- a/arch/arc/include/asm/spinlock.h +++ b/arch/arc/include/asm/spinlock.h @@ -16,11 +16,6 @@ #define arch_spin_is_locked(x) ((x)->slock != __ARCH_SPIN_LOCK_UNLOCKED__) #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, !VAL); -} - #ifdef CONFIG_ARC_HAS_LLSC static inline void arch_spin_lock(arch_spinlock_t *lock) diff --git a/arch/arm/boot/dts/imx6q-evi.dts b/arch/arm/boot/dts/imx6q-evi.dts index 1f0f950dc11e..e0aea782c666 100644 --- a/arch/arm/boot/dts/imx6q-evi.dts +++ b/arch/arm/boot/dts/imx6q-evi.dts @@ -94,6 +94,15 @@ pinctrl-names = "default"; pinctrl-0 = <&pinctrl_ecspi1 &pinctrl_ecspi1cs>; status = "okay"; + + fpga: fpga@0 { + compatible = "altr,fpga-passive-serial"; + spi-max-frequency = <20000000>; + reg = <0>; + pinctrl-0 = <&pinctrl_fpgaspi>; + nconfig-gpios = <&gpio4 9 GPIO_ACTIVE_LOW>; + nstat-gpios = <&gpio4 11 GPIO_ACTIVE_LOW>; + }; }; &ecspi3 { @@ -319,6 +328,13 @@ >; }; + pinctrl_fpgaspi: fpgaspigrp { + fsl,pins = < + MX6QDL_PAD_KEY_ROW1__GPIO4_IO09 0x1b0b0 + MX6QDL_PAD_KEY_ROW2__GPIO4_IO11 0x1b0b0 + >; + }; + pinctrl_gpminand: gpminandgrp { fsl,pins = < MX6QDL_PAD_NANDF_CLE__NAND_CLE 0xb0b1 diff --git a/arch/arm/boot/dts/imx7ulp-pinfunc.h b/arch/arm/boot/dts/imx7ulp-pinfunc.h new file mode 100644 index 000000000000..fe511775b518 --- /dev/null +++ b/arch/arm/boot/dts/imx7ulp-pinfunc.h @@ -0,0 +1,468 @@ +/* + * Copyright 2016 Freescale Semiconductor, Inc. + * Copyright 2017 NXP + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#ifndef __DTS_IMX7ULP_PINFUNC_H +#define __DTS_IMX7ULP_PINFUNC_H + +/* + * The pin function ID is a tuple of + * <mux_conf_reg input_reg mux_mode input_val> + */ + +#define IMX7ULP_PAD_PTC0__PTC0 0x0000 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC0__TRACE_D15 0x0000 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC0__LPUART4_CTS_B 0x0000 0x0244 0x4 0x1 +#define IMX7ULP_PAD_PTC0__LPI2C4_SCL 0x0000 0x0278 0x5 0x1 +#define IMX7ULP_PAD_PTC0__TPM4_CLKIN 0x0000 0x0298 0x6 0x1 +#define IMX7ULP_PAD_PTC0__FB_AD0 0x0000 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC1__PTC1 0x0004 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC1__TRACE_D14 0x0004 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC1__LPUART4_RTS_B 0x0004 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTC1__LPI2C4_SDA 0x0004 0x027c 0x5 0x1 +#define IMX7ULP_PAD_PTC1__TPM4_CH0 0x0004 0x0280 0x6 0x1 +#define IMX7ULP_PAD_PTC1__FB_AD1 0x0004 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC2__PTC2 0x0008 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC2__TRACE_D13 0x0008 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC2__LPUART4_TX 0x0008 0x024c 0x4 0x1 +#define IMX7ULP_PAD_PTC2__LPI2C4_HREQ 0x0008 0x0274 0x5 0x1 +#define IMX7ULP_PAD_PTC2__TPM4_CH1 0x0008 0x0284 0x6 0x1 +#define IMX7ULP_PAD_PTC2__FB_AD2 0x0008 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC3__PTC3 0x000c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC3__TRACE_D12 0x000c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC3__LPUART4_RX 0x000c 0x0248 0x4 0x1 +#define IMX7ULP_PAD_PTC3__TPM4_CH2 0x000c 0x0288 0x6 0x1 +#define IMX7ULP_PAD_PTC3__FB_AD3 0x000c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC4__PTC4 0x0010 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC4__TRACE_D11 0x0010 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC4__FXIO1_D0 0x0010 0x0204 0x2 0x1 +#define IMX7ULP_PAD_PTC4__LPSPI2_PCS1 0x0010 0x02a0 0x3 0x1 +#define IMX7ULP_PAD_PTC4__LPUART5_CTS_B 0x0010 0x0250 0x4 0x1 +#define IMX7ULP_PAD_PTC4__LPI2C5_SCL 0x0010 0x02bc 0x5 0x1 +#define IMX7ULP_PAD_PTC4__TPM4_CH3 0x0010 0x028c 0x6 0x1 +#define IMX7ULP_PAD_PTC4__FB_AD4 0x0010 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC5__PTC5 0x0014 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC5__TRACE_D10 0x0014 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC5__FXIO1_D1 0x0014 0x0208 0x2 0x1 +#define IMX7ULP_PAD_PTC5__LPSPI2_PCS2 0x0014 0x02a4 0x3 0x1 +#define IMX7ULP_PAD_PTC5__LPUART5_RTS_B 0x0014 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTC5__LPI2C5_SDA 0x0014 0x02c0 0x5 0x1 +#define IMX7ULP_PAD_PTC5__TPM4_CH4 0x0014 0x0290 0x6 0x1 +#define IMX7ULP_PAD_PTC5__FB_AD5 0x0014 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC6__PTC6 0x0018 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC6__TRACE_D9 0x0018 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC6__FXIO1_D2 0x0018 0x020c 0x2 0x1 +#define IMX7ULP_PAD_PTC6__LPSPI2_PCS3 0x0018 0x02a8 0x3 0x1 +#define IMX7ULP_PAD_PTC6__LPUART5_TX 0x0018 0x0258 0x4 0x1 +#define IMX7ULP_PAD_PTC6__LPI2C5_HREQ 0x0018 0x02b8 0x5 0x1 +#define IMX7ULP_PAD_PTC6__TPM4_CH5 0x0018 0x0294 0x6 0x1 +#define IMX7ULP_PAD_PTC6__FB_AD6 0x0018 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC7__PTC7 0x001c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC7__TRACE_D8 0x001c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC7__FXIO1_D3 0x001c 0x0210 0x2 0x1 +#define IMX7ULP_PAD_PTC7__LPUART5_RX 0x001c 0x0254 0x4 0x1 +#define IMX7ULP_PAD_PTC7__TPM5_CH1 0x001c 0x02c8 0x6 0x1 +#define IMX7ULP_PAD_PTC7__FB_AD7 0x001c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC8__PTC8 0x0020 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC8__TRACE_D7 0x0020 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC8__FXIO1_D4 0x0020 0x0214 0x2 0x1 +#define IMX7ULP_PAD_PTC8__LPSPI2_SIN 0x0020 0x02b0 0x3 0x1 +#define IMX7ULP_PAD_PTC8__LPUART6_CTS_B 0x0020 0x025c 0x4 0x1 +#define IMX7ULP_PAD_PTC8__LPI2C6_SCL 0x0020 0x02fc 0x5 0x1 +#define IMX7ULP_PAD_PTC8__TPM5_CLKIN 0x0020 0x02cc 0x6 0x1 +#define IMX7ULP_PAD_PTC8__FB_AD8 0x0020 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC9__PTC9 0x0024 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC9__TRACE_D6 0x0024 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC9__FXIO1_D5 0x0024 0x0218 0x2 0x1 +#define IMX7ULP_PAD_PTC9__LPSPI2_SOUT 0x0024 0x02b4 0x3 0x1 +#define IMX7ULP_PAD_PTC9__LPUART6_RTS_B 0x0024 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTC9__LPI2C6_SDA 0x0024 0x0300 0x5 0x1 +#define IMX7ULP_PAD_PTC9__TPM5_CH0 0x0024 0x02c4 0x6 0x1 +#define IMX7ULP_PAD_PTC9__FB_AD9 0x0024 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC10__PTC10 0x0028 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC10__TRACE_D5 0x0028 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC10__FXIO1_D6 0x0028 0x021c 0x2 0x1 +#define IMX7ULP_PAD_PTC10__LPSPI2_SCK 0x0028 0x02ac 0x3 0x1 +#define IMX7ULP_PAD_PTC10__LPUART6_TX 0x0028 0x0264 0x4 0x1 +#define IMX7ULP_PAD_PTC10__LPI2C6_HREQ 0x0028 0x02f8 0x5 0x1 +#define IMX7ULP_PAD_PTC10__TPM7_CH3 0x0028 0x02e8 0x6 0x1 +#define IMX7ULP_PAD_PTC10__FB_AD10 0x0028 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC11__PTC11 0x002c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC11__TRACE_D4 0x002c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC11__FXIO1_D7 0x002c 0x0220 0x2 0x1 +#define IMX7ULP_PAD_PTC11__LPSPI2_PCS0 0x002c 0x029c 0x3 0x1 +#define IMX7ULP_PAD_PTC11__LPUART6_RX 0x002c 0x0260 0x4 0x1 +#define IMX7ULP_PAD_PTC11__TPM7_CH4 0x002c 0x02ec 0x6 0x1 +#define IMX7ULP_PAD_PTC11__FB_AD11 0x002c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC12__PTC12 0x0030 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC12__TRACE_D3 0x0030 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC12__FXIO1_D8 0x0030 0x0224 0x2 0x1 +#define IMX7ULP_PAD_PTC12__LPSPI3_PCS1 0x0030 0x0314 0x3 0x1 +#define IMX7ULP_PAD_PTC12__LPUART7_CTS_B 0x0030 0x0268 0x4 0x1 +#define IMX7ULP_PAD_PTC12__LPI2C7_SCL 0x0030 0x0308 0x5 0x1 +#define IMX7ULP_PAD_PTC12__TPM7_CH5 0x0030 0x02f0 0x6 0x1 +#define IMX7ULP_PAD_PTC12__FB_AD12 0x0030 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC13__PTC13 0x0034 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC13__TRACE_D2 0x0034 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC13__FXIO1_D9 0x0034 0x0228 0x2 0x1 +#define IMX7ULP_PAD_PTC13__LPSPI3_PCS2 0x0034 0x0318 0x3 0x1 +#define IMX7ULP_PAD_PTC13__LPUART7_RTS_B 0x0034 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTC13__LPI2C7_SDA 0x0034 0x030c 0x5 0x1 +#define IMX7ULP_PAD_PTC13__TPM7_CLKIN 0x0034 0x02f4 0x6 0x1 +#define IMX7ULP_PAD_PTC13__FB_AD13 0x0034 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC14__PTC14 0x0038 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC14__TRACE_D1 0x0038 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC14__FXIO1_D10 0x0038 0x022c 0x2 0x1 +#define IMX7ULP_PAD_PTC14__LPSPI3_PCS3 0x0038 0x031c 0x3 0x1 +#define IMX7ULP_PAD_PTC14__LPUART7_TX 0x0038 0x0270 0x4 0x1 +#define IMX7ULP_PAD_PTC14__LPI2C7_HREQ 0x0038 0x0304 0x5 0x1 +#define IMX7ULP_PAD_PTC14__TPM7_CH0 0x0038 0x02dc 0x6 0x1 +#define IMX7ULP_PAD_PTC14__FB_AD14 0x0038 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC15__PTC15 0x003c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC15__TRACE_D0 0x003c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC15__FXIO1_D11 0x003c 0x0230 0x2 0x1 +#define IMX7ULP_PAD_PTC15__LPUART7_RX 0x003c 0x026c 0x4 0x1 +#define IMX7ULP_PAD_PTC15__TPM7_CH1 0x003c 0x02e0 0x6 0x1 +#define IMX7ULP_PAD_PTC15__FB_AD15 0x003c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC16__PTC16 0x0040 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC16__TRACE_CLKOUT 0x0040 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC16__FXIO1_D12 0x0040 0x0234 0x2 0x1 +#define IMX7ULP_PAD_PTC16__LPSPI3_SIN 0x0040 0x0324 0x3 0x1 +#define IMX7ULP_PAD_PTC16__TPM7_CH2 0x0040 0x02e4 0x6 0x1 +#define IMX7ULP_PAD_PTC16__FB_ALE_FB_CS1_B_FB_TS_B 0x0040 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC17__PTC17 0x0044 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC17__FXIO1_D13 0x0044 0x0238 0x2 0x1 +#define IMX7ULP_PAD_PTC17__LPSPI3_SOUT 0x0044 0x0328 0x3 0x1 +#define IMX7ULP_PAD_PTC17__TPM6_CLKIN 0x0044 0x02d8 0x6 0x1 +#define IMX7ULP_PAD_PTC17__FB_CS0_B 0x0044 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC18__PTC18 0x0048 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC18__FXIO1_D14 0x0048 0x023c 0x2 0x1 +#define IMX7ULP_PAD_PTC18__LPSPI3_SCK 0x0048 0x0320 0x3 0x1 +#define IMX7ULP_PAD_PTC18__TPM6_CH0 0x0048 0x02d0 0x6 0x1 +#define IMX7ULP_PAD_PTC18__FB_OE_B 0x0048 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC19__PTC19 0x004c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC19__FXIO1_D15 0x004c 0x0240 0x2 0x1 +#define IMX7ULP_PAD_PTC19__LPSPI3_PCS0 0x004c 0x0310 0x3 0x1 +#define IMX7ULP_PAD_PTC19__TPM6_CH1 0x004c 0x02d4 0x6 0x1 +#define IMX7ULP_PAD_PTC19__FB_A16 0x004c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTD0__PTD0 0x0080 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD0__SDHC0_RESET_B 0x0080 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD1__PTD1 0x0084 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD1__SDHC0_CMD 0x0084 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD2__PTD2 0x0088 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD2__SDHC0_CLK 0x0088 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD3__PTD3 0x008c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD3__SDHC0_D7 0x008c 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD4__PTD4 0x0090 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD4__SDHC0_D6 0x0090 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD5__PTD5 0x0094 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD5__SDHC0_D5 0x0094 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD6__PTD6 0x0098 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD6__SDHC0_D4 0x0098 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD7__PTD7 0x009c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD7__SDHC0_D3 0x009c 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD8__PTD8 0x00a0 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD8__TPM4_CLKIN 0x00a0 0x0298 0x6 0x2 +#define IMX7ULP_PAD_PTD8__SDHC0_D2 0x00a0 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD9__PTD9 0x00a4 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD9__TPM4_CH0 0x00a4 0x0280 0x6 0x2 +#define IMX7ULP_PAD_PTD9__SDHC0_D1 0x00a4 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD10__PTD10 0x00a8 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD10__TPM4_CH1 0x00a8 0x0284 0x6 0x2 +#define IMX7ULP_PAD_PTD10__SDHC0_D0 0x00a8 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD11__PTD11 0x00ac 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD11__TPM4_CH2 0x00ac 0x0288 0x6 0x2 +#define IMX7ULP_PAD_PTD11__SDHC0_DQS 0x00ac 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE0__PTE0 0x0100 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE0__FXIO1_D31 0x0100 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE0__LPSPI2_PCS1 0x0100 0x02a0 0x3 0x2 +#define IMX7ULP_PAD_PTE0__LPUART4_CTS_B 0x0100 0x0244 0x4 0x2 +#define IMX7ULP_PAD_PTE0__LPI2C4_SCL 0x0100 0x0278 0x5 0x2 +#define IMX7ULP_PAD_PTE0__SDHC1_D1 0x0100 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE0__FB_A25 0x0100 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE1__PTE1 0x0104 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE1__FXIO1_D30 0x0104 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE1__LPSPI2_PCS2 0x0104 0x02a4 0x3 0x2 +#define IMX7ULP_PAD_PTE1__LPUART4_RTS_B 0x0104 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTE1__LPI2C4_SDA 0x0104 0x027c 0x5 0x2 +#define IMX7ULP_PAD_PTE1__SDHC1_D0 0x0104 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE1__FB_A26 0x0104 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE2__PTE2 0x0108 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE2__FXIO1_D29 0x0108 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE2__LPSPI2_PCS3 0x0108 0x02a8 0x3 0x2 +#define IMX7ULP_PAD_PTE2__LPUART4_TX 0x0108 0x024c 0x4 0x2 +#define IMX7ULP_PAD_PTE2__LPI2C4_HREQ 0x0108 0x0274 0x5 0x2 +#define IMX7ULP_PAD_PTE2__SDHC1_CLK 0x0108 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE3__PTE3 0x010c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE3__FXIO1_D28 0x010c 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE3__LPUART4_RX 0x010c 0x0248 0x4 0x2 +#define IMX7ULP_PAD_PTE3__TPM5_CH1 0x010c 0x02c8 0x6 0x2 +#define IMX7ULP_PAD_PTE3__SDHC1_CMD 0x010c 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE4__PTE4 0x0110 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE4__FXIO1_D27 0x0110 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE4__LPSPI2_SIN 0x0110 0x02b0 0x3 0x2 +#define IMX7ULP_PAD_PTE4__LPUART5_CTS_B 0x0110 0x0250 0x4 0x2 +#define IMX7ULP_PAD_PTE4__LPI2C5_SCL 0x0110 0x02bc 0x5 0x2 +#define IMX7ULP_PAD_PTE4__TPM5_CLKIN 0x0110 0x02cc 0x6 0x2 +#define IMX7ULP_PAD_PTE4__SDHC1_D3 0x0110 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE5__PTE5 0x0114 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE5__FXIO1_D26 0x0114 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE5__LPSPI2_SOUT 0x0114 0x02b4 0x3 0x2 +#define IMX7ULP_PAD_PTE5__LPUART5_RTS_B 0x0114 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTE5__LPI2C5_SDA 0x0114 0x02c0 0x5 0x2 +#define IMX7ULP_PAD_PTE5__TPM5_CH0 0x0114 0x02c4 0x6 0x2 +#define IMX7ULP_PAD_PTE5__SDHC1_D2 0x0114 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE6__PTE6 0x0118 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE6__FXIO1_D25 0x0118 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE6__LPSPI2_SCK 0x0118 0x02ac 0x3 0x2 +#define IMX7ULP_PAD_PTE6__LPUART5_TX 0x0118 0x0258 0x4 0x2 +#define IMX7ULP_PAD_PTE6__LPI2C5_HREQ 0x0118 0x02b8 0x5 0x2 +#define IMX7ULP_PAD_PTE6__TPM7_CH3 0x0118 0x02e8 0x6 0x2 +#define IMX7ULP_PAD_PTE6__SDHC1_D4 0x0118 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE6__FB_A17 0x0118 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE7__PTE7 0x011c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE7__TRACE_D7 0x011c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE7__VIU_FID 0x011c 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE7__FXIO1_D24 0x011c 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE7__LPSPI2_PCS0 0x011c 0x029c 0x3 0x2 +#define IMX7ULP_PAD_PTE7__LPUART5_RX 0x011c 0x0254 0x4 0x2 +#define IMX7ULP_PAD_PTE7__TPM7_CH4 0x011c 0x02ec 0x6 0x2 +#define IMX7ULP_PAD_PTE7__SDHC1_D5 0x011c 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE7__FB_A18 0x011c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE8__PTE8 0x0120 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE8__TRACE_D6 0x0120 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE8__VIU_D16 0x0120 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE8__FXIO1_D23 0x0120 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE8__LPSPI3_PCS1 0x0120 0x0314 0x3 0x2 +#define IMX7ULP_PAD_PTE8__LPUART6_CTS_B 0x0120 0x025c 0x4 0x2 +#define IMX7ULP_PAD_PTE8__LPI2C6_SCL 0x0120 0x02fc 0x5 0x2 +#define IMX7ULP_PAD_PTE8__TPM7_CH5 0x0120 0x02f0 0x6 0x2 +#define IMX7ULP_PAD_PTE8__SDHC1_WP 0x0120 0x0200 0x7 0x1 +#define IMX7ULP_PAD_PTE8__SDHC1_D6 0x0120 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE8__FB_CS3_B_FB_BE7_0_BLS31_24_B 0x0120 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE9__PTE9 0x0124 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE9__TRACE_D5 0x0124 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE9__VIU_D17 0x0124 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE9__FXIO1_D22 0x0124 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE9__LPSPI3_PCS2 0x0124 0x0318 0x3 0x2 +#define IMX7ULP_PAD_PTE9__LPUART6_RTS_B 0x0124 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTE9__LPI2C6_SDA 0x0124 0x0300 0x5 0x2 +#define IMX7ULP_PAD_PTE9__TPM7_CLKIN 0x0124 0x02f4 0x6 0x2 +#define IMX7ULP_PAD_PTE9__SDHC1_CD 0x0124 0x032c 0x7 0x1 +#define IMX7ULP_PAD_PTE9__SDHC1_D7 0x0124 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE9__FB_TBST_B_FB_CS2_B_FB_BE15_8_BLS23_16_B 0x0124 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE10__PTE10 0x0128 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE10__TRACE_D4 0x0128 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE10__VIU_D18 0x0128 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE10__FXIO1_D21 0x0128 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE10__LPSPI3_PCS3 0x0128 0x031c 0x3 0x2 +#define IMX7ULP_PAD_PTE10__LPUART6_TX 0x0128 0x0264 0x4 0x2 +#define IMX7ULP_PAD_PTE10__LPI2C6_HREQ 0x0128 0x02f8 0x5 0x2 +#define IMX7ULP_PAD_PTE10__TPM7_CH0 0x0128 0x02dc 0x6 0x2 +#define IMX7ULP_PAD_PTE10__SDHC1_VS 0x0128 0x0000 0x7 0x0 +#define IMX7ULP_PAD_PTE10__SDHC1_DQS 0x0128 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE10__FB_A19 0x0128 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE11__PTE11 0x012c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE11__TRACE_D3 0x012c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE11__VIU_D19 0x012c 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE11__FXIO1_D20 0x012c 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE11__LPUART6_RX 0x012c 0x0260 0x4 0x2 +#define IMX7ULP_PAD_PTE11__TPM7_CH1 0x012c 0x02e0 0x6 0x2 +#define IMX7ULP_PAD_PTE11__SDHC1_RESET_B 0x012c 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE11__FB_A20 0x012c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE12__PTE12 0x0130 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE12__TRACE_D2 0x0130 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE12__VIU_D20 0x0130 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE12__FXIO1_D19 0x0130 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE12__LPSPI3_SIN 0x0130 0x0324 0x3 0x2 +#define IMX7ULP_PAD_PTE12__LPUART7_CTS_B 0x0130 0x0268 0x4 0x2 +#define IMX7ULP_PAD_PTE12__LPI2C7_SCL 0x0130 0x0308 0x5 0x2 +#define IMX7ULP_PAD_PTE12__TPM7_CH2 0x0130 0x02e4 0x6 0x2 +#define IMX7ULP_PAD_PTE12__SDHC1_WP 0x0130 0x0200 0x8 0x2 +#define IMX7ULP_PAD_PTE12__FB_A21 0x0130 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE13__PTE13 0x0134 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE13__TRACE_D1 0x0134 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE13__VIU_D21 0x0134 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE13__FXIO1_D18 0x0134 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE13__LPSPI3_SOUT 0x0134 0x0328 0x3 0x2 +#define IMX7ULP_PAD_PTE13__LPUART7_RTS_B 0x0134 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTE13__LPI2C7_SDA 0x0134 0x030c 0x5 0x2 +#define IMX7ULP_PAD_PTE13__TPM6_CLKIN 0x0134 0x02d8 0x6 0x2 +#define IMX7ULP_PAD_PTE13__SDHC1_CD 0x0134 0x032c 0x8 0x2 +#define IMX7ULP_PAD_PTE13__FB_A22 0x0134 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE14__PTE14 0x0138 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE14__TRACE_D0 0x0138 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE14__VIU_D22 0x0138 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE14__FXIO1_D17 0x0138 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE14__LPSPI3_SCK 0x0138 0x0320 0x3 0x2 +#define IMX7ULP_PAD_PTE14__LPUART7_TX 0x0138 0x0270 0x4 0x2 +#define IMX7ULP_PAD_PTE14__LPI2C7_HREQ 0x0138 0x0304 0x5 0x2 +#define IMX7ULP_PAD_PTE14__TPM6_CH0 0x0138 0x02d0 0x6 0x2 +#define IMX7ULP_PAD_PTE14__SDHC1_VS 0x0138 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE14__FB_A23 0x0138 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE15__PTE15 0x013c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE15__TRACE_CLKOUT 0x013c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE15__VIU_D23 0x013c 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE15__FXIO1_D16 0x013c 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE15__LPSPI3_PCS0 0x013c 0x0310 0x3 0x2 +#define IMX7ULP_PAD_PTE15__LPUART7_RX 0x013c 0x026c 0x4 0x2 +#define IMX7ULP_PAD_PTE15__TPM6_CH1 0x013c 0x02d4 0x6 0x2 +#define IMX7ULP_PAD_PTE15__FB_A24 0x013c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF0__PTF0 0x0180 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF0__VIU_DE 0x0180 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF0__LPUART4_CTS_B 0x0180 0x0244 0x4 0x3 +#define IMX7ULP_PAD_PTF0__LPI2C4_SCL 0x0180 0x0278 0x5 0x3 +#define IMX7ULP_PAD_PTF0__TPM4_CLKIN 0x0180 0x0298 0x6 0x3 +#define IMX7ULP_PAD_PTF0__FB_RW_B 0x0180 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF1__PTF1 0x0184 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF1__VIU_HSYNC 0x0184 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF1__LPUART4_RTS_B 0x0184 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTF1__LPI2C4_SDA 0x0184 0x027c 0x5 0x3 +#define IMX7ULP_PAD_PTF1__TPM4_CH0 0x0184 0x0280 0x6 0x3 +#define IMX7ULP_PAD_PTF1__CLKOUT 0x0184 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF2__PTF2 0x0188 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF2__VIU_VSYNC 0x0188 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF2__LPUART4_TX 0x0188 0x024c 0x4 0x3 +#define IMX7ULP_PAD_PTF2__LPI2C4_HREQ 0x0188 0x0274 0x5 0x3 +#define IMX7ULP_PAD_PTF2__TPM4_CH1 0x0188 0x0284 0x6 0x3 +#define IMX7ULP_PAD_PTF2__FB_TSIZ1_FB_CS5_B_FB_BE23_16_BLS15_8_B 0x0188 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF3__PTF3 0x018c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF3__VIU_PCLK 0x018c 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF3__LPUART4_RX 0x018c 0x0248 0x4 0x3 +#define IMX7ULP_PAD_PTF3__TPM4_CH2 0x018c 0x0288 0x6 0x3 +#define IMX7ULP_PAD_PTF3__FB_AD16 0x018c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF4__PTF4 0x0190 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF4__VIU_D0 0x0190 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF4__FXIO1_D0 0x0190 0x0204 0x2 0x2 +#define IMX7ULP_PAD_PTF4__LPSPI2_PCS1 0x0190 0x02a0 0x3 0x3 +#define IMX7ULP_PAD_PTF4__LPUART5_CTS_B 0x0190 0x0250 0x4 0x3 +#define IMX7ULP_PAD_PTF4__LPI2C5_SCL 0x0190 0x02bc 0x5 0x3 +#define IMX7ULP_PAD_PTF4__TPM4_CH3 0x0190 0x028c 0x6 0x2 +#define IMX7ULP_PAD_PTF4__FB_AD17 0x0190 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF5__PTF5 0x0194 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF5__VIU_D1 0x0194 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF5__FXIO1_D1 0x0194 0x0208 0x2 0x2 +#define IMX7ULP_PAD_PTF5__LPSPI2_PCS2 0x0194 0x02a4 0x3 0x3 +#define IMX7ULP_PAD_PTF5__LPUART5_RTS_B 0x0194 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTF5__LPI2C5_SDA 0x0194 0x02c0 0x5 0x3 +#define IMX7ULP_PAD_PTF5__TPM4_CH4 0x0194 0x0290 0x6 0x2 +#define IMX7ULP_PAD_PTF5__FB_AD18 0x0194 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF6__PTF6 0x0198 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF6__VIU_D2 0x0198 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF6__FXIO1_D2 0x0198 0x020c 0x2 0x2 +#define IMX7ULP_PAD_PTF6__LPSPI2_PCS3 0x0198 0x02a8 0x3 0x3 +#define IMX7ULP_PAD_PTF6__LPUART5_TX 0x0198 0x0258 0x4 0x3 +#define IMX7ULP_PAD_PTF6__LPI2C5_HREQ 0x0198 0x02b8 0x5 0x3 +#define IMX7ULP_PAD_PTF6__TPM4_CH5 0x0198 0x0294 0x6 0x2 +#define IMX7ULP_PAD_PTF6__FB_AD19 0x0198 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF7__PTF7 0x019c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF7__VIU_D3 0x019c 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF7__FXIO1_D3 0x019c 0x0210 0x2 0x2 +#define IMX7ULP_PAD_PTF7__LPUART5_RX 0x019c 0x0254 0x4 0x3 +#define IMX7ULP_PAD_PTF7__TPM5_CH1 0x019c 0x02c8 0x6 0x3 +#define IMX7ULP_PAD_PTF7__FB_AD20 0x019c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF8__PTF8 0x01a0 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF8__USB1_ULPI_CLK 0x01a0 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF8__VIU_D4 0x01a0 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF8__FXIO1_D4 0x01a0 0x0214 0x2 0x2 +#define IMX7ULP_PAD_PTF8__LPSPI2_SIN 0x01a0 0x02b0 0x3 0x3 +#define IMX7ULP_PAD_PTF8__LPUART6_CTS_B 0x01a0 0x025c 0x4 0x3 +#define IMX7ULP_PAD_PTF8__LPI2C6_SCL 0x01a0 0x02fc 0x5 0x3 +#define IMX7ULP_PAD_PTF8__TPM5_CLKIN 0x01a0 0x02cc 0x6 0x3 +#define IMX7ULP_PAD_PTF8__FB_AD21 0x01a0 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF9__PTF9 0x01a4 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF9__USB1_ULPI_NXT 0x01a4 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF9__VIU_D5 0x01a4 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF9__FXIO1_D5 0x01a4 0x0218 0x2 0x2 +#define IMX7ULP_PAD_PTF9__LPSPI2_SOUT 0x01a4 0x02b4 0x3 0x3 +#define IMX7ULP_PAD_PTF9__LPUART6_RTS_B 0x01a4 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTF9__LPI2C6_SDA 0x01a4 0x0300 0x5 0x3 +#define IMX7ULP_PAD_PTF9__TPM5_CH0 0x01a4 0x02c4 0x6 0x3 +#define IMX7ULP_PAD_PTF9__FB_AD22 0x01a4 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF10__PTF10 0x01a8 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF10__USB1_ULPI_STP 0x01a8 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF10__VIU_D6 0x01a8 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF10__FXIO1_D6 0x01a8 0x021c 0x2 0x2 +#define IMX7ULP_PAD_PTF10__LPSPI2_SCK 0x01a8 0x02ac 0x3 0x3 +#define IMX7ULP_PAD_PTF10__LPUART6_TX 0x01a8 0x0264 0x4 0x3 +#define IMX7ULP_PAD_PTF10__LPI2C6_HREQ 0x01a8 0x02f8 0x5 0x3 +#define IMX7ULP_PAD_PTF10__TPM7_CH3 0x01a8 0x02e8 0x6 0x3 +#define IMX7ULP_PAD_PTF10__FB_AD23 0x01a8 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF11__PTF11 0x01ac 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF11__USB1_ULPI_DIR 0x01ac 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF11__VIU_D7 0x01ac 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF11__FXIO1_D7 0x01ac 0x0220 0x2 0x2 +#define IMX7ULP_PAD_PTF11__LPSPI2_PCS0 0x01ac 0x029c 0x3 0x3 +#define IMX7ULP_PAD_PTF11__LPUART6_RX 0x01ac 0x0260 0x4 0x3 +#define IMX7ULP_PAD_PTF11__TPM7_CH4 0x01ac 0x02ec 0x6 0x3 +#define IMX7ULP_PAD_PTF11__FB_CS4_B_FB_TSIZ0_FB_BE31_24_BLS7_0_B 0x01ac 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF12__PTF12 0x01b0 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF12__USB1_ULPI_DATA0 0x01b0 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF12__VIU_D8 0x01b0 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF12__FXIO1_D8 0x01b0 0x0224 0x2 0x2 +#define IMX7ULP_PAD_PTF12__LPSPI3_PCS1 0x01b0 0x0314 0x3 0x3 +#define IMX7ULP_PAD_PTF12__LPUART7_CTS_B 0x01b0 0x0268 0x4 0x3 +#define IMX7ULP_PAD_PTF12__LPI2C7_SCL 0x01b0 0x0308 0x5 0x3 +#define IMX7ULP_PAD_PTF12__TPM7_CH5 0x01b0 0x02f0 0x6 0x3 +#define IMX7ULP_PAD_PTF12__FB_AD24 0x01b0 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF13__PTF13 0x01b4 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF13__USB1_ULPI_DATA1 0x01b4 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF13__VIU_D9 0x01b4 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF13__FXIO1_D9 0x01b4 0x0228 0x2 0x2 +#define IMX7ULP_PAD_PTF13__LPSPI3_PCS2 0x01b4 0x0318 0x3 0x3 +#define IMX7ULP_PAD_PTF13__LPUART7_RTS_B 0x01b4 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTF13__LPI2C7_SDA 0x01b4 0x030c 0x5 0x3 +#define IMX7ULP_PAD_PTF13__TPM7_CLKIN 0x01b4 0x02f4 0x6 0x3 +#define IMX7ULP_PAD_PTF13__FB_AD25 0x01b4 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF14__PTF14 0x01b8 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF14__USB1_ULPI_DATA2 0x01b8 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF14__VIU_D10 0x01b8 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF14__FXIO1_D10 0x01b8 0x022c 0x2 0x2 +#define IMX7ULP_PAD_PTF14__LPSPI3_PCS3 0x01b8 0x031c 0x3 0x3 +#define IMX7ULP_PAD_PTF14__LPUART7_TX 0x01b8 0x0270 0x4 0x3 +#define IMX7ULP_PAD_PTF14__LPI2C7_HREQ 0x01b8 0x0304 0x5 0x3 +#define IMX7ULP_PAD_PTF14__TPM7_CH0 0x01b8 0x02dc 0x6 0x3 +#define IMX7ULP_PAD_PTF14__FB_AD26 0x01b8 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF15__PTF15 0x01bc 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF15__USB1_ULPI_DATA3 0x01bc 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF15__VIU_D11 0x01bc 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF15__FXIO1_D11 0x01bc 0x0230 0x2 0x2 +#define IMX7ULP_PAD_PTF15__LPUART7_RX 0x01bc 0x026c 0x4 0x3 +#define IMX7ULP_PAD_PTF15__TPM7_CH1 0x01bc 0x02e0 0x6 0x3 +#define IMX7ULP_PAD_PTF15__FB_AD27 0x01bc 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF16__PTF16 0x01c0 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF16__USB1_ULPI_DATA4 0x01c0 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF16__VIU_D12 0x01c0 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF16__FXIO1_D12 0x01c0 0x0234 0x2 0x2 +#define IMX7ULP_PAD_PTF16__LPSPI3_SIN 0x01c0 0x0324 0x3 0x3 +#define IMX7ULP_PAD_PTF16__TPM7_CH2 0x01c0 0x02e4 0x6 0x3 +#define IMX7ULP_PAD_PTF16__FB_AD28 0x01c0 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF17__PTF17 0x01c4 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF17__USB1_ULPI_DATA5 0x01c4 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF17__VIU_D13 0x01c4 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF17__FXIO1_D13 0x01c4 0x0238 0x2 0x2 +#define IMX7ULP_PAD_PTF17__LPSPI3_SOUT 0x01c4 0x0328 0x3 0x3 +#define IMX7ULP_PAD_PTF17__TPM6_CLKIN 0x01c4 0x02d8 0x6 0x3 +#define IMX7ULP_PAD_PTF17__FB_AD29 0x01c4 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF18__PTF18 0x01c8 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF18__USB1_ULPI_DATA6 0x01c8 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF18__VIU_D14 0x01c8 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF18__FXIO1_D14 0x01c8 0x023c 0x2 0x2 +#define IMX7ULP_PAD_PTF18__LPSPI3_SCK 0x01c8 0x0320 0x3 0x3 +#define IMX7ULP_PAD_PTF18__TPM6_CH0 0x01c8 0x02d0 0x6 0x3 +#define IMX7ULP_PAD_PTF18__FB_AD30 0x01c8 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF19__PTF19 0x01cc 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF19__USB1_ULPI_DATA7 0x01cc 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF19__VIU_D15 0x01cc 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF19__FXIO1_D15 0x01cc 0x0240 0x2 0x2 +#define IMX7ULP_PAD_PTF19__LPSPI3_PCS0 0x01cc 0x0310 0x3 0x3 +#define IMX7ULP_PAD_PTF19__TPM6_CH1 0x01cc 0x02d4 0x6 0x3 +#define IMX7ULP_PAD_PTF19__FB_AD31 0x01cc 0x0000 0x9 0x0 + +#endif /* __DTS_IMX7ULP_PINFUNC_H */ diff --git a/arch/arm/boot/dts/ls1021a.dtsi b/arch/arm/boot/dts/ls1021a.dtsi index 7bb9df2c1460..9319e1f0f1d8 100644 --- a/arch/arm/boot/dts/ls1021a.dtsi +++ b/arch/arm/boot/dts/ls1021a.dtsi @@ -129,14 +129,14 @@ }; msi1: msi-controller@1570e00 { - compatible = "fsl,1s1021a-msi"; + compatible = "fsl,ls1021a-msi"; reg = <0x0 0x1570e00 0x0 0x8>; msi-controller; interrupts = <GIC_SPI 179 IRQ_TYPE_LEVEL_HIGH>; }; msi2: msi-controller@1570e08 { - compatible = "fsl,1s1021a-msi"; + compatible = "fsl,ls1021a-msi"; reg = <0x0 0x1570e08 0x0 0x8>; msi-controller; interrupts = <GIC_SPI 180 IRQ_TYPE_LEVEL_HIGH>; @@ -699,7 +699,7 @@ bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */ 0x82000000 0x0 0x40000000 0x40 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */ - msi-parent = <&msi1>; + msi-parent = <&msi1>, <&msi2>; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0000 0 0 1 &gic GIC_SPI 91 IRQ_TYPE_LEVEL_HIGH>, @@ -722,7 +722,7 @@ bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000 /* downstream I/O */ 0x82000000 0x0 0x40000000 0x48 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */ - msi-parent = <&msi2>; + msi-parent = <&msi1>, <&msi2>; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0000 0 0 1 &gic GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH>, diff --git a/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts b/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts index 6713d0f2b3f4..b1502df7b509 100644 --- a/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts +++ b/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts @@ -56,8 +56,6 @@ aliases { serial0 = &uart0; - /* ethernet0 is the H3 emac, defined in sun8i-h3.dtsi */ - ethernet0 = &emac; ethernet1 = &xr819; }; @@ -104,13 +102,6 @@ status = "okay"; }; -&emac { - phy-handle = <&int_mii_phy>; - phy-mode = "mii"; - allwinner,leds-active-low; - status = "okay"; -}; - &mmc0 { pinctrl-names = "default"; pinctrl-0 = <&mmc0_pins_a>; diff --git a/arch/arm/boot/dts/sun8i-h3-bananapi-m2-plus.dts b/arch/arm/boot/dts/sun8i-h3-bananapi-m2-plus.dts index d756ff825116..a337af1de322 100644 --- a/arch/arm/boot/dts/sun8i-h3-bananapi-m2-plus.dts +++ b/arch/arm/boot/dts/sun8i-h3-bananapi-m2-plus.dts @@ -52,7 +52,6 @@ compatible = "sinovoip,bpi-m2-plus", "allwinner,sun8i-h3"; aliases { - ethernet0 = &emac; serial0 = &uart0; serial1 = &uart1; }; @@ -115,30 +114,12 @@ status = "okay"; }; -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&emac_rgmii_pins>; - phy-supply = <®_gmac_3v3>; - phy-handle = <&ext_rgmii_phy>; - phy-mode = "rgmii"; - - allwinner,leds-active-low; - status = "okay"; -}; - &ir { pinctrl-names = "default"; pinctrl-0 = <&ir_pins_a>; status = "okay"; }; -&mdio { - ext_rgmii_phy: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <0>; - }; -}; - &mmc0 { pinctrl-names = "default"; pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>; diff --git a/arch/arm/boot/dts/sun8i-h3-nanopi-neo.dts b/arch/arm/boot/dts/sun8i-h3-nanopi-neo.dts index 78f6c24952dd..8d2cc6e9a03f 100644 --- a/arch/arm/boot/dts/sun8i-h3-nanopi-neo.dts +++ b/arch/arm/boot/dts/sun8i-h3-nanopi-neo.dts @@ -46,10 +46,3 @@ model = "FriendlyARM NanoPi NEO"; compatible = "friendlyarm,nanopi-neo", "allwinner,sun8i-h3"; }; - -&emac { - phy-handle = <&int_mii_phy>; - phy-mode = "mii"; - allwinner,leds-active-low; - status = "okay"; -}; diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-2.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-2.dts index 17cdeae19c6f..8ff71b1bb45b 100644 --- a/arch/arm/boot/dts/sun8i-h3-orangepi-2.dts +++ b/arch/arm/boot/dts/sun8i-h3-orangepi-2.dts @@ -54,7 +54,6 @@ aliases { serial0 = &uart0; /* ethernet0 is the H3 emac, defined in sun8i-h3.dtsi */ - ethernet0 = &emac; ethernet1 = &rtl8189; }; @@ -118,13 +117,6 @@ status = "okay"; }; -&emac { - phy-handle = <&int_mii_phy>; - phy-mode = "mii"; - allwinner,leds-active-low; - status = "okay"; -}; - &ir { pinctrl-names = "default"; pinctrl-0 = <&ir_pins_a>; diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-one.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-one.dts index 6880268e8b87..5fea430e0eb1 100644 --- a/arch/arm/boot/dts/sun8i-h3-orangepi-one.dts +++ b/arch/arm/boot/dts/sun8i-h3-orangepi-one.dts @@ -52,7 +52,6 @@ compatible = "xunlong,orangepi-one", "allwinner,sun8i-h3"; aliases { - ethernet0 = &emac; serial0 = &uart0; }; @@ -98,13 +97,6 @@ status = "okay"; }; -&emac { - phy-handle = <&int_mii_phy>; - phy-mode = "mii"; - allwinner,leds-active-low; - status = "okay"; -}; - &mmc0 { pinctrl-names = "default"; pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>; diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-pc-plus.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-pc-plus.dts index a10281b455f5..8b93f5c781a7 100644 --- a/arch/arm/boot/dts/sun8i-h3-orangepi-pc-plus.dts +++ b/arch/arm/boot/dts/sun8i-h3-orangepi-pc-plus.dts @@ -53,11 +53,6 @@ }; }; -&emac { - /* LEDs changed to active high on the plus */ - /delete-property/ allwinner,leds-active-low; -}; - &mmc1 { pinctrl-names = "default"; pinctrl-0 = <&mmc1_pins_a>; diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-pc.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-pc.dts index 998b60f8d295..1a044b17d6c6 100644 --- a/arch/arm/boot/dts/sun8i-h3-orangepi-pc.dts +++ b/arch/arm/boot/dts/sun8i-h3-orangepi-pc.dts @@ -52,7 +52,6 @@ compatible = "xunlong,orangepi-pc", "allwinner,sun8i-h3"; aliases { - ethernet0 = &emac; serial0 = &uart0; }; @@ -114,13 +113,6 @@ status = "okay"; }; -&emac { - phy-handle = <&int_mii_phy>; - phy-mode = "mii"; - allwinner,leds-active-low; - status = "okay"; -}; - &ir { pinctrl-names = "default"; pinctrl-0 = <&ir_pins_a>; diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-plus.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-plus.dts index 331ed683ac62..828ae7a526d9 100644 --- a/arch/arm/boot/dts/sun8i-h3-orangepi-plus.dts +++ b/arch/arm/boot/dts/sun8i-h3-orangepi-plus.dts @@ -47,10 +47,6 @@ model = "Xunlong Orange Pi Plus / Plus 2"; compatible = "xunlong,orangepi-plus", "allwinner,sun8i-h3"; - aliases { - ethernet0 = &emac; - }; - reg_gmac_3v3: gmac-3v3 { compatible = "regulator-fixed"; regulator-name = "gmac-3v3"; @@ -78,24 +74,6 @@ status = "okay"; }; -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&emac_rgmii_pins>; - phy-supply = <®_gmac_3v3>; - phy-handle = <&ext_rgmii_phy>; - phy-mode = "rgmii"; - - allwinner,leds-active-low; - status = "okay"; -}; - -&mdio { - ext_rgmii_phy: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <0>; - }; -}; - &mmc2 { pinctrl-names = "default"; pinctrl-0 = <&mmc2_8bit_pins>; diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-plus2e.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-plus2e.dts index 80026f3caafc..97920b12a944 100644 --- a/arch/arm/boot/dts/sun8i-h3-orangepi-plus2e.dts +++ b/arch/arm/boot/dts/sun8i-h3-orangepi-plus2e.dts @@ -61,19 +61,3 @@ gpio = <&pio 3 6 GPIO_ACTIVE_HIGH>; /* PD6 */ }; }; - -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&emac_rgmii_pins>; - phy-supply = <®_gmac_3v3>; - phy-handle = <&ext_rgmii_phy>; - phy-mode = "rgmii"; - status = "okay"; -}; - -&mdio { - ext_rgmii_phy: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <1>; - }; -}; diff --git a/arch/arm/boot/dts/sunxi-h3-h5.dtsi b/arch/arm/boot/dts/sunxi-h3-h5.dtsi index d38282b9e5d4..11240a8313c2 100644 --- a/arch/arm/boot/dts/sunxi-h3-h5.dtsi +++ b/arch/arm/boot/dts/sunxi-h3-h5.dtsi @@ -391,32 +391,6 @@ clocks = <&osc24M>; }; - emac: ethernet@1c30000 { - compatible = "allwinner,sun8i-h3-emac"; - syscon = <&syscon>; - reg = <0x01c30000 0x10000>; - interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>; - interrupt-names = "macirq"; - resets = <&ccu RST_BUS_EMAC>; - reset-names = "stmmaceth"; - clocks = <&ccu CLK_BUS_EMAC>; - clock-names = "stmmaceth"; - #address-cells = <1>; - #size-cells = <0>; - status = "disabled"; - - mdio: mdio { - #address-cells = <1>; - #size-cells = <0>; - int_mii_phy: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <1>; - clocks = <&ccu CLK_BUS_EPHY>; - resets = <&ccu RST_BUS_EPHY>; - }; - }; - }; - spi0: spi@01c68000 { compatible = "allwinner,sun8i-h3-spi"; reg = <0x01c68000 0x1000>; diff --git a/arch/arm/boot/dts/tango4-smp8758.dtsi b/arch/arm/boot/dts/tango4-smp8758.dtsi index d2e65c46bcc7..eca33d568690 100644 --- a/arch/arm/boot/dts/tango4-smp8758.dtsi +++ b/arch/arm/boot/dts/tango4-smp8758.dtsi @@ -13,7 +13,6 @@ reg = <0>; clocks = <&clkgen CPU_CLK>; clock-latency = <1>; - operating-points = <1215000 0 607500 0 405000 0 243000 0 135000 0>; }; cpu1: cpu@1 { diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index 27475904e096..eee269321923 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -276,6 +276,12 @@ static inline u64 __gic_readq_nonatomic(const volatile void __iomem *addr) #define gicr_write_pendbaser(v, c) __gic_writeq_nonatomic(v, c) /* + * GICR_xLPIR - only the lower bits are significant + */ +#define gic_read_lpir(c) readl_relaxed(c) +#define gic_write_lpir(v, c) writel_relaxed(lower_32_bits(v), c) + +/* * GITS_TYPER is an ID register and doesn't need atomicity. */ #define gits_read_typer(c) __gic_readq_nonatomic(c) @@ -291,5 +297,33 @@ static inline u64 __gic_readq_nonatomic(const volatile void __iomem *addr) */ #define gits_write_cwriter(v, c) __gic_writeq_nonatomic(v, c) +/* + * GITS_VPROPBASER - hi and lo bits may be accessed independently. + */ +#define gits_write_vpropbaser(v, c) __gic_writeq_nonatomic(v, c) + +/* + * GITS_VPENDBASER - the Valid bit must be cleared before changing + * anything else. + */ +static inline void gits_write_vpendbaser(u64 val, void * __iomem addr) +{ + u32 tmp; + + tmp = readl_relaxed(addr + 4); + if (tmp & (GICR_VPENDBASER_Valid >> 32)) { + tmp &= ~(GICR_VPENDBASER_Valid >> 32); + writel_relaxed(tmp, addr + 4); + } + + /* + * Use the fact that __gic_writeq_nonatomic writes the second + * half of the 64bit quantity after the first. + */ + __gic_writeq_nonatomic(val, addr); +} + +#define gits_read_vpendbaser(c) __gic_readq_nonatomic(c) + #endif /* !__ASSEMBLY__ */ #endif /* !__ASM_ARCH_GICV3_H */ diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h index 6795368ad023..cc414382dab4 100644 --- a/arch/arm/include/asm/futex.h +++ b/arch/arm/include/asm/futex.h @@ -128,20 +128,10 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, #endif /* !SMP */ static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - #ifndef CONFIG_SMP preempt_disable(); #endif @@ -172,17 +162,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) preempt_enable(); #endif - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index 4bec45442072..c030143c18c6 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -52,22 +52,6 @@ static inline void dsb_sev(void) * memory. */ -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - u16 owner = READ_ONCE(lock->tickets.owner); - - for (;;) { - arch_spinlock_t tmp = READ_ONCE(*lock); - - if (tmp.tickets.owner == tmp.tickets.next || - tmp.tickets.owner != owner) - break; - - wfe(); - } - smp_acquire__after_ctrl_dep(); -} - #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) static inline void arch_spin_lock(arch_spinlock_t *lock) diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 776757d1604a..1d468b527b7b 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -139,10 +139,11 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define TIF_NEED_RESCHED 1 /* rescheduling necessary */ #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_UPROBE 3 /* breakpointed or singlestepping */ -#define TIF_SYSCALL_TRACE 4 /* syscall trace active */ -#define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */ -#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ -#define TIF_SECCOMP 7 /* seccomp syscall filtering active */ +#define TIF_FSCHECK 4 /* Check FS is USER_DS on return */ +#define TIF_SYSCALL_TRACE 5 /* syscall trace active */ +#define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */ +#define TIF_SYSCALL_TRACEPOINT 7 /* syscall tracepoint instrumentation */ +#define TIF_SECCOMP 8 /* seccomp syscall filtering active */ #define TIF_NOHZ 12 /* in adaptive nohz mode */ #define TIF_USING_IWMMXT 17 @@ -153,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_UPROBE (1 << TIF_UPROBE) +#define _TIF_FSCHECK (1 << TIF_FSCHECK) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) @@ -166,8 +168,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, /* * Change these and you break ASM code in entry-common.S */ -#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME | _TIF_UPROBE) +#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_FSCHECK) #endif /* __KERNEL__ */ #endif /* __ASM_ARM_THREAD_INFO_H */ diff --git a/arch/arm/include/asm/traps.h b/arch/arm/include/asm/traps.h index f555bb3664dc..683d9230984a 100644 --- a/arch/arm/include/asm/traps.h +++ b/arch/arm/include/asm/traps.h @@ -18,7 +18,6 @@ struct undef_hook { void register_undef_hook(struct undef_hook *hook); void unregister_undef_hook(struct undef_hook *hook); -#ifdef CONFIG_FUNCTION_GRAPH_TRACER static inline int __in_irqentry_text(unsigned long ptr) { extern char __irqentry_text_start[]; @@ -27,12 +26,6 @@ static inline int __in_irqentry_text(unsigned long ptr) return ptr >= (unsigned long)&__irqentry_text_start && ptr < (unsigned long)&__irqentry_text_end; } -#else -static inline int __in_irqentry_text(unsigned long ptr) -{ - return 0; -} -#endif static inline int in_exception_text(unsigned long ptr) { diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 0bf2347495f1..87936dd5d151 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -70,6 +70,8 @@ static inline void set_fs(mm_segment_t fs) { current_thread_info()->addr_limit = fs; modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER); + /* On user-mode return, check fs is correct */ + set_thread_flag(TIF_FSCHECK); } #define segment_eq(a, b) ((a) == (b)) diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index eb5cd77bf1d8..e33c32d56193 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -41,7 +41,9 @@ ret_fast_syscall: UNWIND(.cantunwind ) disable_irq_notrace @ disable interrupts ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK + tst r1, #_TIF_SYSCALL_WORK + bne fast_work_pending + tst r1, #_TIF_WORK_MASK bne fast_work_pending /* perform architecture specific actions before user return */ @@ -67,12 +69,15 @@ ret_fast_syscall: str r0, [sp, #S_R0 + S_OFF]! @ save returned r0 disable_irq_notrace @ disable interrupts ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK + tst r1, #_TIF_SYSCALL_WORK + bne fast_work_pending + tst r1, #_TIF_WORK_MASK beq no_work_pending UNWIND(.fnend ) ENDPROC(ret_fast_syscall) /* Slower path - fall through to work_pending */ +fast_work_pending: #endif tst r1, #_TIF_SYSCALL_WORK diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 5814298ef0b7..e2de50bf8742 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -14,6 +14,7 @@ #include <linux/uaccess.h> #include <linux/tracehook.h> #include <linux/uprobes.h> +#include <linux/syscalls.h> #include <asm/elf.h> #include <asm/cacheflush.h> @@ -613,6 +614,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) * Update the trace code with the current status. */ trace_hardirqs_off(); + + /* Check valid user FS if needed */ + addr_limit_user_check(); + do { if (likely(thread_flags & _TIF_NEED_RESCHED)) { schedule(); diff --git a/arch/arm/mach-hisi/Kconfig b/arch/arm/mach-hisi/Kconfig index a3b091a4d344..65a048fa08ec 100644 --- a/arch/arm/mach-hisi/Kconfig +++ b/arch/arm/mach-hisi/Kconfig @@ -39,6 +39,7 @@ config ARCH_HIP04 select HAVE_ARM_ARCH_TIMER select MCPM if SMP select MCPM_QUAD_CLUSTER if SMP + select GENERIC_IRQ_EFFECTIVE_AFF_MASK help Support for Hisilicon HiP04 SoC family diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile index 779fb1f680b3..b3b3b3a19183 100644 --- a/arch/arm/mach-omap2/Makefile +++ b/arch/arm/mach-omap2/Makefile @@ -8,7 +8,7 @@ ccflags-y := -I$(srctree)/$(src)/include \ # Common support obj-y := id.o io.o control.o devices.o fb.o timer.o pm.o \ common.o dma.o wd_timer.o display.o i2c.o hdq1w.o omap_hwmod.o \ - omap_device.o omap-headsmp.o sram.o drm.o + omap_device.o omap-headsmp.o sram.o hwmod-common = omap_hwmod.o omap_hwmod_reset.o \ omap_hwmod_common_data.o diff --git a/arch/arm/mach-omap2/board-generic.c b/arch/arm/mach-omap2/board-generic.c index b1e661bb5521..583fc39d84cd 100644 --- a/arch/arm/mach-omap2/board-generic.c +++ b/arch/arm/mach-omap2/board-generic.c @@ -33,6 +33,7 @@ static void __init __maybe_unused omap_generic_init(void) pdata_quirks_init(omap_dt_match_table); omapdss_init_of(); + omap_soc_device_init(); } #ifdef CONFIG_SOC_OMAP2420 diff --git a/arch/arm/mach-omap2/display.c b/arch/arm/mach-omap2/display.c index 8fa01c0ecdb2..b3f6eb5d04a2 100644 --- a/arch/arm/mach-omap2/display.c +++ b/arch/arm/mach-omap2/display.c @@ -66,6 +66,7 @@ */ #define FRAMEDONE_IRQ_TIMEOUT 100 +#if defined(CONFIG_FB_OMAP2) static struct platform_device omap_display_device = { .name = "omapdss", .id = -1, @@ -163,6 +164,65 @@ static enum omapdss_version __init omap_display_get_version(void) return OMAPDSS_VER_UNKNOWN; } +static int __init omapdss_init_fbdev(void) +{ + static struct omap_dss_board_info board_data = { + .dsi_enable_pads = omap_dsi_enable_pads, + .dsi_disable_pads = omap_dsi_disable_pads, + .set_min_bus_tput = omap_dss_set_min_bus_tput, + }; + struct device_node *node; + int r; + + board_data.version = omap_display_get_version(); + if (board_data.version == OMAPDSS_VER_UNKNOWN) { + pr_err("DSS not supported on this SoC\n"); + return -ENODEV; + } + + omap_display_device.dev.platform_data = &board_data; + + r = platform_device_register(&omap_display_device); + if (r < 0) { + pr_err("Unable to register omapdss device\n"); + return r; + } + + /* create vrfb device */ + r = omap_init_vrfb(); + if (r < 0) { + pr_err("Unable to register omapvrfb device\n"); + return r; + } + + /* create FB device */ + r = omap_init_fb(); + if (r < 0) { + pr_err("Unable to register omapfb device\n"); + return r; + } + + /* create V4L2 display device */ + r = omap_init_vout(); + if (r < 0) { + pr_err("Unable to register omap_vout device\n"); + return r; + } + + /* add DSI info for omap4 */ + node = of_find_node_by_name(NULL, "omap4_padconf_global"); + if (node) + omap4_dsi_mux_syscon = syscon_node_to_regmap(node); + + return 0; +} +#else +static inline int omapdss_init_fbdev(void) +{ + return 0; +} +#endif /* CONFIG_FB_OMAP2 */ + static void dispc_disable_outputs(void) { u32 v, irq_mask = 0; @@ -335,16 +395,9 @@ static struct device_node * __init omapdss_find_dss_of_node(void) int __init omapdss_init_of(void) { int r; - enum omapdss_version ver; struct device_node *node; struct platform_device *pdev; - static struct omap_dss_board_info board_data = { - .dsi_enable_pads = omap_dsi_enable_pads, - .dsi_disable_pads = omap_dsi_disable_pads, - .set_min_bus_tput = omap_dss_set_min_bus_tput, - }; - /* only create dss helper devices if dss is enabled in the .dts */ node = omapdss_find_dss_of_node(); @@ -354,13 +407,6 @@ int __init omapdss_init_of(void) if (!of_device_is_available(node)) return 0; - ver = omap_display_get_version(); - - if (ver == OMAPDSS_VER_UNKNOWN) { - pr_err("DSS not supported on this SoC\n"); - return -ENODEV; - } - pdev = of_find_device_by_node(node); if (!pdev) { @@ -374,48 +420,5 @@ int __init omapdss_init_of(void) return r; } - board_data.version = ver; - - omap_display_device.dev.platform_data = &board_data; - - r = platform_device_register(&omap_display_device); - if (r < 0) { - pr_err("Unable to register omapdss device\n"); - return r; - } - - /* create DRM device */ - r = omap_init_drm(); - if (r < 0) { - pr_err("Unable to register omapdrm device\n"); - return r; - } - - /* create vrfb device */ - r = omap_init_vrfb(); - if (r < 0) { - pr_err("Unable to register omapvrfb device\n"); - return r; - } - - /* create FB device */ - r = omap_init_fb(); - if (r < 0) { - pr_err("Unable to register omapfb device\n"); - return r; - } - - /* create V4L2 display device */ - r = omap_init_vout(); - if (r < 0) { - pr_err("Unable to register omap_vout device\n"); - return r; - } - - /* add DSI info for omap4 */ - node = of_find_node_by_name(NULL, "omap4_padconf_global"); - if (node) - omap4_dsi_mux_syscon = syscon_node_to_regmap(node); - - return 0; + return omapdss_init_fbdev(); } diff --git a/arch/arm/mach-omap2/display.h b/arch/arm/mach-omap2/display.h index 9a39646d4316..42ec2e99a2f4 100644 --- a/arch/arm/mach-omap2/display.h +++ b/arch/arm/mach-omap2/display.h @@ -26,7 +26,6 @@ struct omap_dss_dispc_dev_attr { bool has_framedonetv_irq; }; -int omap_init_drm(void); int omap_init_vrfb(void); int omap_init_fb(void); int omap_init_vout(void); diff --git a/arch/arm/mach-omap2/drm.c b/arch/arm/mach-omap2/drm.c deleted file mode 100644 index 44fef961bb70..000000000000 --- a/arch/arm/mach-omap2/drm.c +++ /dev/null @@ -1,53 +0,0 @@ -/* - * DRM/KMS device registration for TI OMAP platforms - * - * Copyright (C) 2012 Texas Instruments - * Author: Rob Clark <rob.clark@linaro.org> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published by - * the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/platform_device.h> -#include <linux/dma-mapping.h> -#include <linux/platform_data/omap_drm.h> - -#include "soc.h" -#include "display.h" - -#if IS_ENABLED(CONFIG_DRM_OMAP) - -static struct omap_drm_platform_data platform_data; - -static struct platform_device omap_drm_device = { - .dev = { - .coherent_dma_mask = DMA_BIT_MASK(32), - .platform_data = &platform_data, - }, - .name = "omapdrm", - .id = 0, -}; - -int __init omap_init_drm(void) -{ - platform_data.omaprev = GET_OMAP_TYPE; - - return platform_device_register(&omap_drm_device); - -} -#else -int __init omap_init_drm(void) { return 0; } -#endif diff --git a/arch/arm/mach-omap2/io.c b/arch/arm/mach-omap2/io.c index 1cd20e4d56b0..cb5d7314cf99 100644 --- a/arch/arm/mach-omap2/io.c +++ b/arch/arm/mach-omap2/io.c @@ -428,7 +428,6 @@ static void __init __maybe_unused omap_hwmod_init_postsetup(void) static void __init __maybe_unused omap_common_late_init(void) { omap2_common_pm_late_init(); - omap_soc_device_init(); } #ifdef CONFIG_SOC_OMAP2420 diff --git a/arch/arm/mach-tegra/cpuidle-tegra114.c b/arch/arm/mach-tegra/cpuidle-tegra114.c index d3aa9be16621..e3fbcfedf845 100644 --- a/arch/arm/mach-tegra/cpuidle-tegra114.c +++ b/arch/arm/mach-tegra/cpuidle-tegra114.c @@ -60,7 +60,7 @@ static int tegra114_idle_power_down(struct cpuidle_device *dev, return index; } -static void tegra114_idle_enter_freeze(struct cpuidle_device *dev, +static void tegra114_idle_enter_s2idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { @@ -77,7 +77,7 @@ static struct cpuidle_driver tegra_idle_driver = { #ifdef CONFIG_PM_SLEEP [1] = { .enter = tegra114_idle_power_down, - .enter_freeze = tegra114_idle_enter_freeze, + .enter_s2idle = tegra114_idle_enter_s2idle, .exit_latency = 500, .target_residency = 1000, .flags = CPUIDLE_FLAG_TIMER_STOP, diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index dfd908630631..0df64a6a56d4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -75,6 +75,7 @@ config ARM64 select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_VMAP_STACK select HAVE_ARM_SMCCC select HAVE_EBPF_JIT select HAVE_C_RECORDMCOUNT @@ -960,6 +961,18 @@ config ARM64_UAO regular load/store instructions if the cpu does not implement the feature. +config ARM64_PMEM + bool "Enable support for persistent memory" + select ARCH_HAS_PMEM_API + select ARCH_HAS_UACCESS_FLUSHCACHE + help + Say Y to enable support for the persistent memory API based on the + ARMv8.2 DCPoP feature. + + The feature is detected at runtime, and the kernel will use DC CVAC + operations if DC CVAP is not supported (following the behaviour of + DC CVAP itself if the system does not define a point of persistence). + endmenu config ARM64_MODULE_CMODEL_LARGE diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts index ba2fde2909f9..6872135d7f84 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts @@ -51,7 +51,6 @@ compatible = "sinovoip,bananapi-m64", "allwinner,sun50i-a64"; aliases { - ethernet0 = &emac; serial0 = &uart0; serial1 = &uart1; }; @@ -68,14 +67,6 @@ }; }; -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&rgmii_pins>; - phy-mode = "rgmii"; - phy-handle = <&ext_rgmii_phy>; - status = "okay"; -}; - &i2c1 { pinctrl-names = "default"; pinctrl-0 = <&i2c1_pins>; @@ -86,13 +77,6 @@ bias-pull-up; }; -&mdio { - ext_rgmii_phy: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <1>; - }; -}; - &mmc0 { pinctrl-names = "default"; pinctrl-0 = <&mmc0_pins>; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts index 24f1aac366d6..f82ccf332c0f 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts @@ -48,18 +48,3 @@ /* TODO: Camera, touchscreen, etc. */ }; - -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&rgmii_pins>; - phy-mode = "rgmii"; - phy-handle = <&ext_rgmii_phy>; - status = "okay"; -}; - -&mdio { - ext_rgmii_phy: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <1>; - }; -}; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts index 827168bc22ed..7c533b6d4ba9 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts @@ -51,7 +51,6 @@ compatible = "pine64,pine64", "allwinner,sun50i-a64"; aliases { - ethernet0 = &emac; serial0 = &uart0; serial1 = &uart1; serial2 = &uart2; @@ -79,15 +78,6 @@ status = "okay"; }; -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&rmii_pins>; - phy-mode = "rmii"; - phy-handle = <&ext_rmii_phy1>; - status = "okay"; - -}; - &i2c1 { pinctrl-names = "default"; pinctrl-0 = <&i2c1_pins>; @@ -98,13 +88,6 @@ bias-pull-up; }; -&mdio { - ext_rmii_phy1: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <1>; - }; -}; - &mmc0 { pinctrl-names = "default"; pinctrl-0 = <&mmc0_pins>; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts index 216e3a5dafae..d891a1a27f6c 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts @@ -53,7 +53,6 @@ "allwinner,sun50i-a64"; aliases { - ethernet0 = &emac; serial0 = &uart0; }; @@ -77,21 +76,6 @@ status = "okay"; }; -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&rgmii_pins>; - phy-mode = "rgmii"; - phy-handle = <&ext_rgmii_phy>; - status = "okay"; -}; - -&mdio { - ext_rgmii_phy: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <1>; - }; -}; - &mmc2 { pinctrl-names = "default"; pinctrl-0 = <&mmc2_pins>; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi index bd0f33b77f57..68aadc9b96dc 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi @@ -449,26 +449,6 @@ #size-cells = <0>; }; - emac: ethernet@1c30000 { - compatible = "allwinner,sun50i-a64-emac"; - syscon = <&syscon>; - reg = <0x01c30000 0x10000>; - interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>; - interrupt-names = "macirq"; - resets = <&ccu RST_BUS_EMAC>; - reset-names = "stmmaceth"; - clocks = <&ccu CLK_BUS_EMAC>; - clock-names = "stmmaceth"; - status = "disabled"; - #address-cells = <1>; - #size-cells = <0>; - - mdio: mdio { - #address-cells = <1>; - #size-cells = <0>; - }; - }; - gic: interrupt-controller@1c81000 { compatible = "arm,gic-400"; reg = <0x01c81000 0x1000>, diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts b/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts index 968908761194..1c2387bd5df6 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts @@ -50,7 +50,6 @@ compatible = "friendlyarm,nanopi-neo2", "allwinner,sun50i-h5"; aliases { - ethernet0 = &emac; serial0 = &uart0; }; @@ -109,22 +108,6 @@ status = "okay"; }; -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&emac_rgmii_pins>; - phy-supply = <®_gmac_3v3>; - phy-handle = <&ext_rgmii_phy>; - phy-mode = "rgmii"; - status = "okay"; -}; - -&mdio { - ext_rgmii_phy: ethernet-phy@7 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <7>; - }; -}; - &mmc0 { pinctrl-names = "default"; pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-pc2.dts b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-pc2.dts index a8296feee884..4f77c8470f6c 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-pc2.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-pc2.dts @@ -59,7 +59,6 @@ }; aliases { - ethernet0 = &emac; serial0 = &uart0; }; @@ -137,28 +136,12 @@ status = "okay"; }; -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&emac_rgmii_pins>; - phy-supply = <®_gmac_3v3>; - phy-handle = <&ext_rgmii_phy>; - phy-mode = "rgmii"; - status = "okay"; -}; - &ir { pinctrl-names = "default"; pinctrl-0 = <&ir_pins_a>; status = "okay"; }; -&mdio { - ext_rgmii_phy: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <1>; - }; -}; - &mmc0 { pinctrl-names = "default"; pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-prime.dts b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-prime.dts index d906b302cbcd..6be06873e5af 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-prime.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-prime.dts @@ -54,7 +54,6 @@ compatible = "xunlong,orangepi-prime", "allwinner,sun50i-h5"; aliases { - ethernet0 = &emac; serial0 = &uart0; }; @@ -144,28 +143,12 @@ status = "okay"; }; -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&emac_rgmii_pins>; - phy-supply = <®_gmac_3v3>; - phy-handle = <&ext_rgmii_phy>; - phy-mode = "rgmii"; - status = "okay"; -}; - &ir { pinctrl-names = "default"; pinctrl-0 = <&ir_pins_a>; status = "okay"; }; -&mdio { - ext_rgmii_phy: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <1>; - }; -}; - &mmc0 { pinctrl-names = "default"; pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>; diff --git a/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi b/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi index e2b0da2c0bc7..105b2938082f 100644 --- a/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi +++ b/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi @@ -280,9 +280,6 @@ &decon { status = "okay"; - - i80-if-timings { - }; }; &decon_tv { @@ -1116,9 +1113,6 @@ &mic { status = "okay"; - - i80-if-timings { - }; }; &pmu_system_controller { diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi index 31fd77f82ced..d16b9cc1e825 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi @@ -653,21 +653,21 @@ }; msi1: msi-controller1@1571000 { - compatible = "fsl,1s1043a-msi"; + compatible = "fsl,ls1043a-msi"; reg = <0x0 0x1571000 0x0 0x8>; msi-controller; interrupts = <0 116 0x4>; }; msi2: msi-controller2@1572000 { - compatible = "fsl,1s1043a-msi"; + compatible = "fsl,ls1043a-msi"; reg = <0x0 0x1572000 0x0 0x8>; msi-controller; interrupts = <0 126 0x4>; }; msi3: msi-controller3@1573000 { - compatible = "fsl,1s1043a-msi"; + compatible = "fsl,ls1043a-msi"; reg = <0x0 0x1573000 0x0 0x8>; msi-controller; interrupts = <0 160 0x4>; @@ -689,7 +689,7 @@ bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */ 0x82000000 0x0 0x40000000 0x40 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */ - msi-parent = <&msi1>; + msi-parent = <&msi1>, <&msi2>, <&msi3>; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0000 0 0 1 &gic 0 110 0x4>, @@ -714,7 +714,7 @@ bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000 /* downstream I/O */ 0x82000000 0x0 0x40000000 0x48 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */ - msi-parent = <&msi2>; + msi-parent = <&msi1>, <&msi2>, <&msi3>; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0000 0 0 1 &gic 0 120 0x4>, @@ -739,7 +739,7 @@ bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x50 0x00010000 0x0 0x00010000 /* downstream I/O */ 0x82000000 0x0 0x40000000 0x50 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */ - msi-parent = <&msi3>; + msi-parent = <&msi1>, <&msi2>, <&msi3>; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0000 0 0 1 &gic 0 154 0x4>, diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi index dc1640be0345..c8ff0baddf1d 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi @@ -630,6 +630,37 @@ interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>; clocks = <&clockgen 4 1>; }; + + msi1: msi-controller@1580000 { + compatible = "fsl,ls1046a-msi"; + msi-controller; + reg = <0x0 0x1580000 0x0 0x10000>; + interrupts = <GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 111 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 112 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>; + }; + + msi2: msi-controller@1590000 { + compatible = "fsl,ls1046a-msi"; + msi-controller; + reg = <0x0 0x1590000 0x0 0x10000>; + interrupts = <GIC_SPI 126 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 122 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 123 IRQ_TYPE_LEVEL_HIGH>; + }; + + msi3: msi-controller@15a0000 { + compatible = "fsl,ls1046a-msi"; + msi-controller; + reg = <0x0 0x15a0000 0x0 0x10000>; + interrupts = <GIC_SPI 160 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 155 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 156 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 157 IRQ_TYPE_LEVEL_HIGH>; + }; + }; reserved-memory { diff --git a/arch/arm64/boot/dts/marvell/armada-ap806.dtsi b/arch/arm64/boot/dts/marvell/armada-ap806.dtsi index 1eb1f1e9aac4..4d360713ed12 100644 --- a/arch/arm64/boot/dts/marvell/armada-ap806.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-ap806.dtsi @@ -268,10 +268,10 @@ ap_gpio: gpio { compatible = "marvell,armada-8k-gpio"; offset = <0x1040>; - ngpios = <19>; + ngpios = <20>; gpio-controller; #gpio-cells = <2>; - gpio-ranges = <&ap_pinctrl 0 0 19>; + gpio-ranges = <&ap_pinctrl 0 0 20>; }; }; }; diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index f81c7b685fc6..2326e39d5892 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -20,7 +20,6 @@ generic-y += rwsem.h generic-y += segment.h generic-y += serial.h generic-y += set_memory.h -generic-y += simd.h generic-y += sizes.h generic-y += switch_to.h generic-y += trace_clock.h diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index 8cef47fa2218..b7e3f74822da 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -116,6 +116,8 @@ static inline void gic_write_bpr1(u32 val) #define gic_read_typer(c) readq_relaxed(c) #define gic_write_irouter(v, c) writeq_relaxed(v, c) +#define gic_read_lpir(c) readq_relaxed(c) +#define gic_write_lpir(v, c) writeq_relaxed(v, c) #define gic_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l)) @@ -133,5 +135,10 @@ static inline void gic_write_bpr1(u32 val) #define gicr_write_pendbaser(v, c) writeq_relaxed(v, c) #define gicr_read_pendbaser(c) readq_relaxed(c) +#define gits_write_vpropbaser(v, c) writeq_relaxed(v, c) + +#define gits_write_vpendbaser(v, c) writeq_relaxed(v, c) +#define gits_read_vpendbaser(c) readq_relaxed(c) + #endif /* __ASSEMBLY__ */ #endif /* __ASM_ARCH_GICV3_H */ diff --git a/arch/arm64/include/asm/asm-bug.h b/arch/arm64/include/asm/asm-bug.h new file mode 100644 index 000000000000..636e755bcdca --- /dev/null +++ b/arch/arm64/include/asm/asm-bug.h @@ -0,0 +1,54 @@ +#ifndef __ASM_ASM_BUG_H +/* + * Copyright (C) 2017 ARM Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#define __ASM_ASM_BUG_H + +#include <asm/brk-imm.h> + +#ifdef CONFIG_DEBUG_BUGVERBOSE +#define _BUGVERBOSE_LOCATION(file, line) __BUGVERBOSE_LOCATION(file, line) +#define __BUGVERBOSE_LOCATION(file, line) \ + .pushsection .rodata.str,"aMS",@progbits,1; \ + 2: .string file; \ + .popsection; \ + \ + .long 2b - 0b; \ + .short line; +#else +#define _BUGVERBOSE_LOCATION(file, line) +#endif + +#ifdef CONFIG_GENERIC_BUG + +#define __BUG_ENTRY(flags) \ + .pushsection __bug_table,"aw"; \ + .align 2; \ + 0: .long 1f - 0b; \ +_BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ + .short flags; \ + .popsection; \ + 1: +#else +#define __BUG_ENTRY(flags) +#endif + +#define ASM_BUG_FLAGS(flags) \ + __BUG_ENTRY(flags) \ + brk BUG_BRK_IMM + +#define ASM_BUG() ASM_BUG_FLAGS(0) + +#endif /* __ASM_ASM_BUG_H */ diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 1b67c3782d00..d58a6253c6ab 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -230,12 +230,18 @@ lr .req x30 // link register .endm /* - * @dst: Result of per_cpu(sym, smp_processor_id()) + * @dst: Result of per_cpu(sym, smp_processor_id()), can be SP for + * non-module code * @sym: The name of the per-cpu variable * @tmp: scratch register */ .macro adr_this_cpu, dst, sym, tmp +#ifndef MODULE + adrp \tmp, \sym + add \dst, \tmp, #:lo12:\sym +#else adr_l \dst, \sym +#endif mrs \tmp, tpidr_el1 add \dst, \dst, \tmp .endm @@ -353,6 +359,12 @@ alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE alternative_else dc civac, \kaddr alternative_endif + .elseif (\op == cvap) +alternative_if ARM64_HAS_DCPOP + sys 3, c7, c12, 1, \kaddr // dc cvap +alternative_else + dc cvac, \kaddr +alternative_endif .else dc \op, \kaddr .endif @@ -403,6 +415,17 @@ alternative_endif .size __pi_##x, . - x; \ ENDPROC(x) +/* + * Annotate a function as being unsuitable for kprobes. + */ +#ifdef CONFIG_KPROBES +#define NOKPROBE(x) \ + .pushsection "_kprobe_blacklist", "aw"; \ + .quad x; \ + .popsection; +#else +#define NOKPROBE(x) +#endif /* * Emit a 64-bit absolute little endian symbol reference in a way that * ensures that it will be resolved at build time, even when building a diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index a02a57186f56..d7dc43752705 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -18,41 +18,12 @@ #ifndef _ARCH_ARM64_ASM_BUG_H #define _ARCH_ARM64_ASM_BUG_H -#include <asm/brk-imm.h> +#include <linux/stringify.h> -#ifdef CONFIG_DEBUG_BUGVERBOSE -#define _BUGVERBOSE_LOCATION(file, line) __BUGVERBOSE_LOCATION(file, line) -#define __BUGVERBOSE_LOCATION(file, line) \ - ".pushsection .rodata.str,\"aMS\",@progbits,1\n" \ - "2: .string \"" file "\"\n\t" \ - ".popsection\n\t" \ - \ - ".long 2b - 0b\n\t" \ - ".short " #line "\n\t" -#else -#define _BUGVERBOSE_LOCATION(file, line) -#endif - -#ifdef CONFIG_GENERIC_BUG - -#define __BUG_ENTRY(flags) \ - ".pushsection __bug_table,\"aw\"\n\t" \ - ".align 2\n\t" \ - "0: .long 1f - 0b\n\t" \ -_BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ - ".short " #flags "\n\t" \ - ".popsection\n" \ - "1: " -#else -#define __BUG_ENTRY(flags) "" -#endif +#include <asm/asm-bug.h> #define __BUG_FLAGS(flags) \ - asm volatile ( \ - __BUG_ENTRY(flags) \ - "brk %[imm]" :: [imm] "i" (BUG_BRK_IMM) \ - ); - + asm volatile (__stringify(ASM_BUG_FLAGS(flags))); #define BUG() do { \ __BUG_FLAGS(0); \ diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index d74a284abdc2..76d1cc85d5b1 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -67,7 +67,9 @@ */ extern void flush_icache_range(unsigned long start, unsigned long end); extern void __flush_dcache_area(void *addr, size_t len); +extern void __inval_dcache_area(void *addr, size_t len); extern void __clean_dcache_area_poc(void *addr, size_t len); +extern void __clean_dcache_area_pop(void *addr, size_t len); extern void __clean_dcache_area_pou(void *addr, size_t len); extern long __flush_cache_user_range(unsigned long start, unsigned long end); extern void sync_icache_aliases(void *kaddr, unsigned long len); @@ -150,6 +152,6 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end) { } -int set_memory_valid(unsigned long addr, unsigned long size, int enable); +int set_memory_valid(unsigned long addr, int numpages, int enable); #endif diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 8d2272c6822c..8da621627d7c 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -39,7 +39,8 @@ #define ARM64_WORKAROUND_QCOM_FALKOR_E1003 18 #define ARM64_WORKAROUND_858921 19 #define ARM64_WORKAROUND_CAVIUM_30115 20 +#define ARM64_HAS_DCPOP 21 -#define ARM64_NCAPS 21 +#define ARM64_NCAPS 22 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 8f3043aba873..0cad5a5894b9 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -3,7 +3,9 @@ #include <asm/boot.h> #include <asm/cpufeature.h> +#include <asm/fpsimd.h> #include <asm/io.h> +#include <asm/memory.h> #include <asm/mmu_context.h> #include <asm/neon.h> #include <asm/ptrace.h> @@ -20,8 +22,8 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); #define arch_efi_call_virt_setup() \ ({ \ - kernel_neon_begin(); \ efi_virtmap_load(); \ + __efi_fpsimd_begin(); \ }) #define arch_efi_call_virt(p, f, args...) \ @@ -33,8 +35,8 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); #define arch_efi_call_virt_teardown() \ ({ \ + __efi_fpsimd_end(); \ efi_virtmap_unload(); \ - kernel_neon_end(); \ }) #define ARCH_EFI_IRQ_FLAGS_MASK (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT) @@ -48,6 +50,13 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); */ #define EFI_FDT_ALIGN SZ_2M /* used by allocate_new_fdt_and_exit_boot() */ +/* + * In some configurations (e.g. VMAP_STACK && 64K pages), stacks built into the + * kernel need greater alignment than we require the segments to be padded to. + */ +#define EFI_KIMG_ALIGN \ + (SEGMENT_ALIGN > THREAD_ALIGN ? SEGMENT_ALIGN : THREAD_ALIGN) + /* on arm64, the FDT may be located anywhere in system RAM */ static inline unsigned long efi_get_max_fdt_addr(unsigned long dram_base) { diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 3288c2b36731..33be513ef24c 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -139,7 +139,6 @@ typedef struct user_fpsimd_state elf_fpregset_t; #define SET_PERSONALITY(ex) \ ({ \ - clear_bit(TIF_32BIT, ¤t->mm->context.flags); \ clear_thread_flag(TIF_32BIT); \ current->personality &= ~READ_IMPLIES_EXEC; \ }) @@ -195,7 +194,6 @@ typedef compat_elf_greg_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; */ #define COMPAT_SET_PERSONALITY(ex) \ ({ \ - set_bit(TIF_32BIT, ¤t->mm->context.flags); \ set_thread_flag(TIF_32BIT); \ }) #define COMPAT_ARCH_DLINFO diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 8cabd57b6348..66ed8b6b9976 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -77,16 +77,23 @@ #define ESR_ELx_EC_MASK (UL(0x3F) << ESR_ELx_EC_SHIFT) #define ESR_ELx_EC(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) -#define ESR_ELx_IL (UL(1) << 25) +#define ESR_ELx_IL_SHIFT (25) +#define ESR_ELx_IL (UL(1) << ESR_ELx_IL_SHIFT) #define ESR_ELx_ISS_MASK (ESR_ELx_IL - 1) /* ISS field definitions shared by different classes */ -#define ESR_ELx_WNR (UL(1) << 6) +#define ESR_ELx_WNR_SHIFT (6) +#define ESR_ELx_WNR (UL(1) << ESR_ELx_WNR_SHIFT) /* Shared ISS field definitions for Data/Instruction aborts */ -#define ESR_ELx_FnV (UL(1) << 10) -#define ESR_ELx_EA (UL(1) << 9) -#define ESR_ELx_S1PTW (UL(1) << 7) +#define ESR_ELx_SET_SHIFT (11) +#define ESR_ELx_SET_MASK (UL(3) << ESR_ELx_SET_SHIFT) +#define ESR_ELx_FnV_SHIFT (10) +#define ESR_ELx_FnV (UL(1) << ESR_ELx_FnV_SHIFT) +#define ESR_ELx_EA_SHIFT (9) +#define ESR_ELx_EA (UL(1) << ESR_ELx_EA_SHIFT) +#define ESR_ELx_S1PTW_SHIFT (7) +#define ESR_ELx_S1PTW (UL(1) << ESR_ELx_S1PTW_SHIFT) /* Shared ISS fault status code(IFSC/DFSC) for Data/Instruction aborts */ #define ESR_ELx_FSC (0x3F) @@ -97,15 +104,20 @@ #define ESR_ELx_FSC_PERM (0x0C) /* ISS field definitions for Data Aborts */ -#define ESR_ELx_ISV (UL(1) << 24) +#define ESR_ELx_ISV_SHIFT (24) +#define ESR_ELx_ISV (UL(1) << ESR_ELx_ISV_SHIFT) #define ESR_ELx_SAS_SHIFT (22) #define ESR_ELx_SAS (UL(3) << ESR_ELx_SAS_SHIFT) -#define ESR_ELx_SSE (UL(1) << 21) +#define ESR_ELx_SSE_SHIFT (21) +#define ESR_ELx_SSE (UL(1) << ESR_ELx_SSE_SHIFT) #define ESR_ELx_SRT_SHIFT (16) #define ESR_ELx_SRT_MASK (UL(0x1F) << ESR_ELx_SRT_SHIFT) -#define ESR_ELx_SF (UL(1) << 15) -#define ESR_ELx_AR (UL(1) << 14) -#define ESR_ELx_CM (UL(1) << 8) +#define ESR_ELx_SF_SHIFT (15) +#define ESR_ELx_SF (UL(1) << ESR_ELx_SF_SHIFT) +#define ESR_ELx_AR_SHIFT (14) +#define ESR_ELx_AR (UL(1) << ESR_ELx_AR_SHIFT) +#define ESR_ELx_CM_SHIFT (8) +#define ESR_ELx_CM (UL(1) << ESR_ELx_CM_SHIFT) /* ISS field definitions for exceptions taken in to Hyp */ #define ESR_ELx_CV (UL(1) << 24) @@ -157,9 +169,10 @@ /* * User space cache operations have the following sysreg encoding * in System instructions. - * op0=1, op1=3, op2=1, crn=7, crm={ 5, 10, 11, 14 }, WRITE (L=0) + * op0=1, op1=3, op2=1, crn=7, crm={ 5, 10, 11, 12, 14 }, WRITE (L=0) */ #define ESR_ELx_SYS64_ISS_CRM_DC_CIVAC 14 +#define ESR_ELx_SYS64_ISS_CRM_DC_CVAP 12 #define ESR_ELx_SYS64_ISS_CRM_DC_CVAU 11 #define ESR_ELx_SYS64_ISS_CRM_DC_CVAC 10 #define ESR_ELx_SYS64_ISS_CRM_IC_IVAU 5 @@ -209,6 +222,13 @@ #ifndef __ASSEMBLY__ #include <asm/types.h> +static inline bool esr_is_data_abort(u32 esr) +{ + const u32 ec = ESR_ELx_EC(esr); + + return ec == ESR_ELx_EC_DABT_LOW || ec == ESR_ELx_EC_DABT_CUR; +} + const char *esr_get_class_string(u32 esr); #endif /* __ASSEMBLY */ diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 50f559f574fe..410c48163c6a 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -41,16 +41,6 @@ struct fpsimd_state { unsigned int cpu; }; -/* - * Struct for stacking the bottom 'n' FP/SIMD registers. - */ -struct fpsimd_partial_state { - u32 fpsr; - u32 fpcr; - u32 num_regs; - __uint128_t vregs[32]; -}; - #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* Masks for extracting the FPSR and FPCR from the FPSCR */ @@ -77,9 +67,9 @@ extern void fpsimd_update_current_state(struct fpsimd_state *state); extern void fpsimd_flush_task_state(struct task_struct *target); -extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state, - u32 num_regs); -extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state); +/* For use by EFI runtime services calls only */ +extern void __efi_fpsimd_begin(void); +extern void __efi_fpsimd_end(void); #endif diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index a2daf1293028..0f5fdd388b0d 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h @@ -75,59 +75,3 @@ ldr w\tmpnr, [\state, #16 * 2 + 4] fpsimd_restore_fpcr x\tmpnr, \state .endm - -.macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2 - mrs x\tmpnr1, fpsr - str w\numnr, [\state, #8] - mrs x\tmpnr2, fpcr - stp w\tmpnr1, w\tmpnr2, [\state] - adr x\tmpnr1, 0f - add \state, \state, x\numnr, lsl #4 - sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1 - br x\tmpnr1 - stp q30, q31, [\state, #-16 * 30 - 16] - stp q28, q29, [\state, #-16 * 28 - 16] - stp q26, q27, [\state, #-16 * 26 - 16] - stp q24, q25, [\state, #-16 * 24 - 16] - stp q22, q23, [\state, #-16 * 22 - 16] - stp q20, q21, [\state, #-16 * 20 - 16] - stp q18, q19, [\state, #-16 * 18 - 16] - stp q16, q17, [\state, #-16 * 16 - 16] - stp q14, q15, [\state, #-16 * 14 - 16] - stp q12, q13, [\state, #-16 * 12 - 16] - stp q10, q11, [\state, #-16 * 10 - 16] - stp q8, q9, [\state, #-16 * 8 - 16] - stp q6, q7, [\state, #-16 * 6 - 16] - stp q4, q5, [\state, #-16 * 4 - 16] - stp q2, q3, [\state, #-16 * 2 - 16] - stp q0, q1, [\state, #-16 * 0 - 16] -0: -.endm - -.macro fpsimd_restore_partial state, tmpnr1, tmpnr2 - ldp w\tmpnr1, w\tmpnr2, [\state] - msr fpsr, x\tmpnr1 - fpsimd_restore_fpcr x\tmpnr2, x\tmpnr1 - adr x\tmpnr1, 0f - ldr w\tmpnr2, [\state, #8] - add \state, \state, x\tmpnr2, lsl #4 - sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1 - br x\tmpnr1 - ldp q30, q31, [\state, #-16 * 30 - 16] - ldp q28, q29, [\state, #-16 * 28 - 16] - ldp q26, q27, [\state, #-16 * 26 - 16] - ldp q24, q25, [\state, #-16 * 24 - 16] - ldp q22, q23, [\state, #-16 * 22 - 16] - ldp q20, q21, [\state, #-16 * 20 - 16] - ldp q18, q19, [\state, #-16 * 18 - 16] - ldp q16, q17, [\state, #-16 * 16 - 16] - ldp q14, q15, [\state, #-16 * 14 - 16] - ldp q12, q13, [\state, #-16 * 12 - 16] - ldp q10, q11, [\state, #-16 * 10 - 16] - ldp q8, q9, [\state, #-16 * 8 - 16] - ldp q6, q7, [\state, #-16 * 6 - 16] - ldp q4, q5, [\state, #-16 * 4 - 16] - ldp q2, q3, [\state, #-16 * 2 - 16] - ldp q0, q1, [\state, #-16 * 0 - 16] -0: -.endm diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index f32b42e8725d..5bb2fd4674e7 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -48,20 +48,10 @@ do { \ } while (0) static inline int -futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (int)(encoded_op << 8) >> 20; - int cmparg = (int)(encoded_op << 20) >> 20; int oldval = 0, ret, tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1U << (oparg & 0x1f); - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -91,17 +81,9 @@ futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 793bd73b0d07..1dca41bea16a 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -18,7 +18,6 @@ #ifndef __ASM_HUGETLB_H #define __ASM_HUGETLB_H -#include <asm-generic/hugetlb.h> #include <asm/page.h> static inline pte_t huge_ptep_get(pte_t *ptep) @@ -82,6 +81,14 @@ extern void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); extern void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long sz); +#define huge_pte_clear huge_pte_clear +extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz); +#define set_huge_swap_pte_at set_huge_swap_pte_at + +#include <asm-generic/hugetlb.h> #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE static inline bool gigantic_page_supported(void) { return true; } diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index b77197d941fc..5e6f77239064 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -1,45 +1,12 @@ #ifndef __ASM_IRQ_H #define __ASM_IRQ_H -#define IRQ_STACK_SIZE THREAD_SIZE -#define IRQ_STACK_START_SP THREAD_START_SP - #ifndef __ASSEMBLER__ -#include <linux/percpu.h> - #include <asm-generic/irq.h> -#include <asm/thread_info.h> struct pt_regs; -DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); - -/* - * The highest address on the stack, and the first to be used. Used to - * find the dummy-stack frame put down by el?_irq() in entry.S, which - * is structured as follows: - * - * ------------ - * | | <- irq_stack_ptr - * top ------------ - * | x19 | <- irq_stack_ptr - 0x08 - * ------------ - * | x29 | <- irq_stack_ptr - 0x10 - * ------------ - * - * where x19 holds a copy of the task stack pointer where the struct pt_regs - * from kernel_entry can be found. - * - */ -#define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) - -/* - * The offset from irq_stack_ptr where entry.S will store the original - * stack pointer. Used by unwind_frame() and dump_backtrace(). - */ -#define IRQ_STACK_TO_TASK_STACK(ptr) (*((unsigned long *)((ptr) - 0x08))) - extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); static inline int nr_legacy_irqs(void) @@ -47,14 +14,5 @@ static inline int nr_legacy_irqs(void) return 0; } -static inline bool on_irq_stack(unsigned long sp, int cpu) -{ - /* variable names the same as kernel/stacktrace.c */ - unsigned long low = (unsigned long)per_cpu(irq_stack, cpu); - unsigned long high = low + IRQ_STACK_START_SP; - - return (low <= sp && sp <= high); -} - #endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index a89cc22abadc..672c8684d5c2 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -175,18 +175,15 @@ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) static inline void kvm_set_s2pte_readonly(pte_t *pte) { - pteval_t pteval; - unsigned long tmp; - - asm volatile("// kvm_set_s2pte_readonly\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " and %0, %0, %3 // clear PTE_S2_RDWR\n" - " orr %0, %0, %4 // set PTE_S2_RDONLY\n" - " stxr %w1, %0, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*pte)) - : "L" (~PTE_S2_RDWR), "L" (PTE_S2_RDONLY)); + pteval_t old_pteval, pteval; + + pteval = READ_ONCE(pte_val(*pte)); + do { + old_pteval = pteval; + pteval &= ~PTE_S2_RDWR; + pteval |= PTE_S2_RDONLY; + pteval = cmpxchg_relaxed(&pte_val(*pte), old_pteval, pteval); + } while (pteval != old_pteval); } static inline bool kvm_s2pte_readonly(pte_t *pte) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index ef39dcb9ca6a..3585a5e26151 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -25,6 +25,7 @@ #include <linux/const.h> #include <linux/types.h> #include <asm/bug.h> +#include <asm/page-def.h> #include <asm/sizes.h> /* @@ -103,6 +104,58 @@ #define KASAN_SHADOW_SIZE (0) #endif +#define MIN_THREAD_SHIFT 14 + +/* + * VMAP'd stacks are allocated at page granularity, so we must ensure that such + * stacks are a multiple of page size. + */ +#if defined(CONFIG_VMAP_STACK) && (MIN_THREAD_SHIFT < PAGE_SHIFT) +#define THREAD_SHIFT PAGE_SHIFT +#else +#define THREAD_SHIFT MIN_THREAD_SHIFT +#endif + +#if THREAD_SHIFT >= PAGE_SHIFT +#define THREAD_SIZE_ORDER (THREAD_SHIFT - PAGE_SHIFT) +#endif + +#define THREAD_SIZE (UL(1) << THREAD_SHIFT) + +/* + * By aligning VMAP'd stacks to 2 * THREAD_SIZE, we can detect overflow by + * checking sp & (1 << THREAD_SHIFT), which we can do cheaply in the entry + * assembly. + */ +#ifdef CONFIG_VMAP_STACK +#define THREAD_ALIGN (2 * THREAD_SIZE) +#else +#define THREAD_ALIGN THREAD_SIZE +#endif + +#define IRQ_STACK_SIZE THREAD_SIZE + +#define OVERFLOW_STACK_SIZE SZ_4K + +/* + * Alignment of kernel segments (e.g. .text, .data). + */ +#if defined(CONFIG_DEBUG_ALIGN_RODATA) +/* + * 4 KB granule: 1 level 2 entry + * 16 KB granule: 128 level 3 entries, with contiguous bit + * 64 KB granule: 32 level 3 entries, with contiguous bit + */ +#define SEGMENT_ALIGN SZ_2M +#else +/* + * 4 KB granule: 16 level 3 entries, with contiguous bit + * 16 KB granule: 4 level 3 entries, without contiguous bit + * 64 KB granule: 1 level 3 entry + */ +#define SEGMENT_ALIGN SZ_64K +#endif + /* * Memory types available. */ diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 5468c834b072..0d34bf0a89c7 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -16,6 +16,8 @@ #ifndef __ASM_MMU_H #define __ASM_MMU_H +#define MMCF_AARCH32 0x1 /* mm context flag for AArch32 executables */ + typedef struct { atomic64_t id; void *vdso; diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h index ad4cdc966c0f..f922eaf780f9 100644 --- a/arch/arm64/include/asm/neon.h +++ b/arch/arm64/include/asm/neon.h @@ -8,12 +8,22 @@ * published by the Free Software Foundation. */ +#ifndef __ASM_NEON_H +#define __ASM_NEON_H + #include <linux/types.h> #include <asm/fpsimd.h> #define cpu_has_neon() system_supports_fpsimd() -#define kernel_neon_begin() kernel_neon_begin_partial(32) - -void kernel_neon_begin_partial(u32 num_regs); +void kernel_neon_begin(void); void kernel_neon_end(void); + +/* + * Temporary macro to allow the crypto code to compile. Note that the + * semantics of kernel_neon_begin_partial() are now different from the + * original as it does not allow being called in an interrupt context. + */ +#define kernel_neon_begin_partial(num_regs) kernel_neon_begin() + +#endif /* ! __ASM_NEON_H */ diff --git a/arch/arm64/include/asm/numa.h b/arch/arm64/include/asm/numa.h index bf466d1876e3..ef7b23863a7c 100644 --- a/arch/arm64/include/asm/numa.h +++ b/arch/arm64/include/asm/numa.h @@ -7,9 +7,6 @@ #define NR_NODE_MEMBLKS (MAX_NUMNODES * 2) -/* currently, arm64 implements flat NUMA topology */ -#define parent_node(node) (node) - int __node_distance(int from, int to); #define node_distance(a, b) __node_distance(a, b) diff --git a/arch/arm64/include/asm/page-def.h b/arch/arm64/include/asm/page-def.h new file mode 100644 index 000000000000..01591a29dc2e --- /dev/null +++ b/arch/arm64/include/asm/page-def.h @@ -0,0 +1,34 @@ +/* + * Based on arch/arm/include/asm/page.h + * + * Copyright (C) 1995-2003 Russell King + * Copyright (C) 2017 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#ifndef __ASM_PAGE_DEF_H +#define __ASM_PAGE_DEF_H + +#include <linux/const.h> + +/* PAGE_SHIFT determines the page size */ +/* CONT_SHIFT determines the number of pages which can be tracked together */ +#define PAGE_SHIFT CONFIG_ARM64_PAGE_SHIFT +#define CONT_SHIFT CONFIG_ARM64_CONT_SHIFT +#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1)) + +#define CONT_SIZE (_AC(1, UL) << (CONT_SHIFT + PAGE_SHIFT)) +#define CONT_MASK (~(CONT_SIZE-1)) + +#endif /* __ASM_PAGE_DEF_H */ diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 8472c6def5ef..60d02c81a3a2 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -19,17 +19,7 @@ #ifndef __ASM_PAGE_H #define __ASM_PAGE_H -#include <linux/const.h> - -/* PAGE_SHIFT determines the page size */ -/* CONT_SHIFT determines the number of pages which can be tracked together */ -#define PAGE_SHIFT CONFIG_ARM64_PAGE_SHIFT -#define CONT_SHIFT CONFIG_ARM64_CONT_SHIFT -#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) - -#define CONT_SIZE (_AC(1, UL) << (CONT_SHIFT + PAGE_SHIFT)) -#define CONT_MASK (~(CONT_SIZE-1)) +#include <asm/page-def.h> #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 2142c7726e76..0a5635fb0ef9 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -63,23 +63,21 @@ #define PAGE_S2 __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) #define PAGE_S2_DEVICE __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_UXN) -#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_PXN | PTE_UXN) +#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_PXN | PTE_UXN) #define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) #define PAGE_SHARED_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) -#define PAGE_COPY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) -#define PAGE_COPY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) -#define PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) -#define PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) -#define PAGE_EXECONLY __pgprot(_PAGE_DEFAULT | PTE_NG | PTE_PXN) +#define PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) +#define PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN) +#define PAGE_EXECONLY __pgprot(_PAGE_DEFAULT | PTE_RDONLY | PTE_NG | PTE_PXN) #define __P000 PAGE_NONE #define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY +#define __P010 PAGE_READONLY +#define __P011 PAGE_READONLY #define __P100 PAGE_EXECONLY #define __P101 PAGE_READONLY_EXEC -#define __P110 PAGE_COPY_EXEC -#define __P111 PAGE_COPY_EXEC +#define __P110 PAGE_READONLY_EXEC +#define __P111 PAGE_READONLY_EXEC #define __S000 PAGE_NONE #define __S001 PAGE_READONLY diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 6eae342ced6b..bc4e92337d16 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -39,6 +39,7 @@ #ifndef __ASSEMBLY__ +#include <asm/cmpxchg.h> #include <asm/fixmap.h> #include <linux/mmdebug.h> @@ -84,11 +85,7 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; (__boundary - 1 < (end) - 1) ? __boundary : (end); \ }) -#ifdef CONFIG_ARM64_HW_AFDBM #define pte_hw_dirty(pte) (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY)) -#else -#define pte_hw_dirty(pte) (0) -#endif #define pte_sw_dirty(pte) (!!(pte_val(pte) & PTE_DIRTY)) #define pte_dirty(pte) (pte_sw_dirty(pte) || pte_hw_dirty(pte)) @@ -124,12 +121,16 @@ static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot) static inline pte_t pte_wrprotect(pte_t pte) { - return clear_pte_bit(pte, __pgprot(PTE_WRITE)); + pte = clear_pte_bit(pte, __pgprot(PTE_WRITE)); + pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); + return pte; } static inline pte_t pte_mkwrite(pte_t pte) { - return set_pte_bit(pte, __pgprot(PTE_WRITE)); + pte = set_pte_bit(pte, __pgprot(PTE_WRITE)); + pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY)); + return pte; } static inline pte_t pte_mkclean(pte_t pte) @@ -168,11 +169,6 @@ static inline pte_t pte_mknoncont(pte_t pte) return clear_pte_bit(pte, __pgprot(PTE_CONT)); } -static inline pte_t pte_clear_rdonly(pte_t pte) -{ - return clear_pte_bit(pte, __pgprot(PTE_RDONLY)); -} - static inline pte_t pte_mkpresent(pte_t pte) { return set_pte_bit(pte, __pgprot(PTE_VALID)); @@ -220,22 +216,15 @@ extern void __sync_icache_dcache(pte_t pteval, unsigned long addr); static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - if (pte_present(pte)) { - if (pte_sw_dirty(pte) && pte_write(pte)) - pte_val(pte) &= ~PTE_RDONLY; - else - pte_val(pte) |= PTE_RDONLY; - if (pte_user_exec(pte) && !pte_special(pte)) - __sync_icache_dcache(pte, addr); - } + if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte)) + __sync_icache_dcache(pte, addr); /* * If the existing pte is valid, check for potential race with * hardware updates of the pte (ptep_set_access_flags safely changes * valid ptes without going through an invalid entry). */ - if (IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && - pte_valid(*ptep) && pte_valid(pte)) { + if (pte_valid(*ptep) && pte_valid(pte)) { VM_WARN_ONCE(!pte_young(pte), "%s: racy access flag clearing: 0x%016llx -> 0x%016llx", __func__, pte_val(*ptep), pte_val(pte)); @@ -571,7 +560,6 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) return pte_pmd(pte_modify(pmd_pte(pmd), newprot)); } -#ifdef CONFIG_ARM64_HW_AFDBM #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, @@ -593,20 +581,17 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int __ptep_test_and_clear_young(pte_t *ptep) { - pteval_t pteval; - unsigned int tmp, res; + pte_t old_pte, pte; - asm volatile("// __ptep_test_and_clear_young\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " ubfx %w3, %w0, %5, #1 // extract PTE_AF (young)\n" - " and %0, %0, %4 // clear PTE_AF\n" - " stxr %w1, %0, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*ptep)), "=&r" (res) - : "L" (~PTE_AF), "I" (ilog2(PTE_AF))); + pte = READ_ONCE(*ptep); + do { + old_pte = pte; + pte = pte_mkold(pte); + pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), + pte_val(old_pte), pte_val(pte)); + } while (pte_val(pte) != pte_val(old_pte)); - return res; + return pte_young(pte); } static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, @@ -630,17 +615,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pteval_t old_pteval; - unsigned int tmp; - - asm volatile("// ptep_get_and_clear\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " stxr %w1, xzr, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (old_pteval), "=&r" (tmp), "+Q" (pte_val(*ptep))); - - return __pte(old_pteval); + return __pte(xchg_relaxed(&pte_val(*ptep), 0)); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -653,27 +628,32 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * ptep_set_wrprotect - mark read-only while trasferring potential hardware - * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. + * ptep_set_wrprotect - mark read-only while preserving the hardware update of + * the Access Flag. */ #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pteval_t pteval; - unsigned long tmp; + pte_t old_pte, pte; - asm volatile("// ptep_set_wrprotect\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " tst %0, %4 // check for hw dirty (!PTE_RDONLY)\n" - " csel %1, %3, xzr, eq // set PTE_DIRTY|PTE_RDONLY if dirty\n" - " orr %0, %0, %1 // if !dirty, PTE_RDONLY is already set\n" - " and %0, %0, %5 // clear PTE_WRITE/PTE_DBM\n" - " stxr %w1, %0, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*ptep)) - : "r" (PTE_DIRTY|PTE_RDONLY), "L" (PTE_RDONLY), "L" (~PTE_WRITE) - : "cc"); + /* + * ptep_set_wrprotect() is only called on CoW mappings which are + * private (!VM_SHARED) with the pte either read-only (!PTE_WRITE && + * PTE_RDONLY) or writable and software-dirty (PTE_WRITE && + * !PTE_RDONLY && PTE_DIRTY); see is_cow_mapping() and + * protection_map[]. There is no race with the hardware update of the + * dirty state: clearing of PTE_RDONLY when PTE_WRITE (a.k.a. PTE_DBM) + * is set. + */ + VM_WARN_ONCE(pte_write(*ptep) && !pte_dirty(*ptep), + "%s: potential race with hardware DBM", __func__); + pte = READ_ONCE(*ptep); + do { + old_pte = pte; + pte = pte_wrprotect(pte); + pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), + pte_val(old_pte), pte_val(pte)); + } while (pte_val(pte) != pte_val(old_pte)); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -684,7 +664,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, ptep_set_wrprotect(mm, address, (pte_t *)pmdp); } #endif -#endif /* CONFIG_ARM64_HW_AFDBM */ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern pgd_t idmap_pg_dir[PTRS_PER_PGD]; diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 64c9e78f9882..29adab8138c3 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -112,7 +112,7 @@ void tls_preserve_current_state(void); static inline void start_thread_common(struct pt_regs *regs, unsigned long pc) { memset(regs, 0, sizeof(*regs)); - regs->syscallno = ~0UL; + forget_syscall(regs); regs->pc = pc; } @@ -159,7 +159,7 @@ extern struct task_struct *cpu_switch_to(struct task_struct *prev, struct task_struct *next); #define task_pt_regs(p) \ - ((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1) + ((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1) #define KSTK_EIP(tsk) ((unsigned long)task_pt_regs(tsk)->pc) #define KSTK_ESP(tsk) user_stack_pointer(task_pt_regs(tsk)) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 11403fdd0a50..6069d66e0bc2 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -72,8 +72,19 @@ #define COMPAT_PT_TEXT_ADDR 0x10000 #define COMPAT_PT_DATA_ADDR 0x10004 #define COMPAT_PT_TEXT_END_ADDR 0x10008 + +/* + * If pt_regs.syscallno == NO_SYSCALL, then the thread is not executing + * a syscall -- i.e., its most recent entry into the kernel from + * userspace was not via SVC, or otherwise a tracer cancelled the syscall. + * + * This must have the value -1, for ABI compatibility with ptrace etc. + */ +#define NO_SYSCALL (-1) + #ifndef __ASSEMBLY__ #include <linux/bug.h> +#include <linux/types.h> /* sizeof(struct user) for AArch32 */ #define COMPAT_USER_SZ 296 @@ -116,11 +127,29 @@ struct pt_regs { }; }; u64 orig_x0; - u64 syscallno; +#ifdef __AARCH64EB__ + u32 unused2; + s32 syscallno; +#else + s32 syscallno; + u32 unused2; +#endif + u64 orig_addr_limit; u64 unused; // maintain 16 byte alignment + u64 stackframe[2]; }; +static inline bool in_syscall(struct pt_regs const *regs) +{ + return regs->syscallno != NO_SYSCALL; +} + +static inline void forget_syscall(struct pt_regs *regs) +{ + regs->syscallno = NO_SYSCALL; +} + #define MAX_REG_OFFSET offsetof(struct pt_regs, pstate) #define arch_has_single_step() (1) diff --git a/arch/arm64/include/asm/signal32.h b/arch/arm64/include/asm/signal32.h index eeaa97559bab..81abea0b7650 100644 --- a/arch/arm64/include/asm/signal32.h +++ b/arch/arm64/include/asm/signal32.h @@ -22,8 +22,6 @@ #define AARCH32_KERN_SIGRET_CODE_OFFSET 0x500 -extern const compat_ulong_t aarch32_sigret_code[6]; - int compat_setup_frame(int usig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs); int compat_setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, diff --git a/arch/arm64/include/asm/simd.h b/arch/arm64/include/asm/simd.h new file mode 100644 index 000000000000..fa8b3fe932e6 --- /dev/null +++ b/arch/arm64/include/asm/simd.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#ifndef __ASM_SIMD_H +#define __ASM_SIMD_H + +#include <linux/compiler.h> +#include <linux/irqflags.h> +#include <linux/percpu.h> +#include <linux/preempt.h> +#include <linux/types.h> + +#ifdef CONFIG_KERNEL_MODE_NEON + +DECLARE_PER_CPU(bool, kernel_neon_busy); + +/* + * may_use_simd - whether it is allowable at this time to issue SIMD + * instructions or access the SIMD register file + * + * Callers must not assume that the result remains true beyond the next + * preempt_enable() or return from softirq context. + */ +static __must_check inline bool may_use_simd(void) +{ + /* + * The raw_cpu_read() is racy if called with preemption enabled. + * This is not a bug: kernel_neon_busy is only set when + * preemption is disabled, so we cannot migrate to another CPU + * while it is set, nor can we migrate to a CPU where it is set. + * So, if we find it clear on some CPU then we're guaranteed to + * find it clear on any CPU we could migrate to. + * + * If we are in between kernel_neon_begin()...kernel_neon_end(), + * the flag will be set, but preemption is also disabled, so we + * can't migrate to another CPU and spuriously see it become + * false. + */ + return !in_irq() && !irqs_disabled() && !in_nmi() && + !raw_cpu_read(kernel_neon_busy); +} + +#else /* ! CONFIG_KERNEL_MODE_NEON */ + +static __must_check inline bool may_use_simd(void) { + return false; +} + +#endif /* ! CONFIG_KERNEL_MODE_NEON */ + +#endif diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 55f08c5acfad..f82b447bd34f 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -148,7 +148,7 @@ static inline void cpu_panic_kernel(void) */ bool cpus_are_stuck_in_kernel(void); -extern void smp_send_crash_stop(void); +extern void crash_smp_send_stop(void); extern bool smp_crash_stop_failed(void); #endif /* ifndef __ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index cae331d553f8..95ad7102b63c 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -26,58 +26,6 @@ * The memory barriers are implicit with the load-acquire and store-release * instructions. */ -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - unsigned int tmp; - arch_spinlock_t lockval; - u32 owner; - - /* - * Ensure prior spin_lock operations to other locks have completed - * on this CPU before we test whether "lock" is locked. - */ - smp_mb(); - owner = READ_ONCE(lock->owner) << 16; - - asm volatile( -" sevl\n" -"1: wfe\n" -"2: ldaxr %w0, %2\n" - /* Is the lock free? */ -" eor %w1, %w0, %w0, ror #16\n" -" cbz %w1, 3f\n" - /* Lock taken -- has there been a subsequent unlock->lock transition? */ -" eor %w1, %w3, %w0, lsl #16\n" -" cbz %w1, 1b\n" - /* - * The owner has been updated, so there was an unlock->lock - * transition that we missed. That means we can rely on the - * store-release of the unlock operation paired with the - * load-acquire of the lock operation to publish any of our - * previous stores to the new lock owner and therefore don't - * need to bother with the writeback below. - */ -" b 4f\n" -"3:\n" - /* - * Serialise against any concurrent lockers by writing back the - * unlocked lock value - */ - ARM64_LSE_ATOMIC_INSN( - /* LL/SC */ -" stxr %w1, %w0, %2\n" - __nops(2), - /* LSE atomics */ -" mov %w1, %w0\n" -" cas %w0, %w0, %2\n" -" eor %w1, %w1, %w0\n") - /* Somebody else wrote to the lock, GOTO 10 and reload the value */ -" cbnz %w1, 2b\n" -"4:" - : "=&r" (lockval), "=&r" (tmp), "+Q" (*lock) - : "r" (owner) - : "memory"); -} #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) @@ -176,7 +124,11 @@ static inline int arch_spin_value_unlocked(arch_spinlock_t lock) static inline int arch_spin_is_locked(arch_spinlock_t *lock) { - smp_mb(); /* See arch_spin_unlock_wait */ + /* + * Ensure prior spin_lock operations to other locks have completed + * on this CPU before we test whether "lock" is locked. + */ + smp_mb(); /* ^^^ */ return !arch_spin_value_unlocked(READ_ONCE(*lock)); } @@ -358,14 +310,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) #define arch_read_relax(lock) cpu_relax() #define arch_write_relax(lock) cpu_relax() -/* - * Accesses appearing in program order before a spin_lock() operation - * can be reordered with accesses inside the critical section, by virtue - * of arch_spin_lock being constructed using acquire semantics. - * - * In cases where this is problematic (e.g. try_to_wake_up), an - * smp_mb__before_spinlock() can restore the required ordering. - */ -#define smp_mb__before_spinlock() smp_mb() +/* See include/linux/spinlock.h */ +#define smp_mb__after_spinlock() smp_mb() #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 5b6eafccc5d8..6ad30776e984 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -16,11 +16,15 @@ #ifndef __ASM_STACKTRACE_H #define __ASM_STACKTRACE_H -struct task_struct; +#include <linux/percpu.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> + +#include <asm/memory.h> +#include <asm/ptrace.h> struct stackframe { unsigned long fp; - unsigned long sp; unsigned long pc; #ifdef CONFIG_FUNCTION_GRAPH_TRACER unsigned int graph; @@ -32,4 +36,57 @@ extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data); extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk); +DECLARE_PER_CPU(unsigned long *, irq_stack_ptr); + +static inline bool on_irq_stack(unsigned long sp) +{ + unsigned long low = (unsigned long)raw_cpu_read(irq_stack_ptr); + unsigned long high = low + IRQ_STACK_SIZE; + + if (!low) + return false; + + return (low <= sp && sp < high); +} + +static inline bool on_task_stack(struct task_struct *tsk, unsigned long sp) +{ + unsigned long low = (unsigned long)task_stack_page(tsk); + unsigned long high = low + THREAD_SIZE; + + return (low <= sp && sp < high); +} + +#ifdef CONFIG_VMAP_STACK +DECLARE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack); + +static inline bool on_overflow_stack(unsigned long sp) +{ + unsigned long low = (unsigned long)raw_cpu_ptr(overflow_stack); + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return (low <= sp && sp < high); +} +#else +static inline bool on_overflow_stack(unsigned long sp) { return false; } +#endif + +/* + * We can only safely access per-cpu stacks from current in a non-preemptible + * context. + */ +static inline bool on_accessible_stack(struct task_struct *tsk, unsigned long sp) +{ + if (on_task_stack(tsk, sp)) + return true; + if (tsk != current || preemptible()) + return false; + if (on_irq_stack(sp)) + return true; + if (on_overflow_stack(sp)) + return true; + + return false; +} + #endif /* __ASM_STACKTRACE_H */ diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index d0aa42907569..dd95d33a5bd5 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -52,6 +52,10 @@ extern void *__memset(void *, int, __kernel_size_t); #define __HAVE_ARCH_MEMCMP extern int memcmp(const void *, const void *, size_t); +#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE +#define __HAVE_ARCH_MEMCPY_FLUSHCACHE +void memcpy_flushcache(void *dst, const void *src, size_t cnt); +#endif #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 248339e4aaf5..f707fed5886f 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -329,6 +329,7 @@ #define ID_AA64ISAR1_LRCPC_SHIFT 20 #define ID_AA64ISAR1_FCMA_SHIFT 16 #define ID_AA64ISAR1_JSCVT_SHIFT 12 +#define ID_AA64ISAR1_DPB_SHIFT 0 /* id_aa64pfr0 */ #define ID_AA64PFR0_GIC_SHIFT 24 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 46c3b93cf865..ddded6497a8a 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -23,19 +23,11 @@ #include <linux/compiler.h> -#ifdef CONFIG_ARM64_4K_PAGES -#define THREAD_SIZE_ORDER 2 -#elif defined(CONFIG_ARM64_16K_PAGES) -#define THREAD_SIZE_ORDER 0 -#endif - -#define THREAD_SIZE 16384 -#define THREAD_START_SP (THREAD_SIZE - 16) - #ifndef __ASSEMBLY__ struct task_struct; +#include <asm/memory.h> #include <asm/stack_pointer.h> #include <asm/types.h> @@ -68,6 +60,9 @@ struct thread_info { #define thread_saved_fp(tsk) \ ((unsigned long)(tsk->thread.cpu_context.fp)) +void arch_setup_new_exec(void); +#define arch_setup_new_exec arch_setup_new_exec + #endif /* @@ -86,6 +81,7 @@ struct thread_info { #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ +#define TIF_FSCHECK 5 /* Check FS is USER_DS on return */ #define TIF_NOHZ 7 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 @@ -107,11 +103,12 @@ struct thread_info { #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_UPROBE (1 << TIF_UPROBE) +#define _TIF_FSCHECK (1 << TIF_FSCHECK) #define _TIF_32BIT (1 << TIF_32BIT) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ - _TIF_UPROBE) + _TIF_UPROBE | _TIF_FSCHECK) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h index 02e9035b0685..d131501c6222 100644 --- a/arch/arm64/include/asm/traps.h +++ b/arch/arm64/include/asm/traps.h @@ -37,18 +37,11 @@ void unregister_undef_hook(struct undef_hook *hook); void arm64_notify_segfault(struct pt_regs *regs, unsigned long addr); -#ifdef CONFIG_FUNCTION_GRAPH_TRACER static inline int __in_irqentry_text(unsigned long ptr) { return ptr >= (unsigned long)&__irqentry_text_start && ptr < (unsigned long)&__irqentry_text_end; } -#else -static inline int __in_irqentry_text(unsigned long ptr) -{ - return 0; -} -#endif static inline int in_exception_text(unsigned long ptr) { @@ -60,4 +53,9 @@ static inline int in_exception_text(unsigned long ptr) return in ? : __in_irqentry_text(ptr); } +static inline int in_entry_text(unsigned long ptr) +{ + return ptr >= (unsigned long)&__entry_text_start && + ptr < (unsigned long)&__entry_text_end; +} #endif diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index fab46a0ea223..fc0f9eb66039 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -45,6 +45,9 @@ static inline void set_fs(mm_segment_t fs) { current_thread_info()->addr_limit = fs; + /* On user-mode return, check fs is correct */ + set_thread_flag(TIF_FSCHECK); + /* * Enable/disable UAO so that copy_to_user() etc can access * kernel memory with the unprivileged instructions. @@ -347,4 +350,16 @@ extern long strncpy_from_user(char *dest, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); +#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE +struct page; +void memcpy_page_flushcache(char *to, struct page *page, size_t offset, size_t len); +extern unsigned long __must_check __copy_user_flushcache(void *to, const void __user *from, unsigned long n); + +static inline int __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) +{ + kasan_check_write(dst, size); + return __copy_user_flushcache(dst, src, size); +} +#endif + #endif /* __ASM_UACCESS_H */ diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 4e187ce2a811..4b9344cba83a 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -35,5 +35,6 @@ #define HWCAP_JSCVT (1 << 13) #define HWCAP_FCMA (1 << 14) #define HWCAP_LRCPC (1 << 15) +#define HWCAP_DCPOP (1 << 16) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index e25c11e727fe..b3162715ed78 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -95,7 +95,7 @@ static int __init dt_scan_depth1_nodes(unsigned long node, * __acpi_map_table() will be called before page_init(), so early_ioremap() * or early_memremap() should be called here to for ACPI table mapping. */ -char *__init __acpi_map_table(unsigned long phys, unsigned long size) +void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size) { if (!size) return NULL; @@ -103,7 +103,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) return early_memremap(phys, size); } -void __init __acpi_unmap_table(char *map, unsigned long size) +void __init __acpi_unmap_table(void __iomem *map, unsigned long size) { if (!map || !size) return; diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index b3bb7ef97bc8..71bf088f1e4b 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -75,6 +75,7 @@ int main(void) DEFINE(S_ORIG_X0, offsetof(struct pt_regs, orig_x0)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); DEFINE(S_ORIG_ADDR_LIMIT, offsetof(struct pt_regs, orig_addr_limit)); + DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); BLANK(); DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9f9e0064c8c1..cd52d365d1f0 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -120,6 +120,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_LRCPC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_FCMA_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_JSCVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_DPB_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -888,6 +889,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .min_field_value = 0, .matches = has_no_fpsimd, }, +#ifdef CONFIG_ARM64_PMEM + { + .desc = "Data cache clean to Point of Persistence", + .capability = ARM64_HAS_DCPOP, + .def_scope = SCOPE_SYSTEM, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64ISAR1_EL1, + .field_pos = ID_AA64ISAR1_DPB_SHIFT, + .min_field_value = 1, + }, +#endif {}, }; @@ -916,6 +928,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_FPHP), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_ASIMD), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_ASIMDHP), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_DCPOP), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_JSCVT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_JSCVT), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_FCMA), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_LRCPC), diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index f495ee5049fd..311885962830 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -68,6 +68,7 @@ static const char *const hwcap_str[] = { "jscvt", "fcma", "lrcpc", + "dcpop", NULL }; diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index c44a82f146b1..6a27cd6dbfa6 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -41,27 +41,3 @@ ENTRY(fpsimd_load_state) fpsimd_restore x0, 8 ret ENDPROC(fpsimd_load_state) - -#ifdef CONFIG_KERNEL_MODE_NEON - -/* - * Save the bottom n FP registers. - * - * x0 - pointer to struct fpsimd_partial_state - */ -ENTRY(fpsimd_save_partial_state) - fpsimd_save_partial x0, 1, 8, 9 - ret -ENDPROC(fpsimd_save_partial_state) - -/* - * Load the bottom n FP registers. - * - * x0 - pointer to struct fpsimd_partial_state - */ -ENTRY(fpsimd_load_partial_state) - fpsimd_restore_partial x0, 8, 9 - ret -ENDPROC(fpsimd_load_partial_state) - -#endif diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index b738880350f9..e1c59d4008a8 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -69,8 +69,55 @@ #define BAD_FIQ 2 #define BAD_ERROR 3 - .macro kernel_entry, el, regsize = 64 + .macro kernel_ventry label + .align 7 sub sp, sp, #S_FRAME_SIZE +#ifdef CONFIG_VMAP_STACK + /* + * Test whether the SP has overflowed, without corrupting a GPR. + * Task and IRQ stacks are aligned to (1 << THREAD_SHIFT). + */ + add sp, sp, x0 // sp' = sp + x0 + sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp + tbnz x0, #THREAD_SHIFT, 0f + sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 + sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp + b \label + +0: + /* + * Either we've just detected an overflow, or we've taken an exception + * while on the overflow stack. Either way, we won't return to + * userspace, and can clobber EL0 registers to free up GPRs. + */ + + /* Stash the original SP (minus S_FRAME_SIZE) in tpidr_el0. */ + msr tpidr_el0, x0 + + /* Recover the original x0 value and stash it in tpidrro_el0 */ + sub x0, sp, x0 + msr tpidrro_el0, x0 + + /* Switch to the overflow stack */ + adr_this_cpu sp, overflow_stack + OVERFLOW_STACK_SIZE, x0 + + /* + * Check whether we were already on the overflow stack. This may happen + * after panic() re-enables interrupts. + */ + mrs x0, tpidr_el0 // sp of interrupted context + sub x0, sp, x0 // delta with top of overflow stack + tst x0, #~(OVERFLOW_STACK_SIZE - 1) // within range? + b.ne __bad_stack // no? -> bad stack pointer + + /* We were already on the overflow stack. Restore sp/x0 and carry on. */ + sub sp, sp, x0 + mrs x0, tpidrro_el0 +#endif + b \label + .endm + + .macro kernel_entry, el, regsize = 64 .if \regsize == 32 mov w0, w0 // zero upper 32 bits of x0 .endif @@ -111,6 +158,18 @@ mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] + /* + * In order to be able to dump the contents of struct pt_regs at the + * time the exception was taken (in case we attempt to walk the call + * stack later), chain it together with the stack frames. + */ + .if \el == 0 + stp xzr, xzr, [sp, #S_STACKFRAME] + .else + stp x29, x22, [sp, #S_STACKFRAME] + .endif + add x29, sp, #S_STACKFRAME + #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* * Set the TTBR0 PAN bit in SPSR. When the exception is taken from @@ -138,12 +197,10 @@ alternative_else_nop_endif stp x22, x23, [sp, #S_PC] - /* - * Set syscallno to -1 by default (overridden later if real syscall). - */ + /* Not in a syscall by default (el0_svc overwrites for real syscall) */ .if \el == 0 - mvn x21, xzr - str x21, [sp, #S_SYSCALLNO] + mov w21, #NO_SYSCALL + str w21, [sp, #S_SYSCALLNO] .endif /* @@ -259,20 +316,12 @@ alternative_else_nop_endif and x25, x25, #~(THREAD_SIZE - 1) cbnz x25, 9998f - adr_this_cpu x25, irq_stack, x26 - mov x26, #IRQ_STACK_START_SP + ldr_this_cpu x25, irq_stack_ptr, x26 + mov x26, #IRQ_STACK_SIZE add x26, x25, x26 /* switch to the irq stack */ mov sp, x26 - - /* - * Add a dummy stack frame, this non-standard format is fixed up - * by unwind_frame() - */ - stp x29, x19, [sp, #-16]! - mov x29, sp - 9998: .endm @@ -290,8 +339,9 @@ alternative_else_nop_endif * * x7 is reserved for the system call number in 32-bit mode. */ -sc_nr .req x25 // number of system calls -scno .req x26 // syscall number +wsc_nr .req w25 // number of system calls +wscno .req w26 // syscall number +xscno .req x26 // syscall number (zero-extended) stbl .req x27 // syscall table pointer tsk .req x28 // current thread_info @@ -315,34 +365,62 @@ tsk .req x28 // current thread_info .align 11 ENTRY(vectors) - ventry el1_sync_invalid // Synchronous EL1t - ventry el1_irq_invalid // IRQ EL1t - ventry el1_fiq_invalid // FIQ EL1t - ventry el1_error_invalid // Error EL1t + kernel_ventry el1_sync_invalid // Synchronous EL1t + kernel_ventry el1_irq_invalid // IRQ EL1t + kernel_ventry el1_fiq_invalid // FIQ EL1t + kernel_ventry el1_error_invalid // Error EL1t - ventry el1_sync // Synchronous EL1h - ventry el1_irq // IRQ EL1h - ventry el1_fiq_invalid // FIQ EL1h - ventry el1_error_invalid // Error EL1h + kernel_ventry el1_sync // Synchronous EL1h + kernel_ventry el1_irq // IRQ EL1h + kernel_ventry el1_fiq_invalid // FIQ EL1h + kernel_ventry el1_error_invalid // Error EL1h - ventry el0_sync // Synchronous 64-bit EL0 - ventry el0_irq // IRQ 64-bit EL0 - ventry el0_fiq_invalid // FIQ 64-bit EL0 - ventry el0_error_invalid // Error 64-bit EL0 + kernel_ventry el0_sync // Synchronous 64-bit EL0 + kernel_ventry el0_irq // IRQ 64-bit EL0 + kernel_ventry el0_fiq_invalid // FIQ 64-bit EL0 + kernel_ventry el0_error_invalid // Error 64-bit EL0 #ifdef CONFIG_COMPAT - ventry el0_sync_compat // Synchronous 32-bit EL0 - ventry el0_irq_compat // IRQ 32-bit EL0 - ventry el0_fiq_invalid_compat // FIQ 32-bit EL0 - ventry el0_error_invalid_compat // Error 32-bit EL0 + kernel_ventry el0_sync_compat // Synchronous 32-bit EL0 + kernel_ventry el0_irq_compat // IRQ 32-bit EL0 + kernel_ventry el0_fiq_invalid_compat // FIQ 32-bit EL0 + kernel_ventry el0_error_invalid_compat // Error 32-bit EL0 #else - ventry el0_sync_invalid // Synchronous 32-bit EL0 - ventry el0_irq_invalid // IRQ 32-bit EL0 - ventry el0_fiq_invalid // FIQ 32-bit EL0 - ventry el0_error_invalid // Error 32-bit EL0 + kernel_ventry el0_sync_invalid // Synchronous 32-bit EL0 + kernel_ventry el0_irq_invalid // IRQ 32-bit EL0 + kernel_ventry el0_fiq_invalid // FIQ 32-bit EL0 + kernel_ventry el0_error_invalid // Error 32-bit EL0 #endif END(vectors) +#ifdef CONFIG_VMAP_STACK + /* + * We detected an overflow in kernel_ventry, which switched to the + * overflow stack. Stash the exception regs, and head to our overflow + * handler. + */ +__bad_stack: + /* Restore the original x0 value */ + mrs x0, tpidrro_el0 + + /* + * Store the original GPRs to the new stack. The orginal SP (minus + * S_FRAME_SIZE) was stashed in tpidr_el0 by kernel_ventry. + */ + sub sp, sp, #S_FRAME_SIZE + kernel_entry 1 + mrs x0, tpidr_el0 + add x0, x0, #S_FRAME_SIZE + str x0, [sp, #S_SP] + + /* Stash the regs for handle_bad_stack */ + mov x0, sp + + /* Time to die */ + bl handle_bad_stack + ASM_BUG() +#endif /* CONFIG_VMAP_STACK */ + /* * Invalid mode handlers */ @@ -351,7 +429,8 @@ END(vectors) mov x0, sp mov x1, #\reason mrs x2, esr_el1 - b bad_mode + bl bad_mode + ASM_BUG() .endm el0_sync_invalid: @@ -448,14 +527,16 @@ el1_sp_pc: mrs x0, far_el1 enable_dbg mov x2, sp - b do_sp_pc_abort + bl do_sp_pc_abort + ASM_BUG() el1_undef: /* * Undefined instruction */ enable_dbg mov x0, sp - b do_undefinstr + bl do_undefinstr + ASM_BUG() el1_dbg: /* * Debug exception handling @@ -473,7 +554,8 @@ el1_inv: mov x0, sp mov x2, x1 mov x1, #BAD_SYNC - b bad_mode + bl bad_mode + ASM_BUG() ENDPROC(el1_sync) .align 6 @@ -577,8 +659,8 @@ el0_svc_compat: * AArch32 syscall handling */ adrp stbl, compat_sys_call_table // load compat syscall table pointer - uxtw scno, w7 // syscall number in w7 (r7) - mov sc_nr, #__NR_compat_syscalls + mov wscno, w7 // syscall number in w7 (r7) + mov wsc_nr, #__NR_compat_syscalls b el0_svc_naked .align 6 @@ -707,38 +789,6 @@ el0_irq_naked: ENDPROC(el0_irq) /* - * Register switch for AArch64. The callee-saved registers need to be saved - * and restored. On entry: - * x0 = previous task_struct (must be preserved across the switch) - * x1 = next task_struct - * Previous and next are guaranteed not to be the same. - * - */ -ENTRY(cpu_switch_to) - mov x10, #THREAD_CPU_CONTEXT - add x8, x0, x10 - mov x9, sp - stp x19, x20, [x8], #16 // store callee-saved registers - stp x21, x22, [x8], #16 - stp x23, x24, [x8], #16 - stp x25, x26, [x8], #16 - stp x27, x28, [x8], #16 - stp x29, x9, [x8], #16 - str lr, [x8] - add x8, x1, x10 - ldp x19, x20, [x8], #16 // restore callee-saved registers - ldp x21, x22, [x8], #16 - ldp x23, x24, [x8], #16 - ldp x25, x26, [x8], #16 - ldp x27, x28, [x8], #16 - ldp x29, x9, [x8], #16 - ldr lr, [x8] - mov sp, x9 - msr sp_el0, x1 - ret -ENDPROC(cpu_switch_to) - -/* * This is the fast syscall return path. We do as little as possible here, * and this includes saving x0 back into the kernel stack. */ @@ -781,36 +831,24 @@ finish_ret_to_user: ENDPROC(ret_to_user) /* - * This is how we return from a fork. - */ -ENTRY(ret_from_fork) - bl schedule_tail - cbz x19, 1f // not a kernel thread - mov x0, x20 - blr x19 -1: get_thread_info tsk - b ret_to_user -ENDPROC(ret_from_fork) - -/* * SVC handler. */ .align 6 el0_svc: adrp stbl, sys_call_table // load syscall table pointer - uxtw scno, w8 // syscall number in w8 - mov sc_nr, #__NR_syscalls + mov wscno, w8 // syscall number in w8 + mov wsc_nr, #__NR_syscalls el0_svc_naked: // compat entry point - stp x0, scno, [sp, #S_ORIG_X0] // save the original x0 and syscall number + stp x0, xscno, [sp, #S_ORIG_X0] // save the original x0 and syscall number enable_dbg_and_irq ct_user_exit 1 ldr x16, [tsk, #TSK_TI_FLAGS] // check for syscall hooks tst x16, #_TIF_SYSCALL_WORK b.ne __sys_trace - cmp scno, sc_nr // check upper syscall limit + cmp wscno, wsc_nr // check upper syscall limit b.hs ni_sys - ldr x16, [stbl, scno, lsl #3] // address in the syscall table + ldr x16, [stbl, xscno, lsl #3] // address in the syscall table blr x16 // call sys_* routine b ret_fast_syscall ni_sys: @@ -824,24 +862,23 @@ ENDPROC(el0_svc) * switches, and waiting for our parent to respond. */ __sys_trace: - mov w0, #-1 // set default errno for - cmp scno, x0 // user-issued syscall(-1) + cmp wscno, #NO_SYSCALL // user-issued syscall(-1)? b.ne 1f - mov x0, #-ENOSYS + mov x0, #-ENOSYS // set default errno if so str x0, [sp, #S_X0] 1: mov x0, sp bl syscall_trace_enter - cmp w0, #-1 // skip the syscall? + cmp w0, #NO_SYSCALL // skip the syscall? b.eq __sys_trace_return_skipped - uxtw scno, w0 // syscall number (possibly new) + mov wscno, w0 // syscall number (possibly new) mov x1, sp // pointer to regs - cmp scno, sc_nr // check upper syscall limit + cmp wscno, wsc_nr // check upper syscall limit b.hs __ni_sys_trace ldp x0, x1, [sp] // restore the syscall args ldp x2, x3, [sp, #S_X2] ldp x4, x5, [sp, #S_X4] ldp x6, x7, [sp, #S_X6] - ldr x16, [stbl, scno, lsl #3] // address in the syscall table + ldr x16, [stbl, xscno, lsl #3] // address in the syscall table blr x16 // call sys_* routine __sys_trace_return: @@ -865,3 +902,49 @@ ENTRY(sys_rt_sigreturn_wrapper) mov x0, sp b sys_rt_sigreturn ENDPROC(sys_rt_sigreturn_wrapper) + +/* + * Register switch for AArch64. The callee-saved registers need to be saved + * and restored. On entry: + * x0 = previous task_struct (must be preserved across the switch) + * x1 = next task_struct + * Previous and next are guaranteed not to be the same. + * + */ +ENTRY(cpu_switch_to) + mov x10, #THREAD_CPU_CONTEXT + add x8, x0, x10 + mov x9, sp + stp x19, x20, [x8], #16 // store callee-saved registers + stp x21, x22, [x8], #16 + stp x23, x24, [x8], #16 + stp x25, x26, [x8], #16 + stp x27, x28, [x8], #16 + stp x29, x9, [x8], #16 + str lr, [x8] + add x8, x1, x10 + ldp x19, x20, [x8], #16 // restore callee-saved registers + ldp x21, x22, [x8], #16 + ldp x23, x24, [x8], #16 + ldp x25, x26, [x8], #16 + ldp x27, x28, [x8], #16 + ldp x29, x9, [x8], #16 + ldr lr, [x8] + mov sp, x9 + msr sp_el0, x1 + ret +ENDPROC(cpu_switch_to) +NOKPROBE(cpu_switch_to) + +/* + * This is how we return from a fork. + */ +ENTRY(ret_from_fork) + bl schedule_tail + cbz x19, 1f // not a kernel thread + mov x0, x20 + blr x19 +1: get_thread_info tsk + b ret_to_user +ENDPROC(ret_from_fork) +NOKPROBE(ret_from_fork) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index c7b4995868e1..3a68cf38a6b3 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -17,16 +17,19 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/bottom_half.h> #include <linux/cpu.h> #include <linux/cpu_pm.h> #include <linux/kernel.h> #include <linux/init.h> +#include <linux/percpu.h> +#include <linux/preempt.h> #include <linux/sched/signal.h> #include <linux/signal.h> -#include <linux/hardirq.h> #include <asm/fpsimd.h> #include <asm/cputype.h> +#include <asm/simd.h> #define FPEXC_IOF (1 << 0) #define FPEXC_DZF (1 << 1) @@ -62,6 +65,13 @@ * CPU currently contain the most recent userland FPSIMD state of the current * task. * + * In order to allow softirq handlers to use FPSIMD, kernel_neon_begin() may + * save the task's FPSIMD context back to task_struct from softirq context. + * To prevent this from racing with the manipulation of the task's FPSIMD state + * from task context and thereby corrupting the state, it is necessary to + * protect any manipulation of a task's fpsimd_state or TIF_FOREIGN_FPSTATE + * flag with local_bh_disable() unless softirqs are already masked. + * * For a certain task, the sequence may look something like this: * - the task gets scheduled in; if both the task's fpsimd_state.cpu field * contains the id of the current CPU, and the CPU's fpsimd_last_state per-cpu @@ -161,11 +171,14 @@ void fpsimd_flush_thread(void) { if (!system_supports_fpsimd()) return; - preempt_disable(); + + local_bh_disable(); + memset(¤t->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); fpsimd_flush_task_state(current); set_thread_flag(TIF_FOREIGN_FPSTATE); - preempt_enable(); + + local_bh_enable(); } /* @@ -176,10 +189,13 @@ void fpsimd_preserve_current_state(void) { if (!system_supports_fpsimd()) return; - preempt_disable(); + + local_bh_disable(); + if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) fpsimd_save_state(¤t->thread.fpsimd_state); - preempt_enable(); + + local_bh_enable(); } /* @@ -191,15 +207,18 @@ void fpsimd_restore_current_state(void) { if (!system_supports_fpsimd()) return; - preempt_disable(); + + local_bh_disable(); + if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { struct fpsimd_state *st = ¤t->thread.fpsimd_state; fpsimd_load_state(st); - this_cpu_write(fpsimd_last_state, st); + __this_cpu_write(fpsimd_last_state, st); st->cpu = smp_processor_id(); } - preempt_enable(); + + local_bh_enable(); } /* @@ -211,15 +230,18 @@ void fpsimd_update_current_state(struct fpsimd_state *state) { if (!system_supports_fpsimd()) return; - preempt_disable(); + + local_bh_disable(); + fpsimd_load_state(state); if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { struct fpsimd_state *st = ¤t->thread.fpsimd_state; - this_cpu_write(fpsimd_last_state, st); + __this_cpu_write(fpsimd_last_state, st); st->cpu = smp_processor_id(); } - preempt_enable(); + + local_bh_enable(); } /* @@ -232,52 +254,122 @@ void fpsimd_flush_task_state(struct task_struct *t) #ifdef CONFIG_KERNEL_MODE_NEON -static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate); -static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate); +DEFINE_PER_CPU(bool, kernel_neon_busy); +EXPORT_PER_CPU_SYMBOL(kernel_neon_busy); /* * Kernel-side NEON support functions */ -void kernel_neon_begin_partial(u32 num_regs) + +/* + * kernel_neon_begin(): obtain the CPU FPSIMD registers for use by the calling + * context + * + * Must not be called unless may_use_simd() returns true. + * Task context in the FPSIMD registers is saved back to memory as necessary. + * + * A matching call to kernel_neon_end() must be made before returning from the + * calling context. + * + * The caller may freely use the FPSIMD registers until kernel_neon_end() is + * called. + */ +void kernel_neon_begin(void) { if (WARN_ON(!system_supports_fpsimd())) return; - if (in_interrupt()) { - struct fpsimd_partial_state *s = this_cpu_ptr( - in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); - BUG_ON(num_regs > 32); - fpsimd_save_partial_state(s, roundup(num_regs, 2)); - } else { - /* - * Save the userland FPSIMD state if we have one and if we - * haven't done so already. Clear fpsimd_last_state to indicate - * that there is no longer userland FPSIMD state in the - * registers. - */ - preempt_disable(); - if (current->mm && - !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) - fpsimd_save_state(¤t->thread.fpsimd_state); - this_cpu_write(fpsimd_last_state, NULL); - } + BUG_ON(!may_use_simd()); + + local_bh_disable(); + + __this_cpu_write(kernel_neon_busy, true); + + /* Save unsaved task fpsimd state, if any: */ + if (current->mm && !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) + fpsimd_save_state(¤t->thread.fpsimd_state); + + /* Invalidate any task state remaining in the fpsimd regs: */ + __this_cpu_write(fpsimd_last_state, NULL); + + preempt_disable(); + + local_bh_enable(); } -EXPORT_SYMBOL(kernel_neon_begin_partial); +EXPORT_SYMBOL(kernel_neon_begin); +/* + * kernel_neon_end(): give the CPU FPSIMD registers back to the current task + * + * Must be called from a context in which kernel_neon_begin() was previously + * called, with no call to kernel_neon_end() in the meantime. + * + * The caller must not use the FPSIMD registers after this function is called, + * unless kernel_neon_begin() is called again in the meantime. + */ void kernel_neon_end(void) { + bool busy; + if (!system_supports_fpsimd()) return; - if (in_interrupt()) { - struct fpsimd_partial_state *s = this_cpu_ptr( - in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); - fpsimd_load_partial_state(s); - } else { - preempt_enable(); - } + + busy = __this_cpu_xchg(kernel_neon_busy, false); + WARN_ON(!busy); /* No matching kernel_neon_begin()? */ + + preempt_enable(); } EXPORT_SYMBOL(kernel_neon_end); +static DEFINE_PER_CPU(struct fpsimd_state, efi_fpsimd_state); +static DEFINE_PER_CPU(bool, efi_fpsimd_state_used); + +/* + * EFI runtime services support functions + * + * The ABI for EFI runtime services allows EFI to use FPSIMD during the call. + * This means that for EFI (and only for EFI), we have to assume that FPSIMD + * is always used rather than being an optional accelerator. + * + * These functions provide the necessary support for ensuring FPSIMD + * save/restore in the contexts from which EFI is used. + * + * Do not use them for any other purpose -- if tempted to do so, you are + * either doing something wrong or you need to propose some refactoring. + */ + +/* + * __efi_fpsimd_begin(): prepare FPSIMD for making an EFI runtime services call + */ +void __efi_fpsimd_begin(void) +{ + if (!system_supports_fpsimd()) + return; + + WARN_ON(preemptible()); + + if (may_use_simd()) + kernel_neon_begin(); + else { + fpsimd_save_state(this_cpu_ptr(&efi_fpsimd_state)); + __this_cpu_write(efi_fpsimd_state_used, true); + } +} + +/* + * __efi_fpsimd_end(): clean up FPSIMD after an EFI runtime services call + */ +void __efi_fpsimd_end(void) +{ + if (!system_supports_fpsimd()) + return; + + if (__this_cpu_xchg(efi_fpsimd_state_used, false)) + fpsimd_load_state(this_cpu_ptr(&efi_fpsimd_state)); + else + kernel_neon_end(); +} + #endif /* CONFIG_KERNEL_MODE_NEON */ #ifdef CONFIG_CPU_PM diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index adb0910b88f5..7434ec0c7a27 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -143,8 +143,8 @@ preserve_boot_args: dmb sy // needed before dc ivac with // MMU off - add x1, x0, #0x20 // 4 x 8 bytes - b __inval_cache_range // tail call + mov x1, #0x20 // 4 x 8 bytes + b __inval_dcache_area // tail call ENDPROC(preserve_boot_args) /* @@ -221,20 +221,20 @@ __create_page_tables: * dirty cache lines being evicted. */ adrp x0, idmap_pg_dir - adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE - bl __inval_cache_range + ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) + bl __inval_dcache_area /* * Clear the idmap and swapper page tables. */ adrp x0, idmap_pg_dir - adrp x6, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE + ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) 1: stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 - cmp x0, x6 - b.lo 1b + subs x1, x1, #64 + b.ne 1b mov x7, SWAPPER_MM_MMUFLAGS @@ -307,9 +307,9 @@ __create_page_tables: * tables again to remove any speculatively loaded cache lines. */ adrp x0, idmap_pg_dir - adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE + ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) dmb sy - bl __inval_cache_range + bl __inval_dcache_area ret x28 ENDPROC(__create_page_tables) @@ -361,6 +361,9 @@ __primary_switched: ret // to __primary_switch() 0: #endif + add sp, sp, #16 + mov x29, #0 + mov x30, #0 b start_kernel ENDPROC(__primary_switched) @@ -616,6 +619,7 @@ __secondary_switched: ldr x2, [x0, #CPU_BOOT_TASK] msr sp_el0, x2 mov x29, #0 + mov x30, #0 b secondary_start_kernel ENDPROC(__secondary_switched) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index a44e13942d30..095d3c170f5d 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -330,7 +330,7 @@ static void _copy_pte(pte_t *dst_pte, pte_t *src_pte, unsigned long addr) * read only (code, rodata). Clear the RDONLY bit from * the temporary mappings we use during restore. */ - set_pte(dst_pte, pte_clear_rdonly(pte)); + set_pte(dst_pte, pte_mkwrite(pte)); } else if (debug_pagealloc_enabled() && !pte_none(pte)) { /* * debug_pagealloc will removed the PTE_VALID bit if @@ -343,7 +343,7 @@ static void _copy_pte(pte_t *dst_pte, pte_t *src_pte, unsigned long addr) */ BUG_ON(!pfn_valid(pte_pfn(pte))); - set_pte(dst_pte, pte_mkpresent(pte_clear_rdonly(pte))); + set_pte(dst_pte, pte_mkpresent(pte_mkwrite(pte))); } } diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 2386b26c0712..713561e5bcab 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -23,15 +23,16 @@ #include <linux/kernel_stat.h> #include <linux/irq.h> +#include <linux/memory.h> #include <linux/smp.h> #include <linux/init.h> #include <linux/irqchip.h> #include <linux/seq_file.h> +#include <linux/vmalloc.h> unsigned long irq_err_count; -/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. */ -DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); +DEFINE_PER_CPU(unsigned long *, irq_stack_ptr); int arch_show_interrupts(struct seq_file *p, int prec) { @@ -50,8 +51,43 @@ void __init set_handle_irq(void (*handle_irq)(struct pt_regs *)) handle_arch_irq = handle_irq; } +#ifdef CONFIG_VMAP_STACK +static void init_irq_stacks(void) +{ + int cpu; + unsigned long *p; + + for_each_possible_cpu(cpu) { + /* + * To ensure that VMAP'd stack overflow detection works + * correctly, the IRQ stacks need to have the same + * alignment as other stacks. + */ + p = __vmalloc_node_range(IRQ_STACK_SIZE, THREAD_ALIGN, + VMALLOC_START, VMALLOC_END, + THREADINFO_GFP, PAGE_KERNEL, + 0, cpu_to_node(cpu), + __builtin_return_address(0)); + + per_cpu(irq_stack_ptr, cpu) = p; + } +} +#else +/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. */ +DEFINE_PER_CPU_ALIGNED(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); + +static void init_irq_stacks(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack, cpu); +} +#endif + void __init init_IRQ(void) { + init_irq_stacks(); irqchip_init(); if (!handle_arch_irq) panic("No interrupt controller found."); diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 481f54a866c5..11121f608eb5 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -252,7 +252,7 @@ void machine_crash_shutdown(struct pt_regs *regs) local_irq_disable(); /* shutdown non-crashing cpus */ - smp_send_crash_stop(); + crash_smp_send_stop(); /* for crashing cpu */ crash_save_cpu(regs, smp_processor_id()); diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 713ca824f266..bcafd7dcfe8b 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -162,7 +162,6 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, } frame.fp = regs->regs[29]; - frame.sp = regs->sp; frame.pc = regs->pc; #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = current->curr_ret_stack; diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index b5798ba21189..9eaef51f83ff 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -202,55 +202,6 @@ static const unsigned armv8_pmuv3_perf_map[PERF_COUNT_HW_MAX] = { [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = ARMV8_PMUV3_PERFCTR_STALL_BACKEND, }; -/* ARM Cortex-A53 HW events mapping. */ -static const unsigned armv8_a53_perf_map[PERF_COUNT_HW_MAX] = { - PERF_MAP_ALL_UNSUPPORTED, - [PERF_COUNT_HW_CPU_CYCLES] = ARMV8_PMUV3_PERFCTR_CPU_CYCLES, - [PERF_COUNT_HW_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_INST_RETIRED, - [PERF_COUNT_HW_CACHE_REFERENCES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [PERF_COUNT_HW_CACHE_MISSES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED, - [PERF_COUNT_HW_BRANCH_MISSES] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [PERF_COUNT_HW_BUS_CYCLES] = ARMV8_PMUV3_PERFCTR_BUS_CYCLES, -}; - -/* ARM Cortex-A57 and Cortex-A72 events mapping. */ -static const unsigned armv8_a57_perf_map[PERF_COUNT_HW_MAX] = { - PERF_MAP_ALL_UNSUPPORTED, - [PERF_COUNT_HW_CPU_CYCLES] = ARMV8_PMUV3_PERFCTR_CPU_CYCLES, - [PERF_COUNT_HW_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_INST_RETIRED, - [PERF_COUNT_HW_CACHE_REFERENCES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [PERF_COUNT_HW_CACHE_MISSES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - [PERF_COUNT_HW_BRANCH_MISSES] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [PERF_COUNT_HW_BUS_CYCLES] = ARMV8_PMUV3_PERFCTR_BUS_CYCLES, -}; - -static const unsigned armv8_thunder_perf_map[PERF_COUNT_HW_MAX] = { - PERF_MAP_ALL_UNSUPPORTED, - [PERF_COUNT_HW_CPU_CYCLES] = ARMV8_PMUV3_PERFCTR_CPU_CYCLES, - [PERF_COUNT_HW_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_INST_RETIRED, - [PERF_COUNT_HW_CACHE_REFERENCES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [PERF_COUNT_HW_CACHE_MISSES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED, - [PERF_COUNT_HW_BRANCH_MISSES] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = ARMV8_PMUV3_PERFCTR_STALL_FRONTEND, - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = ARMV8_PMUV3_PERFCTR_STALL_BACKEND, -}; - -/* Broadcom Vulcan events mapping */ -static const unsigned armv8_vulcan_perf_map[PERF_COUNT_HW_MAX] = { - PERF_MAP_ALL_UNSUPPORTED, - [PERF_COUNT_HW_CPU_CYCLES] = ARMV8_PMUV3_PERFCTR_CPU_CYCLES, - [PERF_COUNT_HW_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_INST_RETIRED, - [PERF_COUNT_HW_CACHE_REFERENCES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [PERF_COUNT_HW_CACHE_MISSES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_BR_RETIRED, - [PERF_COUNT_HW_BRANCH_MISSES] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [PERF_COUNT_HW_BUS_CYCLES] = ARMV8_PMUV3_PERFCTR_BUS_CYCLES, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = ARMV8_PMUV3_PERFCTR_STALL_FRONTEND, - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = ARMV8_PMUV3_PERFCTR_STALL_BACKEND, -}; - static const unsigned armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = { @@ -281,27 +232,10 @@ static const unsigned armv8_a53_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = { PERF_CACHE_MAP_ALL_UNSUPPORTED, - [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [C(L1D)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, [C(L1D)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_A53_PERFCTR_PREF_LINEFILL, - [C(L1I)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE, - [C(L1I)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL, - - [C(LL)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L2D_CACHE, - [C(LL)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L2D_CACHE_REFILL, - [C(LL)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L2D_CACHE, - [C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L2D_CACHE_REFILL, - - [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL, - [C(ITLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL, - - [C(BPU)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, + [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, + [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, }; static const unsigned armv8_a57_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] @@ -314,18 +248,26 @@ static const unsigned armv8_a57_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR, - [C(L1I)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE, - [C(L1I)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL, - [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD, [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR, - [C(ITLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL, + [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, + [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, +}; - [C(BPU)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, +static const unsigned armv8_a73_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + PERF_CACHE_MAP_ALL_UNSUPPORTED, + + [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD, + [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, + + [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, + [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, + + [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, + [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, }; static const unsigned armv8_thunder_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] @@ -340,8 +282,6 @@ static const unsigned armv8_thunder_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(L1D)][C(OP_PREFETCH)][C(RESULT_ACCESS)] = ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_ACCESS, [C(L1D)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_MISS, - [C(L1I)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE, - [C(L1I)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL, [C(L1I)][C(OP_PREFETCH)][C(RESULT_ACCESS)] = ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_ACCESS, [C(L1I)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_MISS, @@ -349,13 +289,6 @@ static const unsigned armv8_thunder_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD, [C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR, [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR, - - [C(ITLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL, - - [C(BPU)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, }; static const unsigned armv8_vulcan_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] @@ -368,22 +301,11 @@ static const unsigned armv8_vulcan_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR, - [C(L1I)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE, - [C(L1I)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL, - - [C(ITLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL, - [C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB, - [C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD, [C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR, [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD, [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR, - [C(BPU)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, }; @@ -846,17 +768,14 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, struct hw_perf_event *hwc = &event->hw; unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT; - /* Always place a cycle counter into the cycle counter. */ + /* Always prefer to place a cycle counter into the cycle counter. */ if (evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) { - if (test_and_set_bit(ARMV8_IDX_CYCLE_COUNTER, cpuc->used_mask)) - return -EAGAIN; - - return ARMV8_IDX_CYCLE_COUNTER; + if (!test_and_set_bit(ARMV8_IDX_CYCLE_COUNTER, cpuc->used_mask)) + return ARMV8_IDX_CYCLE_COUNTER; } /* - * For anything other than a cycle counter, try and use - * the events counters + * Otherwise use events counters */ for (idx = ARMV8_IDX_COUNTER0; idx < cpu_pmu->num_events; ++idx) { if (!test_and_set_bit(idx, cpuc->used_mask)) @@ -924,7 +843,13 @@ static void armv8pmu_reset(void *info) ARMV8_PMU_PMCR_LC); } -static int armv8_pmuv3_map_event(struct perf_event *event) +static int __armv8_pmuv3_map_event(struct perf_event *event, + const unsigned (*extra_event_map) + [PERF_COUNT_HW_MAX], + const unsigned (*extra_cache_map) + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]) { int hw_event_id; struct arm_pmu *armpmu = to_arm_pmu(event->pmu); @@ -932,44 +857,47 @@ static int armv8_pmuv3_map_event(struct perf_event *event) hw_event_id = armpmu_map_event(event, &armv8_pmuv3_perf_map, &armv8_pmuv3_perf_cache_map, ARMV8_PMU_EVTYPE_EVENT); - if (hw_event_id < 0) - return hw_event_id; - /* disable micro/arch events not supported by this PMU */ - if ((hw_event_id < ARMV8_PMUV3_MAX_COMMON_EVENTS) && - !test_bit(hw_event_id, armpmu->pmceid_bitmap)) { - return -EOPNOTSUPP; + /* Onl expose micro/arch events supported by this PMU */ + if ((hw_event_id > 0) && (hw_event_id < ARMV8_PMUV3_MAX_COMMON_EVENTS) + && test_bit(hw_event_id, armpmu->pmceid_bitmap)) { + return hw_event_id; } - return hw_event_id; + return armpmu_map_event(event, extra_event_map, extra_cache_map, + ARMV8_PMU_EVTYPE_EVENT); +} + +static int armv8_pmuv3_map_event(struct perf_event *event) +{ + return __armv8_pmuv3_map_event(event, NULL, NULL); } static int armv8_a53_map_event(struct perf_event *event) { - return armpmu_map_event(event, &armv8_a53_perf_map, - &armv8_a53_perf_cache_map, - ARMV8_PMU_EVTYPE_EVENT); + return __armv8_pmuv3_map_event(event, NULL, &armv8_a53_perf_cache_map); } static int armv8_a57_map_event(struct perf_event *event) { - return armpmu_map_event(event, &armv8_a57_perf_map, - &armv8_a57_perf_cache_map, - ARMV8_PMU_EVTYPE_EVENT); + return __armv8_pmuv3_map_event(event, NULL, &armv8_a57_perf_cache_map); +} + +static int armv8_a73_map_event(struct perf_event *event) +{ + return __armv8_pmuv3_map_event(event, NULL, &armv8_a73_perf_cache_map); } static int armv8_thunder_map_event(struct perf_event *event) { - return armpmu_map_event(event, &armv8_thunder_perf_map, - &armv8_thunder_perf_cache_map, - ARMV8_PMU_EVTYPE_EVENT); + return __armv8_pmuv3_map_event(event, NULL, + &armv8_thunder_perf_cache_map); } static int armv8_vulcan_map_event(struct perf_event *event) { - return armpmu_map_event(event, &armv8_vulcan_perf_map, - &armv8_vulcan_perf_cache_map, - ARMV8_PMU_EVTYPE_EVENT); + return __armv8_pmuv3_map_event(event, NULL, + &armv8_vulcan_perf_cache_map); } struct armv8pmu_probe_info { @@ -1062,6 +990,22 @@ static int armv8_pmuv3_init(struct arm_pmu *cpu_pmu) return 0; } +static int armv8_a35_pmu_init(struct arm_pmu *cpu_pmu) +{ + int ret = armv8_pmu_init(cpu_pmu); + if (ret) + return ret; + + cpu_pmu->name = "armv8_cortex_a35"; + cpu_pmu->map_event = armv8_a53_map_event; + cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = + &armv8_pmuv3_events_attr_group; + cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = + &armv8_pmuv3_format_attr_group; + + return 0; +} + static int armv8_a53_pmu_init(struct arm_pmu *cpu_pmu) { int ret = armv8_pmu_init(cpu_pmu); @@ -1110,6 +1054,22 @@ static int armv8_a72_pmu_init(struct arm_pmu *cpu_pmu) return 0; } +static int armv8_a73_pmu_init(struct arm_pmu *cpu_pmu) +{ + int ret = armv8_pmu_init(cpu_pmu); + if (ret) + return ret; + + cpu_pmu->name = "armv8_cortex_a73"; + cpu_pmu->map_event = armv8_a73_map_event; + cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = + &armv8_pmuv3_events_attr_group; + cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = + &armv8_pmuv3_format_attr_group; + + return 0; +} + static int armv8_thunder_pmu_init(struct arm_pmu *cpu_pmu) { int ret = armv8_pmu_init(cpu_pmu); @@ -1144,9 +1104,11 @@ static int armv8_vulcan_pmu_init(struct arm_pmu *cpu_pmu) static const struct of_device_id armv8_pmu_of_device_ids[] = { {.compatible = "arm,armv8-pmuv3", .data = armv8_pmuv3_init}, + {.compatible = "arm,cortex-a35-pmu", .data = armv8_a35_pmu_init}, {.compatible = "arm,cortex-a53-pmu", .data = armv8_a53_pmu_init}, {.compatible = "arm,cortex-a57-pmu", .data = armv8_a57_pmu_init}, {.compatible = "arm,cortex-a72-pmu", .data = armv8_a72_pmu_init}, + {.compatible = "arm,cortex-a73-pmu", .data = armv8_a73_pmu_init}, {.compatible = "cavium,thunder-pmu", .data = armv8_thunder_pmu_init}, {.compatible = "brcm,vulcan-pmu", .data = armv8_vulcan_pmu_init}, {}, diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index 26c998534dca..636ca0119c0e 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -40,7 +40,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, probe_opcode_t insn; /* TODO: Currently we do not support AARCH32 instruction probing */ - if (test_bit(TIF_32BIT, &mm->context.flags)) + if (mm->context.flags & MMCF_AARCH32) return -ENOTSUPP; else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE)) return -EINVAL; diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 659ae8094ed5..2dc0f8482210 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -360,6 +360,8 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, /* * Complete any pending TLB or cache maintenance on this CPU in case * the thread migrates to a different CPU. + * This full barrier is also required by the membarrier system + * call. */ dsb(ish); @@ -382,15 +384,12 @@ unsigned long get_wchan(struct task_struct *p) return 0; frame.fp = thread_saved_fp(p); - frame.sp = thread_saved_sp(p); frame.pc = thread_saved_pc(p); #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = p->curr_ret_stack; #endif do { - if (frame.sp < stack_page || - frame.sp >= stack_page + THREAD_SIZE || - unwind_frame(p, &frame)) + if (unwind_frame(p, &frame)) goto out; if (!in_sched_functions(frame.pc)) { ret = frame.pc; @@ -417,3 +416,11 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) else return randomize_page(mm->brk, SZ_1G); } + +/* + * Called from setup_new_exec() after (COMPAT_)SET_PERSONALITY. + */ +void arch_setup_new_exec(void) +{ + current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0; +} diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 1b38c0150aec..9cbb6123208f 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -42,6 +42,7 @@ #include <asm/compat.h> #include <asm/debug-monitors.h> #include <asm/pgtable.h> +#include <asm/stacktrace.h> #include <asm/syscall.h> #include <asm/traps.h> #include <asm/system_misc.h> @@ -127,7 +128,7 @@ static bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) { return ((addr & ~(THREAD_SIZE - 1)) == (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))) || - on_irq_stack(addr, raw_smp_processor_id()); + on_irq_stack(addr); } /** @@ -1363,7 +1364,7 @@ static void tracehook_report_syscall(struct pt_regs *regs, if (dir == PTRACE_SYSCALL_EXIT) tracehook_report_syscall_exit(regs, 0); else if (tracehook_report_syscall_entry(regs)) - regs->syscallno = ~0UL; + forget_syscall(regs); regs->regs[regno] = saved_reg; } diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 12a87f2600f2..933adbc0f654 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -42,7 +42,6 @@ void *return_address(unsigned int level) data.addr = NULL; frame.fp = (unsigned long)__builtin_frame_address(0); - frame.sp = current_stack_pointer; frame.pc = (unsigned long)return_address; /* dummy */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = current->curr_ret_stack; diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 089c3747995d..c45214f8fb54 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -29,6 +29,7 @@ #include <linux/string.h> #include <linux/tracehook.h> #include <linux/ratelimit.h> +#include <linux/syscalls.h> #include <asm/debug-monitors.h> #include <asm/elf.h> @@ -36,6 +37,7 @@ #include <asm/ucontext.h> #include <asm/unistd.h> #include <asm/fpsimd.h> +#include <asm/ptrace.h> #include <asm/signal32.h> #include <asm/vdso.h> @@ -387,7 +389,7 @@ static int restore_sigframe(struct pt_regs *regs, /* * Avoid sys_rt_sigreturn() restarting. */ - regs->syscallno = ~0UL; + forget_syscall(regs); err |= !valid_user_regs(®s->user_regs, current); if (err == 0) @@ -673,13 +675,12 @@ static void do_signal(struct pt_regs *regs) { unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; - int syscall = (int)regs->syscallno; struct ksignal ksig; /* * If we were from a system call, check for system call restarting... */ - if (syscall >= 0) { + if (in_syscall(regs)) { continue_addr = regs->pc; restart_addr = continue_addr - (compat_thumb_mode(regs) ? 2 : 4); retval = regs->regs[0]; @@ -687,7 +688,7 @@ static void do_signal(struct pt_regs *regs) /* * Avoid additional syscall restarting via ret_to_user. */ - regs->syscallno = ~0UL; + forget_syscall(regs); /* * Prepare for system call restart. We do this here so that a @@ -731,7 +732,7 @@ static void do_signal(struct pt_regs *regs) * Handle restarting a different system call. As above, if a debugger * has chosen to restart at a different PC, ignore the restart. */ - if (syscall >= 0 && regs->pc == restart_addr) { + if (in_syscall(regs) && regs->pc == restart_addr) { if (retval == -ERESTART_RESTARTBLOCK) setup_restart_syscall(regs); user_rewind_single_step(current); @@ -749,6 +750,10 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, * Update the trace code with the current status. */ trace_hardirqs_off(); + + /* Check valid user FS if needed */ + addr_limit_user_check(); + do { if (thread_flags & _TIF_NEED_RESCHED) { schedule(); diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index c747a0fc5d7d..4e5a664be04b 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -354,7 +354,7 @@ static int compat_restore_sigframe(struct pt_regs *regs, /* * Avoid compat_sys_sigreturn() restarting. */ - regs->syscallno = ~0UL; + forget_syscall(regs); err |= !valid_user_regs(®s->user_regs, current); diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index dc66e6ec3a99..ffe089942ac4 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -154,7 +154,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) * page tables. */ secondary_data.task = idle; - secondary_data.stack = task_stack_page(idle) + THREAD_START_SP; + secondary_data.stack = task_stack_page(idle) + THREAD_SIZE; update_cpu_boot_status(CPU_MMU_OFF); __flush_dcache_area(&secondary_data, sizeof(secondary_data)); @@ -977,11 +977,21 @@ void smp_send_stop(void) } #ifdef CONFIG_KEXEC_CORE -void smp_send_crash_stop(void) +void crash_smp_send_stop(void) { + static int cpus_stopped; cpumask_t mask; unsigned long timeout; + /* + * This function can be called twice in panic path, but obviously + * we execute this only once. + */ + if (cpus_stopped) + return; + + cpus_stopped = 1; + if (num_online_cpus() == 1) return; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 09d37d66b630..3144584617e7 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -42,33 +42,17 @@ */ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) { - unsigned long high, low; unsigned long fp = frame->fp; - unsigned long irq_stack_ptr; + + if (fp & 0xf) + return -EINVAL; if (!tsk) tsk = current; - /* - * Switching between stacks is valid when tracing current and in - * non-preemptible context. - */ - if (tsk == current && !preemptible()) - irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); - else - irq_stack_ptr = 0; - - low = frame->sp; - /* irq stacks are not THREAD_SIZE aligned */ - if (on_irq_stack(frame->sp, raw_smp_processor_id())) - high = irq_stack_ptr; - else - high = ALIGN(low, THREAD_SIZE) - 0x20; - - if (fp < low || fp > high || fp & 0xf) + if (!on_accessible_stack(tsk, fp)) return -EINVAL; - frame->sp = fp + 0x10; frame->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp)); frame->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8)); @@ -86,34 +70,13 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* - * Check whether we are going to walk through from interrupt stack - * to task stack. - * If we reach the end of the stack - and its an interrupt stack, - * unpack the dummy frame to find the original elr. - * - * Check the frame->fp we read from the bottom of the irq_stack, - * and the original task stack pointer are both in current->stack. + * Frames created upon entry from EL0 have NULL FP and PC values, so + * don't bother reporting these. Frames created by __noreturn functions + * might have a valid FP even if PC is bogus, so only terminate where + * both are NULL. */ - if (frame->sp == irq_stack_ptr) { - struct pt_regs *irq_args; - unsigned long orig_sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); - - if (object_is_on_stack((void *)orig_sp) && - object_is_on_stack((void *)frame->fp)) { - frame->sp = orig_sp; - - /* orig_sp is the saved pt_regs, find the elr */ - irq_args = (struct pt_regs *)orig_sp; - frame->pc = irq_args->pc; - } else { - /* - * This frame has a non-standard format, and we - * didn't fix it, because the data looked wrong. - * Refuse to output this frame. - */ - return -EINVAL; - } - } + if (!frame->fp && !frame->pc) + return -EINVAL; return 0; } @@ -167,7 +130,6 @@ void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) data.no_sched_functions = 0; frame.fp = regs->regs[29]; - frame.sp = regs->sp; frame.pc = regs->pc; #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = current->curr_ret_stack; @@ -192,12 +154,10 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) if (tsk != current) { data.no_sched_functions = 1; frame.fp = thread_saved_fp(tsk); - frame.sp = thread_saved_sp(tsk); frame.pc = thread_saved_pc(tsk); } else { data.no_sched_functions = 0; frame.fp = (unsigned long)__builtin_frame_address(0); - frame.sp = current_stack_pointer; frame.pc = (unsigned long)save_stack_trace_tsk; } #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index da33c90248e9..a4391280fba9 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -50,7 +50,6 @@ unsigned long profile_pc(struct pt_regs *regs) return regs->pc; frame.fp = regs->regs[29]; - frame.sp = regs->sp; frame.pc = regs->pc; #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = -1; /* no task info */ diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 8a62648848e5..5ea4b85aee0e 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -32,6 +32,7 @@ #include <linux/sched/signal.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> +#include <linux/sizes.h> #include <linux/syscalls.h> #include <linux/mm_types.h> @@ -41,6 +42,7 @@ #include <asm/esr.h> #include <asm/insn.h> #include <asm/traps.h> +#include <asm/smp.h> #include <asm/stack_pointer.h> #include <asm/stacktrace.h> #include <asm/exception.h> @@ -143,7 +145,6 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; - unsigned long irq_stack_ptr; int skip; pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); @@ -154,25 +155,14 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) if (!try_get_task_stack(tsk)) return; - /* - * Switching between stacks is valid when tracing current and in - * non-preemptible context. - */ - if (tsk == current && !preemptible()) - irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); - else - irq_stack_ptr = 0; - if (tsk == current) { frame.fp = (unsigned long)__builtin_frame_address(0); - frame.sp = current_stack_pointer; frame.pc = (unsigned long)dump_backtrace; } else { /* * task blocked in __switch_to */ frame.fp = thread_saved_fp(tsk); - frame.sp = thread_saved_sp(tsk); frame.pc = thread_saved_pc(tsk); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -182,13 +172,12 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) skip = !!regs; printk("Call trace:\n"); while (1) { - unsigned long where = frame.pc; unsigned long stack; int ret; /* skip until specified stack frame */ if (!skip) { - dump_backtrace_entry(where); + dump_backtrace_entry(frame.pc); } else if (frame.fp == regs->regs[29]) { skip = 0; /* @@ -203,20 +192,12 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) ret = unwind_frame(tsk, &frame); if (ret < 0) break; - stack = frame.sp; - if (in_exception_text(where)) { - /* - * If we switched to the irq_stack before calling this - * exception handler, then the pt_regs will be on the - * task stack. The easiest way to tell is if the large - * pt_regs would overlap with the end of the irq_stack. - */ - if (stack < irq_stack_ptr && - (stack + sizeof(struct pt_regs)) > irq_stack_ptr) - stack = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + if (in_entry_text(frame.pc)) { + stack = frame.fp - offsetof(struct pt_regs, stackframe); - dump_mem("", "Exception stack", stack, - stack + sizeof(struct pt_regs)); + if (on_accessible_stack(tsk, stack)) + dump_mem("", "Exception stack", stack, + stack + sizeof(struct pt_regs)); } } @@ -257,8 +238,6 @@ static int __die(const char *str, int err, struct pt_regs *regs) end_of_stack(tsk)); if (!user_mode(regs)) { - dump_mem(KERN_EMERG, "Stack: ", regs->sp, - THREAD_SIZE + (unsigned long)task_stack_page(tsk)); dump_backtrace(regs, tsk); dump_instr(KERN_EMERG, regs); } @@ -484,6 +463,9 @@ static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) case ESR_ELx_SYS64_ISS_CRM_DC_CVAC: /* DC CVAC, gets promoted */ __user_cache_maint("dc civac", address, ret); break; + case ESR_ELx_SYS64_ISS_CRM_DC_CVAP: /* DC CVAP */ + __user_cache_maint("sys 3, c7, c12, 1", address, ret); + break; case ESR_ELx_SYS64_ISS_CRM_DC_CIVAC: /* DC CIVAC */ __user_cache_maint("dc civac", address, ret); break; @@ -593,7 +575,7 @@ asmlinkage long do_ni_syscall(struct pt_regs *regs) if (show_unhandled_signals_ratelimited()) { pr_info("%s[%d]: syscall %d\n", current->comm, - task_pid_nr(current), (int)regs->syscallno); + task_pid_nr(current), regs->syscallno); dump_instr("", regs); if (user_mode(regs)) __show_regs(regs); @@ -689,6 +671,43 @@ asmlinkage void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr) force_sig_info(info.si_signo, &info, current); } +#ifdef CONFIG_VMAP_STACK + +DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) + __aligned(16); + +asmlinkage void handle_bad_stack(struct pt_regs *regs) +{ + unsigned long tsk_stk = (unsigned long)current->stack; + unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr); + unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack); + unsigned int esr = read_sysreg(esr_el1); + unsigned long far = read_sysreg(far_el1); + + console_verbose(); + pr_emerg("Insufficient stack space to handle exception!"); + + pr_emerg("ESR: 0x%08x -- %s\n", esr, esr_get_class_string(esr)); + pr_emerg("FAR: 0x%016lx\n", far); + + pr_emerg("Task stack: [0x%016lx..0x%016lx]\n", + tsk_stk, tsk_stk + THREAD_SIZE); + pr_emerg("IRQ stack: [0x%016lx..0x%016lx]\n", + irq_stk, irq_stk + THREAD_SIZE); + pr_emerg("Overflow stack: [0x%016lx..0x%016lx]\n", + ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE); + + __show_regs(regs); + + /* + * We use nmi_panic to limit the potential for recusive overflows, and + * to get a better stack trace. + */ + nmi_panic(NULL, "kernel stack overflow"); + cpu_park_loop(); +} +#endif + void __pte_error(const char *file, int line, unsigned long val) { pr_err("%s:%d: bad pte %016lx.\n", file, line, val); diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index e8f759f764f2..2d419006ad43 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -110,12 +110,27 @@ int aarch32_setup_vectors_page(struct linux_binprm *bprm, int uses_interp) } #endif /* CONFIG_COMPAT */ +static int vdso_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) +{ + unsigned long new_size = new_vma->vm_end - new_vma->vm_start; + unsigned long vdso_size = vdso_end - vdso_start; + + if (vdso_size != new_size) + return -EINVAL; + + current->mm->context.vdso = (void *)new_vma->vm_start; + + return 0; +} + static struct vm_special_mapping vdso_spec[2] __ro_after_init = { { .name = "[vvar]", }, { .name = "[vdso]", + .mremap = vdso_mremap, }, }; diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 987a00ee446c..fe56c268a7d9 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -72,22 +72,6 @@ PECOFF_FILE_ALIGNMENT = 0x200; #define PECOFF_EDATA_PADDING #endif -#if defined(CONFIG_DEBUG_ALIGN_RODATA) -/* - * 4 KB granule: 1 level 2 entry - * 16 KB granule: 128 level 3 entries, with contiguous bit - * 64 KB granule: 32 level 3 entries, with contiguous bit - */ -#define SEGMENT_ALIGN SZ_2M -#else -/* - * 4 KB granule: 16 level 3 entries, with contiguous bit - * 16 KB granule: 4 level 3 entries, without contiguous bit - * 64 KB granule: 1 level 3 entry - */ -#define SEGMENT_ALIGN SZ_64K -#endif - SECTIONS { /* @@ -192,7 +176,7 @@ SECTIONS _data = .; _sdata = .; - RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) /* * Data written with the MMU off but read with the MMU on requires diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c index b81f4091c909..a81f5e10fc8c 100644 --- a/arch/arm64/kvm/hyp/s2-setup.c +++ b/arch/arm64/kvm/hyp/s2-setup.c @@ -70,7 +70,7 @@ u32 __hyp_text __init_stage2_translation(void) * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2. */ tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf; - if (IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && tmp) + if (tmp) val |= VTCR_EL2_HA; /* diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index c86b7909ef31..a0abc142c92b 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -17,3 +17,5 @@ CFLAGS_atomic_ll_sc.o := -fcall-used-x0 -ffixed-x1 -ffixed-x2 \ -fcall-saved-x10 -fcall-saved-x11 -fcall-saved-x12 \ -fcall-saved-x13 -fcall-saved-x14 -fcall-saved-x15 \ -fcall-saved-x18 + +lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o diff --git a/arch/arm64/lib/uaccess_flushcache.c b/arch/arm64/lib/uaccess_flushcache.c new file mode 100644 index 000000000000..b6ceafdb8b72 --- /dev/null +++ b/arch/arm64/lib/uaccess_flushcache.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2017 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/uaccess.h> +#include <asm/barrier.h> +#include <asm/cacheflush.h> + +void memcpy_flushcache(void *dst, const void *src, size_t cnt) +{ + /* + * We assume this should not be called with @dst pointing to + * non-cacheable memory, such that we don't need an explicit + * barrier to order the cache maintenance against the memcpy. + */ + memcpy(dst, src, cnt); + __clean_dcache_area_pop(dst, cnt); +} +EXPORT_SYMBOL_GPL(memcpy_flushcache); + +void memcpy_page_flushcache(char *to, struct page *page, size_t offset, + size_t len) +{ + memcpy_flushcache(to, page_address(page) + offset, len); +} + +unsigned long __copy_user_flushcache(void *to, const void __user *from, + unsigned long n) +{ + unsigned long rc = __arch_copy_from_user(to, from, n); + + /* See above */ + __clean_dcache_area_pop(to, n - rc); + return rc; +} diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 83c27b6e6dca..7f1dbe962cf5 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -109,20 +109,25 @@ ENTRY(__clean_dcache_area_pou) ENDPROC(__clean_dcache_area_pou) /* - * __dma_inv_area(start, size) - * - start - virtual start address of region + * __inval_dcache_area(kaddr, size) + * + * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * are invalidated. Any partial lines at the ends of the interval are + * also cleaned to PoC to prevent data loss. + * + * - kaddr - kernel address * - size - size in question */ -__dma_inv_area: - add x1, x1, x0 +ENTRY(__inval_dcache_area) /* FALLTHROUGH */ /* - * __inval_cache_range(start, end) - * - start - start address of region - * - end - end address of region + * __dma_inv_area(start, size) + * - start - virtual start address of region + * - size - size in question */ -ENTRY(__inval_cache_range) +__dma_inv_area: + add x1, x1, x0 dcache_line_size x2, x3 sub x3, x2, #1 tst x1, x3 // end cache line aligned? @@ -140,7 +145,7 @@ ENTRY(__inval_cache_range) b.lo 2b dsb sy ret -ENDPIPROC(__inval_cache_range) +ENDPIPROC(__inval_dcache_area) ENDPROC(__dma_inv_area) /* @@ -167,6 +172,20 @@ ENDPIPROC(__clean_dcache_area_poc) ENDPROC(__dma_clean_area) /* + * __clean_dcache_area_pop(kaddr, size) + * + * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * are cleaned to the PoP. + * + * - kaddr - kernel address + * - size - size in question + */ +ENTRY(__clean_dcache_area_pop) + dcache_by_line_op cvap, sy, x0, x1, x2, x3 + ret +ENDPIPROC(__clean_dcache_area_pop) + +/* * __dma_flush_area(start, size) * * clean & invalidate D / U line diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index f27d4dd04384..614af886b7ef 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -42,7 +42,7 @@ static pgprot_t __get_dma_pgprot(unsigned long attrs, pgprot_t prot, return prot; } -static struct gen_pool *atomic_pool; +static struct gen_pool *atomic_pool __ro_after_init; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; @@ -425,7 +425,7 @@ static int __init atomic_pool_init(void) gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, - (void *)PAGE_SHIFT); + NULL); pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", atomic_pool_size / 1024); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 1f22a41565a3..89993c4be1be 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -34,6 +34,7 @@ #include <linux/hugetlb.h> #include <asm/bug.h> +#include <asm/cmpxchg.h> #include <asm/cpufeature.h> #include <asm/exception.h> #include <asm/debug-monitors.h> @@ -82,6 +83,49 @@ static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr) } #endif +static void data_abort_decode(unsigned int esr) +{ + pr_alert("Data abort info:\n"); + + if (esr & ESR_ELx_ISV) { + pr_alert(" Access size = %u byte(s)\n", + 1U << ((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT)); + pr_alert(" SSE = %lu, SRT = %lu\n", + (esr & ESR_ELx_SSE) >> ESR_ELx_SSE_SHIFT, + (esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT); + pr_alert(" SF = %lu, AR = %lu\n", + (esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT, + (esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT); + } else { + pr_alert(" ISV = 0, ISS = 0x%08lu\n", esr & ESR_ELx_ISS_MASK); + } + + pr_alert(" CM = %lu, WnR = %lu\n", + (esr & ESR_ELx_CM) >> ESR_ELx_CM_SHIFT, + (esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT); +} + +/* + * Decode mem abort information + */ +static void mem_abort_decode(unsigned int esr) +{ + pr_alert("Mem abort info:\n"); + + pr_alert(" Exception class = %s, IL = %u bits\n", + esr_get_class_string(esr), + (esr & ESR_ELx_IL) ? 32 : 16); + pr_alert(" SET = %lu, FnV = %lu\n", + (esr & ESR_ELx_SET_MASK) >> ESR_ELx_SET_SHIFT, + (esr & ESR_ELx_FnV) >> ESR_ELx_FnV_SHIFT); + pr_alert(" EA = %lu, S1PTW = %lu\n", + (esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT, + (esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT); + + if (esr_is_data_abort(esr)) + data_abort_decode(esr); +} + /* * Dump out the page tables associated with 'addr' in the currently active mm. */ @@ -139,7 +183,6 @@ void show_pte(unsigned long addr) pr_cont("\n"); } -#ifdef CONFIG_ARM64_HW_AFDBM /* * This function sets the access flags (dirty, accessed), as well as write * permission, and only to a more permissive setting. @@ -154,18 +197,13 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { - pteval_t old_pteval; - unsigned int tmp; + pteval_t old_pteval, pteval; if (pte_same(*ptep, entry)) return 0; /* only preserve the access flags and write permission */ - pte_val(entry) &= PTE_AF | PTE_WRITE | PTE_DIRTY; - - /* set PTE_RDONLY if actual read-only or clean PTE */ - if (!pte_write(entry) || !pte_sw_dirty(entry)) - pte_val(entry) |= PTE_RDONLY; + pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY; /* * Setting the flags must be done atomically to avoid racing with the @@ -174,21 +212,18 @@ int ptep_set_access_flags(struct vm_area_struct *vma, * (calculated as: a & b == ~(~a | ~b)). */ pte_val(entry) ^= PTE_RDONLY; - asm volatile("// ptep_set_access_flags\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " eor %0, %0, %3 // negate PTE_RDONLY in *ptep\n" - " orr %0, %0, %4 // set flags\n" - " eor %0, %0, %3 // negate final PTE_RDONLY\n" - " stxr %w1, %0, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (old_pteval), "=&r" (tmp), "+Q" (pte_val(*ptep)) - : "L" (PTE_RDONLY), "r" (pte_val(entry))); + pteval = READ_ONCE(pte_val(*ptep)); + do { + old_pteval = pteval; + pteval ^= PTE_RDONLY; + pteval |= pte_val(entry); + pteval ^= PTE_RDONLY; + pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval); + } while (pteval != old_pteval); flush_tlb_fix_spurious_fault(vma, address); return 1; } -#endif static bool is_el1_instruction_abort(unsigned int esr) { @@ -248,6 +283,8 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr, pr_alert("Unable to handle kernel %s at virtual address %08lx\n", msg, addr); + mem_abort_decode(esr); + show_pte(addr); die("Oops", regs, esr); bust_spinlocks(0); @@ -705,6 +742,8 @@ asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr, pr_alert("Unhandled fault: %s (0x%08x) at 0x%016lx\n", inf->name, esr, addr); + mem_abort_decode(esr); + info.si_signo = inf->sig; info.si_errno = 0; info.si_code = inf->code; diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 21a8d828cbf4..e36ed5087b5c 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -83,3 +83,19 @@ EXPORT_SYMBOL(flush_dcache_page); * Additional functions defined in assembly. */ EXPORT_SYMBOL(flush_icache_range); + +#ifdef CONFIG_ARCH_HAS_PMEM_API +void arch_wb_cache_pmem(void *addr, size_t size) +{ + /* Ensure order against any prior non-cacheable writes */ + dmb(osh); + __clean_dcache_area_pop(addr, size); +} +EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); + +void arch_invalidate_pmem(void *addr, size_t size) +{ + __inval_dcache_area(addr, size); +} +EXPORT_SYMBOL_GPL(arch_invalidate_pmem); +#endif diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 656e0ece2289..6cb0fa92a651 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -41,6 +41,16 @@ int pud_huge(pud_t pud) #endif } +/* + * Select all bits except the pfn + */ +static inline pgprot_t pte_pgprot(pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + + return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); +} + static int find_num_contig(struct mm_struct *mm, unsigned long addr, pte_t *ptep, size_t *pgsize) { @@ -58,15 +68,107 @@ static int find_num_contig(struct mm_struct *mm, unsigned long addr, return CONT_PTES; } +static inline int num_contig_ptes(unsigned long size, size_t *pgsize) +{ + int contig_ptes = 0; + + *pgsize = size; + + switch (size) { +#ifdef CONFIG_ARM64_4K_PAGES + case PUD_SIZE: +#endif + case PMD_SIZE: + contig_ptes = 1; + break; + case CONT_PMD_SIZE: + *pgsize = PMD_SIZE; + contig_ptes = CONT_PMDS; + break; + case CONT_PTE_SIZE: + *pgsize = PAGE_SIZE; + contig_ptes = CONT_PTES; + break; + } + + return contig_ptes; +} + +/* + * Changing some bits of contiguous entries requires us to follow a + * Break-Before-Make approach, breaking the whole contiguous set + * before we can change any entries. See ARM DDI 0487A.k_iss10775, + * "Misprogramming of the Contiguous bit", page D4-1762. + * + * This helper performs the break step. + */ +static pte_t get_clear_flush(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, + unsigned long pgsize, + unsigned long ncontig) +{ + struct vm_area_struct vma = { .vm_mm = mm }; + pte_t orig_pte = huge_ptep_get(ptep); + bool valid = pte_valid(orig_pte); + unsigned long i, saddr = addr; + + for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) { + pte_t pte = ptep_get_and_clear(mm, addr, ptep); + + /* + * If HW_AFDBM is enabled, then the HW could turn on + * the dirty bit for any page in the set, so check + * them all. All hugetlb entries are already young. + */ + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + } + + if (valid) + flush_tlb_range(&vma, saddr, addr); + return orig_pte; +} + +/* + * Changing some bits of contiguous entries requires us to follow a + * Break-Before-Make approach, breaking the whole contiguous set + * before we can change any entries. See ARM DDI 0487A.k_iss10775, + * "Misprogramming of the Contiguous bit", page D4-1762. + * + * This helper performs the break step for use cases where the + * original pte is not needed. + */ +static void clear_flush(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, + unsigned long pgsize, + unsigned long ncontig) +{ + struct vm_area_struct vma = { .vm_mm = mm }; + unsigned long i, saddr = addr; + + for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) + pte_clear(mm, addr, ptep); + + flush_tlb_range(&vma, saddr, addr); +} + void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { size_t pgsize; int i; int ncontig; - unsigned long pfn; + unsigned long pfn, dpfn; pgprot_t hugeprot; + /* + * Code needs to be expanded to handle huge swap and migration + * entries. Needed for HUGETLB and MEMORY_FAILURE. + */ + WARN_ON(!pte_present(pte)); + if (!pte_cont(pte)) { set_pte_at(mm, addr, ptep, pte); return; @@ -74,17 +176,30 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, ncontig = find_num_contig(mm, addr, ptep, &pgsize); pfn = pte_pfn(pte); - hugeprot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); - for (i = 0; i < ncontig; i++) { + dpfn = pgsize >> PAGE_SHIFT; + hugeprot = pte_pgprot(pte); + + clear_flush(mm, addr, ptep, pgsize, ncontig); + + for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) { pr_debug("%s: set pte %p to 0x%llx\n", __func__, ptep, pte_val(pfn_pte(pfn, hugeprot))); set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); - ptep++; - pfn += pgsize >> PAGE_SHIFT; - addr += pgsize; } } +void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ + int i, ncontig; + size_t pgsize; + + ncontig = num_contig_ptes(sz, &pgsize); + + for (i = 0; i < ncontig; i++, ptep++) + set_pte(ptep, pte); +} + pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { @@ -144,19 +259,28 @@ pte_t *huge_pte_offset(struct mm_struct *mm, return NULL; pud = pud_offset(pgd, addr); - if (pud_none(*pud)) + if (sz != PUD_SIZE && pud_none(*pud)) return NULL; - /* swap or huge page */ - if (!pud_present(*pud) || pud_huge(*pud)) + /* hugepage or swap? */ + if (pud_huge(*pud) || !pud_present(*pud)) return (pte_t *)pud; /* table; check the next level */ + if (sz == CONT_PMD_SIZE) + addr &= CONT_PMD_MASK; + pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) + if (!(sz == PMD_SIZE || sz == CONT_PMD_SIZE) && + pmd_none(*pmd)) return NULL; - if (!pmd_present(*pmd) || pmd_huge(*pmd)) + if (pmd_huge(*pmd) || !pmd_present(*pmd)) return (pte_t *)pmd; + if (sz == CONT_PTE_SIZE) { + pte_t *pte = pte_offset_kernel(pmd, (addr & CONT_PTE_MASK)); + return pte; + } + return NULL; } @@ -176,111 +300,133 @@ pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, return entry; } +void huge_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long sz) +{ + int i, ncontig; + size_t pgsize; + + ncontig = num_contig_ptes(sz, &pgsize); + + for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) + pte_clear(mm, addr, ptep); +} + pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_t pte; + int ncontig; + size_t pgsize; + pte_t orig_pte = huge_ptep_get(ptep); - if (pte_cont(*ptep)) { - int ncontig, i; - size_t pgsize; - bool is_dirty = false; - - ncontig = find_num_contig(mm, addr, ptep, &pgsize); - /* save the 1st pte to return */ - pte = ptep_get_and_clear(mm, addr, ptep); - for (i = 1, addr += pgsize; i < ncontig; ++i, addr += pgsize) { - /* - * If HW_AFDBM is enabled, then the HW could - * turn on the dirty bit for any of the page - * in the set, so check them all. - */ - ++ptep; - if (pte_dirty(ptep_get_and_clear(mm, addr, ptep))) - is_dirty = true; - } - if (is_dirty) - return pte_mkdirty(pte); - else - return pte; - } else { + if (!pte_cont(orig_pte)) return ptep_get_and_clear(mm, addr, ptep); - } + + ncontig = find_num_contig(mm, addr, ptep, &pgsize); + + return get_clear_flush(mm, addr, ptep, pgsize, ncontig); } int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { - if (pte_cont(pte)) { - int ncontig, i, changed = 0; - size_t pgsize = 0; - unsigned long pfn = pte_pfn(pte); - /* Select all bits except the pfn */ - pgprot_t hugeprot = - __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ - pte_val(pte)); - - pfn = pte_pfn(pte); - ncontig = find_num_contig(vma->vm_mm, addr, ptep, - &pgsize); - for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) { - changed |= ptep_set_access_flags(vma, addr, ptep, - pfn_pte(pfn, - hugeprot), - dirty); - pfn += pgsize >> PAGE_SHIFT; - } - return changed; - } else { + int ncontig, i, changed = 0; + size_t pgsize = 0; + unsigned long pfn = pte_pfn(pte), dpfn; + pgprot_t hugeprot; + pte_t orig_pte; + + if (!pte_cont(pte)) return ptep_set_access_flags(vma, addr, ptep, pte, dirty); - } + + ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize); + dpfn = pgsize >> PAGE_SHIFT; + + orig_pte = get_clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig); + if (!pte_same(orig_pte, pte)) + changed = 1; + + /* Make sure we don't lose the dirty state */ + if (pte_dirty(orig_pte)) + pte = pte_mkdirty(pte); + + hugeprot = pte_pgprot(pte); + for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) + set_pte_at(vma->vm_mm, addr, ptep, pfn_pte(pfn, hugeprot)); + + return changed; } void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - if (pte_cont(*ptep)) { - int ncontig, i; - size_t pgsize = 0; - - ncontig = find_num_contig(mm, addr, ptep, &pgsize); - for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) - ptep_set_wrprotect(mm, addr, ptep); - } else { + unsigned long pfn, dpfn; + pgprot_t hugeprot; + int ncontig, i; + size_t pgsize; + pte_t pte; + + if (!pte_cont(*ptep)) { ptep_set_wrprotect(mm, addr, ptep); + return; } + + ncontig = find_num_contig(mm, addr, ptep, &pgsize); + dpfn = pgsize >> PAGE_SHIFT; + + pte = get_clear_flush(mm, addr, ptep, pgsize, ncontig); + pte = pte_wrprotect(pte); + + hugeprot = pte_pgprot(pte); + pfn = pte_pfn(pte); + + for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) + set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); } void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - if (pte_cont(*ptep)) { - int ncontig, i; - size_t pgsize = 0; - - ncontig = find_num_contig(vma->vm_mm, addr, ptep, - &pgsize); - for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) - ptep_clear_flush(vma, addr, ptep); - } else { + size_t pgsize; + int ncontig; + + if (!pte_cont(*ptep)) { ptep_clear_flush(vma, addr, ptep); + return; } + + ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize); + clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig); } static __init int setup_hugepagesz(char *opt) { unsigned long ps = memparse(opt, &opt); - if (ps == PMD_SIZE) { - hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - } else if (ps == PUD_SIZE) { - hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); - } else { - hugetlb_bad_size(); - pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); - return 0; + switch (ps) { +#ifdef CONFIG_ARM64_4K_PAGES + case PUD_SIZE: +#endif + case PMD_SIZE * CONT_PMDS: + case PMD_SIZE: + case PAGE_SIZE * CONT_PTES: + hugetlb_add_hstate(ilog2(ps) - PAGE_SHIFT); + return 1; } - return 1; + + hugetlb_bad_size(); + pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); + return 0; } __setup("hugepagesz=", setup_hugepagesz); + +#ifdef CONFIG_ARM64_64K_PAGES +static __init int add_default_hugepagesz(void) +{ + if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL) + hugetlb_add_hstate(CONT_PTE_SHIFT); + return 0; +} +arch_initcall(add_default_hugepagesz); +#endif diff --git a/arch/blackfin/include/asm/spinlock.h b/arch/blackfin/include/asm/spinlock.h index c58f4a83ed6f..f6431439d15d 100644 --- a/arch/blackfin/include/asm/spinlock.h +++ b/arch/blackfin/include/asm/spinlock.h @@ -48,11 +48,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) __raw_spin_unlock_asm(&lock->lock); } -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - static inline int arch_read_can_lock(arch_rwlock_t *rw) { return __raw_uncached_fetch_asm(&rw->lock) > 0; diff --git a/arch/blackfin/kernel/module.c b/arch/blackfin/kernel/module.c index 0188c933b155..15af5768c403 100644 --- a/arch/blackfin/kernel/module.c +++ b/arch/blackfin/kernel/module.c @@ -4,8 +4,6 @@ * Licensed under the GPL-2 or later */ -#define pr_fmt(fmt) "module %s: " fmt, mod->name - #include <linux/moduleloader.h> #include <linux/elf.h> #include <linux/vmalloc.h> @@ -16,6 +14,11 @@ #include <asm/cacheflush.h> #include <linux/uaccess.h> +#define mod_err(mod, fmt, ...) \ + pr_err("module %s: " fmt, (mod)->name, ##__VA_ARGS__) +#define mod_debug(mod, fmt, ...) \ + pr_debug("module %s: " fmt, (mod)->name, ##__VA_ARGS__) + /* Transfer the section to the L1 memory */ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, @@ -44,7 +47,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_inst_sram_alloc(s->sh_size); mod->arch.text_l1 = dest; if (dest == NULL) { - pr_err("L1 inst memory allocation failed\n"); + mod_err(mod, "L1 inst memory allocation failed\n"); return -1; } dma_memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -56,7 +59,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_data_sram_alloc(s->sh_size); mod->arch.data_a_l1 = dest; if (dest == NULL) { - pr_err("L1 data memory allocation failed\n"); + mod_err(mod, "L1 data memory allocation failed\n"); return -1; } memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -68,7 +71,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_data_sram_zalloc(s->sh_size); mod->arch.bss_a_l1 = dest; if (dest == NULL) { - pr_err("L1 data memory allocation failed\n"); + mod_err(mod, "L1 data memory allocation failed\n"); return -1; } @@ -77,7 +80,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_data_B_sram_alloc(s->sh_size); mod->arch.data_b_l1 = dest; if (dest == NULL) { - pr_err("L1 data memory allocation failed\n"); + mod_err(mod, "L1 data memory allocation failed\n"); return -1; } memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -87,7 +90,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_data_B_sram_alloc(s->sh_size); mod->arch.bss_b_l1 = dest; if (dest == NULL) { - pr_err("L1 data memory allocation failed\n"); + mod_err(mod, "L1 data memory allocation failed\n"); return -1; } memset(dest, 0, s->sh_size); @@ -99,7 +102,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l2_sram_alloc(s->sh_size); mod->arch.text_l2 = dest; if (dest == NULL) { - pr_err("L2 SRAM allocation failed\n"); + mod_err(mod, "L2 SRAM allocation failed\n"); return -1; } memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -111,7 +114,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l2_sram_alloc(s->sh_size); mod->arch.data_l2 = dest; if (dest == NULL) { - pr_err("L2 SRAM allocation failed\n"); + mod_err(mod, "L2 SRAM allocation failed\n"); return -1; } memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -123,7 +126,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l2_sram_zalloc(s->sh_size); mod->arch.bss_l2 = dest; if (dest == NULL) { - pr_err("L2 SRAM allocation failed\n"); + mod_err(mod, "L2 SRAM allocation failed\n"); return -1; } @@ -157,8 +160,8 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, Elf32_Sym *sym; unsigned long location, value, size; - pr_debug("applying relocate section %u to %u\n", - relsec, sechdrs[relsec].sh_info); + mod_debug(mod, "applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { /* This is where to make the change */ @@ -174,14 +177,14 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, #ifdef CONFIG_SMP if (location >= COREB_L1_DATA_A_START) { - pr_err("cannot relocate in L1: %u (SMP kernel)\n", + mod_err(mod, "cannot relocate in L1: %u (SMP kernel)\n", ELF32_R_TYPE(rel[i].r_info)); return -ENOEXEC; } #endif - pr_debug("location is %lx, value is %lx type is %d\n", - location, value, ELF32_R_TYPE(rel[i].r_info)); + mod_debug(mod, "location is %lx, value is %lx type is %d\n", + location, value, ELF32_R_TYPE(rel[i].r_info)); switch (ELF32_R_TYPE(rel[i].r_info)) { @@ -200,12 +203,12 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, case R_BFIN_PCREL12_JUMP: case R_BFIN_PCREL12_JUMP_S: case R_BFIN_PCREL10: - pr_err("unsupported relocation: %u (no -mlong-calls?)\n", + mod_err(mod, "unsupported relocation: %u (no -mlong-calls?)\n", ELF32_R_TYPE(rel[i].r_info)); return -ENOEXEC; default: - pr_err("unknown relocation: %u\n", + mod_err(mod, "unknown relocation: %u\n", ELF32_R_TYPE(rel[i].r_info)); return -ENOEXEC; } @@ -222,7 +225,7 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, isram_memcpy((void *)location, &value, size); break; default: - pr_err("invalid relocation for %#lx\n", location); + mod_err(mod, "invalid relocation for %#lx\n", location); return -ENOEXEC; } } diff --git a/arch/cris/arch-v32/mach-a3/arbiter.c b/arch/cris/arch-v32/mach-a3/arbiter.c index ab5c421a4de8..735a9b0abdb8 100644 --- a/arch/cris/arch-v32/mach-a3/arbiter.c +++ b/arch/cris/arch-v32/mach-a3/arbiter.c @@ -227,7 +227,7 @@ static void crisv32_arbiter_config(int arbiter, int region, int unused_slots) } } -extern char _stext, _etext; +extern char _stext[], _etext[]; static void crisv32_arbiter_init(void) { @@ -265,7 +265,7 @@ static void crisv32_arbiter_init(void) #ifndef CONFIG_ETRAX_KGDB /* Global watch for writes to kernel text segment. */ - crisv32_arbiter_watch(virt_to_phys(&_stext), &_etext - &_stext, + crisv32_arbiter_watch(virt_to_phys(_stext), _etext - _stext, MARB_CLIENTS(arbiter_all_clients, arbiter_bar_all_clients), arbiter_all_write, NULL); #endif diff --git a/arch/cris/arch-v32/mach-fs/arbiter.c b/arch/cris/arch-v32/mach-fs/arbiter.c index c97f4d8120f9..047c70bdbb23 100644 --- a/arch/cris/arch-v32/mach-fs/arbiter.c +++ b/arch/cris/arch-v32/mach-fs/arbiter.c @@ -158,7 +158,7 @@ static void crisv32_arbiter_config(int region, int unused_slots) } } -extern char _stext, _etext; +extern char _stext[], _etext[]; static void crisv32_arbiter_init(void) { @@ -190,7 +190,7 @@ static void crisv32_arbiter_init(void) #ifndef CONFIG_ETRAX_KGDB /* Global watch for writes to kernel text segment. */ - crisv32_arbiter_watch(virt_to_phys(&_stext), &_etext - &_stext, + crisv32_arbiter_watch(virt_to_phys(_stext), _etext - _stext, arbiter_all_clients, arbiter_all_write, NULL); #endif } diff --git a/arch/cris/kernel/traps.c b/arch/cris/kernel/traps.c index a01636a12a6e..d98131c45bb5 100644 --- a/arch/cris/kernel/traps.c +++ b/arch/cris/kernel/traps.c @@ -42,7 +42,7 @@ void (*nmi_handler)(struct pt_regs *); void show_trace(unsigned long *stack) { unsigned long addr, module_start, module_end; - extern char _stext, _etext; + extern char _stext[], _etext[]; int i; pr_err("\nCall Trace: "); @@ -69,8 +69,8 @@ void show_trace(unsigned long *stack) * down the cause of the crash will be able to figure * out the call path that was taken. */ - if (((addr >= (unsigned long)&_stext) && - (addr <= (unsigned long)&_etext)) || + if (((addr >= (unsigned long)_stext) && + (addr <= (unsigned long)_etext)) || ((addr >= module_start) && (addr <= module_end))) { #ifdef CONFIG_KALLSYMS print_ip_sym(addr); diff --git a/arch/frv/include/asm/futex.h b/arch/frv/include/asm/futex.h index 2e1da71e27a4..ab346f5f8820 100644 --- a/arch/frv/include/asm/futex.h +++ b/arch/frv/include/asm/futex.h @@ -7,7 +7,8 @@ #include <asm/errno.h> #include <linux/uaccess.h> -extern int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr); +extern int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr); static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, diff --git a/arch/frv/kernel/futex.c b/arch/frv/kernel/futex.c index d155ca9e5098..37f7b2bf7f73 100644 --- a/arch/frv/kernel/futex.c +++ b/arch/frv/kernel/futex.c @@ -186,20 +186,10 @@ static inline int atomic_futex_op_xchg_xor(int oparg, u32 __user *uaddr, int *_o /* * do the futex operations */ -int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -225,18 +215,9 @@ int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; break; - } - } + if (!ret) + *oval = oldval; return ret; -} /* end futex_atomic_op_inuser() */ +} /* end arch_futex_atomic_op_inuser() */ diff --git a/arch/h8300/include/asm/traps.h b/arch/h8300/include/asm/traps.h index 15e701130b27..1c5a30ec2df8 100644 --- a/arch/h8300/include/asm/traps.h +++ b/arch/h8300/include/asm/traps.h @@ -33,9 +33,9 @@ extern unsigned long *_interrupt_redirect_table; #define TRAP2_VEC 10 #define TRAP3_VEC 11 -extern char _start, _etext; +extern char _start[], _etext[]; #define check_kernel_text(addr) \ - ((addr >= (unsigned long)(&_start)) && \ - (addr < (unsigned long)(&_etext)) && !(addr & 1)) + ((addr >= (unsigned long)(_start)) && \ + (addr < (unsigned long)(_etext)) && !(addr & 1)) #endif /* _H8300_TRAPS_H */ diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h index a62ba368b27d..fb3dfb2a667e 100644 --- a/arch/hexagon/include/asm/atomic.h +++ b/arch/hexagon/include/asm/atomic.h @@ -42,6 +42,8 @@ static inline void atomic_set(atomic_t *v, int new) ); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + /** * atomic_read - reads a word, atomically * @v: pointer to atomic value diff --git a/arch/hexagon/include/asm/futex.h b/arch/hexagon/include/asm/futex.h index 7e597f8434da..c607b77c8215 100644 --- a/arch/hexagon/include/asm/futex.h +++ b/arch/hexagon/include/asm/futex.h @@ -31,18 +31,9 @@ static inline int -futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; pagefault_disable(); @@ -72,30 +63,9 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/hexagon/include/asm/spinlock.h b/arch/hexagon/include/asm/spinlock.h index a1c55788c5d6..53a8d5885887 100644 --- a/arch/hexagon/include/asm/spinlock.h +++ b/arch/hexagon/include/asm/spinlock.h @@ -179,11 +179,6 @@ static inline unsigned int arch_spin_trylock(arch_spinlock_t *lock) */ #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - #define arch_spin_is_locked(x) ((x)->lock != 0) #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h index a3d0211970e9..c86a947f5368 100644 --- a/arch/ia64/include/asm/acpi.h +++ b/arch/ia64/include/asm/acpi.h @@ -112,8 +112,6 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf) buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP; } -#define acpi_unlazy_tlb(x) - #ifdef CONFIG_ACPI_NUMA extern cpumask_t early_cpu_possible_map; #define for_each_possible_early_cpu(cpu) \ diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h index 76acbcd5c060..6d67dc1eaf2b 100644 --- a/arch/ia64/include/asm/futex.h +++ b/arch/ia64/include/asm/futex.h @@ -45,18 +45,9 @@ do { \ } while (0) static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -84,17 +75,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index ca9e76149a4a..df2c121164b8 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -76,22 +76,6 @@ static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) ACCESS_ONCE(*p) = (tmp + 2) & ~1; } -static __always_inline void __ticket_spin_unlock_wait(arch_spinlock_t *lock) -{ - int *p = (int *)&lock->lock, ticket; - - ia64_invala(); - - for (;;) { - asm volatile ("ld4.c.nc %0=[%1]" : "=r"(ticket) : "r"(p) : "memory"); - if (!(((ticket >> TICKET_SHIFT) ^ ticket) & TICKET_MASK)) - return; - cpu_relax(); - } - - smp_acquire__after_ctrl_dep(); -} - static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) { long tmp = ACCESS_ONCE(lock->lock); @@ -143,11 +127,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, arch_spin_lock(lock); } -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - __ticket_spin_unlock_wait(lock); -} - #define arch_read_can_lock(rw) (*(volatile int *)(rw) >= 0) #define arch_write_can_lock(rw) (*(volatile int *)(rw) == 0) diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 7508c306aa9e..1d29b2f8726b 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -159,12 +159,12 @@ int acpi_request_vector(u32 int_type) return vector; } -char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size) +void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size) { - return __va(phys_addr); + return __va(phys); } -void __init __acpi_unmap_table(char *map, unsigned long size) +void __init __acpi_unmap_table(void __iomem *map, unsigned long size) { } diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 121295637d0d..81416000c5e0 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -757,14 +757,14 @@ efi_memmap_intersects (unsigned long phys_addr, unsigned long size) return 0; } -u32 +int efi_mem_type (unsigned long phys_addr) { efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); if (md) return md->type; - return 0; + return -EINVAL; } u64 diff --git a/arch/m32r/include/asm/flat.h b/arch/m32r/include/asm/flat.h index 455ce7ddbf14..dfcb0e4eb256 100644 --- a/arch/m32r/include/asm/flat.h +++ b/arch/m32r/include/asm/flat.h @@ -95,7 +95,7 @@ static inline unsigned long m32r_flat_get_addr_from_rp (u32 *rp, return ~0; /* bogus value */ } -static inline void flat_put_addr_at_rp(u32 *rp, u32 addr, u32 relval) +static inline int flat_put_addr_at_rp(u32 *rp, u32 addr, u32 relval) { unsigned int reloc = flat_m32r_get_reloc_type (relval); if (reloc & 0xf0) { @@ -133,6 +133,7 @@ static inline void flat_put_addr_at_rp(u32 *rp, u32 addr, u32 relval) break; } } + return 0; } // kludge - text_len is a local variable in the only user. diff --git a/arch/m32r/include/asm/spinlock.h b/arch/m32r/include/asm/spinlock.h index 323c7fc953cd..a56825592b90 100644 --- a/arch/m32r/include/asm/spinlock.h +++ b/arch/m32r/include/asm/spinlock.h @@ -30,11 +30,6 @@ #define arch_spin_is_locked(x) (*(volatile int *)(&(x)->slock) <= 0) #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, VAL > 0); -} - /** * arch_spin_trylock - Try spin lock and return a result * @lock: Pointer to the lock variable diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index ddff1164aff0..54191f6fc715 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -49,6 +49,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -465,8 +466,10 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_MSM6242=m CONFIG_RTC_DRV_RP5C01=m # CONFIG_IOMMU_SUPPORT is not set @@ -477,7 +480,6 @@ CONFIG_SERIAL_CONSOLE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -587,12 +589,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 17384dc959a5..fb4663904428 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -47,6 +47,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -427,8 +428,10 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_HEARTBEAT=y @@ -436,7 +439,6 @@ CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -546,12 +548,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 53a641d62f85..4ab393e86e52 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -47,6 +47,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -442,7 +443,10 @@ CONFIG_DMASOUND_ATARI=m CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m +# CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_HEARTBEAT=y @@ -457,7 +461,6 @@ CONFIG_ATARI_DSP56K=m CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -567,12 +570,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 3925ae3a5eb3..1dd8d697545b 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -45,6 +45,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -420,15 +421,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -538,12 +540,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index f4a134b390b4..02b39f50076e 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -47,6 +47,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -430,15 +431,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -548,12 +550,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 9ed0cef632b7..044dcb2bf8fb 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -46,6 +46,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -452,15 +453,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -570,12 +572,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index efed0d48fd53..3ad04682077a 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -56,6 +56,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -520,8 +521,10 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_MSM6242=m CONFIG_RTC_DRV_RP5C01=m CONFIG_RTC_DRV_GENERIC=m @@ -540,7 +543,6 @@ CONFIG_SERIAL_CONSOLE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -650,12 +652,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 9040457c7f9c..dc2dd61948cd 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -44,6 +44,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -420,15 +421,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -538,12 +540,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 8b17f00e0484..54e7b523fc3d 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -45,6 +45,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -420,15 +421,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -538,12 +540,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 5f3718c62c85..d63d8a15f6db 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -45,6 +45,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -442,8 +443,10 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_HEARTBEAT=y @@ -451,7 +454,6 @@ CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -561,12 +563,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 8c979a68fca5..d0924c22f52a 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -42,6 +42,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -422,15 +423,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -540,12 +542,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y CONFIG_CRYPTO_RSA=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index a1e79530e806..3001ee1e5dc5 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -42,6 +42,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -422,15 +423,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -540,12 +542,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/include/asm/asm-prototypes.h b/arch/m68k/include/asm/asm-prototypes.h new file mode 100644 index 000000000000..22ccb9c97576 --- /dev/null +++ b/arch/m68k/include/asm/asm-prototypes.h @@ -0,0 +1,5 @@ +extern int __divsi3(int, int); +extern int __modsi3(int, int); +extern int __mulsi3(int, int); +extern unsigned int __udivsi3(unsigned int, unsigned int); +extern unsigned int __umodsi3(unsigned int, unsigned int); diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c index 8aa8792e3174..d96348a52362 100644 --- a/arch/m68k/mac/misc.c +++ b/arch/m68k/mac/misc.c @@ -357,6 +357,17 @@ static void cuda_shutdown(void) struct adb_request req; if (cuda_request(&req, NULL, 2, CUDA_PACKET, CUDA_POWERDOWN) < 0) return; + + /* Avoid infinite polling loop when PSU is not under Cuda control */ + switch (macintosh_config->ident) { + case MAC_MODEL_C660: + case MAC_MODEL_Q605: + case MAC_MODEL_Q605_ACC: + case MAC_MODEL_P475: + case MAC_MODEL_P475F: + return; + } + while (!req.complete) cuda_poll(); } @@ -463,8 +474,9 @@ void mac_poweroff(void) pmu_shutdown(); #endif } - local_irq_enable(); + pr_crit("It is now safe to turn off your Macintosh.\n"); + local_irq_disable(); while(1); } @@ -554,8 +566,8 @@ void mac_reset(void) } /* should never get here */ - local_irq_enable(); pr_crit("Restart failed. Please restart manually.\n"); + local_irq_disable(); while(1); } diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig index 5b7a45d99cfb..7d8b322e5101 100644 --- a/arch/metag/Kconfig +++ b/arch/metag/Kconfig @@ -26,6 +26,7 @@ config METAG select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNDERSCORE_SYMBOL_PREFIX select IRQ_DOMAIN + select GENERIC_IRQ_EFFECTIVE_AFF_MASK select MODULES_USE_ELF_RELA select OF select OF_EARLY_FLATTREE diff --git a/arch/metag/include/asm/atomic_lock1.h b/arch/metag/include/asm/atomic_lock1.h index 6c1380a8a0d4..eee779f26cc4 100644 --- a/arch/metag/include/asm/atomic_lock1.h +++ b/arch/metag/include/asm/atomic_lock1.h @@ -37,6 +37,8 @@ static inline int atomic_set(atomic_t *v, int i) return i; } +#define atomic_set_release(v, i) atomic_set((v), (i)) + #define ATOMIC_OP(op, c_op) \ static inline void atomic_##op(int i, atomic_t *v) \ { \ diff --git a/arch/metag/include/asm/spinlock.h b/arch/metag/include/asm/spinlock.h index c0c7a22be1ae..ddf7fe5708a6 100644 --- a/arch/metag/include/asm/spinlock.h +++ b/arch/metag/include/asm/spinlock.h @@ -15,11 +15,6 @@ * locked. */ -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) diff --git a/arch/microblaze/include/asm/flat.h b/arch/microblaze/include/asm/flat.h index f23c3d266bae..3d2747d4c967 100644 --- a/arch/microblaze/include/asm/flat.h +++ b/arch/microblaze/include/asm/flat.h @@ -60,7 +60,7 @@ static inline int flat_get_addr_from_rp(u32 __user *rp, u32 relval, u32 flags, * unaligned. */ -static inline void +static inline int flat_put_addr_at_rp(u32 __user *rp, u32 addr, u32 relval) { u32 *p = (__force u32 *)rp; diff --git a/arch/microblaze/include/asm/futex.h b/arch/microblaze/include/asm/futex.h index 01848f056f43..a9dad9e5e132 100644 --- a/arch/microblaze/include/asm/futex.h +++ b/arch/microblaze/include/asm/futex.h @@ -29,18 +29,9 @@ }) static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -66,30 +57,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h index 1de190bdfb9c..a9e61ea54ca9 100644 --- a/arch/mips/include/asm/futex.h +++ b/arch/mips/include/asm/futex.h @@ -83,18 +83,9 @@ } static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -125,17 +116,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 6dd13641a418..1395654cfc8d 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -872,15 +872,13 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) if (unlikely(test_thread_flag(TIF_SECCOMP))) { int ret, i; struct seccomp_data sd; + unsigned long args[6]; sd.nr = syscall; sd.arch = syscall_get_arch(); - for (i = 0; i < 6; i++) { - unsigned long v, r; - - r = mips_get_syscall_arg(&v, current, regs, i); - sd.args[i] = r ? 0 : v; - } + syscall_get_arguments(current, regs, 0, 6, args); + for (i = 0; i < 6; i++) + sd.args[i] = args[i]; sd.instruction_pointer = KSTK_EIP(current); ret = __secure_computing(&sd); diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 27c2f90eeb21..a9a7d78803cd 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -190,12 +190,6 @@ illegal_syscall: sll t1, t0, 2 beqz v0, einval lw t2, sys_call_table(t1) # syscall routine - sw a0, PT_R2(sp) # call routine directly on restart - - /* Some syscalls like execve get their arguments from struct pt_regs - and claim zero arguments in the syscall table. Thus we have to - assume the worst case and shuffle around all potential arguments. - If you want performance, don't use indirect syscalls. */ move a0, a1 # shift argument registers move a1, a2 @@ -207,11 +201,6 @@ illegal_syscall: sw t4, 16(sp) sw t5, 20(sp) sw t6, 24(sp) - sw a0, PT_R4(sp) # .. and push back a0 - a3, some - sw a1, PT_R5(sp) # syscalls expect them there - sw a2, PT_R6(sp) - sw a3, PT_R7(sp) - sw a3, PT_R26(sp) # update a3 for syscall restarting jr t2 /* Unreached */ diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index c30bc520885f..9ebe3e2403b1 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -198,7 +198,6 @@ LEAF(sys32_syscall) dsll t1, t0, 3 beqz v0, einval ld t2, sys32_call_table(t1) # syscall routine - sd a0, PT_R2(sp) # call routine directly on restart move a0, a1 # shift argument registers move a1, a2 @@ -207,11 +206,6 @@ LEAF(sys32_syscall) move a4, a5 move a5, a6 move a6, a7 - sd a0, PT_R4(sp) # ... and push back a0 - a3, some - sd a1, PT_R5(sp) # syscalls expect them there - sd a2, PT_R6(sp) - sd a3, PT_R7(sp) - sd a3, PT_R26(sp) # update a3 for syscall restarting jr t2 /* Unreached */ diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 6bace7695788..c7cbddfcdc3b 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -648,12 +648,12 @@ EXPORT_SYMBOL(flush_tlb_one); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static DEFINE_PER_CPU(atomic_t, tick_broadcast_count); -static DEFINE_PER_CPU(struct call_single_data, tick_broadcast_csd); +static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd); void tick_broadcast(const struct cpumask *mask) { atomic_t *count; - struct call_single_data *csd; + call_single_data_t *csd; int cpu; for_each_cpu(cpu, mask) { @@ -674,7 +674,7 @@ static void tick_broadcast_callee(void *info) static int __init tick_broadcast_init(void) { - struct call_single_data *csd; + call_single_data_t *csd; int cpu; for (cpu = 0; cpu < NR_CPUS; cpu++) { diff --git a/arch/mn10300/include/asm/spinlock.h b/arch/mn10300/include/asm/spinlock.h index 9c7b8f7942d8..fe413b41df6c 100644 --- a/arch/mn10300/include/asm/spinlock.h +++ b/arch/mn10300/include/asm/spinlock.h @@ -26,11 +26,6 @@ #define arch_spin_is_locked(x) (*(volatile signed char *)(&(x)->slock) != 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, !VAL); -} - static inline void arch_spin_unlock(arch_spinlock_t *lock) { asm volatile( diff --git a/arch/openrisc/include/asm/futex.h b/arch/openrisc/include/asm/futex.h index 778087341977..8fed278a24b8 100644 --- a/arch/openrisc/include/asm/futex.h +++ b/arch/openrisc/include/asm/futex.h @@ -30,20 +30,10 @@ }) static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -68,30 +58,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index dda1f558ef35..13648519bd41 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -9,6 +9,9 @@ config PARISC select ARCH_WANT_FRAME_POINTERS select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_STRICT_KERNEL_RWX + select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_WANTS_UBSAN_NO_NULL + select ARCH_SUPPORTS_MEMORY_FAILURE select RTC_CLASS select RTC_DRV_GENERIC select INIT_ALL_POSSIBLE @@ -17,6 +20,12 @@ config PARISC select BUG select BUILDTIME_EXTABLE_SORT select HAVE_PERF_EVENTS + select HAVE_KERNEL_BZIP2 + select HAVE_KERNEL_GZIP + select HAVE_KERNEL_LZ4 + select HAVE_KERNEL_LZMA + select HAVE_KERNEL_LZO + select HAVE_KERNEL_XZ select GENERIC_ATOMIC64 if !64BIT select GENERIC_IRQ_PROBE select GENERIC_PCI_IOMAP diff --git a/arch/parisc/Makefile b/arch/parisc/Makefile index 75cb451b1f03..58fae5d2449d 100644 --- a/arch/parisc/Makefile +++ b/arch/parisc/Makefile @@ -24,15 +24,20 @@ KBUILD_DEFCONFIG := default_defconfig NM = sh $(srctree)/arch/parisc/nm CHECKFLAGS += -D__hppa__=1 LIBGCC = $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name) +export LIBGCC ifdef CONFIG_64BIT UTS_MACHINE := parisc64 CHECKFLAGS += -D__LP64__=1 -m64 CC_ARCHES = hppa64 +LD_BFD := elf64-hppa-linux else # 32-bit CC_ARCHES = hppa hppa2.0 hppa1.1 +LD_BFD := elf32-hppa-linux endif +export LD_BFD + ifneq ($(SUBARCH),$(UTS_MACHINE)) ifeq ($(CROSS_COMPILE),) CC_SUFFIXES = linux linux-gnu unknown-linux-gnu @@ -88,6 +93,8 @@ libs-y += arch/parisc/lib/ $(LIBGCC) drivers-$(CONFIG_OPROFILE) += arch/parisc/oprofile/ +boot := arch/parisc/boot + PALO := $(shell if (which palo 2>&1); then : ; \ elif [ -x /sbin/palo ]; then echo /sbin/palo; \ fi) @@ -116,11 +123,14 @@ INSTALL_TARGETS = zinstall install PHONY += bzImage $(BOOT_TARGETS) $(INSTALL_TARGETS) -bzImage zImage: vmlinuz +zImage: vmlinuz Image: vmlinux -vmlinuz: vmlinux - @gzip -cf -9 $< > $@ +bzImage: vmlinux + $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ + +vmlinuz: bzImage + $(OBJCOPY) $(boot)/bzImage $@ install: $(CONFIG_SHELL) $(src)/arch/parisc/install.sh \ diff --git a/arch/parisc/boot/.gitignore b/arch/parisc/boot/.gitignore new file mode 100644 index 000000000000..017d5912ad2d --- /dev/null +++ b/arch/parisc/boot/.gitignore @@ -0,0 +1,2 @@ +image +bzImage diff --git a/arch/parisc/boot/Makefile b/arch/parisc/boot/Makefile new file mode 100644 index 000000000000..cad68a584884 --- /dev/null +++ b/arch/parisc/boot/Makefile @@ -0,0 +1,26 @@ +# +# Makefile for the linux parisc-specific parts of the boot image creator. +# + +COMPILE_VERSION := __linux_compile_version_id__`hostname | \ + tr -c '[0-9A-Za-z]' '_'`__`date | \ + tr -c '[0-9A-Za-z]' '_'`_t + +ccflags-y := -DCOMPILE_VERSION=$(COMPILE_VERSION) -gstabs -I. + +targets := image +targets += bzImage +subdir- := compressed + +$(obj)/image: vmlinux FORCE + $(call if_changed,objcopy) + +$(obj)/bzImage: $(obj)/compressed/vmlinux FORCE + $(call if_changed,objcopy) + +$(obj)/compressed/vmlinux: FORCE + $(Q)$(MAKE) $(build)=$(obj)/compressed $@ + +install: $(CONFIGURE) $(obj)/bzImage + sh -x $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/bzImage \ + System.map "$(INSTALL_PATH)" diff --git a/arch/parisc/boot/compressed/.gitignore b/arch/parisc/boot/compressed/.gitignore new file mode 100644 index 000000000000..ae06b9b4c02f --- /dev/null +++ b/arch/parisc/boot/compressed/.gitignore @@ -0,0 +1,3 @@ +sizes.h +vmlinux +vmlinux.lds diff --git a/arch/parisc/boot/compressed/Makefile b/arch/parisc/boot/compressed/Makefile new file mode 100644 index 000000000000..5450a11c9d10 --- /dev/null +++ b/arch/parisc/boot/compressed/Makefile @@ -0,0 +1,86 @@ +# +# linux/arch/parisc/boot/compressed/Makefile +# +# create a compressed self-extracting vmlinux image from the original vmlinux +# + +KCOV_INSTRUMENT := n +GCOV_PROFILE := n +UBSAN_SANITIZE := n + +targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 +targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4 +targets += misc.o piggy.o sizes.h head.o real2.o firmware.o + +KBUILD_CFLAGS := -D__KERNEL__ -O2 -DBOOTLOADER +KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS += $(cflags-y) -fno-delete-null-pointer-checks +KBUILD_CFLAGS += -fno-PIE -mno-space-regs -mdisable-fpregs +ifndef CONFIG_64BIT +KBUILD_CFLAGS += -mfast-indirect-calls +endif + +OBJECTS += $(obj)/head.o $(obj)/real2.o $(obj)/firmware.o $(obj)/misc.o $(obj)/piggy.o + +# LDFLAGS_vmlinux := -X --whole-archive -e startup -T +LDFLAGS_vmlinux := -X -e startup --as-needed -T +$(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS) $(LIBGCC) + $(call if_changed,ld) + +sed-sizes := -e 's/^\([0-9a-fA-F]*\) . \(__bss_start\|_end\|parisc_kernel_start\)$$/\#define SZ\2 0x\1/p' + +quiet_cmd_sizes = GEN $@ + cmd_sizes = $(NM) $< | sed -n $(sed-sizes) > $@ + +$(obj)/sizes.h: vmlinux + $(call if_changed,sizes) + +AFLAGS_head.o += -I$(objtree)/$(obj) -DBOOTLOADER +$(obj)/head.o: $(obj)/sizes.h + +CFLAGS_misc.o += -I$(objtree)/$(obj) +$(obj)/misc.o: $(obj)/sizes.h + +$(obj)/firmware.o: $(obj)/firmware.c +$(obj)/firmware.c: $(srctree)/arch/$(SRCARCH)/kernel/firmware.c + $(call cmd,shipped) + +AFLAGS_real2.o += -DBOOTLOADER +$(obj)/real2.o: $(obj)/real2.S +$(obj)/real2.S: $(srctree)/arch/$(SRCARCH)/kernel/real2.S + $(call cmd,shipped) + +$(obj)/misc.o: $(obj)/sizes.h + +CPPFLAGS_vmlinux.lds += -I$(objtree)/$(obj) -DBOOTLOADER +$(obj)/vmlinux.lds: $(obj)/sizes.h + +OBJCOPYFLAGS_vmlinux.bin := -O binary -R .comment -S +$(obj)/vmlinux.bin: vmlinux + $(call if_changed,objcopy) + +vmlinux.bin.all-y := $(obj)/vmlinux.bin + +suffix-$(CONFIG_KERNEL_GZIP) := gz +suffix-$(CONFIG_KERNEL_BZIP2) := bz2 +suffix-$(CONFIG_KERNEL_LZ4) := lz4 +suffix-$(CONFIG_KERNEL_LZMA) := lzma +suffix-$(CONFIG_KERNEL_LZO) := lzo +suffix-$(CONFIG_KERNEL_XZ) := xz + +$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) + $(call if_changed,gzip) +$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) + $(call if_changed,bzip2) +$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) + $(call if_changed,lz4) +$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) + $(call if_changed,lzma) +$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) + $(call if_changed,lzo) +$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) + $(call if_changed,xzkern) + +LDFLAGS_piggy.o := -r --format binary --oformat $(LD_BFD) -T +$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y) + $(call if_changed,ld) diff --git a/arch/parisc/boot/compressed/head.S b/arch/parisc/boot/compressed/head.S new file mode 100644 index 000000000000..5aba20fa48aa --- /dev/null +++ b/arch/parisc/boot/compressed/head.S @@ -0,0 +1,85 @@ +/* + * Startup glue code to uncompress the kernel + * + * (C) 2017 Helge Deller <deller@gmx.de> + */ + +#include <linux/init.h> +#include <linux/linkage.h> +#include <asm/asm-offsets.h> +#include <asm/page.h> +#include <asm/psw.h> +#include <asm/pdc.h> +#include <asm/assembly.h> +#include "sizes.h" + +#define BOOTADDR(x) (x) + +#ifndef CONFIG_64BIT + .import $global$ /* forward declaration */ +#endif /*!CONFIG_64BIT*/ + + __HEAD + +ENTRY(startup) + .level LEVEL + +#define PSW_W_SM 0x200 +#define PSW_W_BIT 36 + + ;! nuke the W bit, saving original value + .level 2.0 + rsm PSW_W_SM, %r1 + + .level 1.1 + extrw,u %r1, PSW_W_BIT-32, 1, %r1 + copy %r1, %arg0 + + /* Make sure sr4-sr7 are set to zero for the kernel address space */ + mtsp %r0,%sr4 + mtsp %r0,%sr5 + mtsp %r0,%sr6 + mtsp %r0,%sr7 + + /* Clear BSS */ + + .import _bss,data + .import _ebss,data + + load32 BOOTADDR(_bss),%r3 + load32 BOOTADDR(_ebss),%r4 + ldo FRAME_SIZE(%r4),%sp /* stack at end of bss */ +$bss_loop: + cmpb,<<,n %r3,%r4,$bss_loop + stw,ma %r0,4(%r3) + + /* Initialize the global data pointer */ + loadgp + + /* arg0..arg4 were set by palo. */ + copy %arg1, %r6 /* command line */ + copy %arg2, %r7 /* rd-start */ + copy %arg3, %r8 /* rd-end */ + load32 BOOTADDR(decompress_kernel),%r3 + +#ifdef CONFIG_64BIT + .level LEVEL + ssm PSW_W_SM, %r0 /* set W-bit */ + depdi 0, 31, 32, %r3 +#endif + load32 BOOTADDR(startup_continue), %r2 + bv,n 0(%r3) + +startup_continue: +#ifdef CONFIG_64BIT + .level LEVEL + rsm PSW_W_SM, %r0 /* clear W-bit */ +#endif + + load32 KERNEL_BINARY_TEXT_START, %arg0 /* free mem */ + copy %r6, %arg1 /* command line */ + copy %r7, %arg2 /* rd-start */ + copy %r8, %arg3 /* rd-end */ + + bv,n 0(%ret0) +END(startup) diff --git a/arch/parisc/boot/compressed/misc.c b/arch/parisc/boot/compressed/misc.c new file mode 100644 index 000000000000..13a4bf9ac4da --- /dev/null +++ b/arch/parisc/boot/compressed/misc.c @@ -0,0 +1,301 @@ +/* + * Definitions and wrapper functions for kernel decompressor + * + * (C) 2017 Helge Deller <deller@gmx.de> + */ + +#include <linux/uaccess.h> +#include <asm/unaligned.h> +#include <asm/page.h> +#include "sizes.h" + +/* + * gzip declarations + */ +#define STATIC static + +#undef memmove +#define memmove memmove +#define memzero(s, n) memset((s), 0, (n)) + +#define malloc malloc_gzip +#define free free_gzip + +/* Symbols defined by linker scripts */ +extern char input_data[]; +extern int input_len; +extern __le32 output_len; /* at unaligned address, little-endian */ +extern char _text, _end; +extern char _bss, _ebss; +extern char _startcode_end; +extern void startup_continue(void *entry, unsigned long cmdline, + unsigned long rd_start, unsigned long rd_end) __noreturn; + +void error(char *m) __noreturn; + +static unsigned long free_mem_ptr; +static unsigned long free_mem_end_ptr; + +#ifdef CONFIG_KERNEL_GZIP +#include "../../../../lib/decompress_inflate.c" +#endif + +#ifdef CONFIG_KERNEL_BZIP2 +#include "../../../../lib/decompress_bunzip2.c" +#endif + +#ifdef CONFIG_KERNEL_LZ4 +#include "../../../../lib/decompress_unlz4.c" +#endif + +#ifdef CONFIG_KERNEL_LZMA +#include "../../../../lib/decompress_unlzma.c" +#endif + +#ifdef CONFIG_KERNEL_LZO +#include "../../../../lib/decompress_unlzo.c" +#endif + +#ifdef CONFIG_KERNEL_XZ +#include "../../../../lib/decompress_unxz.c" +#endif + +void *memmove(void *dest, const void *src, size_t n) +{ + const char *s = src; + char *d = dest; + + if (d <= s) { + while (n--) + *d++ = *s++; + } else { + d += n; + s += n; + while (n--) + *--d = *--s; + } + return dest; +} + +void *memset(void *s, int c, size_t count) +{ + char *xs = (char *)s; + + while (count--) + *xs++ = c; + return s; +} + +void *memcpy(void *d, const void *s, size_t len) +{ + char *dest = (char *)d; + const char *source = (const char *)s; + + while (len--) + *dest++ = *source++; + return d; +} + +size_t strlen(const char *s) +{ + const char *sc; + + for (sc = s; *sc != '\0'; ++sc) + ; + return sc - s; +} + +char *strchr(const char *s, int c) +{ + while (*s) { + if (*s == (char)c) + return (char *)s; + ++s; + } + return NULL; +} + +int puts(const char *s) +{ + const char *nuline = s; + + while ((nuline = strchr(s, '\n')) != NULL) { + if (nuline != s) + pdc_iodc_print(s, nuline - s); + pdc_iodc_print("\r\n", 2); + s = nuline + 1; + } + if (*s != '\0') + pdc_iodc_print(s, strlen(s)); + + return 0; +} + +static int putchar(int c) +{ + char buf[2]; + + buf[0] = c; + buf[1] = '\0'; + puts(buf); + return c; +} + +void __noreturn error(char *x) +{ + puts("\n\n"); + puts(x); + puts("\n\n -- System halted"); + while (1) /* wait forever */ + ; +} + +static int print_hex(unsigned long num) +{ + const char hex[] = "0123456789abcdef"; + char str[40]; + int i = sizeof(str)-1; + + str[i--] = '\0'; + do { + str[i--] = hex[num & 0x0f]; + num >>= 4; + } while (num); + + str[i--] = 'x'; + str[i] = '0'; + puts(&str[i]); + + return 0; +} + +int printf(const char *fmt, ...) +{ + va_list args; + int i = 0; + + va_start(args, fmt); + + while (fmt[i]) { + if (fmt[i] != '%') { +put: + putchar(fmt[i++]); + continue; + } + + if (fmt[++i] == '%') + goto put; + ++i; + print_hex(va_arg(args, unsigned long)); + } + + va_end(args); + return 0; +} + +/* helper functions for libgcc */ +void abort(void) +{ + error("aborted."); +} + +#undef malloc +void *malloc(size_t size) +{ + return malloc_gzip(size); +} + +#undef free +void free(void *ptr) +{ + return free_gzip(ptr); +} + + +static void flush_data_cache(char *start, unsigned long length) +{ + char *end = start + length; + + do { + asm volatile("fdc 0(%0)" : : "r" (start)); + asm volatile("fic 0(%%sr0,%0)" : : "r" (start)); + start += 16; + } while (start < end); + asm volatile("fdc 0(%0)" : : "r" (end)); + + asm ("sync"); +} + +unsigned long decompress_kernel(unsigned int started_wide, + unsigned int command_line, + const unsigned int rd_start, + const unsigned int rd_end) +{ + char *output; + unsigned long len, len_all; + +#ifdef CONFIG_64BIT + parisc_narrow_firmware = 0; +#endif + + set_firmware_width_unlocked(); + + putchar('U'); /* if you get this p and no more, string storage */ + /* in $GLOBAL$ is wrong or %dp is wrong */ + puts("ncompressing ...\n"); + + output = (char *) KERNEL_BINARY_TEXT_START; + len_all = __pa(SZ_end) - __pa(SZparisc_kernel_start); + + if ((unsigned long) &_startcode_end > (unsigned long) output) + error("Bootcode overlaps kernel code"); + + len = get_unaligned_le32(&output_len); + if (len > len_all) + error("Output len too big."); + else + memset(&output[len], 0, len_all - len); + + /* + * Initialize free_mem_ptr and free_mem_end_ptr. + */ + free_mem_ptr = (unsigned long) &_ebss; + free_mem_ptr += 2*1024*1024; /* leave 2 MB for stack */ + + /* Limit memory for bootoader to 1GB */ + #define ARTIFICIAL_LIMIT (1*1024*1024*1024) + free_mem_end_ptr = PAGE0->imm_max_mem; + if (free_mem_end_ptr > ARTIFICIAL_LIMIT) + free_mem_end_ptr = ARTIFICIAL_LIMIT; + +#ifdef CONFIG_BLK_DEV_INITRD + /* if we have ramdisk this is at end of memory */ + if (rd_start && rd_start < free_mem_end_ptr) + free_mem_end_ptr = rd_start; +#endif + +#ifdef DEBUG + printf("startcode_end = %x\n", &_startcode_end); + printf("commandline = %x\n", command_line); + printf("rd_start = %x\n", rd_start); + printf("rd_end = %x\n", rd_end); + + printf("free_ptr = %x\n", free_mem_ptr); + printf("free_ptr_end = %x\n", free_mem_end_ptr); + + printf("input_data = %x\n", input_data); + printf("input_len = %x\n", input_len); + printf("output = %x\n", output); + printf("output_len = %x\n", len); + printf("output_max = %x\n", len_all); +#endif + + __decompress(input_data, input_len, NULL, NULL, + output, 0, NULL, error); + + flush_data_cache(output, len); + + printf("Booting kernel ...\n\n"); + + return (unsigned long) output; +} diff --git a/arch/parisc/boot/compressed/vmlinux.lds.S b/arch/parisc/boot/compressed/vmlinux.lds.S new file mode 100644 index 000000000000..a4ce3314e78e --- /dev/null +++ b/arch/parisc/boot/compressed/vmlinux.lds.S @@ -0,0 +1,101 @@ +#include <asm-generic/vmlinux.lds.h> +#include <asm/page.h> +#include "sizes.h" + +#ifndef CONFIG_64BIT +OUTPUT_FORMAT("elf32-hppa-linux") +OUTPUT_ARCH(hppa) +#else +OUTPUT_FORMAT("elf64-hppa-linux") +OUTPUT_ARCH(hppa:hppa2.0w) +#endif + +ENTRY(startup) + +SECTIONS +{ + /* palo loads at 0x60000 */ + /* loaded kernel will move to 0x10000 */ + . = 0xe0000; /* should not overwrite palo code */ + + .head.text : { + _head = . ; + HEAD_TEXT + _ehead = . ; + } + + /* keep __gp below 0x1000000 */ +#ifdef CONFIG_64BIT + . = ALIGN(16); + /* Linkage tables */ + .opd : { + *(.opd) + } PROVIDE (__gp = .); + .plt : { + *(.plt) + } + .dlt : { + *(.dlt) + } +#endif + _startcode_end = .; + + /* bootloader code and data starts behind area of extracted kernel */ + . = (SZ_end - SZparisc_kernel_start + KERNEL_BINARY_TEXT_START); + + /* align on next page boundary */ + . = ALIGN(4096); + .text : { + _text = .; /* Text */ + *(.text) + *(.text.*) + _etext = . ; + } + . = ALIGN(8); + .data : { + _data = . ; + *(.data) + *(.data.*) + _edata = . ; + } + . = ALIGN(8); + .rodata : { + _rodata = . ; + *(.rodata) /* read-only data */ + *(.rodata.*) + _erodata = . ; + } + . = ALIGN(8); + .rodata.compressed : { + *(.rodata.compressed) + } + . = ALIGN(8); + .bss : { + _bss = . ; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(4096); + _ebss = .; + } + + STABS_DEBUG + .note 0 : { *(.note) } + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { +#ifdef CONFIG_64BIT + /* temporary hack until binutils is fixed to not emit these + * for static binaries + */ + *(.PARISC.unwind) /* no unwind data */ + *(.interp) + *(.dynsym) + *(.dynstr) + *(.dynamic) + *(.hash) + *(.gnu.hash) +#endif + } +} diff --git a/arch/parisc/boot/compressed/vmlinux.scr b/arch/parisc/boot/compressed/vmlinux.scr new file mode 100644 index 000000000000..dac2d142bcfa --- /dev/null +++ b/arch/parisc/boot/compressed/vmlinux.scr @@ -0,0 +1,10 @@ +SECTIONS +{ + .rodata.compressed : { + input_len = .; + LONG(input_data_end - input_data) input_data = .; + *(.data) + output_len = . - 4; /* can be at unaligned address */ + input_data_end = .; + } +} diff --git a/arch/parisc/boot/install.sh b/arch/parisc/boot/install.sh new file mode 100644 index 000000000000..8f7c365fad83 --- /dev/null +++ b/arch/parisc/boot/install.sh @@ -0,0 +1,65 @@ +#!/bin/sh +# +# arch/parisc/install.sh, derived from arch/i386/boot/install.sh +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# Copyright (C) 1995 by Linus Torvalds +# +# Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin +# +# "make install" script for i386 architecture +# +# Arguments: +# $1 - kernel version +# $2 - kernel image file +# $3 - kernel map file +# $4 - default install path (blank if root directory) +# + +verify () { + if [ ! -f "$1" ]; then + echo "" 1>&2 + echo " *** Missing file: $1" 1>&2 + echo ' *** You need to run "make" before "make install".' 1>&2 + echo "" 1>&2 + exit 1 + fi +} + +# Make sure the files actually exist + +verify "$2" +verify "$3" + +# User may have a custom install script + +if [ -n "${INSTALLKERNEL}" ]; then + if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi + if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi +fi + +# Default install + +if [ "$(basename $2)" = "zImage" ]; then +# Compressed install + echo "Installing compressed kernel" + base=vmlinuz +else +# Normal install + echo "Installing normal kernel" + base=vmlinux +fi + +if [ -f $4/$base-$1 ]; then + mv $4/$base-$1 $4/$base-$1.old +fi +cat $2 > $4/$base-$1 + +# Install system map file +if [ -f $4/System.map-$1 ]; then + mv $4/System.map-$1 $4/System.map-$1.old +fi +cp $3 $4/System.map-$1 diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h index 5394b9c5f914..17b98a87e5e2 100644 --- a/arch/parisc/include/asm/atomic.h +++ b/arch/parisc/include/asm/atomic.h @@ -65,6 +65,8 @@ static __inline__ void atomic_set(atomic_t *v, int i) _atomic_spin_unlock_irqrestore(v, flags); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + static __inline__ int atomic_read(const atomic_t *v) { return READ_ONCE((v)->counter); diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h index 0ba14300cd8e..c601aab2fb36 100644 --- a/arch/parisc/include/asm/futex.h +++ b/arch/parisc/include/asm/futex.h @@ -32,22 +32,12 @@ _futex_spin_unlock_irqrestore(u32 __user *uaddr, unsigned long int *flags) } static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { unsigned long int flags; - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval, ret; u32 tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(*uaddr))) - return -EFAULT; - _futex_spin_lock_irqsave(uaddr, &flags); pagefault_disable(); @@ -85,17 +75,9 @@ out_pagefault_enable: pagefault_enable(); _futex_spin_unlock_irqrestore(uaddr, &flags); - if (ret == 0) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/parisc/include/asm/mmu_context.h b/arch/parisc/include/asm/mmu_context.h index a81226257878..e4a657094058 100644 --- a/arch/parisc/include/asm/mmu_context.h +++ b/arch/parisc/include/asm/mmu_context.h @@ -63,6 +63,9 @@ static inline void switch_mm(struct mm_struct *prev, { unsigned long flags; + if (prev == next) + return; + local_irq_save(flags); switch_mm_irqs_off(prev, next, tsk); local_irq_restore(flags); diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h index 80e742a1c162..bfed09d80bae 100644 --- a/arch/parisc/include/asm/page.h +++ b/arch/parisc/include/asm/page.h @@ -116,11 +116,15 @@ extern int npmem_ranges; /* This governs the relationship between virtual and physical addresses. * If you alter it, make sure to take care of our various fixed mapping * segments in fixmap.h */ +#if defined(BOOTLOADER) +#define __PAGE_OFFSET (0) /* bootloader uses physical addresses */ +#else #ifdef CONFIG_64BIT #define __PAGE_OFFSET (0x40000000) /* 1GB */ #else #define __PAGE_OFFSET (0x10000000) /* 256MB */ #endif +#endif /* BOOTLOADER */ #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) diff --git a/arch/parisc/include/asm/pdc.h b/arch/parisc/include/asm/pdc.h index 7569627a032b..26b4455baa83 100644 --- a/arch/parisc/include/asm/pdc.h +++ b/arch/parisc/include/asm/pdc.h @@ -5,6 +5,8 @@ #if !defined(__ASSEMBLY__) +extern int parisc_narrow_firmware; + extern int pdc_type; extern unsigned long parisc_cell_num; /* cell number the CPU runs on (PAT) */ extern unsigned long parisc_cell_loc; /* cell location of CPU (PAT) */ diff --git a/arch/parisc/include/asm/pdcpat.h b/arch/parisc/include/asm/pdcpat.h index e3c0586260d8..a468a172ee33 100644 --- a/arch/parisc/include/asm/pdcpat.h +++ b/arch/parisc/include/asm/pdcpat.h @@ -223,6 +223,18 @@ struct pdc_pat_mem_retinfo { /* PDC_PAT_MEM/PDC_PAT_MEM_PD_INFO (return info) */ unsigned long clear_time; /* last PDT clear time (since Jan 1970) */ }; +struct pdc_pat_mem_cell_pdt_retinfo { /* PDC_PAT_MEM/PDC_PAT_MEM_CELL_INFO */ + u64 reserved:32; + u64 cs:1; /* clear status: cleared since the last call? */ + u64 current_pdt_entries:15; + u64 ic:1; /* interleaving had to be changed ? */ + u64 max_pdt_entries:15; + unsigned long good_mem; + unsigned long first_dbe_loc; /* first location of double bit error */ + unsigned long clear_time; /* last PDT clear time (since Jan 1970) */ +}; + + struct pdc_pat_mem_read_pd_retinfo { /* PDC_PAT_MEM/PDC_PAT_MEM_PD_READ */ unsigned long actual_count_bytes; unsigned long pdt_entries; @@ -325,6 +337,8 @@ extern int pdc_pat_io_pci_cfg_read(unsigned long pci_addr, int pci_size, u32 *va extern int pdc_pat_io_pci_cfg_write(unsigned long pci_addr, int pci_size, u32 val); extern int pdc_pat_mem_pdt_info(struct pdc_pat_mem_retinfo *rinfo); +extern int pdc_pat_mem_pdt_cell_info(struct pdc_pat_mem_cell_pdt_retinfo *rinfo, + unsigned long cell); extern int pdc_pat_mem_read_cell_pdt(struct pdc_pat_mem_read_pd_retinfo *pret, unsigned long *pdt_entries_ptr, unsigned long max_entries); extern int pdc_pat_mem_read_pd_pdt(struct pdc_pat_mem_read_pd_retinfo *pret, diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h index e32936cd7f10..55bfe4affca3 100644 --- a/arch/parisc/include/asm/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h @@ -14,13 +14,6 @@ static inline int arch_spin_is_locked(arch_spinlock_t *x) #define arch_spin_lock(lock) arch_spin_lock_flags(lock, 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *x) -{ - volatile unsigned int *a = __ldcw_align(x); - - smp_cond_load_acquire(a, VAL); -} - static inline void arch_spin_lock_flags(arch_spinlock_t *x, unsigned long flags) { diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 5979745815a5..9a9c2fe4be50 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -40,9 +40,6 @@ #define MADV_SEQUENTIAL 2 /* expect sequential page references */ #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_DONTNEED 4 /* don't need these pages */ -#define MADV_SPACEAVAIL 5 /* insure that resources are reserved */ -#define MADV_VPS_PURGE 6 /* Purge pages from VM page cache */ -#define MADV_VPS_INHERIT 7 /* Inherit parents page size */ /* common/generic parameters */ #define MADV_FREE 8 /* free pages only if memory pressure */ @@ -60,6 +57,9 @@ overrides the coredump filter bits */ #define MADV_DODUMP 70 /* Clear the MADV_NODUMP flag */ +#define MADV_HWPOISON 100 /* poison a page for testing */ +#define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */ + /* compatibility flags */ #define MAP_FILE 0 #define MAP_VARIABLE 0 diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c index f622a311d04a..ab80e5c6f651 100644 --- a/arch/parisc/kernel/firmware.c +++ b/arch/parisc/kernel/firmware.c @@ -69,7 +69,15 @@ #include <asm/pdcpat.h> #include <asm/processor.h> /* for boot_cpu_data */ +#if defined(BOOTLOADER) +# undef spin_lock_irqsave +# define spin_lock_irqsave(a, b) { b = 1; } +# undef spin_unlock_irqrestore +# define spin_unlock_irqrestore(a, b) +#else static DEFINE_SPINLOCK(pdc_lock); +#endif + extern unsigned long pdc_result[NUM_PDC_RESULT]; extern unsigned long pdc_result2[NUM_PDC_RESULT]; @@ -142,8 +150,8 @@ static void convert_to_wide(unsigned long *addr) int i; unsigned int *p = (unsigned int *)addr; - if(unlikely(parisc_narrow_firmware)) { - for(i = 31; i >= 0; --i) + if (unlikely(parisc_narrow_firmware)) { + for (i = (NUM_PDC_RESULT-1); i >= 0; --i) addr[i] = p[i]; } #endif @@ -186,6 +194,8 @@ void set_firmware_width(void) } #endif /*CONFIG_64BIT*/ + +#if !defined(BOOTLOADER) /** * pdc_emergency_unlock - Unlock the linux pdc lock * @@ -979,16 +989,22 @@ int pdc_mem_pdt_read_entries(struct pdc_mem_read_pdt *pret, spin_lock_irqsave(&pdc_lock, flags); retval = mem_pdc_call(PDC_MEM, PDC_MEM_READ_PDT, __pa(pdc_result), - __pa(pdc_result2)); + __pa(pdt_entries_ptr)); if (retval == PDC_OK) { convert_to_wide(pdc_result); memcpy(pret, pdc_result, sizeof(*pret)); - convert_to_wide(pdc_result2); - memcpy(pdt_entries_ptr, pdc_result2, - pret->pdt_entries * sizeof(*pdt_entries_ptr)); } spin_unlock_irqrestore(&pdc_lock, flags); +#ifdef CONFIG_64BIT + /* + * 64-bit kernels should not call this PDT function in narrow mode. + * The pdt_entries_ptr array above will now contain 32-bit values + */ + if (WARN_ON_ONCE((retval == PDC_OK) && parisc_narrow_firmware)) + return PDC_ERROR; +#endif + return retval; } @@ -1143,6 +1159,8 @@ void pdc_io_reset_devices(void) spin_unlock_irqrestore(&pdc_lock, flags); } +#endif /* defined(BOOTLOADER) */ + /* locked by pdc_console_lock */ static int __attribute__((aligned(8))) iodc_retbuf[32]; static char __attribute__((aligned(64))) iodc_dbuf[4096]; @@ -1187,6 +1205,7 @@ print: return i; } +#if !defined(BOOTLOADER) /** * pdc_iodc_getc - Read a character (non-blocking) from the PDC console. * @@ -1440,6 +1459,29 @@ int pdc_pat_mem_pdt_info(struct pdc_pat_mem_retinfo *rinfo) } /** + * pdc_pat_mem_pdt_cell_info - Retrieve information about page deallocation + * table of a cell + * @rinfo: memory pdt information + * @cell: cell number + * + */ +int pdc_pat_mem_pdt_cell_info(struct pdc_pat_mem_cell_pdt_retinfo *rinfo, + unsigned long cell) +{ + int retval; + unsigned long flags; + + spin_lock_irqsave(&pdc_lock, flags); + retval = mem_pdc_call(PDC_PAT_MEM, PDC_PAT_MEM_CELL_INFO, + __pa(&pdc_result), cell); + if (retval == PDC_OK) + memcpy(rinfo, &pdc_result, sizeof(*rinfo)); + spin_unlock_irqrestore(&pdc_lock, flags); + + return retval; +} + +/** * pdc_pat_mem_read_cell_pdt - Read PDT entries from (old) PAT firmware * @pret: array of PDT entries * @pdt_entries_ptr: ptr to hold number of PDT entries @@ -1455,14 +1497,14 @@ int pdc_pat_mem_read_cell_pdt(struct pdc_pat_mem_read_pd_retinfo *pret, spin_lock_irqsave(&pdc_lock, flags); /* PDC_PAT_MEM_CELL_READ is available on early PAT machines only */ retval = mem_pdc_call(PDC_PAT_MEM, PDC_PAT_MEM_CELL_READ, - __pa(&pdc_result), parisc_cell_num, __pa(&pdc_result2)); + __pa(&pdc_result), parisc_cell_num, + __pa(pdt_entries_ptr)); if (retval == PDC_OK) { /* build up return value as for PDC_PAT_MEM_PD_READ */ entries = min(pdc_result[0], max_entries); pret->pdt_entries = entries; pret->actual_count_bytes = entries * sizeof(unsigned long); - memcpy(pdt_entries_ptr, &pdc_result2, pret->actual_count_bytes); } spin_unlock_irqrestore(&pdc_lock, flags); @@ -1474,6 +1516,8 @@ int pdc_pat_mem_read_cell_pdt(struct pdc_pat_mem_read_pd_retinfo *pret, * pdc_pat_mem_read_pd_pdt - Read PDT entries from (newer) PAT firmware * @pret: array of PDT entries * @pdt_entries_ptr: ptr to hold number of PDT entries + * @count: number of bytes to read + * @offset: offset to start (in bytes) * */ int pdc_pat_mem_read_pd_pdt(struct pdc_pat_mem_read_pd_retinfo *pret, @@ -1524,6 +1568,7 @@ int pdc_pat_mem_get_dimm_phys_location( return retval; } #endif /* CONFIG_64BIT */ +#endif /* defined(BOOTLOADER) */ /***************** 32-bit real-mode calls ***********/ @@ -1633,4 +1678,3 @@ long real64_call(unsigned long fn, ...) } #endif /* CONFIG_64BIT */ - diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index 5f0067a62738..bd4c0a7471d3 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -41,7 +41,7 @@ static unsigned long pcxl_used_bytes __read_mostly = 0; static unsigned long pcxl_used_pages __read_mostly = 0; extern unsigned long pcxl_dma_start; /* Start of pcxl dma mapping area */ -static spinlock_t pcxl_res_lock; +static DEFINE_SPINLOCK(pcxl_res_lock); static char *pcxl_res_map; static int pcxl_res_hint; static int pcxl_res_size; @@ -390,7 +390,6 @@ pcxl_dma_init(void) if (pcxl_dma_start == 0) return 0; - spin_lock_init(&pcxl_res_lock); pcxl_res_size = PCXL_DMA_MAP_SIZE >> (PAGE_SHIFT + 3); pcxl_res_hint = 0; pcxl_res_map = (char *)__get_free_pages(GFP_KERNEL, diff --git a/arch/parisc/kernel/pdt.c b/arch/parisc/kernel/pdt.c index d02874ecb94d..05730a83895c 100644 --- a/arch/parisc/kernel/pdt.c +++ b/arch/parisc/kernel/pdt.c @@ -1,19 +1,20 @@ /* * Page Deallocation Table (PDT) support * - * The Page Deallocation Table (PDT) holds a table with pointers to bad - * memory (broken RAM modules) which is maintained by firmware. + * The Page Deallocation Table (PDT) is maintained by firmware and holds a + * list of memory addresses in which memory errors were detected. + * The list contains both single-bit (correctable) and double-bit + * (uncorrectable) errors. * * Copyright 2017 by Helge Deller <deller@gmx.de> * - * TODO: - * - check regularily for new bad memory - * - add userspace interface with procfs or sysfs - * - increase number of PDT entries dynamically + * possible future enhancements: + * - add userspace interface via procfs or sysfs to clear PDT */ #include <linux/memblock.h> #include <linux/seq_file.h> +#include <linux/kthread.h> #include <asm/pdc.h> #include <asm/pdcpat.h> @@ -24,11 +25,16 @@ enum pdt_access_type { PDT_NONE, PDT_PDC, PDT_PAT_NEW, - PDT_PAT_OLD + PDT_PAT_CELL }; static enum pdt_access_type pdt_type; +/* PDT poll interval: 1 minute if errors, 5 minutes if everything OK. */ +#define PDT_POLL_INTERVAL_DEFAULT (5*60*HZ) +#define PDT_POLL_INTERVAL_SHORT (1*60*HZ) +static unsigned long pdt_poll_interval = PDT_POLL_INTERVAL_DEFAULT; + /* global PDT status information */ static struct pdc_mem_retinfo pdt_status; @@ -36,6 +42,21 @@ static struct pdc_mem_retinfo pdt_status; #define MAX_PDT_ENTRIES (MAX_PDT_TABLE_SIZE / sizeof(unsigned long)) static unsigned long pdt_entry[MAX_PDT_ENTRIES] __page_aligned_bss; +/* + * Constants for the pdt_entry format: + * A pdt_entry holds the physical address in bits 0-57, bits 58-61 are + * reserved, bit 62 is the perm bit and bit 63 is the error_type bit. + * The perm bit indicates whether the error have been verified as a permanent + * error (value of 1) or has not been verified, and may be transient (value + * of 0). The error_type bit indicates whether the error is a single bit error + * (value of 1) or a multiple bit error. + * On non-PAT machines phys_addr is encoded in bits 0-59 and error_type in bit + * 63. Those machines don't provide the perm bit. + */ + +#define PDT_ADDR_PHYS_MASK (pdt_type != PDT_PDC ? ~0x3f : ~0x0f) +#define PDT_ADDR_PERM_ERR (pdt_type != PDT_PDC ? 2UL : 0UL) +#define PDT_ADDR_SINGLE_ERR 1UL /* report PDT entries via /proc/meminfo */ void arch_report_meminfo(struct seq_file *m) @@ -49,6 +70,68 @@ void arch_report_meminfo(struct seq_file *m) pdt_status.pdt_entries); } +static int get_info_pat_new(void) +{ + struct pdc_pat_mem_retinfo pat_rinfo; + int ret; + + /* newer PAT machines like C8000 report info for all cells */ + if (is_pdc_pat()) + ret = pdc_pat_mem_pdt_info(&pat_rinfo); + else + return PDC_BAD_PROC; + + pdt_status.pdt_size = pat_rinfo.max_pdt_entries; + pdt_status.pdt_entries = pat_rinfo.current_pdt_entries; + pdt_status.pdt_status = 0; + pdt_status.first_dbe_loc = pat_rinfo.first_dbe_loc; + pdt_status.good_mem = pat_rinfo.good_mem; + + return ret; +} + +static int get_info_pat_cell(void) +{ + struct pdc_pat_mem_cell_pdt_retinfo cell_rinfo; + int ret; + + /* older PAT machines like rp5470 report cell info only */ + if (is_pdc_pat()) + ret = pdc_pat_mem_pdt_cell_info(&cell_rinfo, parisc_cell_num); + else + return PDC_BAD_PROC; + + pdt_status.pdt_size = cell_rinfo.max_pdt_entries; + pdt_status.pdt_entries = cell_rinfo.current_pdt_entries; + pdt_status.pdt_status = 0; + pdt_status.first_dbe_loc = cell_rinfo.first_dbe_loc; + pdt_status.good_mem = cell_rinfo.good_mem; + + return ret; +} + +static void report_mem_err(unsigned long pde) +{ + struct pdc_pat_mem_phys_mem_location loc; + unsigned long addr; + char dimm_txt[32]; + + addr = pde & PDT_ADDR_PHYS_MASK; + + /* show DIMM slot description on PAT machines */ + if (is_pdc_pat()) { + pdc_pat_mem_get_dimm_phys_location(&loc, addr); + sprintf(dimm_txt, "DIMM slot %02x, ", loc.dimm_slot); + } else + dimm_txt[0] = 0; + + pr_warn("PDT: BAD MEMORY at 0x%08lx, %s%s%s-bit error.\n", + addr, dimm_txt, + pde & PDT_ADDR_PERM_ERR ? "permanent ":"", + pde & PDT_ADDR_SINGLE_ERR ? "single":"multi"); +} + + /* * pdc_pdt_init() * @@ -63,18 +146,17 @@ void __init pdc_pdt_init(void) unsigned long entries; struct pdc_mem_read_pdt pdt_read_ret; - if (is_pdc_pat()) { - struct pdc_pat_mem_retinfo pat_rinfo; + pdt_type = PDT_PAT_NEW; + ret = get_info_pat_new(); - pdt_type = PDT_PAT_NEW; - ret = pdc_pat_mem_pdt_info(&pat_rinfo); - pdt_status.pdt_size = pat_rinfo.max_pdt_entries; - pdt_status.pdt_entries = pat_rinfo.current_pdt_entries; - pdt_status.pdt_status = 0; - pdt_status.first_dbe_loc = pat_rinfo.first_dbe_loc; - pdt_status.good_mem = pat_rinfo.good_mem; - } else { + if (ret != PDC_OK) { + pdt_type = PDT_PAT_CELL; + ret = get_info_pat_cell(); + } + + if (ret != PDC_OK) { pdt_type = PDT_PDC; + /* non-PAT machines provide the standard PDC call */ ret = pdc_mem_pdt_info(&pdt_status); } @@ -86,13 +168,17 @@ void __init pdc_pdt_init(void) } entries = pdt_status.pdt_entries; - WARN_ON(entries > MAX_PDT_ENTRIES); + if (WARN_ON(entries > MAX_PDT_ENTRIES)) + entries = pdt_status.pdt_entries = MAX_PDT_ENTRIES; - pr_info("PDT: size %lu, entries %lu, status %lu, dbe_loc 0x%lx," - " good_mem %lu\n", + pr_info("PDT: type %s, size %lu, entries %lu, status %lu, dbe_loc 0x%lx," + " good_mem %lu MB\n", + pdt_type == PDT_PDC ? __stringify(PDT_PDC) : + pdt_type == PDT_PAT_CELL ? __stringify(PDT_PAT_CELL) + : __stringify(PDT_PAT_NEW), pdt_status.pdt_size, pdt_status.pdt_entries, pdt_status.pdt_status, pdt_status.first_dbe_loc, - pdt_status.good_mem); + pdt_status.good_mem / 1024 / 1024); if (entries == 0) { pr_info("PDT: Firmware reports all memory OK.\n"); @@ -112,15 +198,12 @@ void __init pdc_pdt_init(void) #ifdef CONFIG_64BIT struct pdc_pat_mem_read_pd_retinfo pat_pret; - /* try old obsolete PAT firmware function first */ - pdt_type = PDT_PAT_OLD; - ret = pdc_pat_mem_read_cell_pdt(&pat_pret, pdt_entry, - MAX_PDT_ENTRIES); - if (ret != PDC_OK) { - pdt_type = PDT_PAT_NEW; + if (pdt_type == PDT_PAT_CELL) + ret = pdc_pat_mem_read_cell_pdt(&pat_pret, pdt_entry, + MAX_PDT_ENTRIES); + else ret = pdc_pat_mem_read_pd_pdt(&pat_pret, pdt_entry, MAX_PDT_TABLE_SIZE, 0); - } #else ret = PDC_BAD_PROC; #endif @@ -128,27 +211,142 @@ void __init pdc_pdt_init(void) if (ret != PDC_OK) { pdt_type = PDT_NONE; - pr_debug("PDT type %d, retval = %d\n", pdt_type, ret); + pr_warn("PDT: Get PDT entries failed with %d\n", ret); return; } for (i = 0; i < pdt_status.pdt_entries; i++) { - struct pdc_pat_mem_phys_mem_location loc; + report_mem_err(pdt_entry[i]); + + /* mark memory page bad */ + memblock_reserve(pdt_entry[i] & PAGE_MASK, PAGE_SIZE); + } +} - /* get DIMM slot number */ - loc.dimm_slot = 0xff; + +/* + * This is the PDT kernel thread main loop. + */ + +static int pdt_mainloop(void *unused) +{ + struct pdc_mem_read_pdt pdt_read_ret; + struct pdc_pat_mem_read_pd_retinfo pat_pret __maybe_unused; + unsigned long old_num_entries; + unsigned long *bad_mem_ptr; + int num, ret; + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + old_num_entries = pdt_status.pdt_entries; + + schedule_timeout(pdt_poll_interval); + if (kthread_should_stop()) + break; + + /* Do we have new PDT entries? */ + switch (pdt_type) { + case PDT_PAT_NEW: + ret = get_info_pat_new(); + break; + case PDT_PAT_CELL: + ret = get_info_pat_cell(); + break; + default: + ret = pdc_mem_pdt_info(&pdt_status); + break; + } + + if (ret != PDC_OK) { + pr_warn("PDT: unexpected failure %d\n", ret); + return -EINVAL; + } + + /* if no new PDT entries, just wait again */ + num = pdt_status.pdt_entries - old_num_entries; + if (num <= 0) + continue; + + /* decrease poll interval in case we found memory errors */ + if (pdt_status.pdt_entries && + pdt_poll_interval == PDT_POLL_INTERVAL_DEFAULT) + pdt_poll_interval = PDT_POLL_INTERVAL_SHORT; + + /* limit entries to get */ + if (num > MAX_PDT_ENTRIES) { + num = MAX_PDT_ENTRIES; + pdt_status.pdt_entries = old_num_entries + num; + } + + /* get new entries */ + switch (pdt_type) { #ifdef CONFIG_64BIT - pdc_pat_mem_get_dimm_phys_location(&loc, pdt_entry[i]); + case PDT_PAT_CELL: + if (pdt_status.pdt_entries > MAX_PDT_ENTRIES) { + pr_crit("PDT: too many entries.\n"); + return -ENOMEM; + } + ret = pdc_pat_mem_read_cell_pdt(&pat_pret, pdt_entry, + MAX_PDT_ENTRIES); + bad_mem_ptr = &pdt_entry[old_num_entries]; + break; + case PDT_PAT_NEW: + ret = pdc_pat_mem_read_pd_pdt(&pat_pret, + pdt_entry, + num * sizeof(unsigned long), + old_num_entries * sizeof(unsigned long)); + bad_mem_ptr = &pdt_entry[0]; + break; #endif + default: + ret = pdc_mem_pdt_read_entries(&pdt_read_ret, + pdt_entry); + bad_mem_ptr = &pdt_entry[old_num_entries]; + break; + } - pr_warn("PDT: BAD PAGE #%d at 0x%08lx, " - "DIMM slot %02x (error_type = %lu)\n", - i, - pdt_entry[i] & PAGE_MASK, - loc.dimm_slot, - pdt_entry[i] & 1); + /* report and mark memory broken */ + while (num--) { + unsigned long pde = *bad_mem_ptr++; - /* mark memory page bad */ - memblock_reserve(pdt_entry[i] & PAGE_MASK, PAGE_SIZE); + report_mem_err(pde); + +#ifdef CONFIG_MEMORY_FAILURE + if ((pde & PDT_ADDR_PERM_ERR) || + ((pde & PDT_ADDR_SINGLE_ERR) == 0)) + memory_failure(pde >> PAGE_SHIFT, 0, 0); + else + soft_offline_page( + pfn_to_page(pde >> PAGE_SHIFT), 0); +#else + pr_crit("PDT: memory error at 0x%lx ignored.\n" + "Rebuild kernel with CONFIG_MEMORY_FAILURE=y " + "for real handling.\n", + pde & PDT_ADDR_PHYS_MASK); +#endif + + } } + + return 0; } + + +static int __init pdt_initcall(void) +{ + struct task_struct *kpdtd_task; + + if (pdt_type == PDT_NONE) + return -ENODEV; + + kpdtd_task = kthread_create(pdt_mainloop, NULL, "kpdtd"); + if (IS_ERR(kpdtd_task)) + return PTR_ERR(kpdtd_task); + + wake_up_process(kpdtd_task); + + return 0; +} + +late_initcall(pdt_initcall); diff --git a/arch/parisc/kernel/perf.c b/arch/parisc/kernel/perf.c index 6017a5af2e6e..0813359049ae 100644 --- a/arch/parisc/kernel/perf.c +++ b/arch/parisc/kernel/perf.c @@ -69,7 +69,7 @@ struct rdr_tbl_ent { static int perf_processor_interface __read_mostly = UNKNOWN_INTF; static int perf_enabled __read_mostly; -static spinlock_t perf_lock; +static DEFINE_SPINLOCK(perf_lock); struct parisc_device *cpu_device __read_mostly; /* RDRs to write for PCX-W */ @@ -533,8 +533,6 @@ static int __init perf_init(void) /* Patch the images to match the system */ perf_patch_images(); - spin_lock_init(&perf_lock); - /* TODO: this only lets us access the first cpu.. what to do for SMP? */ cpu_device = per_cpu(cpu_data, 0).dev; printk("Performance monitoring counters enabled for %s\n", diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index 0ab32779dfa7..a778bd3c107c 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -30,6 +30,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/seq_file.h> +#include <linux/random.h> #include <linux/slab.h> #include <linux/cpu.h> #include <asm/param.h> @@ -89,7 +90,7 @@ init_percpu_prof(unsigned long cpunum) * (return 1). If so, initialize the chip and tell other partners in crime * they have work to do. */ -static int processor_probe(struct parisc_device *dev) +static int __init processor_probe(struct parisc_device *dev) { unsigned long txn_addr; unsigned long cpuid; @@ -237,28 +238,45 @@ static int processor_probe(struct parisc_device *dev) */ void __init collect_boot_cpu_data(void) { + unsigned long cr16_seed; + memset(&boot_cpu_data, 0, sizeof(boot_cpu_data)); + cr16_seed = get_cycles(); + add_device_randomness(&cr16_seed, sizeof(cr16_seed)); + boot_cpu_data.cpu_hz = 100 * PAGE0->mem_10msec; /* Hz of this PARISC */ /* get CPU-Model Information... */ #define p ((unsigned long *)&boot_cpu_data.pdc.model) - if (pdc_model_info(&boot_cpu_data.pdc.model) == PDC_OK) + if (pdc_model_info(&boot_cpu_data.pdc.model) == PDC_OK) { printk(KERN_INFO "model %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8]); + + add_device_randomness(&boot_cpu_data.pdc.model, + sizeof(boot_cpu_data.pdc.model)); + } #undef p - if (pdc_model_versions(&boot_cpu_data.pdc.versions, 0) == PDC_OK) + if (pdc_model_versions(&boot_cpu_data.pdc.versions, 0) == PDC_OK) { printk(KERN_INFO "vers %08lx\n", boot_cpu_data.pdc.versions); - if (pdc_model_cpuid(&boot_cpu_data.pdc.cpuid) == PDC_OK) + add_device_randomness(&boot_cpu_data.pdc.versions, + sizeof(boot_cpu_data.pdc.versions)); + } + + if (pdc_model_cpuid(&boot_cpu_data.pdc.cpuid) == PDC_OK) { printk(KERN_INFO "CPUID vers %ld rev %ld (0x%08lx)\n", (boot_cpu_data.pdc.cpuid >> 5) & 127, boot_cpu_data.pdc.cpuid & 31, boot_cpu_data.pdc.cpuid); + add_device_randomness(&boot_cpu_data.pdc.cpuid, + sizeof(boot_cpu_data.pdc.cpuid)); + } + if (pdc_model_capabilities(&boot_cpu_data.pdc.capabilities) == PDC_OK) printk(KERN_INFO "capabilities 0x%lx\n", boot_cpu_data.pdc.capabilities); @@ -414,12 +432,12 @@ show_cpuinfo (struct seq_file *m, void *v) return 0; } -static const struct parisc_device_id processor_tbl[] = { +static const struct parisc_device_id processor_tbl[] __initconst = { { HPHW_NPROC, HVERSION_REV_ANY_ID, HVERSION_ANY_ID, SVERSION_ANY_ID }, { 0, } }; -static struct parisc_driver cpu_driver = { +static struct parisc_driver cpu_driver __refdata = { .name = "CPU", .id_table = processor_tbl, .probe = processor_probe diff --git a/arch/parisc/kernel/real2.S b/arch/parisc/kernel/real2.S index 1db58e546230..cc9963421a19 100644 --- a/arch/parisc/kernel/real2.S +++ b/arch/parisc/kernel/real2.S @@ -162,6 +162,7 @@ ENDPROC_CFI(restore_control_regs) .text .align 128 ENTRY_CFI(rfi_virt2real) +#if !defined(BOOTLOADER) /* switch to real mode... */ rsm PSW_SM_I,%r0 load32 PA(rfi_v2r_1), %r1 @@ -191,6 +192,7 @@ ENTRY_CFI(rfi_virt2real) nop rfi_v2r_1: tophys_r1 %r2 +#endif /* defined(BOOTLOADER) */ bv 0(%r2) nop ENDPROC_CFI(rfi_virt2real) @@ -198,6 +200,7 @@ ENDPROC_CFI(rfi_virt2real) .text .align 128 ENTRY_CFI(rfi_real2virt) +#if !defined(BOOTLOADER) rsm PSW_SM_I,%r0 load32 (rfi_r2v_1), %r1 nop @@ -226,6 +229,7 @@ ENTRY_CFI(rfi_real2virt) nop rfi_r2v_1: tovirt_r1 %r2 +#endif /* defined(BOOTLOADER) */ bv 0(%r2) nop ENDPROC_CFI(rfi_real2virt) diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c index 1b73690477c5..48dc7d4d20bb 100644 --- a/arch/parisc/kernel/unwind.c +++ b/arch/parisc/kernel/unwind.c @@ -34,7 +34,7 @@ extern struct unwind_table_entry __start___unwind[]; extern struct unwind_table_entry __stop___unwind[]; -static spinlock_t unwind_lock; +static DEFINE_SPINLOCK(unwind_lock); /* * the kernel unwind block is not dynamically allocated so that * we can call unwind_init as early in the bootup process as @@ -181,8 +181,6 @@ int __init unwind_init(void) start = (long)&__start___unwind[0]; stop = (long)&__stop___unwind[0]; - spin_lock_init(&unwind_lock); - printk("unwind_init: start = 0x%lx, end = 0x%lx, entries = %lu\n", start, stop, (stop - start) / sizeof(struct unwind_table_entry)); diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c index 99115cd9e790..865a7f796c7f 100644 --- a/arch/parisc/lib/memcpy.c +++ b/arch/parisc/lib/memcpy.c @@ -27,8 +27,6 @@ #include <linux/compiler.h> #include <linux/uaccess.h> -DECLARE_PER_CPU(struct exception_data, exception_data); - #define get_user_space() (uaccess_kernel() ? 0 : mfsp(3)) #define get_kernel_space() (0) diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index 25d42bd3f114..9c601adfc500 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -74,13 +74,6 @@ do { \ ___p1; \ }) -/* - * This must resolve to hwsync on SMP for the context switch path. - * See _switch, and core scheduler context switch memory ordering - * comments. - */ -#define smp_mb__before_spinlock() smp_mb() - #include <asm-generic/barrier.h> #endif /* _ASM_POWERPC_BARRIER_H */ diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h index eaada6c92344..719ed9b61ea7 100644 --- a/arch/powerpc/include/asm/futex.h +++ b/arch/powerpc/include/asm/futex.h @@ -29,18 +29,10 @@ : "b" (uaddr), "i" (-EFAULT), "r" (oparg) \ : "cr0", "memory") -static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -66,17 +58,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 8c1b913de6d7..edbe571bcc54 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -170,39 +170,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) lock->slock = 0; } -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - arch_spinlock_t lock_val; - - smp_mb(); - - /* - * Atomically load and store back the lock value (unchanged). This - * ensures that our observation of the lock value is ordered with - * respect to other lock operations. - */ - __asm__ __volatile__( -"1: " PPC_LWARX(%0, 0, %2, 0) "\n" -" stwcx. %0, 0, %2\n" -" bne- 1b\n" - : "=&r" (lock_val), "+m" (*lock) - : "r" (lock) - : "cr0", "xer"); - - if (arch_spin_value_unlocked(lock_val)) - goto out; - - while (lock->slock) { - HMT_low(); - if (SHARED_PROCESSOR) - __spin_yield(lock); - } - HMT_medium(); - -out: - smp_mb(); -} - /* * Read-write spinlocks, allowing multiple readers * but only one writer. @@ -342,5 +309,8 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) #define arch_read_relax(lock) __rw_yield(lock) #define arch_write_relax(lock) __rw_yield(lock) +/* See include/linux/spinlock.h */ +#define smp_mb__after_spinlock() smp_mb() + #endif /* __KERNEL__ */ #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 6c2d4168daec..2e3eb7431571 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2039,7 +2039,8 @@ static void record_and_restart(struct perf_event *event, unsigned long val, perf_sample_data_init(&data, ~0ULL, event->hw.last_period); - if (event->attr.sample_type & PERF_SAMPLE_ADDR) + if (event->attr.sample_type & + (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) perf_get_data_addr(regs, &data.addr); if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) { diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index ae2f740a82f1..5ffcdeb1eb17 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -1749,7 +1749,7 @@ out: static int spufs_mfc_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file_inode(file); - int err = filemap_write_and_wait_range(inode->i_mapping, start, end); + int err = file_write_and_wait_range(file, start, end); if (!err) { inode_lock(inode); err = spufs_mfc_flush(file, NULL); diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 7eeb75d758c1..48af970320cb 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -222,6 +222,10 @@ config HAVE_MARCH_Z13_FEATURES def_bool n select HAVE_MARCH_ZEC12_FEATURES +config HAVE_MARCH_Z14_FEATURES + def_bool n + select HAVE_MARCH_Z13_FEATURES + choice prompt "Processor type" default MARCH_Z196 @@ -282,6 +286,14 @@ config MARCH_Z13 2964 series). The kernel will be slightly faster but will not work on older machines. +config MARCH_Z14 + bool "IBM z14" + select HAVE_MARCH_Z14_FEATURES + help + Select this to enable optimizations for IBM z14 (3906 series). + The kernel will be slightly faster but will not work on older + machines. + endchoice config MARCH_Z900_TUNE @@ -305,6 +317,9 @@ config MARCH_ZEC12_TUNE config MARCH_Z13_TUNE def_bool TUNE_Z13 || MARCH_Z13 && TUNE_DEFAULT +config MARCH_Z14_TUNE + def_bool TUNE_Z14 || MARCH_Z14 && TUNE_DEFAULT + choice prompt "Tune code generation" default TUNE_DEFAULT @@ -343,6 +358,9 @@ config TUNE_ZEC12 config TUNE_Z13 bool "IBM z13" +config TUNE_Z14 + bool "IBM z14" + endchoice config 64BIT diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 54e00526b8df..dac821cfcd43 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -31,7 +31,8 @@ mflags-$(CONFIG_MARCH_Z9_109) := -march=z9-109 mflags-$(CONFIG_MARCH_Z10) := -march=z10 mflags-$(CONFIG_MARCH_Z196) := -march=z196 mflags-$(CONFIG_MARCH_ZEC12) := -march=zEC12 -mflags-$(CONFIG_MARCH_Z13) := -march=z13 +mflags-$(CONFIG_MARCH_Z13) := -march=z13 +mflags-$(CONFIG_MARCH_Z14) := -march=z14 export CC_FLAGS_MARCH := $(mflags-y) @@ -44,7 +45,8 @@ cflags-$(CONFIG_MARCH_Z9_109_TUNE) += -mtune=z9-109 cflags-$(CONFIG_MARCH_Z10_TUNE) += -mtune=z10 cflags-$(CONFIG_MARCH_Z196_TUNE) += -mtune=z196 cflags-$(CONFIG_MARCH_ZEC12_TUNE) += -mtune=zEC12 -cflags-$(CONFIG_MARCH_Z13_TUNE) += -mtune=z13 +cflags-$(CONFIG_MARCH_Z13_TUNE) += -mtune=z13 +cflags-$(CONFIG_MARCH_Z14_TUNE) += -mtune=z14 cflags-y += -Wa,-I$(srctree)/arch/$(ARCH)/include diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index b3c88479feba..6e2c9f7e47fa 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -16,4 +16,5 @@ generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h generic-y += preempt.h generic-y += trace_clock.h +generic-y += unaligned.h generic-y += word-at-a-time.h diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index b9300f8aee10..07a82bc933a7 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -8,11 +8,12 @@ #include <linux/sched/task_stack.h> #include <linux/thread_info.h> -#define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p(typeof(0?(t)0:0ULL), u64)) +#define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p( \ + typeof(0?(__force t)0:0ULL), u64)) #define __SC_DELOUSE(t,v) ({ \ BUILD_BUG_ON(sizeof(t) > 4 && !__TYPE_IS_PTR(t)); \ - (t)(__TYPE_IS_PTR(t) ? ((v) & 0x7fffffff) : (v)); \ + (__force t)(__TYPE_IS_PTR(t) ? ((v) & 0x7fffffff) : (v)); \ }) #define PSW32_MASK_PER 0x40000000UL diff --git a/arch/s390/include/asm/cpcmd.h b/arch/s390/include/asm/cpcmd.h index 3dfadb5d648f..ca2b0624ad46 100644 --- a/arch/s390/include/asm/cpcmd.h +++ b/arch/s390/include/asm/cpcmd.h @@ -10,9 +10,8 @@ /* * the lowlevel function for cpcmd - * the caller of __cpcmd has to ensure that the response buffer is below 2 GB */ -extern int __cpcmd(const char *cmd, char *response, int rlen, int *response_code); +int __cpcmd(const char *cmd, char *response, int rlen, int *response_code); /* * cpcmd is the in-kernel interface for issuing CP commands @@ -25,8 +24,8 @@ extern int __cpcmd(const char *cmd, char *response, int rlen, int *response_code * response_code: return pointer for VM's error code * return value: the size of the response. The caller can check if the buffer * was large enough by comparing the return value and rlen - * NOTE: If the response buffer is not below 2 GB, cpcmd can sleep + * NOTE: If the response buffer is not in real storage, cpcmd can sleep */ -extern int cpcmd(const char *cmd, char *response, int rlen, int *response_code); +int cpcmd(const char *cmd, char *response, int rlen, int *response_code); #endif /* _ASM_S390_CPCMD_H */ diff --git a/arch/s390/include/asm/ebcdic.h b/arch/s390/include/asm/ebcdic.h index c5befc5a3bf5..b71735eab23f 100644 --- a/arch/s390/include/asm/ebcdic.h +++ b/arch/s390/include/asm/ebcdic.h @@ -9,9 +9,7 @@ #ifndef _EBCDIC_H #define _EBCDIC_H -#ifndef _S390_TYPES_H -#include <types.h> -#endif +#include <linux/types.h> extern __u8 _ascebc_500[256]; /* ASCII -> EBCDIC 500 conversion table */ extern __u8 _ebcasc_500[256]; /* EBCDIC 500 -> ASCII conversion table */ diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index c92ed0170be2..65998a1f5d43 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -191,7 +191,7 @@ struct arch_elf_state { } while (0) #define CORE_DUMP_USE_REGSET -#define ELF_EXEC_PAGESIZE 4096 +#define ELF_EXEC_PAGESIZE PAGE_SIZE /* * This is the base location for PIE (ET_DYN with INTERP) loads. On diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h index a4811aa0304d..8f8eec9e1198 100644 --- a/arch/s390/include/asm/futex.h +++ b/arch/s390/include/asm/futex.h @@ -21,17 +21,12 @@ : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ "m" (*uaddr) : "cc"); -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, newval, ret; load_kernel_asce(); - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; pagefault_disable(); switch (op) { @@ -60,17 +55,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) } pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index edb5161df7e2..6810bd757312 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -81,7 +81,7 @@ struct ipl_parameter_block { struct ipl_block_fcp fcp; struct ipl_block_ccw ccw; } ipl_info; -} __attribute__((packed,aligned(4096))); +} __packed __aligned(PAGE_SIZE); /* * IPL validity flags diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 8a5b082797f8..a6870ea6ea8b 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -95,46 +95,46 @@ struct lowcore { __u64 int_clock; /* 0x0310 */ __u64 mcck_clock; /* 0x0318 */ __u64 clock_comparator; /* 0x0320 */ + __u64 boot_clock[2]; /* 0x0328 */ /* Current process. */ - __u64 current_task; /* 0x0328 */ - __u8 pad_0x318[0x320-0x318]; /* 0x0330 */ - __u64 kernel_stack; /* 0x0338 */ + __u64 current_task; /* 0x0338 */ + __u64 kernel_stack; /* 0x0340 */ /* Interrupt, panic and restart stack. */ - __u64 async_stack; /* 0x0340 */ - __u64 panic_stack; /* 0x0348 */ - __u64 restart_stack; /* 0x0350 */ + __u64 async_stack; /* 0x0348 */ + __u64 panic_stack; /* 0x0350 */ + __u64 restart_stack; /* 0x0358 */ /* Restart function and parameter. */ - __u64 restart_fn; /* 0x0358 */ - __u64 restart_data; /* 0x0360 */ - __u64 restart_source; /* 0x0368 */ + __u64 restart_fn; /* 0x0360 */ + __u64 restart_data; /* 0x0368 */ + __u64 restart_source; /* 0x0370 */ /* Address space pointer. */ - __u64 kernel_asce; /* 0x0370 */ - __u64 user_asce; /* 0x0378 */ + __u64 kernel_asce; /* 0x0378 */ + __u64 user_asce; /* 0x0380 */ /* * The lpp and current_pid fields form a * 64-bit value that is set as program * parameter with the LPP instruction. */ - __u32 lpp; /* 0x0380 */ - __u32 current_pid; /* 0x0384 */ + __u32 lpp; /* 0x0388 */ + __u32 current_pid; /* 0x038c */ /* SMP info area */ - __u32 cpu_nr; /* 0x0388 */ - __u32 softirq_pending; /* 0x038c */ - __u64 percpu_offset; /* 0x0390 */ - __u64 vdso_per_cpu_data; /* 0x0398 */ - __u64 machine_flags; /* 0x03a0 */ - __u32 preempt_count; /* 0x03a8 */ - __u8 pad_0x03ac[0x03b0-0x03ac]; /* 0x03ac */ - __u64 gmap; /* 0x03b0 */ - __u32 spinlock_lockval; /* 0x03b8 */ - __u32 fpu_flags; /* 0x03bc */ - __u8 pad_0x03c0[0x0400-0x03c0]; /* 0x03c0 */ + __u32 cpu_nr; /* 0x0390 */ + __u32 softirq_pending; /* 0x0394 */ + __u64 percpu_offset; /* 0x0398 */ + __u64 vdso_per_cpu_data; /* 0x03a0 */ + __u64 machine_flags; /* 0x03a8 */ + __u32 preempt_count; /* 0x03b0 */ + __u8 pad_0x03b4[0x03b8-0x03b4]; /* 0x03b4 */ + __u64 gmap; /* 0x03b8 */ + __u32 spinlock_lockval; /* 0x03c0 */ + __u32 fpu_flags; /* 0x03c4 */ + __u8 pad_0x03c8[0x0400-0x03c8]; /* 0x03c8 */ /* Per cpu primary space access list */ __u32 paste[16]; /* 0x0400 */ diff --git a/arch/s390/include/asm/mman.h b/arch/s390/include/asm/mman.h deleted file mode 100644 index b79813d9cf68..000000000000 --- a/arch/s390/include/asm/mman.h +++ /dev/null @@ -1,11 +0,0 @@ -/* - * S390 version - * - * Derived from "include/asm-i386/mman.h" - */ -#ifndef __S390_MMAN_H__ -#define __S390_MMAN_H__ - -#include <uapi/asm/mman.h> - -#endif /* __S390_MMAN_H__ */ diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 24bc41622a98..72e9ca83a668 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -12,6 +12,7 @@ #include <linux/mm_types.h> #include <asm/tlbflush.h> #include <asm/ctl_reg.h> +#include <asm-generic/mm_hooks.h> static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) @@ -33,7 +34,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.use_cmma = 0; #endif switch (mm->context.asce_limit) { - case 1UL << 42: + case _REGION2_SIZE: /* * forked 3-level task, fall through to set new asce with new * mm->pgd @@ -49,12 +50,12 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION1; break; - case 1UL << 53: + case _REGION1_SIZE: /* forked 4-level task, set new asce with new mm->pgd */ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION2; break; - case 1UL << 31: + case _REGION3_SIZE: /* forked 2-level compat task, set new asce with new mm->pgd */ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; @@ -138,30 +139,4 @@ static inline void activate_mm(struct mm_struct *prev, set_user_asce(next); } -static inline void arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) -{ -} - -static inline void arch_exit_mmap(struct mm_struct *mm) -{ -} - -static inline void arch_unmap(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ -} - -static inline void arch_bprm_mm_init(struct mm_struct *mm, - struct vm_area_struct *vma) -{ -} - -static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, - bool write, bool execute, bool foreign) -{ - /* by default, allow everything */ - return true; -} #endif /* __S390_MMU_CONTEXT_H */ diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index 9d91cf3e427f..c8e211b9a002 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -72,7 +72,7 @@ union mci { u64 ar : 1; /* 33 access register validity */ u64 da : 1; /* 34 delayed access exception */ u64 : 1; /* 35 */ - u64 gs : 1; /* 36 guarded storage registers */ + u64 gs : 1; /* 36 guarded storage registers validity */ u64 : 5; /* 37-41 */ u64 pr : 1; /* 42 tod programmable register validity */ u64 fc : 1; /* 43 fp control register validity */ diff --git a/arch/s390/include/asm/page-states.h b/arch/s390/include/asm/page-states.h index 42267a2fe29e..ca21b28a7b17 100644 --- a/arch/s390/include/asm/page-states.h +++ b/arch/s390/include/asm/page-states.h @@ -13,6 +13,7 @@ #define ESSA_SET_POT_VOLATILE 4 #define ESSA_SET_STABLE_RESIDENT 5 #define ESSA_SET_STABLE_IF_RESIDENT 6 +#define ESSA_SET_STABLE_NODAT 7 #define ESSA_MAX ESSA_SET_STABLE_IF_RESIDENT diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 624deaa44230..5d5c2b3500a4 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -10,10 +10,14 @@ #include <linux/const.h> #include <asm/types.h> +#define _PAGE_SHIFT 12 +#define _PAGE_SIZE (_AC(1, UL) << _PAGE_SHIFT) +#define _PAGE_MASK (~(_PAGE_SIZE - 1)) + /* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT 12 -#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) +#define PAGE_SHIFT _PAGE_SHIFT +#define PAGE_SIZE _PAGE_SIZE +#define PAGE_MASK _PAGE_MASK #define PAGE_DEFAULT_ACC 0 #define PAGE_DEFAULT_KEY (PAGE_DEFAULT_ACC << 4) @@ -133,6 +137,9 @@ static inline int page_reset_referenced(unsigned long addr) struct page; void arch_free_page(struct page *page, int order); void arch_alloc_page(struct page *page, int order); +void arch_set_page_dat(struct page *page, int order); +void arch_set_page_nodat(struct page *page, int order); +int arch_test_page_nodat(struct page *page); void arch_set_page_states(int make_stable); static inline int devmem_is_allowed(unsigned long pfn) @@ -145,16 +152,26 @@ static inline int devmem_is_allowed(unsigned long pfn) #endif /* !__ASSEMBLY__ */ -#define __PAGE_OFFSET 0x0UL -#define PAGE_OFFSET 0x0UL -#define __pa(x) (unsigned long)(x) -#define __va(x) (void *)(unsigned long)(x) -#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define __PAGE_OFFSET 0x0UL +#define PAGE_OFFSET 0x0UL + +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)(unsigned long)(x)) + +#define virt_to_pfn(kaddr) (__pa(kaddr) >> PAGE_SHIFT) #define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT) + +#define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) #define page_to_virt(page) pfn_to_virt(page_to_pfn(page)) +#define phys_to_pfn(kaddr) ((kaddr) >> PAGE_SHIFT) +#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT) + +#define phys_to_page(kaddr) pfn_to_page(phys_to_pfn(kaddr)) +#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) + +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) + #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index bb0ff1bb0c4a..a0d9167519b1 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -15,6 +15,8 @@ #include <linux/gfp.h> #include <linux/mm.h> +#define CRST_ALLOC_ORDER 2 + unsigned long *crst_table_alloc(struct mm_struct *); void crst_table_free(struct mm_struct *, unsigned long *); @@ -42,16 +44,16 @@ static inline void clear_table(unsigned long *s, unsigned long val, size_t n) static inline void crst_table_init(unsigned long *crst, unsigned long entry) { - clear_table(crst, entry, sizeof(unsigned long)*2048); + clear_table(crst, entry, _CRST_TABLE_SIZE); } static inline unsigned long pgd_entry_type(struct mm_struct *mm) { - if (mm->context.asce_limit <= (1UL << 31)) + if (mm->context.asce_limit <= _REGION3_SIZE) return _SEGMENT_ENTRY_EMPTY; - if (mm->context.asce_limit <= (1UL << 42)) + if (mm->context.asce_limit <= _REGION2_SIZE) return _REGION3_ENTRY_EMPTY; - if (mm->context.asce_limit <= (1UL << 53)) + if (mm->context.asce_limit <= _REGION1_SIZE) return _REGION2_ENTRY_EMPTY; return _REGION1_ENTRY_EMPTY; } @@ -119,7 +121,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) if (!table) return NULL; - if (mm->context.asce_limit == (1UL << 31)) { + if (mm->context.asce_limit == _REGION3_SIZE) { /* Forking a compat process with 2 page table levels */ if (!pgtable_pmd_page_ctor(virt_to_page(table))) { crst_table_free(mm, table); @@ -131,7 +133,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - if (mm->context.asce_limit == (1UL << 31)) + if (mm->context.asce_limit == _REGION3_SIZE) pgtable_pmd_page_dtor(virt_to_page(pgd)); crst_table_free(mm, (unsigned long *) pgd); } @@ -158,4 +160,8 @@ static inline void pmd_populate(struct mm_struct *mm, extern void rcu_table_freelist_finish(void); +void vmem_map_init(void); +void *vmem_crst_alloc(unsigned long val); +pte_t *vmem_pte_alloc(void); + #endif /* _S390_PGALLOC_H */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 57057fb1cc07..dce708e061ea 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -11,19 +11,6 @@ #ifndef _ASM_S390_PGTABLE_H #define _ASM_S390_PGTABLE_H -/* - * The Linux memory management assumes a three-level page table setup. - * For s390 64 bit we use up to four of the five levels the hardware - * provides (region first tables are not used). - * - * The "pgd_xxx()" functions are trivial for a folded two-level - * setup: the pgd is never bad, and a pmd always exists (as it's folded - * into the pgd entry) - * - * This file contains the functions and defines necessary to modify and use - * the S390 page table tree. - */ -#ifndef __ASSEMBLY__ #include <linux/sched.h> #include <linux/mm_types.h> #include <linux/page-flags.h> @@ -34,9 +21,6 @@ extern pgd_t swapper_pg_dir[]; extern void paging_init(void); -extern void vmem_map_init(void); -pmd_t *vmem_pmd_alloc(void); -pte_t *vmem_pte_alloc(void); enum { PG_DIRECT_MAP_4K = 0, @@ -77,38 +61,6 @@ extern unsigned long zero_page_mask; #define __HAVE_COLOR_ZERO_PAGE /* TODO: s390 cannot support io_remap_pfn_range... */ -#endif /* !__ASSEMBLY__ */ - -/* - * PMD_SHIFT determines the size of the area a second-level page - * table can map - * PGDIR_SHIFT determines what a third-level page table entry can map - */ -#define PMD_SHIFT 20 -#define PUD_SHIFT 31 -#define P4D_SHIFT 42 -#define PGDIR_SHIFT 53 - -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) -#define PUD_SIZE (1UL << PUD_SHIFT) -#define PUD_MASK (~(PUD_SIZE-1)) -#define P4D_SIZE (1UL << P4D_SHIFT) -#define P4D_MASK (~(P4D_SIZE-1)) -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -/* - * entries per page directory level: the S390 is two-level, so - * we don't really have any PMD directory physically. - * for S390 segment-table entries are combined to one PGD - * that leads to 1024 pte per pgd - */ -#define PTRS_PER_PTE 256 -#define PTRS_PER_PMD 2048 -#define PTRS_PER_PUD 2048 -#define PTRS_PER_P4D 2048 -#define PTRS_PER_PGD 2048 #define FIRST_USER_ADDRESS 0UL @@ -123,7 +75,6 @@ extern unsigned long zero_page_mask; #define pgd_ERROR(e) \ printk("%s:%d: bad pgd %p.\n", __FILE__, __LINE__, (void *) pgd_val(e)) -#ifndef __ASSEMBLY__ /* * The vmalloc and module area will always be on the topmost area of the * kernel mapping. We reserve 128GB (64bit) for vmalloc and modules. @@ -269,7 +220,7 @@ static inline int is_module_addr(void *addr) */ /* Bits in the segment/region table address-space-control-element */ -#define _ASCE_ORIGIN ~0xfffUL/* segment table origin */ +#define _ASCE_ORIGIN ~0xfffUL/* region/segment table origin */ #define _ASCE_PRIVATE_SPACE 0x100 /* private space control */ #define _ASCE_ALT_EVENT 0x80 /* storage alteration event control */ #define _ASCE_SPACE_SWITCH 0x40 /* space switch event */ @@ -320,9 +271,9 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL #define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff0ff33UL #define _SEGMENT_ENTRY_ORIGIN_LARGE ~0xfffffUL /* large page address */ -#define _SEGMENT_ENTRY_ORIGIN ~0x7ffUL/* segment table origin */ -#define _SEGMENT_ENTRY_PROTECT 0x200 /* page protection bit */ -#define _SEGMENT_ENTRY_NOEXEC 0x100 /* region no-execute bit */ +#define _SEGMENT_ENTRY_ORIGIN ~0x7ffUL/* page table origin */ +#define _SEGMENT_ENTRY_PROTECT 0x200 /* segment protection bit */ +#define _SEGMENT_ENTRY_NOEXEC 0x100 /* segment no-execute bit */ #define _SEGMENT_ENTRY_INVALID 0x20 /* invalid segment table entry */ #define _SEGMENT_ENTRY (0) @@ -340,6 +291,54 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_SOFT_DIRTY 0x0000 /* SW segment soft dirty bit */ #endif +#define _CRST_ENTRIES 2048 /* number of region/segment table entries */ +#define _PAGE_ENTRIES 256 /* number of page table entries */ + +#define _CRST_TABLE_SIZE (_CRST_ENTRIES * 8) +#define _PAGE_TABLE_SIZE (_PAGE_ENTRIES * 8) + +#define _REGION1_SHIFT 53 +#define _REGION2_SHIFT 42 +#define _REGION3_SHIFT 31 +#define _SEGMENT_SHIFT 20 + +#define _REGION1_INDEX (0x7ffUL << _REGION1_SHIFT) +#define _REGION2_INDEX (0x7ffUL << _REGION2_SHIFT) +#define _REGION3_INDEX (0x7ffUL << _REGION3_SHIFT) +#define _SEGMENT_INDEX (0x7ffUL << _SEGMENT_SHIFT) +#define _PAGE_INDEX (0xffUL << _PAGE_SHIFT) + +#define _REGION1_SIZE (1UL << _REGION1_SHIFT) +#define _REGION2_SIZE (1UL << _REGION2_SHIFT) +#define _REGION3_SIZE (1UL << _REGION3_SHIFT) +#define _SEGMENT_SIZE (1UL << _SEGMENT_SHIFT) + +#define _REGION1_MASK (~(_REGION1_SIZE - 1)) +#define _REGION2_MASK (~(_REGION2_SIZE - 1)) +#define _REGION3_MASK (~(_REGION3_SIZE - 1)) +#define _SEGMENT_MASK (~(_SEGMENT_SIZE - 1)) + +#define PMD_SHIFT _SEGMENT_SHIFT +#define PUD_SHIFT _REGION3_SHIFT +#define P4D_SHIFT _REGION2_SHIFT +#define PGDIR_SHIFT _REGION1_SHIFT + +#define PMD_SIZE _SEGMENT_SIZE +#define PUD_SIZE _REGION3_SIZE +#define P4D_SIZE _REGION2_SIZE +#define PGDIR_SIZE _REGION1_SIZE + +#define PMD_MASK _SEGMENT_MASK +#define PUD_MASK _REGION3_MASK +#define P4D_MASK _REGION2_MASK +#define PGDIR_MASK _REGION1_MASK + +#define PTRS_PER_PTE _PAGE_ENTRIES +#define PTRS_PER_PMD _CRST_ENTRIES +#define PTRS_PER_PUD _CRST_ENTRIES +#define PTRS_PER_P4D _CRST_ENTRIES +#define PTRS_PER_PGD _CRST_ENTRIES + /* * Segment table and region3 table entry encoding * (R = read-only, I = invalid, y = young bit): @@ -376,6 +375,7 @@ static inline int is_module_addr(void *addr) /* Guest Page State used for virtualization */ #define _PGSTE_GPS_ZERO 0x0000000080000000UL +#define _PGSTE_GPS_NODAT 0x0000000040000000UL #define _PGSTE_GPS_USAGE_MASK 0x0000000003000000UL #define _PGSTE_GPS_USAGE_STABLE 0x0000000000000000UL #define _PGSTE_GPS_USAGE_UNUSED 0x0000000001000000UL @@ -505,7 +505,7 @@ static inline int mm_alloc_pgste(struct mm_struct *mm) * In the case that a guest uses storage keys * faults should no longer be backed by zero pages */ -#define mm_forbids_zeropage mm_use_skey +#define mm_forbids_zeropage mm_has_pgste static inline int mm_use_skey(struct mm_struct *mm) { #ifdef CONFIG_PGSTE @@ -952,15 +952,30 @@ static inline pte_t pte_mkhuge(pte_t pte) #define IPTE_GLOBAL 0 #define IPTE_LOCAL 1 -static inline void __ptep_ipte(unsigned long address, pte_t *ptep, int local) +#define IPTE_NODAT 0x400 +#define IPTE_GUEST_ASCE 0x800 + +static inline void __ptep_ipte(unsigned long address, pte_t *ptep, + unsigned long opt, unsigned long asce, + int local) { unsigned long pto = (unsigned long) ptep; - /* Invalidation + TLB flush for the pte */ + if (__builtin_constant_p(opt) && opt == 0) { + /* Invalidation + TLB flush for the pte */ + asm volatile( + " .insn rrf,0xb2210000,%[r1],%[r2],0,%[m4]" + : "+m" (*ptep) : [r1] "a" (pto), [r2] "a" (address), + [m4] "i" (local)); + return; + } + + /* Invalidate ptes with options + TLB flush of the ptes */ + opt = opt | (asce & _ASCE_ORIGIN); asm volatile( - " .insn rrf,0xb2210000,%[r1],%[r2],0,%[m4]" - : "+m" (*ptep) : [r1] "a" (pto), [r2] "a" (address), - [m4] "i" (local)); + " .insn rrf,0xb2210000,%[r1],%[r2],%[r3],%[m4]" + : [r2] "+a" (address), [r3] "+a" (opt) + : [r1] "a" (pto), [m4] "i" (local) : "memory"); } static inline void __ptep_ipte_range(unsigned long address, int nr, @@ -1341,31 +1356,61 @@ static inline void __pmdp_csp(pmd_t *pmdp) #define IDTE_GLOBAL 0 #define IDTE_LOCAL 1 -static inline void __pmdp_idte(unsigned long address, pmd_t *pmdp, int local) +#define IDTE_PTOA 0x0800 +#define IDTE_NODAT 0x1000 +#define IDTE_GUEST_ASCE 0x2000 + +static inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp, + unsigned long opt, unsigned long asce, + int local) { unsigned long sto; - sto = (unsigned long) pmdp - pmd_index(address) * sizeof(pmd_t); - asm volatile( - " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" - : "+m" (*pmdp) - : [r1] "a" (sto), [r2] "a" ((address & HPAGE_MASK)), - [m4] "i" (local) - : "cc" ); + sto = (unsigned long) pmdp - pmd_index(addr) * sizeof(pmd_t); + if (__builtin_constant_p(opt) && opt == 0) { + /* flush without guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" + : "+m" (*pmdp) + : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK)), + [m4] "i" (local) + : "cc" ); + } else { + /* flush with guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],%[r3],%[m4]" + : "+m" (*pmdp) + : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK) | opt), + [r3] "a" (asce), [m4] "i" (local) + : "cc" ); + } } -static inline void __pudp_idte(unsigned long address, pud_t *pudp, int local) +static inline void __pudp_idte(unsigned long addr, pud_t *pudp, + unsigned long opt, unsigned long asce, + int local) { unsigned long r3o; - r3o = (unsigned long) pudp - pud_index(address) * sizeof(pud_t); + r3o = (unsigned long) pudp - pud_index(addr) * sizeof(pud_t); r3o |= _ASCE_TYPE_REGION3; - asm volatile( - " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" - : "+m" (*pudp) - : [r1] "a" (r3o), [r2] "a" ((address & PUD_MASK)), - [m4] "i" (local) - : "cc"); + if (__builtin_constant_p(opt) && opt == 0) { + /* flush without guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" + : "+m" (*pudp) + : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK)), + [m4] "i" (local) + : "cc"); + } else { + /* flush with guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],%[r3],%[m4]" + : "+m" (*pudp) + : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK) | opt), + [r3] "a" (asce), [m4] "i" (local) + : "cc" ); + } } pmd_t pmdp_xchg_direct(struct mm_struct *, unsigned long, pmd_t *, pmd_t); @@ -1548,8 +1593,6 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#endif /* !__ASSEMBLY__ */ - #define kern_addr_valid(addr) (1) extern int vmem_add_mapping(unsigned long start, unsigned long size); diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h index 998b61cd0e56..eaee69e7c42a 100644 --- a/arch/s390/include/asm/qdio.h +++ b/arch/s390/include/asm/qdio.h @@ -80,7 +80,7 @@ struct qdr { u32 qkey : 4; u32 : 28; struct qdesfmt0 qdf0[126]; -} __attribute__ ((packed, aligned(4096))); +} __packed __aligned(PAGE_SIZE); #define QIB_AC_OUTBOUND_PCI_SUPPORTED 0x40 #define QIB_RFLAGS_ENABLE_QEBSM 0x80 diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index cd78155b1829..490e035b3716 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -29,8 +29,10 @@ #define MACHINE_FLAG_TE _BITUL(11) #define MACHINE_FLAG_TLB_LC _BITUL(12) #define MACHINE_FLAG_VX _BITUL(13) -#define MACHINE_FLAG_NX _BITUL(14) -#define MACHINE_FLAG_GS _BITUL(15) +#define MACHINE_FLAG_TLB_GUEST _BITUL(14) +#define MACHINE_FLAG_NX _BITUL(15) +#define MACHINE_FLAG_GS _BITUL(16) +#define MACHINE_FLAG_SCC _BITUL(17) #define LPP_MAGIC _BITUL(31) #define LPP_PFAULT_PID_MASK _AC(0xffffffff, UL) @@ -68,8 +70,10 @@ extern void detect_memory_memblock(void); #define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE) #define MACHINE_HAS_TLB_LC (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_LC) #define MACHINE_HAS_VX (S390_lowcore.machine_flags & MACHINE_FLAG_VX) +#define MACHINE_HAS_TLB_GUEST (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_GUEST) #define MACHINE_HAS_NX (S390_lowcore.machine_flags & MACHINE_FLAG_NX) #define MACHINE_HAS_GS (S390_lowcore.machine_flags & MACHINE_FLAG_GS) +#define MACHINE_HAS_SCC (S390_lowcore.machine_flags & MACHINE_FLAG_SCC) /* * Console mode. Override with conmode= @@ -104,9 +108,16 @@ extern void pfault_fini(void); #define pfault_fini() do { } while (0) #endif /* CONFIG_PFAULT */ +#ifdef CONFIG_VMCP +void vmcp_cma_reserve(void); +#else +static inline void vmcp_cma_reserve(void) { } +#endif + void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault); -extern void cmma_init(void); +void cmma_init(void); +void cmma_init_nodat(void); extern void (*_machine_restart)(char *command); extern void (*_machine_halt)(void); diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index f7838ecd83c6..8182b521c42f 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -92,17 +92,11 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp) { typecheck(int, lp->lock); asm volatile( - "st %1,%0\n" - : "+Q" (lp->lock) - : "d" (0) - : "cc", "memory"); -} - -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - while (arch_spin_is_locked(lock)) - arch_spin_relax(lock); - smp_acquire__after_ctrl_dep(); +#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES + " .long 0xb2fa0070\n" /* NIAI 7 */ +#endif + " st %1,%0\n" + : "=Q" (lp->lock) : "d" (0) : "cc", "memory"); } /* diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 118535123f34..93f2eb3f277c 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -15,6 +15,8 @@ /* The value of the TOD clock for 1.1.1970. */ #define TOD_UNIX_EPOCH 0x7d91048bca000000ULL +extern u64 clock_comparator_max; + /* Inline functions for clock register access. */ static inline int set_tod_clock(__u64 time) { @@ -126,7 +128,7 @@ static inline unsigned long long local_tick_disable(void) unsigned long long old; old = S390_lowcore.clock_comparator; - S390_lowcore.clock_comparator = -1ULL; + S390_lowcore.clock_comparator = clock_comparator_max; set_clock_comparator(S390_lowcore.clock_comparator); return old; } @@ -174,24 +176,24 @@ static inline cycles_t get_cycles(void) return (cycles_t) get_tod_clock() >> 2; } -int get_phys_clock(unsigned long long *clock); +int get_phys_clock(unsigned long *clock); void init_cpu_timer(void); unsigned long long monotonic_clock(void); -extern u64 sched_clock_base_cc; +extern unsigned char tod_clock_base[16] __aligned(8); /** * get_clock_monotonic - returns current time in clock rate units * * The caller must ensure that preemption is disabled. - * The clock and sched_clock_base get changed via stop_machine. + * The clock and tod_clock_base get changed via stop_machine. * Therefore preemption must be disabled when calling this * function, otherwise the returned value is not guaranteed to * be monotonic. */ static inline unsigned long long get_tod_clock_monotonic(void) { - return get_tod_clock() - sched_clock_base_cc; + return get_tod_clock() - *(unsigned long long *) &tod_clock_base[1]; } /** @@ -218,4 +220,32 @@ static inline unsigned long long tod_to_ns(unsigned long long todval) return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9); } +/** + * tod_after - compare two 64 bit TOD values + * @a: first 64 bit TOD timestamp + * @b: second 64 bit TOD timestamp + * + * Returns: true if a is later than b + */ +static inline int tod_after(unsigned long long a, unsigned long long b) +{ + if (MACHINE_HAS_SCC) + return (long long) a > (long long) b; + return a > b; +} + +/** + * tod_after_eq - compare two 64 bit TOD values + * @a: first 64 bit TOD timestamp + * @b: second 64 bit TOD timestamp + * + * Returns: true if a is later than b + */ +static inline int tod_after_eq(unsigned long long a, unsigned long long b) +{ + if (MACHINE_HAS_SCC) + return (long long) a >= (long long) b; + return a >= b; +} + #endif diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 2eb8ff0d6fca..3a14b864b2e3 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -135,7 +135,7 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, unsigned long address) { - if (tlb->mm->context.asce_limit <= (1UL << 31)) + if (tlb->mm->context.asce_limit <= _REGION3_SIZE) return; pgtable_pmd_page_dtor(virt_to_page(pmd)); tlb_remove_table(tlb, pmd); @@ -151,7 +151,7 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, unsigned long address) { - if (tlb->mm->context.asce_limit <= (1UL << 53)) + if (tlb->mm->context.asce_limit <= _REGION1_SIZE) return; tlb_remove_table(tlb, p4d); } @@ -166,7 +166,7 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, unsigned long address) { - if (tlb->mm->context.asce_limit <= (1UL << 42)) + if (tlb->mm->context.asce_limit <= _REGION2_SIZE) return; tlb_remove_table(tlb, pud); } diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 39846100682a..4d759f8f4bc7 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -20,10 +20,15 @@ static inline void __tlb_flush_local(void) */ static inline void __tlb_flush_idte(unsigned long asce) { + unsigned long opt; + + opt = IDTE_PTOA; + if (MACHINE_HAS_TLB_GUEST) + opt |= IDTE_GUEST_ASCE; /* Global TLB flush for the mm */ asm volatile( " .insn rrf,0xb98e0000,0,%0,%1,0" - : : "a" (2048), "a" (asce) : "cc"); + : : "a" (opt), "a" (asce) : "cc"); } #ifdef CONFIG_SMP diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index fa1bfce10370..5222da162b69 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -77,12 +77,6 @@ static inline const struct cpumask *cpumask_of_node(int node) return &node_to_cpumask_map[node]; } -/* - * Returns the number of the node containing node 'node'. This - * architecture is flat, so it is a pretty simple function! - */ -#define parent_node(node) (node) - #define pcibus_to_node(bus) __pcibus_to_node(bus) #define node_distance(a, b) __node_distance(a, b) diff --git a/arch/s390/include/asm/types.h b/arch/s390/include/asm/types.h deleted file mode 100644 index 6740f4f9781f..000000000000 --- a/arch/s390/include/asm/types.h +++ /dev/null @@ -1,11 +0,0 @@ -/* - * S390 version - * - * Derived from "include/asm-i386/types.h" - */ -#ifndef _S390_TYPES_H -#define _S390_TYPES_H - -#include <uapi/asm/types.h> - -#endif /* _S390_TYPES_H */ diff --git a/arch/s390/include/asm/unaligned.h b/arch/s390/include/asm/unaligned.h deleted file mode 100644 index da9627afe5d8..000000000000 --- a/arch/s390/include/asm/unaligned.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _ASM_S390_UNALIGNED_H -#define _ASM_S390_UNALIGNED_H - -/* - * The S390 can do unaligned accesses itself. - */ -#include <linux/unaligned/access_ok.h> -#include <linux/unaligned/generic.h> - -#define get_unaligned __get_unaligned_be -#define put_unaligned __put_unaligned_be - -#endif /* _ASM_S390_UNALIGNED_H */ diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild index ca62066895e0..098f28778a13 100644 --- a/arch/s390/include/uapi/asm/Kbuild +++ b/arch/s390/include/uapi/asm/Kbuild @@ -9,4 +9,5 @@ generic-y += param.h generic-y += poll.h generic-y += resource.h generic-y += sockios.h +generic-y += swab.h generic-y += termbits.h diff --git a/arch/s390/include/uapi/asm/dasd.h b/arch/s390/include/uapi/asm/dasd.h index 1340311dab77..ab5797cdc1b7 100644 --- a/arch/s390/include/uapi/asm/dasd.h +++ b/arch/s390/include/uapi/asm/dasd.h @@ -72,7 +72,10 @@ typedef struct dasd_information2_t { * 0x02: use diag discipline (diag) * 0x04: set the device initially online (internal use only) * 0x08: enable ERP related logging - * 0x20: give access to raw eckd data + * 0x10: allow I/O to fail on lost paths + * 0x20: allow I/O to fail when a lock was stolen + * 0x40: give access to raw eckd data + * 0x80: enable discard support */ #define DASD_FEATURE_DEFAULT 0x00 #define DASD_FEATURE_READONLY 0x01 @@ -82,6 +85,7 @@ typedef struct dasd_information2_t { #define DASD_FEATURE_FAILFAST 0x10 #define DASD_FEATURE_FAILONSLCK 0x20 #define DASD_FEATURE_USERAW 0x40 +#define DASD_FEATURE_DISCARD 0x80 #define DASD_PARTN_BITS 2 diff --git a/arch/s390/include/uapi/asm/swab.h b/arch/s390/include/uapi/asm/swab.h deleted file mode 100644 index da3bfe5cc161..000000000000 --- a/arch/s390/include/uapi/asm/swab.h +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef _S390_SWAB_H -#define _S390_SWAB_H - -/* - * S390 version - * Copyright IBM Corp. 1999 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - */ - -#include <linux/types.h> - -#ifndef __s390x__ -# define __SWAB_64_THRU_32__ -#endif - -#ifdef __s390x__ -static inline __u64 __arch_swab64p(const __u64 *x) -{ - __u64 result; - - asm volatile("lrvg %0,%1" : "=d" (result) : "m" (*x)); - return result; -} -#define __arch_swab64p __arch_swab64p - -static inline __u64 __arch_swab64(__u64 x) -{ - __u64 result; - - asm volatile("lrvgr %0,%1" : "=d" (result) : "d" (x)); - return result; -} -#define __arch_swab64 __arch_swab64 - -static inline void __arch_swab64s(__u64 *x) -{ - *x = __arch_swab64p(x); -} -#define __arch_swab64s __arch_swab64s -#endif /* __s390x__ */ - -static inline __u32 __arch_swab32p(const __u32 *x) -{ - __u32 result; - - asm volatile( -#ifndef __s390x__ - " icm %0,8,%O1+3(%R1)\n" - " icm %0,4,%O1+2(%R1)\n" - " icm %0,2,%O1+1(%R1)\n" - " ic %0,%1" - : "=&d" (result) : "Q" (*x) : "cc"); -#else /* __s390x__ */ - " lrv %0,%1" - : "=d" (result) : "m" (*x)); -#endif /* __s390x__ */ - return result; -} -#define __arch_swab32p __arch_swab32p - -#ifdef __s390x__ -static inline __u32 __arch_swab32(__u32 x) -{ - __u32 result; - - asm volatile("lrvr %0,%1" : "=d" (result) : "d" (x)); - return result; -} -#define __arch_swab32 __arch_swab32 -#endif /* __s390x__ */ - -static inline __u16 __arch_swab16p(const __u16 *x) -{ - __u16 result; - - asm volatile( -#ifndef __s390x__ - " icm %0,2,%O1+1(%R1)\n" - " ic %0,%1\n" - : "=&d" (result) : "Q" (*x) : "cc"); -#else /* __s390x__ */ - " lrvh %0,%1" - : "=d" (result) : "m" (*x)); -#endif /* __s390x__ */ - return result; -} -#define __arch_swab16p __arch_swab16p - -#endif /* _S390_SWAB_H */ diff --git a/arch/s390/include/uapi/asm/vmcp.h b/arch/s390/include/uapi/asm/vmcp.h new file mode 100644 index 000000000000..4caf71714a55 --- /dev/null +++ b/arch/s390/include/uapi/asm/vmcp.h @@ -0,0 +1,24 @@ +/* + * Copyright IBM Corp. 2004, 2005 + * Interface implementation for communication with the z/VM control program + * Version 1.0 + * Author(s): Christian Borntraeger <cborntra@de.ibm.com> + * + * + * z/VMs CP offers the possibility to issue commands via the diagnose code 8 + * this driver implements a character device that issues these commands and + * returns the answer of CP. + * + * The idea of this driver is based on cpint from Neale Ferguson + */ + +#ifndef _UAPI_ASM_VMCP_H +#define _UAPI_ASM_VMCP_H + +#include <linux/ioctl.h> + +#define VMCP_GETCODE _IOR(0x10, 1, int) +#define VMCP_SETBUF _IOW(0x10, 2, int) +#define VMCP_GETSIZE _IOR(0x10, 3, int) + +#endif /* _UAPI_ASM_VMCP_H */ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index b65c414b6c0e..3d42f91c95fd 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -158,6 +158,7 @@ int main(void) OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock); OFFSET(__LC_INT_CLOCK, lowcore, int_clock); OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock); + OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock); OFFSET(__LC_CURRENT, lowcore, current_task); OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack); OFFSET(__LC_ASYNC_STACK, lowcore, async_stack); diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c index 9f0e4a2785f7..63bc6603e0ed 100644 --- a/arch/s390/kernel/cpcmd.c +++ b/arch/s390/kernel/cpcmd.c @@ -14,6 +14,7 @@ #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> +#include <linux/mm.h> #include <asm/diag.h> #include <asm/ebcdic.h> #include <asm/cpcmd.h> @@ -28,9 +29,7 @@ static int diag8_noresponse(int cmdlen) register unsigned long reg3 asm ("3") = cmdlen; asm volatile( - " sam31\n" " diag %1,%0,0x8\n" - " sam64\n" : "+d" (reg3) : "d" (reg2) : "cc"); return reg3; } @@ -43,9 +42,7 @@ static int diag8_response(int cmdlen, char *response, int *rlen) register unsigned long reg5 asm ("5") = *rlen; asm volatile( - " sam31\n" " diag %2,%0,0x8\n" - " sam64\n" " brc 8,1f\n" " agr %1,%4\n" "1:\n" @@ -57,7 +54,6 @@ static int diag8_response(int cmdlen, char *response, int *rlen) /* * __cpcmd has some restrictions over cpcmd - * - the response buffer must reside below 2GB (if any) * - __cpcmd is unlocked and therefore not SMP-safe */ int __cpcmd(const char *cmd, char *response, int rlen, int *response_code) @@ -88,13 +84,12 @@ EXPORT_SYMBOL(__cpcmd); int cpcmd(const char *cmd, char *response, int rlen, int *response_code) { + unsigned long flags; char *lowbuf; int len; - unsigned long flags; - if ((virt_to_phys(response) != (unsigned long) response) || - (((unsigned long)response + rlen) >> 31)) { - lowbuf = kmalloc(rlen, GFP_KERNEL | GFP_DMA); + if (is_vmalloc_or_module_addr(response)) { + lowbuf = kmalloc(rlen, GFP_KERNEL); if (!lowbuf) { pr_warn("The cpcmd kernel function failed to allocate a response buffer\n"); return -ENOMEM; diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 86b3e74f569e..1d9e83c401fc 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -866,7 +866,8 @@ static inline void debug_finish_entry(debug_info_t * id, debug_entry_t* active, int level, int exception) { - active->id.stck = get_tod_clock_fast() - sched_clock_base_cc; + active->id.stck = get_tod_clock_fast() - + *(unsigned long long *) &tod_clock_base[1]; active->id.fields.cpuid = smp_processor_id(); active->caller = __builtin_return_address(0); active->id.fields.exception = exception; @@ -1455,15 +1456,15 @@ int debug_dflt_header_fn(debug_info_t * id, struct debug_view *view, int area, debug_entry_t * entry, char *out_buf) { - unsigned long sec, usec; + unsigned long base, sec, usec; char *except_str; unsigned long caller; int rc = 0; unsigned int level; level = entry->id.fields.level; - sec = (entry->id.stck >> 12) + (sched_clock_base_cc >> 12); - sec = sec - (TOD_UNIX_EPOCH >> 12); + base = (*(unsigned long *) &tod_clock_base[0]) >> 4; + sec = (entry->id.stck >> 12) + base - (TOD_UNIX_EPOCH >> 12); usec = do_div(sec, USEC_PER_SEC); if (entry->id.fields.exception) diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index dab78babfab6..2aa545dca4d5 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -76,7 +76,7 @@ void dump_trace(dump_trace_func_t func, void *data, struct task_struct *task, frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); #ifdef CONFIG_CHECK_STACK sp = __dump_trace(func, data, sp, - S390_lowcore.panic_stack + frame_size - 4096, + S390_lowcore.panic_stack + frame_size - PAGE_SIZE, S390_lowcore.panic_stack + frame_size); #endif sp = __dump_trace(func, data, sp, diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 5d20182ee8ae..ca8cd80e8feb 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -53,8 +53,9 @@ static void __init reset_tod_clock(void) if (set_tod_clock(TOD_UNIX_EPOCH) != 0 || store_tod_clock(&time) != 0) disabled_wait(0); - sched_clock_base_cc = TOD_UNIX_EPOCH; - S390_lowcore.last_update_clock = sched_clock_base_cc; + memset(tod_clock_base, 0, 16); + *(__u64 *) &tod_clock_base[1] = TOD_UNIX_EPOCH; + S390_lowcore.last_update_clock = TOD_UNIX_EPOCH; } #ifdef CONFIG_SHARED_KERNEL @@ -165,8 +166,8 @@ static noinline __init void create_kernel_nss(void) } /* re-initialize cputime accounting. */ - sched_clock_base_cc = get_tod_clock(); - S390_lowcore.last_update_clock = sched_clock_base_cc; + get_tod_clock_ext(tod_clock_base); + S390_lowcore.last_update_clock = *(__u64 *) &tod_clock_base[1]; S390_lowcore.last_update_timer = 0x7fffffffffffffffULL; S390_lowcore.user_timer = 0; S390_lowcore.system_timer = 0; @@ -387,6 +388,12 @@ static __init void detect_machine_facilities(void) } if (test_facility(133)) S390_lowcore.machine_flags |= MACHINE_FLAG_GS; + if (test_facility(139) && (tod_clock_base[1] & 0x80)) { + /* Enabled signed clock comparator comparisons */ + S390_lowcore.machine_flags |= MACHINE_FLAG_SCC; + clock_comparator_max = -1ULL >> 1; + __ctl_set_bit(0, 53); + } } static inline void save_vector_registers(void) @@ -413,7 +420,7 @@ static int __init disable_vector_extension(char *str) { S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX; __ctl_clear_bit(0, 17); - return 1; + return 0; } early_param("novx", disable_vector_extension); diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S index eff5b31671d4..8ed753c72d9b 100644 --- a/arch/s390/kernel/head.S +++ b/arch/s390/kernel/head.S @@ -302,7 +302,8 @@ ENTRY(startup_kdump) xc 0xe00(256),0xe00 xc 0xf00(256),0xf00 lctlg %c0,%c15,0x200(%r0) # initialize control registers - stck __LC_LAST_UPDATE_CLOCK + stcke __LC_BOOT_CLOCK + mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1 spt 6f-.LPG0(%r13) mvc __LC_LAST_UPDATE_TIMER(8),6f-.LPG0(%r13) l %r15,.Lstack-.LPG0(%r13) diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 31c91f24e562..0d8f2a858ced 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -21,8 +21,8 @@ ENTRY(startup_continue) xc __LC_LPP+1(7,0),__LC_LPP+1 # clear lpp and current_pid mvi __LC_LPP,0x80 # and set LPP_MAGIC .insn s,0xb2800000,__LC_LPP # load program parameter -0: larl %r1,sched_clock_base_cc - mvc 0(8,%r1),__LC_LAST_UPDATE_CLOCK +0: larl %r1,tod_clock_base + mvc 0(16,%r1),__LC_BOOT_CLOCK larl %r13,.LPG1 # get base lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers lg %r12,.Lparmaddr-.LPG1(%r13) # pointer to parameter area diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 6dca93b29bed..a2fdff0e730b 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -105,7 +105,8 @@ void do_IRQ(struct pt_regs *regs, int irq) old_regs = set_irq_regs(regs); irq_enter(); - if (S390_lowcore.int_clock >= S390_lowcore.clock_comparator) + if (tod_after_eq(S390_lowcore.int_clock, + S390_lowcore.clock_comparator)) /* Serve timer interrupts first. */ clock_comparator_work(); generic_handle_irq(irq); diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S index cfac28330b03..4bdc65636603 100644 --- a/arch/s390/kernel/relocate_kernel.S +++ b/arch/s390/kernel/relocate_kernel.S @@ -7,6 +7,7 @@ */ #include <linux/linkage.h> +#include <asm/page.h> #include <asm/sigp.h> /* @@ -55,8 +56,8 @@ ENTRY(relocate_kernel) .back_pgm: lmg %r0,%r15,gprregs-.base(%r13) .top: - lghi %r7,4096 # load PAGE_SIZE in r7 - lghi %r9,4096 # load PAGE_SIZE in r9 + lghi %r7,PAGE_SIZE # load PAGE_SIZE in r7 + lghi %r9,PAGE_SIZE # load PAGE_SIZE in r9 lg %r5,0(%r2) # read another word for indirection page aghi %r2,8 # increment pointer tml %r5,0x1 # is it a destination page? diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 3d1d808ea8a9..164a1e16b53e 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -305,7 +305,7 @@ static void __init setup_lowcore(void) /* * Setup lowcore for boot cpu */ - BUILD_BUG_ON(sizeof(struct lowcore) != LC_PAGES * 4096); + BUILD_BUG_ON(sizeof(struct lowcore) != LC_PAGES * PAGE_SIZE); lc = memblock_virt_alloc_low(sizeof(*lc), sizeof(*lc)); lc->restart_psw.mask = PSW_KERNEL_BITS; lc->restart_psw.addr = (unsigned long) restart_int_handler; @@ -323,7 +323,7 @@ static void __init setup_lowcore(void) lc->io_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | PSW_MASK_MCHECK; lc->io_new_psw.addr = (unsigned long) io_int_handler; - lc->clock_comparator = -1ULL; + lc->clock_comparator = clock_comparator_max; lc->kernel_stack = ((unsigned long) &init_thread_union) + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); lc->async_stack = (unsigned long) @@ -469,10 +469,10 @@ static void __init setup_memory_end(void) vmalloc_size = VMALLOC_END ?: (128UL << 30) - MODULES_LEN; tmp = (memory_end ?: max_physmem_end) / PAGE_SIZE; tmp = tmp * (sizeof(struct page) + PAGE_SIZE); - if (tmp + vmalloc_size + MODULES_LEN <= (1UL << 42)) - vmax = 1UL << 42; /* 3-level kernel page table */ + if (tmp + vmalloc_size + MODULES_LEN <= _REGION2_SIZE) + vmax = _REGION2_SIZE; /* 3-level kernel page table */ else - vmax = 1UL << 53; /* 4-level kernel page table */ + vmax = _REGION1_SIZE; /* 4-level kernel page table */ /* module area is at the end of the kernel address space. */ MODULES_END = vmax; MODULES_VADDR = MODULES_END - MODULES_LEN; @@ -818,6 +818,9 @@ static int __init setup_hwcaps(void) case 0x2965: strcpy(elf_platform, "z13"); break; + case 0x3906: + strcpy(elf_platform, "z14"); + break; } /* @@ -922,6 +925,7 @@ void __init setup_arch(char **cmdline_p) setup_memory_end(); setup_memory(); dma_contiguous_reserve(memory_end); + vmcp_cma_reserve(); check_initrd(); reserve_crashkernel(); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 1020a11a24e5..1cee6753d47a 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -1181,6 +1181,7 @@ static int __init s390_smp_init(void) rc = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "s390/smp:online", smp_cpu_online, smp_cpu_pre_down); + rc = rc <= 0 ? rc : 0; out: return rc; } diff --git a/arch/s390/kernel/suspend.c b/arch/s390/kernel/suspend.c index 39e2f41b6cf0..c8ea715bfe10 100644 --- a/arch/s390/kernel/suspend.c +++ b/arch/s390/kernel/suspend.c @@ -98,10 +98,16 @@ int page_key_alloc(unsigned long pages) */ void page_key_read(unsigned long *pfn) { + struct page *page; unsigned long addr; - - addr = (unsigned long) page_address(pfn_to_page(*pfn)); - *(unsigned char *) pfn = (unsigned char) page_get_storage_key(addr); + unsigned char key; + + page = pfn_to_page(*pfn); + addr = (unsigned long) page_address(page); + key = (unsigned char) page_get_storage_key(addr) & 0x7f; + if (arch_test_page_nodat(page)) + key |= 0x80; + *(unsigned char *) pfn = key; } /* @@ -126,8 +132,16 @@ void page_key_memorize(unsigned long *pfn) */ void page_key_write(void *address) { - page_set_storage_key((unsigned long) address, - page_key_rp->data[page_key_rx], 0); + struct page *page; + unsigned char key; + + key = page_key_rp->data[page_key_rx]; + page_set_storage_key((unsigned long) address, key & 0x7f, 0); + page = virt_to_page(address); + if (key & 0x80) + arch_set_page_nodat(page, 0); + else + arch_set_page_dat(page, 0); if (++page_key_rx >= PAGE_KEY_DATA_SIZE) return; page_key_rp = page_key_rp->next; diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 192efdfac918..5cbd52169348 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -51,8 +51,15 @@ #include <asm/cio.h> #include "entry.h" -u64 sched_clock_base_cc = -1; /* Force to data section. */ -EXPORT_SYMBOL_GPL(sched_clock_base_cc); +unsigned char tod_clock_base[16] __aligned(8) = { + /* Force to data section. */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; +EXPORT_SYMBOL_GPL(tod_clock_base); + +u64 clock_comparator_max = -1ULL; +EXPORT_SYMBOL_GPL(clock_comparator_max); static DEFINE_PER_CPU(struct clock_event_device, comparators); @@ -75,7 +82,7 @@ void __init time_early_init(void) struct ptff_qui qui; /* Initialize TOD steering parameters */ - tod_steering_end = sched_clock_base_cc; + tod_steering_end = *(unsigned long long *) &tod_clock_base[1]; vdso_data->ts_end = tod_steering_end; if (!test_facility(28)) @@ -111,22 +118,27 @@ unsigned long long monotonic_clock(void) } EXPORT_SYMBOL(monotonic_clock); -static void tod_to_timeval(__u64 todval, struct timespec64 *xt) +static void ext_to_timespec64(unsigned char *clk, struct timespec64 *xt) { - unsigned long long sec; + unsigned long long high, low, rem, sec, nsec; + + /* Split extendnd TOD clock to micro-seconds and sub-micro-seconds */ + high = (*(unsigned long long *) clk) >> 4; + low = (*(unsigned long long *)&clk[7]) << 4; + /* Calculate seconds and nano-seconds */ + sec = high; + rem = do_div(sec, 1000000); + nsec = (((low >> 32) + (rem << 32)) * 1000) >> 32; - sec = todval >> 12; - do_div(sec, 1000000); xt->tv_sec = sec; - todval -= (sec * 1000000) << 12; - xt->tv_nsec = ((todval * 1000) >> 12); + xt->tv_nsec = nsec; } void clock_comparator_work(void) { struct clock_event_device *cd; - S390_lowcore.clock_comparator = -1ULL; + S390_lowcore.clock_comparator = clock_comparator_max; cd = this_cpu_ptr(&comparators); cd->event_handler(cd); } @@ -148,7 +160,7 @@ void init_cpu_timer(void) struct clock_event_device *cd; int cpu; - S390_lowcore.clock_comparator = -1ULL; + S390_lowcore.clock_comparator = clock_comparator_max; set_clock_comparator(S390_lowcore.clock_comparator); cpu = smp_processor_id(); @@ -179,7 +191,7 @@ static void clock_comparator_interrupt(struct ext_code ext_code, unsigned long param64) { inc_irq_stat(IRQEXT_CLK); - if (S390_lowcore.clock_comparator == -1ULL) + if (S390_lowcore.clock_comparator == clock_comparator_max) set_clock_comparator(S390_lowcore.clock_comparator); } @@ -197,18 +209,28 @@ static void stp_reset(void); void read_persistent_clock64(struct timespec64 *ts) { - __u64 clock; + unsigned char clk[STORE_CLOCK_EXT_SIZE]; + __u64 delta; - clock = get_tod_clock() - initial_leap_seconds; - tod_to_timeval(clock - TOD_UNIX_EPOCH, ts); + delta = initial_leap_seconds + TOD_UNIX_EPOCH; + get_tod_clock_ext(clk); + *(__u64 *) &clk[1] -= delta; + if (*(__u64 *) &clk[1] > delta) + clk[0]--; + ext_to_timespec64(clk, ts); } void read_boot_clock64(struct timespec64 *ts) { - __u64 clock; + unsigned char clk[STORE_CLOCK_EXT_SIZE]; + __u64 delta; - clock = sched_clock_base_cc - initial_leap_seconds; - tod_to_timeval(clock - TOD_UNIX_EPOCH, ts); + delta = initial_leap_seconds + TOD_UNIX_EPOCH; + memcpy(clk, tod_clock_base, 16); + *(__u64 *) &clk[1] -= delta; + if (*(__u64 *) &clk[1] > delta) + clk[0]--; + ext_to_timespec64(clk, ts); } static u64 read_tod_clock(struct clocksource *cs) @@ -335,7 +357,7 @@ static unsigned long clock_sync_flags; * source. If the clock mode is local it will return -EOPNOTSUPP and * -EAGAIN if the clock is not in sync with the external reference. */ -int get_phys_clock(unsigned long long *clock) +int get_phys_clock(unsigned long *clock) { atomic_t *sw_ptr; unsigned int sw0, sw1; @@ -406,7 +428,10 @@ static void clock_sync_global(unsigned long long delta) struct ptff_qto qto; /* Fixup the monotonic sched clock. */ - sched_clock_base_cc += delta; + *(unsigned long long *) &tod_clock_base[1] += delta; + if (*(unsigned long long *) &tod_clock_base[1] < delta) + /* Epoch overflow */ + tod_clock_base[0]++; /* Adjust TOD steering parameters. */ vdso_data->tb_update_count++; now = get_tod_clock(); @@ -437,7 +462,7 @@ static void clock_sync_global(unsigned long long delta) static void clock_sync_local(unsigned long long delta) { /* Add the delta to the clock comparator. */ - if (S390_lowcore.clock_comparator != -1ULL) { + if (S390_lowcore.clock_comparator != clock_comparator_max) { S390_lowcore.clock_comparator += delta; set_clock_comparator(S390_lowcore.clock_comparator); } diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index b89d19f6f2ab..eacda05b45d7 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -157,6 +157,8 @@ int vdso_alloc_per_cpu(struct lowcore *lowcore) page_frame = get_zeroed_page(GFP_KERNEL); if (!segment_table || !page_table || !page_frame) goto out; + arch_set_page_dat(virt_to_page(segment_table), SEGMENT_ORDER); + arch_set_page_dat(virt_to_page(page_table), 0); /* Initialize per-cpu vdso data page */ vd = (struct vdso_per_cpu_data *) page_frame; diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S index 8f048c2d6d13..263a7f9eee1e 100644 --- a/arch/s390/kernel/vdso32/vdso32.lds.S +++ b/arch/s390/kernel/vdso32/vdso32.lds.S @@ -2,6 +2,8 @@ * This is the infamous ld script for the 32 bits vdso * library */ + +#include <asm/page.h> #include <asm/vdso.h> OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") @@ -91,7 +93,7 @@ SECTIONS .debug_ranges 0 : { *(.debug_ranges) } .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); PROVIDE(_vdso_data = .); /DISCARD/ : { diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S index f35455d497fe..9e3dbbcc1cfc 100644 --- a/arch/s390/kernel/vdso64/vdso64.lds.S +++ b/arch/s390/kernel/vdso64/vdso64.lds.S @@ -2,6 +2,8 @@ * This is the infamous ld script for the 64 bits vdso * library */ + +#include <asm/page.h> #include <asm/vdso.h> OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") @@ -91,7 +93,7 @@ SECTIONS .debug_ranges 0 : { *(.debug_ranges) } .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); PROVIDE(_vdso_data = .); /DISCARD/ : { diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index ce865bd4f81d..e4d36094aceb 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -27,7 +27,7 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) unsigned long prefix = kvm_s390_get_prefix(vcpu); start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; - end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + 4096; + end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + PAGE_SIZE; vcpu->stat.diagnose_10++; if (start & ~PAGE_MASK || end & ~PAGE_MASK || start >= end @@ -51,9 +51,9 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) */ gmap_discard(vcpu->arch.gmap, start, prefix); if (start <= prefix) - gmap_discard(vcpu->arch.gmap, 0, 4096); - if (end > prefix + 4096) - gmap_discard(vcpu->arch.gmap, 4096, 8192); + gmap_discard(vcpu->arch.gmap, 0, PAGE_SIZE); + if (end > prefix + PAGE_SIZE) + gmap_discard(vcpu->arch.gmap, PAGE_SIZE, 2 * PAGE_SIZE); gmap_discard(vcpu->arch.gmap, prefix + 2 * PAGE_SIZE, end); } return 0; diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 653cae5e1ee1..3cc77391a102 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -629,7 +629,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, iep = ctlreg0.iep && test_kvm_facility(vcpu->kvm, 130); if (asce.r) goto real_address; - ptr = asce.origin * 4096; + ptr = asce.origin * PAGE_SIZE; switch (asce.dt) { case ASCE_TYPE_REGION1: if (vaddr.rfx01 > asce.tl) @@ -674,7 +674,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, return PGM_REGION_SECOND_TRANS; if (edat1) dat_protection |= rfte.p; - ptr = rfte.rto * 4096 + vaddr.rsx * 8; + ptr = rfte.rto * PAGE_SIZE + vaddr.rsx * 8; } /* fallthrough */ case ASCE_TYPE_REGION2: { @@ -692,7 +692,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, return PGM_REGION_THIRD_TRANS; if (edat1) dat_protection |= rste.p; - ptr = rste.rto * 4096 + vaddr.rtx * 8; + ptr = rste.rto * PAGE_SIZE + vaddr.rtx * 8; } /* fallthrough */ case ASCE_TYPE_REGION3: { @@ -720,7 +720,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, return PGM_SEGMENT_TRANSLATION; if (edat1) dat_protection |= rtte.fc0.p; - ptr = rtte.fc0.sto * 4096 + vaddr.sx * 8; + ptr = rtte.fc0.sto * PAGE_SIZE + vaddr.sx * 8; } /* fallthrough */ case ASCE_TYPE_SEGMENT: { @@ -743,7 +743,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, goto absolute_address; } dat_protection |= ste.fc0.p; - ptr = ste.fc0.pto * 2048 + vaddr.px * 8; + ptr = ste.fc0.pto * (PAGE_SIZE / 2) + vaddr.px * 8; } } if (kvm_is_error_gpa(vcpu->kvm, ptr)) @@ -993,7 +993,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, parent = sg->parent; vaddr.addr = saddr; asce.val = sg->orig_asce; - ptr = asce.origin * 4096; + ptr = asce.origin * PAGE_SIZE; if (asce.r) { *fake = 1; ptr = 0; @@ -1029,7 +1029,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, union region1_table_entry rfte; if (*fake) { - ptr += (unsigned long) vaddr.rfx << 53; + ptr += vaddr.rfx * _REGION1_SIZE; rfte.val = ptr; goto shadow_r2t; } @@ -1044,7 +1044,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, return PGM_REGION_SECOND_TRANS; if (sg->edat_level >= 1) *dat_protection |= rfte.p; - ptr = rfte.rto << 12UL; + ptr = rfte.rto * PAGE_SIZE; shadow_r2t: rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); if (rc) @@ -1055,7 +1055,7 @@ shadow_r2t: union region2_table_entry rste; if (*fake) { - ptr += (unsigned long) vaddr.rsx << 42; + ptr += vaddr.rsx * _REGION2_SIZE; rste.val = ptr; goto shadow_r3t; } @@ -1070,7 +1070,7 @@ shadow_r2t: return PGM_REGION_THIRD_TRANS; if (sg->edat_level >= 1) *dat_protection |= rste.p; - ptr = rste.rto << 12UL; + ptr = rste.rto * PAGE_SIZE; shadow_r3t: rste.p |= *dat_protection; rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); @@ -1082,7 +1082,7 @@ shadow_r3t: union region3_table_entry rtte; if (*fake) { - ptr += (unsigned long) vaddr.rtx << 31; + ptr += vaddr.rtx * _REGION3_SIZE; rtte.val = ptr; goto shadow_sgt; } @@ -1098,7 +1098,7 @@ shadow_r3t: if (rtte.fc && sg->edat_level >= 2) { *dat_protection |= rtte.fc0.p; *fake = 1; - ptr = rtte.fc1.rfaa << 31UL; + ptr = rtte.fc1.rfaa * _REGION3_SIZE; rtte.val = ptr; goto shadow_sgt; } @@ -1106,7 +1106,7 @@ shadow_r3t: return PGM_SEGMENT_TRANSLATION; if (sg->edat_level >= 1) *dat_protection |= rtte.fc0.p; - ptr = rtte.fc0.sto << 12UL; + ptr = rtte.fc0.sto * PAGE_SIZE; shadow_sgt: rtte.fc0.p |= *dat_protection; rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); @@ -1118,7 +1118,7 @@ shadow_sgt: union segment_table_entry ste; if (*fake) { - ptr += (unsigned long) vaddr.sx << 20; + ptr += vaddr.sx * _SEGMENT_SIZE; ste.val = ptr; goto shadow_pgt; } @@ -1134,11 +1134,11 @@ shadow_sgt: *dat_protection |= ste.fc0.p; if (ste.fc && sg->edat_level >= 1) { *fake = 1; - ptr = ste.fc1.sfaa << 20UL; + ptr = ste.fc1.sfaa * _SEGMENT_SIZE; ste.val = ptr; goto shadow_pgt; } - ptr = ste.fc0.pto << 11UL; + ptr = ste.fc0.pto * (PAGE_SIZE / 2); shadow_pgt: ste.fc0.p |= *dat_protection; rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); @@ -1187,8 +1187,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, vaddr.addr = saddr; if (fake) { - /* offset in 1MB guest memory block */ - pte.val = pgt + ((unsigned long) vaddr.px << 12UL); + pte.val = pgt + vaddr.px * PAGE_SIZE; goto shadow_page; } if (!rc) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 8a1dac793d6b..785ad028bde6 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -329,7 +329,7 @@ static int handle_sske(struct kvm_vcpu *vcpu) start = kvm_s390_logical_to_effective(vcpu, start); if (m3 & SSKE_MB) { /* start already designates an absolute address */ - end = (start + (1UL << 20)) & ~((1UL << 20) - 1); + end = (start + _SEGMENT_SIZE) & ~(_SEGMENT_SIZE - 1); } else { start = kvm_s390_real_to_abs(vcpu, start); end = start + PAGE_SIZE; @@ -893,10 +893,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) case 0x00000000: /* only 4k frames specify a real address */ start = kvm_s390_real_to_abs(vcpu, start); - end = (start + (1UL << 12)) & ~((1UL << 12) - 1); + end = (start + PAGE_SIZE) & ~(PAGE_SIZE - 1); break; case 0x00001000: - end = (start + (1UL << 20)) & ~((1UL << 20) - 1); + end = (start + _SEGMENT_SIZE) & ~(_SEGMENT_SIZE - 1); break; case 0x00002000: /* only support 2G frame size if EDAT2 is available and we are @@ -904,7 +904,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) if (!test_kvm_facility(vcpu->kvm, 78) || psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_BITS_AMODE_24BIT) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - end = (start + (1UL << 31)) & ~((1UL << 31) - 1); + end = (start + _REGION3_SIZE) & ~(_REGION3_SIZE - 1); break; default: return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 715c19c45d9a..ba8203e4d516 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -1069,7 +1069,7 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - BUILD_BUG_ON(sizeof(struct vsie_page) != 4096); + BUILD_BUG_ON(sizeof(struct vsie_page) != PAGE_SIZE); scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL); /* 512 byte alignment */ diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c index 92e90e40b6fb..7f17555ad4d5 100644 --- a/arch/s390/lib/delay.c +++ b/arch/s390/lib/delay.c @@ -57,7 +57,7 @@ static void __udelay_enabled(unsigned long long usecs) end = get_tod_clock_fast() + (usecs << 12); do { clock_saved = 0; - if (end < S390_lowcore.clock_comparator) { + if (tod_after(S390_lowcore.clock_comparator, end)) { clock_saved = local_tick_disable(); set_clock_comparator(end); } diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index ffb15bd4c593..b12663d653d8 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -32,42 +32,63 @@ static int __init spin_retry_setup(char *str) } __setup("spin_retry=", spin_retry_setup); +static inline int arch_load_niai4(int *lock) +{ + int owner; + + asm volatile( +#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES + " .long 0xb2fa0040\n" /* NIAI 4 */ +#endif + " l %0,%1\n" + : "=d" (owner) : "Q" (*lock) : "memory"); + return owner; +} + +static inline int arch_cmpxchg_niai8(int *lock, int old, int new) +{ + int expected = old; + + asm volatile( +#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES + " .long 0xb2fa0080\n" /* NIAI 8 */ +#endif + " cs %0,%3,%1\n" + : "=d" (old), "=Q" (*lock) + : "0" (old), "d" (new), "Q" (*lock) + : "cc", "memory"); + return expected == old; +} + void arch_spin_lock_wait(arch_spinlock_t *lp) { int cpu = SPINLOCK_LOCKVAL; - int owner, count, first_diag; + int owner, count; + + /* Pass the virtual CPU to the lock holder if it is not running */ + owner = arch_load_niai4(&lp->lock); + if (owner && arch_vcpu_is_preempted(~owner)) + smp_yield_cpu(~owner); - first_diag = 1; + count = spin_retry; while (1) { - owner = ACCESS_ONCE(lp->lock); + owner = arch_load_niai4(&lp->lock); /* Try to get the lock if it is free. */ if (!owner) { - if (__atomic_cmpxchg_bool(&lp->lock, 0, cpu)) + if (arch_cmpxchg_niai8(&lp->lock, 0, cpu)) return; continue; } - /* First iteration: check if the lock owner is running. */ - if (first_diag && arch_vcpu_is_preempted(~owner)) { - smp_yield_cpu(~owner); - first_diag = 0; + if (count-- >= 0) continue; - } - /* Loop for a while on the lock value. */ count = spin_retry; - do { - owner = ACCESS_ONCE(lp->lock); - } while (owner && count-- > 0); - if (!owner) - continue; /* * For multiple layers of hypervisors, e.g. z/VM + LPAR * yield the CPU unconditionally. For LPAR rely on the * sense running status. */ - if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) { + if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) smp_yield_cpu(~owner); - first_diag = 0; - } } } EXPORT_SYMBOL(arch_spin_lock_wait); @@ -75,42 +96,36 @@ EXPORT_SYMBOL(arch_spin_lock_wait); void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags) { int cpu = SPINLOCK_LOCKVAL; - int owner, count, first_diag; + int owner, count; local_irq_restore(flags); - first_diag = 1; + + /* Pass the virtual CPU to the lock holder if it is not running */ + owner = arch_load_niai4(&lp->lock); + if (owner && arch_vcpu_is_preempted(~owner)) + smp_yield_cpu(~owner); + + count = spin_retry; while (1) { - owner = ACCESS_ONCE(lp->lock); + owner = arch_load_niai4(&lp->lock); /* Try to get the lock if it is free. */ if (!owner) { local_irq_disable(); - if (__atomic_cmpxchg_bool(&lp->lock, 0, cpu)) + if (arch_cmpxchg_niai8(&lp->lock, 0, cpu)) return; local_irq_restore(flags); continue; } - /* Check if the lock owner is running. */ - if (first_diag && arch_vcpu_is_preempted(~owner)) { - smp_yield_cpu(~owner); - first_diag = 0; + if (count-- >= 0) continue; - } - /* Loop for a while on the lock value. */ count = spin_retry; - do { - owner = ACCESS_ONCE(lp->lock); - } while (owner && count-- > 0); - if (!owner) - continue; /* * For multiple layers of hypervisors, e.g. z/VM + LPAR * yield the CPU unconditionally. For LPAR rely on the * sense running status. */ - if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) { + if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) smp_yield_cpu(~owner); - first_diag = 0; - } } } EXPORT_SYMBOL(arch_spin_lock_wait_flags); diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index b3bd3f23b8e8..4ea9106417ee 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -15,8 +15,30 @@ #include <asm/mmu_context.h> #include <asm/facility.h> +#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES static DEFINE_STATIC_KEY_FALSE(have_mvcos); +static int __init uaccess_init(void) +{ + if (test_facility(27)) + static_branch_enable(&have_mvcos); + return 0; +} +early_initcall(uaccess_init); + +static inline int copy_with_mvcos(void) +{ + if (static_branch_likely(&have_mvcos)) + return 1; + return 0; +} +#else +static inline int copy_with_mvcos(void) +{ + return 1; +} +#endif + static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr, unsigned long size) { @@ -84,7 +106,7 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr, unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) { - if (static_branch_likely(&have_mvcos)) + if (copy_with_mvcos()) return copy_from_user_mvcos(to, from, n); return copy_from_user_mvcp(to, from, n); } @@ -157,7 +179,7 @@ static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x, unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n) { - if (static_branch_likely(&have_mvcos)) + if (copy_with_mvcos()) return copy_to_user_mvcos(to, from, n); return copy_to_user_mvcs(to, from, n); } @@ -220,7 +242,7 @@ static inline unsigned long copy_in_user_mvc(void __user *to, const void __user unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) { - if (static_branch_likely(&have_mvcos)) + if (copy_with_mvcos()) return copy_in_user_mvcos(to, from, n); return copy_in_user_mvc(to, from, n); } @@ -292,7 +314,7 @@ static inline unsigned long clear_user_xc(void __user *to, unsigned long size) unsigned long __clear_user(void __user *to, unsigned long size) { - if (static_branch_likely(&have_mvcos)) + if (copy_with_mvcos()) return clear_user_mvcos(to, size); return clear_user_xc(to, size); } @@ -349,11 +371,3 @@ long __strncpy_from_user(char *dst, const char __user *src, long size) return done; } EXPORT_SYMBOL(__strncpy_from_user); - -static int __init uaccess_init(void) -{ - if (test_facility(27)) - static_branch_enable(&have_mvcos); - return 0; -} -early_initcall(uaccess_init); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 14f25798b001..bdabb013537b 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -135,7 +135,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) pr_alert("AS:%016lx ", asce); switch (asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: - table = table + ((address >> 53) & 0x7ff); + table += (address & _REGION1_INDEX) >> _REGION1_SHIFT; if (bad_address(table)) goto bad; pr_cont("R1:%016lx ", *table); @@ -144,7 +144,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* fallthrough */ case _ASCE_TYPE_REGION2: - table = table + ((address >> 42) & 0x7ff); + table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; if (bad_address(table)) goto bad; pr_cont("R2:%016lx ", *table); @@ -153,7 +153,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* fallthrough */ case _ASCE_TYPE_REGION3: - table = table + ((address >> 31) & 0x7ff); + table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; if (bad_address(table)) goto bad; pr_cont("R3:%016lx ", *table); @@ -162,7 +162,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* fallthrough */ case _ASCE_TYPE_SEGMENT: - table = table + ((address >> 20) & 0x7ff); + table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; if (bad_address(table)) goto bad; pr_cont("S:%016lx ", *table); @@ -170,7 +170,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) goto out; table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); } - table = table + ((address >> 12) & 0xff); + table += (address & _PAGE_INDEX) >> _PAGE_SHIFT; if (bad_address(table)) goto bad; pr_cont("P:%016lx ", *table); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 4fb3d3cdb370..9e1494e3d849 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -36,16 +36,16 @@ static struct gmap *gmap_alloc(unsigned long limit) unsigned long *table; unsigned long etype, atype; - if (limit < (1UL << 31)) { - limit = (1UL << 31) - 1; + if (limit < _REGION3_SIZE) { + limit = _REGION3_SIZE - 1; atype = _ASCE_TYPE_SEGMENT; etype = _SEGMENT_ENTRY_EMPTY; - } else if (limit < (1UL << 42)) { - limit = (1UL << 42) - 1; + } else if (limit < _REGION2_SIZE) { + limit = _REGION2_SIZE - 1; atype = _ASCE_TYPE_REGION3; etype = _REGION3_ENTRY_EMPTY; - } else if (limit < (1UL << 53)) { - limit = (1UL << 53) - 1; + } else if (limit < _REGION1_SIZE) { + limit = _REGION1_SIZE - 1; atype = _ASCE_TYPE_REGION2; etype = _REGION2_ENTRY_EMPTY; } else { @@ -65,7 +65,7 @@ static struct gmap *gmap_alloc(unsigned long limit) spin_lock_init(&gmap->guest_table_lock); spin_lock_init(&gmap->shadow_lock); atomic_set(&gmap->ref_count, 1); - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) goto out_free; page->index = 0; @@ -186,7 +186,7 @@ static void gmap_free(struct gmap *gmap) gmap_flush_tlb(gmap); /* Free all segment & region tables. */ list_for_each_entry_safe(page, next, &gmap->crst_list, lru) - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); gmap_radix_tree_free(&gmap->guest_to_host); gmap_radix_tree_free(&gmap->host_to_guest); @@ -306,7 +306,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, unsigned long *new; /* since we dont free the gmap table until gmap_free we can unlock */ - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; new = (unsigned long *) page_to_phys(page); @@ -321,7 +321,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, } spin_unlock(&gmap->guest_table_lock); if (page) - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return 0; } @@ -546,30 +546,30 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) /* Create higher level tables in the gmap page table */ table = gmap->table; if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { - table += (gaddr >> 53) & 0x7ff; + table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; if ((*table & _REGION_ENTRY_INVALID) && gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, - gaddr & 0xffe0000000000000UL)) + gaddr & _REGION1_MASK)) return -ENOMEM; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { - table += (gaddr >> 42) & 0x7ff; + table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; if ((*table & _REGION_ENTRY_INVALID) && gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, - gaddr & 0xfffffc0000000000UL)) + gaddr & _REGION2_MASK)) return -ENOMEM; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { - table += (gaddr >> 31) & 0x7ff; + table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; if ((*table & _REGION_ENTRY_INVALID) && gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, - gaddr & 0xffffffff80000000UL)) + gaddr & _REGION3_MASK)) return -ENOMEM; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); } - table += (gaddr >> 20) & 0x7ff; + table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; /* Walk the parent mm page table */ mm = gmap->mm; pgd = pgd_offset(mm, vmaddr); @@ -771,7 +771,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, table = gmap->table; switch (gmap->asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: - table += (gaddr >> 53) & 0x7ff; + table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; if (level == 4) break; if (*table & _REGION_ENTRY_INVALID) @@ -779,7 +779,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* Fallthrough */ case _ASCE_TYPE_REGION2: - table += (gaddr >> 42) & 0x7ff; + table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; if (level == 3) break; if (*table & _REGION_ENTRY_INVALID) @@ -787,7 +787,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* Fallthrough */ case _ASCE_TYPE_REGION3: - table += (gaddr >> 31) & 0x7ff; + table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; if (level == 2) break; if (*table & _REGION_ENTRY_INVALID) @@ -795,13 +795,13 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* Fallthrough */ case _ASCE_TYPE_SEGMENT: - table += (gaddr >> 20) & 0x7ff; + table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; if (level == 1) break; if (*table & _REGION_ENTRY_INVALID) return NULL; table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); - table += (gaddr >> 12) & 0xff; + table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT; } return table; } @@ -1126,7 +1126,7 @@ static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ if (!table || *table & _PAGE_INVALID) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1); + gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1); ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); } @@ -1144,7 +1144,7 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, int i; BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < 256; i++, raddr += 1UL << 12) + for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE) pgt[i] = _PAGE_INVALID; } @@ -1164,8 +1164,8 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1); - sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff)); + gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); + sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); *ste = _SEGMENT_ENTRY_EMPTY; @@ -1193,7 +1193,7 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT; - for (i = 0; i < 2048; i++, raddr += 1UL << 20) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) continue; pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); @@ -1222,8 +1222,8 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */ if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1); - r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff)); + gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); + r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); *r3e = _REGION3_ENTRY_EMPTY; @@ -1231,7 +1231,7 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) /* Free segment table */ page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } /** @@ -1251,7 +1251,7 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) r3t | _ASCE_TYPE_REGION3; - for (i = 0; i < 2048; i++, raddr += 1UL << 31) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) continue; sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); @@ -1260,7 +1260,7 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, /* Free segment table */ page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1280,8 +1280,8 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1); - r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff)); + gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); + r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr); r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); *r2e = _REGION2_ENTRY_EMPTY; @@ -1289,7 +1289,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) /* Free region 3 table */ page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } /** @@ -1309,7 +1309,7 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) r2t | _ASCE_TYPE_REGION2; - for (i = 0; i < 2048; i++, raddr += 1UL << 42) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) continue; r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); @@ -1318,7 +1318,7 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, /* Free region 3 table */ page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1338,8 +1338,8 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1); - r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff)); + gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); + r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); *r1e = _REGION1_ENTRY_EMPTY; @@ -1347,7 +1347,7 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) /* Free region 2 table */ page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } /** @@ -1367,7 +1367,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; - for (i = 0; i < 2048; i++, raddr += 1UL << 53) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) continue; r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); @@ -1378,7 +1378,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, /* Free region 2 table */ page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1535,7 +1535,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, /* protect after insertion, so it will get properly invalidated */ down_read(&parent->mm->mmap_sem); rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, - ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096, + ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, PROT_READ, PGSTE_VSIE_BIT); up_read(&parent->mm->mmap_sem); spin_lock(&parent->shadow_lock); @@ -1578,7 +1578,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; page->index = r2t & _REGION_ENTRY_ORIGIN; @@ -1614,10 +1614,10 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, } spin_unlock(&sg->guest_table_lock); /* Make r2t read-only in parent gmap page table */ - raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1; + raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1; origin = r2t & _REGION_ENTRY_ORIGIN; - offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096; - len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); spin_lock(&sg->guest_table_lock); if (!rc) { @@ -1634,7 +1634,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, return rc; out_free: spin_unlock(&sg->guest_table_lock); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return rc; } EXPORT_SYMBOL_GPL(gmap_shadow_r2t); @@ -1662,7 +1662,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; page->index = r3t & _REGION_ENTRY_ORIGIN; @@ -1697,10 +1697,10 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, } spin_unlock(&sg->guest_table_lock); /* Make r3t read-only in parent gmap page table */ - raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2; + raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2; origin = r3t & _REGION_ENTRY_ORIGIN; - offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096; - len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); spin_lock(&sg->guest_table_lock); if (!rc) { @@ -1717,7 +1717,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, return rc; out_free: spin_unlock(&sg->guest_table_lock); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return rc; } EXPORT_SYMBOL_GPL(gmap_shadow_r3t); @@ -1745,7 +1745,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); /* Allocate a shadow segment table */ - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; page->index = sgt & _REGION_ENTRY_ORIGIN; @@ -1781,10 +1781,10 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, } spin_unlock(&sg->guest_table_lock); /* Make sgt read-only in parent gmap page table */ - raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3; + raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3; origin = sgt & _REGION_ENTRY_ORIGIN; - offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096; - len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); spin_lock(&sg->guest_table_lock); if (!rc) { @@ -1801,7 +1801,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, return rc; out_free: spin_unlock(&sg->guest_table_lock); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return rc; } EXPORT_SYMBOL_GPL(gmap_shadow_sgt); @@ -1902,7 +1902,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, } spin_unlock(&sg->guest_table_lock); /* Make pgt read-only in parent gmap page table (not the pgste) */ - raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT; + raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT; origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ); spin_lock(&sg->guest_table_lock); @@ -2021,7 +2021,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, } /* Check for top level table */ start = sg->orig_asce & _ASCE_ORIGIN; - end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096; + end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE; if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && gaddr < end) { /* The complete shadow table has to go */ @@ -2032,7 +2032,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, return; } /* Remove the page table tree from on specific entry */ - head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12); + head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); gmap_for_each_rmap_safe(rmap, rnext, head) { bits = rmap->raddr & _SHADOW_RMAP_MASK; raddr = rmap->raddr ^ bits; @@ -2076,7 +2076,7 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, struct gmap *gmap, *sg, *next; offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); - offset = offset * (4096 / sizeof(pte_t)); + offset = offset * (PAGE_SIZE / sizeof(pte_t)); rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); @@ -2121,6 +2121,37 @@ static inline void thp_split_mm(struct mm_struct *mm) } /* + * Remove all empty zero pages from the mapping for lazy refaulting + * - This must be called after mm->context.has_pgste is set, to avoid + * future creation of zero pages + * - This must be called after THP was enabled + */ +static int __zap_zero_pages(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + unsigned long addr; + + for (addr = start; addr != end; addr += PAGE_SIZE) { + pte_t *ptep; + spinlock_t *ptl; + + ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (is_zero_pfn(pte_pfn(*ptep))) + ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID)); + pte_unmap_unlock(ptep, ptl); + } + return 0; +} + +static inline void zap_zero_pages(struct mm_struct *mm) +{ + struct mm_walk walk = { .pmd_entry = __zap_zero_pages }; + + walk.mm = mm; + walk_page_range(0, TASK_SIZE, &walk); +} + +/* * switch on pgstes for its userspace process (for kvm) */ int s390_enable_sie(void) @@ -2137,6 +2168,7 @@ int s390_enable_sie(void) mm->context.has_pgste = 1; /* split thp mappings and disable thp for future mappings */ thp_split_mm(mm); + zap_zero_pages(mm); up_write(&mm->mmap_sem); return 0; } @@ -2149,13 +2181,6 @@ EXPORT_SYMBOL_GPL(s390_enable_sie); static int __s390_enable_skey(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { - /* - * Remove all zero page mappings, - * after establishing a policy to forbid zero page mappings - * following faults for that page will get fresh anonymous pages - */ - if (is_zero_pfn(pte_pfn(*pte))) - ptep_xchg_direct(walk->mm, addr, pte, __pte(_PAGE_INVALID)); /* Clear storage key */ ptep_zap_key(walk->mm, addr, pte); return 0; diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 8111694ce55a..3b567838b905 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -26,6 +26,7 @@ #include <linux/poison.h> #include <linux/initrd.h> #include <linux/export.h> +#include <linux/cma.h> #include <linux/gfp.h> #include <linux/memblock.h> #include <asm/processor.h> @@ -84,7 +85,7 @@ void __init paging_init(void) psw_t psw; init_mm.pgd = swapper_pg_dir; - if (VMALLOC_END > (1UL << 42)) { + if (VMALLOC_END > _REGION2_SIZE) { asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; pgd_type = _REGION2_ENTRY_EMPTY; } else { @@ -93,8 +94,7 @@ void __init paging_init(void) } init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; S390_lowcore.kernel_asce = init_mm.context.asce; - clear_table((unsigned long *) init_mm.pgd, pgd_type, - sizeof(unsigned long)*2048); + crst_table_init((unsigned long *) init_mm.pgd, pgd_type); vmem_map_init(); /* enable virtual mapping in kernel mode */ @@ -137,6 +137,8 @@ void __init mem_init(void) free_all_bootmem(); setup_zero_pages(); /* Setup zeroed pages. */ + cmma_init_nodat(); + mem_init_print_info(NULL); } @@ -166,6 +168,58 @@ unsigned long memory_block_size_bytes(void) } #ifdef CONFIG_MEMORY_HOTPLUG + +#ifdef CONFIG_CMA + +/* Prevent memory blocks which contain cma regions from going offline */ + +struct s390_cma_mem_data { + unsigned long start; + unsigned long end; +}; + +static int s390_cma_check_range(struct cma *cma, void *data) +{ + struct s390_cma_mem_data *mem_data; + unsigned long start, end; + + mem_data = data; + start = cma_get_base(cma); + end = start + cma_get_size(cma); + if (end < mem_data->start) + return 0; + if (start >= mem_data->end) + return 0; + return -EBUSY; +} + +static int s390_cma_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct s390_cma_mem_data mem_data; + struct memory_notify *arg; + int rc = 0; + + arg = data; + mem_data.start = arg->start_pfn << PAGE_SHIFT; + mem_data.end = mem_data.start + (arg->nr_pages << PAGE_SHIFT); + if (action == MEM_GOING_OFFLINE) + rc = cma_for_each_area(s390_cma_check_range, &mem_data); + return notifier_from_errno(rc); +} + +static struct notifier_block s390_cma_mem_nb = { + .notifier_call = s390_cma_mem_notifier, +}; + +static int __init s390_cma_mem_init(void) +{ + return register_memory_notifier(&s390_cma_mem_nb); +} +device_initcall(s390_cma_mem_init); + +#endif /* CONFIG_CMA */ + int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = PFN_DOWN(start); diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index 69a7b01ae746..07fa7b8ae233 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -10,9 +10,10 @@ #include <linux/errno.h> #include <linux/types.h> #include <linux/mm.h> +#include <linux/memblock.h> #include <linux/gfp.h> #include <linux/init.h> - +#include <asm/facility.h> #include <asm/page-states.h> static int cmma_flag = 1; @@ -36,14 +37,16 @@ __setup("cmma=", cmma); static inline int cmma_test_essa(void) { register unsigned long tmp asm("0") = 0; - register int rc asm("1") = -EOPNOTSUPP; + register int rc asm("1"); + /* test ESSA_GET_STATE */ asm volatile( - " .insn rrf,0xb9ab0000,%1,%1,0,0\n" + " .insn rrf,0xb9ab0000,%1,%1,%2,0\n" "0: la %0,0\n" "1:\n" EX_TABLE(0b,1b) - : "+&d" (rc), "+&d" (tmp)); + : "=&d" (rc), "+&d" (tmp) + : "i" (ESSA_GET_STATE), "0" (-EOPNOTSUPP)); return rc; } @@ -51,11 +54,26 @@ void __init cmma_init(void) { if (!cmma_flag) return; - if (cmma_test_essa()) + if (cmma_test_essa()) { cmma_flag = 0; + return; + } + if (test_facility(147)) + cmma_flag = 2; } -static inline void set_page_unstable(struct page *page, int order) +static inline unsigned char get_page_state(struct page *page) +{ + unsigned char state; + + asm volatile(" .insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (state) + : "a" (page_to_phys(page)), + "i" (ESSA_GET_STATE)); + return state & 0x3f; +} + +static inline void set_page_unused(struct page *page, int order) { int i, rc; @@ -66,14 +84,18 @@ static inline void set_page_unstable(struct page *page, int order) "i" (ESSA_SET_UNUSED)); } -void arch_free_page(struct page *page, int order) +static inline void set_page_stable_dat(struct page *page, int order) { - if (!cmma_flag) - return; - set_page_unstable(page, order); + int i, rc; + + for (i = 0; i < (1 << order); i++) + asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (rc) + : "a" (page_to_phys(page + i)), + "i" (ESSA_SET_STABLE)); } -static inline void set_page_stable(struct page *page, int order) +static inline void set_page_stable_nodat(struct page *page, int order) { int i, rc; @@ -81,14 +103,154 @@ static inline void set_page_stable(struct page *page, int order) asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" : "=&d" (rc) : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE)); + "i" (ESSA_SET_STABLE_NODAT)); +} + +static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + pmd_t *pmd; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd) || pmd_large(*pmd)) + continue; + page = virt_to_page(pmd_val(*pmd)); + set_bit(PG_arch_1, &page->flags); + } while (pmd++, addr = next, addr != end); +} + +static void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + pud_t *pud; + int i; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none(*pud) || pud_large(*pud)) + continue; + if (!pud_folded(*pud)) { + page = virt_to_page(pud_val(*pud)); + for (i = 0; i < 3; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_pmd(pud, addr, next); + } while (pud++, addr = next, addr != end); +} + +static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + p4d_t *p4d; + int i; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none(*p4d)) + continue; + if (!p4d_folded(*p4d)) { + page = virt_to_page(p4d_val(*p4d)); + for (i = 0; i < 3; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_pud(p4d, addr, next); + } while (p4d++, addr = next, addr != end); +} + +static void mark_kernel_pgd(void) +{ + unsigned long addr, next; + struct page *page; + pgd_t *pgd; + int i; + + addr = 0; + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, MODULES_END); + if (pgd_none(*pgd)) + continue; + if (!pgd_folded(*pgd)) { + page = virt_to_page(pgd_val(*pgd)); + for (i = 0; i < 3; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_p4d(pgd, addr, next); + } while (pgd++, addr = next, addr != MODULES_END); +} + +void __init cmma_init_nodat(void) +{ + struct memblock_region *reg; + struct page *page; + unsigned long start, end, ix; + + if (cmma_flag < 2) + return; + /* Mark pages used in kernel page tables */ + mark_kernel_pgd(); + + /* Set all kernel pages not used for page tables to stable/no-dat */ + for_each_memblock(memory, reg) { + start = memblock_region_memory_base_pfn(reg); + end = memblock_region_memory_end_pfn(reg); + page = pfn_to_page(start); + for (ix = start; ix < end; ix++, page++) { + if (__test_and_clear_bit(PG_arch_1, &page->flags)) + continue; /* skip page table pages */ + if (!list_empty(&page->lru)) + continue; /* skip free pages */ + set_page_stable_nodat(page, 0); + } + } +} + +void arch_free_page(struct page *page, int order) +{ + if (!cmma_flag) + return; + set_page_unused(page, order); } void arch_alloc_page(struct page *page, int order) { if (!cmma_flag) return; - set_page_stable(page, order); + if (cmma_flag < 2) + set_page_stable_dat(page, order); + else + set_page_stable_nodat(page, order); +} + +void arch_set_page_dat(struct page *page, int order) +{ + if (!cmma_flag) + return; + set_page_stable_dat(page, order); +} + +void arch_set_page_nodat(struct page *page, int order) +{ + if (cmma_flag < 2) + return; + set_page_stable_nodat(page, order); +} + +int arch_test_page_nodat(struct page *page) +{ + unsigned char state; + + if (cmma_flag < 2) + return 0; + state = get_page_state(page); + return !!(state & 0x20); } void arch_set_page_states(int make_stable) @@ -108,9 +270,9 @@ void arch_set_page_states(int make_stable) list_for_each(l, &zone->free_area[order].free_list[t]) { page = list_entry(l, struct page, lru); if (make_stable) - set_page_stable(page, order); + set_page_stable_dat(page, 0); else - set_page_unstable(page, order); + set_page_unused(page, order); } } spin_unlock_irqrestore(&zone->lock, flags); diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 180481589246..552f898dfa74 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -7,6 +7,7 @@ #include <asm/cacheflush.h> #include <asm/facility.h> #include <asm/pgtable.h> +#include <asm/pgalloc.h> #include <asm/page.h> #include <asm/set_memory.h> @@ -191,7 +192,7 @@ static int split_pud_page(pud_t *pudp, unsigned long addr) pud_t new; int i, ro, nx; - pm_dir = vmem_pmd_alloc(); + pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); if (!pm_dir) return -ENOMEM; pmd_addr = pud_pfn(*pudp) << PAGE_SHIFT; @@ -328,7 +329,7 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr) return; } for (i = 0; i < nr; i++) { - __ptep_ipte(address, pte, IPTE_GLOBAL); + __ptep_ipte(address, pte, 0, 0, IPTE_GLOBAL); address += PAGE_SIZE; pte++; } diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 18918e394ce4..c5b74dd61197 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -57,6 +57,7 @@ unsigned long *crst_table_alloc(struct mm_struct *mm) if (!page) return NULL; + arch_set_page_dat(page, 2); return (unsigned long *) page_to_phys(page); } @@ -82,7 +83,7 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) int rc, notify; /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ - BUG_ON(mm->context.asce_limit < (1UL << 42)); + BUG_ON(mm->context.asce_limit < _REGION2_SIZE); if (end >= TASK_SIZE_MAX) return -ENOMEM; rc = 0; @@ -95,11 +96,11 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) } spin_lock_bh(&mm->page_table_lock); pgd = (unsigned long *) mm->pgd; - if (mm->context.asce_limit == (1UL << 42)) { + if (mm->context.asce_limit == _REGION2_SIZE) { crst_table_init(table, _REGION2_ENTRY_EMPTY); p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd); mm->pgd = (pgd_t *) table; - mm->context.asce_limit = 1UL << 53; + mm->context.asce_limit = _REGION1_SIZE; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION2; } else { @@ -123,7 +124,7 @@ void crst_table_downgrade(struct mm_struct *mm) pgd_t *pgd; /* downgrade should only happen from 3 to 2 levels (compat only) */ - BUG_ON(mm->context.asce_limit != (1UL << 42)); + BUG_ON(mm->context.asce_limit != _REGION2_SIZE); if (current->active_mm == mm) { clear_user_asce(); @@ -132,7 +133,7 @@ void crst_table_downgrade(struct mm_struct *mm) pgd = mm->pgd; mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); - mm->context.asce_limit = 1UL << 31; + mm->context.asce_limit = _REGION3_SIZE; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; crst_table_free(mm, (unsigned long *) pgd); @@ -214,6 +215,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) __free_page(page); return NULL; } + arch_set_page_dat(page, 0); /* Initialize page table */ table = (unsigned long *) page_to_phys(page); if (mm_alloc_pgste(mm)) { diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 4a1f7366b17a..4198a71b8fdd 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -25,8 +25,49 @@ #include <asm/mmu_context.h> #include <asm/page-states.h> +static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int nodat) +{ + unsigned long opt, asce; + + if (MACHINE_HAS_TLB_GUEST) { + opt = 0; + asce = READ_ONCE(mm->context.gmap_asce); + if (asce == 0UL || nodat) + opt |= IPTE_NODAT; + if (asce != -1UL) { + asce = asce ? : mm->context.asce; + opt |= IPTE_GUEST_ASCE; + } + __ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL); + } else { + __ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL); + } +} + +static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int nodat) +{ + unsigned long opt, asce; + + if (MACHINE_HAS_TLB_GUEST) { + opt = 0; + asce = READ_ONCE(mm->context.gmap_asce); + if (asce == 0UL || nodat) + opt |= IPTE_NODAT; + if (asce != -1UL) { + asce = asce ? : mm->context.asce; + opt |= IPTE_GUEST_ASCE; + } + __ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL); + } else { + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + } +} + static inline pte_t ptep_flush_direct(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + int nodat) { pte_t old; @@ -36,15 +77,16 @@ static inline pte_t ptep_flush_direct(struct mm_struct *mm, atomic_inc(&mm->context.flush_count); if (MACHINE_HAS_TLB_LC && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __ptep_ipte(addr, ptep, IPTE_LOCAL); + ptep_ipte_local(mm, addr, ptep, nodat); else - __ptep_ipte(addr, ptep, IPTE_GLOBAL); + ptep_ipte_global(mm, addr, ptep, nodat); atomic_dec(&mm->context.flush_count); return old; } static inline pte_t ptep_flush_lazy(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + int nodat) { pte_t old; @@ -57,7 +99,7 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, pte_val(*ptep) |= _PAGE_INVALID; mm->context.flush_mm = 1; } else - __ptep_ipte(addr, ptep, IPTE_GLOBAL); + ptep_ipte_global(mm, addr, ptep, nodat); atomic_dec(&mm->context.flush_count); return old; } @@ -229,10 +271,12 @@ pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, { pgste_t pgste; pte_t old; + int nodat; preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); - old = ptep_flush_direct(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_direct(mm, addr, ptep, nodat); old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); preempt_enable(); return old; @@ -244,10 +288,12 @@ pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, { pgste_t pgste; pte_t old; + int nodat; preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); - old = ptep_flush_lazy(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_lazy(mm, addr, ptep, nodat); old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); preempt_enable(); return old; @@ -259,10 +305,12 @@ pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, { pgste_t pgste; pte_t old; + int nodat; preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); - old = ptep_flush_lazy(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_lazy(mm, addr, ptep, nodat); if (mm_has_pgste(mm)) { pgste = pgste_update_all(old, pgste, mm); pgste_set(ptep, pgste); @@ -290,6 +338,28 @@ void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(ptep_modify_prot_commit); +static inline void pmdp_idte_local(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_LOCAL); + else + __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); +} + +static inline void pmdp_idte_global(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); + else + __pmdp_csp(pmdp); +} + static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { @@ -298,16 +368,12 @@ static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, old = *pmdp; if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) return old; - if (!MACHINE_HAS_IDTE) { - __pmdp_csp(pmdp); - return old; - } atomic_inc(&mm->context.flush_count); if (MACHINE_HAS_TLB_LC && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __pmdp_idte(addr, pmdp, IDTE_LOCAL); + pmdp_idte_local(mm, addr, pmdp); else - __pmdp_idte(addr, pmdp, IDTE_GLOBAL); + pmdp_idte_global(mm, addr, pmdp); atomic_dec(&mm->context.flush_count); return old; } @@ -325,10 +391,9 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, cpumask_of(smp_processor_id()))) { pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID; mm->context.flush_mm = 1; - } else if (MACHINE_HAS_IDTE) - __pmdp_idte(addr, pmdp, IDTE_GLOBAL); - else - __pmdp_csp(pmdp); + } else { + pmdp_idte_global(mm, addr, pmdp); + } atomic_dec(&mm->context.flush_count); return old; } @@ -359,28 +424,46 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(pmdp_xchg_lazy); -static inline pud_t pudp_flush_direct(struct mm_struct *mm, - unsigned long addr, pud_t *pudp) +static inline void pudp_idte_local(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) { - pud_t old; + if (MACHINE_HAS_TLB_GUEST) + __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_LOCAL); + else + __pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL); +} - old = *pudp; - if (pud_val(old) & _REGION_ENTRY_INVALID) - return old; - if (!MACHINE_HAS_IDTE) { +static inline void pudp_idte_global(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); + else /* * Invalid bit position is the same for pmd and pud, so we can * re-use _pmd_csp() here */ __pmdp_csp((pmd_t *) pudp); +} + +static inline pud_t pudp_flush_direct(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + pud_t old; + + old = *pudp; + if (pud_val(old) & _REGION_ENTRY_INVALID) return old; - } atomic_inc(&mm->context.flush_count); if (MACHINE_HAS_TLB_LC && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __pudp_idte(addr, pudp, IDTE_LOCAL); + pudp_idte_local(mm, addr, pudp); else - __pudp_idte(addr, pudp, IDTE_GLOBAL); + pudp_idte_global(mm, addr, pudp); atomic_dec(&mm->context.flush_count); return old; } @@ -482,7 +565,7 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, { pte_t entry; pgste_t pgste; - int pte_i, pte_p; + int pte_i, pte_p, nodat; pgste = pgste_get_lock(ptep); entry = *ptep; @@ -495,13 +578,14 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, return -EAGAIN; } /* Change access rights and set pgste bit */ + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); if (prot == PROT_NONE && !pte_i) { - ptep_flush_direct(mm, addr, ptep); + ptep_flush_direct(mm, addr, ptep, nodat); pgste = pgste_update_all(entry, pgste, mm); pte_val(entry) |= _PAGE_INVALID; } if (prot == PROT_READ && !pte_p) { - ptep_flush_direct(mm, addr, ptep); + ptep_flush_direct(mm, addr, ptep, nodat); pte_val(entry) &= ~_PAGE_INVALID; pte_val(entry) |= _PAGE_PROTECT; } @@ -541,10 +625,12 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) { pgste_t pgste; + int nodat; pgste = pgste_get_lock(ptep); /* notifier is called by the caller */ - ptep_flush_direct(mm, saddr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + ptep_flush_direct(mm, saddr, ptep, nodat); /* don't touch the storage key - it belongs to parent pgste */ pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID)); pgste_set_unlock(ptep, pgste); @@ -617,6 +703,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) pte_t *ptep; pte_t pte; bool dirty; + int nodat; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); @@ -645,7 +732,8 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) pte = *ptep; if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { pgste = pgste_pte_notify(mm, addr, ptep, pgste); - __ptep_ipte(addr, ptep, IPTE_GLOBAL); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + ptep_ipte_global(mm, addr, ptep, nodat); if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) pte_val(pte) |= _PAGE_PROTECT; else diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index d8398962a723..c0af0d7b6e5f 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -38,37 +38,14 @@ static void __ref *vmem_alloc_pages(unsigned int order) return (void *) memblock_alloc(size, size); } -static inline p4d_t *vmem_p4d_alloc(void) +void *vmem_crst_alloc(unsigned long val) { - p4d_t *p4d = NULL; + unsigned long *table; - p4d = vmem_alloc_pages(2); - if (!p4d) - return NULL; - clear_table((unsigned long *) p4d, _REGION2_ENTRY_EMPTY, PAGE_SIZE * 4); - return p4d; -} - -static inline pud_t *vmem_pud_alloc(void) -{ - pud_t *pud = NULL; - - pud = vmem_alloc_pages(2); - if (!pud) - return NULL; - clear_table((unsigned long *) pud, _REGION3_ENTRY_EMPTY, PAGE_SIZE * 4); - return pud; -} - -pmd_t *vmem_pmd_alloc(void) -{ - pmd_t *pmd = NULL; - - pmd = vmem_alloc_pages(2); - if (!pmd) - return NULL; - clear_table((unsigned long *) pmd, _SEGMENT_ENTRY_EMPTY, PAGE_SIZE * 4); - return pmd; + table = vmem_alloc_pages(CRST_ALLOC_ORDER); + if (table) + crst_table_init(table, val); + return table; } pte_t __ref *vmem_pte_alloc(void) @@ -114,14 +91,14 @@ static int vmem_add_mem(unsigned long start, unsigned long size) while (address < end) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { - p4_dir = vmem_p4d_alloc(); + p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); if (!p4_dir) goto out; pgd_populate(&init_mm, pg_dir, p4_dir); } p4_dir = p4d_offset(pg_dir, address); if (p4d_none(*p4_dir)) { - pu_dir = vmem_pud_alloc(); + pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); if (!pu_dir) goto out; p4d_populate(&init_mm, p4_dir, pu_dir); @@ -136,7 +113,7 @@ static int vmem_add_mem(unsigned long start, unsigned long size) continue; } if (pud_none(*pu_dir)) { - pm_dir = vmem_pmd_alloc(); + pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); if (!pm_dir) goto out; pud_populate(&init_mm, pu_dir, pm_dir); @@ -253,7 +230,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) for (address = start; address < end;) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { - p4_dir = vmem_p4d_alloc(); + p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); if (!p4_dir) goto out; pgd_populate(&init_mm, pg_dir, p4_dir); @@ -261,7 +238,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) p4_dir = p4d_offset(pg_dir, address); if (p4d_none(*p4_dir)) { - pu_dir = vmem_pud_alloc(); + pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); if (!pu_dir) goto out; p4d_populate(&init_mm, p4_dir, pu_dir); @@ -269,7 +246,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) pu_dir = pud_offset(p4_dir, address); if (pud_none(*pu_dir)) { - pm_dir = vmem_pmd_alloc(); + pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); if (!pm_dir) goto out; pud_populate(&init_mm, pu_dir, pm_dir); diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index bd534b4d40e3..0ae3936e266f 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -24,6 +24,14 @@ bool zpci_unique_uid; +static void update_uid_checking(bool new) +{ + if (zpci_unique_uid != new) + zpci_dbg(1, "uid checking:%d\n", new); + + zpci_unique_uid = new; +} + static inline void zpci_err_clp(unsigned int rsp, int rc) { struct { @@ -319,7 +327,7 @@ static int clp_list_pci(struct clp_req_rsp_list_pci *rrb, void *data, goto out; } - zpci_unique_uid = rrb->response.uid_checking; + update_uid_checking(rrb->response.uid_checking); WARN_ON_ONCE(rrb->response.entry_size != sizeof(struct clp_fh_list_entry)); diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 025ea20fc4b4..29d72bf8ed2b 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -41,7 +41,7 @@ static struct facility_def facility_defs[] = { 27, /* mvcos */ 32, /* compare and swap and store */ 33, /* compare and swap and store 2 */ - 34, /* general extension facility */ + 34, /* general instructions extension */ 35, /* execute extensions */ #endif #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES @@ -54,6 +54,9 @@ static struct facility_def facility_defs[] = { #ifdef CONFIG_HAVE_MARCH_Z13_FEATURES 53, /* load-and-zero-rightmost-byte, etc. */ #endif +#ifdef CONFIG_HAVE_MARCH_Z14_FEATURES + 58, /* miscellaneous-instruction-extension 2 */ +#endif -1 /* END */ } }, diff --git a/arch/sh/include/asm/futex.h b/arch/sh/include/asm/futex.h index d0078747d308..8f8cf941a8cd 100644 --- a/arch/sh/include/asm/futex.h +++ b/arch/sh/include/asm/futex.h @@ -27,21 +27,12 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval); } -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - u32 oparg = (encoded_op << 8) >> 20; - u32 cmparg = (encoded_op << 20) >> 20; u32 oldval, newval, prev; int ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); do { @@ -80,17 +71,8 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = ((int)oldval < (int)cmparg); break; - case FUTEX_OP_CMP_GE: ret = ((int)oldval >= (int)cmparg); break; - case FUTEX_OP_CMP_LE: ret = ((int)oldval <= (int)cmparg); break; - case FUTEX_OP_CMP_GT: ret = ((int)oldval > (int)cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; return ret; } diff --git a/arch/sh/include/asm/spinlock-cas.h b/arch/sh/include/asm/spinlock-cas.h index c46e8cc7b515..5ed7dbbd94ff 100644 --- a/arch/sh/include/asm/spinlock-cas.h +++ b/arch/sh/include/asm/spinlock-cas.h @@ -29,11 +29,6 @@ static inline unsigned __sl_cas(volatile unsigned *p, unsigned old, unsigned new #define arch_spin_is_locked(x) ((x)->lock <= 0) #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, VAL > 0); -} - static inline void arch_spin_lock(arch_spinlock_t *lock) { while (!__sl_cas(&lock->lock, 1, 0)); diff --git a/arch/sh/include/asm/spinlock-llsc.h b/arch/sh/include/asm/spinlock-llsc.h index cec78143fa83..f77263aae760 100644 --- a/arch/sh/include/asm/spinlock-llsc.h +++ b/arch/sh/include/asm/spinlock-llsc.h @@ -21,11 +21,6 @@ #define arch_spin_is_locked(x) ((x)->lock <= 0) #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, VAL > 0); -} - /* * Simple spin lock operations. There are two variants, one clears IRQ's * on the local processor, one does not. diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h index ee3f11c43cda..7643e979e333 100644 --- a/arch/sparc/include/asm/atomic_32.h +++ b/arch/sparc/include/asm/atomic_32.h @@ -29,6 +29,8 @@ int atomic_xchg(atomic_t *, int); int __atomic_add_unless(atomic_t *, int, int); void atomic_set(atomic_t *, int); +#define atomic_set_release(v, i) atomic_set((v), (i)) + #define atomic_read(v) ACCESS_ONCE((v)->counter) #define atomic_add(i, v) ((void)atomic_add_return( (int)(i), (v))) diff --git a/arch/sparc/include/asm/futex_64.h b/arch/sparc/include/asm/futex_64.h index 4e899b0dabf7..1cfd89d92208 100644 --- a/arch/sparc/include/asm/futex_64.h +++ b/arch/sparc/include/asm/futex_64.h @@ -29,22 +29,14 @@ : "r" (uaddr), "r" (oparg), "i" (-EFAULT) \ : "memory") -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tem; - if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) - return -EFAULT; if (unlikely((((unsigned long) uaddr) & 0x3UL))) return -EINVAL; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - pagefault_disable(); switch (op) { @@ -69,17 +61,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h index 8011e79f59c9..67345b2dc408 100644 --- a/arch/sparc/include/asm/spinlock_32.h +++ b/arch/sparc/include/asm/spinlock_32.h @@ -14,11 +14,6 @@ #define arch_spin_is_locked(lock) (*((volatile unsigned char *)(lock)) != 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - static inline void arch_spin_lock(arch_spinlock_t *lock) { __asm__ __volatile__( diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h index a93774255136..53a423e7cb92 100644 --- a/arch/tile/include/asm/atomic_32.h +++ b/arch/tile/include/asm/atomic_32.h @@ -101,6 +101,8 @@ static inline void atomic_set(atomic_t *v, int n) _atomic_xchg(&v->counter, n); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + /* A 64bit atomic type */ typedef struct { diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h index e64a1b75fc38..83c1e639b411 100644 --- a/arch/tile/include/asm/futex.h +++ b/arch/tile/include/asm/futex.h @@ -106,12 +106,9 @@ lock = __atomic_hashed_lock((int __force *)uaddr) #endif -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int uninitialized_var(val), ret; __futex_prolog(); @@ -119,12 +116,6 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) /* The 32-bit futex code makes this assumption, so validate it here. */ BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int)); - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { case FUTEX_OP_SET: @@ -148,30 +139,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) } pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (val == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (val != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (val < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (val >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (val <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (val > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = val; + return ret; } diff --git a/arch/tile/include/asm/spinlock_32.h b/arch/tile/include/asm/spinlock_32.h index b14b1ba5bf9c..cba8ba9b8da6 100644 --- a/arch/tile/include/asm/spinlock_32.h +++ b/arch/tile/include/asm/spinlock_32.h @@ -64,8 +64,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) lock->current_ticket = old_ticket + TICKET_QUANTUM; } -void arch_spin_unlock_wait(arch_spinlock_t *lock); - /* * Read-write spinlocks, allowing multiple readers * but only one writer. diff --git a/arch/tile/include/asm/spinlock_64.h b/arch/tile/include/asm/spinlock_64.h index b9718fb4e74a..9a2c2d605752 100644 --- a/arch/tile/include/asm/spinlock_64.h +++ b/arch/tile/include/asm/spinlock_64.h @@ -58,8 +58,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) __insn_fetchadd4(&lock->lock, 1U << __ARCH_SPIN_CURRENT_SHIFT); } -void arch_spin_unlock_wait(arch_spinlock_t *lock); - void arch_spin_lock_slow(arch_spinlock_t *lock, u32 val); /* Grab the "next" ticket number and bump it atomically. diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c index 076c6cc43113..db9333f2447c 100644 --- a/arch/tile/lib/spinlock_32.c +++ b/arch/tile/lib/spinlock_32.c @@ -62,29 +62,6 @@ int arch_spin_trylock(arch_spinlock_t *lock) } EXPORT_SYMBOL(arch_spin_trylock); -void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - u32 iterations = 0; - int curr = READ_ONCE(lock->current_ticket); - int next = READ_ONCE(lock->next_ticket); - - /* Return immediately if unlocked. */ - if (next == curr) - return; - - /* Wait until the current locker has released the lock. */ - do { - delay_backoff(iterations++); - } while (READ_ONCE(lock->current_ticket) == curr); - - /* - * The TILE architecture doesn't do read speculation; therefore - * a control dependency guarantees a LOAD->{LOAD,STORE} order. - */ - barrier(); -} -EXPORT_SYMBOL(arch_spin_unlock_wait); - /* * The low byte is always reserved to be the marker for a "tns" operation * since the low bit is set to "1" by a tns. The next seven bits are diff --git a/arch/tile/lib/spinlock_64.c b/arch/tile/lib/spinlock_64.c index a4b5b2cbce93..de414c22892f 100644 --- a/arch/tile/lib/spinlock_64.c +++ b/arch/tile/lib/spinlock_64.c @@ -62,28 +62,6 @@ int arch_spin_trylock(arch_spinlock_t *lock) } EXPORT_SYMBOL(arch_spin_trylock); -void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - u32 iterations = 0; - u32 val = READ_ONCE(lock->lock); - u32 curr = arch_spin_current(val); - - /* Return immediately if unlocked. */ - if (arch_spin_next(val) == curr) - return; - - /* Wait until the current locker has released the lock. */ - do { - delay_backoff(iterations++); - } while (arch_spin_current(READ_ONCE(lock->lock)) == curr); - - /* - * The TILE architecture doesn't do read speculation; therefore - * a control dependency guarantees a LOAD->{LOAD,STORE} order. - */ - barrier(); -} -EXPORT_SYMBOL(arch_spin_unlock_wait); /* * If the read lock fails due to a writer, we retry periodically diff --git a/arch/um/include/asm/unwind.h b/arch/um/include/asm/unwind.h new file mode 100644 index 000000000000..7ffa5437b761 --- /dev/null +++ b/arch/um/include/asm/unwind.h @@ -0,0 +1,8 @@ +#ifndef _ASM_UML_UNWIND_H +#define _ASM_UML_UNWIND_H + +static inline void +unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size) {} + +#endif /* _ASM_UML_UNWIND_H */ diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 586b786b3edf..f65a804b86f0 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -10,9 +10,6 @@ obj-$(CONFIG_XEN) += xen/ # Hyper-V paravirtualization support obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/ -# lguest paravirtualization support -obj-$(CONFIG_LGUEST_GUEST) += lguest/ - obj-y += realmode/ obj-y += kernel/ obj-y += mm/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 323cb065be5e..acb366bf6bc1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -55,6 +55,8 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_PMEM_API if X86_64 + # Causing hangs/crashes, see the commit that added this change for details. + select ARCH_HAS_REFCOUNT if BROKEN select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN @@ -73,7 +75,6 @@ config X86 select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH - select ARCH_WANT_FRAME_POINTERS select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_EXTABLE_SORT @@ -158,6 +159,7 @@ config X86 select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS + select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_OPROFILE select HAVE_OPTPROBES @@ -167,8 +169,9 @@ config X86 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER && STACK_VALIDATION + select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION select HAVE_STACK_VALIDATION if X86_64 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK @@ -327,6 +330,7 @@ config FIX_EARLYCON_MEM config PGTABLE_LEVELS int + default 5 if X86_5LEVEL default 4 if X86_64 default 3 if X86_PAE default 2 @@ -425,16 +429,16 @@ config GOLDFISH def_bool y depends on X86_GOLDFISH -config INTEL_RDT_A - bool "Intel Resource Director Technology Allocation support" +config INTEL_RDT + bool "Intel Resource Director Technology support" default n depends on X86 && CPU_SUP_INTEL select KERNFS help - Select to enable resource allocation which is a sub-feature of - Intel Resource Director Technology(RDT). More information about - RDT can be found in the Intel x86 Architecture Software - Developer Manual. + Select to enable resource allocation and monitoring which are + sub-features of Intel Resource Director Technology(RDT). More + information about RDT can be found in the Intel x86 + Architecture Software Developer Manual. Say N if unsure. @@ -778,8 +782,6 @@ config KVM_DEBUG_FS Statistics are displayed in debugfs filesystem. Enabling this option may incur significant overhead. -source "arch/x86/lguest/Kconfig" - config PARAVIRT_TIME_ACCOUNTING bool "Paravirtual steal time accounting" depends on PARAVIRT @@ -1399,6 +1401,24 @@ config X86_PAE has the cost of more pagetable lookup overhead, and also consumes more pagetable space per process. +config X86_5LEVEL + bool "Enable 5-level page tables support" + depends on X86_64 + ---help--- + 5-level paging enables access to larger address space: + upto 128 PiB of virtual address space and 4 PiB of + physical address space. + + It will be supported by future Intel CPUs. + + Note: a kernel with this option enabled can only be booted + on machines that support the feature. + + See Documentation/x86/x86_64/5level-paging.txt for more + information. + + Say N if unsure. + config ARCH_PHYS_ADDR_T_64BIT def_bool y depends on X86_64 || X86_PAE @@ -1416,6 +1436,35 @@ config X86_DIRECT_GBPAGES supports them), so don't confuse the user by printing that we have them enabled. +config ARCH_HAS_MEM_ENCRYPT + def_bool y + +config AMD_MEM_ENCRYPT + bool "AMD Secure Memory Encryption (SME) support" + depends on X86_64 && CPU_SUP_AMD + ---help--- + Say yes to enable support for the encryption of system memory. + This requires an AMD processor that supports Secure Memory + Encryption (SME). + +config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT + bool "Activate AMD Secure Memory Encryption (SME) by default" + default y + depends on AMD_MEM_ENCRYPT + ---help--- + Say yes to have system memory encrypted by default if running on + an AMD processor that supports Secure Memory Encryption (SME). + + If set to Y, then the encryption of system memory can be + deactivated with the mem_encrypt=off command line option. + + If set to N, then the encryption of system memory can be + activated with the mem_encrypt=on command line option. + +config ARCH_USE_MEMREMAP_PROT + def_bool y + depends on AMD_MEM_ENCRYPT + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index cd20ca0b4043..71a48a30fc84 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -305,8 +305,6 @@ config DEBUG_ENTRY Some of these sanity checks may slow down kernel entries and exits or otherwise impact performance. - This is currently used to help test NMI code. - If unsure, say N. config DEBUG_NMI_SELFTEST @@ -358,4 +356,61 @@ config PUNIT_ATOM_DEBUG The current power state can be read from /sys/kernel/debug/punit_atom/dev_power_state +choice + prompt "Choose kernel unwinder" + default FRAME_POINTER_UNWINDER + ---help--- + This determines which method will be used for unwinding kernel stack + traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack, + livepatch, lockdep, and more. + +config FRAME_POINTER_UNWINDER + bool "Frame pointer unwinder" + select FRAME_POINTER + ---help--- + This option enables the frame pointer unwinder for unwinding kernel + stack traces. + + The unwinder itself is fast and it uses less RAM than the ORC + unwinder, but the kernel text size will grow by ~3% and the kernel's + overall performance will degrade by roughly 5-10%. + + This option is recommended if you want to use the livepatch + consistency model, as this is currently the only way to get a + reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). + +config ORC_UNWINDER + bool "ORC unwinder" + depends on X86_64 + select STACK_VALIDATION + ---help--- + This option enables the ORC (Oops Rewind Capability) unwinder for + unwinding kernel stack traces. It uses a custom data format which is + a simplified version of the DWARF Call Frame Information standard. + + This unwinder is more accurate across interrupt entry frames than the + frame pointer unwinder. It also enables a 5-10% performance + improvement across the entire kernel compared to frame pointers. + + Enabling this option will increase the kernel's runtime memory usage + by roughly 2-4MB, depending on your kernel config. + +config GUESS_UNWINDER + bool "Guess unwinder" + depends on EXPERT + ---help--- + This option enables the "guess" unwinder for unwinding kernel stack + traces. It scans the stack and reports every kernel text address it + finds. Some of the addresses it reports may be incorrect. + + While this option often produces false positives, it can still be + useful in many cases. Unlike the other unwinders, it has no runtime + overhead. + +endchoice + +config FRAME_POINTER + depends on !ORC_UNWINDER && !GUESS_UNWINDER + bool + endmenu diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1e902f926be3..6276572259c8 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -14,9 +14,11 @@ endif # For gcc stack alignment is specified with -mpreferred-stack-boundary, # clang has the option -mstack-alignment for that purpose. ifneq ($(call cc-option, -mpreferred-stack-boundary=4),) - cc_stack_align_opt := -mpreferred-stack-boundary -else ifneq ($(call cc-option, -mstack-alignment=4),) - cc_stack_align_opt := -mstack-alignment + cc_stack_align4 := -mpreferred-stack-boundary=2 + cc_stack_align8 := -mpreferred-stack-boundary=3 +else ifneq ($(call cc-option, -mstack-alignment=16),) + cc_stack_align4 := -mstack-alignment=4 + cc_stack_align8 := -mstack-alignment=8 endif # How to compile the 16-bit code. Note we always compile for -march=i386; @@ -36,7 +38,7 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector) -REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align_opt)=2) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4)) export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit @@ -76,7 +78,7 @@ ifeq ($(CONFIG_X86_32),y) # Align the stack to the register width instead of using the default # alignment of 16 bytes. This reduces stack usage and the number of # alignment instructions. - KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=2) + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align4)) # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots: @@ -115,7 +117,7 @@ else # default alignment which keep the stack *mis*aligned. # Furthermore an alignment to the register width reduces stack usage # and the number of alignment instructions. - KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=3) + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align8)) # Use -mskip-rax-setup if supported. KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) @@ -232,9 +234,6 @@ KBUILD_CFLAGS += -Wno-sign-compare # KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -KBUILD_CFLAGS += $(mflags-y) -KBUILD_AFLAGS += $(mflags-y) - archscripts: scripts_basic $(Q)$(MAKE) $(build)=arch/x86/tools relocs diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index c3e869eaef0c..926c2cc4facc 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -767,7 +767,7 @@ static efi_status_t setup_e820(struct boot_params *params, m |= (u64)efi->efi_memmap_hi << 32; #endif - d = (efi_memory_desc_t *)(m + (i * efi->efi_memdesc_size)); + d = efi_early_memdesc_ptr(m, efi->efi_memdesc_size, i); switch (d->type) { case EFI_RESERVED_TYPE: case EFI_RUNTIME_SERVICES_CODE: @@ -1058,7 +1058,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = DESC_TYPE_CODE_DATA; desc->dpl = 0; desc->p = 1; - desc->limit = 0xf; + desc->limit1 = 0xf; desc->avl = 0; desc->l = 0; desc->d = SEG_OP_SIZE_32BIT; @@ -1078,7 +1078,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = DESC_TYPE_CODE_DATA; desc->dpl = 0; desc->p = 1; - desc->limit = 0xf; + desc->limit1 = 0xf; desc->avl = 0; if (IS_ENABLED(CONFIG_X86_64)) { desc->l = 1; @@ -1099,7 +1099,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = DESC_TYPE_CODE_DATA; desc->dpl = 0; desc->p = 1; - desc->limit = 0xf; + desc->limit1 = 0xf; desc->avl = 0; desc->l = 0; desc->d = SEG_OP_SIZE_32BIT; @@ -1116,7 +1116,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = 0; desc->dpl = 0; desc->p = 1; - desc->limit = 0x0; + desc->limit1 = 0x0; desc->avl = 0; desc->l = 0; desc->d = 0; diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index d85b9625e836..11c68cf53d4e 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -61,71 +61,6 @@ __HEAD ENTRY(startup_32) -#ifdef CONFIG_EFI_STUB - jmp preferred_addr - - /* - * We don't need the return address, so set up the stack so - * efi_main() can find its arguments. - */ -ENTRY(efi_pe_entry) - add $0x4, %esp - - call 1f -1: popl %esi - subl $1b, %esi - - popl %ecx - movl %ecx, efi32_config(%esi) /* Handle */ - popl %ecx - movl %ecx, efi32_config+8(%esi) /* EFI System table pointer */ - - /* Relocate efi_config->call() */ - leal efi32_config(%esi), %eax - add %esi, 40(%eax) - pushl %eax - - call make_boot_params - cmpl $0, %eax - je fail - movl %esi, BP_code32_start(%eax) - popl %ecx - pushl %eax - pushl %ecx - jmp 2f /* Skip efi_config initialization */ - -ENTRY(efi32_stub_entry) - add $0x4, %esp - popl %ecx - popl %edx - - call 1f -1: popl %esi - subl $1b, %esi - - movl %ecx, efi32_config(%esi) /* Handle */ - movl %edx, efi32_config+8(%esi) /* EFI System table pointer */ - - /* Relocate efi_config->call() */ - leal efi32_config(%esi), %eax - add %esi, 40(%eax) - pushl %eax -2: - call efi_main - cmpl $0, %eax - movl %eax, %esi - jne 2f -fail: - /* EFI init failed, so hang. */ - hlt - jmp fail -2: - movl BP_code32_start(%esi), %eax - leal preferred_addr(%eax), %eax - jmp *%eax - -preferred_addr: -#endif cld /* * Test KEEP_SEGMENTS flag to see if the bootloader is asking @@ -208,6 +143,70 @@ preferred_addr: jmp *%eax ENDPROC(startup_32) +#ifdef CONFIG_EFI_STUB +/* + * We don't need the return address, so set up the stack so efi_main() can find + * its arguments. + */ +ENTRY(efi_pe_entry) + add $0x4, %esp + + call 1f +1: popl %esi + subl $1b, %esi + + popl %ecx + movl %ecx, efi32_config(%esi) /* Handle */ + popl %ecx + movl %ecx, efi32_config+8(%esi) /* EFI System table pointer */ + + /* Relocate efi_config->call() */ + leal efi32_config(%esi), %eax + add %esi, 40(%eax) + pushl %eax + + call make_boot_params + cmpl $0, %eax + je fail + movl %esi, BP_code32_start(%eax) + popl %ecx + pushl %eax + pushl %ecx + jmp 2f /* Skip efi_config initialization */ +ENDPROC(efi_pe_entry) + +ENTRY(efi32_stub_entry) + add $0x4, %esp + popl %ecx + popl %edx + + call 1f +1: popl %esi + subl $1b, %esi + + movl %ecx, efi32_config(%esi) /* Handle */ + movl %edx, efi32_config+8(%esi) /* EFI System table pointer */ + + /* Relocate efi_config->call() */ + leal efi32_config(%esi), %eax + add %esi, 40(%eax) + pushl %eax +2: + call efi_main + cmpl $0, %eax + movl %eax, %esi + jne 2f +fail: + /* EFI init failed, so hang. */ + hlt + jmp fail +2: + movl BP_code32_start(%esi), %eax + leal startup_32(%eax), %eax + jmp *%eax +ENDPROC(efi32_stub_entry) +#endif + .text relocated: diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index fbf4c32d0b62..b4a5d284391c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -243,65 +243,6 @@ ENTRY(startup_64) * that maps our entire kernel(text+data+bss+brk), zero page * and command line. */ -#ifdef CONFIG_EFI_STUB - /* - * The entry point for the PE/COFF executable is efi_pe_entry, so - * only legacy boot loaders will execute this jmp. - */ - jmp preferred_addr - -ENTRY(efi_pe_entry) - movq %rcx, efi64_config(%rip) /* Handle */ - movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */ - - leaq efi64_config(%rip), %rax - movq %rax, efi_config(%rip) - - call 1f -1: popq %rbp - subq $1b, %rbp - - /* - * Relocate efi_config->call(). - */ - addq %rbp, efi64_config+40(%rip) - - movq %rax, %rdi - call make_boot_params - cmpq $0,%rax - je fail - mov %rax, %rsi - leaq startup_32(%rip), %rax - movl %eax, BP_code32_start(%rsi) - jmp 2f /* Skip the relocation */ - -handover_entry: - call 1f -1: popq %rbp - subq $1b, %rbp - - /* - * Relocate efi_config->call(). - */ - movq efi_config(%rip), %rax - addq %rbp, 40(%rax) -2: - movq efi_config(%rip), %rdi - call efi_main - movq %rax,%rsi - cmpq $0,%rax - jne 2f -fail: - /* EFI init failed, so hang. */ - hlt - jmp fail -2: - movl BP_code32_start(%esi), %eax - leaq preferred_addr(%rax), %rax - jmp *%rax - -preferred_addr: -#endif /* Setup data segments. */ xorl %eax, %eax @@ -413,6 +354,59 @@ lvl5: jmp *%rax #ifdef CONFIG_EFI_STUB + +/* The entry point for the PE/COFF executable is efi_pe_entry. */ +ENTRY(efi_pe_entry) + movq %rcx, efi64_config(%rip) /* Handle */ + movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */ + + leaq efi64_config(%rip), %rax + movq %rax, efi_config(%rip) + + call 1f +1: popq %rbp + subq $1b, %rbp + + /* + * Relocate efi_config->call(). + */ + addq %rbp, efi64_config+40(%rip) + + movq %rax, %rdi + call make_boot_params + cmpq $0,%rax + je fail + mov %rax, %rsi + leaq startup_32(%rip), %rax + movl %eax, BP_code32_start(%rsi) + jmp 2f /* Skip the relocation */ + +handover_entry: + call 1f +1: popq %rbp + subq $1b, %rbp + + /* + * Relocate efi_config->call(). + */ + movq efi_config(%rip), %rax + addq %rbp, 40(%rax) +2: + movq efi_config(%rip), %rdi + call efi_main + movq %rax,%rsi + cmpq $0,%rax + jne 2f +fail: + /* EFI init failed, so hang. */ + hlt + jmp fail +2: + movl BP_code32_start(%esi), %eax + leaq startup_64(%rax), %rax + jmp *%rax +ENDPROC(efi_pe_entry) + .org 0x390 ENTRY(efi64_stub_entry) movq %rdi, efi64_config(%rip) /* Handle */ diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 91f27ab970ef..17818ba6906f 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -37,7 +37,9 @@ #include <linux/uts.h> #include <linux/utsname.h> #include <linux/ctype.h> +#include <linux/efi.h> #include <generated/utsrelease.h> +#include <asm/efi.h> /* Macros used by the included decompressor code below. */ #define STATIC @@ -479,35 +481,31 @@ static unsigned long slots_fetch_random(void) return 0; } -static void process_e820_entry(struct boot_e820_entry *entry, +static void process_mem_region(struct mem_vector *entry, unsigned long minimum, unsigned long image_size) { struct mem_vector region, overlap; struct slot_area slot_area; unsigned long start_orig, end; - struct boot_e820_entry cur_entry; - - /* Skip non-RAM entries. */ - if (entry->type != E820_TYPE_RAM) - return; + struct mem_vector cur_entry; /* On 32-bit, ignore entries entirely above our maximum. */ - if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE) + if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE) return; /* Ignore entries entirely below our minimum. */ - if (entry->addr + entry->size < minimum) + if (entry->start + entry->size < minimum) return; /* Ignore entries above memory limit */ - end = min(entry->size + entry->addr, mem_limit); - if (entry->addr >= end) + end = min(entry->size + entry->start, mem_limit); + if (entry->start >= end) return; - cur_entry.addr = entry->addr; - cur_entry.size = end - entry->addr; + cur_entry.start = entry->start; + cur_entry.size = end - entry->start; - region.start = cur_entry.addr; + region.start = cur_entry.start; region.size = cur_entry.size; /* Give up if slot area array is full. */ @@ -521,8 +519,8 @@ static void process_e820_entry(struct boot_e820_entry *entry, /* Potentially raise address to meet alignment needs. */ region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); - /* Did we raise the address above this e820 region? */ - if (region.start > cur_entry.addr + cur_entry.size) + /* Did we raise the address above the passed in memory entry? */ + if (region.start > cur_entry.start + cur_entry.size) return; /* Reduce size by any delta from the original address. */ @@ -562,31 +560,126 @@ static void process_e820_entry(struct boot_e820_entry *entry, } } -static unsigned long find_random_phys_addr(unsigned long minimum, - unsigned long image_size) +#ifdef CONFIG_EFI +/* + * Returns true if mirror region found (and must have been processed + * for slots adding) + */ +static bool +process_efi_entries(unsigned long minimum, unsigned long image_size) { + struct efi_info *e = &boot_params->efi_info; + bool efi_mirror_found = false; + struct mem_vector region; + efi_memory_desc_t *md; + unsigned long pmap; + char *signature; + u32 nr_desc; int i; - unsigned long addr; - /* Check if we had too many memmaps. */ - if (memmap_too_large) { - debug_putstr("Aborted e820 scan (more than 4 memmap= args)!\n"); - return 0; + signature = (char *)&e->efi_loader_signature; + if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) && + strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) + return false; + +#ifdef CONFIG_X86_32 + /* Can't handle data above 4GB at this time */ + if (e->efi_memmap_hi) { + warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n"); + return false; } + pmap = e->efi_memmap; +#else + pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); +#endif - /* Make sure minimum is aligned. */ - minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + nr_desc = e->efi_memmap_size / e->efi_memdesc_size; + for (i = 0; i < nr_desc; i++) { + md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); + if (md->attribute & EFI_MEMORY_MORE_RELIABLE) { + efi_mirror_found = true; + break; + } + } + + for (i = 0; i < nr_desc; i++) { + md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); + + /* + * Here we are more conservative in picking free memory than + * the EFI spec allows: + * + * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also + * free memory and thus available to place the kernel image into, + * but in practice there's firmware where using that memory leads + * to crashes. + * + * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free. + */ + if (md->type != EFI_CONVENTIONAL_MEMORY) + continue; + + if (efi_mirror_found && + !(md->attribute & EFI_MEMORY_MORE_RELIABLE)) + continue; + + region.start = md->phys_addr; + region.size = md->num_pages << EFI_PAGE_SHIFT; + process_mem_region(®ion, minimum, image_size); + if (slot_area_index == MAX_SLOT_AREA) { + debug_putstr("Aborted EFI scan (slot_areas full)!\n"); + break; + } + } + return true; +} +#else +static inline bool +process_efi_entries(unsigned long minimum, unsigned long image_size) +{ + return false; +} +#endif + +static void process_e820_entries(unsigned long minimum, + unsigned long image_size) +{ + int i; + struct mem_vector region; + struct boot_e820_entry *entry; /* Verify potential e820 positions, appending to slots list. */ for (i = 0; i < boot_params->e820_entries; i++) { - process_e820_entry(&boot_params->e820_table[i], minimum, - image_size); + entry = &boot_params->e820_table[i]; + /* Skip non-RAM entries. */ + if (entry->type != E820_TYPE_RAM) + continue; + region.start = entry->addr; + region.size = entry->size; + process_mem_region(®ion, minimum, image_size); if (slot_area_index == MAX_SLOT_AREA) { debug_putstr("Aborted e820 scan (slot_areas full)!\n"); break; } } +} + +static unsigned long find_random_phys_addr(unsigned long minimum, + unsigned long image_size) +{ + /* Check if we had too many memmaps. */ + if (memmap_too_large) { + debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n"); + return 0; + } + + /* Make sure minimum is aligned. */ + minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + + if (process_efi_entries(minimum, image_size)) + return slots_fetch_random(); + process_e820_entries(minimum, image_size); return slots_fetch_random(); } @@ -645,7 +738,7 @@ void choose_random_location(unsigned long input, */ min_addr = min(*output, 512UL << 20); - /* Walk e820 and find a random address. */ + /* Walk available memory entries to find a random address. */ random_addr = find_random_phys_addr(min_addr, output_size); if (!random_addr) { warn("Physical KASLR disabled: no suitable memory region!"); diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index a0838ab929f2..c14217cd0155 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -116,8 +116,7 @@ void __putstr(const char *s) } } - if (boot_params->screen_info.orig_video_mode == 0 && - lines == 0 && cols == 0) + if (lines == 0 || cols == 0) return; x = boot_params->screen_info.orig_x; diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 28029be47fbb..f1aa43854bed 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -15,6 +15,13 @@ #define __pa(x) ((unsigned long)(x)) #define __va(x) ((void *)((unsigned long)(x))) +/* + * The pgtable.h and mm/ident_map.c includes make use of the SME related + * information which is not used in the compressed image support. Un-define + * the SME support to avoid any compile and link errors. + */ +#undef CONFIG_AMD_MEM_ENCRYPT + #include "misc.h" /* These actually do the work of building the kernel identity maps. */ diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 2ed8f0c25def..1bb08ecffd24 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -520,8 +520,14 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr # the description in lib/decompressor_xxx.c for specific information. # # extra_bytes = (uncompressed_size >> 12) + 65536 + 128 +# +# LZ4 is even worse: data that cannot be further compressed grows by 0.4%, +# or one byte per 256 bytes. OTOH, we can safely get rid of the +128 as +# the size-dependent part now grows so fast. +# +# extra_bytes = (uncompressed_size >> 8) + 65536 -#define ZO_z_extra_bytes ((ZO_z_output_len >> 12) + 65536 + 128) +#define ZO_z_extra_bytes ((ZO_z_output_len >> 8) + 65536) #if ZO_z_output_len > ZO_z_input_len # define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - \ ZO_z_input_len) diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config index 4b429df40d7a..550cd5012b73 100644 --- a/arch/x86/configs/tiny.config +++ b/arch/x86/configs/tiny.config @@ -1,3 +1,5 @@ CONFIG_NOHIGHMEM=y # CONFIG_HIGHMEM4G is not set # CONFIG_HIGHMEM64G is not set +CONFIG_GUESS_UNWINDER=y +# CONFIG_FRAME_POINTER_UNWINDER is not set diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 9976fcecd17e..af28a8a24366 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -2,7 +2,6 @@ # Makefile for the x86 low level entry code # -OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 05ed3d393da7..640aafebdc00 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -1,4 +1,5 @@ #include <linux/jump_label.h> +#include <asm/unwind_hints.h> /* @@ -112,6 +113,7 @@ For 32-bit we have the following conventions - kernel is built with movq %rdx, 12*8+\offset(%rsp) movq %rsi, 13*8+\offset(%rsp) movq %rdi, 14*8+\offset(%rsp) + UNWIND_HINT_REGS offset=\offset extra=0 .endm .macro SAVE_C_REGS offset=0 SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 @@ -136,6 +138,7 @@ For 32-bit we have the following conventions - kernel is built with movq %r12, 3*8+\offset(%rsp) movq %rbp, 4*8+\offset(%rsp) movq %rbx, 5*8+\offset(%rsp) + UNWIND_HINT_REGS offset=\offset .endm .macro RESTORE_EXTRA_REGS offset=0 @@ -145,6 +148,7 @@ For 32-bit we have the following conventions - kernel is built with movq 3*8+\offset(%rsp), %r12 movq 4*8+\offset(%rsp), %rbp movq 5*8+\offset(%rsp), %rbx + UNWIND_HINT_REGS offset=\offset extra=0 .endm .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 @@ -167,6 +171,7 @@ For 32-bit we have the following conventions - kernel is built with .endif movq 13*8(%rsp), %rsi movq 14*8(%rsp), %rdi + UNWIND_HINT_IRET_REGS offset=16*8 .endm .macro RESTORE_C_REGS RESTORE_C_REGS_HELPER 1,1,1,1,1 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index cdefcfdd9e63..03505ffbe1b6 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -23,6 +23,7 @@ #include <linux/user-return-notifier.h> #include <linux/uprobes.h> #include <linux/livepatch.h> +#include <linux/syscalls.h> #include <asm/desc.h> #include <asm/traps.h> @@ -183,6 +184,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) struct thread_info *ti = current_thread_info(); u32 cached_flags; + addr_limit_user_check(); + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) local_irq_disable(); diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 48ef7bb32c42..8a13d468635a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -673,16 +673,8 @@ ENTRY(name) \ jmp ret_from_intr; \ ENDPROC(name) - -#ifdef CONFIG_TRACING -# define TRACE_BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) -#else -# define TRACE_BUILD_INTERRUPT(name, nr) -#endif - #define BUILD_INTERRUPT(name, nr) \ BUILD_INTERRUPT3(name, nr, smp_##name); \ - TRACE_BUILD_INTERRUPT(name, nr) /* The include is where all of the SMP etc. interrupts come from */ #include <asm/entry_arch.h> @@ -880,25 +872,17 @@ ENTRY(xen_failsafe_callback) ENDPROC(xen_failsafe_callback) BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - xen_evtchn_do_upcall) + xen_evtchn_do_upcall) #endif /* CONFIG_XEN */ #if IS_ENABLED(CONFIG_HYPERV) BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - hyperv_vector_handler) + hyperv_vector_handler) #endif /* CONFIG_HYPERV */ -#ifdef CONFIG_TRACING -ENTRY(trace_page_fault) - ASM_CLAC - pushl $trace_do_page_fault - jmp common_exception -END(trace_page_fault) -#endif - ENTRY(page_fault) ASM_CLAC pushl $do_page_fault diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 6d078b89a5e8..49167258d587 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -36,6 +36,7 @@ #include <asm/smap.h> #include <asm/pgtable_types.h> #include <asm/export.h> +#include <asm/frame.h> #include <linux/err.h> .code64 @@ -43,9 +44,10 @@ #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret64) + UNWIND_HINT_EMPTY swapgs sysretq -ENDPROC(native_usergs_sysret64) +END(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ .macro TRACE_IRQS_IRETQ @@ -134,19 +136,14 @@ ENDPROC(native_usergs_sysret64) */ ENTRY(entry_SYSCALL_64) + UNWIND_HINT_EMPTY /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, * it is too small to ever cause noticeable irq latency. */ - SWAPGS_UNSAFE_STACK - /* - * A hypervisor implementation might want to use a label - * after the swapgs, so that it can do the swapgs - * for the guest and jump here on syscall. - */ -GLOBAL(entry_SYSCALL_64_after_swapgs) + swapgs movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -158,6 +155,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r11 /* pt_regs->flags */ pushq $__USER_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ +GLOBAL(entry_SYSCALL_64_after_hwframe) pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -169,6 +167,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r10 /* pt_regs->r10 */ pushq %r11 /* pt_regs->r11 */ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + UNWIND_HINT_REGS extra=0 /* * If we need to do entry work or if we guess we'll need to do @@ -223,6 +222,7 @@ entry_SYSCALL_64_fastpath: movq EFLAGS(%rsp), %r11 RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp + UNWIND_HINT_EMPTY USERGS_SYSRET64 1: @@ -316,6 +316,7 @@ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp + UNWIND_HINT_EMPTY USERGS_SYSRET64 opportunistic_sysret_failed: @@ -343,6 +344,7 @@ ENTRY(stub_ptregs_64) DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF popq %rax + UNWIND_HINT_REGS extra=0 jmp entry_SYSCALL64_slow_path 1: @@ -351,6 +353,7 @@ END(stub_ptregs_64) .macro ptregs_stub func ENTRY(ptregs_\func) + UNWIND_HINT_FUNC leaq \func(%rip), %rax jmp stub_ptregs_64 END(ptregs_\func) @@ -367,6 +370,7 @@ END(ptregs_\func) * %rsi: next task */ ENTRY(__switch_to_asm) + UNWIND_HINT_FUNC /* * Save callee-saved registers * This must match the order in inactive_task_frame @@ -406,6 +410,7 @@ END(__switch_to_asm) * r12: kernel thread arg */ ENTRY(ret_from_fork) + UNWIND_HINT_EMPTY movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ @@ -413,6 +418,7 @@ ENTRY(ret_from_fork) jnz 1f /* kernel threads are uncommon */ 2: + UNWIND_HINT_REGS movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ @@ -440,13 +446,102 @@ END(ret_from_fork) ENTRY(irq_entries_start) vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + UNWIND_HINT_IRET_REGS pushq $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 jmp common_interrupt .align 8 + vector=vector+1 .endr END(irq_entries_start) +.macro DEBUG_ENTRY_ASSERT_IRQS_OFF +#ifdef CONFIG_DEBUG_ENTRY + pushfq + testl $X86_EFLAGS_IF, (%rsp) + jz .Lokay_\@ + ud2 +.Lokay_\@: + addq $8, %rsp +#endif +.endm + +/* + * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers + * flags and puts old RSP into old_rsp, and leaves all other GPRs alone. + * Requires kernel GSBASE. + * + * The invariant is that, if irq_count != -1, then the IRQ stack is in use. + */ +.macro ENTER_IRQ_STACK regs=1 old_rsp + DEBUG_ENTRY_ASSERT_IRQS_OFF + movq %rsp, \old_rsp + + .if \regs + UNWIND_HINT_REGS base=\old_rsp + .endif + + incl PER_CPU_VAR(irq_count) + jnz .Lirq_stack_push_old_rsp_\@ + + /* + * Right now, if we just incremented irq_count to zero, we've + * claimed the IRQ stack but we haven't switched to it yet. + * + * If anything is added that can interrupt us here without using IST, + * it must be *extremely* careful to limit its stack usage. This + * could include kprobes and a hypothetical future IST-less #DB + * handler. + * + * The OOPS unwinder relies on the word at the top of the IRQ + * stack linking back to the previous RSP for the entire time we're + * on the IRQ stack. For this to work reliably, we need to write + * it before we actually move ourselves to the IRQ stack. + */ + + movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) + movq PER_CPU_VAR(irq_stack_ptr), %rsp + +#ifdef CONFIG_DEBUG_ENTRY + /* + * If the first movq above becomes wrong due to IRQ stack layout + * changes, the only way we'll notice is if we try to unwind right + * here. Assert that we set up the stack right to catch this type + * of bug quickly. + */ + cmpq -8(%rsp), \old_rsp + je .Lirq_stack_okay\@ + ud2 + .Lirq_stack_okay\@: +#endif + +.Lirq_stack_push_old_rsp_\@: + pushq \old_rsp + + .if \regs + UNWIND_HINT_REGS indirect=1 + .endif +.endm + +/* + * Undoes ENTER_IRQ_STACK. + */ +.macro LEAVE_IRQ_STACK regs=1 + DEBUG_ENTRY_ASSERT_IRQS_OFF + /* We need to be off the IRQ stack before decrementing irq_count. */ + popq %rsp + + .if \regs + UNWIND_HINT_REGS + .endif + + /* + * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming + * the irq stack but we're not on it. + */ + + decl PER_CPU_VAR(irq_count) +.endm + /* * Interrupt entry/exit. * @@ -485,17 +580,7 @@ END(irq_entries_start) CALL_enter_from_user_mode 1: - /* - * Save previous stack pointer, optionally switch to interrupt stack. - * irq_count is used to check if a CPU is already on an interrupt stack - * or not. While this is essentially redundant with preempt_count it is - * a little cheaper to use a separate counter in the PDA (short of - * moving irq_enter into assembly, which would be too much work) - */ - movq %rsp, %rdi - incl PER_CPU_VAR(irq_count) - cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp - pushq %rdi + ENTER_IRQ_STACK old_rsp=%rdi /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF @@ -515,10 +600,8 @@ common_interrupt: ret_from_intr: DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - decl PER_CPU_VAR(irq_count) - /* Restore saved previous stack */ - popq %rsp + LEAVE_IRQ_STACK testb $3, CS(%rsp) jz retint_kernel @@ -561,6 +644,7 @@ restore_c_regs_and_iret: INTERRUPT_RETURN ENTRY(native_iret) + UNWIND_HINT_IRET_REGS /* * Are we returning to a stack segment from the LDT? Note: in * 64-bit mode SS:RSP on the exception stack is always valid. @@ -633,6 +717,7 @@ native_irq_return_ldt: orq PER_CPU_VAR(espfix_stack), %rax SWAPGS movq %rax, %rsp + UNWIND_HINT_IRET_REGS offset=8 /* * At this point, we cannot write to the stack any more, but we can @@ -654,6 +739,7 @@ END(common_interrupt) */ .macro apicinterrupt3 num sym do_sym ENTRY(\sym) + UNWIND_HINT_IRET_REGS ASM_CLAC pushq $~(\num) .Lcommon_\sym: @@ -662,31 +748,13 @@ ENTRY(\sym) END(\sym) .endm -#ifdef CONFIG_TRACING -#define trace(sym) trace_##sym -#define smp_trace(sym) smp_trace_##sym - -.macro trace_apicinterrupt num sym -apicinterrupt3 \num trace(\sym) smp_trace(\sym) -.endm -#else -.macro trace_apicinterrupt num sym do_sym -.endm -#endif - /* Make sure APIC interrupt handlers end up in the irqentry section: */ -#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) -# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" -# define POP_SECTION_IRQENTRY .popsection -#else -# define PUSH_SECTION_IRQENTRY -# define POP_SECTION_IRQENTRY -#endif +#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" +#define POP_SECTION_IRQENTRY .popsection .macro apicinterrupt num sym do_sym PUSH_SECTION_IRQENTRY apicinterrupt3 \num \sym \do_sym -trace_apicinterrupt \num \sym POP_SECTION_IRQENTRY .endm @@ -740,13 +808,14 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) + UNWIND_HINT_IRET_REGS offset=8 + /* Sanity check */ .if \shift_ist != -1 && \paranoid == 0 .error "using shift_ist requires paranoid=1" .endif ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME .ifeq \has_error_code pushq $-1 /* ORIG_RAX: no syscall to restart */ @@ -763,6 +832,7 @@ ENTRY(\sym) .else call error_entry .endif + UNWIND_HINT_REGS /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ .if \paranoid @@ -829,17 +899,6 @@ ENTRY(\sym) END(\sym) .endm -#ifdef CONFIG_TRACING -.macro trace_idtentry sym do_sym has_error_code:req -idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#else -.macro trace_idtentry sym do_sym has_error_code:req -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#endif - idtentry divide_error do_divide_error has_error_code=0 idtentry overflow do_overflow has_error_code=0 idtentry bounds do_bounds has_error_code=0 @@ -860,6 +919,7 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 * edi: new selector */ ENTRY(native_load_gs_index) + FRAME_BEGIN pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS @@ -868,8 +928,9 @@ ENTRY(native_load_gs_index) 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE SWAPGS popfq + FRAME_END ret -END(native_load_gs_index) +ENDPROC(native_load_gs_index) EXPORT_SYMBOL(native_load_gs_index) _ASM_EXTABLE(.Lgs_change, bad_gs) @@ -892,17 +953,15 @@ bad_gs: ENTRY(do_softirq_own_stack) pushq %rbp mov %rsp, %rbp - incl PER_CPU_VAR(irq_count) - cmove PER_CPU_VAR(irq_stack_ptr), %rsp - push %rbp /* frame pointer backlink */ + ENTER_IRQ_STACK regs=0 old_rsp=%r11 call __do_softirq + LEAVE_IRQ_STACK regs=0 leaveq - decl PER_CPU_VAR(irq_count) ret -END(do_softirq_own_stack) +ENDPROC(do_softirq_own_stack) #ifdef CONFIG_XEN -idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 +idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 /* * A note on the "critical region" in our callback handler. @@ -923,14 +982,14 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will * see the correct pointer to the pt_regs */ + UNWIND_HINT_FUNC movq %rdi, %rsp /* we don't return, adjust the stack frame */ -11: incl PER_CPU_VAR(irq_count) - movq %rsp, %rbp - cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp - pushq %rbp /* frame pointer backlink */ + UNWIND_HINT_REGS + + ENTER_IRQ_STACK old_rsp=%r10 call xen_evtchn_do_upcall - popq %rsp - decl PER_CPU_VAR(irq_count) + LEAVE_IRQ_STACK + #ifndef CONFIG_PREEMPT call xen_maybe_preempt_hcall #endif @@ -951,6 +1010,7 @@ END(xen_do_hypervisor_callback) * with its current contents: any discrepancy means we in category 1. */ ENTRY(xen_failsafe_callback) + UNWIND_HINT_EMPTY movl %ds, %ecx cmpw %cx, 0x10(%rsp) jne 1f @@ -968,13 +1028,13 @@ ENTRY(xen_failsafe_callback) movq 8(%rsp), %r11 addq $0x30, %rsp pushq $0 /* RIP */ - pushq %r11 - pushq %rcx + UNWIND_HINT_IRET_REGS offset=8 jmp general_protection 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ movq (%rsp), %rcx movq 8(%rsp), %r11 addq $0x30, %rsp + UNWIND_HINT_IRET_REGS pushq $-1 /* orig_ax = -1 => not a system call */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS @@ -998,13 +1058,12 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN -idtentry xen_debug do_debug has_error_code=0 -idtentry xen_int3 do_int3 has_error_code=0 -idtentry xen_stack_segment do_stack_segment has_error_code=1 +idtentry xendebug do_debug has_error_code=0 +idtentry xenint3 do_int3 has_error_code=0 #endif idtentry general_protection do_general_protection has_error_code=1 -trace_idtentry page_fault do_page_fault has_error_code=1 +idtentry page_fault do_page_fault has_error_code=1 #ifdef CONFIG_KVM_GUEST idtentry async_page_fault do_async_page_fault has_error_code=1 @@ -1020,6 +1079,7 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(paranoid_entry) + UNWIND_HINT_FUNC cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1047,6 +1107,7 @@ END(paranoid_entry) * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) + UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF_DEBUG testl %ebx, %ebx /* swapgs needed? */ @@ -1068,6 +1129,7 @@ END(paranoid_exit) * Return: EBX=0: came from user mode; EBX=1: otherwise */ ENTRY(error_entry) + UNWIND_HINT_FUNC cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1152,6 +1214,7 @@ END(error_entry) * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode */ ENTRY(error_exit) + UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF testl %ebx, %ebx @@ -1160,19 +1223,9 @@ ENTRY(error_exit) END(error_exit) /* Runs on exception stack */ +/* XXX: broken on Xen PV */ ENTRY(nmi) - /* - * Fix up the exception frame if we're on Xen. - * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most - * one value to the stack on native, so it may clobber the rdx - * scratch slot, but it won't clobber any of the important - * slots past it. - * - * Xen is a different story, because the Xen frame itself overlaps - * the "NMI executing" variable. - */ - PARAVIRT_ADJUST_EXCEPTION_FRAME - + UNWIND_HINT_IRET_REGS /* * We allow breakpoints in NMIs. If a breakpoint occurs, then * the iretq it performs will take us out of NMI context. @@ -1234,11 +1287,13 @@ ENTRY(nmi) cld movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + UNWIND_HINT_IRET_REGS base=%rdx offset=8 pushq 5*8(%rdx) /* pt_regs->ss */ pushq 4*8(%rdx) /* pt_regs->rsp */ pushq 3*8(%rdx) /* pt_regs->flags */ pushq 2*8(%rdx) /* pt_regs->cs */ pushq 1*8(%rdx) /* pt_regs->rip */ + UNWIND_HINT_IRET_REGS pushq $-1 /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -1255,6 +1310,7 @@ ENTRY(nmi) pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + UNWIND_HINT_REGS ENCODE_FRAME_POINTER /* @@ -1409,6 +1465,7 @@ first_nmi: .rept 5 pushq 11*8(%rsp) .endr + UNWIND_HINT_IRET_REGS /* Everything up to here is safe from nested NMIs */ @@ -1424,6 +1481,7 @@ first_nmi: pushq $__KERNEL_CS /* CS */ pushq $1f /* RIP */ INTERRUPT_RETURN /* continues at repeat_nmi below */ + UNWIND_HINT_IRET_REGS 1: #endif @@ -1473,6 +1531,7 @@ end_repeat_nmi: * exceptions might do. */ call paranoid_entry + UNWIND_HINT_REGS /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp, %rdi @@ -1510,17 +1569,19 @@ nmi_restore: END(nmi) ENTRY(ignore_sysret) + UNWIND_HINT_EMPTY mov $-ENOSYS, %eax sysret END(ignore_sysret) ENTRY(rewind_stack_do_exit) + UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp movq PER_CPU_VAR(cpu_current_top_of_stack), %rax - leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp + leaq -PTREGS_SIZE(%rax), %rsp + UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE call do_exit -1: jmp 1b END(rewind_stack_do_exit) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index e1721dafbcb1..e26c25ca7756 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -183,21 +183,20 @@ ENDPROC(entry_SYSENTER_compat) */ ENTRY(entry_SYSCALL_compat) /* Interrupts are off on entry. */ - SWAPGS_UNSAFE_STACK + swapgs /* Stash user ESP and switch to the kernel stack. */ movl %esp, %r8d movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - /* Zero-extending 32-bit regs, do not remove */ - movl %eax, %eax - /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ pushq %r8 /* pt_regs->sp */ pushq %r11 /* pt_regs->flags */ pushq $__USER32_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ +GLOBAL(entry_SYSCALL_compat_after_hwframe) + movl %eax, %eax /* discard orig_ax high bits */ pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -294,7 +293,6 @@ ENTRY(entry_INT80_compat) /* * Interrupts are off on entry. */ - PARAVIRT_ADJUST_EXCEPTION_FRAME ASM_CLAC /* Do this early to minimize exposure */ SWAPGS @@ -342,8 +340,7 @@ ENTRY(entry_INT80_compat) jmp restore_regs_and_iret END(entry_INT80_compat) - ALIGN -GLOBAL(stub32_clone) +ENTRY(stub32_clone) /* * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). @@ -353,3 +350,4 @@ GLOBAL(stub32_clone) */ xchg %r8, %rcx jmp sys_clone +ENDPROC(stub32_clone) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 726355ce8497..1911310959f8 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -351,7 +351,7 @@ static void vgetcpu_cpu_init(void *arg) * and 8 bits for the node) */ d.limit0 = cpu | ((node & 0xf) << 12); - d.limit = node >> 4; + d.limit1 = node >> 4; d.type = 5; /* RO data, expand down, accessed */ d.dpl = 3; /* Visible to user code */ d.s = 1; /* Not a system segment */ diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index ad44af0dd667..f5cbbba99283 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -400,11 +400,24 @@ static int amd_uncore_cpu_starting(unsigned int cpu) if (amd_uncore_llc) { unsigned int apicid = cpu_data(cpu).apicid; - unsigned int nshared; + unsigned int nshared, subleaf, prev_eax = 0; uncore = *per_cpu_ptr(amd_uncore_llc, cpu); - cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx); - nshared = ((eax >> 14) & 0xfff) + 1; + /* + * Iterate over Cache Topology Definition leaves until no + * more cache descriptions are available. + */ + for (subleaf = 0; subleaf < 5; subleaf++) { + cpuid_count(0x8000001d, subleaf, &eax, &ebx, &ecx, &edx); + + /* EAX[0:4] gives type of cache */ + if (!(eax & 0x1f)) + break; + + prev_eax = eax; + } + nshared = ((prev_eax >> 14) & 0xfff) + 1; + uncore->id = apicid - (apicid % nshared); uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc); @@ -555,7 +568,7 @@ static int __init amd_uncore_init(void) ret = 0; } - if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) { + if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) { amd_uncore_llc = alloc_percpu(struct amd_uncore *); if (!amd_uncore_llc) { ret = -ENOMEM; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index af12e294caed..80534d3c2480 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -487,22 +487,28 @@ static inline int precise_br_compat(struct perf_event *event) return m == b; } -int x86_pmu_hw_config(struct perf_event *event) +int x86_pmu_max_precise(void) { - if (event->attr.precise_ip) { - int precise = 0; + int precise = 0; - /* Support for constant skid */ - if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { + /* Support for constant skid */ + if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { + precise++; + + /* Support for IP fixup */ + if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) precise++; - /* Support for IP fixup */ - if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) - precise++; + if (x86_pmu.pebs_prec_dist) + precise++; + } + return precise; +} - if (x86_pmu.pebs_prec_dist) - precise++; - } +int x86_pmu_hw_config(struct perf_event *event) +{ + if (event->attr.precise_ip) { + int precise = x86_pmu_max_precise(); if (event->attr.precise_ip > precise) return -EOPNOTSUPP; @@ -1751,6 +1757,7 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) } static struct attribute_group x86_pmu_attr_group; +static struct attribute_group x86_pmu_caps_group; static int __init init_hw_perf_events(void) { @@ -1799,6 +1806,14 @@ static int __init init_hw_perf_events(void) x86_pmu_format_group.attrs = x86_pmu.format_attrs; + if (x86_pmu.caps_attrs) { + struct attribute **tmp; + + tmp = merge_attr(x86_pmu_caps_group.attrs, x86_pmu.caps_attrs); + if (!WARN_ON(!tmp)) + x86_pmu_caps_group.attrs = tmp; + } + if (x86_pmu.event_attrs) x86_pmu_events_group.attrs = x86_pmu.event_attrs; @@ -2213,10 +2228,30 @@ static struct attribute_group x86_pmu_attr_group = { .attrs = x86_pmu_attrs, }; +static ssize_t max_precise_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise()); +} + +static DEVICE_ATTR_RO(max_precise); + +static struct attribute *x86_pmu_caps_attrs[] = { + &dev_attr_max_precise.attr, + NULL +}; + +static struct attribute_group x86_pmu_caps_group = { + .name = "caps", + .attrs = x86_pmu_caps_attrs, +}; + static const struct attribute_group *x86_pmu_attr_groups[] = { &x86_pmu_attr_group, &x86_pmu_format_group, &x86_pmu_events_group, + &x86_pmu_caps_group, NULL, }; @@ -2335,12 +2370,9 @@ static unsigned long get_segment_base(unsigned int segment) #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; - if (idx > LDT_ENTRIES) - return 0; - /* IRQs are off, so this synchronizes with smp_store_release */ ldt = lockless_dereference(current->active_mm->context.ldt); - if (!ldt || idx > ldt->nr_entries) + if (!ldt || idx >= ldt->nr_entries) return 0; desc = &ldt->entries[idx]; @@ -2348,7 +2380,7 @@ static unsigned long get_segment_base(unsigned int segment) return 0; #endif } else { - if (idx > GDT_ENTRIES) + if (idx >= GDT_ENTRIES) return 0; desc = raw_cpu_ptr(gdt_page.gdt) + idx; diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index 06c2baa51814..e9d8520a801a 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o cqm.o +obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl-perf.o diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index ddd8d3516bfc..16076eb34699 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -268,7 +268,7 @@ static void bts_event_start(struct perf_event *event, int flags) bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; - event->hw.itrace_started = 1; + perf_event_itrace_started(event); event->hw.state = 0; __bts_event_start(event); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 98b0f0729527..829e89cfcee2 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3415,12 +3415,26 @@ static struct attribute *intel_arch3_formats_attr[] = { &format_attr_any.attr, &format_attr_inv.attr, &format_attr_cmask.attr, + NULL, +}; + +static struct attribute *hsw_format_attr[] = { &format_attr_in_tx.attr, &format_attr_in_tx_cp.attr, + &format_attr_offcore_rsp.attr, + &format_attr_ldlat.attr, + NULL +}; - &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */ - &format_attr_ldlat.attr, /* PEBS load latency */ - NULL, +static struct attribute *nhm_format_attr[] = { + &format_attr_offcore_rsp.attr, + &format_attr_ldlat.attr, + NULL +}; + +static struct attribute *slm_format_attr[] = { + &format_attr_offcore_rsp.attr, + NULL }; static struct attribute *skl_format_attr[] = { @@ -3781,6 +3795,36 @@ done: static DEVICE_ATTR_RW(freeze_on_smi); +static ssize_t branches_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr); +} + +static DEVICE_ATTR_RO(branches); + +static struct attribute *lbr_attrs[] = { + &dev_attr_branches.attr, + NULL +}; + +static char pmu_name_str[30]; + +static ssize_t pmu_name_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", pmu_name_str); +} + +static DEVICE_ATTR_RO(pmu_name); + +static struct attribute *intel_pmu_caps_attrs[] = { + &dev_attr_pmu_name.attr, + NULL +}; + static struct attribute *intel_pmu_attrs[] = { &dev_attr_freeze_on_smi.attr, NULL, @@ -3795,6 +3839,8 @@ __init int intel_pmu_init(void) unsigned int unused; struct extra_reg *er; int version, i; + struct attribute **extra_attr = NULL; + char *name; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { switch (boot_cpu_data.x86) { @@ -3862,6 +3908,7 @@ __init int intel_pmu_init(void) switch (boot_cpu_data.x86_model) { case INTEL_FAM6_CORE_YONAH: pr_cont("Core events, "); + name = "core"; break; case INTEL_FAM6_CORE2_MEROM: @@ -3877,6 +3924,7 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_core2_event_constraints; x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints; pr_cont("Core2 events, "); + name = "core2"; break; case INTEL_FAM6_NEHALEM: @@ -3905,8 +3953,11 @@ __init int intel_pmu_init(void) intel_pmu_pebs_data_source_nhm(); x86_add_quirk(intel_nehalem_quirk); + x86_pmu.pebs_no_tlb = 1; + extra_attr = nhm_format_attr; pr_cont("Nehalem events, "); + name = "nehalem"; break; case INTEL_FAM6_ATOM_PINEVIEW: @@ -3923,6 +3974,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints; x86_pmu.pebs_aliases = intel_pebs_aliases_core2; pr_cont("Atom events, "); + name = "bonnell"; break; case INTEL_FAM6_ATOM_SILVERMONT1: @@ -3940,7 +3992,9 @@ __init int intel_pmu_init(void) x86_pmu.extra_regs = intel_slm_extra_regs; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.cpu_events = slm_events_attrs; + extra_attr = slm_format_attr; pr_cont("Silvermont events, "); + name = "silvermont"; break; case INTEL_FAM6_ATOM_GOLDMONT: @@ -3965,7 +4019,9 @@ __init int intel_pmu_init(void) x86_pmu.lbr_pt_coexist = true; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.cpu_events = glm_events_attrs; + extra_attr = slm_format_attr; pr_cont("Goldmont events, "); + name = "goldmont"; break; case INTEL_FAM6_ATOM_GEMINI_LAKE: @@ -3991,7 +4047,9 @@ __init int intel_pmu_init(void) x86_pmu.cpu_events = glm_events_attrs; /* Goldmont Plus has 4-wide pipeline */ event_attr_td_total_slots_scale_glm.event_str = "4"; + extra_attr = slm_format_attr; pr_cont("Goldmont plus events, "); + name = "goldmont_plus"; break; case INTEL_FAM6_WESTMERE: @@ -4020,7 +4078,9 @@ __init int intel_pmu_init(void) X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); intel_pmu_pebs_data_source_nhm(); + extra_attr = nhm_format_attr; pr_cont("Westmere events, "); + name = "westmere"; break; case INTEL_FAM6_SANDYBRIDGE: @@ -4056,7 +4116,10 @@ __init int intel_pmu_init(void) intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1); + extra_attr = nhm_format_attr; + pr_cont("SandyBridge events, "); + name = "sandybridge"; break; case INTEL_FAM6_IVYBRIDGE: @@ -4090,7 +4153,10 @@ __init int intel_pmu_init(void) intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + extra_attr = nhm_format_attr; + pr_cont("IvyBridge events, "); + name = "ivybridge"; break; @@ -4118,7 +4184,10 @@ __init int intel_pmu_init(void) x86_pmu.get_event_constraints = hsw_get_event_constraints; x86_pmu.cpu_events = hsw_events_attrs; x86_pmu.lbr_double_abort = true; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + hsw_format_attr : nhm_format_attr; pr_cont("Haswell events, "); + name = "haswell"; break; case INTEL_FAM6_BROADWELL_CORE: @@ -4154,7 +4223,10 @@ __init int intel_pmu_init(void) x86_pmu.get_event_constraints = hsw_get_event_constraints; x86_pmu.cpu_events = hsw_events_attrs; x86_pmu.limit_period = bdw_limit_period; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + hsw_format_attr : nhm_format_attr; pr_cont("Broadwell events, "); + name = "broadwell"; break; case INTEL_FAM6_XEON_PHI_KNL: @@ -4172,8 +4244,9 @@ __init int intel_pmu_init(void) /* all extra regs are per-cpu when HT is on */ x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - + extra_attr = slm_format_attr; pr_cont("Knights Landing/Mill events, "); + name = "knights-landing"; break; case INTEL_FAM6_SKYLAKE_MOBILE: @@ -4203,11 +4276,14 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, - skl_format_attr); - WARN_ON(!x86_pmu.format_attrs); + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + hsw_format_attr : nhm_format_attr; + extra_attr = merge_attr(extra_attr, skl_format_attr); x86_pmu.cpu_events = hsw_events_attrs; + intel_pmu_pebs_data_source_skl( + boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X); pr_cont("Skylake events, "); + name = "skylake"; break; default: @@ -4215,6 +4291,7 @@ __init int intel_pmu_init(void) case 1: x86_pmu.event_constraints = intel_v1_event_constraints; pr_cont("generic architected perfmon v1, "); + name = "generic_arch_v1"; break; default: /* @@ -4222,10 +4299,19 @@ __init int intel_pmu_init(void) */ x86_pmu.event_constraints = intel_gen_event_constraints; pr_cont("generic architected perfmon, "); + name = "generic_arch_v2+"; break; } } + snprintf(pmu_name_str, sizeof pmu_name_str, "%s", name); + + if (version >= 2 && extra_attr) { + x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, + extra_attr); + WARN_ON(!x86_pmu.format_attrs); + } + if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); @@ -4272,8 +4358,13 @@ __init int intel_pmu_init(void) x86_pmu.lbr_nr = 0; } - if (x86_pmu.lbr_nr) + x86_pmu.caps_attrs = intel_pmu_caps_attrs; + + if (x86_pmu.lbr_nr) { + x86_pmu.caps_attrs = merge_attr(x86_pmu.caps_attrs, lbr_attrs); pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); + } + /* * Access extra MSR may cause #GP under certain circumstances. * E.g. KVM doesn't support offcore event diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c deleted file mode 100644 index 2521f771f2f5..000000000000 --- a/arch/x86/events/intel/cqm.c +++ /dev/null @@ -1,1766 +0,0 @@ -/* - * Intel Cache Quality-of-Service Monitoring (CQM) support. - * - * Based very, very heavily on work by Peter Zijlstra. - */ - -#include <linux/perf_event.h> -#include <linux/slab.h> -#include <asm/cpu_device_id.h> -#include <asm/intel_rdt_common.h> -#include "../perf_event.h" - -#define MSR_IA32_QM_CTR 0x0c8e -#define MSR_IA32_QM_EVTSEL 0x0c8d - -#define MBM_CNTR_WIDTH 24 -/* - * Guaranteed time in ms as per SDM where MBM counters will not overflow. - */ -#define MBM_CTR_OVERFLOW_TIME 1000 - -static u32 cqm_max_rmid = -1; -static unsigned int cqm_l3_scale; /* supposedly cacheline size */ -static bool cqm_enabled, mbm_enabled; -unsigned int mbm_socket_max; - -/* - * The cached intel_pqr_state is strictly per CPU and can never be - * updated from a remote CPU. Both functions which modify the state - * (intel_cqm_event_start and intel_cqm_event_stop) are called with - * interrupts disabled, which is sufficient for the protection. - */ -DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); -static struct hrtimer *mbm_timers; -/** - * struct sample - mbm event's (local or total) data - * @total_bytes #bytes since we began monitoring - * @prev_msr previous value of MSR - */ -struct sample { - u64 total_bytes; - u64 prev_msr; -}; - -/* - * samples profiled for total memory bandwidth type events - */ -static struct sample *mbm_total; -/* - * samples profiled for local memory bandwidth type events - */ -static struct sample *mbm_local; - -#define pkg_id topology_physical_package_id(smp_processor_id()) -/* - * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array. - * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of - * rmids per socket, an example is given below - * RMID1 of Socket0: vrmid = 1 - * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1 - * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1 - */ -#define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid) -/* - * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. - * Also protects event->hw.cqm_rmid - * - * Hold either for stability, both for modification of ->hw.cqm_rmid. - */ -static DEFINE_MUTEX(cache_mutex); -static DEFINE_RAW_SPINLOCK(cache_lock); - -/* - * Groups of events that have the same target(s), one RMID per group. - */ -static LIST_HEAD(cache_groups); - -/* - * Mask of CPUs for reading CQM values. We only need one per-socket. - */ -static cpumask_t cqm_cpumask; - -#define RMID_VAL_ERROR (1ULL << 63) -#define RMID_VAL_UNAVAIL (1ULL << 62) - -/* - * Event IDs are used to program IA32_QM_EVTSEL before reading event - * counter from IA32_QM_CTR - */ -#define QOS_L3_OCCUP_EVENT_ID 0x01 -#define QOS_MBM_TOTAL_EVENT_ID 0x02 -#define QOS_MBM_LOCAL_EVENT_ID 0x03 - -/* - * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). - * - * This rmid is always free and is guaranteed to have an associated - * near-zero occupancy value, i.e. no cachelines are tagged with this - * RMID, once __intel_cqm_rmid_rotate() returns. - */ -static u32 intel_cqm_rotation_rmid; - -#define INVALID_RMID (-1) - -/* - * Is @rmid valid for programming the hardware? - * - * rmid 0 is reserved by the hardware for all non-monitored tasks, which - * means that we should never come across an rmid with that value. - * Likewise, an rmid value of -1 is used to indicate "no rmid currently - * assigned" and is used as part of the rotation code. - */ -static inline bool __rmid_valid(u32 rmid) -{ - if (!rmid || rmid == INVALID_RMID) - return false; - - return true; -} - -static u64 __rmid_read(u32 rmid) -{ - u64 val; - - /* - * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, - * it just says that to increase confusion. - */ - wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - - /* - * Aside from the ERROR and UNAVAIL bits, assume this thing returns - * the number of cachelines tagged with @rmid. - */ - return val; -} - -enum rmid_recycle_state { - RMID_YOUNG = 0, - RMID_AVAILABLE, - RMID_DIRTY, -}; - -struct cqm_rmid_entry { - u32 rmid; - enum rmid_recycle_state state; - struct list_head list; - unsigned long queue_time; -}; - -/* - * cqm_rmid_free_lru - A least recently used list of RMIDs. - * - * Oldest entry at the head, newest (most recently used) entry at the - * tail. This list is never traversed, it's only used to keep track of - * the lru order. That is, we only pick entries of the head or insert - * them on the tail. - * - * All entries on the list are 'free', and their RMIDs are not currently - * in use. To mark an RMID as in use, remove its entry from the lru - * list. - * - * - * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs. - * - * This list is contains RMIDs that no one is currently using but that - * may have a non-zero occupancy value associated with them. The - * rotation worker moves RMIDs from the limbo list to the free list once - * the occupancy value drops below __intel_cqm_threshold. - * - * Both lists are protected by cache_mutex. - */ -static LIST_HEAD(cqm_rmid_free_lru); -static LIST_HEAD(cqm_rmid_limbo_lru); - -/* - * We use a simple array of pointers so that we can lookup a struct - * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid() - * and __put_rmid() from having to worry about dealing with struct - * cqm_rmid_entry - they just deal with rmids, i.e. integers. - * - * Once this array is initialized it is read-only. No locks are required - * to access it. - * - * All entries for all RMIDs can be looked up in the this array at all - * times. - */ -static struct cqm_rmid_entry **cqm_rmid_ptrs; - -static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - entry = cqm_rmid_ptrs[rmid]; - WARN_ON(entry->rmid != rmid); - - return entry; -} - -/* - * Returns < 0 on fail. - * - * We expect to be called with cache_mutex held. - */ -static u32 __get_rmid(void) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - if (list_empty(&cqm_rmid_free_lru)) - return INVALID_RMID; - - entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list); - list_del(&entry->list); - - return entry->rmid; -} - -static void __put_rmid(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - WARN_ON(!__rmid_valid(rmid)); - entry = __rmid_entry(rmid); - - entry->queue_time = jiffies; - entry->state = RMID_YOUNG; - - list_add_tail(&entry->list, &cqm_rmid_limbo_lru); -} - -static void cqm_cleanup(void) -{ - int i; - - if (!cqm_rmid_ptrs) - return; - - for (i = 0; i < cqm_max_rmid; i++) - kfree(cqm_rmid_ptrs[i]); - - kfree(cqm_rmid_ptrs); - cqm_rmid_ptrs = NULL; - cqm_enabled = false; -} - -static int intel_cqm_setup_rmid_cache(void) -{ - struct cqm_rmid_entry *entry; - unsigned int nr_rmids; - int r = 0; - - nr_rmids = cqm_max_rmid + 1; - cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) * - nr_rmids, GFP_KERNEL); - if (!cqm_rmid_ptrs) - return -ENOMEM; - - for (; r <= cqm_max_rmid; r++) { - struct cqm_rmid_entry *entry; - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - goto fail; - - INIT_LIST_HEAD(&entry->list); - entry->rmid = r; - cqm_rmid_ptrs[r] = entry; - - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - /* - * RMID 0 is special and is always allocated. It's used for all - * tasks that are not monitored. - */ - entry = __rmid_entry(0); - list_del(&entry->list); - - mutex_lock(&cache_mutex); - intel_cqm_rotation_rmid = __get_rmid(); - mutex_unlock(&cache_mutex); - - return 0; - -fail: - cqm_cleanup(); - return -ENOMEM; -} - -/* - * Determine if @a and @b measure the same set of tasks. - * - * If @a and @b measure the same set of tasks then we want to share a - * single RMID. - */ -static bool __match_event(struct perf_event *a, struct perf_event *b) -{ - /* Per-cpu and task events don't mix */ - if ((a->attach_state & PERF_ATTACH_TASK) != - (b->attach_state & PERF_ATTACH_TASK)) - return false; - -#ifdef CONFIG_CGROUP_PERF - if (a->cgrp != b->cgrp) - return false; -#endif - - /* If not task event, we're machine wide */ - if (!(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Events that target same task are placed into the same cache group. - * Mark it as a multi event group, so that we update ->count - * for every event rather than just the group leader later. - */ - if (a->hw.target == b->hw.target) { - b->hw.is_group_event = true; - return true; - } - - /* - * Are we an inherited event? - */ - if (b->parent == a) - return true; - - return false; -} - -#ifdef CONFIG_CGROUP_PERF -static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) -{ - if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target, event->ctx); - - return event->cgrp; -} -#endif - -/* - * Determine if @a's tasks intersect with @b's tasks - * - * There are combinations of events that we explicitly prohibit, - * - * PROHIBITS - * system-wide -> cgroup and task - * cgroup -> system-wide - * -> task in cgroup - * task -> system-wide - * -> task in cgroup - * - * Call this function before allocating an RMID. - */ -static bool __conflict_event(struct perf_event *a, struct perf_event *b) -{ -#ifdef CONFIG_CGROUP_PERF - /* - * We can have any number of cgroups but only one system-wide - * event at a time. - */ - if (a->cgrp && b->cgrp) { - struct perf_cgroup *ac = a->cgrp; - struct perf_cgroup *bc = b->cgrp; - - /* - * This condition should have been caught in - * __match_event() and we should be sharing an RMID. - */ - WARN_ON_ONCE(ac == bc); - - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } - - if (a->cgrp || b->cgrp) { - struct perf_cgroup *ac, *bc; - - /* - * cgroup and system-wide events are mutually exclusive - */ - if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || - (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) - return true; - - /* - * Ensure neither event is part of the other's cgroup - */ - ac = event_to_cgroup(a); - bc = event_to_cgroup(b); - if (ac == bc) - return true; - - /* - * Must have cgroup and non-intersecting task events. - */ - if (!ac || !bc) - return false; - - /* - * We have cgroup and task events, and the task belongs - * to a cgroup. Check for for overlap. - */ - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } -#endif - /* - * If one of them is not a task, same story as above with cgroups. - */ - if (!(a->attach_state & PERF_ATTACH_TASK) || - !(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Must be non-overlapping. - */ - return false; -} - -struct rmid_read { - u32 rmid; - u32 evt_type; - atomic64_t value; -}; - -static void __intel_cqm_event_count(void *info); -static void init_mbm_sample(u32 rmid, u32 evt_type); -static void __intel_mbm_event_count(void *info); - -static bool is_cqm_event(int e) -{ - return (e == QOS_L3_OCCUP_EVENT_ID); -} - -static bool is_mbm_event(int e) -{ - return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); -} - -static void cqm_mask_call(struct rmid_read *rr) -{ - if (is_mbm_event(rr->evt_type)) - on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1); - else - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1); -} - -/* - * Exchange the RMID of a group of events. - */ -static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) -{ - struct perf_event *event; - struct list_head *head = &group->hw.cqm_group_entry; - u32 old_rmid = group->hw.cqm_rmid; - - lockdep_assert_held(&cache_mutex); - - /* - * If our RMID is being deallocated, perform a read now. - */ - if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { - struct rmid_read rr = { - .rmid = old_rmid, - .evt_type = group->attr.config, - .value = ATOMIC64_INIT(0), - }; - - cqm_mask_call(&rr); - local64_set(&group->count, atomic64_read(&rr.value)); - } - - raw_spin_lock_irq(&cache_lock); - - group->hw.cqm_rmid = rmid; - list_for_each_entry(event, head, hw.cqm_group_entry) - event->hw.cqm_rmid = rmid; - - raw_spin_unlock_irq(&cache_lock); - - /* - * If the allocation is for mbm, init the mbm stats. - * Need to check if each event in the group is mbm event - * because there could be multiple type of events in the same group. - */ - if (__rmid_valid(rmid)) { - event = group; - if (is_mbm_event(event->attr.config)) - init_mbm_sample(rmid, event->attr.config); - - list_for_each_entry(event, head, hw.cqm_group_entry) { - if (is_mbm_event(event->attr.config)) - init_mbm_sample(rmid, event->attr.config); - } - } - - return old_rmid; -} - -/* - * If we fail to assign a new RMID for intel_cqm_rotation_rmid because - * cachelines are still tagged with RMIDs in limbo, we progressively - * increment the threshold until we find an RMID in limbo with <= - * __intel_cqm_threshold lines tagged. This is designed to mitigate the - * problem where cachelines tagged with an RMID are not steadily being - * evicted. - * - * On successful rotations we decrease the threshold back towards zero. - * - * __intel_cqm_max_threshold provides an upper bound on the threshold, - * and is measured in bytes because it's exposed to userland. - */ -static unsigned int __intel_cqm_threshold; -static unsigned int __intel_cqm_max_threshold; - -/* - * Test whether an RMID has a zero occupancy value on this cpu. - */ -static void intel_cqm_stable(void *arg) -{ - struct cqm_rmid_entry *entry; - - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - if (entry->state != RMID_AVAILABLE) - break; - - if (__rmid_read(entry->rmid) > __intel_cqm_threshold) - entry->state = RMID_DIRTY; - } -} - -/* - * If we have group events waiting for an RMID that don't conflict with - * events already running, assign @rmid. - */ -static bool intel_cqm_sched_in_event(u32 rmid) -{ - struct perf_event *leader, *event; - - lockdep_assert_held(&cache_mutex); - - leader = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - event = leader; - - list_for_each_entry_continue(event, &cache_groups, - hw.cqm_groups_entry) { - if (__rmid_valid(event->hw.cqm_rmid)) - continue; - - if (__conflict_event(event, leader)) - continue; - - intel_cqm_xchg_rmid(event, rmid); - return true; - } - - return false; -} - -/* - * Initially use this constant for both the limbo queue time and the - * rotation timer interval, pmu::hrtimer_interval_ms. - * - * They don't need to be the same, but the two are related since if you - * rotate faster than you recycle RMIDs, you may run out of available - * RMIDs. - */ -#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */ - -static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; - -/* - * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list - * @nr_available: number of freeable RMIDs on the limbo list - * - * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no - * cachelines are tagged with those RMIDs. After this we can reuse them - * and know that the current set of active RMIDs is stable. - * - * Return %true or %false depending on whether stabilization needs to be - * reattempted. - * - * If we return %true then @nr_available is updated to indicate the - * number of RMIDs on the limbo list that have been queued for the - * minimum queue time (RMID_AVAILABLE), but whose data occupancy values - * are above __intel_cqm_threshold. - */ -static bool intel_cqm_rmid_stabilize(unsigned int *available) -{ - struct cqm_rmid_entry *entry, *tmp; - - lockdep_assert_held(&cache_mutex); - - *available = 0; - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - unsigned long min_queue_time; - unsigned long now = jiffies; - - /* - * We hold RMIDs placed into limbo for a minimum queue - * time. Before the minimum queue time has elapsed we do - * not recycle RMIDs. - * - * The reasoning is that until a sufficient time has - * passed since we stopped using an RMID, any RMID - * placed onto the limbo list will likely still have - * data tagged in the cache, which means we'll probably - * fail to recycle it anyway. - * - * We can save ourselves an expensive IPI by skipping - * any RMIDs that have not been queued for the minimum - * time. - */ - min_queue_time = entry->queue_time + - msecs_to_jiffies(__rmid_queue_time_ms); - - if (time_after(min_queue_time, now)) - break; - - entry->state = RMID_AVAILABLE; - (*available)++; - } - - /* - * Fast return if none of the RMIDs on the limbo list have been - * sitting on the queue for the minimum queue time. - */ - if (!*available) - return false; - - /* - * Test whether an RMID is free for each package. - */ - on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); - - list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { - /* - * Exhausted all RMIDs that have waited min queue time. - */ - if (entry->state == RMID_YOUNG) - break; - - if (entry->state == RMID_DIRTY) - continue; - - list_del(&entry->list); /* remove from limbo */ - - /* - * The rotation RMID gets priority if it's - * currently invalid. In which case, skip adding - * the RMID to the the free lru. - */ - if (!__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_rotation_rmid = entry->rmid; - continue; - } - - /* - * If we have groups waiting for RMIDs, hand - * them one now provided they don't conflict. - */ - if (intel_cqm_sched_in_event(entry->rmid)) - continue; - - /* - * Otherwise place it onto the free list. - */ - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - - return __rmid_valid(intel_cqm_rotation_rmid); -} - -/* - * Pick a victim group and move it to the tail of the group list. - * @next: The first group without an RMID - */ -static void __intel_cqm_pick_and_rotate(struct perf_event *next) -{ - struct perf_event *rotor; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - rotor = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - - /* - * The group at the front of the list should always have a valid - * RMID. If it doesn't then no groups have RMIDs assigned and we - * don't need to rotate the list. - */ - if (next == rotor) - return; - - rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); - __put_rmid(rmid); - - list_rotate_left(&cache_groups); -} - -/* - * Deallocate the RMIDs from any events that conflict with @event, and - * place them on the back of the group list. - */ -static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) -{ - struct perf_event *group, *g; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) { - if (group == event) - continue; - - rmid = group->hw.cqm_rmid; - - /* - * Skip events that don't have a valid RMID. - */ - if (!__rmid_valid(rmid)) - continue; - - /* - * No conflict? No problem! Leave the event alone. - */ - if (!__conflict_event(group, event)) - continue; - - intel_cqm_xchg_rmid(group, INVALID_RMID); - __put_rmid(rmid); - } -} - -/* - * Attempt to rotate the groups and assign new RMIDs. - * - * We rotate for two reasons, - * 1. To handle the scheduling of conflicting events - * 2. To recycle RMIDs - * - * Rotating RMIDs is complicated because the hardware doesn't give us - * any clues. - * - * There's problems with the hardware interface; when you change the - * task:RMID map cachelines retain their 'old' tags, giving a skewed - * picture. In order to work around this, we must always keep one free - * RMID - intel_cqm_rotation_rmid. - * - * Rotation works by taking away an RMID from a group (the old RMID), - * and assigning the free RMID to another group (the new RMID). We must - * then wait for the old RMID to not be used (no cachelines tagged). - * This ensure that all cachelines are tagged with 'active' RMIDs. At - * this point we can start reading values for the new RMID and treat the - * old RMID as the free RMID for the next rotation. - * - * Return %true or %false depending on whether we did any rotating. - */ -static bool __intel_cqm_rmid_rotate(void) -{ - struct perf_event *group, *start = NULL; - unsigned int threshold_limit; - unsigned int nr_needed = 0; - unsigned int nr_available; - bool rotated = false; - - mutex_lock(&cache_mutex); - -again: - /* - * Fast path through this function if there are no groups and no - * RMIDs that need cleaning. - */ - if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) { - if (!__rmid_valid(group->hw.cqm_rmid)) { - if (!start) - start = group; - nr_needed++; - } - } - - /* - * We have some event groups, but they all have RMIDs assigned - * and no RMIDs need cleaning. - */ - if (!nr_needed && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - if (!nr_needed) - goto stabilize; - - /* - * We have more event groups without RMIDs than available RMIDs, - * or we have event groups that conflict with the ones currently - * scheduled. - * - * We force deallocate the rmid of the group at the head of - * cache_groups. The first event group without an RMID then gets - * assigned intel_cqm_rotation_rmid. This ensures we always make - * forward progress. - * - * Rotate the cache_groups list so the previous head is now the - * tail. - */ - __intel_cqm_pick_and_rotate(start); - - /* - * If the rotation is going to succeed, reduce the threshold so - * that we don't needlessly reuse dirty RMIDs. - */ - if (__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); - intel_cqm_rotation_rmid = __get_rmid(); - - intel_cqm_sched_out_conflicting_events(start); - - if (__intel_cqm_threshold) - __intel_cqm_threshold--; - } - - rotated = true; - -stabilize: - /* - * We now need to stablize the RMID we freed above (if any) to - * ensure that the next time we rotate we have an RMID with zero - * occupancy value. - * - * Alternatively, if we didn't need to perform any rotation, - * we'll have a bunch of RMIDs in limbo that need stabilizing. - */ - threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale; - - while (intel_cqm_rmid_stabilize(&nr_available) && - __intel_cqm_threshold < threshold_limit) { - unsigned int steal_limit; - - /* - * Don't spin if nobody is actively waiting for an RMID, - * the rotation worker will be kicked as soon as an - * event needs an RMID anyway. - */ - if (!nr_needed) - break; - - /* Allow max 25% of RMIDs to be in limbo. */ - steal_limit = (cqm_max_rmid + 1) / 4; - - /* - * We failed to stabilize any RMIDs so our rotation - * logic is now stuck. In order to make forward progress - * we have a few options: - * - * 1. rotate ("steal") another RMID - * 2. increase the threshold - * 3. do nothing - * - * We do both of 1. and 2. until we hit the steal limit. - * - * The steal limit prevents all RMIDs ending up on the - * limbo list. This can happen if every RMID has a - * non-zero occupancy above threshold_limit, and the - * occupancy values aren't dropping fast enough. - * - * Note that there is prioritisation at work here - we'd - * rather increase the number of RMIDs on the limbo list - * than increase the threshold, because increasing the - * threshold skews the event data (because we reuse - * dirty RMIDs) - threshold bumps are a last resort. - */ - if (nr_available < steal_limit) - goto again; - - __intel_cqm_threshold++; - } - -out: - mutex_unlock(&cache_mutex); - return rotated; -} - -static void intel_cqm_rmid_rotate(struct work_struct *work); - -static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate); - -static struct pmu intel_cqm_pmu; - -static void intel_cqm_rmid_rotate(struct work_struct *work) -{ - unsigned long delay; - - __intel_cqm_rmid_rotate(); - - delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms); - schedule_delayed_work(&intel_cqm_rmid_work, delay); -} - -static u64 update_sample(unsigned int rmid, u32 evt_type, int first) -{ - struct sample *mbm_current; - u32 vrmid = rmid_2_index(rmid); - u64 val, bytes, shift; - u32 eventid; - - if (evt_type == QOS_MBM_LOCAL_EVENT_ID) { - mbm_current = &mbm_local[vrmid]; - eventid = QOS_MBM_LOCAL_EVENT_ID; - } else { - mbm_current = &mbm_total[vrmid]; - eventid = QOS_MBM_TOTAL_EVENT_ID; - } - - wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return mbm_current->total_bytes; - - if (first) { - mbm_current->prev_msr = val; - mbm_current->total_bytes = 0; - return mbm_current->total_bytes; - } - - /* - * The h/w guarantees that counters will not overflow - * so long as we poll them at least once per second. - */ - shift = 64 - MBM_CNTR_WIDTH; - bytes = (val << shift) - (mbm_current->prev_msr << shift); - bytes >>= shift; - - bytes *= cqm_l3_scale; - - mbm_current->total_bytes += bytes; - mbm_current->prev_msr = val; - - return mbm_current->total_bytes; -} - -static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type) -{ - return update_sample(rmid, evt_type, 0); -} - -static void __intel_mbm_event_init(void *info) -{ - struct rmid_read *rr = info; - - update_sample(rr->rmid, rr->evt_type, 1); -} - -static void init_mbm_sample(u32 rmid, u32 evt_type) -{ - struct rmid_read rr = { - .rmid = rmid, - .evt_type = evt_type, - .value = ATOMIC64_INIT(0), - }; - - /* on each socket, init sample */ - on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1); -} - -/* - * Find a group and setup RMID. - * - * If we're part of a group, we use the group's RMID. - */ -static void intel_cqm_setup_event(struct perf_event *event, - struct perf_event **group) -{ - struct perf_event *iter; - bool conflict = false; - u32 rmid; - - event->hw.is_group_event = false; - list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { - rmid = iter->hw.cqm_rmid; - - if (__match_event(iter, event)) { - /* All tasks in a group share an RMID */ - event->hw.cqm_rmid = rmid; - *group = iter; - if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) - init_mbm_sample(rmid, event->attr.config); - return; - } - - /* - * We only care about conflicts for events that are - * actually scheduled in (and hence have a valid RMID). - */ - if (__conflict_event(iter, event) && __rmid_valid(rmid)) - conflict = true; - } - - if (conflict) - rmid = INVALID_RMID; - else - rmid = __get_rmid(); - - if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) - init_mbm_sample(rmid, event->attr.config); - - event->hw.cqm_rmid = rmid; -} - -static void intel_cqm_event_read(struct perf_event *event) -{ - unsigned long flags; - u32 rmid; - u64 val; - - /* - * Task events are handled by intel_cqm_event_count(). - */ - if (event->cpu == -1) - return; - - raw_spin_lock_irqsave(&cache_lock, flags); - rmid = event->hw.cqm_rmid; - - if (!__rmid_valid(rmid)) - goto out; - - if (is_mbm_event(event->attr.config)) - val = rmid_read_mbm(rmid, event->attr.config); - else - val = __rmid_read(rmid); - - /* - * Ignore this reading on error states and do not update the value. - */ - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - goto out; - - local64_set(&event->count, val); -out: - raw_spin_unlock_irqrestore(&cache_lock, flags); -} - -static void __intel_cqm_event_count(void *info) -{ - struct rmid_read *rr = info; - u64 val; - - val = __rmid_read(rr->rmid); - - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; - - atomic64_add(val, &rr->value); -} - -static inline bool cqm_group_leader(struct perf_event *event) -{ - return !list_empty(&event->hw.cqm_groups_entry); -} - -static void __intel_mbm_event_count(void *info) -{ - struct rmid_read *rr = info; - u64 val; - - val = rmid_read_mbm(rr->rmid, rr->evt_type); - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; - atomic64_add(val, &rr->value); -} - -static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer) -{ - struct perf_event *iter, *iter1; - int ret = HRTIMER_RESTART; - struct list_head *head; - unsigned long flags; - u32 grp_rmid; - - /* - * Need to cache_lock as the timer Event Select MSR reads - * can race with the mbm/cqm count() and mbm_init() reads. - */ - raw_spin_lock_irqsave(&cache_lock, flags); - - if (list_empty(&cache_groups)) { - ret = HRTIMER_NORESTART; - goto out; - } - - list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { - grp_rmid = iter->hw.cqm_rmid; - if (!__rmid_valid(grp_rmid)) - continue; - if (is_mbm_event(iter->attr.config)) - update_sample(grp_rmid, iter->attr.config, 0); - - head = &iter->hw.cqm_group_entry; - if (list_empty(head)) - continue; - list_for_each_entry(iter1, head, hw.cqm_group_entry) { - if (!iter1->hw.is_group_event) - break; - if (is_mbm_event(iter1->attr.config)) - update_sample(iter1->hw.cqm_rmid, - iter1->attr.config, 0); - } - } - - hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME)); -out: - raw_spin_unlock_irqrestore(&cache_lock, flags); - - return ret; -} - -static void __mbm_start_timer(void *info) -{ - hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME), - HRTIMER_MODE_REL_PINNED); -} - -static void __mbm_stop_timer(void *info) -{ - hrtimer_cancel(&mbm_timers[pkg_id]); -} - -static void mbm_start_timers(void) -{ - on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1); -} - -static void mbm_stop_timers(void) -{ - on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1); -} - -static void mbm_hrtimer_init(void) -{ - struct hrtimer *hr; - int i; - - for (i = 0; i < mbm_socket_max; i++) { - hr = &mbm_timers[i]; - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hr->function = mbm_hrtimer_handle; - } -} - -static u64 intel_cqm_event_count(struct perf_event *event) -{ - unsigned long flags; - struct rmid_read rr = { - .evt_type = event->attr.config, - .value = ATOMIC64_INIT(0), - }; - - /* - * We only need to worry about task events. System-wide events - * are handled like usual, i.e. entirely with - * intel_cqm_event_read(). - */ - if (event->cpu != -1) - return __perf_event_count(event); - - /* - * Only the group leader gets to report values except in case of - * multiple events in the same group, we still need to read the - * other events.This stops us - * reporting duplicate values to userspace, and gives us a clear - * rule for which task gets to report the values. - * - * Note that it is impossible to attribute these values to - * specific packages - we forfeit that ability when we create - * task events. - */ - if (!cqm_group_leader(event) && !event->hw.is_group_event) - return 0; - - /* - * Getting up-to-date values requires an SMP IPI which is not - * possible if we're being called in interrupt context. Return - * the cached values instead. - */ - if (unlikely(in_interrupt())) - goto out; - - /* - * Notice that we don't perform the reading of an RMID - * atomically, because we can't hold a spin lock across the - * IPIs. - * - * Speculatively perform the read, since @event might be - * assigned a different (possibly invalid) RMID while we're - * busying performing the IPI calls. It's therefore necessary to - * check @event's RMID afterwards, and if it has changed, - * discard the result of the read. - */ - rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid); - - if (!__rmid_valid(rr.rmid)) - goto out; - - cqm_mask_call(&rr); - - raw_spin_lock_irqsave(&cache_lock, flags); - if (event->hw.cqm_rmid == rr.rmid) - local64_set(&event->count, atomic64_read(&rr.value)); - raw_spin_unlock_irqrestore(&cache_lock, flags); -out: - return __perf_event_count(event); -} - -static void intel_cqm_event_start(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - u32 rmid = event->hw.cqm_rmid; - - if (!(event->hw.cqm_state & PERF_HES_STOPPED)) - return; - - event->hw.cqm_state &= ~PERF_HES_STOPPED; - - if (state->rmid_usecnt++) { - if (!WARN_ON_ONCE(state->rmid != rmid)) - return; - } else { - WARN_ON_ONCE(state->rmid); - } - - state->rmid = rmid; - wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); -} - -static void intel_cqm_event_stop(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - - if (event->hw.cqm_state & PERF_HES_STOPPED) - return; - - event->hw.cqm_state |= PERF_HES_STOPPED; - - intel_cqm_event_read(event); - - if (!--state->rmid_usecnt) { - state->rmid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid); - } else { - WARN_ON_ONCE(!state->rmid); - } -} - -static int intel_cqm_event_add(struct perf_event *event, int mode) -{ - unsigned long flags; - u32 rmid; - - raw_spin_lock_irqsave(&cache_lock, flags); - - event->hw.cqm_state = PERF_HES_STOPPED; - rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid) && (mode & PERF_EF_START)) - intel_cqm_event_start(event, mode); - - raw_spin_unlock_irqrestore(&cache_lock, flags); - - return 0; -} - -static void intel_cqm_event_destroy(struct perf_event *event) -{ - struct perf_event *group_other = NULL; - unsigned long flags; - - mutex_lock(&cache_mutex); - /* - * Hold the cache_lock as mbm timer handlers could be - * scanning the list of events. - */ - raw_spin_lock_irqsave(&cache_lock, flags); - - /* - * If there's another event in this group... - */ - if (!list_empty(&event->hw.cqm_group_entry)) { - group_other = list_first_entry(&event->hw.cqm_group_entry, - struct perf_event, - hw.cqm_group_entry); - list_del(&event->hw.cqm_group_entry); - } - - /* - * And we're the group leader.. - */ - if (cqm_group_leader(event)) { - /* - * If there was a group_other, make that leader, otherwise - * destroy the group and return the RMID. - */ - if (group_other) { - list_replace(&event->hw.cqm_groups_entry, - &group_other->hw.cqm_groups_entry); - } else { - u32 rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid)) - __put_rmid(rmid); - list_del(&event->hw.cqm_groups_entry); - } - } - - raw_spin_unlock_irqrestore(&cache_lock, flags); - - /* - * Stop the mbm overflow timers when the last event is destroyed. - */ - if (mbm_enabled && list_empty(&cache_groups)) - mbm_stop_timers(); - - mutex_unlock(&cache_mutex); -} - -static int intel_cqm_event_init(struct perf_event *event) -{ - struct perf_event *group = NULL; - bool rotate = false; - unsigned long flags; - - if (event->attr.type != intel_cqm_pmu.type) - return -ENOENT; - - if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) || - (event->attr.config > QOS_MBM_LOCAL_EVENT_ID)) - return -EINVAL; - - if ((is_cqm_event(event->attr.config) && !cqm_enabled) || - (is_mbm_event(event->attr.config) && !mbm_enabled)) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - INIT_LIST_HEAD(&event->hw.cqm_group_entry); - INIT_LIST_HEAD(&event->hw.cqm_groups_entry); - - event->destroy = intel_cqm_event_destroy; - - mutex_lock(&cache_mutex); - - /* - * Start the mbm overflow timers when the first event is created. - */ - if (mbm_enabled && list_empty(&cache_groups)) - mbm_start_timers(); - - /* Will also set rmid */ - intel_cqm_setup_event(event, &group); - - /* - * Hold the cache_lock as mbm timer handlers be - * scanning the list of events. - */ - raw_spin_lock_irqsave(&cache_lock, flags); - - if (group) { - list_add_tail(&event->hw.cqm_group_entry, - &group->hw.cqm_group_entry); - } else { - list_add_tail(&event->hw.cqm_groups_entry, - &cache_groups); - - /* - * All RMIDs are either in use or have recently been - * used. Kick the rotation worker to clean/free some. - * - * We only do this for the group leader, rather than for - * every event in a group to save on needless work. - */ - if (!__rmid_valid(event->hw.cqm_rmid)) - rotate = true; - } - - raw_spin_unlock_irqrestore(&cache_lock, flags); - mutex_unlock(&cache_mutex); - - if (rotate) - schedule_delayed_work(&intel_cqm_rmid_work, 0); - - return 0; -} - -EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); -EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); -EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); -EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); -EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); - -EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02"); -EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1"); -EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB"); -EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6"); - -EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03"); -EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1"); -EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB"); -EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6"); - -static struct attribute *intel_cqm_events_attr[] = { - EVENT_PTR(intel_cqm_llc), - EVENT_PTR(intel_cqm_llc_pkg), - EVENT_PTR(intel_cqm_llc_unit), - EVENT_PTR(intel_cqm_llc_scale), - EVENT_PTR(intel_cqm_llc_snapshot), - NULL, -}; - -static struct attribute *intel_mbm_events_attr[] = { - EVENT_PTR(intel_cqm_total_bytes), - EVENT_PTR(intel_cqm_local_bytes), - EVENT_PTR(intel_cqm_total_bytes_pkg), - EVENT_PTR(intel_cqm_local_bytes_pkg), - EVENT_PTR(intel_cqm_total_bytes_unit), - EVENT_PTR(intel_cqm_local_bytes_unit), - EVENT_PTR(intel_cqm_total_bytes_scale), - EVENT_PTR(intel_cqm_local_bytes_scale), - NULL, -}; - -static struct attribute *intel_cmt_mbm_events_attr[] = { - EVENT_PTR(intel_cqm_llc), - EVENT_PTR(intel_cqm_total_bytes), - EVENT_PTR(intel_cqm_local_bytes), - EVENT_PTR(intel_cqm_llc_pkg), - EVENT_PTR(intel_cqm_total_bytes_pkg), - EVENT_PTR(intel_cqm_local_bytes_pkg), - EVENT_PTR(intel_cqm_llc_unit), - EVENT_PTR(intel_cqm_total_bytes_unit), - EVENT_PTR(intel_cqm_local_bytes_unit), - EVENT_PTR(intel_cqm_llc_scale), - EVENT_PTR(intel_cqm_total_bytes_scale), - EVENT_PTR(intel_cqm_local_bytes_scale), - EVENT_PTR(intel_cqm_llc_snapshot), - NULL, -}; - -static struct attribute_group intel_cqm_events_group = { - .name = "events", - .attrs = NULL, -}; - -PMU_FORMAT_ATTR(event, "config:0-7"); -static struct attribute *intel_cqm_formats_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group intel_cqm_format_group = { - .name = "format", - .attrs = intel_cqm_formats_attr, -}; - -static ssize_t -max_recycle_threshold_show(struct device *dev, struct device_attribute *attr, - char *page) -{ - ssize_t rv; - - mutex_lock(&cache_mutex); - rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold); - mutex_unlock(&cache_mutex); - - return rv; -} - -static ssize_t -max_recycle_threshold_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - unsigned int bytes, cachelines; - int ret; - - ret = kstrtouint(buf, 0, &bytes); - if (ret) - return ret; - - mutex_lock(&cache_mutex); - - __intel_cqm_max_threshold = bytes; - cachelines = bytes / cqm_l3_scale; - - /* - * The new maximum takes effect immediately. - */ - if (__intel_cqm_threshold > cachelines) - __intel_cqm_threshold = cachelines; - - mutex_unlock(&cache_mutex); - - return count; -} - -static DEVICE_ATTR_RW(max_recycle_threshold); - -static struct attribute *intel_cqm_attrs[] = { - &dev_attr_max_recycle_threshold.attr, - NULL, -}; - -static const struct attribute_group intel_cqm_group = { - .attrs = intel_cqm_attrs, -}; - -static const struct attribute_group *intel_cqm_attr_groups[] = { - &intel_cqm_events_group, - &intel_cqm_format_group, - &intel_cqm_group, - NULL, -}; - -static struct pmu intel_cqm_pmu = { - .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME, - .attr_groups = intel_cqm_attr_groups, - .task_ctx_nr = perf_sw_context, - .event_init = intel_cqm_event_init, - .add = intel_cqm_event_add, - .del = intel_cqm_event_stop, - .start = intel_cqm_event_start, - .stop = intel_cqm_event_stop, - .read = intel_cqm_event_read, - .count = intel_cqm_event_count, -}; - -static inline void cqm_pick_event_reader(int cpu) -{ - int reader; - - /* First online cpu in package becomes the reader */ - reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu)); - if (reader >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cqm_cpumask); -} - -static int intel_cqm_cpu_starting(unsigned int cpu) -{ - struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); - struct cpuinfo_x86 *c = &cpu_data(cpu); - - state->rmid = 0; - state->closid = 0; - state->rmid_usecnt = 0; - - WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); - WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); - - cqm_pick_event_reader(cpu); - return 0; -} - -static int intel_cqm_cpu_exit(unsigned int cpu) -{ - int target; - - /* Is @cpu the current cqm reader for this package ? */ - if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) - return 0; - - /* Find another online reader in this package */ - target = cpumask_any_but(topology_core_cpumask(cpu), cpu); - - if (target < nr_cpu_ids) - cpumask_set_cpu(target, &cqm_cpumask); - - return 0; -} - -static const struct x86_cpu_id intel_cqm_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, - {} -}; - -static void mbm_cleanup(void) -{ - if (!mbm_enabled) - return; - - kfree(mbm_local); - kfree(mbm_total); - mbm_enabled = false; -} - -static const struct x86_cpu_id intel_mbm_local_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL }, - {} -}; - -static const struct x86_cpu_id intel_mbm_total_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL }, - {} -}; - -static int intel_mbm_init(void) -{ - int ret = 0, array_size, maxid = cqm_max_rmid + 1; - - mbm_socket_max = topology_max_packages(); - array_size = sizeof(struct sample) * maxid * mbm_socket_max; - mbm_local = kmalloc(array_size, GFP_KERNEL); - if (!mbm_local) - return -ENOMEM; - - mbm_total = kmalloc(array_size, GFP_KERNEL); - if (!mbm_total) { - ret = -ENOMEM; - goto out; - } - - array_size = sizeof(struct hrtimer) * mbm_socket_max; - mbm_timers = kmalloc(array_size, GFP_KERNEL); - if (!mbm_timers) { - ret = -ENOMEM; - goto out; - } - mbm_hrtimer_init(); - -out: - if (ret) - mbm_cleanup(); - - return ret; -} - -static int __init intel_cqm_init(void) -{ - char *str = NULL, scale[20]; - int cpu, ret; - - if (x86_match_cpu(intel_cqm_match)) - cqm_enabled = true; - - if (x86_match_cpu(intel_mbm_local_match) && - x86_match_cpu(intel_mbm_total_match)) - mbm_enabled = true; - - if (!cqm_enabled && !mbm_enabled) - return -ENODEV; - - cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; - - /* - * It's possible that not all resources support the same number - * of RMIDs. Instead of making scheduling much more complicated - * (where we have to match a task's RMID to a cpu that supports - * that many RMIDs) just find the minimum RMIDs supported across - * all cpus. - * - * Also, check that the scales match on all cpus. - */ - cpus_read_lock(); - for_each_online_cpu(cpu) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (c->x86_cache_max_rmid < cqm_max_rmid) - cqm_max_rmid = c->x86_cache_max_rmid; - - if (c->x86_cache_occ_scale != cqm_l3_scale) { - pr_err("Multiple LLC scale values, disabling\n"); - ret = -EINVAL; - goto out; - } - } - - /* - * A reasonable upper limit on the max threshold is the number - * of lines tagged per RMID if all RMIDs have the same number of - * lines tagged in the LLC. - * - * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. - */ - __intel_cqm_max_threshold = - boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); - - snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); - str = kstrdup(scale, GFP_KERNEL); - if (!str) { - ret = -ENOMEM; - goto out; - } - - event_attr_intel_cqm_llc_scale.event_str = str; - - ret = intel_cqm_setup_rmid_cache(); - if (ret) - goto out; - - if (mbm_enabled) - ret = intel_mbm_init(); - if (ret && !cqm_enabled) - goto out; - - if (cqm_enabled && mbm_enabled) - intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr; - else if (!cqm_enabled && mbm_enabled) - intel_cqm_events_group.attrs = intel_mbm_events_attr; - else if (cqm_enabled && !mbm_enabled) - intel_cqm_events_group.attrs = intel_cqm_events_attr; - - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); - if (ret) { - pr_err("Intel CQM perf registration failed: %d\n", ret); - goto out; - } - - if (cqm_enabled) - pr_info("Intel CQM monitoring enabled\n"); - if (mbm_enabled) - pr_info("Intel MBM enabled\n"); - - /* - * Setup the hot cpu notifier once we are sure cqm - * is enabled to avoid notifier leak. - */ - cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_STARTING, - "perf/x86/cqm:starting", - intel_cqm_cpu_starting, NULL); - cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_ONLINE, - "perf/x86/cqm:online", - NULL, intel_cqm_cpu_exit); -out: - cpus_read_unlock(); - - if (ret) { - kfree(str); - cqm_cleanup(); - mbm_cleanup(); - } - - return ret; -} -device_initcall(intel_cqm_init); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index a322fed5f8ed..e1965e5ff570 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -49,34 +49,47 @@ union intel_x86_pebs_dse { */ #define P(a, b) PERF_MEM_S(a, b) #define OP_LH (P(OP, LOAD) | P(LVL, HIT)) +#define LEVEL(x) P(LVLNUM, x) +#define REM P(REMOTE, REMOTE) #define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) /* Version for Sandy Bridge and later */ static u64 pebs_data_source[] = { - P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ - OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ - OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ - OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ - OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ - OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ - OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ - OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ - OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */ - OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */ - OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */ - OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ + P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 local */ + OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ + OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ + OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ + OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ + OP_LH | P(LVL, REM_RAM1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ + OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | SNOOP_NONE_MISS, /* 0x0c: L3 miss, excl */ + OP_LH | P(LVL, REM_RAM1) | LEVEL(RAM) | REM | SNOOP_NONE_MISS, /* 0x0d: L3 miss, excl */ + OP_LH | P(LVL, IO) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0e: I/O */ + OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */ }; /* Patch up minor differences in the bits */ void __init intel_pmu_pebs_data_source_nhm(void) { - pebs_data_source[0x05] = OP_LH | P(LVL, L3) | P(SNOOP, HIT); - pebs_data_source[0x06] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); - pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); + pebs_data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT); + pebs_data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); + pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); +} + +void __init intel_pmu_pebs_data_source_skl(bool pmem) +{ + u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4); + + pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT); + pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT); + pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE); + pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD); + pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM); } static u64 precise_store_data(u64 status) @@ -149,8 +162,6 @@ static u64 load_latency_data(u64 status) { union intel_x86_pebs_dse dse; u64 val; - int model = boot_cpu_data.x86_model; - int fam = boot_cpu_data.x86; dse.val = status; @@ -162,8 +173,7 @@ static u64 load_latency_data(u64 status) /* * Nehalem models do not support TLB, Lock infos */ - if (fam == 0x6 && (model == 26 || model == 30 - || model == 31 || model == 46)) { + if (x86_pmu.pebs_no_tlb) { val |= P(TLB, NA) | P(LOCK, NA); return val; } @@ -1175,7 +1185,7 @@ static void setup_pebs_sample_data(struct perf_event *event, else regs->flags &= ~PERF_EFLAGS_EXACT; - if ((sample_type & PERF_SAMPLE_ADDR) && + if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) && x86_pmu.intel_cap.pebs_format >= 1) data->addr = pebs->dla; diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 955457a30197..8a6bbacd17dc 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -109,6 +109,9 @@ enum { X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ X86_BR_CALL_STACK = 1 << 16,/* call stack */ X86_BR_IND_JMP = 1 << 17,/* indirect jump */ + + X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */ + }; #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) @@ -514,6 +517,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].in_tx = 0; cpuc->lbr_entries[i].abort = 0; cpuc->lbr_entries[i].cycles = 0; + cpuc->lbr_entries[i].type = 0; cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; @@ -600,6 +604,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_entries[out].in_tx = in_tx; cpuc->lbr_entries[out].abort = abort; cpuc->lbr_entries[out].cycles = cycles; + cpuc->lbr_entries[out].type = 0; cpuc->lbr_entries[out].reserved = 0; out++; } @@ -677,6 +682,10 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) if (br_type & PERF_SAMPLE_BRANCH_CALL) mask |= X86_BR_CALL | X86_BR_ZERO_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE) + mask |= X86_BR_TYPE_SAVE; + /* * stash actual user request into reg, it may * be used by fixup code for some CPU @@ -930,6 +939,43 @@ static int branch_type(unsigned long from, unsigned long to, int abort) return ret; } +#define X86_BR_TYPE_MAP_MAX 16 + +static int branch_map[X86_BR_TYPE_MAP_MAX] = { + PERF_BR_CALL, /* X86_BR_CALL */ + PERF_BR_RET, /* X86_BR_RET */ + PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ + PERF_BR_SYSRET, /* X86_BR_SYSRET */ + PERF_BR_UNKNOWN, /* X86_BR_INT */ + PERF_BR_UNKNOWN, /* X86_BR_IRET */ + PERF_BR_COND, /* X86_BR_JCC */ + PERF_BR_UNCOND, /* X86_BR_JMP */ + PERF_BR_UNKNOWN, /* X86_BR_IRQ */ + PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_ABORT */ + PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ + PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ + PERF_BR_CALL, /* X86_BR_ZERO_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ + PERF_BR_IND, /* X86_BR_IND_JMP */ +}; + +static int +common_branch_type(int type) +{ + int i; + + type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */ + + if (type) { + i = __ffs(type); + if (i < X86_BR_TYPE_MAP_MAX) + return branch_map[i]; + } + + return PERF_BR_UNKNOWN; +} + /* * implement actual branch filter based on user demand. * Hardware may not exactly satisfy that request, thus @@ -946,7 +992,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) bool compress = false; /* if sampling all branches, then nothing to filter */ - if ((br_sel & X86_BR_ALL) == X86_BR_ALL) + if (((br_sel & X86_BR_ALL) == X86_BR_ALL) && + ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE)) return; for (i = 0; i < cpuc->lbr_stack.nr; i++) { @@ -967,6 +1014,9 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].from = 0; compress = true; } + + if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE) + cpuc->lbr_entries[i].type = common_branch_type(type); } if (!compress) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index ae8324d65e61..81fd41d5a0d9 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -471,8 +471,9 @@ static void pt_config(struct perf_event *event) struct pt *pt = this_cpu_ptr(&pt_ctx); u64 reg; - if (!event->hw.itrace_started) { - event->hw.itrace_started = 1; + /* First round: clear STATUS, in particular the PSB byte counter. */ + if (!event->hw.config) { + perf_event_itrace_started(event); wrmsrl(MSR_IA32_RTIT_STATUS, 0); } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 476aec3a4cab..4196f81ec0e1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -91,7 +91,7 @@ struct amd_nb { (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ - PERF_SAMPLE_TRANSACTION) + PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR) /* * A debug store configuration. @@ -558,6 +558,7 @@ struct x86_pmu { int attr_rdpmc; struct attribute **format_attrs; struct attribute **event_attrs; + struct attribute **caps_attrs; ssize_t (*events_sysfs_show)(char *page, u64 config); struct attribute **cpu_events; @@ -591,7 +592,8 @@ struct x86_pmu { pebs :1, pebs_active :1, pebs_broken :1, - pebs_prec_dist :1; + pebs_prec_dist :1, + pebs_no_tlb :1; int pebs_record_size; int pebs_buffer_size; void (*drain_pebs)(struct pt_regs *regs); @@ -741,6 +743,8 @@ int x86_reserve_hardware(void); void x86_release_hardware(void); +int x86_pmu_max_precise(void); + void hw_perf_lbr_event_destroy(struct perf_event *event); int x86_setup_perfctr(struct perf_event *event); @@ -947,6 +951,8 @@ void intel_pmu_lbr_init_knl(void); void intel_pmu_pebs_data_source_nhm(void); +void intel_pmu_pebs_data_source_skl(bool pmem); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 724153797209..e0bb46c02857 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -226,7 +226,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, if (ksig->ka.sa.sa_flags & SA_ONSTACK) sp = sigsp(sp, ksig); /* This is the legacy signal stack switching. */ - else if ((regs->ss & 0xffff) != __USER32_DS && + else if (regs->ss != __USER32_DS && !(ksig->ka.sa.sa_flags & SA_RESTORER) && ksig->ka.sa.sa_restorer) sp = (unsigned long) ksig->ka.sa.sa_restorer; diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 2efc768e4362..72d867f6b518 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -150,8 +150,6 @@ static inline void disable_acpi(void) { } extern int x86_acpi_numa_init(void); #endif /* CONFIG_ACPI_NUMA */ -#define acpi_unlazy_tlb(x) leave_mm(x) - #ifdef CONFIG_ACPI_APEI static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) { @@ -162,12 +160,13 @@ static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) * you call efi_mem_attributes() during boot and at runtime, * you could theoretically see different attributes. * - * Since we are yet to see any x86 platforms that require - * anything other than PAGE_KERNEL (some arm64 platforms - * require the equivalent of PAGE_KERNEL_NOCACHE), return that - * until we know differently. + * We are yet to see any x86 platforms that require anything + * other than PAGE_KERNEL (some ARM64 platforms require the + * equivalent of PAGE_KERNEL_NOCACHE). Additionally, if SME + * is active, the ACPI information will not be encrypted, + * so return PAGE_KERNEL_NOENC until we know differently. */ - return PAGE_KERNEL; + return PAGE_KERNEL_NOENC; } #endif diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 7a9df3beb89b..676ee5807d86 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -74,6 +74,9 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) +# define _ASM_EXTABLE_REFCOUNT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) + # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ _ASM_ALIGN ; \ @@ -123,6 +126,9 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) +# define _ASM_EXTABLE_REFCOUNT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) + /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 33380b871463..0874ebda3069 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -197,35 +197,56 @@ static inline int atomic_xchg(atomic_t *v, int new) return xchg(&v->counter, new); } -#define ATOMIC_OP(op) \ -static inline void atomic_##op(int i, atomic_t *v) \ -{ \ - asm volatile(LOCK_PREFIX #op"l %1,%0" \ - : "+m" (v->counter) \ - : "ir" (i) \ - : "memory"); \ +static inline void atomic_and(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "andl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); +} + +static inline int atomic_fetch_and(int i, atomic_t *v) +{ + int val = atomic_read(v); + + do { } while (!atomic_try_cmpxchg(v, &val, val & i)); + + return val; } -#define ATOMIC_FETCH_OP(op, c_op) \ -static inline int atomic_fetch_##op(int i, atomic_t *v) \ -{ \ - int val = atomic_read(v); \ - do { \ - } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \ - return val; \ +static inline void atomic_or(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "orl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); } -#define ATOMIC_OPS(op, c_op) \ - ATOMIC_OP(op) \ - ATOMIC_FETCH_OP(op, c_op) +static inline int atomic_fetch_or(int i, atomic_t *v) +{ + int val = atomic_read(v); -ATOMIC_OPS(and, &) -ATOMIC_OPS(or , |) -ATOMIC_OPS(xor, ^) + do { } while (!atomic_try_cmpxchg(v, &val, val | i)); -#undef ATOMIC_OPS -#undef ATOMIC_FETCH_OP -#undef ATOMIC_OP + return val; +} + +static inline void atomic_xor(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "xorl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); +} + +static inline int atomic_fetch_xor(int i, atomic_t *v) +{ + int val = atomic_read(v); + + do { } while (!atomic_try_cmpxchg(v, &val, val ^ i)); + + return val; +} /** * __atomic_add_unless - add unless the number is already a given value @@ -239,10 +260,12 @@ ATOMIC_OPS(xor, ^) static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) { int c = atomic_read(v); + do { if (unlikely(c == u)) break; } while (!atomic_try_cmpxchg(v, &c, c + a)); + return c; } diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 71d7705fb303..9e206f31ce2a 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -312,37 +312,70 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) #undef alternative_atomic64 #undef __alternative_atomic64 -#define ATOMIC64_OP(op, c_op) \ -static inline void atomic64_##op(long long i, atomic64_t *v) \ -{ \ - long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ - c = old; \ +static inline void atomic64_and(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + c = old; } -#define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long long atomic64_fetch_##op(long long i, atomic64_t *v) \ -{ \ - long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ - c = old; \ - return old; \ +static inline long long atomic64_fetch_and(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + c = old; + + return old; } -ATOMIC64_FETCH_OP(add, +) +static inline void atomic64_or(long long i, atomic64_t *v) +{ + long long old, c = 0; -#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) + while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + c = old; +} + +static inline long long atomic64_fetch_or(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + c = old; + + return old; +} -#define ATOMIC64_OPS(op, c_op) \ - ATOMIC64_OP(op, c_op) \ - ATOMIC64_FETCH_OP(op, c_op) +static inline void atomic64_xor(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + c = old; +} -ATOMIC64_OPS(and, &) -ATOMIC64_OPS(or, |) -ATOMIC64_OPS(xor, ^) +static inline long long atomic64_fetch_xor(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + c = old; + + return old; +} -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP +static inline long long atomic64_fetch_add(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c + i)) != c) + c = old; + + return old; +} + +#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) #endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 6189a433c9a9..5d9de36a2f04 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -177,7 +177,7 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) } #define atomic64_try_cmpxchg atomic64_try_cmpxchg -static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new) +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new) { return try_cmpxchg(&v->counter, old, new); } @@ -198,7 +198,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new) */ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) { - long c = atomic64_read(v); + s64 c = atomic64_read(v); do { if (unlikely(c == u)) return false; @@ -217,7 +217,7 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) */ static inline long atomic64_dec_if_positive(atomic64_t *v) { - long dec, c = atomic64_read(v); + s64 dec, c = atomic64_read(v); do { dec = c - 1; if (unlikely(dec < 0)) @@ -226,34 +226,55 @@ static inline long atomic64_dec_if_positive(atomic64_t *v) return dec; } -#define ATOMIC64_OP(op) \ -static inline void atomic64_##op(long i, atomic64_t *v) \ -{ \ - asm volatile(LOCK_PREFIX #op"q %1,%0" \ - : "+m" (v->counter) \ - : "er" (i) \ - : "memory"); \ +static inline void atomic64_and(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "andq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); } -#define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ -{ \ - long val = atomic64_read(v); \ - do { \ - } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \ - return val; \ +static inline long atomic64_fetch_and(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); + + do { + } while (!atomic64_try_cmpxchg(v, &val, val & i)); + return val; } -#define ATOMIC64_OPS(op, c_op) \ - ATOMIC64_OP(op) \ - ATOMIC64_FETCH_OP(op, c_op) +static inline void atomic64_or(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "orq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); +} -ATOMIC64_OPS(and, &) -ATOMIC64_OPS(or, |) -ATOMIC64_OPS(xor, ^) +static inline long atomic64_fetch_or(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP + do { + } while (!atomic64_try_cmpxchg(v, &val, val | i)); + return val; +} + +static inline void atomic64_xor(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "xorq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); +} + +static inline long atomic64_fetch_xor(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); + + do { + } while (!atomic64_try_cmpxchg(v, &val, val ^ i)); + return val; +} #endif /* _ASM_X86_ATOMIC64_64_H */ diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h index e01f7f7ccb0c..84ae170bc3d0 100644 --- a/arch/x86/include/asm/cmdline.h +++ b/arch/x86/include/asm/cmdline.h @@ -2,5 +2,7 @@ #define _ASM_X86_CMDLINE_H int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); +int cmdline_find_option(const char *cmdline_ptr, const char *option, + char *buffer, int bufsize); #endif /* _ASM_X86_CMDLINE_H */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index d90296d061e8..b5069e802d5c 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -157,7 +157,7 @@ extern void __add_wrong_size(void) #define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \ ({ \ bool success; \ - __typeof__(_ptr) _old = (_pold); \ + __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ switch (size) { \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 5a28e8e55e36..42bbbf0f173d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -177,7 +177,7 @@ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ #define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ #define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ -#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */ #define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ /* @@ -196,6 +196,7 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index d0a21b12dd58..1a2ba368da39 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -5,6 +5,7 @@ #include <asm/ldt.h> #include <asm/mmu.h> #include <asm/fixmap.h> +#include <asm/irq_vectors.h> #include <linux/smp.h> #include <linux/percpu.h> @@ -22,7 +23,7 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in desc->s = 1; desc->dpl = 0x3; desc->p = info->seg_not_present ^ 1; - desc->limit = (info->limit & 0xf0000) >> 16; + desc->limit1 = (info->limit & 0xf0000) >> 16; desc->avl = info->useable; desc->d = info->seg_32bit; desc->g = info->limit_in_pages; @@ -83,33 +84,25 @@ static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu) return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu)); } -#ifdef CONFIG_X86_64 - static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, unsigned dpl, unsigned ist, unsigned seg) { - gate->offset_low = PTR_LOW(func); + gate->offset_low = (u16) func; + gate->bits.p = 1; + gate->bits.dpl = dpl; + gate->bits.zero = 0; + gate->bits.type = type; + gate->offset_middle = (u16) (func >> 16); +#ifdef CONFIG_X86_64 gate->segment = __KERNEL_CS; - gate->ist = ist; - gate->p = 1; - gate->dpl = dpl; - gate->zero0 = 0; - gate->zero1 = 0; - gate->type = type; - gate->offset_middle = PTR_MIDDLE(func); - gate->offset_high = PTR_HIGH(func); -} - + gate->bits.ist = ist; + gate->reserved = 0; + gate->offset_high = (u32) (func >> 32); #else -static inline void pack_gate(gate_desc *gate, unsigned char type, - unsigned long base, unsigned dpl, unsigned flags, - unsigned short seg) -{ - gate->a = (seg << 16) | (base & 0xffff); - gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8); -} - + gate->segment = seg; + gate->bits.ist = 0; #endif +} static inline int desc_empty(const void *ptr) { @@ -173,35 +166,22 @@ native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int memcpy(&gdt[entry], desc, size); } -static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, - unsigned long limit, unsigned char type, - unsigned char flags) -{ - desc->a = ((base & 0xffff) << 16) | (limit & 0xffff); - desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) | - (limit & 0x000f0000) | ((type & 0xff) << 8) | - ((flags & 0xf) << 20); - desc->p = 1; -} - - -static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size) +static inline void set_tssldt_descriptor(void *d, unsigned long addr, + unsigned type, unsigned size) { -#ifdef CONFIG_X86_64 - struct ldttss_desc64 *desc = d; + struct ldttss_desc *desc = d; memset(desc, 0, sizeof(*desc)); - desc->limit0 = size & 0xFFFF; - desc->base0 = PTR_LOW(addr); - desc->base1 = PTR_MIDDLE(addr) & 0xFF; + desc->limit0 = (u16) size; + desc->base0 = (u16) addr; + desc->base1 = (addr >> 16) & 0xFF; desc->type = type; desc->p = 1; desc->limit1 = (size >> 16) & 0xF; - desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; - desc->base3 = PTR_HIGH(addr); -#else - pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); + desc->base2 = (addr >> 24) & 0xFF; +#ifdef CONFIG_X86_64 + desc->base3 = (u32) (addr >> 32); #endif } @@ -401,147 +381,20 @@ static inline void set_desc_base(struct desc_struct *desc, unsigned long base) static inline unsigned long get_desc_limit(const struct desc_struct *desc) { - return desc->limit0 | (desc->limit << 16); + return desc->limit0 | (desc->limit1 << 16); } static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) { desc->limit0 = limit & 0xffff; - desc->limit = (limit >> 16) & 0xf; -} - -#ifdef CONFIG_X86_64 -static inline void set_nmi_gate(int gate, void *addr) -{ - gate_desc s; - - pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS); - write_idt_entry(debug_idt_table, gate, &s); + desc->limit1 = (limit >> 16) & 0xf; } -#endif -#ifdef CONFIG_TRACING -extern struct desc_ptr trace_idt_descr; -extern gate_desc trace_idt_table[]; -static inline void write_trace_idt_entry(int entry, const gate_desc *gate) -{ - write_idt_entry(trace_idt_table, entry, gate); -} +void update_intr_gate(unsigned int n, const void *addr); +void alloc_intr_gate(unsigned int n, const void *addr); -static inline void _trace_set_gate(int gate, unsigned type, void *addr, - unsigned dpl, unsigned ist, unsigned seg) -{ - gate_desc s; - - pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); - /* - * does not need to be atomic because it is only done once at - * setup time - */ - write_trace_idt_entry(gate, &s); -} -#else -static inline void write_trace_idt_entry(int entry, const gate_desc *gate) -{ -} - -#define _trace_set_gate(gate, type, addr, dpl, ist, seg) -#endif - -static inline void _set_gate(int gate, unsigned type, void *addr, - unsigned dpl, unsigned ist, unsigned seg) -{ - gate_desc s; - - pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); - /* - * does not need to be atomic because it is only done once at - * setup time - */ - write_idt_entry(idt_table, gate, &s); - write_trace_idt_entry(gate, &s); -} - -/* - * This needs to use 'idt_table' rather than 'idt', and - * thus use the _nonmapped_ version of the IDT, as the - * Pentium F0 0F bugfix can have resulted in the mapped - * IDT being write-protected. - */ -#define set_intr_gate_notrace(n, addr) \ - do { \ - BUG_ON((unsigned)n > 0xFF); \ - _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \ - __KERNEL_CS); \ - } while (0) - -#define set_intr_gate(n, addr) \ - do { \ - set_intr_gate_notrace(n, addr); \ - _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\ - 0, 0, __KERNEL_CS); \ - } while (0) - -extern int first_system_vector; -/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */ extern unsigned long used_vectors[]; -static inline void alloc_system_vector(int vector) -{ - if (!test_bit(vector, used_vectors)) { - set_bit(vector, used_vectors); - if (first_system_vector > vector) - first_system_vector = vector; - } else { - BUG(); - } -} - -#define alloc_intr_gate(n, addr) \ - do { \ - alloc_system_vector(n); \ - set_intr_gate(n, addr); \ - } while (0) - -/* - * This routine sets up an interrupt gate at directory privilege level 3. - */ -static inline void set_system_intr_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); -} - -static inline void set_system_trap_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS); -} - -static inline void set_trap_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS); -} - -static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3)); -} - -static inline void set_intr_gate_ist(int n, void *addr, unsigned ist) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); -} - -static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); -} - #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u32, debug_idt_ctr); static inline bool is_debug_idt_enabled(void) @@ -567,31 +420,6 @@ static inline void load_debug_idt(void) } #endif -#ifdef CONFIG_TRACING -extern atomic_t trace_idt_ctr; -static inline bool is_trace_idt_enabled(void) -{ - if (atomic_read(&trace_idt_ctr)) - return true; - - return false; -} - -static inline void load_trace_idt(void) -{ - load_idt((const struct desc_ptr *)&trace_idt_descr); -} -#else -static inline bool is_trace_idt_enabled(void) -{ - return false; -} - -static inline void load_trace_idt(void) -{ -} -#endif - /* * The load_current_idt() must be called with interrupts disabled * to avoid races. That way the IDT will always be set back to the expected @@ -603,9 +431,25 @@ static inline void load_current_idt(void) { if (is_debug_idt_enabled()) load_debug_idt(); - else if (is_trace_idt_enabled()) - load_trace_idt(); else load_idt((const struct desc_ptr *)&idt_descr); } + +extern void idt_setup_early_handler(void); +extern void idt_setup_early_traps(void); +extern void idt_setup_traps(void); +extern void idt_setup_apic_and_irq_gates(void); + +#ifdef CONFIG_X86_64 +extern void idt_setup_early_pf(void); +extern void idt_setup_ist_traps(void); +extern void idt_setup_debugidt_traps(void); +#else +static inline void idt_setup_early_pf(void) { } +static inline void idt_setup_ist_traps(void) { } +static inline void idt_setup_debugidt_traps(void) { } +#endif + +extern void idt_invalidate(void *addr); + #endif /* _ASM_X86_DESC_H */ diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 49265345d4d2..346d252029b7 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -11,34 +11,30 @@ #include <linux/types.h> -/* - * FIXME: Accessing the desc_struct through its fields is more elegant, - * and should be the one valid thing to do. However, a lot of open code - * still touches the a and b accessors, and doing this allow us to do it - * incrementally. We keep the signature as a struct, rather than a union, - * so we can get rid of it transparently in the future -- glommer - */ /* 8 byte segment descriptor */ struct desc_struct { - union { - struct { - unsigned int a; - unsigned int b; - }; - struct { - u16 limit0; - u16 base0; - unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1; - unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8; - }; - }; + u16 limit0; + u16 base0; + u16 base1: 8, type: 4, s: 1, dpl: 2, p: 1; + u16 limit1: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8; } __attribute__((packed)); -#define GDT_ENTRY_INIT(flags, base, limit) { { { \ - .a = ((limit) & 0xffff) | (((base) & 0xffff) << 16), \ - .b = (((base) & 0xff0000) >> 16) | (((flags) & 0xf0ff) << 8) | \ - ((limit) & 0xf0000) | ((base) & 0xff000000), \ - } } } +#define GDT_ENTRY_INIT(flags, base, limit) \ + { \ + .limit0 = (u16) (limit), \ + .limit1 = ((limit) >> 16) & 0x0F, \ + .base0 = (u16) (base), \ + .base1 = ((base) >> 16) & 0xFF, \ + .base2 = ((base) >> 24) & 0xFF, \ + .type = (flags & 0x0f), \ + .s = (flags >> 4) & 0x01, \ + .dpl = (flags >> 5) & 0x03, \ + .p = (flags >> 7) & 0x01, \ + .avl = (flags >> 12) & 0x01, \ + .l = (flags >> 13) & 0x01, \ + .d = (flags >> 14) & 0x01, \ + .g = (flags >> 15) & 0x01, \ + } enum { GATE_INTERRUPT = 0xE, @@ -47,49 +43,63 @@ enum { GATE_TASK = 0x5, }; -/* 16byte gate */ -struct gate_struct64 { - u16 offset_low; - u16 segment; - unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1; - u16 offset_middle; - u32 offset_high; - u32 zero1; -} __attribute__((packed)); - -#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF) -#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF) -#define PTR_HIGH(x) ((unsigned long long)(x) >> 32) - enum { DESC_TSS = 0x9, DESC_LDT = 0x2, DESCTYPE_S = 0x10, /* !system */ }; -/* LDT or TSS descriptor in the GDT. 16 bytes. */ -struct ldttss_desc64 { - u16 limit0; - u16 base0; - unsigned base1 : 8, type : 5, dpl : 2, p : 1; - unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; - u32 base3; - u32 zero1; +/* LDT or TSS descriptor in the GDT. */ +struct ldttss_desc { + u16 limit0; + u16 base0; + + u16 base1 : 8, type : 5, dpl : 2, p : 1; + u16 limit1 : 4, zero0 : 3, g : 1, base2 : 8; +#ifdef CONFIG_X86_64 + u32 base3; + u32 zero1; +#endif } __attribute__((packed)); +typedef struct ldttss_desc ldt_desc; +typedef struct ldttss_desc tss_desc; + +struct idt_bits { + u16 ist : 3, + zero : 5, + type : 5, + dpl : 2, + p : 1; +} __attribute__((packed)); + +struct gate_struct { + u16 offset_low; + u16 segment; + struct idt_bits bits; + u16 offset_middle; +#ifdef CONFIG_X86_64 + u32 offset_high; + u32 reserved; +#endif +} __attribute__((packed)); + +typedef struct gate_struct gate_desc; + +static inline unsigned long gate_offset(const gate_desc *g) +{ #ifdef CONFIG_X86_64 -typedef struct gate_struct64 gate_desc; -typedef struct ldttss_desc64 ldt_desc; -typedef struct ldttss_desc64 tss_desc; -#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32)) -#define gate_segment(g) ((g).segment) + return g->offset_low | ((unsigned long)g->offset_middle << 16) | + ((unsigned long) g->offset_high << 32); #else -typedef struct desc_struct gate_desc; -typedef struct desc_struct ldt_desc; -typedef struct desc_struct tss_desc; -#define gate_offset(g) (((g).b & 0xffff0000) | ((g).a & 0x0000ffff)) -#define gate_segment(g) ((g).a >> 16) + return g->offset_low | ((unsigned long)g->offset_middle << 16); #endif +} + +static inline unsigned long gate_segment(const gate_desc *g) +{ + return g->segment; +} struct desc_ptr { unsigned short size; diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 5dff775af7cd..c10c9128f54e 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -21,11 +21,13 @@ # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) # define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) # define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) +# define DISABLE_PCID 0 #else # define DISABLE_VME 0 # define DISABLE_K6_MTRR 0 # define DISABLE_CYRIX_ARR 0 # define DISABLE_CENTAUR_MCR 0 +# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) #endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS @@ -49,7 +51,7 @@ #define DISABLED_MASK1 0 #define DISABLED_MASK2 0 #define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) -#define DISABLED_MASK4 0 +#define DISABLED_MASK4 (DISABLE_PCID) #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 #define DISABLED_MASK7 0 diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 398c79889f5c..1387dafdba2d 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -12,6 +12,7 @@ #include <asm/io.h> #include <asm/swiotlb.h> #include <linux/dma-contiguous.h> +#include <linux/mem_encrypt.h> #ifdef CONFIG_ISA # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) @@ -57,12 +58,12 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { - return paddr; + return __sme_set(paddr); } static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) { - return daddr; + return __sme_clr(daddr); } #endif /* CONFIG_X86_DMA_REMAP */ diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h index 3c69fed215c5..a8e15b04565b 100644 --- a/arch/x86/include/asm/dmi.h +++ b/arch/x86/include/asm/dmi.h @@ -13,9 +13,9 @@ static __always_inline __init void *dmi_alloc(unsigned len) } /* Use early IO mappings for DMI because it's initialized early */ -#define dmi_early_remap early_ioremap -#define dmi_early_unmap early_iounmap -#define dmi_remap ioremap_cache -#define dmi_unmap iounmap +#define dmi_early_remap early_memremap +#define dmi_early_unmap early_memunmap +#define dmi_remap(_x, _l) memremap(_x, _l, MEMREMAP_WB) +#define dmi_unmap(_x) memunmap(_x) #endif /* _ASM_X86_DMI_H */ diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index a504adc661a4..cd266d830e49 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -39,6 +39,8 @@ extern void e820__setup_pci_gap(void); extern void e820__reallocate_tables(void); extern void e820__register_nosave_regions(unsigned long limit_pfn); +extern int e820__get_entry_type(u64 start, u64 end); + /* * Returns true iff the specified range [start,end) is completely contained inside * the ISA region. diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 9aeb91935ce0..04330c8d9af9 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -126,15 +126,15 @@ do { \ pr_reg[4] = regs->di; \ pr_reg[5] = regs->bp; \ pr_reg[6] = regs->ax; \ - pr_reg[7] = regs->ds & 0xffff; \ - pr_reg[8] = regs->es & 0xffff; \ - pr_reg[9] = regs->fs & 0xffff; \ + pr_reg[7] = regs->ds; \ + pr_reg[8] = regs->es; \ + pr_reg[9] = regs->fs; \ pr_reg[11] = regs->orig_ax; \ pr_reg[12] = regs->ip; \ - pr_reg[13] = regs->cs & 0xffff; \ + pr_reg[13] = regs->cs; \ pr_reg[14] = regs->flags; \ pr_reg[15] = regs->sp; \ - pr_reg[16] = regs->ss & 0xffff; \ + pr_reg[16] = regs->ss; \ } while (0); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ @@ -204,6 +204,7 @@ void set_personality_ia32(bool); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ + unsigned long base; \ unsigned v; \ (pr_reg)[0] = (regs)->r15; \ (pr_reg)[1] = (regs)->r14; \ @@ -226,8 +227,8 @@ do { \ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ - (pr_reg)[21] = current->thread.fsbase; \ - (pr_reg)[22] = current->thread.gsbase; \ + rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \ + rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ @@ -304,8 +305,8 @@ static inline int mmap_is_ia32(void) test_thread_flag(TIF_ADDR32)); } -extern unsigned long tasksize_32bit(void); -extern unsigned long tasksize_64bit(void); +extern unsigned long task_size_32bit(void); +extern unsigned long task_size_64bit(int full_addr_space); extern unsigned long get_mmap_base(int is_legacy); #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 07b06955a05d..aa15d1f7e530 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -13,20 +13,14 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) -BUILD_INTERRUPT3(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR, - smp_irq_move_cleanup_interrupt) -BUILD_INTERRUPT3(reboot_interrupt, REBOOT_VECTOR, smp_reboot_interrupt) +BUILD_INTERRUPT(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR) +BUILD_INTERRUPT(reboot_interrupt, REBOOT_VECTOR) #endif -BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) - #ifdef CONFIG_HAVE_KVM -BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR, - smp_kvm_posted_intr_ipi) -BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR, - smp_kvm_posted_intr_wakeup_ipi) -BUILD_INTERRUPT3(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR, - smp_kvm_posted_intr_nested_ipi) +BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR) +BUILD_INTERRUPT(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR) +BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR) #endif /* @@ -41,6 +35,7 @@ BUILD_INTERRUPT3(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR, BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) +BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) #ifdef CONFIG_IRQ_WORK BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR) diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index b65155cc3760..dcd9fb55e679 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -157,6 +157,26 @@ static inline void __set_fixmap(enum fixed_addresses idx, } #endif +/* + * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not + * supported for MMIO addresses, so make sure that the memory encryption + * mask is not part of the page attributes. + */ +#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE + +/* + * Early memremap routines used for in-place encryption. The mappings created + * by these routines are intended to be used as temporary mappings. + */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size); + #include <asm-generic/fixmap.h> #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags) diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index b4c1f5453436..f4dc9b63bdda 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -41,20 +41,11 @@ "+m" (*uaddr), "=&r" (tem) \ : "r" (oparg), "i" (-EFAULT), "1" (0)) -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tem; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -80,30 +71,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index d6dbafbd4207..6dfe366a8804 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -46,26 +46,6 @@ extern asmlinkage void deferred_error_interrupt(void); extern asmlinkage void call_function_interrupt(void); extern asmlinkage void call_function_single_interrupt(void); -#ifdef CONFIG_TRACING -/* Interrupt handlers registered during init_IRQ */ -extern void trace_apic_timer_interrupt(void); -extern void trace_x86_platform_ipi(void); -extern void trace_error_interrupt(void); -extern void trace_irq_work_interrupt(void); -extern void trace_spurious_interrupt(void); -extern void trace_thermal_interrupt(void); -extern void trace_reschedule_interrupt(void); -extern void trace_threshold_interrupt(void); -extern void trace_deferred_error_interrupt(void); -extern void trace_call_function_interrupt(void); -extern void trace_call_function_single_interrupt(void); -#define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt -#define trace_reboot_interrupt reboot_interrupt -#define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi -#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi -#define trace_kvm_posted_intr_nested_ipi kvm_posted_intr_nested_ipi -#endif /* CONFIG_TRACING */ - #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; struct pci_dev; diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 474eb8c66fee..05c4aa00cc86 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -7,6 +7,7 @@ struct x86_mapping_info { unsigned long page_flag; /* page flag for PMD or PUD entry */ unsigned long offset; /* ident mapping offset */ bool direct_gbpages; /* PUD level 1GB page support */ + unsigned long kernpg_flag; /* kernel pagetable flag override */ }; int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h deleted file mode 100644 index 597dc4995678..000000000000 --- a/arch/x86/include/asm/intel_rdt.h +++ /dev/null @@ -1,286 +0,0 @@ -#ifndef _ASM_X86_INTEL_RDT_H -#define _ASM_X86_INTEL_RDT_H - -#ifdef CONFIG_INTEL_RDT_A - -#include <linux/sched.h> -#include <linux/kernfs.h> -#include <linux/jump_label.h> - -#include <asm/intel_rdt_common.h> - -#define IA32_L3_QOS_CFG 0xc81 -#define IA32_L3_CBM_BASE 0xc90 -#define IA32_L2_CBM_BASE 0xd10 -#define IA32_MBA_THRTL_BASE 0xd50 - -#define L3_QOS_CDP_ENABLE 0x01ULL - -/** - * struct rdtgroup - store rdtgroup's data in resctrl file system. - * @kn: kernfs node - * @rdtgroup_list: linked list for all rdtgroups - * @closid: closid for this rdtgroup - * @cpu_mask: CPUs assigned to this rdtgroup - * @flags: status bits - * @waitcount: how many cpus expect to find this - * group when they acquire rdtgroup_mutex - */ -struct rdtgroup { - struct kernfs_node *kn; - struct list_head rdtgroup_list; - int closid; - struct cpumask cpu_mask; - int flags; - atomic_t waitcount; -}; - -/* rdtgroup.flags */ -#define RDT_DELETED 1 - -/* rftype.flags */ -#define RFTYPE_FLAGS_CPUS_LIST 1 - -/* List of all resource groups */ -extern struct list_head rdt_all_groups; - -extern int max_name_width, max_data_width; - -int __init rdtgroup_init(void); - -/** - * struct rftype - describe each file in the resctrl file system - * @name: File name - * @mode: Access mode - * @kf_ops: File operations - * @flags: File specific RFTYPE_FLAGS_* flags - * @seq_show: Show content of the file - * @write: Write to the file - */ -struct rftype { - char *name; - umode_t mode; - struct kernfs_ops *kf_ops; - unsigned long flags; - - int (*seq_show)(struct kernfs_open_file *of, - struct seq_file *sf, void *v); - /* - * write() is the generic write callback which maps directly to - * kernfs write operation and overrides all other operations. - * Maximum write size is determined by ->max_write_len. - */ - ssize_t (*write)(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -}; - -/** - * struct rdt_domain - group of cpus sharing an RDT resource - * @list: all instances of this resource - * @id: unique id for this instance - * @cpu_mask: which cpus share this resource - * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) - * @new_ctrl: new ctrl value to be loaded - * @have_new_ctrl: did user provide new_ctrl for this domain - */ -struct rdt_domain { - struct list_head list; - int id; - struct cpumask cpu_mask; - u32 *ctrl_val; - u32 new_ctrl; - bool have_new_ctrl; -}; - -/** - * struct msr_param - set a range of MSRs from a domain - * @res: The resource to use - * @low: Beginning index from base MSR - * @high: End index - */ -struct msr_param { - struct rdt_resource *res; - int low; - int high; -}; - -/** - * struct rdt_cache - Cache allocation related data - * @cbm_len: Length of the cache bit mask - * @min_cbm_bits: Minimum number of consecutive bits to be set - * @cbm_idx_mult: Multiplier of CBM index - * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: - * closid * cbm_idx_multi + cbm_idx_offset - * in a cache bit mask - */ -struct rdt_cache { - unsigned int cbm_len; - unsigned int min_cbm_bits; - unsigned int cbm_idx_mult; - unsigned int cbm_idx_offset; -}; - -/** - * struct rdt_membw - Memory bandwidth allocation related data - * @max_delay: Max throttle delay. Delay is the hardware - * representation for memory bandwidth. - * @min_bw: Minimum memory bandwidth percentage user can request - * @bw_gran: Granularity at which the memory bandwidth is allocated - * @delay_linear: True if memory B/W delay is in linear scale - * @mb_map: Mapping of memory B/W percentage to memory B/W delay - */ -struct rdt_membw { - u32 max_delay; - u32 min_bw; - u32 bw_gran; - u32 delay_linear; - u32 *mb_map; -}; - -/** - * struct rdt_resource - attributes of an RDT resource - * @enabled: Is this feature enabled on this machine - * @capable: Is this feature available on this machine - * @name: Name to use in "schemata" file - * @num_closid: Number of CLOSIDs available - * @cache_level: Which cache level defines scope of this resource - * @default_ctrl: Specifies default cache cbm or memory B/W percent. - * @msr_base: Base MSR address for CBMs - * @msr_update: Function pointer to update QOS MSRs - * @data_width: Character width of data when displaying - * @domains: All domains for this resource - * @cache: Cache allocation related data - * @info_files: resctrl info files for the resource - * @nr_info_files: Number of info files - * @format_str: Per resource format string to show domain value - * @parse_ctrlval: Per resource function pointer to parse control values - */ -struct rdt_resource { - bool enabled; - bool capable; - char *name; - int num_closid; - int cache_level; - u32 default_ctrl; - unsigned int msr_base; - void (*msr_update) (struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); - int data_width; - struct list_head domains; - struct rdt_cache cache; - struct rdt_membw membw; - struct rftype *info_files; - int nr_info_files; - const char *format_str; - int (*parse_ctrlval) (char *buf, struct rdt_resource *r, - struct rdt_domain *d); -}; - -void rdt_get_cache_infofile(struct rdt_resource *r); -void rdt_get_mba_infofile(struct rdt_resource *r); -int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); - -extern struct mutex rdtgroup_mutex; - -extern struct rdt_resource rdt_resources_all[]; -extern struct rdtgroup rdtgroup_default; -DECLARE_STATIC_KEY_FALSE(rdt_enable_key); - -int __init rdtgroup_init(void); - -enum { - RDT_RESOURCE_L3, - RDT_RESOURCE_L3DATA, - RDT_RESOURCE_L3CODE, - RDT_RESOURCE_L2, - RDT_RESOURCE_MBA, - - /* Must be the last */ - RDT_NUM_RESOURCES, -}; - -#define for_each_capable_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ - if (r->capable) - -#define for_each_enabled_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ - if (r->enabled) - -/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ -union cpuid_0x10_1_eax { - struct { - unsigned int cbm_len:5; - } split; - unsigned int full; -}; - -/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ -union cpuid_0x10_3_eax { - struct { - unsigned int max_delay:12; - } split; - unsigned int full; -}; - -/* CPUID.(EAX=10H, ECX=ResID).EDX */ -union cpuid_0x10_x_edx { - struct { - unsigned int cos_max:16; - } split; - unsigned int full; -}; - -DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); - -void rdt_ctrl_update(void *arg); -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); -void rdtgroup_kn_unlock(struct kernfs_node *kn); -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v); - -/* - * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR - * - * Following considerations are made so that this has minimal impact - * on scheduler hot path: - * - This will stay as no-op unless we are running on an Intel SKU - * which supports resource control and we enable by mounting the - * resctrl file system. - * - Caches the per cpu CLOSid values and does the MSR write only - * when a task with a different CLOSid is scheduled in. - * - * Must be called with preemption disabled. - */ -static inline void intel_rdt_sched_in(void) -{ - if (static_branch_likely(&rdt_enable_key)) { - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - int closid; - - /* - * If this task has a closid assigned, use it. - * Else use the closid assigned to this cpu. - */ - closid = current->closid; - if (closid == 0) - closid = this_cpu_read(cpu_closid); - - if (closid != state->closid) { - state->closid = closid; - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid); - } - } -} - -#else - -static inline void intel_rdt_sched_in(void) {} - -#endif /* CONFIG_INTEL_RDT_A */ -#endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h deleted file mode 100644 index b31081b89407..000000000000 --- a/arch/x86/include/asm/intel_rdt_common.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _ASM_X86_INTEL_RDT_COMMON_H -#define _ASM_X86_INTEL_RDT_COMMON_H - -#define MSR_IA32_PQR_ASSOC 0x0c8f - -/** - * struct intel_pqr_state - State cache for the PQR MSR - * @rmid: The cached Resource Monitoring ID - * @closid: The cached Class Of Service ID - * @rmid_usecnt: The usage counter for rmid - * - * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the - * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always - * contains both parts, so we need to cache them. - * - * The cache also helps to avoid pointless updates if the value does - * not change. - */ -struct intel_pqr_state { - u32 rmid; - u32 closid; - int rmid_usecnt; -}; - -DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); - -#endif /* _ASM_X86_INTEL_RDT_COMMON_H */ diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h new file mode 100644 index 000000000000..b4bbf8b21512 --- /dev/null +++ b/arch/x86/include/asm/intel_rdt_sched.h @@ -0,0 +1,92 @@ +#ifndef _ASM_X86_INTEL_RDT_SCHED_H +#define _ASM_X86_INTEL_RDT_SCHED_H + +#ifdef CONFIG_INTEL_RDT + +#include <linux/sched.h> +#include <linux/jump_label.h> + +#define IA32_PQR_ASSOC 0x0c8f + +/** + * struct intel_pqr_state - State cache for the PQR MSR + * @cur_rmid: The cached Resource Monitoring ID + * @cur_closid: The cached Class Of Service ID + * @default_rmid: The user assigned Resource Monitoring ID + * @default_closid: The user assigned cached Class Of Service ID + * + * The upper 32 bits of IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to IA32_PQR_ASSOC always + * contains both parts, so we need to cache them. This also + * stores the user configured per cpu CLOSID and RMID. + * + * The cache also helps to avoid pointless updates if the value does + * not change. + */ +struct intel_pqr_state { + u32 cur_rmid; + u32 cur_closid; + u32 default_rmid; + u32 default_closid; +}; + +DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); + +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); + +/* + * __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR + * + * Following considerations are made so that this has minimal impact + * on scheduler hot path: + * - This will stay as no-op unless we are running on an Intel SKU + * which supports resource control or monitoring and we enable by + * mounting the resctrl file system. + * - Caches the per cpu CLOSid/RMID values and does the MSR write only + * when a task with a different CLOSid/RMID is scheduled in. + * - We allocate RMIDs/CLOSids globally in order to keep this as + * simple as possible. + * Must be called with preemption disabled. + */ +static void __intel_rdt_sched_in(void) +{ + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + u32 closid = state->default_closid; + u32 rmid = state->default_rmid; + + /* + * If this task has a closid/rmid assigned, use it. + * Else use the closid/rmid assigned to this cpu. + */ + if (static_branch_likely(&rdt_alloc_enable_key)) { + if (current->closid) + closid = current->closid; + } + + if (static_branch_likely(&rdt_mon_enable_key)) { + if (current->rmid) + rmid = current->rmid; + } + + if (closid != state->cur_closid || rmid != state->cur_rmid) { + state->cur_closid = closid; + state->cur_rmid = rmid; + wrmsr(IA32_PQR_ASSOC, rmid, closid); + } +} + +static inline void intel_rdt_sched_in(void) +{ + if (static_branch_likely(&rdt_enable_key)) + __intel_rdt_sched_in(); +} + +#else + +static inline void intel_rdt_sched_in(void) {} + +#endif /* CONFIG_INTEL_RDT */ + +#endif /* _ASM_X86_INTEL_RDT_SCHED_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 48febf07e828..c40a95c33bb8 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -69,6 +69,9 @@ build_mmio_write(__writeb, "b", unsigned char, "q", ) build_mmio_write(__writew, "w", unsigned short, "r", ) build_mmio_write(__writel, "l", unsigned int, "r", ) +#define readb readb +#define readw readw +#define readl readl #define readb_relaxed(a) __readb(a) #define readw_relaxed(a) __readw(a) #define readl_relaxed(a) __readl(a) @@ -76,6 +79,9 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #define __raw_readw __readw #define __raw_readl __readl +#define writeb writeb +#define writew writew +#define writel writel #define writeb_relaxed(v, a) __writeb(v, a) #define writew_relaxed(v, a) __writew(v, a) #define writel_relaxed(v, a) __writel(v, a) @@ -88,13 +94,15 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #ifdef CONFIG_X86_64 build_mmio_read(readq, "q", unsigned long, "=r", :"memory") +build_mmio_read(__readq, "q", unsigned long, "=r", ) build_mmio_write(writeq, "q", unsigned long, "r", :"memory") +build_mmio_write(__writeq, "q", unsigned long, "r", ) -#define readq_relaxed(a) readq(a) -#define writeq_relaxed(v, a) writeq(v, a) +#define readq_relaxed(a) __readq(a) +#define writeq_relaxed(v, a) __writeq(v, a) -#define __raw_readq(a) readq(a) -#define __raw_writeq(val, addr) writeq(val, addr) +#define __raw_readq __readq +#define __raw_writeq __writeq /* Let people know that we have them */ #define readq readq @@ -119,6 +127,7 @@ static inline phys_addr_t virt_to_phys(volatile void *address) { return __pa(address); } +#define virt_to_phys virt_to_phys /** * phys_to_virt - map physical address to virtual @@ -137,6 +146,7 @@ static inline void *phys_to_virt(phys_addr_t address) { return __va(address); } +#define phys_to_virt phys_to_virt /* * Change "struct page" to physical address. @@ -169,11 +179,14 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) * else, you probably want one of the following. */ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +#define ioremap_nocache ioremap_nocache extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); #define ioremap_uc ioremap_uc extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +#define ioremap_cache ioremap_cache extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); +#define ioremap_prot ioremap_prot /** * ioremap - map bus memory into CPU space @@ -193,8 +206,10 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) { return ioremap_nocache(offset, size); } +#define ioremap ioremap extern void iounmap(volatile void __iomem *addr); +#define iounmap iounmap extern void set_iounmap_nonlazy(void); @@ -203,53 +218,6 @@ extern void set_iounmap_nonlazy(void); #include <asm-generic/iomap.h> /* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - -/** - * memset_io Set a range of I/O memory to a constant value - * @addr: The beginning of the I/O-memory range to set - * @val: The value to set the memory to - * @count: The number of bytes to set - * - * Set a range of I/O memory to a given value. - */ -static inline void -memset_io(volatile void __iomem *addr, unsigned char val, size_t count) -{ - memset((void __force *)addr, val, count); -} - -/** - * memcpy_fromio Copy a block of data from I/O memory - * @dst: The (RAM) destination for the copy - * @src: The (I/O memory) source for the data - * @count: The number of bytes to copy - * - * Copy a block of data from I/O memory. - */ -static inline void -memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) -{ - memcpy(dst, (const void __force *)src, count); -} - -/** - * memcpy_toio Copy a block of data into I/O memory - * @dst: The (I/O memory) destination for the copy - * @src: The (RAM) source for the data - * @count: The number of bytes to copy - * - * Copy a block of data to I/O memory. - */ -static inline void -memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) -{ - memcpy((void __force *)dst, src, count); -} - -/* * ISA space is 'always mapped' on a typical x86 system, no need to * explicitly ioremap() it. The fact that the ISA IO space is mapped * to PAGE_OFFSET is pure coincidence - it does not mean ISA values @@ -341,13 +309,38 @@ BUILDIO(b, b, char) BUILDIO(w, w, short) BUILDIO(l, , int) +#define inb inb +#define inw inw +#define inl inl +#define inb_p inb_p +#define inw_p inw_p +#define inl_p inl_p +#define insb insb +#define insw insw +#define insl insl + +#define outb outb +#define outw outw +#define outl outl +#define outb_p outb_p +#define outw_p outw_p +#define outl_p outl_p +#define outsb outsb +#define outsw outsw +#define outsl outsl + extern void *xlate_dev_mem_ptr(phys_addr_t phys); extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); +#define xlate_dev_mem_ptr xlate_dev_mem_ptr +#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr + extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, enum page_cache_mode pcm); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); +#define ioremap_wc ioremap_wc extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); +#define ioremap_wt ioremap_wt extern bool is_early_ioremap_ptep(pte_t *ptep); @@ -365,6 +358,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, #define IO_SPACE_LIMIT 0xffff +#include <asm-generic/io.h> +#undef PCI_IOBASE + #ifdef CONFIG_MTRR extern int __must_check arch_phys_wc_index(int handle); #define arch_phys_wc_index arch_phys_wc_index @@ -381,4 +377,12 @@ extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc #endif +extern bool arch_memremap_can_ram_remap(resource_size_t offset, + unsigned long size, + unsigned long flags); +#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap + +extern bool phys_mem_access_encrypted(unsigned long phys_addr, + unsigned long size); + #endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 668cca540025..9958ceea2fa3 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -42,10 +42,6 @@ extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs); extern __visible unsigned int do_IRQ(struct pt_regs *regs); -/* Interrupt vector management */ -extern DECLARE_BITMAP(used_vectors, NR_VECTORS); -extern int vector_used_by_percpu_irq(unsigned int vector); - extern void init_ISA_irqs(void); #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index f70604125286..ddbb8ea0f5a9 100644 --- a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -3,9 +3,17 @@ #include <asm/cpufeature.h> +#ifdef CONFIG_X86_LOCAL_APIC static inline bool arch_irq_work_has_interrupt(void) { return boot_cpu_has(X86_FEATURE_APIC); } +extern void arch_irq_work_raise(void); +#else +static inline bool arch_irq_work_has_interrupt(void) +{ + return false; +} +#endif #endif /* _ASM_IRQ_WORK_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 70ef205489f0..942c1f444da8 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -147,7 +147,8 @@ unsigned long relocate_kernel(unsigned long indirection_page, unsigned long page_list, unsigned long start_address, - unsigned int preserve_context); + unsigned int preserve_context, + unsigned int sme_active); #endif #define ARCH_HAS_KIMAGE_ARCH @@ -207,6 +208,14 @@ struct kexec_entry64_regs { uint64_t r15; uint64_t rip; }; + +extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, + gfp_t gfp); +#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages + +extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages); +#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages + #endif typedef void crash_vmclear_fn(void); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 92c9032502d8..369e41c23f07 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1079,7 +1079,7 @@ void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask); + u64 acc_track_mask, u64 me_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h deleted file mode 100644 index 73d0c9b92087..000000000000 --- a/arch/x86/include/asm/lguest.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef _ASM_X86_LGUEST_H -#define _ASM_X86_LGUEST_H - -#define GDT_ENTRY_LGUEST_CS 10 -#define GDT_ENTRY_LGUEST_DS 11 -#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) -#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) - -#ifndef __ASSEMBLY__ -#include <asm/desc.h> - -#define GUEST_PL 1 - -/* Page for Switcher text itself, then two pages per cpu */ -#define SWITCHER_TEXT_PAGES (1) -#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids) -#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES) - -/* Where we map the Switcher, in both Host and Guest. */ -extern unsigned long switcher_addr; - -/* Found in switcher.S */ -extern unsigned long default_idt_entries[]; - -/* Declarations for definitions in arch/x86/lguest/head_32.S */ -extern char lguest_noirq_iret[]; -extern const char lgstart_cli[], lgend_cli[]; -extern const char lgstart_pushf[], lgend_pushf[]; - -extern void lguest_iret(void); -extern void lguest_init(void); - -struct lguest_regs { - /* Manually saved part. */ - unsigned long eax, ebx, ecx, edx; - unsigned long esi, edi, ebp; - unsigned long gs; - unsigned long fs, ds, es; - unsigned long trapnum, errcode; - /* Trap pushed part */ - unsigned long eip; - unsigned long cs; - unsigned long eflags; - unsigned long esp; - unsigned long ss; -}; - -/* This is a guest-specific page (mapped ro) into the guest. */ -struct lguest_ro_state { - /* Host information we need to restore when we switch back. */ - u32 host_cr3; - struct desc_ptr host_idt_desc; - struct desc_ptr host_gdt_desc; - u32 host_sp; - - /* Fields which are used when guest is running. */ - struct desc_ptr guest_idt_desc; - struct desc_ptr guest_gdt_desc; - struct x86_hw_tss guest_tss; - struct desc_struct guest_idt[IDT_ENTRIES]; - struct desc_struct guest_gdt[GDT_ENTRIES]; -}; - -struct lg_cpu_arch { - /* The GDT entries copied into lguest_ro_state when running. */ - struct desc_struct gdt[GDT_ENTRIES]; - - /* The IDT entries: some copied into lguest_ro_state when running. */ - struct desc_struct idt[IDT_ENTRIES]; - - /* The address of the last guest-visible pagefault (ie. cr2). */ - unsigned long last_pagefault; -}; - -static inline void lguest_set_ts(void) -{ - u32 cr0; - - cr0 = read_cr0(); - if (!(cr0 & 8)) - write_cr0(cr0 | 8); -} - -/* Full 4G segment descriptors, suitable for CS and DS. */ -#define FULL_EXEC_SEGMENT \ - ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)) -#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff)) - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_X86_LGUEST_H */ diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h deleted file mode 100644 index 6c119cfae218..000000000000 --- a/arch/x86/include/asm/lguest_hcall.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Architecture specific portion of the lguest hypercalls */ -#ifndef _ASM_X86_LGUEST_HCALL_H -#define _ASM_X86_LGUEST_HCALL_H - -#define LHCALL_FLUSH_ASYNC 0 -#define LHCALL_LGUEST_INIT 1 -#define LHCALL_SHUTDOWN 2 -#define LHCALL_NEW_PGTABLE 4 -#define LHCALL_FLUSH_TLB 5 -#define LHCALL_LOAD_IDT_ENTRY 6 -#define LHCALL_SET_STACK 7 -#define LHCALL_SET_CLOCKEVENT 9 -#define LHCALL_HALT 10 -#define LHCALL_SET_PMD 13 -#define LHCALL_SET_PTE 14 -#define LHCALL_SET_PGD 15 -#define LHCALL_LOAD_TLS 16 -#define LHCALL_LOAD_GDT_ENTRY 18 -#define LHCALL_SEND_INTERRUPTS 19 - -#define LGUEST_TRAP_ENTRY 0x1F - -/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */ -#define LGUEST_SHUTDOWN_POWEROFF 1 -#define LGUEST_SHUTDOWN_RESTART 2 - -#ifndef __ASSEMBLY__ -#include <asm/hw_irq.h> - -/*G:030 - * But first, how does our Guest contact the Host to ask for privileged - * operations? There are two ways: the direct way is to make a "hypercall", - * to make requests of the Host Itself. - * - * Our hypercall mechanism uses the highest unused trap code (traps 32 and - * above are used by real hardware interrupts). Seventeen hypercalls are - * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. - * If a return value makes sense, it's returned in %eax. - * - * Grossly invalid calls result in Sudden Death at the hands of the vengeful - * Host, rather than returning failure. This reflects Winston Churchill's - * definition of a gentleman: "someone who is only rude intentionally". - */ -static inline unsigned long -hcall(unsigned long call, - unsigned long arg1, unsigned long arg2, unsigned long arg3, - unsigned long arg4) -{ - /* "int" is the Intel instruction to trigger a trap. */ - asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) - /* The call in %eax (aka "a") might be overwritten */ - : "=a"(call) - /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */ - : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4) - /* "memory" means this might write somewhere in memory. - * This isn't true for all calls, but it's safe to tell - * gcc that it might happen so it doesn't get clever. */ - : "memory"); - return call; -} -/*:*/ - -/* Can't use our min() macro here: needs to be a constant */ -#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) - -#define LHCALL_RING_SIZE 64 -struct hcall_args { - /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ - unsigned long arg0, arg1, arg2, arg3, arg4; -}; - -#endif /* !__ASSEMBLY__ */ -#endif /* _ASM_X86_LGUEST_HCALL_H */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h new file mode 100644 index 000000000000..8e618fcf1f7c --- /dev/null +++ b/arch/x86/include/asm/mem_encrypt.h @@ -0,0 +1,80 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __X86_MEM_ENCRYPT_H__ +#define __X86_MEM_ENCRYPT_H__ + +#ifndef __ASSEMBLY__ + +#include <linux/init.h> + +#include <asm/bootparam.h> + +#ifdef CONFIG_AMD_MEM_ENCRYPT + +extern unsigned long sme_me_mask; + +void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr, + unsigned long decrypted_kernel_vaddr, + unsigned long kernel_len, + unsigned long encryption_wa, + unsigned long encryption_pgd); + +void __init sme_early_encrypt(resource_size_t paddr, + unsigned long size); +void __init sme_early_decrypt(resource_size_t paddr, + unsigned long size); + +void __init sme_map_bootdata(char *real_mode_data); +void __init sme_unmap_bootdata(char *real_mode_data); + +void __init sme_early_init(void); + +void __init sme_encrypt_kernel(void); +void __init sme_enable(struct boot_params *bp); + +/* Architecture __weak replacement functions */ +void __init mem_encrypt_init(void); + +void swiotlb_set_mem_attributes(void *vaddr, unsigned long size); + +#else /* !CONFIG_AMD_MEM_ENCRYPT */ + +#define sme_me_mask 0UL + +static inline void __init sme_early_encrypt(resource_size_t paddr, + unsigned long size) { } +static inline void __init sme_early_decrypt(resource_size_t paddr, + unsigned long size) { } + +static inline void __init sme_map_bootdata(char *real_mode_data) { } +static inline void __init sme_unmap_bootdata(char *real_mode_data) { } + +static inline void __init sme_early_init(void) { } + +static inline void __init sme_encrypt_kernel(void) { } +static inline void __init sme_enable(struct boot_params *bp) { } + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + +/* + * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when + * writing to or comparing values from the cr3 register. Having the + * encryption mask set in cr3 enables the PGD entry to be encrypted and + * avoid special case handling of PGD allocations. + */ +#define __sme_pa(x) (__pa(x) | sme_me_mask) +#define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask) + +#endif /* __ASSEMBLY__ */ + +#endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 79b647a7ebd0..bb8c597c2248 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -3,12 +3,28 @@ #include <linux/spinlock.h> #include <linux/mutex.h> +#include <linux/atomic.h> /* - * The x86 doesn't have a mmu context, but - * we put the segment information here. + * x86 has arch-specific MMU state beyond what lives in mm_struct. */ typedef struct { + /* + * ctx_id uniquely identifies this mm_struct. A ctx_id will never + * be reused, and zero is not a valid ctx_id. + */ + u64 ctx_id; + + /* + * Any code that needs to do any sort of TLB flushing for this + * mm will first make its changes to the page tables, then + * increment tlb_gen, then flush. This lets the low-level + * flushing code keep track of what needs flushing. + * + * This is not used on Xen PV. + */ + atomic64_t tlb_gen; + #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; #endif @@ -37,6 +53,11 @@ typedef struct { #endif } mm_context_t; +#define INIT_MM_CONTEXT(mm) \ + .context = { \ + .ctx_id = 1, \ + } + void leave_mm(int cpu); #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 7a234be7e298..7ae318c340d9 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -12,6 +12,9 @@ #include <asm/tlbflush.h> #include <asm/paravirt.h> #include <asm/mpx.h> + +extern atomic64_t last_mm_ctx_id; + #ifndef CONFIG_PARAVIRT static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) @@ -125,13 +128,18 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); + int cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, mm_cpumask(mm))) + cpumask_clear_cpu(cpu, mm_cpumask(mm)); } static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); + atomic64_set(&mm->context.tlb_gen, 0); + #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { /* pkey 0 is the default and always allocated */ @@ -290,6 +298,9 @@ static inline unsigned long __get_current_cr3_fast(void) { unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); + if (static_cpu_has(X86_FEATURE_PCID)) + cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid); + /* For now, be very restrictive about when this can be called. */ VM_WARN_ON(in_nmi() || preemptible()); diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index e3b7819caeef..9eb7c718aaf8 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -2,6 +2,15 @@ #define _ASM_X86_MODULE_H #include <asm-generic/module.h> +#include <asm/orc_types.h> + +struct mod_arch_specific { +#ifdef CONFIG_ORC_UNWINDER + unsigned int num_orcs; + int *orc_unwind_ip; + struct orc_entry *orc_unwind; +#endif +}; #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index a0d662be4c5b..7d7404756bb4 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -73,6 +73,9 @@ static inline void mpx_mm_init(struct mm_struct *mm) } void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start, unsigned long end); + +unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, + unsigned long flags); #else static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) { @@ -94,6 +97,12 @@ static inline void mpx_notify_unmap(struct mm_struct *mm, unsigned long start, unsigned long end) { } + +static inline unsigned long mpx_unmapped_area_check(unsigned long addr, + unsigned long len, unsigned long flags) +{ + return addr; +} #endif /* CONFIG_X86_INTEL_MPX */ #endif /* _ASM_X86_MPX_H */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 2b58c8c1eeaa..58b9291b46d8 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -28,6 +28,8 @@ struct ms_hyperv_info { u32 features; u32 misc_features; u32 hints; + u32 max_vp_index; + u32 max_lp_index; }; extern struct ms_hyperv_info ms_hyperv; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 5573c75f8e4c..17f5c12e1afd 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -356,6 +356,8 @@ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_K8_SYSCFG 0xc0010010 +#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT) #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 diff --git a/arch/x86/include/asm/orc_lookup.h b/arch/x86/include/asm/orc_lookup.h new file mode 100644 index 000000000000..91c8d868424d --- /dev/null +++ b/arch/x86/include/asm/orc_lookup.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ +#ifndef _ORC_LOOKUP_H +#define _ORC_LOOKUP_H + +/* + * This is a lookup table for speeding up access to the .orc_unwind table. + * Given an input address offset, the corresponding lookup table entry + * specifies a subset of the .orc_unwind table to search. + * + * Each block represents the end of the previous range and the start of the + * next range. An extra block is added to give the last range an end. + * + * The block size should be a power of 2 to avoid a costly 'div' instruction. + * + * A block size of 256 was chosen because it roughly doubles unwinder + * performance while only adding ~5% to the ORC data footprint. + */ +#define LOOKUP_BLOCK_ORDER 8 +#define LOOKUP_BLOCK_SIZE (1 << LOOKUP_BLOCK_ORDER) + +#ifndef LINKER_SCRIPT + +extern unsigned int orc_lookup[]; +extern unsigned int orc_lookup_end[]; + +#define LOOKUP_START_IP (unsigned long)_stext +#define LOOKUP_STOP_IP (unsigned long)_etext + +#endif /* LINKER_SCRIPT */ + +#endif /* _ORC_LOOKUP_H */ diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h new file mode 100644 index 000000000000..9c9dc579bd7d --- /dev/null +++ b/arch/x86/include/asm/orc_types.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _ORC_TYPES_H +#define _ORC_TYPES_H + +#include <linux/types.h> +#include <linux/compiler.h> + +/* + * The ORC_REG_* registers are base registers which are used to find other + * registers on the stack. + * + * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the + * address of the previous frame: the caller's SP before it called the current + * function. + * + * ORC_REG_UNDEFINED means the corresponding register's value didn't change in + * the current frame. + * + * The most commonly used base registers are SP and BP -- which the previous SP + * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is + * usually based on. + * + * The rest of the base registers are needed for special cases like entry code + * and GCC realigned stacks. + */ +#define ORC_REG_UNDEFINED 0 +#define ORC_REG_PREV_SP 1 +#define ORC_REG_DX 2 +#define ORC_REG_DI 3 +#define ORC_REG_BP 4 +#define ORC_REG_SP 5 +#define ORC_REG_R10 6 +#define ORC_REG_R13 7 +#define ORC_REG_BP_INDIRECT 8 +#define ORC_REG_SP_INDIRECT 9 +#define ORC_REG_MAX 15 + +/* + * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the + * caller's SP right before it made the call). Used for all callable + * functions, i.e. all C code and all callable asm functions. + * + * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points + * to a fully populated pt_regs from a syscall, interrupt, or exception. + * + * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset + * points to the iret return frame. + * + * The UNWIND_HINT macros are used only for the unwind_hint struct. They + * aren't used in struct orc_entry due to size and complexity constraints. + * Objtool converts them to real types when it converts the hints to orc + * entries. + */ +#define ORC_TYPE_CALL 0 +#define ORC_TYPE_REGS 1 +#define ORC_TYPE_REGS_IRET 2 +#define UNWIND_HINT_TYPE_SAVE 3 +#define UNWIND_HINT_TYPE_RESTORE 4 + +#ifndef __ASSEMBLY__ +/* + * This struct is more or less a vastly simplified version of the DWARF Call + * Frame Information standard. It contains only the necessary parts of DWARF + * CFI, simplified for ease of access by the in-kernel unwinder. It tells the + * unwinder how to find the previous SP and BP (and sometimes entry regs) on + * the stack for a given code address. Each instance of the struct corresponds + * to one or more code locations. + */ +struct orc_entry { + s16 sp_offset; + s16 bp_offset; + unsigned sp_reg:4; + unsigned bp_reg:4; + unsigned type:2; +} __packed; + +/* + * This struct is used by asm and inline asm code to manually annotate the + * location of registers on the stack for the ORC unwinder. + * + * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*. + */ +struct unwind_hint { + u32 ip; + s16 sp_offset; + u8 sp_reg; + u8 type; +}; +#endif /* __ASSEMBLY__ */ + +#endif /* _ORC_TYPES_H */ diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index b4a0d43248cf..b50df06ad251 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -51,6 +51,10 @@ static inline void clear_page(void *page) void copy_page(void *to, void *from); +#ifdef CONFIG_X86_MCE +#define arch_unmap_kpfn arch_unmap_kpfn +#endif + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_VSYSCALL_EMULATION diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 7bd0099384ca..b98ed9d14630 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -3,6 +3,7 @@ #include <linux/const.h> #include <linux/types.h> +#include <linux/mem_encrypt.h> /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 @@ -15,7 +16,7 @@ #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) -#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) +#define __PHYSICAL_MASK ((phys_addr_t)(__sme_clr((1ULL << __PHYSICAL_MASK_SHIFT) - 1))) #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) /* Cast *PAGE_MASK to a signed type so that it is sign-extended if diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 9ccac1926587..c25dd22f7c70 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -960,11 +960,6 @@ extern void default_banner(void); #define GET_CR2_INTO_RAX \ call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2) -#define PARAVIRT_ADJUST_EXCEPTION_FRAME \ - PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \ - CLBR_NONE, \ - call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame)) - #define USERGS_SYSRET64 \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ CLBR_NONE, \ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 9ffc36bfe4cd..6b64fc6367f2 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -196,9 +196,6 @@ struct pv_irq_ops { void (*safe_halt)(void); void (*halt)(void); -#ifdef CONFIG_X86_64 - void (*adjust_exception_frame)(void); -#endif } __no_randomize_layout; struct pv_mmu_ops { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 77037b6f1caa..bbeae4a2bd01 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_PGTABLE_H #define _ASM_X86_PGTABLE_H +#include <linux/mem_encrypt.h> #include <asm/page.h> #include <asm/pgtable_types.h> @@ -13,9 +14,18 @@ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ : (prot)) +/* + * Macros to add or remove encryption attribute + */ +#define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot))) +#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot))) + #ifndef __ASSEMBLY__ #include <asm/x86_init.h> +extern pgd_t early_top_pgt[PTRS_PER_PGD]; +int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); void ptdump_walk_pgd_level_checkwx(void); @@ -38,6 +48,8 @@ extern struct list_head pgd_list; extern struct mm_struct *pgd_page_get_mm(struct page *page); +extern pmdval_t early_pmd_flags; + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else /* !CONFIG_PARAVIRT */ @@ -195,6 +207,11 @@ static inline unsigned long p4d_pfn(p4d_t p4d) return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; } +static inline unsigned long pgd_pfn(pgd_t pgd) +{ + return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + static inline int p4d_large(p4d_t p4d) { /* No 512 GiB pages yet */ @@ -704,8 +721,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pmd_page(pmd) \ - pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT) +#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] @@ -773,8 +789,7 @@ static inline unsigned long pud_page_vaddr(pud_t pud) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pud_page(pud) \ - pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT) +#define pud_page(pud) pfn_to_page(pud_pfn(pud)) /* Find an entry in the second-level page table.. */ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) @@ -824,8 +839,7 @@ static inline unsigned long p4d_page_vaddr(p4d_t p4d) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define p4d_page(p4d) \ - pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT) +#define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d)) /* Find an entry in the third-level page table.. */ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) @@ -859,7 +873,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) +#define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) /* to find an entry in a page-table-directory. */ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index bf9638e1ee42..399261ce904c 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -2,6 +2,8 @@ #define _ASM_X86_PGTABLE_DEFS_H #include <linux/const.h> +#include <linux/mem_encrypt.h> + #include <asm/page_types.h> #define FIRST_USER_ADDRESS 0UL @@ -121,10 +123,10 @@ #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ - _PAGE_ACCESSED | _PAGE_DIRTY) -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ - _PAGE_DIRTY) +#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\ + _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \ + _PAGE_ACCESSED | _PAGE_DIRTY) /* * Set of bits not changed in pte_modify. The pte's @@ -159,6 +161,7 @@ enum page_cache_mode { #define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) +#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP)) #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ @@ -187,22 +190,42 @@ enum page_cache_mode { #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) +#define __PAGE_KERNEL_WP (__PAGE_KERNEL | _PAGE_CACHE_WP) #define __PAGE_KERNEL_IO (__PAGE_KERNEL) #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) -#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) -#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) -#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) -#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) -#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) -#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) +#ifndef __ASSEMBLY__ + +#define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ + _PAGE_DIRTY | _PAGE_ENC) + +#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC) +#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC) + +#define __PAGE_KERNEL_NOENC (__PAGE_KERNEL) +#define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP) + +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) +#define PAGE_KERNEL_NOENC __pgprot(__PAGE_KERNEL) +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_EXEC_NOENC __pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC) +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC) +#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC) + +#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) +#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) -#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) -#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +#endif /* __ASSEMBLY__ */ /* xwr */ #define __P000 PAGE_NONE @@ -287,6 +310,11 @@ static inline p4dval_t native_p4d_val(p4d_t p4d) #else #include <asm-generic/pgtable-nop4d.h> +static inline p4d_t native_make_p4d(pudval_t val) +{ + return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) }; +} + static inline p4dval_t native_p4d_val(p4d_t p4d) { return native_pgd_val(p4d.pgd); diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 79aa2f98398d..dc723b64acf0 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -2,6 +2,7 @@ #define _ASM_X86_PROCESSOR_FLAGS_H #include <uapi/asm/processor-flags.h> +#include <linux/mem_encrypt.h> #ifdef CONFIG_VM86 #define X86_VM_MASK X86_EFLAGS_VM @@ -32,16 +33,18 @@ * CR3_ADDR_MASK is the mask used by read_cr3_pa(). */ #ifdef CONFIG_X86_64 -/* Mask off the address space ID bits. */ -#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull -#define CR3_PCID_MASK 0xFFFull +/* Mask off the address space ID and SME encryption bits. */ +#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) +#define CR3_PCID_MASK 0xFFFull +#define CR3_NOFLUSH BIT_ULL(63) #else /* * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save * a tiny bit of code size by setting all the bits. */ -#define CR3_ADDR_MASK 0xFFFFFFFFull -#define CR3_PCID_MASK 0ull +#define CR3_ADDR_MASK 0xFFFFFFFFull +#define CR3_PCID_MASK 0ull +#define CR3_NOFLUSH 0 #endif #endif /* _ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 028245e1c42b..3fa26a61eabc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -22,6 +22,7 @@ struct vm86; #include <asm/nops.h> #include <asm/special_insns.h> #include <asm/fpu/types.h> +#include <asm/unwind_hints.h> #include <linux/personality.h> #include <linux/cache.h> @@ -29,6 +30,7 @@ struct vm86; #include <linux/math64.h> #include <linux/err.h> #include <linux/irqflags.h> +#include <linux/mem_encrypt.h> /* * We handle most unaligned accesses in hardware. On the other hand @@ -239,9 +241,14 @@ static inline unsigned long read_cr3_pa(void) return __read_cr3() & CR3_ADDR_MASK; } +static inline unsigned long native_read_cr3_pa(void) +{ + return __native_read_cr3() & CR3_ADDR_MASK; +} + static inline void load_cr3(pgd_t *pgdir) { - write_cr3(__pa(pgdir)); + write_cr3(__sme_pa(pgdir)); } #ifdef CONFIG_X86_32 @@ -661,7 +668,7 @@ static inline void sync_core(void) * In case NMI unmasking or performance ever becomes a problem, * the next best option appears to be MOV-to-CR2 and an * unconditional jump. That sequence also works on all CPUs, - * but it will fault at CPL3 (i.e. Xen PV and lguest). + * but it will fault at CPL3 (i.e. Xen PV). * * CPUID is the conventional way, but it's nasty: it doesn't * exist on some 486-like CPUs, and it usually exits to a @@ -684,6 +691,7 @@ static inline void sync_core(void) unsigned int tmp; asm volatile ( + UNWIND_HINT_SAVE "mov %%ss, %0\n\t" "pushq %q0\n\t" "pushq %%rsp\n\t" @@ -693,6 +701,7 @@ static inline void sync_core(void) "pushq %q0\n\t" "pushq $1f\n\t" "iretq\n\t" + UNWIND_HINT_RESTORE "1:" : "=&r" (tmp), "+r" (__sp) : : "cc", "memory"); #endif @@ -802,7 +811,9 @@ static inline void spin_lock_prefetch(const void *x) */ #define IA32_PAGE_OFFSET PAGE_OFFSET #define TASK_SIZE PAGE_OFFSET +#define TASK_SIZE_LOW TASK_SIZE #define TASK_SIZE_MAX TASK_SIZE +#define DEFAULT_MAP_WINDOW TASK_SIZE #define STACK_TOP TASK_SIZE #define STACK_TOP_MAX STACK_TOP @@ -842,7 +853,9 @@ static inline void spin_lock_prefetch(const void *x) * particular problem by preventing anything from being mapped * at the maximum canonical address. */ -#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) +#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) + +#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) /* This decides where the kernel will search for a free chunk of vm * space during mmap's. @@ -850,12 +863,14 @@ static inline void spin_lock_prefetch(const void *x) #define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ 0xc0000000 : 0xFFFFe000) +#define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \ + IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW) #define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \ IA32_PAGE_OFFSET : TASK_SIZE_MAX) #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ IA32_PAGE_OFFSET : TASK_SIZE_MAX) -#define STACK_TOP TASK_SIZE +#define STACK_TOP TASK_SIZE_LOW #define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ @@ -876,7 +891,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, * space during mmap's. */ #define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3)) -#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE) +#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) #define KSTK_EIP(task) (task_pt_regs(task)->ip) diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 8d3964fc5f91..b408b1886195 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -24,6 +24,9 @@ void entry_SYSENTER_compat(void); void __end_entry_SYSENTER_compat(void); void entry_SYSCALL_compat(void); void entry_INT80_compat(void); +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) +void xen_entry_INT80_compat(void); +#endif #endif void x86_configure_nx(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 2b5d686ea9f3..91c04c8e67fa 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -9,6 +9,20 @@ #ifdef __i386__ struct pt_regs { + /* + * NB: 32-bit x86 CPUs are inconsistent as what happens in the + * following cases (where %seg represents a segment register): + * + * - pushl %seg: some do a 16-bit write and leave the high + * bits alone + * - movl %seg, [mem]: some do a 16-bit write despite the movl + * - IDT entry: some (e.g. 486) will leave the high bits of CS + * and (if applicable) SS undefined. + * + * Fortunately, x86-32 doesn't read the high bits on POP or IRET, + * so we can just treat all of the segment registers as 16-bit + * values. + */ unsigned long bx; unsigned long cx; unsigned long dx; @@ -16,16 +30,22 @@ struct pt_regs { unsigned long di; unsigned long bp; unsigned long ax; - unsigned long ds; - unsigned long es; - unsigned long fs; - unsigned long gs; + unsigned short ds; + unsigned short __dsh; + unsigned short es; + unsigned short __esh; + unsigned short fs; + unsigned short __fsh; + unsigned short gs; + unsigned short __gsh; unsigned long orig_ax; unsigned long ip; - unsigned long cs; + unsigned short cs; + unsigned short __csh; unsigned long flags; unsigned long sp; - unsigned long ss; + unsigned short ss; + unsigned short __ssh; }; #else /* __i386__ */ @@ -176,6 +196,17 @@ static inline unsigned long regs_get_register(struct pt_regs *regs, if (offset == offsetof(struct pt_regs, sp) && regs->cs == __KERNEL_CS) return kernel_stack_pointer(regs); + + /* The selector fields are 16-bit. */ + if (offset == offsetof(struct pt_regs, cs) || + offset == offsetof(struct pt_regs, ss) || + offset == offsetof(struct pt_regs, ds) || + offset == offsetof(struct pt_regs, es) || + offset == offsetof(struct pt_regs, fs) || + offset == offsetof(struct pt_regs, gs)) { + return *(u16 *)((unsigned long)regs + offset); + + } #endif return *(unsigned long *)((unsigned long)regs + offset); } diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 230e1903acf0..90d91520c13a 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -1,6 +1,15 @@ #ifndef _ARCH_X86_REALMODE_H #define _ARCH_X86_REALMODE_H +/* + * Flag bit definitions for use with the flags field of the trampoline header + * in the CONFIG_X86_64 variant. + */ +#define TH_FLAGS_SME_ACTIVE_BIT 0 +#define TH_FLAGS_SME_ACTIVE BIT(TH_FLAGS_SME_ACTIVE_BIT) + +#ifndef __ASSEMBLY__ + #include <linux/types.h> #include <asm/io.h> @@ -38,6 +47,7 @@ struct trampoline_header { u64 start; u64 efer; u32 cr4; + u32 flags; #endif }; @@ -69,4 +79,6 @@ static inline size_t real_mode_size_needed(void) void set_real_mode_mem(phys_addr_t mem, size_t size); void reserve_real_mode(void); +#endif /* __ASSEMBLY__ */ + #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h new file mode 100644 index 000000000000..ff871210b9f2 --- /dev/null +++ b/arch/x86/include/asm/refcount.h @@ -0,0 +1,109 @@ +#ifndef __ASM_X86_REFCOUNT_H +#define __ASM_X86_REFCOUNT_H +/* + * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from + * PaX/grsecurity. + */ +#include <linux/refcount.h> + +/* + * This is the first portion of the refcount error handling, which lives in + * .text.unlikely, and is jumped to from the CPU flag check (in the + * following macros). This saves the refcount value location into CX for + * the exception handler to use (in mm/extable.c), and then triggers the + * central refcount exception. The fixup address for the exception points + * back to the regular execution flow in .text. + */ +#define _REFCOUNT_EXCEPTION \ + ".pushsection .text.unlikely\n" \ + "111:\tlea %[counter], %%" _ASM_CX "\n" \ + "112:\t" ASM_UD0 "\n" \ + ASM_UNREACHABLE \ + ".popsection\n" \ + "113:\n" \ + _ASM_EXTABLE_REFCOUNT(112b, 113b) + +/* Trigger refcount exception if refcount result is negative. */ +#define REFCOUNT_CHECK_LT_ZERO \ + "js 111f\n\t" \ + _REFCOUNT_EXCEPTION + +/* Trigger refcount exception if refcount result is zero or negative. */ +#define REFCOUNT_CHECK_LE_ZERO \ + "jz 111f\n\t" \ + REFCOUNT_CHECK_LT_ZERO + +/* Trigger refcount exception unconditionally. */ +#define REFCOUNT_ERROR \ + "jmp 111f\n\t" \ + _REFCOUNT_EXCEPTION + +static __always_inline void refcount_add(unsigned int i, refcount_t *r) +{ + asm volatile(LOCK_PREFIX "addl %1,%0\n\t" + REFCOUNT_CHECK_LT_ZERO + : [counter] "+m" (r->refs.counter) + : "ir" (i) + : "cc", "cx"); +} + +static __always_inline void refcount_inc(refcount_t *r) +{ + asm volatile(LOCK_PREFIX "incl %0\n\t" + REFCOUNT_CHECK_LT_ZERO + : [counter] "+m" (r->refs.counter) + : : "cc", "cx"); +} + +static __always_inline void refcount_dec(refcount_t *r) +{ + asm volatile(LOCK_PREFIX "decl %0\n\t" + REFCOUNT_CHECK_LE_ZERO + : [counter] "+m" (r->refs.counter) + : : "cc", "cx"); +} + +static __always_inline __must_check +bool refcount_sub_and_test(unsigned int i, refcount_t *r) +{ + GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO, + r->refs.counter, "er", i, "%0", e); +} + +static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r) +{ + GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO, + r->refs.counter, "%0", e); +} + +static __always_inline __must_check +bool refcount_add_not_zero(unsigned int i, refcount_t *r) +{ + int c, result; + + c = atomic_read(&(r->refs)); + do { + if (unlikely(c == 0)) + return false; + + result = c + i; + + /* Did we try to increment from/to an undesirable state? */ + if (unlikely(c < 0 || c == INT_MAX || result < c)) { + asm volatile(REFCOUNT_ERROR + : : [counter] "m" (r->refs.counter) + : "cc", "cx"); + break; + } + + } while (!atomic_try_cmpxchg(&(r->refs), &c, result)); + + return c != 0; +} + +static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r) +{ + return refcount_add_not_zero(1, r); +} + +#endif diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 661dd305694a..045f99211a99 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -1,45 +1,56 @@ #ifndef _ASM_X86_RMWcc #define _ASM_X86_RMWcc +#define __CLOBBERS_MEM "memory" +#define __CLOBBERS_MEM_CC_CX "memory", "cc", "cx" + #if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO) /* Use asm goto */ -#define __GEN_RMWcc(fullop, var, cc, ...) \ +#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ do { \ asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ - : : "m" (var), ## __VA_ARGS__ \ - : "memory" : cc_label); \ + : : [counter] "m" (var), ## __VA_ARGS__ \ + : clobbers : cc_label); \ return 0; \ cc_label: \ return 1; \ } while (0) -#define GEN_UNARY_RMWcc(op, var, arg0, cc) \ - __GEN_RMWcc(op " " arg0, var, cc) +#define __BINARY_RMWcc_ARG " %1, " -#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ - __GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val)) #else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ /* Use flags output or a set instruction */ -#define __GEN_RMWcc(fullop, var, cc, ...) \ +#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ do { \ bool c; \ asm volatile (fullop ";" CC_SET(cc) \ - : "+m" (var), CC_OUT(cc) (c) \ - : __VA_ARGS__ : "memory"); \ + : [counter] "+m" (var), CC_OUT(cc) (c) \ + : __VA_ARGS__ : clobbers); \ return c; \ } while (0) +#define __BINARY_RMWcc_ARG " %2, " + +#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ + #define GEN_UNARY_RMWcc(op, var, arg0, cc) \ - __GEN_RMWcc(op " " arg0, var, cc) + __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM) + +#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc) \ + __GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc, \ + __CLOBBERS_MEM_CC_CX) #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ - __GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val)) + __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc, \ + __CLOBBERS_MEM, vcon (val)) -#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ +#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc) \ + __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc, \ + __CLOBBERS_MEM_CC_CX, vcon (val)) #endif /* _ASM_X86_RMWcc */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 1549caa098f0..066aaf813141 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -238,9 +238,7 @@ #ifndef __ASSEMBLY__ extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; -#ifdef CONFIG_TRACING -# define trace_early_idt_handler_array early_idt_handler_array -#endif +extern void early_ignore_irq(void); /* * Load a segment. Fall back on loading the zero segment if something goes diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index eaec6c364e42..cd71273ec49d 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -11,6 +11,7 @@ * Executability : eXeutable, NoteXecutable * Read/Write : ReadOnly, ReadWrite * Presence : NotPresent + * Encryption : Encrypted, Decrypted * * Within a category, the attributes are mutually exclusive. * @@ -42,6 +43,8 @@ int set_memory_wt(unsigned long addr, int numpages); int set_memory_wb(unsigned long addr, int numpages); int set_memory_np(unsigned long addr, int numpages); int set_memory_4k(unsigned long addr, int numpages); +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); int set_memory_array_uc(unsigned long *addr, int addrinarray); int set_memory_array_wc(unsigned long *addr, int addrinarray); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index e4585a393965..a65cf544686a 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -39,6 +39,7 @@ static inline void vsmp_init(void) { } #endif void setup_bios_corruption_check(void); +void early_platform_quirks(void); extern unsigned long saved_video_mode; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e00e1bd6e7b3..5161da1a0fa0 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -98,6 +98,7 @@ struct thread_info { #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ #define TIF_X32 30 /* 32-bit native x86-64 binary */ +#define TIF_FSCHECK 31 /* Check FS is USER_DS on return */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -122,6 +123,7 @@ struct thread_info { #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_ADDR32 (1 << TIF_ADDR32) #define _TIF_X32 (1 << TIF_X32) +#define _TIF_FSCHECK (1 << TIF_FSCHECK) /* * work to do in syscall_trace_enter(). Also includes TIF_NOHZ for @@ -137,7 +139,8 @@ struct thread_info { (_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \ _TIF_NEED_RESCHED | _TIF_SINGLESTEP | _TIF_SYSCALL_EMU | \ _TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE | \ - _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT) + _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_FSCHECK) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index c7797307fc2b..79a4ca6a9606 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -15,4 +15,18 @@ #include <asm-generic/tlb.h> +/* + * While x86 architecture in general requires an IPI to perform TLB + * shootdown, enablement code for several hypervisors overrides + * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing + * a hypercall. To keep software pagetable walkers safe in this case we + * switch to RCU based table free (HAVE_RCU_TABLE_FREE). See the comment + * below 'ifdef CONFIG_HAVE_RCU_TABLE_FREE' in include/asm-generic/tlb.h + * for more details. + */ +static inline void __tlb_remove_table(void *table) +{ + free_page_and_swap_cache(table); +} + #endif /* _ASM_X86_TLB_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 50ea3482e1d1..d23e61dc0640 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -57,6 +57,23 @@ static inline void invpcid_flush_all_nonglobals(void) __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); } +static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) +{ + u64 new_tlb_gen; + + /* + * Bump the generation count. This also serves as a full barrier + * that synchronizes with switch_mm(): callers are required to order + * their read of mm_cpumask after their writes to the paging + * structures. + */ + smp_mb__before_atomic(); + new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); + smp_mb__after_atomic(); + + return new_tlb_gen; +} + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else @@ -65,6 +82,17 @@ static inline void invpcid_flush_all_nonglobals(void) #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +/* + * 6 because 6 should be plenty and struct tlb_state will fit in + * two cache lines. + */ +#define TLB_NR_DYN_ASIDS 6 + +struct tlb_context { + u64 ctx_id; + u64 tlb_gen; +}; + struct tlb_state { /* * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts @@ -73,13 +101,35 @@ struct tlb_state { * mode even if we've already switched back to swapper_pg_dir. */ struct mm_struct *loaded_mm; - int state; + u16 loaded_mm_asid; + u16 next_asid; /* * Access to this CR4 shadow and to H/W CR4 is protected by * disabling interrupts when modifying either one. */ unsigned long cr4; + + /* + * This is a list of all contexts that might exist in the TLB. + * There is one per ASID that we use, and the ASID (what the + * CPU calls PCID) is the index into ctxts. + * + * For each context, ctx_id indicates which mm the TLB's user + * entries came from. As an invariant, the TLB will never + * contain entries that are out-of-date as when that mm reached + * the tlb_gen in the list. + * + * To be clear, this means that it's legal for the TLB code to + * flush the TLB without updating tlb_gen. This can happen + * (for now, at least) due to paravirt remote flushes. + * + * NB: context 0 is a bit special, since it's also used by + * various bits of init code. This is fine -- code that + * isn't aware of PCID will end up harmlessly flushing + * context 0. + */ + struct tlb_context ctxs[TLB_NR_DYN_ASIDS]; }; DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); @@ -207,6 +257,14 @@ static inline void __flush_tlb_all(void) __flush_tlb_global(); else __flush_tlb(); + + /* + * Note: if we somehow had PCID but not PGE, then this wouldn't work -- + * we'd end up flushing kernel translations for the current ASID but + * we might fail to flush kernel translations for other cached ASIDs. + * + * To avoid this issue, we force PCID off if PGE is off. + */ } static inline void __flush_tlb_one(unsigned long addr) @@ -231,9 +289,26 @@ static inline void __flush_tlb_one(unsigned long addr) * and page-granular flushes are available only on i486 and up. */ struct flush_tlb_info { - struct mm_struct *mm; - unsigned long start; - unsigned long end; + /* + * We support several kinds of flushes. + * + * - Fully flush a single mm. .mm will be set, .end will be + * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to + * which the IPI sender is trying to catch us up. + * + * - Partially flush a single mm. .mm will be set, .start and + * .end will indicate the range, and .new_tlb_gen will be set + * such that the changes between generation .new_tlb_gen-1 and + * .new_tlb_gen are entirely contained in the indicated range. + * + * - Fully flush all mms whose tlb_gens have been updated. .mm + * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen + * will be zero. + */ + struct mm_struct *mm; + unsigned long start; + unsigned long end; + u64 new_tlb_gen; }; #define local_flush_tlb() __flush_tlb() @@ -256,12 +331,10 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info); -#define TLBSTATE_OK 1 -#define TLBSTATE_LAZY 2 - static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, struct mm_struct *mm) { + inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); } diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 6358a85e2270..c1d2a9892352 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -75,12 +75,6 @@ static inline const struct cpumask *cpumask_of_node(int node) extern void setup_node_to_cpumask_map(void); -/* - * Returns the number of the node containing Node 'node'. This - * architecture is flat, so it is a pretty simple function! - */ -#define parent_node(node) (node) - #define pcibus_to_node(bus) __pcibus_to_node(bus) extern int __node_distance(int, int); diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h new file mode 100644 index 000000000000..57c8da027d99 --- /dev/null +++ b/arch/x86/include/asm/trace/common.h @@ -0,0 +1,16 @@ +#ifndef _ASM_TRACE_COMMON_H +#define _ASM_TRACE_COMMON_H + +#ifdef CONFIG_TRACING +DECLARE_STATIC_KEY_FALSE(trace_pagefault_key); +#define trace_pagefault_enabled() \ + static_branch_unlikely(&trace_pagefault_key) +DECLARE_STATIC_KEY_FALSE(trace_resched_ipi_key); +#define trace_resched_ipi_enabled() \ + static_branch_unlikely(&trace_resched_ipi_key) +#else +static inline bool trace_pagefault_enabled(void) { return false; } +static inline bool trace_resched_ipi_enabled(void) { return false; } +#endif + +#endif diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h index 2422b14c50a7..5665bf205b8d 100644 --- a/arch/x86/include/asm/trace/exceptions.h +++ b/arch/x86/include/asm/trace/exceptions.h @@ -5,9 +5,10 @@ #define _TRACE_PAGE_FAULT_H #include <linux/tracepoint.h> +#include <asm/trace/common.h> -extern int trace_irq_vector_regfunc(void); -extern void trace_irq_vector_unregfunc(void); +extern int trace_pagefault_reg(void); +extern void trace_pagefault_unreg(void); DECLARE_EVENT_CLASS(x86_exceptions, @@ -37,8 +38,7 @@ DEFINE_EVENT_FN(x86_exceptions, name, \ TP_PROTO(unsigned long address, struct pt_regs *regs, \ unsigned long error_code), \ TP_ARGS(address, regs, error_code), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); + trace_pagefault_reg, trace_pagefault_unreg); DEFINE_PAGE_FAULT_EVENT(page_fault_user); DEFINE_PAGE_FAULT_EVENT(page_fault_kernel); diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 32dd6a9e343c..1599d394c8c1 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -5,9 +5,12 @@ #define _TRACE_IRQ_VECTORS_H #include <linux/tracepoint.h> +#include <asm/trace/common.h> -extern int trace_irq_vector_regfunc(void); -extern void trace_irq_vector_unregfunc(void); +#ifdef CONFIG_X86_LOCAL_APIC + +extern int trace_resched_ipi_reg(void); +extern void trace_resched_ipi_unreg(void); DECLARE_EVENT_CLASS(x86_irq_vector, @@ -28,15 +31,22 @@ DECLARE_EVENT_CLASS(x86_irq_vector, #define DEFINE_IRQ_VECTOR_EVENT(name) \ DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \ TP_PROTO(int vector), \ + TP_ARGS(vector), NULL, NULL); \ +DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ + TP_PROTO(int vector), \ + TP_ARGS(vector), NULL, NULL); + +#define DEFINE_RESCHED_IPI_EVENT(name) \ +DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \ + TP_PROTO(int vector), \ TP_ARGS(vector), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); \ + trace_resched_ipi_reg, \ + trace_resched_ipi_unreg); \ DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ TP_PROTO(int vector), \ TP_ARGS(vector), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); - + trace_resched_ipi_reg, \ + trace_resched_ipi_unreg); /* * local_timer - called when entering/exiting a local timer interrupt @@ -45,11 +55,6 @@ DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ DEFINE_IRQ_VECTOR_EVENT(local_timer); /* - * reschedule - called when entering/exiting a reschedule vector handler - */ -DEFINE_IRQ_VECTOR_EVENT(reschedule); - -/* * spurious_apic - called when entering/exiting a spurious apic vector handler */ DEFINE_IRQ_VECTOR_EVENT(spurious_apic); @@ -65,6 +70,7 @@ DEFINE_IRQ_VECTOR_EVENT(error_apic); */ DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi); +#ifdef CONFIG_IRQ_WORK /* * irq_work - called when entering/exiting a irq work interrupt * vector handler @@ -81,6 +87,18 @@ DEFINE_IRQ_VECTOR_EVENT(irq_work); * 4) goto 1 */ TRACE_EVENT_PERF_PERM(irq_work_exit, is_sampling_event(p_event) ? -EPERM : 0); +#endif + +/* + * The ifdef is required because that tracepoint macro hell emits tracepoint + * code in files which include this header even if the tracepoint is not + * enabled. Brilliant stuff that. + */ +#ifdef CONFIG_SMP +/* + * reschedule - called when entering/exiting a reschedule vector handler + */ +DEFINE_RESCHED_IPI_EVENT(reschedule); /* * call_function - called when entering/exiting a call function interrupt @@ -93,24 +111,33 @@ DEFINE_IRQ_VECTOR_EVENT(call_function); * single interrupt vector handler */ DEFINE_IRQ_VECTOR_EVENT(call_function_single); +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD /* * threshold_apic - called when entering/exiting a threshold apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(threshold_apic); +#endif +#ifdef CONFIG_X86_MCE_AMD /* * deferred_error_apic - called when entering/exiting a deferred apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic); +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR /* * thermal_apic - called when entering/exiting a thermal apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(thermal_apic); +#endif + +#endif /* CONFIG_X86_LOCAL_APIC */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 01fd0a7f48cd..5545f6459bf5 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -13,9 +13,6 @@ asmlinkage void divide_error(void); asmlinkage void debug(void); asmlinkage void nmi(void); asmlinkage void int3(void); -asmlinkage void xen_debug(void); -asmlinkage void xen_int3(void); -asmlinkage void xen_stack_segment(void); asmlinkage void overflow(void); asmlinkage void bounds(void); asmlinkage void invalid_op(void); @@ -38,22 +35,29 @@ asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ asmlinkage void simd_coprocessor_error(void); -#ifdef CONFIG_TRACING -asmlinkage void trace_page_fault(void); -#define trace_stack_segment stack_segment -#define trace_divide_error divide_error -#define trace_bounds bounds -#define trace_invalid_op invalid_op -#define trace_device_not_available device_not_available -#define trace_coprocessor_segment_overrun coprocessor_segment_overrun -#define trace_invalid_TSS invalid_TSS -#define trace_segment_not_present segment_not_present -#define trace_general_protection general_protection -#define trace_spurious_interrupt_bug spurious_interrupt_bug -#define trace_coprocessor_error coprocessor_error -#define trace_alignment_check alignment_check -#define trace_simd_coprocessor_error simd_coprocessor_error -#define trace_async_page_fault async_page_fault +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) +asmlinkage void xen_divide_error(void); +asmlinkage void xen_xendebug(void); +asmlinkage void xen_xenint3(void); +asmlinkage void xen_nmi(void); +asmlinkage void xen_overflow(void); +asmlinkage void xen_bounds(void); +asmlinkage void xen_invalid_op(void); +asmlinkage void xen_device_not_available(void); +asmlinkage void xen_double_fault(void); +asmlinkage void xen_coprocessor_segment_overrun(void); +asmlinkage void xen_invalid_TSS(void); +asmlinkage void xen_segment_not_present(void); +asmlinkage void xen_stack_segment(void); +asmlinkage void xen_general_protection(void); +asmlinkage void xen_page_fault(void); +asmlinkage void xen_spurious_interrupt_bug(void); +asmlinkage void xen_coprocessor_error(void); +asmlinkage void xen_alignment_check(void); +#ifdef CONFIG_X86_MCE +asmlinkage void xen_machine_check(void); +#endif /* CONFIG_X86_MCE */ +asmlinkage void xen_simd_coprocessor_error(void); #endif dotraplinkage void do_divide_error(struct pt_regs *, long); @@ -74,14 +78,6 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *); #endif dotraplinkage void do_general_protection(struct pt_regs *, long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); -#ifdef CONFIG_TRACING -dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long); -#else -static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error) -{ - do_page_fault(regs, error); -} -#endif dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); dotraplinkage void do_coprocessor_error(struct pt_regs *, long); dotraplinkage void do_alignment_check(struct pt_regs *, long); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 30269dafec47..184eb9894dae 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -26,7 +26,12 @@ #define get_ds() (KERNEL_DS) #define get_fs() (current->thread.addr_limit) -#define set_fs(x) (current->thread.addr_limit = (x)) +static inline void set_fs(mm_segment_t fs) +{ + current->thread.addr_limit = fs; + /* On user-mode return, check fs is correct */ + set_thread_flag(TIF_FSCHECK); +} #define segment_eq(a, b) ((a).seg == (b).seg) diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index e6676495b125..e9f793e2df7a 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -12,11 +12,14 @@ struct unwind_state { struct task_struct *task; int graph_idx; bool error; -#ifdef CONFIG_FRAME_POINTER +#if defined(CONFIG_ORC_UNWINDER) + bool signal, full_regs; + unsigned long sp, bp, ip; + struct pt_regs *regs; +#elif defined(CONFIG_FRAME_POINTER_UNWINDER) bool got_irq; - unsigned long *bp, *orig_sp; + unsigned long *bp, *orig_sp, ip; struct pt_regs *regs; - unsigned long ip; #else unsigned long *sp; #endif @@ -24,41 +27,30 @@ struct unwind_state { void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame); - bool unwind_next_frame(struct unwind_state *state); - unsigned long unwind_get_return_address(struct unwind_state *state); +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); static inline bool unwind_done(struct unwind_state *state) { return state->stack_info.type == STACK_TYPE_UNKNOWN; } -static inline -void unwind_start(struct unwind_state *state, struct task_struct *task, - struct pt_regs *regs, unsigned long *first_frame) -{ - first_frame = first_frame ? : get_stack_pointer(task, regs); - - __unwind_start(state, task, regs, first_frame); -} - static inline bool unwind_error(struct unwind_state *state) { return state->error; } -#ifdef CONFIG_FRAME_POINTER - static inline -unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +void unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) { - if (unwind_done(state)) - return NULL; + first_frame = first_frame ? : get_stack_pointer(task, regs); - return state->regs ? &state->regs->ip : state->bp + 1; + __unwind_start(state, task, regs, first_frame); } +#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER) static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { if (unwind_done(state)) @@ -66,20 +58,46 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) return state->regs; } - -#else /* !CONFIG_FRAME_POINTER */ - -static inline -unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +#else +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { return NULL; } +#endif -static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +#ifdef CONFIG_ORC_UNWINDER +void unwind_init(void); +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size); +#else +static inline void unwind_init(void) {} +static inline +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size) {} +#endif + +/* + * This disables KASAN checking when reading a value from another task's stack, + * since the other task could be running on another CPU and could have poisoned + * the stack in the meantime. + */ +#define READ_ONCE_TASK_STACK(task, x) \ +({ \ + unsigned long val; \ + if (task == current) \ + val = READ_ONCE(x); \ + else \ + val = READ_ONCE_NOCHECK(x); \ + val; \ +}) + +static inline bool task_on_another_cpu(struct task_struct *task) { - return NULL; +#ifdef CONFIG_SMP + return task != current && task->on_cpu; +#else + return false; +#endif } -#endif /* CONFIG_FRAME_POINTER */ - #endif /* _ASM_X86_UNWIND_H */ diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h new file mode 100644 index 000000000000..bae46fc6b9de --- /dev/null +++ b/arch/x86/include/asm/unwind_hints.h @@ -0,0 +1,105 @@ +#ifndef _ASM_X86_UNWIND_HINTS_H +#define _ASM_X86_UNWIND_HINTS_H + +#include "orc_types.h" + +#ifdef __ASSEMBLY__ + +/* + * In asm, there are two kinds of code: normal C-type callable functions and + * the rest. The normal callable functions can be called by other code, and + * don't do anything unusual with the stack. Such normal callable functions + * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this + * category. In this case, no special debugging annotations are needed because + * objtool can automatically generate the ORC data for the ORC unwinder to read + * at runtime. + * + * Anything which doesn't fall into the above category, such as syscall and + * interrupt handlers, tends to not be called directly by other functions, and + * often does unusual non-C-function-type things with the stack pointer. Such + * code needs to be annotated such that objtool can understand it. The + * following CFI hint macros are for this type of code. + * + * These macros provide hints to objtool about the state of the stack at each + * instruction. Objtool starts from the hints and follows the code flow, + * making automatic CFI adjustments when it sees pushes and pops, filling out + * the debuginfo as necessary. It will also warn if it sees any + * inconsistencies. + */ +.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL +#ifdef CONFIG_STACK_VALIDATION +.Lunwind_hint_ip_\@: + .pushsection .discard.unwind_hints + /* struct unwind_hint */ + .long .Lunwind_hint_ip_\@ - . + .short \sp_offset + .byte \sp_reg + .byte \type + .popsection +#endif +.endm + +.macro UNWIND_HINT_EMPTY + UNWIND_HINT sp_reg=ORC_REG_UNDEFINED +.endm + +.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0 + .if \base == %rsp + .if \indirect + .set sp_reg, ORC_REG_SP_INDIRECT + .else + .set sp_reg, ORC_REG_SP + .endif + .elseif \base == %rbp + .set sp_reg, ORC_REG_BP + .elseif \base == %rdi + .set sp_reg, ORC_REG_DI + .elseif \base == %rdx + .set sp_reg, ORC_REG_DX + .elseif \base == %r10 + .set sp_reg, ORC_REG_R10 + .else + .error "UNWIND_HINT_REGS: bad base register" + .endif + + .set sp_offset, \offset + + .if \iret + .set type, ORC_TYPE_REGS_IRET + .elseif \extra == 0 + .set type, ORC_TYPE_REGS_IRET + .set sp_offset, \offset + (16*8) + .else + .set type, ORC_TYPE_REGS + .endif + + UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type +.endm + +.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 + UNWIND_HINT_REGS base=\base offset=\offset iret=1 +.endm + +.macro UNWIND_HINT_FUNC sp_offset=8 + UNWIND_HINT sp_offset=\sp_offset +.endm + +#else /* !__ASSEMBLY__ */ + +#define UNWIND_HINT(sp_reg, sp_offset, type) \ + "987: \n\t" \ + ".pushsection .discard.unwind_hints\n\t" \ + /* struct unwind_hint */ \ + ".long 987b - .\n\t" \ + ".short " __stringify(sp_offset) "\n\t" \ + ".byte " __stringify(sp_reg) "\n\t" \ + ".byte " __stringify(type) "\n\t" \ + ".popsection\n\t" + +#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE) + +#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE) + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_UNWIND_HINTS_H */ diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h index c4b9dc2f67c5..9f42beefc67a 100644 --- a/arch/x86/include/asm/vga.h +++ b/arch/x86/include/asm/vga.h @@ -7,12 +7,24 @@ #ifndef _ASM_X86_VGA_H #define _ASM_X86_VGA_H +#include <asm/set_memory.h> + /* * On the PC, we can just recalculate addresses and then * access the videoram directly without any black magic. + * To support memory encryption however, we need to access + * the videoram as decrypted memory. */ -#define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x) +#define VGA_MAP_MEM(x, s) \ +({ \ + unsigned long start = (unsigned long)phys_to_virt(x); \ + \ + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) \ + set_memory_decrypted(start, (s) >> PAGE_SHIFT); \ + \ + start; \ +}) #define vga_readb(x) (*(x)) #define vga_writeb(x, y) (*(y) = (x)) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 11071fcd630e..9606688caa4b 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -552,6 +552,8 @@ static inline void MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, struct desc_struct desc) { + u32 *p = (u32 *) &desc; + mcl->op = __HYPERVISOR_update_descriptor; if (sizeof(maddr) == sizeof(long)) { mcl->args[0] = maddr; @@ -559,8 +561,8 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, } else { mcl->args[0] = maddr; mcl->args[1] = maddr >> 32; - mcl->args[2] = desc.a; - mcl->args[3] = desc.b; + mcl->args[2] = *p++; + mcl->args[3] = *p; } trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 2 : 4); diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index ddef37b16af2..66b8f93333d1 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -201,7 +201,7 @@ struct boot_params { * * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow. - * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest + * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, * which start at asm startup_xen() entry point and later jump to the C * xen_start_kernel() entry point. Both domU and dom0 type of guests are diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a01892bdd61a..fd0a7895b63f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -42,7 +42,7 @@ CFLAGS_irq.o := -I$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o obj-$(CONFIG_COMPAT) += signal_compat.o -obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o +obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o @@ -111,6 +111,7 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o +obj-$(CONFIG_EISA) += eisa.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o @@ -126,11 +127,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o -ifdef CONFIG_FRAME_POINTER -obj-y += unwind_frame.o -else -obj-y += unwind_guess.o -endif +obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o +obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o +obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o ### # 64 bit specific files diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7491e73d9253..f8ae286c1502 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -115,24 +115,24 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = { #define ACPI_INVALID_GSI INT_MIN /* - * This is just a simple wrapper around early_ioremap(), + * This is just a simple wrapper around early_memremap(), * with sanity checks for phys == 0 and size == 0. */ -char *__init __acpi_map_table(unsigned long phys, unsigned long size) +void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size) { if (!phys || !size) return NULL; - return early_ioremap(phys, size); + return early_memremap(phys, size); } -void __init __acpi_unmap_table(char *map, unsigned long size) +void __init __acpi_unmap_table(void __iomem *map, unsigned long size) { if (!map || !size) return; - early_iounmap(map, size); + early_memunmap(map, size); } #ifdef CONFIG_X86_LOCAL_APIC @@ -199,8 +199,10 @@ static int __init acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_madt_local_x2apic *processor = NULL; +#ifdef CONFIG_X86_X2APIC int apic_id; u8 enabled; +#endif processor = (struct acpi_madt_local_x2apic *)header; @@ -209,9 +211,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) acpi_table_print_madt_entry(header); +#ifdef CONFIG_X86_X2APIC apic_id = processor->local_apic_id; enabled = processor->lapic_flags & ACPI_MADT_ENABLED; -#ifdef CONFIG_X86_X2APIC + /* * We need to register disabled CPU as well to permit * counting disabled CPUs. This allows us to size @@ -1083,7 +1086,7 @@ static void __init mp_config_acpi_legacy_irqs(void) mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; #endif set_bit(MP_ISA_BUS, mp_bus_not_pci); - pr_debug("Bus #%d is ISA\n", MP_ISA_BUS); + pr_debug("Bus #%d is ISA (nIRQs: %d)\n", MP_ISA_BUS, nr_legacy_irqs()); /* * Use the default configuration for the IRQs 0-15. Unless diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 32e14d137416..3344d3382e91 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -742,7 +742,16 @@ static void *bp_int3_handler, *bp_int3_addr; int poke_int3_handler(struct pt_regs *regs) { - /* bp_patching_in_progress */ + /* + * Having observed our INT3 instruction, we now must observe + * bp_patching_in_progress. + * + * in_progress = TRUE INT3 + * WMB RMB + * write INT3 if (in_progress) + * + * Idem for bp_int3_handler. + */ smp_rmb(); if (likely(!bp_patching_in_progress)) @@ -788,9 +797,8 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) bp_int3_addr = (u8 *)addr + sizeof(int3); bp_patching_in_progress = true; /* - * Corresponding read barrier in int3 notifier for - * making sure the in_progress flags is correctly ordered wrt. - * patching + * Corresponding read barrier in int3 notifier for making sure the + * in_progress and handler are correctly ordered wrt. patching. */ smp_wmb(); @@ -815,9 +823,11 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) text_poke(addr, opcode, sizeof(int3)); on_each_cpu(do_sync_core, NULL, 1); - + /* + * sync_core() implies an smp_mb() and orders this store against + * the writing of the new instruction. + */ bp_patching_in_progress = false; - smp_wmb(); return addr; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 98b3dd8cf2bf..7834f73efbf1 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -177,8 +177,6 @@ static int disable_apic_timer __initdata; int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); -int first_system_vector = FIRST_SYSTEM_VECTOR; - /* * Debug level, exported for io_apic.c */ @@ -599,9 +597,13 @@ static const struct x86_cpu_id deadline_match[] = { static void apic_check_deadline_errata(void) { - const struct x86_cpu_id *m = x86_match_cpu(deadline_match); + const struct x86_cpu_id *m; u32 rev; + if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return; + + m = x86_match_cpu(deadline_match); if (!m) return; @@ -990,8 +992,7 @@ void setup_secondary_APIC_clock(void) */ static void local_apic_timer_interrupt(void) { - int cpu = smp_processor_id(); - struct clock_event_device *evt = &per_cpu(lapic_events, cpu); + struct clock_event_device *evt = this_cpu_ptr(&lapic_events); /* * Normally we should not be here till LAPIC has been initialized but @@ -1005,7 +1006,8 @@ static void local_apic_timer_interrupt(void) * spurious. */ if (!evt->event_handler) { - pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu); + pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", + smp_processor_id()); /* Switch it off */ lapic_timer_shutdown(evt); return; @@ -1040,25 +1042,6 @@ __visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) * interrupt lock, which is the WrongThing (tm) to do. */ entering_ack_irq(); - local_apic_timer_interrupt(); - exiting_irq(); - - set_irq_regs(old_regs); -} - -__visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - * - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - entering_ack_irq(); trace_local_timer_entry(LOCAL_TIMER_VECTOR); local_apic_timer_interrupt(); trace_local_timer_exit(LOCAL_TIMER_VECTOR); @@ -1920,10 +1903,14 @@ void __init register_lapic_address(unsigned long address) /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -static void __smp_spurious_interrupt(u8 vector) +__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) { + u8 vector = ~regs->orig_ax; u32 v; + entering_irq(); + trace_spurious_apic_entry(vector); + /* * Check if this really is a spurious interrupt and ACK it * if it is a vectored one. Just in case... @@ -1938,22 +1925,7 @@ static void __smp_spurious_interrupt(u8 vector) /* see sw-dev-man vol 3, chapter 7.4.13.5 */ pr_info("spurious APIC interrupt through vector %02x on CPU#%d, " "should never happen.\n", vector, smp_processor_id()); -} -__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) -{ - entering_irq(); - __smp_spurious_interrupt(~regs->orig_ax); - exiting_irq(); -} - -__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) -{ - u8 vector = ~regs->orig_ax; - - entering_irq(); - trace_spurious_apic_entry(vector); - __smp_spurious_interrupt(vector); trace_spurious_apic_exit(vector); exiting_irq(); } @@ -1961,10 +1933,8 @@ __visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) /* * This interrupt should never happen with our APIC/SMP architecture */ -static void __smp_error_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) { - u32 v; - u32 i = 0; static const char * const error_interrupt_reason[] = { "Send CS error", /* APIC Error Bit 0 */ "Receive CS error", /* APIC Error Bit 1 */ @@ -1975,6 +1945,10 @@ static void __smp_error_interrupt(struct pt_regs *regs) "Received illegal vector", /* APIC Error Bit 6 */ "Illegal register address", /* APIC Error Bit 7 */ }; + u32 v, i = 0; + + entering_irq(); + trace_error_apic_entry(ERROR_APIC_VECTOR); /* First tickle the hardware, only then report what went on. -- REW */ if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */ @@ -1996,20 +1970,6 @@ static void __smp_error_interrupt(struct pt_regs *regs) apic_printk(APIC_DEBUG, KERN_CONT "\n"); -} - -__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) -{ - entering_irq(); - __smp_error_interrupt(regs); - exiting_irq(); -} - -__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs) -{ - entering_irq(); - trace_error_apic_entry(ERROR_APIC_VECTOR); - __smp_error_interrupt(regs); trace_error_apic_exit(ERROR_APIC_VECTOR); exiting_irq(); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 237e9c2341c7..70e48aa6af98 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1243,7 +1243,7 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) entry.vector, entry.irr, entry.delivery_status); if (ir_entry->format) printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", - buf, (ir_entry->index << 15) | ir_entry->index, + buf, (ir_entry->index2 << 15) | ir_entry->index, ir_entry->zero); else printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index b3af457ed667..88c214e75a6b 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -166,7 +166,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d, offset = current_offset; next: vector += 16; - if (vector >= first_system_vector) { + if (vector >= FIRST_SYSTEM_VECTOR) { offset = (offset + 1) % 16; vector = FIRST_EXTERNAL_VECTOR + offset; } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 880aa093268d..710edab9e644 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -4,9 +4,6 @@ #include <asm/ucontext.h> -#include <linux/lguest.h> -#include "../../../drivers/lguest/lg.h" - #define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include <asm/syscalls_32.h> @@ -62,23 +59,6 @@ void foo(void) OFFSET(stack_canary_offset, stack_canary, canary); #endif -#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) - BLANK(); - OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); - OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); - - BLANK(); - OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); - OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); - OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); - OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp); - OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc); - OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc); - OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt); - OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum); - OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); - OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); -#endif BLANK(); DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); DEFINE(NR_syscalls, sizeof(syscalls)); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 99332f550c48..cf42206926af 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -20,7 +20,6 @@ static char syscalls_ia32[] = { int main(void) { #ifdef CONFIG_PARAVIRT - OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); BLANK(); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cdf82492b770..e17942c131c8 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o +obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 3b9e220621f8..9862e2cd6d93 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -297,13 +297,29 @@ static int nearby_node(int apicid) } #endif +#ifdef CONFIG_SMP +/* + * Fix up cpu_core_id for pre-F17h systems to be in the + * [0 .. cores_per_node - 1] range. Not really needed but + * kept so as not to break existing setups. + */ +static void legacy_fixup_core_id(struct cpuinfo_x86 *c) +{ + u32 cus_per_node; + + if (c->x86 >= 0x17) + return; + + cus_per_node = c->x86_max_cores / nodes_per_socket; + c->cpu_core_id %= cus_per_node; +} + /* * Fixup core topology information for * (1) AMD multi-node processors * Assumption: Number of cores in each internal node is the same. * (2) AMD processors supporting compute units */ -#ifdef CONFIG_SMP static void amd_get_topology(struct cpuinfo_x86 *c) { u8 node_id; @@ -354,15 +370,9 @@ static void amd_get_topology(struct cpuinfo_x86 *c) } else return; - /* fixup multi-node processor information */ if (nodes_per_socket > 1) { - u32 cus_per_node; - set_cpu_cap(c, X86_FEATURE_AMD_DCM); - cus_per_node = c->x86_max_cores / nodes_per_socket; - - /* core id has to be in the [0 .. cores_per_node - 1] range */ - c->cpu_core_id %= cus_per_node; + legacy_fixup_core_id(c); } } #endif @@ -548,8 +558,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) static void early_init_amd(struct cpuinfo_x86 *c) { + u32 dummy; + early_init_amd_mc(c); + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); + /* * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate * with P/T states and does not stop in deep C-states @@ -612,6 +626,27 @@ static void early_init_amd(struct cpuinfo_x86 *c) */ if (cpu_has_amd_erratum(c, amd_erratum_400)) set_cpu_bug(c, X86_BUG_AMD_E400); + + /* + * BIOS support is required for SME. If BIOS has enabled SME then + * adjust x86_phys_bits by the SME physical address space reduction + * value. If BIOS has not enabled SME then don't advertise the + * feature (set in scattered.c). Also, since the SME support requires + * long mode, don't advertise the feature under CONFIG_X86_32. + */ + if (cpu_has(c, X86_FEATURE_SME)) { + u64 msr; + + /* Check if SME is enabled */ + rdmsrl(MSR_K8_SYSCFG, msr); + if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) { + c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f; + if (IS_ENABLED(CONFIG_X86_32)) + clear_cpu_cap(c, X86_FEATURE_SME); + } else { + clear_cpu_cap(c, X86_FEATURE_SME); + } + } } static void init_amd_k8(struct cpuinfo_x86 *c) @@ -730,8 +765,6 @@ static void init_amd_bd(struct cpuinfo_x86 *c) static void init_amd(struct cpuinfo_x86 *c) { - u32 dummy; - early_init_amd(c); /* @@ -793,8 +826,6 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 > 0x11) set_cpu_cap(c, X86_FEATURE_ARAT); - rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); - /* 3DNow or LM implies PREFETCHW */ if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0af86d9242da..db684880d74a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -21,6 +21,14 @@ void __init check_bugs(void) { +#ifdef CONFIG_X86_32 + /* + * Regardless of whether PCID is enumerated, the SDM says + * that it can't be enabled in 32-bit mode. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); +#endif + identify_boot_cpu(); if (!IS_ENABLED(CONFIG_SMP)) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c8b39870f33e..efba8e3da3e2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -168,6 +168,24 @@ static int __init x86_mpx_setup(char *s) } __setup("nompx", x86_mpx_setup); +#ifdef CONFIG_X86_64 +static int __init x86_pcid_setup(char *s) +{ + /* require an exact match without trailing characters */ + if (strlen(s)) + return 0; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_PCID)) + return 1; + + setup_clear_cpu_cap(X86_FEATURE_PCID); + pr_info("nopcid: PCID feature disabled\n"); + return 1; +} +__setup("nopcid", x86_pcid_setup); +#endif + static int __init x86_noinvpcid_setup(char *s) { /* noinvpcid doesn't accept parameters */ @@ -311,6 +329,25 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) } } +static void setup_pcid(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_PCID)) { + if (cpu_has(c, X86_FEATURE_PGE)) { + cr4_set_bits(X86_CR4_PCIDE); + } else { + /* + * flush_tlb_all(), as currently implemented, won't + * work if PCID is on but PGE is not. Since that + * combination doesn't exist on real hardware, there's + * no reason to try to fully support it, but it's + * polite to avoid corrupting data if we're on + * an improperly configured VM. + */ + clear_cpu_cap(c, X86_FEATURE_PCID); + } + } +} + /* * Protection Keys are not available in 32-bit mode. */ @@ -1125,6 +1162,9 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_smep(c); setup_smap(c); + /* Set up PCID */ + setup_pcid(c); + /* * The vendor-specific functions might have changed features. * Now we do "generic changes." @@ -1289,15 +1329,6 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 -struct desc_ptr idt_descr __ro_after_init = { - .size = NR_VECTORS * 16 - 1, - .address = (unsigned long) idt_table, -}; -const struct desc_ptr debug_idt_descr = { - .size = NR_VECTORS * 16 - 1, - .address = (unsigned long) debug_idt_table, -}; - DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE) __visible; diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index c55fb2cb2acc..24f749324c0f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -811,7 +811,24 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, struct cacheinfo *this_leaf; int i, sibling; - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + /* + * For L3, always use the pre-calculated cpu_llc_shared_mask + * to derive shared_cpu_map. + */ + if (index == 3) { + for_each_cpu(i, cpu_llc_shared_mask(cpu)) { + this_cpu_ci = get_cpu_cacheinfo(i); + if (!this_cpu_ci->info_list) + continue; + this_leaf = this_cpu_ci->info_list + index; + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { + if (!cpu_online(sibling)) + continue; + cpumask_set_cpu(sibling, + &this_leaf->shared_cpu_map); + } + } + } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { unsigned int apicid, nshared, first, last; this_leaf = this_cpu_ci->info_list + index; @@ -839,19 +856,6 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, &this_leaf->shared_cpu_map); } } - } else if (index == 3) { - for_each_cpu(i, cpu_llc_shared_mask(cpu)) { - this_cpu_ci = get_cpu_cacheinfo(i); - if (!this_cpu_ci->info_list) - continue; - this_leaf = this_cpu_ci->info_list + index; - for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { - if (!cpu_online(sibling)) - continue; - cpumask_set_cpu(sibling, - &this_leaf->shared_cpu_map); - } - } } else return 0; diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 5b366462f579..cd5fc61ba450 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -30,7 +30,8 @@ #include <linux/cpuhotplug.h> #include <asm/intel-family.h> -#include <asm/intel_rdt.h> +#include <asm/intel_rdt_sched.h> +#include "intel_rdt.h" #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 @@ -38,7 +39,13 @@ /* Mutex to protect rdtgroup access. */ DEFINE_MUTEX(rdtgroup_mutex); -DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); +/* + * The cached intel_pqr_state is strictly per CPU and can never be + * updated from a remote CPU. Functions which modify the state + * are called with interrupts disabled and no preemption, which + * is sufficient for the protection. + */ +DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); /* * Used to store the max resource name width and max resource data width @@ -46,6 +53,12 @@ DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); */ int max_name_width, max_data_width; +/* + * Global boolean for rdt_alloc which is true if any + * resource allocation is enabled. + */ +bool rdt_alloc_capable; + static void mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); static void @@ -54,7 +67,9 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) struct rdt_resource rdt_resources_all[] = { + [RDT_RESOURCE_L3] = { + .rid = RDT_RESOURCE_L3, .name = "L3", .domains = domain_init(RDT_RESOURCE_L3), .msr_base = IA32_L3_CBM_BASE, @@ -67,8 +82,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L3DATA] = { + .rid = RDT_RESOURCE_L3DATA, .name = "L3DATA", .domains = domain_init(RDT_RESOURCE_L3DATA), .msr_base = IA32_L3_CBM_BASE, @@ -81,8 +99,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L3CODE] = { + .rid = RDT_RESOURCE_L3CODE, .name = "L3CODE", .domains = domain_init(RDT_RESOURCE_L3CODE), .msr_base = IA32_L3_CBM_BASE, @@ -95,8 +116,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L2] = { + .rid = RDT_RESOURCE_L2, .name = "L2", .domains = domain_init(RDT_RESOURCE_L2), .msr_base = IA32_L2_CBM_BASE, @@ -109,8 +133,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_MBA] = { + .rid = RDT_RESOURCE_MBA, .name = "MB", .domains = domain_init(RDT_RESOURCE_MBA), .msr_base = IA32_MBA_THRTL_BASE, @@ -118,6 +145,7 @@ struct rdt_resource rdt_resources_all[] = { .cache_level = 3, .parse_ctrlval = parse_bw, .format_str = "%d=%*d", + .fflags = RFTYPE_RES_MB, }, }; @@ -144,33 +172,28 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) * is always 20 on hsw server parts. The minimum cache bitmask length * allowed for HSW server is always 2 bits. Hardcode all of them. */ -static inline bool cache_alloc_hsw_probe(void) +static inline void cache_alloc_hsw_probe(void) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; - u32 l, h, max_cbm = BIT_MASK(20) - 1; - - if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) - return false; - rdmsr(IA32_L3_CBM_BASE, l, h); + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + u32 l, h, max_cbm = BIT_MASK(20) - 1; - /* If all the bits were set in MSR, return success */ - if (l != max_cbm) - return false; + if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) + return; + rdmsr(IA32_L3_CBM_BASE, l, h); - r->num_closid = 4; - r->default_ctrl = max_cbm; - r->cache.cbm_len = 20; - r->cache.min_cbm_bits = 2; - r->capable = true; - r->enabled = true; + /* If all the bits were set in MSR, return success */ + if (l != max_cbm) + return; - return true; - } + r->num_closid = 4; + r->default_ctrl = max_cbm; + r->cache.cbm_len = 20; + r->cache.shareable_bits = 0xc0000; + r->cache.min_cbm_bits = 2; + r->alloc_capable = true; + r->alloc_enabled = true; - return false; + rdt_alloc_capable = true; } /* @@ -213,15 +236,14 @@ static bool rdt_get_mem_config(struct rdt_resource *r) return false; } r->data_width = 3; - rdt_get_mba_infofile(r); - r->capable = true; - r->enabled = true; + r->alloc_capable = true; + r->alloc_enabled = true; return true; } -static void rdt_get_cache_config(int idx, struct rdt_resource *r) +static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) { union cpuid_0x10_1_eax eax; union cpuid_0x10_x_edx edx; @@ -231,10 +253,10 @@ static void rdt_get_cache_config(int idx, struct rdt_resource *r) r->num_closid = edx.split.cos_max + 1; r->cache.cbm_len = eax.split.cbm_len + 1; r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->cache.shareable_bits = ebx & r->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; - rdt_get_cache_infofile(r); - r->capable = true; - r->enabled = true; + r->alloc_capable = true; + r->alloc_enabled = true; } static void rdt_get_cdp_l3_config(int type) @@ -246,12 +268,12 @@ static void rdt_get_cdp_l3_config(int type) r->cache.cbm_len = r_l3->cache.cbm_len; r->default_ctrl = r_l3->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; - r->capable = true; + r->alloc_capable = true; /* * By default, CDP is disabled. CDP can be enabled by mount parameter * "cdp" during resctrl file system mount time. */ - r->enabled = false; + r->alloc_enabled = false; } static int get_cache_id(int cpu, int level) @@ -300,6 +322,19 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); } +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) +{ + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->cpu_mask)) + return d; + } + + return NULL; +} + void rdt_ctrl_update(void *arg) { struct msr_param *m = arg; @@ -307,12 +342,10 @@ void rdt_ctrl_update(void *arg) int cpu = smp_processor_id(); struct rdt_domain *d; - list_for_each_entry(d, &r->domains, list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - r->msr_update(d, m, r); - return; - } + d = get_domain_from_cpu(cpu, r); + if (d) { + r->msr_update(d, m, r); + return; } pr_warn_once("cpu %d not found in any domain for resource %s\n", cpu, r->name); @@ -326,8 +359,8 @@ void rdt_ctrl_update(void *arg) * caller, return the first domain whose id is bigger than the input id. * The domain list is sorted by id in ascending order. */ -static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, - struct list_head **pos) +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos) { struct rdt_domain *d; struct list_head *l; @@ -377,6 +410,44 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) return 0; } +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +{ + size_t tsize; + + if (is_llc_occupancy_enabled()) { + d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid), + sizeof(unsigned long), + GFP_KERNEL); + if (!d->rmid_busy_llc) + return -ENOMEM; + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); + } + if (is_mbm_total_enabled()) { + tsize = sizeof(*d->mbm_total); + d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_total) { + kfree(d->rmid_busy_llc); + return -ENOMEM; + } + } + if (is_mbm_local_enabled()) { + tsize = sizeof(*d->mbm_local); + d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_local) { + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + return -ENOMEM; + } + } + + if (is_mbm_enabled()) { + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + } + + return 0; +} + /* * domain_add_cpu - Add a cpu to a resource's domain list. * @@ -412,14 +483,26 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; d->id = id; + cpumask_set_cpu(cpu, &d->cpu_mask); - if (domain_setup_ctrlval(r, d)) { + if (r->alloc_capable && domain_setup_ctrlval(r, d)) { + kfree(d); + return; + } + + if (r->mon_capable && domain_setup_mon_state(r, d)) { kfree(d); return; } - cpumask_set_cpu(cpu, &d->cpu_mask); list_add_tail(&d->list, add_pos); + + /* + * If resctrl is mounted, add + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + mkdir_mondata_subdir_allrdtgrp(r, d); } static void domain_remove_cpu(int cpu, struct rdt_resource *r) @@ -435,19 +518,58 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { + /* + * If resctrl is mounted, remove all the + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + rmdir_mondata_subdir_allrdtgrp(r, d->id); kfree(d->ctrl_val); + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); list_del(&d->list); + if (is_mbm_enabled()) + cancel_delayed_work(&d->mbm_over); + if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + /* + * When a package is going down, forcefully + * decrement rmid->ebusy. There is no way to know + * that the L3 was flushed and hence may lead to + * incorrect counts in rare scenarios, but leaving + * the RMID as busy creates RMID leaks if the + * package never comes back. + */ + __check_limbo(d, true); + cancel_delayed_work(&d->cqm_limbo); + } + kfree(d); + return; + } + + if (r == &rdt_resources_all[RDT_RESOURCE_L3]) { + if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d, 0); + } + if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && + has_busy_rmid(r, d)) { + cancel_delayed_work(&d->cqm_limbo); + cqm_setup_limbo_handler(d, 0); + } } } -static void clear_closid(int cpu) +static void clear_closid_rmid(int cpu) { struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - per_cpu(cpu_closid, cpu) = 0; - state->closid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); + state->default_closid = 0; + state->default_rmid = 0; + state->cur_closid = 0; + state->cur_rmid = 0; + wrmsr(IA32_PQR_ASSOC, 0, 0); } static int intel_rdt_online_cpu(unsigned int cpu) @@ -459,12 +581,23 @@ static int intel_rdt_online_cpu(unsigned int cpu) domain_add_cpu(cpu, r); /* The cpu is set in default rdtgroup after online. */ cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); - clear_closid(cpu); + clear_closid_rmid(cpu); mutex_unlock(&rdtgroup_mutex); return 0; } +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) { + break; + } + } +} + static int intel_rdt_offline_cpu(unsigned int cpu) { struct rdtgroup *rdtgrp; @@ -474,10 +607,12 @@ static int intel_rdt_offline_cpu(unsigned int cpu) for_each_capable_rdt_resource(r) domain_remove_cpu(cpu, r); list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); break; + } } - clear_closid(cpu); + clear_closid_rmid(cpu); mutex_unlock(&rdtgroup_mutex); return 0; @@ -492,7 +627,7 @@ static __init void rdt_init_padding(void) struct rdt_resource *r; int cl; - for_each_capable_rdt_resource(r) { + for_each_alloc_capable_rdt_resource(r) { cl = strlen(r->name); if (cl > max_name_width) max_name_width = cl; @@ -502,38 +637,153 @@ static __init void rdt_init_padding(void) } } -static __init bool get_rdt_resources(void) +enum { + RDT_FLAG_CMT, + RDT_FLAG_MBM_TOTAL, + RDT_FLAG_MBM_LOCAL, + RDT_FLAG_L3_CAT, + RDT_FLAG_L3_CDP, + RDT_FLAG_L2_CAT, + RDT_FLAG_MBA, +}; + +#define RDT_OPT(idx, n, f) \ +[idx] = { \ + .name = n, \ + .flag = f \ +} + +struct rdt_options { + char *name; + int flag; + bool force_off, force_on; +}; + +static struct rdt_options rdt_options[] __initdata = { + RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), + RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), + RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), + RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3), + RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3), + RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2), + RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), +}; +#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) + +static int __init set_rdt_options(char *str) +{ + struct rdt_options *o; + bool force_off; + char *tok; + + if (*str == '=') + str++; + while ((tok = strsep(&str, ",")) != NULL) { + force_off = *tok == '!'; + if (force_off) + tok++; + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (strcmp(tok, o->name) == 0) { + if (force_off) + o->force_off = true; + else + o->force_on = true; + break; + } + } + } + return 1; +} +__setup("rdt", set_rdt_options); + +static bool __init rdt_cpu_has(int flag) +{ + bool ret = boot_cpu_has(flag); + struct rdt_options *o; + + if (!ret) + return ret; + + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (flag == o->flag) { + if (o->force_off) + ret = false; + if (o->force_on) + ret = true; + break; + } + } + return ret; +} + +static __init bool get_rdt_alloc_resources(void) { bool ret = false; - if (cache_alloc_hsw_probe()) + if (rdt_alloc_capable) return true; if (!boot_cpu_has(X86_FEATURE_RDT_A)) return false; - if (boot_cpu_has(X86_FEATURE_CAT_L3)) { - rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); - if (boot_cpu_has(X86_FEATURE_CDP_L3)) { + if (rdt_cpu_has(X86_FEATURE_CAT_L3)) { + rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]); + if (rdt_cpu_has(X86_FEATURE_CDP_L3)) { rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); } ret = true; } - if (boot_cpu_has(X86_FEATURE_CAT_L2)) { + if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { /* CPUID 0x10.2 fields are same format at 0x10.1 */ - rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); + rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]); ret = true; } - if (boot_cpu_has(X86_FEATURE_MBA)) { + if (rdt_cpu_has(X86_FEATURE_MBA)) { if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA])) ret = true; } - return ret; } +static __init bool get_rdt_mon_resources(void) +{ + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) + rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + + if (!rdt_mon_features) + return false; + + return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]); +} + +static __init void rdt_quirks(void) +{ + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_HASWELL_X: + if (!rdt_options[RDT_FLAG_L3_CAT].force_off) + cache_alloc_hsw_probe(); + break; + case INTEL_FAM6_SKYLAKE_X: + if (boot_cpu_data.x86_mask <= 4) + set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); + } +} + +static __init bool get_rdt_resources(void) +{ + rdt_quirks(); + rdt_alloc_capable = get_rdt_alloc_resources(); + rdt_mon_capable = get_rdt_mon_resources(); + + return (rdt_mon_capable || rdt_alloc_capable); +} + static int __init intel_rdt_late_init(void) { struct rdt_resource *r; @@ -556,9 +806,12 @@ static int __init intel_rdt_late_init(void) return ret; } - for_each_capable_rdt_resource(r) + for_each_alloc_capable_rdt_resource(r) pr_info("Intel RDT %s allocation detected\n", r->name); + for_each_mon_capable_rdt_resource(r) + pr_info("Intel RDT %s monitoring detected\n", r->name); + return 0; } diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h new file mode 100644 index 000000000000..ebaddaeef023 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -0,0 +1,440 @@ +#ifndef _ASM_X86_INTEL_RDT_H +#define _ASM_X86_INTEL_RDT_H + +#include <linux/sched.h> +#include <linux/kernfs.h> +#include <linux/jump_label.h> + +#define IA32_L3_QOS_CFG 0xc81 +#define IA32_L3_CBM_BASE 0xc90 +#define IA32_L2_CBM_BASE 0xd10 +#define IA32_MBA_THRTL_BASE 0xd50 + +#define L3_QOS_CDP_ENABLE 0x01ULL + +/* + * Event IDs are used to program IA32_QM_EVTSEL before reading event + * counter from IA32_QM_CTR + */ +#define QOS_L3_OCCUP_EVENT_ID 0x01 +#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02 +#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 + +#define CQM_LIMBOCHECK_INTERVAL 1000 + +#define MBM_CNTR_WIDTH 24 +#define MBM_OVERFLOW_INTERVAL 1000 + +#define RMID_VAL_ERROR BIT_ULL(63) +#define RMID_VAL_UNAVAIL BIT_ULL(62) + +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); + +/** + * struct mon_evt - Entry in the event list of a resource + * @evtid: event id + * @name: name of the event + */ +struct mon_evt { + u32 evtid; + char *name; + struct list_head list; +}; + +/** + * struct mon_data_bits - Monitoring details for each event file + * @rid: Resource id associated with the event file. + * @evtid: Event id associated with the event file + * @domid: The domain to which the event file belongs + */ +union mon_data_bits { + void *priv; + struct { + unsigned int rid : 10; + unsigned int evtid : 8; + unsigned int domid : 14; + } u; +}; + +struct rmid_read { + struct rdtgroup *rgrp; + struct rdt_domain *d; + int evtid; + bool first; + u64 val; +}; + +extern unsigned int intel_cqm_threshold; +extern bool rdt_alloc_capable; +extern bool rdt_mon_capable; +extern unsigned int rdt_mon_features; + +enum rdt_group_type { + RDTCTRL_GROUP = 0, + RDTMON_GROUP, + RDT_NUM_GROUP, +}; + +/** + * struct mongroup - store mon group's data in resctrl fs. + * @mon_data_kn kernlfs node for the mon_data directory + * @parent: parent rdtgrp + * @crdtgrp_list: child rdtgroup node list + * @rmid: rmid for this rdtgroup + */ +struct mongroup { + struct kernfs_node *mon_data_kn; + struct rdtgroup *parent; + struct list_head crdtgrp_list; + u32 rmid; +}; + +/** + * struct rdtgroup - store rdtgroup's data in resctrl file system. + * @kn: kernfs node + * @rdtgroup_list: linked list for all rdtgroups + * @closid: closid for this rdtgroup + * @cpu_mask: CPUs assigned to this rdtgroup + * @flags: status bits + * @waitcount: how many cpus expect to find this + * group when they acquire rdtgroup_mutex + * @type: indicates type of this rdtgroup - either + * monitor only or ctrl_mon group + * @mon: mongroup related data + */ +struct rdtgroup { + struct kernfs_node *kn; + struct list_head rdtgroup_list; + u32 closid; + struct cpumask cpu_mask; + int flags; + atomic_t waitcount; + enum rdt_group_type type; + struct mongroup mon; +}; + +/* rdtgroup.flags */ +#define RDT_DELETED 1 + +/* rftype.flags */ +#define RFTYPE_FLAGS_CPUS_LIST 1 + +/* + * Define the file type flags for base and info directories. + */ +#define RFTYPE_INFO BIT(0) +#define RFTYPE_BASE BIT(1) +#define RF_CTRLSHIFT 4 +#define RF_MONSHIFT 5 +#define RFTYPE_CTRL BIT(RF_CTRLSHIFT) +#define RFTYPE_MON BIT(RF_MONSHIFT) +#define RFTYPE_RES_CACHE BIT(8) +#define RFTYPE_RES_MB BIT(9) +#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) +#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) +#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) + +/* List of all resource groups */ +extern struct list_head rdt_all_groups; + +extern int max_name_width, max_data_width; + +int __init rdtgroup_init(void); + +/** + * struct rftype - describe each file in the resctrl file system + * @name: File name + * @mode: Access mode + * @kf_ops: File operations + * @flags: File specific RFTYPE_FLAGS_* flags + * @fflags: File specific RF_* or RFTYPE_* flags + * @seq_show: Show content of the file + * @write: Write to the file + */ +struct rftype { + char *name; + umode_t mode; + struct kernfs_ops *kf_ops; + unsigned long flags; + unsigned long fflags; + + int (*seq_show)(struct kernfs_open_file *of, + struct seq_file *sf, void *v); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. + */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +}; + +/** + * struct mbm_state - status for each MBM counter in each domain + * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) + * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it + */ +struct mbm_state { + u64 chunks; + u64 prev_msr; +}; + +/** + * struct rdt_domain - group of cpus sharing an RDT resource + * @list: all instances of this resource + * @id: unique id for this instance + * @cpu_mask: which cpus share this resource + * @rmid_busy_llc: + * bitmap of which limbo RMIDs are above threshold + * @mbm_total: saved state for MBM total bandwidth + * @mbm_local: saved state for MBM local bandwidth + * @mbm_over: worker to periodically read MBM h/w counters + * @cqm_limbo: worker to periodically read CQM h/w counters + * @mbm_work_cpu: + * worker cpu for MBM h/w counters + * @cqm_work_cpu: + * worker cpu for CQM h/w counters + * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) + * @new_ctrl: new ctrl value to be loaded + * @have_new_ctrl: did user provide new_ctrl for this domain + */ +struct rdt_domain { + struct list_head list; + int id; + struct cpumask cpu_mask; + unsigned long *rmid_busy_llc; + struct mbm_state *mbm_total; + struct mbm_state *mbm_local; + struct delayed_work mbm_over; + struct delayed_work cqm_limbo; + int mbm_work_cpu; + int cqm_work_cpu; + u32 *ctrl_val; + u32 new_ctrl; + bool have_new_ctrl; +}; + +/** + * struct msr_param - set a range of MSRs from a domain + * @res: The resource to use + * @low: Beginning index from base MSR + * @high: End index + */ +struct msr_param { + struct rdt_resource *res; + int low; + int high; +}; + +/** + * struct rdt_cache - Cache allocation related data + * @cbm_len: Length of the cache bit mask + * @min_cbm_bits: Minimum number of consecutive bits to be set + * @cbm_idx_mult: Multiplier of CBM index + * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: + * closid * cbm_idx_multi + cbm_idx_offset + * in a cache bit mask + * @shareable_bits: Bitmask of shareable resource with other + * executing entities + */ +struct rdt_cache { + unsigned int cbm_len; + unsigned int min_cbm_bits; + unsigned int cbm_idx_mult; + unsigned int cbm_idx_offset; + unsigned int shareable_bits; +}; + +/** + * struct rdt_membw - Memory bandwidth allocation related data + * @max_delay: Max throttle delay. Delay is the hardware + * representation for memory bandwidth. + * @min_bw: Minimum memory bandwidth percentage user can request + * @bw_gran: Granularity at which the memory bandwidth is allocated + * @delay_linear: True if memory B/W delay is in linear scale + * @mb_map: Mapping of memory B/W percentage to memory B/W delay + */ +struct rdt_membw { + u32 max_delay; + u32 min_bw; + u32 bw_gran; + u32 delay_linear; + u32 *mb_map; +}; + +static inline bool is_llc_occupancy_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); +} + +static inline bool is_mbm_total_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); +} + +static inline bool is_mbm_local_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); +} + +static inline bool is_mbm_enabled(void) +{ + return (is_mbm_total_enabled() || is_mbm_local_enabled()); +} + +static inline bool is_mbm_event(int e) +{ + return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && + e <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +/** + * struct rdt_resource - attributes of an RDT resource + * @rid: The index of the resource + * @alloc_enabled: Is allocation enabled on this machine + * @mon_enabled: Is monitoring enabled for this feature + * @alloc_capable: Is allocation available on this machine + * @mon_capable: Is monitor feature available on this machine + * @name: Name to use in "schemata" file + * @num_closid: Number of CLOSIDs available + * @cache_level: Which cache level defines scope of this resource + * @default_ctrl: Specifies default cache cbm or memory B/W percent. + * @msr_base: Base MSR address for CBMs + * @msr_update: Function pointer to update QOS MSRs + * @data_width: Character width of data when displaying + * @domains: All domains for this resource + * @cache: Cache allocation related data + * @format_str: Per resource format string to show domain value + * @parse_ctrlval: Per resource function pointer to parse control values + * @evt_list: List of monitoring events + * @num_rmid: Number of RMIDs available + * @mon_scale: cqm counter * mon_scale = occupancy in bytes + * @fflags: flags to choose base and info files + */ +struct rdt_resource { + int rid; + bool alloc_enabled; + bool mon_enabled; + bool alloc_capable; + bool mon_capable; + char *name; + int num_closid; + int cache_level; + u32 default_ctrl; + unsigned int msr_base; + void (*msr_update) (struct rdt_domain *d, struct msr_param *m, + struct rdt_resource *r); + int data_width; + struct list_head domains; + struct rdt_cache cache; + struct rdt_membw membw; + const char *format_str; + int (*parse_ctrlval) (char *buf, struct rdt_resource *r, + struct rdt_domain *d); + struct list_head evt_list; + int num_rmid; + unsigned int mon_scale; + unsigned long fflags; +}; + +int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); +int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); + +extern struct mutex rdtgroup_mutex; + +extern struct rdt_resource rdt_resources_all[]; +extern struct rdtgroup rdtgroup_default; +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); + +int __init rdtgroup_init(void); + +enum { + RDT_RESOURCE_L3, + RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE, + RDT_RESOURCE_L2, + RDT_RESOURCE_MBA, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + +#define for_each_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_capable || r->mon_capable) + +#define for_each_alloc_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_capable) + +#define for_each_mon_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->mon_capable) + +#define for_each_alloc_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_enabled) + +#define for_each_mon_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->mon_enabled) + +/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ +union cpuid_0x10_1_eax { + struct { + unsigned int cbm_len:5; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ +union cpuid_0x10_3_eax { + struct { + unsigned int max_delay:12; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID).EDX */ +union cpuid_0x10_x_edx { + struct { + unsigned int cos_max:16; + } split; + unsigned int full; +}; + +void rdt_ctrl_update(void *arg); +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); +void rdtgroup_kn_unlock(struct kernfs_node *kn); +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos); +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); +int alloc_rmid(void); +void free_rmid(u32 rmid); +int rdt_get_mon_l3_config(struct rdt_resource *r); +void mon_event_count(void *info); +int rdtgroup_mondata_show(struct seq_file *m, void *arg); +void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + unsigned int dom_id); +void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d); +void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, + struct rdtgroup *rdtgrp, int evtid, int first); +void mbm_setup_overflow_handler(struct rdt_domain *dom, + unsigned long delay_ms); +void mbm_handle_overflow(struct work_struct *work); +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); +void cqm_handle_limbo(struct work_struct *work); +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); +void __check_limbo(struct rdt_domain *d, bool force_free); + +#endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 406d7a6532f9..f6ea94f8954a 100644 --- a/arch/x86/kernel/cpu/intel_rdt_schemata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -26,7 +26,7 @@ #include <linux/kernfs.h> #include <linux/seq_file.h> #include <linux/slab.h> -#include <asm/intel_rdt.h> +#include "intel_rdt.h" /* * Check whether MBA bandwidth percentage value is correct. The value is @@ -192,7 +192,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, int closid) { struct rdt_resource *r; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { if (!strcmp(resname, r->name) && closid < r->num_closid) return parse_line(tok, r); } @@ -221,7 +221,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, closid = rdtgrp->closid; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { list_for_each_entry(dom, &r->domains, list) dom->have_new_ctrl = false; } @@ -237,7 +237,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, goto out; } - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { ret = update_domains(r, closid); if (ret) goto out; @@ -269,12 +269,13 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, { struct rdtgroup *rdtgrp; struct rdt_resource *r; - int closid, ret = 0; + int ret = 0; + u32 closid; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (rdtgrp) { closid = rdtgrp->closid; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { if (closid < r->num_closid) show_doms(s, r, closid); } @@ -284,3 +285,57 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, rdtgroup_kn_unlock(of->kn); return ret; } + +void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, + struct rdtgroup *rdtgrp, int evtid, int first) +{ + /* + * setup the parameters to send to the IPI to read the data. + */ + rr->rgrp = rdtgrp; + rr->evtid = evtid; + rr->d = d; + rr->val = 0; + rr->first = first; + + smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); +} + +int rdtgroup_mondata_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + u32 resid, evtid, domid; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + union mon_data_bits md; + struct rdt_domain *d; + struct rmid_read rr; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + md.priv = of->kn->priv; + resid = md.u.rid; + domid = md.u.domid; + evtid = md.u.evtid; + + r = &rdt_resources_all[resid]; + d = rdt_find_domain(r, domid, NULL); + if (!d) { + ret = -ENOENT; + goto out; + } + + mon_event_read(&rr, d, rdtgrp, evtid, false); + + if (rr.val & RMID_VAL_ERROR) + seq_puts(m, "Error\n"); + else if (rr.val & RMID_VAL_UNAVAIL) + seq_puts(m, "Unavailable\n"); + else + seq_printf(m, "%llu\n", rr.val * r->mon_scale); + +out: + rdtgroup_kn_unlock(of->kn); + return ret; +} diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c new file mode 100644 index 000000000000..30827510094b --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -0,0 +1,499 @@ +/* + * Resource Director Technology(RDT) + * - Monitoring code + * + * Copyright (C) 2017 Intel Corporation + * + * Author: + * Vikas Shivappa <vikas.shivappa@intel.com> + * + * This replaces the cqm.c based on perf but we reuse a lot of + * code and datastructures originally from Peter Zijlstra and Matt Fleming. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <asm/cpu_device_id.h> +#include "intel_rdt.h" + +#define MSR_IA32_QM_CTR 0x0c8e +#define MSR_IA32_QM_EVTSEL 0x0c8d + +struct rmid_entry { + u32 rmid; + int busy; + struct list_head list; +}; + +/** + * @rmid_free_lru A least recently used list of free RMIDs + * These RMIDs are guaranteed to have an occupancy less than the + * threshold occupancy + */ +static LIST_HEAD(rmid_free_lru); + +/** + * @rmid_limbo_count count of currently unused but (potentially) + * dirty RMIDs. + * This counts RMIDs that no one is currently using but that + * may have a occupancy value > intel_cqm_threshold. User can change + * the threshold occupancy value. + */ +unsigned int rmid_limbo_count; + +/** + * @rmid_entry - The entry in the limbo and free lists. + */ +static struct rmid_entry *rmid_ptrs; + +/* + * Global boolean for rdt_monitor which is true if any + * resource monitoring is enabled. + */ +bool rdt_mon_capable; + +/* + * Global to indicate which monitoring events are enabled. + */ +unsigned int rdt_mon_features; + +/* + * This is the threshold cache occupancy at which we will consider an + * RMID available for re-allocation. + */ +unsigned int intel_cqm_threshold; + +static inline struct rmid_entry *__rmid_entry(u32 rmid) +{ + struct rmid_entry *entry; + + entry = &rmid_ptrs[rmid]; + WARN_ON(entry->rmid != rmid); + + return entry; +} + +static u64 __rmid_read(u32 rmid, u32 eventid) +{ + u64 val; + + /* + * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured + * with a valid event code for supported resource type and the bits + * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, + * IA32_QM_CTR.data (bits 61:0) reports the monitored data. + * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) + * are error bits. + */ + wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); + rdmsrl(MSR_IA32_QM_CTR, val); + + return val; +} + +static bool rmid_dirty(struct rmid_entry *entry) +{ + u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + + return val >= intel_cqm_threshold; +} + +/* + * Check the RMIDs that are marked as busy for this domain. If the + * reported LLC occupancy is below the threshold clear the busy bit and + * decrement the count. If the busy count gets to zero on an RMID, we + * free the RMID + */ +void __check_limbo(struct rdt_domain *d, bool force_free) +{ + struct rmid_entry *entry; + struct rdt_resource *r; + u32 crmid = 1, nrmid; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + /* + * Skip RMID 0 and start from RMID 1 and check all the RMIDs that + * are marked as busy for occupancy < threshold. If the occupancy + * is less than the threshold decrement the busy counter of the + * RMID and move it to the free list when the counter reaches 0. + */ + for (;;) { + nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); + if (nrmid >= r->num_rmid) + break; + + entry = __rmid_entry(nrmid); + if (force_free || !rmid_dirty(entry)) { + clear_bit(entry->rmid, d->rmid_busy_llc); + if (!--entry->busy) { + rmid_limbo_count--; + list_add_tail(&entry->list, &rmid_free_lru); + } + } + crmid = nrmid + 1; + } +} + +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) +{ + return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; +} + +/* + * As of now the RMIDs allocation is global. + * However we keep track of which packages the RMIDs + * are used to optimize the limbo list management. + */ +int alloc_rmid(void) +{ + struct rmid_entry *entry; + + lockdep_assert_held(&rdtgroup_mutex); + + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? -EBUSY : -ENOSPC; + + entry = list_first_entry(&rmid_free_lru, + struct rmid_entry, list); + list_del(&entry->list); + + return entry->rmid; +} + +static void add_rmid_to_limbo(struct rmid_entry *entry) +{ + struct rdt_resource *r; + struct rdt_domain *d; + int cpu; + u64 val; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + entry->busy = 0; + cpu = get_cpu(); + list_for_each_entry(d, &r->domains, list) { + if (cpumask_test_cpu(cpu, &d->cpu_mask)) { + val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + if (val <= intel_cqm_threshold) + continue; + } + + /* + * For the first limbo RMID in the domain, + * setup up the limbo worker. + */ + if (!has_busy_rmid(r, d)) + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); + set_bit(entry->rmid, d->rmid_busy_llc); + entry->busy++; + } + put_cpu(); + + if (entry->busy) + rmid_limbo_count++; + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +void free_rmid(u32 rmid) +{ + struct rmid_entry *entry; + + if (!rmid) + return; + + lockdep_assert_held(&rdtgroup_mutex); + + entry = __rmid_entry(rmid); + + if (is_llc_occupancy_enabled()) + add_rmid_to_limbo(entry); + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +static int __mon_event_count(u32 rmid, struct rmid_read *rr) +{ + u64 chunks, shift, tval; + struct mbm_state *m; + + tval = __rmid_read(rmid, rr->evtid); + if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { + rr->val = tval; + return -EINVAL; + } + switch (rr->evtid) { + case QOS_L3_OCCUP_EVENT_ID: + rr->val += tval; + return 0; + case QOS_L3_MBM_TOTAL_EVENT_ID: + m = &rr->d->mbm_total[rmid]; + break; + case QOS_L3_MBM_LOCAL_EVENT_ID: + m = &rr->d->mbm_local[rmid]; + break; + default: + /* + * Code would never reach here because + * an invalid event id would fail the __rmid_read. + */ + return -EINVAL; + } + + if (rr->first) { + m->prev_msr = tval; + m->chunks = 0; + return 0; + } + + shift = 64 - MBM_CNTR_WIDTH; + chunks = (tval << shift) - (m->prev_msr << shift); + chunks >>= shift; + m->chunks += chunks; + m->prev_msr = tval; + + rr->val += m->chunks; + return 0; +} + +/* + * This is called via IPI to read the CQM/MBM counters + * on a domain. + */ +void mon_event_count(void *info) +{ + struct rdtgroup *rdtgrp, *entry; + struct rmid_read *rr = info; + struct list_head *head; + + rdtgrp = rr->rgrp; + + if (__mon_event_count(rdtgrp->mon.rmid, rr)) + return; + + /* + * For Ctrl groups read data from child monitor groups. + */ + head = &rdtgrp->mon.crdtgrp_list; + + if (rdtgrp->type == RDTCTRL_GROUP) { + list_for_each_entry(entry, head, mon.crdtgrp_list) { + if (__mon_event_count(entry->mon.rmid, rr)) + return; + } + } +} + +static void mbm_update(struct rdt_domain *d, int rmid) +{ + struct rmid_read rr; + + rr.first = false; + rr.d = d; + + /* + * This is protected from concurrent reads from user + * as both the user and we hold the global mutex. + */ + if (is_mbm_total_enabled()) { + rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; + __mon_event_count(rmid, &rr); + } + if (is_mbm_local_enabled()) { + rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; + __mon_event_count(rmid, &rr); + } +} + +/* + * Handler to scan the limbo list and move the RMIDs + * to free list whose occupancy < threshold_occupancy. + */ +void cqm_handle_limbo(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); + int cpu = smp_processor_id(); + struct rdt_resource *r; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + d = get_domain_from_cpu(cpu, r); + + if (!d) { + pr_warn_once("Failure to get domain for limbo worker\n"); + goto out_unlock; + } + + __check_limbo(d, false); + + if (has_busy_rmid(r, d)) + schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + struct rdt_resource *r; + int cpu; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + cpu = cpumask_any(&dom->cpu_mask); + dom->cqm_work_cpu = cpu; + + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); +} + +void mbm_handle_overflow(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); + struct rdtgroup *prgrp, *crgrp; + int cpu = smp_processor_id(); + struct list_head *head; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + if (!static_branch_likely(&rdt_enable_key)) + goto out_unlock; + + d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); + if (!d) + goto out_unlock; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + mbm_update(d, prgrp->mon.rmid); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) + mbm_update(d, crgrp->mon.rmid); + } + + schedule_delayed_work_on(cpu, &d->mbm_over, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + int cpu; + + if (!static_branch_likely(&rdt_enable_key)) + return; + cpu = cpumask_any(&dom->cpu_mask); + dom->mbm_work_cpu = cpu; + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); +} + +static int dom_data_init(struct rdt_resource *r) +{ + struct rmid_entry *entry = NULL; + int i, nr_rmids; + + nr_rmids = r->num_rmid; + rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); + if (!rmid_ptrs) + return -ENOMEM; + + for (i = 0; i < nr_rmids; i++) { + entry = &rmid_ptrs[i]; + INIT_LIST_HEAD(&entry->list); + + entry->rmid = i; + list_add_tail(&entry->list, &rmid_free_lru); + } + + /* + * RMID 0 is special and is always allocated. It's used for all + * tasks that are not monitored. + */ + entry = __rmid_entry(0); + list_del(&entry->list); + + return 0; +} + +static struct mon_evt llc_occupancy_event = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, +}; + +static struct mon_evt mbm_total_event = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, +}; + +static struct mon_evt mbm_local_event = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, +}; + +/* + * Initialize the event list for the resource. + * + * Note that MBM events are also part of RDT_RESOURCE_L3 resource + * because as per the SDM the total and local memory bandwidth + * are enumerated as part of L3 monitoring. + */ +static void l3_mon_evt_init(struct rdt_resource *r) +{ + INIT_LIST_HEAD(&r->evt_list); + + if (is_llc_occupancy_enabled()) + list_add_tail(&llc_occupancy_event.list, &r->evt_list); + if (is_mbm_total_enabled()) + list_add_tail(&mbm_total_event.list, &r->evt_list); + if (is_mbm_local_enabled()) + list_add_tail(&mbm_local_event.list, &r->evt_list); +} + +int rdt_get_mon_l3_config(struct rdt_resource *r) +{ + int ret; + + r->mon_scale = boot_cpu_data.x86_cache_occ_scale; + r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + + /* + * A reasonable upper limit on the max threshold is the number + * of lines tagged per RMID if all RMIDs have the same number of + * lines tagged in the LLC. + * + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. + */ + intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid; + + /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ + intel_cqm_threshold /= r->mon_scale; + + ret = dom_data_init(r); + if (ret) + return ret; + + l3_mon_evt_init(r); + + r->mon_capable = true; + r->mon_enabled = true; + + return 0; +} diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 9257bd9dc664..a869d4a073c5 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -32,17 +32,25 @@ #include <uapi/linux/magic.h> -#include <asm/intel_rdt.h> -#include <asm/intel_rdt_common.h> +#include <asm/intel_rdt_sched.h> +#include "intel_rdt.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); -struct kernfs_root *rdt_root; +DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); +DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); /* Kernel fs node for "info" directory under root */ static struct kernfs_node *kn_info; +/* Kernel fs node for "mon_groups" directory under root */ +static struct kernfs_node *kn_mongrp; + +/* Kernel fs node for "mon_data" directory under root */ +static struct kernfs_node *kn_mondata; + /* * Trivial allocator for CLOSIDs. Since h/w only supports a small number, * we can keep a bitmap of free CLOSIDs in a single integer. @@ -66,7 +74,7 @@ static void closid_init(void) int rdt_min_closid = 32; /* Compute rdt_min_closid across all resources */ - for_each_enabled_rdt_resource(r) + for_each_alloc_enabled_rdt_resource(r) rdt_min_closid = min(rdt_min_closid, r->num_closid); closid_free_map = BIT_MASK(rdt_min_closid) - 1; @@ -75,9 +83,9 @@ static void closid_init(void) closid_free_map &= ~1; } -int closid_alloc(void) +static int closid_alloc(void) { - int closid = ffs(closid_free_map); + u32 closid = ffs(closid_free_map); if (closid == 0) return -ENOSPC; @@ -125,28 +133,6 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) return 0; } -static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts, - int len) -{ - struct rftype *rft; - int ret; - - lockdep_assert_held(&rdtgroup_mutex); - - for (rft = rfts; rft < rfts + len; rft++) { - ret = rdtgroup_add_file(kn, rft); - if (ret) - goto error; - } - - return 0; -error: - pr_warn("Failed to add %s, err=%d\n", rft->name, ret); - while (--rft >= rfts) - kernfs_remove_by_name(kn, rft->name); - return ret; -} - static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; @@ -174,6 +160,11 @@ static struct kernfs_ops rdtgroup_kf_single_ops = { .seq_show = rdtgroup_seqfile_show, }; +static struct kernfs_ops kf_mondata_ops = { + .atomic_write_len = PAGE_SIZE, + .seq_show = rdtgroup_mondata_show, +}; + static bool is_cpu_list(struct kernfs_open_file *of) { struct rftype *rft = of->kn->priv; @@ -203,13 +194,18 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, /* * This is safe against intel_rdt_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call - * from rdt_update_closid() is proteced against __switch_to() because + * from update_closid_rmid() is proteced against __switch_to() because * preemption is disabled. */ -static void rdt_update_cpu_closid(void *closid) +static void update_cpu_closid_rmid(void *info) { - if (closid) - this_cpu_write(cpu_closid, *(int *)closid); + struct rdtgroup *r = info; + + if (r) { + this_cpu_write(pqr_state.default_closid, r->closid); + this_cpu_write(pqr_state.default_rmid, r->mon.rmid); + } + /* * We cannot unconditionally write the MSR because the current * executing task might have its own closid selected. Just reuse @@ -221,28 +217,128 @@ static void rdt_update_cpu_closid(void *closid) /* * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, * - * Per task closids must have been set up before calling this function. - * - * The per cpu closids are updated with the smp function call, when @closid - * is not NULL. If @closid is NULL then all affected percpu closids must - * have been set up before calling this function. + * Per task closids/rmids must have been set up before calling this function. */ static void -rdt_update_closid(const struct cpumask *cpu_mask, int *closid) +update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) { int cpu = get_cpu(); if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_update_cpu_closid(closid); - smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1); + update_cpu_closid_rmid(r); + smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1); put_cpu(); } +static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; + struct list_head *head; + + /* Check whether cpus belong to parent ctrl group */ + cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); + if (cpumask_weight(tmpmask)) + return -EINVAL; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Give any dropped cpus to parent rdtgroup */ + cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); + update_closid_rmid(tmpmask, prgrp); + } + + /* + * If we added cpus, remove them from previous group that owned them + * and update per-cpu rmid + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + if (crgrp == rdtgrp) + continue; + cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, + tmpmask); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + return 0; +} + +static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) +{ + struct rdtgroup *crgrp; + + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); + /* update the child mon group masks as well*/ + list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) + cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); +} + +static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask, cpumask_var_t tmpmask1) +{ + struct rdtgroup *r, *crgrp; + struct list_head *head; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Can't drop from default group */ + if (rdtgrp == &rdtgroup_default) + return -EINVAL; + + /* Give any dropped cpus to rdtgroup_default */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, tmpmask); + update_closid_rmid(tmpmask, &rdtgroup_default); + } + + /* + * If we added cpus, remove them from previous group and + * the prev group's child groups that owned them + * and update per-cpu closid/rmid. + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { + if (r == rdtgrp) + continue; + cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); + if (cpumask_weight(tmpmask1)) + cpumask_rdtgrp_clear(r, tmpmask1); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + /* + * Clear child mon group masks since there is a new parent mask + * now and update the rmid for the cpus the child lost. + */ + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); + update_closid_rmid(tmpmask, rdtgrp); + cpumask_clear(&crgrp->cpu_mask); + } + + return 0; +} + static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - cpumask_var_t tmpmask, newmask; - struct rdtgroup *rdtgrp, *r; + cpumask_var_t tmpmask, newmask, tmpmask1; + struct rdtgroup *rdtgrp; int ret; if (!buf) @@ -254,6 +350,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, free_cpumask_var(tmpmask); return -ENOMEM; } + if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + return -ENOMEM; + } rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -276,41 +377,18 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, goto unlock; } - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { - /* Can't drop from default group */ - if (rdtgrp == &rdtgroup_default) { - ret = -EINVAL; - goto unlock; - } - /* Give any dropped cpus to rdtgroup_default */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, tmpmask); - rdt_update_closid(tmpmask, &rdtgroup_default.closid); - } - - /* - * If we added cpus, remove them from previous group that owned them - * and update per-cpu closid - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { - list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { - if (r == rdtgrp) - continue; - cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); - } - rdt_update_closid(tmpmask, &rdtgrp->closid); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); + if (rdtgrp->type == RDTCTRL_GROUP) + ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); + else if (rdtgrp->type == RDTMON_GROUP) + ret = cpus_mon_write(rdtgrp, newmask, tmpmask); + else + ret = -EINVAL; unlock: rdtgroup_kn_unlock(of->kn); free_cpumask_var(tmpmask); free_cpumask_var(newmask); + free_cpumask_var(tmpmask1); return ret ?: nbytes; } @@ -336,6 +414,7 @@ static void move_myself(struct callback_head *head) if (atomic_dec_and_test(&rdtgrp->waitcount) && (rdtgrp->flags & RDT_DELETED)) { current->closid = 0; + current->rmid = 0; kfree(rdtgrp); } @@ -374,7 +453,20 @@ static int __rdtgroup_move_task(struct task_struct *tsk, atomic_dec(&rdtgrp->waitcount); kfree(callback); } else { - tsk->closid = rdtgrp->closid; + /* + * For ctrl_mon groups move both closid and rmid. + * For monitor groups, can move the tasks only from + * their parent CTRL group. + */ + if (rdtgrp->type == RDTCTRL_GROUP) { + tsk->closid = rdtgrp->closid; + tsk->rmid = rdtgrp->mon.rmid; + } else if (rdtgrp->type == RDTMON_GROUP) { + if (rdtgrp->mon.parent->closid == tsk->closid) + tsk->rmid = rdtgrp->mon.rmid; + else + ret = -EINVAL; + } } return ret; } @@ -454,7 +546,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) rcu_read_lock(); for_each_process_thread(p, t) { - if (t->closid == r->closid) + if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || + (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) seq_printf(s, "%d\n", t->pid); } rcu_read_unlock(); @@ -476,39 +569,6 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, return ret; } -/* Files in each rdtgroup */ -static struct rftype rdtgroup_base_files[] = { - { - .name = "cpus", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - }, - { - .name = "cpus_list", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .flags = RFTYPE_FLAGS_CPUS_LIST, - }, - { - .name = "tasks", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_tasks_write, - .seq_show = rdtgroup_tasks_show, - }, - { - .name = "schemata", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_schemata_write, - .seq_show = rdtgroup_schemata_show, - }, -}; - static int rdt_num_closids_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -536,6 +596,15 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, return 0; } +static int rdt_shareable_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%x\n", r->cache.shareable_bits); + return 0; +} + static int rdt_min_bw_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -545,6 +614,28 @@ static int rdt_min_bw_show(struct kernfs_open_file *of, return 0; } +static int rdt_num_rmids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->num_rmid); + + return 0; +} + +static int rdt_mon_features_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + struct mon_evt *mevt; + + list_for_each_entry(mevt, &r->evt_list, list) + seq_printf(seq, "%s\n", mevt->name); + + return 0; +} + static int rdt_bw_gran_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -563,74 +654,200 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, return 0; } +static int max_threshold_occ_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale); + + return 0; +} + +static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdt_resource *r = of->kn->parent->priv; + unsigned int bytes; + int ret; + + ret = kstrtouint(buf, 0, &bytes); + if (ret) + return ret; + + if (bytes > (boot_cpu_data.x86_cache_size * 1024)) + return -EINVAL; + + intel_cqm_threshold = bytes / r->mon_scale; + + return nbytes; +} + /* rdtgroup information files for one cache resource. */ -static struct rftype res_cache_info_files[] = { +static struct rftype res_common_files[] = { { .name = "num_closids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_num_closids_show, + .fflags = RF_CTRL_INFO, + }, + { + .name = "mon_features", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_mon_features_show, + .fflags = RF_MON_INFO, + }, + { + .name = "num_rmids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_rmids_show, + .fflags = RF_MON_INFO, }, { .name = "cbm_mask", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_default_ctrl_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_cbm_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_cbm_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, -}; - -/* rdtgroup information files for memory bandwidth. */ -static struct rftype res_mba_info_files[] = { { - .name = "num_closids", + .name = "shareable_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_closids_show, + .seq_show = rdt_shareable_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_bandwidth", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_bw_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "bandwidth_gran", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_bw_gran_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "delay_linear", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_delay_linear_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + }, + { + .name = "max_threshold_occupancy", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = max_threshold_occ_write, + .seq_show = max_threshold_occ_show, + .fflags = RF_MON_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "cpus", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "cpus_list", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .flags = RFTYPE_FLAGS_CPUS_LIST, + .fflags = RFTYPE_BASE, + }, + { + .name = "tasks", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_tasks_write, + .seq_show = rdtgroup_tasks_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "schemata", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_schemata_write, + .seq_show = rdtgroup_schemata_show, + .fflags = RF_CTRL_BASE, }, }; -void rdt_get_mba_infofile(struct rdt_resource *r) +static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) { - r->info_files = res_mba_info_files; - r->nr_info_files = ARRAY_SIZE(res_mba_info_files); + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + lockdep_assert_held(&rdtgroup_mutex); + + for (rft = rfts; rft < rfts + len; rft++) { + if ((fflags & rft->fflags) == rft->fflags) { + ret = rdtgroup_add_file(kn, rft); + if (ret) + goto error; + } + } + + return 0; +error: + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); + while (--rft >= rfts) { + if ((fflags & rft->fflags) == rft->fflags) + kernfs_remove_by_name(kn, rft->name); + } + return ret; } -void rdt_get_cache_infofile(struct rdt_resource *r) +static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, + unsigned long fflags) { - r->info_files = res_cache_info_files; - r->nr_info_files = ARRAY_SIZE(res_cache_info_files); + struct kernfs_node *kn_subdir; + int ret; + + kn_subdir = kernfs_create_dir(kn_info, name, + kn_info->mode, r); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + kernfs_get(kn_subdir); + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + ret = rdtgroup_add_files(kn_subdir, fflags); + if (!ret) + kernfs_activate(kn_subdir); + + return ret; } static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) { - struct kernfs_node *kn_subdir; - struct rftype *res_info_files; struct rdt_resource *r; - int ret, len; + unsigned long fflags; + char name[32]; + int ret; /* create the directory */ kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); @@ -638,25 +855,19 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) return PTR_ERR(kn_info); kernfs_get(kn_info); - for_each_enabled_rdt_resource(r) { - kn_subdir = kernfs_create_dir(kn_info, r->name, - kn_info->mode, r); - if (IS_ERR(kn_subdir)) { - ret = PTR_ERR(kn_subdir); - goto out_destroy; - } - kernfs_get(kn_subdir); - ret = rdtgroup_kn_set_ugid(kn_subdir); + for_each_alloc_enabled_rdt_resource(r) { + fflags = r->fflags | RF_CTRL_INFO; + ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags); if (ret) goto out_destroy; + } - res_info_files = r->info_files; - len = r->nr_info_files; - - ret = rdtgroup_add_files(kn_subdir, res_info_files, len); + for_each_mon_enabled_rdt_resource(r) { + fflags = r->fflags | RF_MON_INFO; + sprintf(name, "%s_MON", r->name); + ret = rdtgroup_mkdir_info_resdir(r, name, fflags); if (ret) goto out_destroy; - kernfs_activate(kn_subdir); } /* @@ -678,6 +889,39 @@ out_destroy: return ret; } +static int +mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, + char *name, struct kernfs_node **dest_kn) +{ + struct kernfs_node *kn; + int ret; + + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + if (dest_kn) + *dest_kn = kn; + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that @rdtgrp->kn is always accessible. + */ + kernfs_get(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + kernfs_activate(kn); + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} static void l3_qos_cfg_update(void *arg) { bool *enable = arg; @@ -718,14 +962,15 @@ static int cdp_enable(void) struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; int ret; - if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable) + if (!r_l3->alloc_capable || !r_l3data->alloc_capable || + !r_l3code->alloc_capable) return -EINVAL; ret = set_l3_qos_cfg(r_l3, true); if (!ret) { - r_l3->enabled = false; - r_l3data->enabled = true; - r_l3code->enabled = true; + r_l3->alloc_enabled = false; + r_l3data->alloc_enabled = true; + r_l3code->alloc_enabled = true; } return ret; } @@ -734,11 +979,11 @@ static void cdp_disable(void) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; - r->enabled = r->capable; + r->alloc_enabled = r->alloc_capable; - if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) { - rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false; - rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false; + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) { + rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false; + rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false; set_l3_qos_cfg(r, false); } } @@ -823,10 +1068,16 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) } } +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **mon_data_kn); + static struct dentry *rdt_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + struct rdt_domain *dom; + struct rdt_resource *r; struct dentry *dentry; int ret; @@ -853,15 +1104,54 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, goto out_cdp; } + if (rdt_mon_capable) { + ret = mongroup_create_dir(rdtgroup_default.kn, + NULL, "mon_groups", + &kn_mongrp); + if (ret) { + dentry = ERR_PTR(ret); + goto out_info; + } + kernfs_get(kn_mongrp); + + ret = mkdir_mondata_all(rdtgroup_default.kn, + &rdtgroup_default, &kn_mondata); + if (ret) { + dentry = ERR_PTR(ret); + goto out_mongrp; + } + kernfs_get(kn_mondata); + rdtgroup_default.mon.mon_data_kn = kn_mondata; + } + dentry = kernfs_mount(fs_type, flags, rdt_root, RDTGROUP_SUPER_MAGIC, NULL); if (IS_ERR(dentry)) - goto out_destroy; + goto out_mondata; + + if (rdt_alloc_capable) + static_branch_enable(&rdt_alloc_enable_key); + if (rdt_mon_capable) + static_branch_enable(&rdt_mon_enable_key); + + if (rdt_alloc_capable || rdt_mon_capable) + static_branch_enable(&rdt_enable_key); + + if (is_mbm_enabled()) { + r = &rdt_resources_all[RDT_RESOURCE_L3]; + list_for_each_entry(dom, &r->domains, list) + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); + } - static_branch_enable(&rdt_enable_key); goto out; -out_destroy: +out_mondata: + if (rdt_mon_capable) + kernfs_remove(kn_mondata); +out_mongrp: + if (rdt_mon_capable) + kernfs_remove(kn_mongrp); +out_info: kernfs_remove(kn_info); out_cdp: cdp_disable(); @@ -909,6 +1199,18 @@ static int reset_all_ctrls(struct rdt_resource *r) return 0; } +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_alloc_capable && + (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_mon_capable && + (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); +} + /* * Move tasks from one to the other group. If @from is NULL, then all tasks * in the systems are moved unconditionally (used for teardown). @@ -924,8 +1226,11 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, read_lock(&tasklist_lock); for_each_process_thread(p, t) { - if (!from || t->closid == from->closid) { + if (!from || is_closid_match(t, from) || + is_rmid_match(t, from)) { t->closid = to->closid; + t->rmid = to->mon.rmid; + #ifdef CONFIG_SMP /* * This is safe on x86 w/o barriers as the ordering @@ -944,6 +1249,19 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, read_unlock(&tasklist_lock); } +static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) +{ + struct rdtgroup *sentry, *stmp; + struct list_head *head; + + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + free_rmid(sentry->mon.rmid); + list_del(&sentry->mon.crdtgrp_list); + kfree(sentry); + } +} + /* * Forcibly remove all of subdirectories under root. */ @@ -955,6 +1273,9 @@ static void rmdir_all_sub(void) rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { + /* Free any child rmids */ + free_all_child_rdtgrp(rdtgrp); + /* Remove each rdtgroup other than root */ if (rdtgrp == &rdtgroup_default) continue; @@ -967,16 +1288,20 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + free_rmid(rdtgrp->mon.rmid); + kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->rdtgroup_list); kfree(rdtgrp); } /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ get_online_cpus(); - rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid); + update_closid_rmid(cpu_online_mask, &rdtgroup_default); put_online_cpus(); kernfs_remove(kn_info); + kernfs_remove(kn_mongrp); + kernfs_remove(kn_mondata); } static void rdt_kill_sb(struct super_block *sb) @@ -986,10 +1311,12 @@ static void rdt_kill_sb(struct super_block *sb) mutex_lock(&rdtgroup_mutex); /*Put everything back to default values. */ - for_each_enabled_rdt_resource(r) + for_each_alloc_enabled_rdt_resource(r) reset_all_ctrls(r); cdp_disable(); rmdir_all_sub(); + static_branch_disable(&rdt_alloc_enable_key); + static_branch_disable(&rdt_mon_enable_key); static_branch_disable(&rdt_enable_key); kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); @@ -1001,46 +1328,223 @@ static struct file_system_type rdt_fs_type = { .kill_sb = rdt_kill_sb, }; -static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +static int mon_addfile(struct kernfs_node *parent_kn, const char *name, + void *priv) { - struct rdtgroup *parent, *rdtgrp; struct kernfs_node *kn; - int ret, closid; + int ret = 0; - /* Only allow mkdir in the root directory */ - if (parent_kn != rdtgroup_default.kn) - return -EPERM; + kn = __kernfs_create_file(parent_kn, name, 0444, 0, + &kf_mondata_ops, priv, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); - /* Do not accept '\n' to avoid unparsable situation. */ - if (strchr(name, '\n')) - return -EINVAL; + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } - parent = rdtgroup_kn_lock_live(parent_kn); - if (!parent) { - ret = -ENODEV; - goto out_unlock; + return ret; +} + +/* + * Remove all subdirectories of mon_data of ctrl_mon groups + * and monitor groups with given domain id. + */ +void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id) +{ + struct rdtgroup *prgrp, *crgrp; + char name[32]; + + if (!r->mon_enabled) + return; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + sprintf(name, "mon_%s_%02d", r->name, dom_id); + kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); } +} - ret = closid_alloc(); - if (ret < 0) +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, + struct rdt_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + union mon_data_bits priv; + struct kernfs_node *kn; + struct mon_evt *mevt; + struct rmid_read rr; + char name[32]; + int ret; + + sprintf(name, "mon_%s_%02d", r->name, d->id); + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that kn is always accessible. + */ + kernfs_get(kn); + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + if (WARN_ON(list_empty(&r->evt_list))) { + ret = -EPERM; + goto out_destroy; + } + + priv.u.rid = r->rid; + priv.u.domid = d->id; + list_for_each_entry(mevt, &r->evt_list, list) { + priv.u.evtid = mevt->evtid; + ret = mon_addfile(kn, mevt->name, priv.priv); + if (ret) + goto out_destroy; + + if (is_mbm_event(mevt->evtid)) + mon_event_read(&rr, d, prgrp, mevt->evtid, true); + } + kernfs_activate(kn); + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +/* + * Add all subdirectories of mon_data for "ctrl_mon" groups + * and "monitor" groups with given domain id. + */ +void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d) +{ + struct kernfs_node *parent_kn; + struct rdtgroup *prgrp, *crgrp; + struct list_head *head; + + if (!r->mon_enabled) + return; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + parent_kn = prgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, prgrp); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + parent_kn = crgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, crgrp); + } + } +} + +static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, + struct rdt_resource *r, + struct rdtgroup *prgrp) +{ + struct rdt_domain *dom; + int ret; + + list_for_each_entry(dom, &r->domains, list) { + ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); + if (ret) + return ret; + } + + return 0; +} + +/* + * This creates a directory mon_data which contains the monitored data. + * + * mon_data has one directory for each domain whic are named + * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data + * with L3 domain looks as below: + * ./mon_data: + * mon_L3_00 + * mon_L3_01 + * mon_L3_02 + * ... + * + * Each domain directory has one file per event: + * ./mon_L3_00/: + * llc_occupancy + * + */ +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **dest_kn) +{ + struct rdt_resource *r; + struct kernfs_node *kn; + int ret; + + /* + * Create the mon_data directory first. + */ + ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn); + if (ret) + return ret; + + if (dest_kn) + *dest_kn = kn; + + /* + * Create the subdirectories for each domain. Note that all events + * in a domain like L3 are grouped into a resource whose domain is L3 + */ + for_each_mon_enabled_rdt_resource(r) { + ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); + if (ret) + goto out_destroy; + } + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, umode_t mode, + enum rdt_group_type rtype, struct rdtgroup **r) +{ + struct rdtgroup *prdtgrp, *rdtgrp; + struct kernfs_node *kn; + uint files = 0; + int ret; + + prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); + if (!prdtgrp) { + ret = -ENODEV; goto out_unlock; - closid = ret; + } /* allocate the rdtgroup. */ rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); if (!rdtgrp) { ret = -ENOSPC; - goto out_closid_free; + goto out_unlock; } - rdtgrp->closid = closid; - list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + *r = rdtgrp; + rdtgrp->mon.parent = prdtgrp; + rdtgrp->type = rtype; + INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); /* kernfs creates the directory for rdtgrp */ - kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp); + kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); if (IS_ERR(kn)) { ret = PTR_ERR(kn); - goto out_cancel_ref; + goto out_free_rgrp; } rdtgrp->kn = kn; @@ -1056,43 +1560,211 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - ret = rdtgroup_add_files(kn, rdtgroup_base_files, - ARRAY_SIZE(rdtgroup_base_files)); + files = RFTYPE_BASE | RFTYPE_CTRL; + files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); + ret = rdtgroup_add_files(kn, files); if (ret) goto out_destroy; + if (rdt_mon_capable) { + ret = alloc_rmid(); + if (ret < 0) + goto out_destroy; + rdtgrp->mon.rmid = ret; + + ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); + if (ret) + goto out_idfree; + } kernfs_activate(kn); - ret = 0; - goto out_unlock; + /* + * The caller unlocks the prgrp_kn upon success. + */ + return 0; +out_idfree: + free_rmid(rdtgrp->mon.rmid); out_destroy: kernfs_remove(rdtgrp->kn); -out_cancel_ref: - list_del(&rdtgrp->rdtgroup_list); +out_free_rgrp: kfree(rdtgrp); -out_closid_free: - closid_free(closid); out_unlock: - rdtgroup_kn_unlock(parent_kn); + rdtgroup_kn_unlock(prgrp_kn); return ret; } -static int rdtgroup_rmdir(struct kernfs_node *kn) +static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) +{ + kernfs_remove(rgrp->kn); + free_rmid(rgrp->mon.rmid); + kfree(rgrp); +} + +/* + * Create a monitor group under "mon_groups" directory of a control + * and monitor group(ctrl_mon). This is a resource group + * to monitor a subset of tasks and cpus in its parent ctrl_mon group. + */ +static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, + umode_t mode) +{ + struct rdtgroup *rdtgrp, *prgrp; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP, + &rdtgrp); + if (ret) + return ret; + + prgrp = rdtgrp->mon.parent; + rdtgrp->closid = prgrp->closid; + + /* + * Add the rdtgrp to the list of rdtgrps the parent + * ctrl_mon group has to track. + */ + list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); + + rdtgroup_kn_unlock(prgrp_kn); + return ret; +} + +/* + * These are rdtgroups created under the root directory. Can be used + * to allocate and monitor resources. + */ +static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, umode_t mode) { - int ret, cpu, closid = rdtgroup_default.closid; struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; + struct kernfs_node *kn; + u32 closid; + int ret; - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; + ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP, + &rdtgrp); + if (ret) + return ret; - rdtgrp = rdtgroup_kn_lock_live(kn); - if (!rdtgrp) { - ret = -EPERM; - goto out; + kn = rdtgrp->kn; + ret = closid_alloc(); + if (ret < 0) + goto out_common_fail; + closid = ret; + + rdtgrp->closid = closid; + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + + if (rdt_mon_capable) { + /* + * Create an empty mon_groups directory to hold the subset + * of tasks and cpus to monitor. + */ + ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); + if (ret) + goto out_id_free; } + goto out_unlock; + +out_id_free: + closid_free(closid); + list_del(&rdtgrp->rdtgroup_list); +out_common_fail: + mkdir_rdt_prepare_clean(rdtgrp); +out_unlock: + rdtgroup_kn_unlock(prgrp_kn); + return ret; +} + +/* + * We allow creating mon groups only with in a directory called "mon_groups" + * which is present in every ctrl_mon group. Check if this is a valid + * "mon_groups" directory. + * + * 1. The directory should be named "mon_groups". + * 2. The mon group itself should "not" be named "mon_groups". + * This makes sure "mon_groups" directory always has a ctrl_mon group + * as parent. + */ +static bool is_mon_groups(struct kernfs_node *kn, const char *name) +{ + return (!strcmp(kn->name, "mon_groups") && + strcmp(name, "mon_groups")); +} + +static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + /* Do not accept '\n' to avoid unparsable situation. */ + if (strchr(name, '\n')) + return -EINVAL; + + /* + * If the parent directory is the root directory and RDT + * allocation is supported, add a control and monitoring + * subdirectory + */ + if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) + return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode); + + /* + * If RDT monitoring is supported and the parent directory is a valid + * "mon_groups" directory, add a monitoring subdirectory. + */ + if (rdt_mon_capable && is_mon_groups(parent_kn, name)) + return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode); + + return -EPERM; +} + +static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + int cpu; + + /* Give any tasks back to the parent group */ + rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); + + /* Update per cpu rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) + per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid; + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. + */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + update_closid_rmid(tmpmask, NULL); + + rdtgrp->flags = RDT_DELETED; + free_rmid(rdtgrp->mon.rmid); + + /* + * Remove the rdtgrp from the parent ctrl_mon group's list + */ + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_del(&rdtgrp->mon.crdtgrp_list); + + /* + * one extra hold on this, will drop when we kfree(rdtgrp) + * in rdtgroup_kn_unlock() + */ + kernfs_get(kn); + kernfs_remove(rdtgrp->kn); + + return 0; +} + +static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + cpumask_var_t tmpmask) +{ + int cpu; + /* Give any tasks back to the default group */ rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); @@ -1100,18 +1772,28 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - /* Update per cpu closid of the moved CPUs first */ - for_each_cpu(cpu, &rdtgrp->cpu_mask) - per_cpu(cpu_closid, cpu) = closid; + /* Update per cpu closid and rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) { + per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid; + per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid; + } + /* * Update the MSR on moved CPUs and CPUs which have moved * task running on them. */ cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - rdt_update_closid(tmpmask, NULL); + update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; closid_free(rdtgrp->closid); + free_rmid(rdtgrp->mon.rmid); + + /* + * Free all the child monitor group rmids. + */ + free_all_child_rdtgrp(rdtgrp); + list_del(&rdtgrp->rdtgroup_list); /* @@ -1120,7 +1802,41 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) */ kernfs_get(kn); kernfs_remove(rdtgrp->kn); - ret = 0; + + return 0; +} + +static int rdtgroup_rmdir(struct kernfs_node *kn) +{ + struct kernfs_node *parent_kn = kn->parent; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret = 0; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + rdtgrp = rdtgroup_kn_lock_live(kn); + if (!rdtgrp) { + ret = -EPERM; + goto out; + } + + /* + * If the rdtgroup is a ctrl_mon group and parent directory + * is the root directory, remove the ctrl_mon group. + * + * If the rdtgroup is a mon group and parent directory + * is a valid "mon_groups" directory, remove the mon group. + */ + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) + ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); + else if (rdtgrp->type == RDTMON_GROUP && + is_mon_groups(parent_kn, kn->name)) + ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); + else + ret = -EPERM; + out: rdtgroup_kn_unlock(kn); free_cpumask_var(tmpmask); @@ -1129,7 +1845,7 @@ out: static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) { - if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) seq_puts(seq, ",cdp"); return 0; } @@ -1153,10 +1869,13 @@ static int __init rdtgroup_setup_root(void) mutex_lock(&rdtgroup_mutex); rdtgroup_default.closid = 0; + rdtgroup_default.mon.rmid = 0; + rdtgroup_default.type = RDTCTRL_GROUP; + INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files, - ARRAY_SIZE(rdtgroup_base_files)); + ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE); if (ret) { kernfs_destroy_root(rdt_root); goto out; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 6dde0497efc7..3b413065c613 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -51,6 +51,7 @@ #include <asm/mce.h> #include <asm/msr.h> #include <asm/reboot.h> +#include <asm/set_memory.h> #include "mce-internal.h" @@ -1051,6 +1052,48 @@ static int do_memory_failure(struct mce *m) return ret; } +#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE) + +void arch_unmap_kpfn(unsigned long pfn) +{ + unsigned long decoy_addr; + + /* + * Unmap this page from the kernel 1:1 mappings to make sure + * we don't log more errors because of speculative access to + * the page. + * We would like to just call: + * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1); + * but doing that would radically increase the odds of a + * speculative access to the posion page because we'd have + * the virtual address of the kernel 1:1 mapping sitting + * around in registers. + * Instead we get tricky. We create a non-canonical address + * that looks just like the one we want, but has bit 63 flipped. + * This relies on set_memory_np() not checking whether we passed + * a legal address. + */ + +/* + * Build time check to see if we have a spare virtual bit. Don't want + * to leave this until run time because most developers don't have a + * system that can exercise this code path. This will only become a + * problem if/when we move beyond 5-level page tables. + * + * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD) + */ +#if PGDIR_SHIFT + 9 < 63 + decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); +#else +#error "no unused virtual bit available" +#endif + + if (set_memory_np(decoy_addr, 1)) + pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); + +} +#endif + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 9e314bcf67cc..40e28ed77fbf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -201,8 +201,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu) wrmsr(smca_config, low, high); } - /* Collect bank_info using CPU 0 for now. */ - if (cpu) + /* Return early if this bank was already initialized. */ + if (smca_banks[bank].hwid) return; if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { @@ -216,11 +216,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu) for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { s_hwid = &smca_hwid_mcatypes[i]; if (hwid_mcatype == s_hwid->hwid_mcatype) { - - WARN(smca_banks[bank].hwid, - "Bank %s already initialized!\n", - smca_get_name(s_hwid->bank_type)); - smca_banks[bank].hwid = s_hwid; smca_banks[bank].id = low; smca_banks[bank].sysfs_id = s_hwid->count++; @@ -776,24 +771,12 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) mce_log(&m); } -static inline void __smp_deferred_error_interrupt(void) -{ - inc_irq_stat(irq_deferred_error_count); - deferred_error_int_vector(); -} - asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void) { entering_irq(); - __smp_deferred_error_interrupt(); - exiting_ack_irq(); -} - -asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void) -{ - entering_irq(); trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); - __smp_deferred_error_interrupt(); + inc_irq_stat(irq_deferred_error_count); + deferred_error_int_vector(); trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); exiting_ack_irq(); } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index f7370abd33c6..2da67b70ba98 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -390,26 +390,12 @@ static void unexpected_thermal_interrupt(void) static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; -static inline void __smp_thermal_interrupt(void) -{ - inc_irq_stat(irq_thermal_count); - smp_thermal_vector(); -} - -asmlinkage __visible void __irq_entry -smp_thermal_interrupt(struct pt_regs *regs) -{ - entering_irq(); - __smp_thermal_interrupt(); - exiting_ack_irq(); -} - -asmlinkage __visible void __irq_entry -smp_trace_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *r) { entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); - __smp_thermal_interrupt(); + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); trace_thermal_apic_exit(THERMAL_APIC_VECTOR); exiting_ack_irq(); } diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index bb0e75eed10a..5e7249e42f8f 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -17,24 +17,12 @@ static void default_threshold_interrupt(void) void (*mce_threshold_vector)(void) = default_threshold_interrupt; -static inline void __smp_threshold_interrupt(void) -{ - inc_irq_stat(irq_threshold_count); - mce_threshold_vector(); -} - asmlinkage __visible void __irq_entry smp_threshold_interrupt(void) { entering_irq(); - __smp_threshold_interrupt(); - exiting_ack_irq(); -} - -asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void) -{ - entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); - __smp_threshold_interrupt(); + inc_irq_stat(irq_threshold_count); + mce_threshold_vector(); trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); exiting_ack_irq(); } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 21b185793c80..c6daec4bdba5 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -400,9 +400,12 @@ static void update_cache(struct ucode_patch *new_patch) list_for_each_entry(p, µcode_cache, plist) { if (p->equiv_cpu == new_patch->equiv_cpu) { - if (p->patch_id >= new_patch->patch_id) + if (p->patch_id >= new_patch->patch_id) { /* we already have the latest patch */ + kfree(new_patch->data); + kfree(new_patch); return; + } list_replace(&p->plist, &new_patch->plist); kfree(p->data); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 59edbe9d4ccb..8f7a9bbad514 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -146,18 +146,18 @@ static bool microcode_matches(struct microcode_header_intel *mc_header, return false; } -static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size) +static struct ucode_patch *memdup_patch(void *data, unsigned int size) { struct ucode_patch *p; p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL); if (!p) - return ERR_PTR(-ENOMEM); + return NULL; p->data = kmemdup(data, size, GFP_KERNEL); if (!p->data) { kfree(p); - return ERR_PTR(-ENOMEM); + return NULL; } return p; @@ -183,8 +183,8 @@ static void save_microcode_patch(void *data, unsigned int size) if (mc_hdr->rev <= mc_saved_hdr->rev) continue; - p = __alloc_microcode_buf(data, size); - if (IS_ERR(p)) + p = memdup_patch(data, size); + if (!p) pr_err("Error allocating buffer %p\n", data); else list_replace(&iter->plist, &p->plist); @@ -196,24 +196,25 @@ static void save_microcode_patch(void *data, unsigned int size) * newly found. */ if (!prev_found) { - p = __alloc_microcode_buf(data, size); - if (IS_ERR(p)) + p = memdup_patch(data, size); + if (!p) pr_err("Error allocating buffer for %p\n", data); else list_add_tail(&p->plist, µcode_cache); } + if (!p) + return; + /* * Save for early loading. On 32-bit, that needs to be a physical * address as the APs are running from physical addresses, before * paging has been enabled. */ - if (p) { - if (IS_ENABLED(CONFIG_X86_32)) - intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data); - else - intel_ucode_patch = p->data; - } + if (IS_ENABLED(CONFIG_X86_32)) + intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data); + else + intel_ucode_patch = p->data; } static int microcode_sanity_check(void *mc, int print_err) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 70e717fccdd6..fbafd24174af 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -59,13 +59,8 @@ void hyperv_vector_handler(struct pt_regs *regs) void hv_setup_vmbus_irq(void (*handler)(void)) { vmbus_handler = handler; - /* - * Setup the IDT for hypervisor callback. Prevent reallocation - * at module reload. - */ - if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, - hyperv_callback_vector); + /* Setup the IDT for hypervisor callback */ + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); } void hv_remove_vmbus_irq(void) @@ -184,9 +179,15 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); - pr_info("HyperV: features 0x%x, hints 0x%x\n", + pr_info("Hyper-V: features 0x%x, hints 0x%x\n", ms_hyperv.features, ms_hyperv.hints); + ms_hyperv.max_vp_index = cpuid_eax(HVCPUID_IMPLEMENTATION_LIMITS); + ms_hyperv.max_lp_index = cpuid_ebx(HVCPUID_IMPLEMENTATION_LIMITS); + + pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n", + ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); + /* * Extract host information. */ @@ -219,7 +220,7 @@ static void __init ms_hyperv_init_platform(void) rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); lapic_timer_frequency = hv_lapic_frequency; - pr_info("HyperV: LAPIC Timer Frequency: %#x\n", + pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n", lapic_timer_frequency); } @@ -253,7 +254,7 @@ static void __init ms_hyperv_init_platform(void) } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { - .name = "Microsoft HyperV", + .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, .init_platform = ms_hyperv_init_platform, }; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 23c23508c012..05459ad3db46 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -31,6 +31,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, + { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index dbce3cca94cb..f13b4c00a5de 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -94,6 +94,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (stack_name) printk("%s <%s>\n", log_lvl, stack_name); + if (regs && on_stack(&stack_info, regs, sizeof(*regs))) + __show_regs(regs, 0); + /* * Scan the stack, printing any text addresses we find. At the * same time, follow proper stack frames with the unwinder. @@ -118,10 +121,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * Don't print regs->ip again if it was already printed * by __show_regs() below. */ - if (regs && stack == ®s->ip) { - unwind_next_frame(&state); - continue; - } + if (regs && stack == ®s->ip) + goto next; if (stack == ret_addr_p) reliable = 1; @@ -144,6 +145,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (!reliable) continue; +next: /* * Get the next frame from the unwinder. No need to * check for an error: if anything goes wrong, the rest @@ -153,7 +155,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, /* if the frame has entry regs, print them */ regs = unwind_get_entry_regs(&state); - if (regs) + if (regs && on_stack(&stack_info, regs, sizeof(*regs))) __show_regs(regs, 0); } @@ -265,7 +267,7 @@ int __die(const char *str, struct pt_regs *regs, long err) #ifdef CONFIG_X86_32 if (user_mode(regs)) { sp = regs->sp; - ss = regs->ss & 0xffff; + ss = regs->ss; } else { sp = kernel_stack_pointer(regs); savesegment(ss, ss); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e5f0b40e66d2..4f0481474903 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -37,7 +37,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack < begin || stack > end) + if (stack <= begin || stack > end) return false; info->type = STACK_TYPE_IRQ; @@ -62,7 +62,7 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack < begin || stack > end) + if (stack <= begin || stack > end) return false; info->type = STACK_TYPE_SOFTIRQ; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 3e1471d57487..225af4184f06 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -55,7 +55,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) begin = end - (exception_stack_sizes[k] / sizeof(long)); regs = (struct pt_regs *)end - 1; - if (stack < begin || stack >= end) + if (stack <= begin || stack >= end) continue; info->type = STACK_TYPE_EXCEPTION + k; @@ -78,7 +78,7 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack < begin || stack > end) + if (stack <= begin || stack > end) return false; info->type = STACK_TYPE_IRQ; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 532da61d605c..71c11ad5643e 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -96,7 +96,8 @@ EXPORT_SYMBOL_GPL(e820__mapped_any); * Note: this function only works correctly once the E820 table is sorted and * not-overlapping (at least for the range specified), which is the case normally. */ -bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) +static struct e820_entry *__e820__mapped_all(u64 start, u64 end, + enum e820_type type) { int i; @@ -122,9 +123,28 @@ bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) * coverage of the desired range exists: */ if (start >= end) - return 1; + return entry; } - return 0; + + return NULL; +} + +/* + * This function checks if the entire range <start,end> is mapped with type. + */ +bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) +{ + return __e820__mapped_all(start, end, type); +} + +/* + * This function returns the type associated with the range <start,end>. + */ +int e820__get_entry_type(u64 start, u64 end) +{ + struct e820_entry *entry = __e820__mapped_all(start, end, 0); + + return entry ? entry->type : -EINVAL; } /* diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index d907c3d8633f..927abeaf63e2 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -12,10 +12,10 @@ #include <linux/pci.h> #include <linux/acpi.h> #include <linux/delay.h> -#include <linux/dmi.h> #include <linux/pci_ids.h> #include <linux/bcma/bcma.h> #include <linux/bcma/bcma_regs.h> +#include <linux/platform_data/x86/apple.h> #include <drm/i915_drm.h> #include <asm/pci-direct.h> #include <asm/dma.h> @@ -527,6 +527,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_BXT_IDS(&gen9_early_ops), INTEL_KBL_IDS(&gen9_early_ops), INTEL_GLK_IDS(&gen9_early_ops), + INTEL_CNL_IDS(&gen9_early_ops), }; static void __init @@ -593,7 +594,7 @@ static void __init apple_airport_reset(int bus, int slot, int func) u64 addr; int i; - if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc.")) + if (!x86_apple_machine) return; /* Card may have been put into PCI_D3hot by grub quirk */ diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c new file mode 100644 index 000000000000..f260e452e4f8 --- /dev/null +++ b/arch/x86/kernel/eisa.c @@ -0,0 +1,19 @@ +/* + * EISA specific code + * + * This file is licensed under the GPL V2 + */ +#include <linux/ioport.h> +#include <linux/eisa.h> +#include <linux/io.h> + +static __init int eisa_bus_probe(void) +{ + void __iomem *p = ioremap(0x0FFFD9, 4); + + if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) + EISA_bus = 1; + iounmap(p); + return 0; +} +subsys_initcall(eisa_bus_probe); diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 6b91e2eb8d3f..9c4e7ba6870c 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -195,7 +195,7 @@ void init_espfix_ap(int cpu) pte_p = pte_offset_kernel(&pmd, addr); stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); - pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); + pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask)); for (n = 0; n < ESPFIX_PTE_CLONES; n++) set_pte(&pte_p[n*PTE_STRIDE], pte); diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 538ec012b371..cf2ce063f65a 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <linux/memblock.h> +#include <asm/desc.h> #include <asm/setup.h> #include <asm/sections.h> #include <asm/e820/api.h> @@ -30,6 +31,9 @@ static void __init i386_default_early_setup(void) asmlinkage __visible void __init i386_start_kernel(void) { cr4_init_shadow(); + + idt_setup_early_handler(); + sanitize_boot_params(&boot_params); x86_early_init_platform_quirks(); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 9ba79543d9ee..bab4fa579450 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -14,6 +14,7 @@ #include <linux/start_kernel.h> #include <linux/io.h> #include <linux/memblock.h> +#include <linux/mem_encrypt.h> #include <asm/processor.h> #include <asm/proto.h> @@ -33,7 +34,6 @@ /* * Manage page tables very early on. */ -extern pgd_t early_top_pgt[PTRS_PER_PGD]; extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); @@ -45,9 +45,11 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr) return ptr - (void *)_text + (void *)physaddr; } -void __head __startup_64(unsigned long physaddr) +unsigned long __head __startup_64(unsigned long physaddr, + struct boot_params *bp) { unsigned long load_delta, *p; + unsigned long pgtable_flags; pgdval_t *pgd; p4dval_t *p4d; pudval_t *pud; @@ -69,6 +71,12 @@ void __head __startup_64(unsigned long physaddr) if (load_delta & ~PMD_PAGE_MASK) for (;;); + /* Activate Secure Memory Encryption (SME) if supported and enabled */ + sme_enable(bp); + + /* Include the SME encryption mask in the fixup value */ + load_delta += sme_get_me_mask(); + /* Fixup the physical addresses in the page table */ pgd = fixup_pointer(&early_top_pgt, physaddr); @@ -92,31 +100,35 @@ void __head __startup_64(unsigned long physaddr) * creates a bunch of nonsense entries but that is fine -- * it avoids problems around wraparound. */ + next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr); pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); + pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE; - pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE; + pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; + pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D; - p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; - p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; + p4d[i + 0] = (pgdval_t)pud + pgtable_flags; + p4d[i + 1] = (pgdval_t)pud + pgtable_flags; } else { i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; - pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; + pgd[i + 0] = (pgdval_t)pud + pgtable_flags; + pgd[i + 1] = (pgdval_t)pud + pgtable_flags; } i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD; - pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE; - pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE; + pud[i + 0] = (pudval_t)pmd + pgtable_flags; + pud[i + 1] = (pudval_t)pmd + pgtable_flags; pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; + pmd_entry += sme_get_me_mask(); pmd_entry += physaddr; for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { @@ -137,9 +149,30 @@ void __head __startup_64(unsigned long physaddr) pmd[i] += load_delta; } - /* Fixup phys_base */ + /* + * Fixup phys_base - remove the memory encryption mask to obtain + * the true physical address. + */ p = fixup_pointer(&phys_base, physaddr); - *p += load_delta; + *p += load_delta - sme_get_me_mask(); + + /* Encrypt the kernel (if SME is active) */ + sme_encrypt_kernel(); + + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + +unsigned long __startup_secondary_64(void) +{ + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); } /* Wipe all early page tables except for the kernel symbol map */ @@ -147,17 +180,17 @@ static void __init reset_early_page_tables(void) { memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); next_early_pgt = 0; - write_cr3(__pa_nodebug(early_top_pgt)); + write_cr3(__sme_pa_nodebug(early_top_pgt)); } /* Create a new PMD entry */ -int __init early_make_pgtable(unsigned long address) +int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) { unsigned long physaddr = address - __PAGE_OFFSET; pgdval_t pgd, *pgd_p; p4dval_t p4d, *p4d_p; pudval_t pud, *pud_p; - pmdval_t pmd, *pmd_p; + pmdval_t *pmd_p; /* Invalid address or early pgt is done ? */ if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) @@ -216,12 +249,21 @@ again: memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } - pmd = (physaddr & PMD_MASK) + early_pmd_flags; pmd_p[pmd_index(address)] = pmd; return 0; } +int __init early_make_pgtable(unsigned long address) +{ + unsigned long physaddr = address - __PAGE_OFFSET; + pmdval_t pmd; + + pmd = (physaddr & PMD_MASK) + early_pmd_flags; + + return __early_make_pgtable(address, pmd); +} + /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. */ static void __init clear_bss(void) @@ -244,6 +286,12 @@ static void __init copy_bootdata(char *real_mode_data) char * command_line; unsigned long cmd_line_ptr; + /* + * If SME is active, this will create decrypted mappings of the + * boot data in advance of the copy operations. + */ + sme_map_bootdata(real_mode_data); + memcpy(&boot_params, real_mode_data, sizeof boot_params); sanitize_boot_params(&boot_params); cmd_line_ptr = get_cmd_line_ptr(); @@ -251,12 +299,18 @@ static void __init copy_bootdata(char *real_mode_data) command_line = __va(cmd_line_ptr); memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); } + + /* + * The old boot data is no longer needed and won't be reserved, + * freeing up that memory for use by the system. If SME is active, + * we need to remove the mappings that were created so that the + * memory doesn't remain mapped as decrypted. + */ + sme_unmap_bootdata(real_mode_data); } asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) { - int i; - /* * Build-time sanity checks on the kernel image and module * area mappings. (these are purely build-time and produce no code) @@ -280,11 +334,16 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_page(init_top_pgt); + /* + * SME support may update early_pmd_flags to include the memory + * encryption mask, so it needs to be called before anything + * that may generate a page fault. + */ + sme_early_init(); + kasan_early_init(); - for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) - set_intr_gate(i, early_idt_handler_array[i]); - load_idt((const struct desc_ptr *)&idt_descr); + idt_setup_early_handler(); copy_bootdata(__va(real_mode_data)); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 1f85ee8f9439..9ed3074d0d27 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -155,7 +155,6 @@ ENTRY(startup_32) jmp *%eax .Lbad_subarch: -WEAK(lguest_entry) WEAK(xen_entry) /* Unknown implementation; there's really nothing we can do at this point. */ @@ -165,7 +164,6 @@ WEAK(xen_entry) subarch_entries: .long .Ldefault_entry /* normal x86/PC */ - .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ .long .Ldefault_entry /* Moorestown MID */ num_subarch_entries = (. - subarch_entries) / 4 @@ -347,7 +345,6 @@ ENTRY(startup_32_smp) movl %eax,%cr0 lgdt early_gdt_descr - lidt idt_descr ljmp $(__KERNEL_CS),$1f 1: movl $(__KERNEL_DS),%eax # reload all the segment registers movl %eax,%ss # after changing gdt. @@ -380,37 +377,6 @@ ENDPROC(startup_32_smp) */ __INIT setup_once: - /* - * Set up a idt with 256 interrupt gates that push zero if there - * is no error code and then jump to early_idt_handler_common. - * It doesn't actually load the idt - that needs to be done on - * each CPU. Interrupts are enabled elsewhere, when we can be - * relatively sure everything is ok. - */ - - movl $idt_table,%edi - movl $early_idt_handler_array,%eax - movl $NUM_EXCEPTION_VECTORS,%ecx -1: - movl %eax,(%edi) - movl %eax,4(%edi) - /* interrupt gate, dpl=0, present */ - movl $(0x8E000000 + __KERNEL_CS),2(%edi) - addl $EARLY_IDT_HANDLER_SIZE,%eax - addl $8,%edi - loop 1b - - movl $256 - NUM_EXCEPTION_VECTORS,%ecx - movl $ignore_int,%edx - movl $(__KERNEL_CS << 16),%eax - movw %dx,%ax /* selector = 0x0010 = cs */ - movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ -2: - movl %eax,(%edi) - movl %edx,4(%edi) - addl $8,%edi - loop 2b - #ifdef CONFIG_CC_STACKPROTECTOR /* * Configure the stack canary. The linker can't handle this by @@ -457,12 +423,9 @@ early_idt_handler_common: /* The vector number is in pt_regs->gs */ cld - pushl %fs /* pt_regs->fs */ - movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ - pushl %es /* pt_regs->es */ - movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ - pushl %ds /* pt_regs->ds */ - movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ + pushl %fs /* pt_regs->fs (__fsh varies by model) */ + pushl %es /* pt_regs->es (__esh varies by model) */ + pushl %ds /* pt_regs->ds (__dsh varies by model) */ pushl %eax /* pt_regs->ax */ pushl %ebp /* pt_regs->bp */ pushl %edi /* pt_regs->di */ @@ -479,9 +442,8 @@ early_idt_handler_common: /* Load the vector number into EDX */ movl PT_GS(%esp), %edx - /* Load GS into pt_regs->gs and clear high bits */ + /* Load GS into pt_regs->gs (and maybe clobber __gsh) */ movw %gs, PT_GS(%esp) - movw $0, PT_GS+2(%esp) movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */ call early_fixup_exception @@ -493,18 +455,17 @@ early_idt_handler_common: popl %edi /* pt_regs->di */ popl %ebp /* pt_regs->bp */ popl %eax /* pt_regs->ax */ - popl %ds /* pt_regs->ds */ - popl %es /* pt_regs->es */ - popl %fs /* pt_regs->fs */ - popl %gs /* pt_regs->gs */ + popl %ds /* pt_regs->ds (always ignores __dsh) */ + popl %es /* pt_regs->es (always ignores __esh) */ + popl %fs /* pt_regs->fs (always ignores __fsh) */ + popl %gs /* pt_regs->gs (always ignores __gsh) */ decl %ss:early_recursion_flag addl $4, %esp /* pop pt_regs->orig_ax */ iret ENDPROC(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ - ALIGN -ignore_int: +ENTRY(early_ignore_irq) cld #ifdef CONFIG_PRINTK pushl %eax @@ -539,7 +500,8 @@ ignore_int: hlt_loop: hlt jmp hlt_loop -ENDPROC(ignore_int) +ENDPROC(early_ignore_irq) + __INITDATA .align 4 GLOBAL(early_recursion_flag) @@ -628,7 +590,6 @@ int_msg: .data .globl boot_gdt_descr -.globl idt_descr ALIGN # early boot GDT descriptor (must use 1:1 address mapping) @@ -637,11 +598,6 @@ boot_gdt_descr: .word __BOOT_DS+7 .long boot_gdt - __PAGE_OFFSET - .word 0 # 32-bit align idt_desc.address -idt_descr: - .word IDT_ENTRIES*8-1 # idt contains 256 entries - .long idt_table - # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address ENTRY(early_gdt_descr) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 6225550883df..513cbb012ecc 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -73,12 +73,19 @@ startup_64: /* Sanitize CPU configuration */ call verify_cpu + /* + * Perform pagetable fixups. Additionally, if SME is active, encrypt + * the kernel and retrieve the modifier (SME encryption mask if SME + * is active) to be added to the initial pgdir entry that will be + * programmed into CR3. + */ leaq _text(%rip), %rdi pushq %rsi call __startup_64 popq %rsi - movq $(early_top_pgt - __START_KERNEL_map), %rax + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) /* @@ -98,7 +105,16 @@ ENTRY(secondary_startup_64) /* Sanitize CPU configuration */ call verify_cpu - movq $(init_top_pgt - __START_KERNEL_map), %rax + /* + * Retrieve the modifier (SME encryption mask if SME is active) to be + * added to the initial pgdir entry that will be programmed into CR3. + */ + pushq %rsi + call __startup_secondary_64 + popq %rsi + + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(init_top_pgt - __START_KERNEL_map), %rax 1: /* Enable PAE mode, PGE and LA57 */ @@ -335,9 +351,9 @@ GLOBAL(name) NEXT_PAGE(early_top_pgt) .fill 511,8,0 #ifdef CONFIG_X86_5LEVEL - .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #else - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif NEXT_PAGE(early_dynamic_pgts) @@ -350,15 +366,15 @@ NEXT_PAGE(init_top_pgt) .fill 512,8,0 #else NEXT_PAGE(init_top_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC NEXT_PAGE(level3_ident_pgt) - .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .fill 511, 8, 0 NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. @@ -370,14 +386,14 @@ NEXT_PAGE(level2_ident_pgt) #ifdef CONFIG_X86_5LEVEL NEXT_PAGE(level4_kernel_pgt) .fill 511,8,0 - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ - .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE - .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC NEXT_PAGE(level2_kernel_pgt) /* @@ -395,7 +411,7 @@ NEXT_PAGE(level2_kernel_pgt) NEXT_PAGE(level2_fixmap_pgt) .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ .fill 5,8,0 diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c new file mode 100644 index 000000000000..6107ee1cb8d5 --- /dev/null +++ b/arch/x86/kernel/idt.c @@ -0,0 +1,371 @@ +/* + * Interrupt descriptor table related code + * + * This file is licensed under the GPL V2 + */ +#include <linux/interrupt.h> + +#include <asm/traps.h> +#include <asm/proto.h> +#include <asm/desc.h> + +struct idt_data { + unsigned int vector; + unsigned int segment; + struct idt_bits bits; + const void *addr; +}; + +#define DPL0 0x0 +#define DPL3 0x3 + +#define DEFAULT_STACK 0 + +#define G(_vector, _addr, _ist, _type, _dpl, _segment) \ + { \ + .vector = _vector, \ + .bits.ist = _ist, \ + .bits.type = _type, \ + .bits.dpl = _dpl, \ + .bits.p = 1, \ + .addr = _addr, \ + .segment = _segment, \ + } + +/* Interrupt gate */ +#define INTG(_vector, _addr) \ + G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL0, __KERNEL_CS) + +/* System interrupt gate */ +#define SYSG(_vector, _addr) \ + G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) + +/* Interrupt gate with interrupt stack */ +#define ISTG(_vector, _addr, _ist) \ + G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS) + +/* System interrupt gate with interrupt stack */ +#define SISTG(_vector, _addr, _ist) \ + G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS) + +/* Task gate */ +#define TSKG(_vector, _gdt) \ + G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3) + +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initdata struct idt_data early_idts[] = { + INTG(X86_TRAP_DB, debug), + SYSG(X86_TRAP_BP, int3), +#ifdef CONFIG_X86_32 + INTG(X86_TRAP_PF, page_fault), +#endif +}; + +/* + * The default IDT entries which are set up in trap_init() before + * cpu_init() is invoked. Interrupt stacks cannot be used at that point and + * the traps which use them are reinitialized with IST after cpu_init() has + * set up TSS. + */ +static const __initdata struct idt_data def_idts[] = { + INTG(X86_TRAP_DE, divide_error), + INTG(X86_TRAP_NMI, nmi), + INTG(X86_TRAP_BR, bounds), + INTG(X86_TRAP_UD, invalid_op), + INTG(X86_TRAP_NM, device_not_available), + INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun), + INTG(X86_TRAP_TS, invalid_TSS), + INTG(X86_TRAP_NP, segment_not_present), + INTG(X86_TRAP_SS, stack_segment), + INTG(X86_TRAP_GP, general_protection), + INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug), + INTG(X86_TRAP_MF, coprocessor_error), + INTG(X86_TRAP_AC, alignment_check), + INTG(X86_TRAP_XF, simd_coprocessor_error), + +#ifdef CONFIG_X86_32 + TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS), +#else + INTG(X86_TRAP_DF, double_fault), +#endif + INTG(X86_TRAP_DB, debug), + INTG(X86_TRAP_NMI, nmi), + INTG(X86_TRAP_BP, int3), + +#ifdef CONFIG_X86_MCE + INTG(X86_TRAP_MC, &machine_check), +#endif + + SYSG(X86_TRAP_OF, overflow), +#if defined(CONFIG_IA32_EMULATION) + SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat), +#elif defined(CONFIG_X86_32) + SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32), +#endif +}; + +/* + * The APIC and SMP idt entries + */ +static const __initdata struct idt_data apic_idts[] = { +#ifdef CONFIG_SMP + INTG(RESCHEDULE_VECTOR, reschedule_interrupt), + INTG(CALL_FUNCTION_VECTOR, call_function_interrupt), + INTG(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt), + INTG(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt), + INTG(REBOOT_VECTOR, reboot_interrupt), +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR + INTG(THERMAL_APIC_VECTOR, thermal_interrupt), +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + INTG(THRESHOLD_APIC_VECTOR, threshold_interrupt), +#endif + +#ifdef CONFIG_X86_MCE_AMD + INTG(DEFERRED_ERROR_VECTOR, deferred_error_interrupt), +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + INTG(LOCAL_TIMER_VECTOR, apic_timer_interrupt), + INTG(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi), +# ifdef CONFIG_HAVE_KVM + INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), + INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), + INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), +# endif +# ifdef CONFIG_IRQ_WORK + INTG(IRQ_WORK_VECTOR, irq_work_interrupt), +# endif + INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt), + INTG(ERROR_APIC_VECTOR, error_interrupt), +#endif +}; + +#ifdef CONFIG_X86_64 +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initdata struct idt_data early_pf_idts[] = { + INTG(X86_TRAP_PF, page_fault), +}; + +/* + * Override for the debug_idt. Same as the default, but with interrupt + * stack set to DEFAULT_STACK (0). Required for NMI trap handling. + */ +static const __initdata struct idt_data dbg_idts[] = { + INTG(X86_TRAP_DB, debug), + INTG(X86_TRAP_BP, int3), +}; +#endif + +/* Must be page-aligned because the real IDT is used in a fixmap. */ +gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; + +struct desc_ptr idt_descr __ro_after_init = { + .size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1, + .address = (unsigned long) idt_table, +}; + +#ifdef CONFIG_X86_64 +/* No need to be aligned, but done to keep all IDTs defined the same way. */ +gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; + +/* + * The exceptions which use Interrupt stacks. They are setup after + * cpu_init() when the TSS has been initialized. + */ +static const __initdata struct idt_data ist_idts[] = { + ISTG(X86_TRAP_DB, debug, DEBUG_STACK), + ISTG(X86_TRAP_NMI, nmi, NMI_STACK), + SISTG(X86_TRAP_BP, int3, DEBUG_STACK), + ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK), +#ifdef CONFIG_X86_MCE + ISTG(X86_TRAP_MC, &machine_check, MCE_STACK), +#endif +}; + +/* + * Override for the debug_idt. Same as the default, but with interrupt + * stack set to DEFAULT_STACK (0). Required for NMI trap handling. + */ +const struct desc_ptr debug_idt_descr = { + .size = IDT_ENTRIES * 16 - 1, + .address = (unsigned long) debug_idt_table, +}; +#endif + +static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) +{ + unsigned long addr = (unsigned long) d->addr; + + gate->offset_low = (u16) addr; + gate->segment = (u16) d->segment; + gate->bits = d->bits; + gate->offset_middle = (u16) (addr >> 16); +#ifdef CONFIG_X86_64 + gate->offset_high = (u32) (addr >> 32); + gate->reserved = 0; +#endif +} + +static void +idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys) +{ + gate_desc desc; + + for (; size > 0; t++, size--) { + idt_init_desc(&desc, t); + write_idt_entry(idt, t->vector, &desc); + if (sys) + set_bit(t->vector, used_vectors); + } +} + +static void set_intr_gate(unsigned int n, const void *addr) +{ + struct idt_data data; + + BUG_ON(n > 0xFF); + + memset(&data, 0, sizeof(data)); + data.vector = n; + data.addr = addr; + data.segment = __KERNEL_CS; + data.bits.type = GATE_INTERRUPT; + data.bits.p = 1; + + idt_setup_from_table(idt_table, &data, 1, false); +} + +/** + * idt_setup_early_traps - Initialize the idt table with early traps + * + * On X8664 these traps do not use interrupt stacks as they can't work + * before cpu_init() is invoked and sets up TSS. The IST variants are + * installed after that. + */ +void __init idt_setup_early_traps(void) +{ + idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts), + true); + load_idt(&idt_descr); +} + +/** + * idt_setup_traps - Initialize the idt table with default traps + */ +void __init idt_setup_traps(void) +{ + idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true); +} + +#ifdef CONFIG_X86_64 +/** + * idt_setup_early_pf - Initialize the idt table with early pagefault handler + * + * On X8664 this does not use interrupt stacks as they can't work before + * cpu_init() is invoked and sets up TSS. The IST variant is installed + * after that. + * + * FIXME: Why is 32bit and 64bit installing the PF handler at different + * places in the early setup code? + */ +void __init idt_setup_early_pf(void) +{ + idt_setup_from_table(idt_table, early_pf_idts, + ARRAY_SIZE(early_pf_idts), true); +} + +/** + * idt_setup_ist_traps - Initialize the idt table with traps using IST + */ +void __init idt_setup_ist_traps(void) +{ + idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true); +} + +/** + * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps + */ +void __init idt_setup_debugidt_traps(void) +{ + memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16); + + idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts), false); +} +#endif + +/** + * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates + */ +void __init idt_setup_apic_and_irq_gates(void) +{ + int i = FIRST_EXTERNAL_VECTOR; + void *entry; + + idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true); + + for_each_clear_bit_from(i, used_vectors, FIRST_SYSTEM_VECTOR) { + entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); + set_intr_gate(i, entry); + } + + for_each_clear_bit_from(i, used_vectors, NR_VECTORS) { +#ifdef CONFIG_X86_LOCAL_APIC + set_bit(i, used_vectors); + set_intr_gate(i, spurious_interrupt); +#else + entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); + set_intr_gate(i, entry); +#endif + } +} + +/** + * idt_setup_early_handler - Initializes the idt table with early handlers + */ +void __init idt_setup_early_handler(void) +{ + int i; + + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) + set_intr_gate(i, early_idt_handler_array[i]); +#ifdef CONFIG_X86_32 + for ( ; i < NR_VECTORS; i++) + set_intr_gate(i, early_ignore_irq); +#endif + load_idt(&idt_descr); +} + +/** + * idt_invalidate - Invalidate interrupt descriptor table + * @addr: The virtual address of the 'invalid' IDT + */ +void idt_invalidate(void *addr) +{ + struct desc_ptr idt = { .address = (unsigned long) addr, .size = 0 }; + + load_idt(&idt); +} + +void __init update_intr_gate(unsigned int n, const void *addr) +{ + if (WARN_ON_ONCE(!test_bit(n, used_vectors))) + return; + set_intr_gate(n, addr); +} + +void alloc_intr_gate(unsigned int n, const void *addr) +{ + BUG_ON(n < FIRST_SYSTEM_VECTOR); + if (!test_and_set_bit(n, used_vectors)) + set_intr_gate(n, addr); +} diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 4ed0aba8dbc8..52089c043160 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -29,9 +29,6 @@ EXPORT_PER_CPU_SYMBOL(irq_regs); atomic_t irq_err_count; -/* Function pointer for generic interrupt vector handling */ -void (*x86_platform_ipi_callback)(void) = NULL; - /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. @@ -87,13 +84,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); seq_puts(p, " APIC ICR read retries\n"); -#endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); seq_puts(p, " Platform interrupts\n"); } +#endif #ifdef CONFIG_SMP seq_printf(p, "%*s: ", prec, "RES"); for_each_online_cpu(j) @@ -183,9 +180,9 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_perf_irqs; sum += irq_stats(cpu)->apic_irq_work_irqs; sum += irq_stats(cpu)->icr_read_retry_count; -#endif if (x86_platform_ipi_callback) sum += irq_stats(cpu)->x86_platform_ipis; +#endif #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; sum += irq_stats(cpu)->irq_call_count; @@ -259,26 +256,26 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) return 1; } +#ifdef CONFIG_X86_LOCAL_APIC +/* Function pointer for generic interrupt vector handling */ +void (*x86_platform_ipi_callback)(void) = NULL; /* * Handler for X86_PLATFORM_IPI_VECTOR. */ -void __smp_x86_platform_ipi(void) -{ - inc_irq_stat(x86_platform_ipis); - - if (x86_platform_ipi_callback) - x86_platform_ipi_callback(); -} - __visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); entering_ack_irq(); - __smp_x86_platform_ipi(); + trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); + inc_irq_stat(x86_platform_ipis); + if (x86_platform_ipi_callback) + x86_platform_ipi_callback(); + trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); exiting_irq(); set_irq_regs(old_regs); } +#endif #ifdef CONFIG_HAVE_KVM static void dummy_handler(void) {} @@ -334,19 +331,6 @@ __visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs) } #endif -__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - entering_ack_irq(); - trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); - __smp_x86_platform_ipi(); - trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); - exiting_irq(); - set_irq_regs(old_regs); -} - -EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); #ifdef CONFIG_HOTPLUG_CPU @@ -431,7 +415,7 @@ int check_irq_vectors_for_cpu_disable(void) * this w/o holding vector_lock. */ for (vector = FIRST_EXTERNAL_VECTOR; - vector < first_system_vector; vector++) { + vector < FIRST_SYSTEM_VECTOR; vector++) { if (!test_bit(vector, used_vectors) && IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) { if (++count == this_count) diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 275487872be2..70dee056f92b 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -11,35 +11,23 @@ #include <asm/trace/irq_vectors.h> #include <linux/interrupt.h> -static inline void __smp_irq_work_interrupt(void) -{ - inc_irq_stat(apic_irq_work_irqs); - irq_work_run(); -} - +#ifdef CONFIG_X86_LOCAL_APIC __visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); - __smp_irq_work_interrupt(); - exiting_irq(); -} - -__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs) -{ - ipi_entering_ack_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); - __smp_irq_work_interrupt(); + inc_irq_stat(apic_irq_work_irqs); + irq_work_run(); trace_irq_work_exit(IRQ_WORK_VECTOR); exiting_irq(); } void arch_irq_work_raise(void) { -#ifdef CONFIG_X86_LOCAL_APIC if (!arch_irq_work_has_interrupt()) return; apic->send_IPI_self(IRQ_WORK_VECTOR); apic_wait_icr_idle(); -#endif } +#endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index c7fd18526c3e..1add9e08e83e 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -55,18 +55,6 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [0 ... NR_VECTORS - 1] = VECTOR_UNUSED, }; -int vector_used_by_percpu_irq(unsigned int vector) -{ - int cpu; - - for_each_online_cpu(cpu) { - if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) - return 1; - } - - return 0; -} - void __init init_ISA_irqs(void) { struct irq_chip *chip = legacy_pic->chip; @@ -99,100 +87,12 @@ void __init init_IRQ(void) x86_init.irqs.intr_init(); } -static void __init smp_intr_init(void) -{ -#ifdef CONFIG_SMP - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for generic single function call */ - alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); - set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); - - /* IPI used for rebooting/stopping */ - alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt); -#endif /* CONFIG_SMP */ -} - -static void __init apic_intr_init(void) -{ - smp_intr_init(); - -#ifdef CONFIG_X86_THERMAL_VECTOR - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif -#ifdef CONFIG_X86_MCE_THRESHOLD - alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); -#endif - -#ifdef CONFIG_X86_MCE_AMD - alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt); -#endif - -#ifdef CONFIG_X86_LOCAL_APIC - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI for X86 platform specific use */ - alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); -#ifdef CONFIG_HAVE_KVM - /* IPI for KVM to deliver posted interrupt */ - alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi); - /* IPI for KVM to deliver interrupt to wake up tasks */ - alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi); - /* IPI for KVM to deliver nested posted interrupt */ - alloc_intr_gate(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi); -#endif - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - - /* IRQ work interrupts: */ -# ifdef CONFIG_IRQ_WORK - alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt); -# endif - -#endif -} - void __init native_init_IRQ(void) { - int i; - /* Execute any quirks before the call gates are initialised: */ x86_init.irqs.pre_vector_init(); - apic_intr_init(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - i = FIRST_EXTERNAL_VECTOR; -#ifndef CONFIG_X86_LOCAL_APIC -#define first_system_vector NR_VECTORS -#endif - for_each_clear_bit_from(i, used_vectors, first_system_vector) { - /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ - set_intr_gate(i, irq_entries_start + - 8 * (i - FIRST_EXTERNAL_VECTOR)); - } -#ifdef CONFIG_X86_LOCAL_APIC - for_each_clear_bit_from(i, used_vectors, NR_VECTORS) - set_intr_gate(i, spurious_interrupt); -#endif + idt_setup_apic_and_irq_gates(); if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 38b64587b31b..fd6f8fbbe6f2 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -33,7 +33,6 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf, struct setup_data_node *node = file->private_data; unsigned long remain; loff_t pos = *ppos; - struct page *pg; void *p; u64 pa; @@ -47,18 +46,13 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf, count = node->len - pos; pa = node->paddr + sizeof(struct setup_data) + pos; - pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT); - if (PageHighMem(pg)) { - p = ioremap_cache(pa, count); - if (!p) - return -ENXIO; - } else - p = __va(pa); + p = memremap(pa, count, MEMREMAP_WB); + if (!p) + return -ENOMEM; remain = copy_to_user(user_buf, p, count); - if (PageHighMem(pg)) - iounmap(p); + memunmap(p); if (remain) return -EFAULT; @@ -109,7 +103,6 @@ static int __init create_setup_data_nodes(struct dentry *parent) struct setup_data *data; int error; struct dentry *d; - struct page *pg; u64 pa_data; int no = 0; @@ -126,16 +119,12 @@ static int __init create_setup_data_nodes(struct dentry *parent) goto err_dir; } - pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); - if (PageHighMem(pg)) { - data = ioremap_cache(pa_data, sizeof(*data)); - if (!data) { - kfree(node); - error = -ENXIO; - goto err_dir; - } - } else - data = __va(pa_data); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); + if (!data) { + kfree(node); + error = -ENOMEM; + goto err_dir; + } node->paddr = pa_data; node->type = data->type; @@ -143,8 +132,7 @@ static int __init create_setup_data_nodes(struct dentry *parent) error = create_setup_data_node(d, no, node); pa_data = data->next; - if (PageHighMem(pg)) - iounmap(data); + memunmap(data); if (error) goto err_dir; no++; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 69ea0bc1cfa3..4f98aad38237 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -39,6 +39,7 @@ #include <asm/insn.h> #include <asm/debugreg.h> #include <asm/set_memory.h> +#include <asm/sections.h> #include "common.h" @@ -251,10 +252,12 @@ static int can_optimize(unsigned long paddr) /* * Do not optimize in the entry code due to the unstable - * stack handling. + * stack handling and registers setup. */ - if ((paddr >= (unsigned long)__entry_text_start) && - (paddr < (unsigned long)__entry_text_end)) + if (((paddr >= (unsigned long)__entry_text_start) && + (paddr < (unsigned long)__entry_text_end)) || + ((paddr >= (unsigned long)__irqentry_text_start) && + (paddr < (unsigned long)__irqentry_text_end))) return 0; /* Check there is enough space for a relative jump. */ diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 06e1ff5562c0..4b0592ca9e47 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -16,8 +16,8 @@ #include <linux/stat.h> #include <linux/slab.h> #include <linux/mm.h> +#include <linux/io.h> -#include <asm/io.h> #include <asm/setup.h> static ssize_t version_show(struct kobject *kobj, @@ -79,12 +79,12 @@ static int get_setup_data_paddr(int nr, u64 *paddr) *paddr = pa_data; return 0; } - data = ioremap_cache(pa_data, sizeof(*data)); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; pa_data = data->next; - iounmap(data); + memunmap(data); i++; } return -EINVAL; @@ -97,17 +97,17 @@ static int __init get_setup_data_size(int nr, size_t *size) u64 pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = ioremap_cache(pa_data, sizeof(*data)); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; if (nr == i) { *size = data->len; - iounmap(data); + memunmap(data); return 0; } pa_data = data->next; - iounmap(data); + memunmap(data); i++; } return -EINVAL; @@ -127,12 +127,12 @@ static ssize_t type_show(struct kobject *kobj, ret = get_setup_data_paddr(nr, &paddr); if (ret) return ret; - data = ioremap_cache(paddr, sizeof(*data)); + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; ret = sprintf(buf, "0x%x\n", data->type); - iounmap(data); + memunmap(data); return ret; } @@ -154,7 +154,7 @@ static ssize_t setup_data_data_read(struct file *fp, ret = get_setup_data_paddr(nr, &paddr); if (ret) return ret; - data = ioremap_cache(paddr, sizeof(*data)); + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; @@ -170,15 +170,15 @@ static ssize_t setup_data_data_read(struct file *fp, goto out; ret = count; - p = ioremap_cache(paddr + sizeof(*data), data->len); + p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB); if (!p) { ret = -ENOMEM; goto out; } memcpy(buf, p + off, count); - iounmap(p); + memunmap(p); out: - iounmap(data); + memunmap(data); return ret; } @@ -250,13 +250,13 @@ static int __init get_setup_data_total_num(u64 pa_data, int *nr) *nr = 0; while (pa_data) { *nr += 1; - data = ioremap_cache(pa_data, sizeof(*data)); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) { ret = -ENOMEM; goto out; } pa_data = data->next; - iounmap(data); + memunmap(data); } out: diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d04e30e3c0ff..874827b0d7ca 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -263,7 +263,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) switch (kvm_read_and_reset_pf_reason()) { default: - trace_do_page_fault(regs, error_code); + do_page_fault(regs, error_code); break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ @@ -455,7 +455,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu) static void __init kvm_apf_trap_init(void) { - set_intr_gate(14, async_page_fault); + update_intr_gate(X86_TRAP_PF, async_page_fault); } void __init kvm_guest_init(void) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index a870910c8565..f0e64db18ac8 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -21,6 +21,25 @@ #include <asm/mmu_context.h> #include <asm/syscalls.h> +static void refresh_ldt_segments(void) +{ +#ifdef CONFIG_X86_64 + unsigned short sel; + + /* + * Make sure that the cached DS and ES descriptors match the updated + * LDT. + */ + savesegment(ds, sel); + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) + loadsegment(ds, sel); + + savesegment(es, sel); + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) + loadsegment(es, sel); +#endif +} + /* context.lock is held for us, so we don't need any locking. */ static void flush_ldt(void *__mm) { @@ -32,6 +51,8 @@ static void flush_ldt(void *__mm) pc = &mm->context; set_ldt(pc->ldt->entries, pc->ldt->nr_entries); + + refresh_ldt_segments(); } /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 8c53c5d7a1bc..00bc751c861c 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -26,18 +26,6 @@ #include <asm/set_memory.h> #include <asm/debugreg.h> -static void set_idt(void *newidt, __u16 limit) -{ - struct desc_ptr curidt; - - /* ia32 supports unaliged loads & stores */ - curidt.size = limit; - curidt.address = (unsigned long)newidt; - - load_idt(&curidt); -} - - static void set_gdt(void *newgdt, __u16 limit) { struct desc_ptr curgdt; @@ -245,7 +233,7 @@ void machine_kexec(struct kimage *image) * If you want to load them you must set up your own idt & gdt. */ set_gdt(phys_to_virt(0), 0); - set_idt(phys_to_virt(0), 0); + idt_invalidate(phys_to_virt(0)); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index cb0a30473c23..1f790cf9d38f 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -87,7 +87,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); } pte = pte_offset_kernel(pmd, vaddr); - set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC)); return 0; err: free_transition_pgtable(image); @@ -115,6 +115,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) .alloc_pgt_page = alloc_pgt_page, .context = image, .page_flag = __PAGE_KERNEL_LARGE_EXEC, + .kernpg_flag = _KERNPG_TABLE_NOENC, }; unsigned long mstart, mend; pgd_t *level4p; @@ -334,7 +335,8 @@ void machine_kexec(struct kimage *image) image->start = relocate_kernel((unsigned long)image->head, (unsigned long)page_list, image->start, - image->preserve_context); + image->preserve_context, + sme_active()); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) @@ -602,3 +604,22 @@ void arch_kexec_unprotect_crashkres(void) { kexec_mark_crashkres(false); } + +int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) +{ + /* + * If SME is active we need to be sure that kexec pages are + * not encrypted because when we boot to the new kernel the + * pages won't be accessed encrypted (initially). + */ + return set_memory_decrypted((unsigned long)vaddr, pages); +} + +void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) +{ + /* + * If SME is active we need to reset the pages back to being + * an encrypted mapping before freeing them. + */ + set_memory_encrypted((unsigned long)vaddr, pages); +} diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index f67bd3205df7..62e7d70aadd5 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -35,6 +35,7 @@ #include <asm/page.h> #include <asm/pgtable.h> #include <asm/setup.h> +#include <asm/unwind.h> #if 0 #define DEBUGP(fmt, ...) \ @@ -213,7 +214,7 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL; + *para = NULL, *orc = NULL, *orc_ip = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -225,6 +226,10 @@ int module_finalize(const Elf_Ehdr *hdr, locks = s; if (!strcmp(".parainstructions", secstrings + s->sh_name)) para = s; + if (!strcmp(".orc_unwind", secstrings + s->sh_name)) + orc = s; + if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) + orc_ip = s; } if (alt) { @@ -248,6 +253,10 @@ int module_finalize(const Elf_Ehdr *hdr, /* make jump label nops */ jump_label_apply_nops(me); + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + return 0; } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 0d904d759ff1..5cbb3177ed17 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -429,16 +429,16 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) } } -static struct mpf_intel *mpf_found; +static unsigned long mpf_base; static unsigned long __init get_mpc_size(unsigned long physptr) { struct mpc_table *mpc; unsigned long size; - mpc = early_ioremap(physptr, PAGE_SIZE); + mpc = early_memremap(physptr, PAGE_SIZE); size = mpc->length; - early_iounmap(mpc, PAGE_SIZE); + early_memunmap(mpc, PAGE_SIZE); apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); return size; @@ -450,7 +450,8 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) unsigned long size; size = get_mpc_size(mpf->physptr); - mpc = early_ioremap(mpf->physptr, size); + mpc = early_memremap(mpf->physptr, size); + /* * Read the physical hardware table. Anything here will * override the defaults. @@ -461,10 +462,10 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) #endif pr_err("BIOS bug, MP table errors detected!...\n"); pr_cont("... disabling SMP support. (tell your hw vendor)\n"); - early_iounmap(mpc, size); + early_memunmap(mpc, size); return -1; } - early_iounmap(mpc, size); + early_memunmap(mpc, size); if (early) return -1; @@ -497,12 +498,12 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) */ void __init default_get_smp_config(unsigned int early) { - struct mpf_intel *mpf = mpf_found; + struct mpf_intel *mpf; if (!smp_found_config) return; - if (!mpf) + if (!mpf_base) return; if (acpi_lapic && early) @@ -515,6 +516,12 @@ void __init default_get_smp_config(unsigned int early) if (acpi_lapic && acpi_ioapic) return; + mpf = early_memremap(mpf_base, sizeof(*mpf)); + if (!mpf) { + pr_err("MPTABLE: error mapping MP table\n"); + return; + } + pr_info("Intel MultiProcessor Specification v1.%d\n", mpf->specification); #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) @@ -529,7 +536,7 @@ void __init default_get_smp_config(unsigned int early) /* * Now see if we need to read further. */ - if (mpf->feature1 != 0) { + if (mpf->feature1) { if (early) { /* * local APIC has default address @@ -542,8 +549,10 @@ void __init default_get_smp_config(unsigned int early) construct_default_ISA_mptable(mpf->feature1); } else if (mpf->physptr) { - if (check_physptr(mpf, early)) + if (check_physptr(mpf, early)) { + early_memunmap(mpf, sizeof(*mpf)); return; + } } else BUG(); @@ -552,6 +561,8 @@ void __init default_get_smp_config(unsigned int early) /* * Only use the first configuration found. */ + + early_memunmap(mpf, sizeof(*mpf)); } static void __init smp_reserve_memory(struct mpf_intel *mpf) @@ -561,15 +572,16 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf) static int __init smp_scan_config(unsigned long base, unsigned long length) { - unsigned int *bp = phys_to_virt(base); + unsigned int *bp; struct mpf_intel *mpf; - unsigned long mem; + int ret = 0; apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", base, base + length - 1); BUILD_BUG_ON(sizeof(*mpf) != 16); while (length > 0) { + bp = early_memremap(base, length); mpf = (struct mpf_intel *)bp; if ((*bp == SMP_MAGIC_IDENT) && (mpf->length == 1) && @@ -579,24 +591,26 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) #ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 1; #endif - mpf_found = mpf; + mpf_base = base; - pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n", - (unsigned long long) virt_to_phys(mpf), - (unsigned long long) virt_to_phys(mpf) + - sizeof(*mpf) - 1, mpf); + pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n", + base, base + sizeof(*mpf) - 1, mpf); - mem = virt_to_phys(mpf); - memblock_reserve(mem, sizeof(*mpf)); + memblock_reserve(base, sizeof(*mpf)); if (mpf->physptr) smp_reserve_memory(mpf); - return 1; + ret = 1; } - bp += 4; + early_memunmap(bp, length); + + if (ret) + break; + + base += 16; length -= 16; } - return 0; + return ret; } void __init default_find_smp_config(void) @@ -838,29 +852,40 @@ static int __init update_mp_table(void) char oem[10]; struct mpf_intel *mpf; struct mpc_table *mpc, *mpc_new; + unsigned long size; if (!enable_update_mptable) return 0; - mpf = mpf_found; - if (!mpf) + if (!mpf_base) return 0; + mpf = early_memremap(mpf_base, sizeof(*mpf)); + if (!mpf) { + pr_err("MPTABLE: mpf early_memremap() failed\n"); + return 0; + } + /* * Now see if we need to go further. */ - if (mpf->feature1 != 0) - return 0; + if (mpf->feature1) + goto do_unmap_mpf; if (!mpf->physptr) - return 0; + goto do_unmap_mpf; - mpc = phys_to_virt(mpf->physptr); + size = get_mpc_size(mpf->physptr); + mpc = early_memremap(mpf->physptr, size); + if (!mpc) { + pr_err("MPTABLE: mpc early_memremap() failed\n"); + goto do_unmap_mpf; + } if (!smp_check_mpc(mpc, oem, str)) - return 0; + goto do_unmap_mpc; - pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf)); + pr_info("mpf: %llx\n", (u64)mpf_base); pr_info("physptr: %x\n", mpf->physptr); if (mpc_new_phys && mpc->length > mpc_new_length) { @@ -878,21 +903,32 @@ static int __init update_mp_table(void) new = mpf_checksum((unsigned char *)mpc, mpc->length); if (old == new) { pr_info("mpc is readonly, please try alloc_mptable instead\n"); - return 0; + goto do_unmap_mpc; } pr_info("use in-position replacing\n"); } else { + mpc_new = early_memremap(mpc_new_phys, mpc_new_length); + if (!mpc_new) { + pr_err("MPTABLE: new mpc early_memremap() failed\n"); + goto do_unmap_mpc; + } mpf->physptr = mpc_new_phys; - mpc_new = phys_to_virt(mpc_new_phys); memcpy(mpc_new, mpc, mpc->length); + early_memunmap(mpc, size); mpc = mpc_new; + size = mpc_new_length; /* check if we can modify that */ if (mpc_new_phys - mpf->physptr) { struct mpf_intel *mpf_new; /* steal 16 bytes from [0, 1k) */ + mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new)); + if (!mpf_new) { + pr_err("MPTABLE: new mpf early_memremap() failed\n"); + goto do_unmap_mpc; + } pr_info("mpf new: %x\n", 0x400 - 16); - mpf_new = phys_to_virt(0x400 - 16); memcpy(mpf_new, mpf, 16); + early_memunmap(mpf, sizeof(*mpf)); mpf = mpf_new; mpf->physptr = mpc_new_phys; } @@ -909,6 +945,12 @@ static int __init update_mp_table(void) */ replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length); +do_unmap_mpc: + early_memunmap(mpc, size); + +do_unmap_mpf: + early_memunmap(mpf, sizeof(*mpf)); + return 0; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 446c8aa09b9b..35aafc95e4b8 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -39,26 +39,26 @@ #include <trace/events/nmi.h> struct nmi_desc { - spinlock_t lock; + raw_spinlock_t lock; struct list_head head; }; static struct nmi_desc nmi_desc[NMI_MAX] = { { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), .head = LIST_HEAD_INIT(nmi_desc[0].head), }, { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), .head = LIST_HEAD_INIT(nmi_desc[1].head), }, { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), .head = LIST_HEAD_INIT(nmi_desc[2].head), }, { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), .head = LIST_HEAD_INIT(nmi_desc[3].head), }, @@ -163,7 +163,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) init_irq_work(&action->irq_work, nmi_max_handler); - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); /* * Indicate if there are multiple registrations on the @@ -181,7 +181,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) else list_add_tail_rcu(&action->list, &desc->head); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } EXPORT_SYMBOL(__register_nmi_handler); @@ -192,7 +192,7 @@ void unregister_nmi_handler(unsigned int type, const char *name) struct nmiaction *n; unsigned long flags; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); list_for_each_entry_rcu(n, &desc->head, list) { /* @@ -207,7 +207,7 @@ void unregister_nmi_handler(unsigned int type, const char *name) } } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); synchronize_rcu(); } EXPORT_SYMBOL_GPL(unregister_nmi_handler); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index bc0a849589bb..a14df9eecfed 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -319,9 +319,6 @@ __visible struct pv_irq_ops pv_irq_ops = { .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), .safe_halt = native_safe_halt, .halt = native_halt, -#ifdef CONFIG_X86_64 - .adjust_exception_frame = paravirt_nop, -#endif }; __visible struct pv_cpu_ops pv_cpu_ops = { diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 5e16d3f29594..0accc2404b92 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -93,9 +93,12 @@ again: if (gfpflags_allow_blocking(flag)) { page = dma_alloc_from_contiguous(dev, count, get_order(size), flag); - if (page && page_to_phys(page) + size > dma_mask) { - dma_release_from_contiguous(dev, page, count); - page = NULL; + if (page) { + addr = phys_to_dma(dev, page_to_phys(page)); + if (addr + size > dma_mask) { + dma_release_from_contiguous(dev, page, count); + page = NULL; + } } } /* fallback */ @@ -104,7 +107,7 @@ again: if (!page) return NULL; - addr = page_to_phys(page); + addr = phys_to_dma(dev, page_to_phys(page)); if (addr + size > dma_mask) { __free_pages(page, get_order(size)); diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index a6d404087fe3..4fc3cb60ea11 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -32,7 +32,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, unsigned long attrs) { - dma_addr_t bus = page_to_phys(page) + offset; + dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset; WARN_ON(size == 0); if (!check_addr("map_single", dev, bus, size)) return NOMMU_MAPPING_ERROR; diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 1e23577e17cf..677077510e30 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -6,12 +6,14 @@ #include <linux/swiotlb.h> #include <linux/bootmem.h> #include <linux/dma-mapping.h> +#include <linux/mem_encrypt.h> #include <asm/iommu.h> #include <asm/swiotlb.h> #include <asm/dma.h> #include <asm/xen/swiotlb-xen.h> #include <asm/iommu_table.h> + int swiotlb __read_mostly; void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, @@ -79,8 +81,8 @@ IOMMU_INIT_FINISH(pci_swiotlb_detect_override, pci_swiotlb_late_init); /* - * if 4GB or more detected (and iommu=off not set) return 1 - * and set swiotlb to 1. + * If 4GB or more detected (and iommu=off not set) or if SME is active + * then set swiotlb to 1 and return 1. */ int __init pci_swiotlb_detect_4gb(void) { @@ -89,6 +91,15 @@ int __init pci_swiotlb_detect_4gb(void) if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) swiotlb = 1; #endif + + /* + * If SME is active then swiotlb will be set to 1 so that bounce + * buffers are allocated and used for devices that do not support + * the addressing range required for the encryption mask. + */ + if (sme_active()) + swiotlb = 1; + return swiotlb; } IOMMU_INIT(pci_swiotlb_detect_4gb, diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 91271122f0df..502a77d0adb0 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -16,7 +16,6 @@ void __init x86_early_init_platform_quirks(void) x86_platform.legacy.reserve_bios_regions = 1; break; case X86_SUBARCH_XEN: - case X86_SUBARCH_LGUEST: x86_platform.legacy.devices.pnpbios = 0; x86_platform.legacy.rtc = 0; break; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3ca198080ea9..bd6b85fac666 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -355,6 +355,7 @@ bool xen_set_default_idle(void) return ret; } #endif + void stop_this_cpu(void *dummy) { local_irq_disable(); @@ -365,8 +366,20 @@ void stop_this_cpu(void *dummy) disable_local_APIC(); mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); - for (;;) - halt(); + for (;;) { + /* + * Use wbinvd followed by hlt to stop the processor. This + * provides support for kexec on a processor that supports + * SME. With kexec, going from SME inactive to SME active + * requires clearing cache entries so that addresses without + * the encryption bit set don't corrupt the same physical + * address that has the encryption bit set when caches are + * flushed. To achieve this a wbinvd is performed followed by + * a hlt. Even if the processor is not in the kexec/SME + * scenario this only adds a wbinvd to a halting processor. + */ + asm volatile("wbinvd; hlt" : : : "memory"); + } } /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c6d6dc5f8bb2..11966251cd42 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -56,7 +56,7 @@ #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/vm86.h> -#include <asm/intel_rdt.h> +#include <asm/intel_rdt_sched.h> #include <asm/proto.h> void __show_regs(struct pt_regs *regs, int all) @@ -68,7 +68,7 @@ void __show_regs(struct pt_regs *regs, int all) if (user_mode(regs)) { sp = regs->sp; - ss = regs->ss & 0xffff; + ss = regs->ss; gs = get_user_gs(regs); } else { sp = kernel_stack_pointer(regs); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c3169be4c596..302e7b2572d1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> #include <asm/vdso.h> -#include <asm/intel_rdt.h> +#include <asm/intel_rdt_sched.h> #include <asm/unistd.h> #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ @@ -69,8 +69,7 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int fsindex, gsindex; unsigned int ds, cs, es; - printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff, - (void *)regs->ip); + printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip); printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, regs->sp, regs->flags); if (regs->orig_ax != -1) @@ -149,6 +148,123 @@ void release_thread(struct task_struct *dead_task) } } +enum which_selector { + FS, + GS +}; + +/* + * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are + * not available. The goal is to be reasonably fast on non-FSGSBASE systems. + * It's forcibly inlined because it'll generate better code and this function + * is hot. + */ +static __always_inline void save_base_legacy(struct task_struct *prev_p, + unsigned short selector, + enum which_selector which) +{ + if (likely(selector == 0)) { + /* + * On Intel (without X86_BUG_NULL_SEG), the segment base could + * be the pre-existing saved base or it could be zero. On AMD + * (with X86_BUG_NULL_SEG), the segment base could be almost + * anything. + * + * This branch is very hot (it's hit twice on almost every + * context switch between 64-bit programs), and avoiding + * the RDMSR helps a lot, so we just assume that whatever + * value is already saved is correct. This matches historical + * Linux behavior, so it won't break existing applications. + * + * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we + * report that the base is zero, it needs to actually be zero: + * see the corresponding logic in load_seg_legacy. + */ + } else { + /* + * If the selector is 1, 2, or 3, then the base is zero on + * !X86_BUG_NULL_SEG CPUs and could be anything on + * X86_BUG_NULL_SEG CPUs. In the latter case, Linux + * has never attempted to preserve the base across context + * switches. + * + * If selector > 3, then it refers to a real segment, and + * saving the base isn't necessary. + */ + if (which == FS) + prev_p->thread.fsbase = 0; + else + prev_p->thread.gsbase = 0; + } +} + +static __always_inline void save_fsgs(struct task_struct *task) +{ + savesegment(fs, task->thread.fsindex); + savesegment(gs, task->thread.gsindex); + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); +} + +static __always_inline void loadseg(enum which_selector which, + unsigned short sel) +{ + if (which == FS) + loadsegment(fs, sel); + else + load_gs_index(sel); +} + +static __always_inline void load_seg_legacy(unsigned short prev_index, + unsigned long prev_base, + unsigned short next_index, + unsigned long next_base, + enum which_selector which) +{ + if (likely(next_index <= 3)) { + /* + * The next task is using 64-bit TLS, is not using this + * segment at all, or is having fun with arcane CPU features. + */ + if (next_base == 0) { + /* + * Nasty case: on AMD CPUs, we need to forcibly zero + * the base. + */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + loadseg(which, __USER_DS); + loadseg(which, next_index); + } else { + /* + * We could try to exhaustively detect cases + * under which we can skip the segment load, + * but there's really only one case that matters + * for performance: if both the previous and + * next states are fully zeroed, we can skip + * the load. + * + * (This assumes that prev_base == 0 has no + * false positives. This is the case on + * Intel-style CPUs.) + */ + if (likely(prev_index | next_index | prev_base)) + loadseg(which, next_index); + } + } else { + if (prev_index != next_index) + loadseg(which, next_index); + wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, + next_base); + } + } else { + /* + * The next task is using a real segment. Loading the selector + * is sufficient. + */ + loadseg(which, next_index); + } +} + int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { @@ -229,10 +345,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, unsigned int _cs, unsigned int _ss, unsigned int _ds) { + WARN_ON_ONCE(regs != current_pt_regs()); + + if (static_cpu_has(X86_BUG_NULL_SEG)) { + /* Loading zero below won't clear the base. */ + loadsegment(fs, __USER_DS); + load_gs_index(__USER_DS); + } + loadsegment(fs, 0); loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); + regs->ip = new_ip; regs->sp = new_sp; regs->cs = _cs; @@ -277,7 +402,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - unsigned prev_fsindex, prev_gsindex; + + WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && + this_cpu_read(irq_count) != -1); switch_fpu_prepare(prev_fpu, cpu); @@ -286,8 +413,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * * (e.g. xen_load_tls()) */ - savesegment(fs, prev_fsindex); - savesegment(gs, prev_gsindex); + save_fsgs(prev_p); /* * Load TLS before restoring any segments so that segment loads @@ -326,108 +452,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); - /* - * Switch FS and GS. - * - * These are even more complicated than DS and ES: they have - * 64-bit bases are that controlled by arch_prctl. The bases - * don't necessarily match the selectors, as user code can do - * any number of things to cause them to be inconsistent. - * - * We don't promise to preserve the bases if the selectors are - * nonzero. We also don't promise to preserve the base if the - * selector is zero and the base doesn't match whatever was - * most recently passed to ARCH_SET_FS/GS. (If/when the - * FSGSBASE instructions are enabled, we'll need to offer - * stronger guarantees.) - * - * As an invariant, - * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is - * impossible. - */ - if (next->fsindex) { - /* Loading a nonzero value into FS sets the index and base. */ - loadsegment(fs, next->fsindex); - } else { - if (next->fsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_fsindex) - loadsegment(fs, 0); - wrmsrl(MSR_FS_BASE, next->fsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - */ - loadsegment(fs, __USER_DS); - loadsegment(fs, 0); - } else { - /* - * If the previous index is zero and ARCH_SET_FS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->fsbase || prev_fsindex) - loadsegment(fs, 0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_fsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. - */ - if (prev_fsindex) - prev->fsbase = 0; - prev->fsindex = prev_fsindex; - - if (next->gsindex) { - /* Loading a nonzero value into GS sets the index and base. */ - load_gs_index(next->gsindex); - } else { - if (next->gsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_gsindex) - load_gs_index(0); - wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - * - * This contains a pointless SWAPGS pair. - * Fixing it would involve an explicit check - * for Xen or a new pvop. - */ - load_gs_index(__USER_DS); - load_gs_index(0); - } else { - /* - * If the previous index is zero and ARCH_SET_GS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->gsbase || prev_gsindex) - load_gs_index(0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_gsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. - */ - if (prev_gsindex) - prev->gsbase = 0; - prev->gsindex = prev_gsindex; + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); switch_fpu_finish(next_fpu, cpu); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 0bee04d41bed..eaa591cfd98b 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -1,6 +1,7 @@ /* * This file contains work-arounds for x86 and x86_64 platform bugs. */ +#include <linux/dmi.h> #include <linux/pci.h> #include <linux/irq.h> @@ -656,3 +657,12 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); #endif #endif + +bool x86_apple_machine; +EXPORT_SYMBOL(x86_apple_machine); + +void __init early_platform_quirks(void) +{ + x86_apple_machine = dmi_match(DMI_SYS_VENDOR, "Apple Inc.") || + dmi_match(DMI_SYS_VENDOR, "Apple Computer, Inc."); +} diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a56bf6051f4e..54984b142641 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -38,8 +38,6 @@ void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); -static const struct desc_ptr no_idt = {}; - /* * This is set if we need to go through the 'emergency' path. * When machine_emergency_restart() is called, we may be on @@ -638,7 +636,7 @@ static void native_machine_emergency_restart(void) break; case BOOT_TRIPLE: - load_idt(&no_idt); + idt_invalidate(NULL); __asm__ __volatile__("int3"); /* We're probably dead after this, but... */ diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 98111b38ebfd..307d3bac5f04 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -47,6 +47,7 @@ relocate_kernel: * %rsi page_list * %rdx start address * %rcx preserve_context + * %r8 sme_active */ /* Save the CPU context, used for jumping back */ @@ -71,6 +72,9 @@ relocate_kernel: pushq $0 popfq + /* Save SME active flag */ + movq %r8, %r12 + /* * get physical address of control page now * this is impossible after page table switch @@ -132,6 +136,16 @@ identity_mapped: /* Flush the TLB (needed?) */ movq %r9, %cr3 + /* + * If SME is active, there could be old encrypted cache line + * entries that will conflict with the now unencrypted memory + * used by kexec. Flush the caches before copying the kernel. + */ + testq %r12, %r12 + jz 1f + wbinvd +1: + movq %rcx, %r11 call swap_pages diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3486d0498800..d84afb0a322d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -69,6 +69,7 @@ #include <linux/crash_dump.h> #include <linux/tboot.h> #include <linux/jiffies.h> +#include <linux/mem_encrypt.h> #include <linux/usb/xhci-dbgp.h> #include <video/edid.h> @@ -115,6 +116,7 @@ #include <asm/microcode.h> #include <asm/mmu_context.h> #include <asm/kaslr.h> +#include <asm/unwind.h> /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -374,6 +376,14 @@ static void __init reserve_initrd(void) !ramdisk_image || !ramdisk_size) return; /* No initrd provided by bootloader */ + /* + * If SME is active, this memory will be marked encrypted by the + * kernel when it is accessed (including relocation). However, the + * ramdisk image was loaded decrypted by the bootloader, so make + * sure that it is encrypted before accessing it. + */ + sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image); + initrd_start = 0; mapped_size = memblock_mem_size(max_pfn_mapped); @@ -890,7 +900,7 @@ void __init setup_arch(char **cmdline_p) */ olpc_ofw_detect(); - early_trap_init(); + idt_setup_early_traps(); early_cpu_init(); early_ioremap_init(); @@ -1161,7 +1171,7 @@ void __init setup_arch(char **cmdline_p) init_mem_mapping(); - early_trap_pf_init(); + idt_setup_early_pf(); /* * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features) @@ -1206,6 +1216,8 @@ void __init setup_arch(char **cmdline_p) io_delay_init(); + early_platform_quirks(); + /* * Parse the ACPI tables for possible boot-time SMP configuration. */ @@ -1310,6 +1322,8 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_apply_memmap_quirks(); #endif + + unwind_init(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 10edd1e69a68..6e8fcb6f7e1e 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -155,13 +155,10 @@ static void __init pcpup_populate_pte(unsigned long addr) static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 - struct desc_struct gdt; + struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu), + 0xFFFFF); - pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, - 0x2 | DESCTYPE_S, 0x8); - gdt.s = 1; - write_gdt_entry(get_cpu_gdt_rw(cpu), - GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S); #endif } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cc30a74e4adb..e04442345fc0 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -256,7 +256,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, sp = current->sas_ss_sp + current->sas_ss_size; } else if (IS_ENABLED(CONFIG_X86_32) && !onsigstack && - (regs->ss & 0xffff) != __USER_DS && + regs->ss != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { /* This is the legacy signal stack switching. */ diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d798c0da451c..5c574dff4c1a 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -254,84 +254,45 @@ finish: } /* - * Reschedule call back. + * Reschedule call back. KVM uses this interrupt to force a cpu out of + * guest mode */ -static inline void __smp_reschedule_interrupt(void) -{ - inc_irq_stat(irq_resched_count); - scheduler_ipi(); -} - __visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); - __smp_reschedule_interrupt(); - /* - * KVM uses this interrupt to force a cpu out of guest mode - */ -} - -__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs) -{ - /* - * Need to call irq_enter() before calling the trace point. - * __smp_reschedule_interrupt() calls irq_enter/exit() too (in - * scheduler_ipi(). This is OK, since those functions are allowed - * to nest. - */ - ipi_entering_ack_irq(); - trace_reschedule_entry(RESCHEDULE_VECTOR); - __smp_reschedule_interrupt(); - trace_reschedule_exit(RESCHEDULE_VECTOR); - exiting_irq(); - /* - * KVM uses this interrupt to force a cpu out of guest mode - */ -} + inc_irq_stat(irq_resched_count); -static inline void __smp_call_function_interrupt(void) -{ - generic_smp_call_function_interrupt(); - inc_irq_stat(irq_call_count); + if (trace_resched_ipi_enabled()) { + /* + * scheduler_ipi() might call irq_enter() as well, but + * nested calls are fine. + */ + irq_enter(); + trace_reschedule_entry(RESCHEDULE_VECTOR); + scheduler_ipi(); + trace_reschedule_exit(RESCHEDULE_VECTOR); + irq_exit(); + return; + } + scheduler_ipi(); } __visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); - __smp_call_function_interrupt(); - exiting_irq(); -} - -__visible void __irq_entry -smp_trace_call_function_interrupt(struct pt_regs *regs) -{ - ipi_entering_ack_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); - __smp_call_function_interrupt(); - trace_call_function_exit(CALL_FUNCTION_VECTOR); - exiting_irq(); -} - -static inline void __smp_call_function_single_interrupt(void) -{ - generic_smp_call_function_single_interrupt(); inc_irq_stat(irq_call_count); -} - -__visible void __irq_entry -smp_call_function_single_interrupt(struct pt_regs *regs) -{ - ipi_entering_ack_irq(); - __smp_call_function_single_interrupt(); + generic_smp_call_function_interrupt(); + trace_call_function_exit(CALL_FUNCTION_VECTOR); exiting_irq(); } -__visible void __irq_entry -smp_trace_call_function_single_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_call_function_single_interrupt(struct pt_regs *r) { ipi_entering_ack_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); - __smp_call_function_single_interrupt(); + inc_irq_stat(irq_call_count); + generic_smp_call_function_single_interrupt(); trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); exiting_irq(); } diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 5f25cfbd952e..5ee663836c08 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -13,7 +13,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re unsigned long addr, seg; addr = regs->ip; - seg = regs->cs & 0xffff; + seg = regs->cs; if (v8086_mode(regs)) { addr = (addr & 0xffff) + (seg << 4); return addr; diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 213ddf3e937d..73e4d28112f8 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -21,6 +21,7 @@ #include <asm/compat.h> #include <asm/ia32.h> #include <asm/syscalls.h> +#include <asm/mpx.h> /* * Align a virtual address to avoid aliasing in the I$ on AMD F15h. @@ -100,8 +101,8 @@ out: return error; } -static void find_start_end(unsigned long flags, unsigned long *begin, - unsigned long *end) +static void find_start_end(unsigned long addr, unsigned long flags, + unsigned long *begin, unsigned long *end) { if (!in_compat_syscall() && (flags & MAP_32BIT)) { /* This is usually used needed to map code in small @@ -120,7 +121,10 @@ static void find_start_end(unsigned long flags, unsigned long *begin, } *begin = get_mmap_base(1); - *end = in_compat_syscall() ? tasksize_32bit() : tasksize_64bit(); + if (in_compat_syscall()) + *end = task_size_32bit(); + else + *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); } unsigned long @@ -132,10 +136,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, struct vm_unmapped_area_info info; unsigned long begin, end; + addr = mpx_unmapped_area_check(addr, len, flags); + if (IS_ERR_VALUE(addr)) + return addr; + if (flags & MAP_FIXED) return addr; - find_start_end(flags, &begin, &end); + find_start_end(addr, flags, &begin, &end); if (len > end) return -ENOMEM; @@ -171,6 +179,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, unsigned long addr = addr0; struct vm_unmapped_area_info info; + addr = mpx_unmapped_area_check(addr, len, flags); + if (IS_ERR_VALUE(addr)) + return addr; + /* requested length too big for entire address space */ if (len > TASK_SIZE) return -ENOMEM; @@ -195,6 +207,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = get_mmap_base(0); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + * + * !in_compat_syscall() check to avoid high addresses for x32. + */ + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; + info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; if (filp) { diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index dcd699baea1b..a106b9719c58 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -93,7 +93,7 @@ static void set_tls_desc(struct task_struct *p, int idx, while (n-- > 0) { if (LDT_empty(info) || LDT_zero(info)) { - desc->a = desc->b = 0; + memset(desc, 0, sizeof(*desc)); } else { fill_ldt(desc, info); diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 15515132bf0d..c6636d1f60b9 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -4,57 +4,38 @@ * Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com> * */ -#include <asm/hw_irq.h> -#include <asm/desc.h> +#include <linux/jump_label.h> #include <linux/atomic.h> -atomic_t trace_idt_ctr = ATOMIC_INIT(0); -struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, - (unsigned long) trace_idt_table }; - -/* No need to be aligned, but done to keep all IDTs defined the same way. */ -gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; +#include <asm/hw_irq.h> +#include <asm/desc.h> -static int trace_irq_vector_refcount; -static DEFINE_MUTEX(irq_vector_mutex); +DEFINE_STATIC_KEY_FALSE(trace_pagefault_key); -static void set_trace_idt_ctr(int val) +int trace_pagefault_reg(void) { - atomic_set(&trace_idt_ctr, val); - /* Ensure the trace_idt_ctr is set before sending IPI */ - wmb(); + static_branch_inc(&trace_pagefault_key); + return 0; } -static void switch_idt(void *arg) +void trace_pagefault_unreg(void) { - unsigned long flags; - - local_irq_save(flags); - load_current_idt(); - local_irq_restore(flags); + static_branch_dec(&trace_pagefault_key); } -int trace_irq_vector_regfunc(void) +#ifdef CONFIG_SMP + +DEFINE_STATIC_KEY_FALSE(trace_resched_ipi_key); + +int trace_resched_ipi_reg(void) { - mutex_lock(&irq_vector_mutex); - if (!trace_irq_vector_refcount) { - set_trace_idt_ctr(1); - smp_call_function(switch_idt, NULL, 0); - switch_idt(NULL); - } - trace_irq_vector_refcount++; - mutex_unlock(&irq_vector_mutex); + static_branch_inc(&trace_resched_ipi_key); return 0; } -void trace_irq_vector_unregfunc(void) +void trace_resched_ipi_unreg(void) { - mutex_lock(&irq_vector_mutex); - trace_irq_vector_refcount--; - if (!trace_irq_vector_refcount) { - set_trace_idt_ctr(0); - smp_call_function(switch_idt, NULL, 0); - switch_idt(NULL); - } - mutex_unlock(&irq_vector_mutex); + static_branch_dec(&trace_resched_ipi_key); } + +#endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bf54309b85da..34ea3651362e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -38,11 +38,6 @@ #include <linux/smp.h> #include <linux/io.h> -#ifdef CONFIG_EISA -#include <linux/ioport.h> -#include <linux/eisa.h> -#endif - #if defined(CONFIG_EDAC) #include <linux/edac.h> #endif @@ -70,20 +65,13 @@ #include <asm/x86_init.h> #include <asm/pgalloc.h> #include <asm/proto.h> - -/* No need to be aligned, but done to keep all IDTs defined the same way. */ -gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss; #else #include <asm/processor-flags.h> #include <asm/setup.h> #include <asm/proto.h> #endif -/* Must be page-aligned because the real IDT is used in a fixmap. */ -gate_desc idt_table[NR_VECTORS] __page_aligned_bss; - DECLARE_BITMAP(used_vectors, NR_VECTORS); -EXPORT_SYMBOL_GPL(used_vectors); static inline void cond_local_irq_enable(struct pt_regs *regs) { @@ -935,87 +923,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) } #endif -/* Set of traps needed for early debugging. */ -void __init early_trap_init(void) -{ - /* - * Don't use IST to set DEBUG_STACK as it doesn't work until TSS - * is ready in cpu_init() <-- trap_init(). Before trap_init(), - * CPU runs at ring 0 so it is impossible to hit an invalid - * stack. Using the original stack works well enough at this - * early stage. DEBUG_STACK will be equipped after cpu_init() in - * trap_init(). - * - * We don't need to set trace_idt_table like set_intr_gate(), - * since we don't have trace_debug and it will be reset to - * 'debug' in trap_init() by set_intr_gate_ist(). - */ - set_intr_gate_notrace(X86_TRAP_DB, debug); - /* int3 can be called from all */ - set_system_intr_gate(X86_TRAP_BP, &int3); -#ifdef CONFIG_X86_32 - set_intr_gate(X86_TRAP_PF, page_fault); -#endif - load_idt(&idt_descr); -} - -void __init early_trap_pf_init(void) -{ -#ifdef CONFIG_X86_64 - set_intr_gate(X86_TRAP_PF, page_fault); -#endif -} - void __init trap_init(void) { - int i; - -#ifdef CONFIG_EISA - void __iomem *p = early_ioremap(0x0FFFD9, 4); - - if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) - EISA_bus = 1; - early_iounmap(p, 4); -#endif - - set_intr_gate(X86_TRAP_DE, divide_error); - set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK); - /* int4 can be called from all */ - set_system_intr_gate(X86_TRAP_OF, &overflow); - set_intr_gate(X86_TRAP_BR, bounds); - set_intr_gate(X86_TRAP_UD, invalid_op); - set_intr_gate(X86_TRAP_NM, device_not_available); -#ifdef CONFIG_X86_32 - set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS); -#else - set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK); -#endif - set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun); - set_intr_gate(X86_TRAP_TS, invalid_TSS); - set_intr_gate(X86_TRAP_NP, segment_not_present); - set_intr_gate(X86_TRAP_SS, stack_segment); - set_intr_gate(X86_TRAP_GP, general_protection); - set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug); - set_intr_gate(X86_TRAP_MF, coprocessor_error); - set_intr_gate(X86_TRAP_AC, alignment_check); -#ifdef CONFIG_X86_MCE - set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK); -#endif - set_intr_gate(X86_TRAP_XF, simd_coprocessor_error); - - /* Reserve all the builtin and the syscall vector: */ - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) - set_bit(i, used_vectors); - -#ifdef CONFIG_IA32_EMULATION - set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat); - set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#endif - -#ifdef CONFIG_X86_32 - set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); - set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#endif + idt_setup_traps(); /* * Set the IDT descriptor to a fixed read-only location, so that the @@ -1030,20 +940,9 @@ void __init trap_init(void) */ cpu_init(); - /* - * X86_TRAP_DB and X86_TRAP_BP have been set - * in early_trap_init(). However, ITS works only after - * cpu_init() loads TSS. See comments in early_trap_init(). - */ - set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); - /* int3 can be called from all */ - set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); + idt_setup_ist_traps(); x86_init.irqs.trap_init(); -#ifdef CONFIG_X86_64 - memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16); - set_nmi_gate(X86_TRAP_DB, &debug); - set_nmi_gate(X86_TRAP_BP, &int3); -#endif + idt_setup_debugidt_traps(); } diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index b9389d72b2f7..d145a0b1f529 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -10,20 +10,22 @@ #define FRAME_HEADER_SIZE (sizeof(long) * 2) -/* - * This disables KASAN checking when reading a value from another task's stack, - * since the other task could be running on another CPU and could have poisoned - * the stack in the meantime. - */ -#define READ_ONCE_TASK_STACK(task, x) \ -({ \ - unsigned long val; \ - if (task == current) \ - val = READ_ONCE(x); \ - else \ - val = READ_ONCE_NOCHECK(x); \ - val; \ -}) +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + return state->regs ? &state->regs->ip : state->bp + 1; +} static void unwind_dump(struct unwind_state *state) { @@ -66,15 +68,6 @@ static void unwind_dump(struct unwind_state *state) } } -unsigned long unwind_get_return_address(struct unwind_state *state) -{ - if (unwind_done(state)) - return 0; - - return __kernel_text_address(state->ip) ? state->ip : 0; -} -EXPORT_SYMBOL_GPL(unwind_get_return_address); - static size_t regs_size(struct pt_regs *regs) { /* x86_32 regs from kernel mode are two words shorter: */ @@ -91,10 +84,8 @@ static bool in_entry_code(unsigned long ip) if (addr >= __entry_text_start && addr < __entry_text_end) return true; -#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) if (addr >= __irqentry_text_start && addr < __irqentry_text_end) return true; -#endif return false; } diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c index 039f36738e49..4f0e17b90463 100644 --- a/arch/x86/kernel/unwind_guess.c +++ b/arch/x86/kernel/unwind_guess.c @@ -19,6 +19,11 @@ unsigned long unwind_get_return_address(struct unwind_state *state) } EXPORT_SYMBOL_GPL(unwind_get_return_address); +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + return NULL; +} + bool unwind_next_frame(struct unwind_state *state) { struct stack_info *info = &state->stack_info; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c new file mode 100644 index 000000000000..570b70d3f604 --- /dev/null +++ b/arch/x86/kernel/unwind_orc.c @@ -0,0 +1,582 @@ +#include <linux/module.h> +#include <linux/sort.h> +#include <asm/ptrace.h> +#include <asm/stacktrace.h> +#include <asm/unwind.h> +#include <asm/orc_types.h> +#include <asm/orc_lookup.h> +#include <asm/sections.h> + +#define orc_warn(fmt, ...) \ + printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__) + +extern int __start_orc_unwind_ip[]; +extern int __stop_orc_unwind_ip[]; +extern struct orc_entry __start_orc_unwind[]; +extern struct orc_entry __stop_orc_unwind[]; + +static DEFINE_MUTEX(sort_mutex); +int *cur_orc_ip_table = __start_orc_unwind_ip; +struct orc_entry *cur_orc_table = __start_orc_unwind; + +unsigned int lookup_num_blocks; +bool orc_init; + +static inline unsigned long orc_ip(const int *ip) +{ + return (unsigned long)ip + *ip; +} + +static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table, + unsigned int num_entries, unsigned long ip) +{ + int *first = ip_table; + int *last = ip_table + num_entries - 1; + int *mid = first, *found = first; + + if (!num_entries) + return NULL; + + /* + * Do a binary range search to find the rightmost duplicate of a given + * starting address. Some entries are section terminators which are + * "weak" entries for ensuring there are no gaps. They should be + * ignored when they conflict with a real entry. + */ + while (first <= last) { + mid = first + ((last - first) / 2); + + if (orc_ip(mid) <= ip) { + found = mid; + first = mid + 1; + } else + last = mid - 1; + } + + return u_table + (found - ip_table); +} + +#ifdef CONFIG_MODULES +static struct orc_entry *orc_module_find(unsigned long ip) +{ + struct module *mod; + + mod = __module_address(ip); + if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip) + return NULL; + return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind, + mod->arch.num_orcs, ip); +} +#else +static struct orc_entry *orc_module_find(unsigned long ip) +{ + return NULL; +} +#endif + +static struct orc_entry *orc_find(unsigned long ip) +{ + if (!orc_init) + return NULL; + + /* For non-init vmlinux addresses, use the fast lookup table: */ + if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) { + unsigned int idx, start, stop; + + idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE; + + if (unlikely((idx >= lookup_num_blocks-1))) { + orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%lx\n", + idx, lookup_num_blocks, ip); + return NULL; + } + + start = orc_lookup[idx]; + stop = orc_lookup[idx + 1] + 1; + + if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) || + (__start_orc_unwind + stop > __stop_orc_unwind))) { + orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%lx\n", + idx, lookup_num_blocks, start, stop, ip); + return NULL; + } + + return __orc_find(__start_orc_unwind_ip + start, + __start_orc_unwind + start, stop - start, ip); + } + + /* vmlinux .init slow lookup: */ + if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext) + return __orc_find(__start_orc_unwind_ip, __start_orc_unwind, + __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); + + /* Module lookup: */ + return orc_module_find(ip); +} + +static void orc_sort_swap(void *_a, void *_b, int size) +{ + struct orc_entry *orc_a, *orc_b; + struct orc_entry orc_tmp; + int *a = _a, *b = _b, tmp; + int delta = _b - _a; + + /* Swap the .orc_unwind_ip entries: */ + tmp = *a; + *a = *b + delta; + *b = tmp - delta; + + /* Swap the corresponding .orc_unwind entries: */ + orc_a = cur_orc_table + (a - cur_orc_ip_table); + orc_b = cur_orc_table + (b - cur_orc_ip_table); + orc_tmp = *orc_a; + *orc_a = *orc_b; + *orc_b = orc_tmp; +} + +static int orc_sort_cmp(const void *_a, const void *_b) +{ + struct orc_entry *orc_a; + const int *a = _a, *b = _b; + unsigned long a_val = orc_ip(a); + unsigned long b_val = orc_ip(b); + + if (a_val > b_val) + return 1; + if (a_val < b_val) + return -1; + + /* + * The "weak" section terminator entries need to always be on the left + * to ensure the lookup code skips them in favor of real entries. + * These terminator entries exist to handle any gaps created by + * whitelisted .o files which didn't get objtool generation. + */ + orc_a = cur_orc_table + (a - cur_orc_ip_table); + return orc_a->sp_reg == ORC_REG_UNDEFINED ? -1 : 1; +} + +#ifdef CONFIG_MODULES +void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size, + void *_orc, size_t orc_size) +{ + int *orc_ip = _orc_ip; + struct orc_entry *orc = _orc; + unsigned int num_entries = orc_ip_size / sizeof(int); + + WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(*orc) != 0 || + num_entries != orc_size / sizeof(*orc)); + + /* + * The 'cur_orc_*' globals allow the orc_sort_swap() callback to + * associate an .orc_unwind_ip table entry with its corresponding + * .orc_unwind entry so they can both be swapped. + */ + mutex_lock(&sort_mutex); + cur_orc_ip_table = orc_ip; + cur_orc_table = orc; + sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap); + mutex_unlock(&sort_mutex); + + mod->arch.orc_unwind_ip = orc_ip; + mod->arch.orc_unwind = orc; + mod->arch.num_orcs = num_entries; +} +#endif + +void __init unwind_init(void) +{ + size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip; + size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind; + size_t num_entries = orc_ip_size / sizeof(int); + struct orc_entry *orc; + int i; + + if (!num_entries || orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(struct orc_entry) != 0 || + num_entries != orc_size / sizeof(struct orc_entry)) { + orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n"); + return; + } + + /* Sort the .orc_unwind and .orc_unwind_ip tables: */ + sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp, + orc_sort_swap); + + /* Initialize the fast lookup table: */ + lookup_num_blocks = orc_lookup_end - orc_lookup; + for (i = 0; i < lookup_num_blocks-1; i++) { + orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, + num_entries, + LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i)); + if (!orc) { + orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); + return; + } + + orc_lookup[i] = orc - __start_orc_unwind; + } + + /* Initialize the ending block: */ + orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries, + LOOKUP_STOP_IP); + if (!orc) { + orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); + return; + } + orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind; + + orc_init = true; +} + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + if (state->regs) + return &state->regs->ip; + + if (state->sp) + return (unsigned long *)state->sp - 1; + + return NULL; +} + +static bool stack_access_ok(struct unwind_state *state, unsigned long addr, + size_t len) +{ + struct stack_info *info = &state->stack_info; + + /* + * If the address isn't on the current stack, switch to the next one. + * + * We may have to traverse multiple stacks to deal with the possibility + * that info->next_sp could point to an empty stack and the address + * could be on a subsequent stack. + */ + while (!on_stack(info, (void *)addr, len)) + if (get_stack_info(info->next_sp, state->task, info, + &state->stack_mask)) + return false; + + return true; +} + +static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, + unsigned long *val) +{ + if (!stack_access_ok(state, addr, sizeof(long))) + return false; + + *val = READ_ONCE_TASK_STACK(state->task, *(unsigned long *)addr); + return true; +} + +#define REGS_SIZE (sizeof(struct pt_regs)) +#define SP_OFFSET (offsetof(struct pt_regs, sp)) +#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) +#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) + +static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, + unsigned long *ip, unsigned long *sp, bool full) +{ + size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; + size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET; + struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); + + if (IS_ENABLED(CONFIG_X86_64)) { + if (!stack_access_ok(state, addr, regs_size)) + return false; + + *ip = regs->ip; + *sp = regs->sp; + + return true; + } + + if (!stack_access_ok(state, addr, sp_offset)) + return false; + + *ip = regs->ip; + + if (user_mode(regs)) { + if (!stack_access_ok(state, addr + sp_offset, + REGS_SIZE - SP_OFFSET)) + return false; + + *sp = regs->sp; + } else + *sp = (unsigned long)®s->sp; + + return true; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; + enum stack_type prev_type = state->stack_info.type; + struct orc_entry *orc; + struct pt_regs *ptregs; + bool indirect = false; + + if (unwind_done(state)) + return false; + + /* Don't let modules unload while we're reading their ORC data. */ + preempt_disable(); + + /* Have we reached the end? */ + if (state->regs && user_mode(state->regs)) + goto done; + + /* + * Find the orc_entry associated with the text address. + * + * Decrement call return addresses by one so they work for sibling + * calls and calls to noreturn functions. + */ + orc = orc_find(state->signal ? state->ip : state->ip - 1); + if (!orc || orc->sp_reg == ORC_REG_UNDEFINED) + goto done; + orig_ip = state->ip; + + /* Find the previous frame's stack: */ + switch (orc->sp_reg) { + case ORC_REG_SP: + sp = state->sp + orc->sp_offset; + break; + + case ORC_REG_BP: + sp = state->bp + orc->sp_offset; + break; + + case ORC_REG_SP_INDIRECT: + sp = state->sp + orc->sp_offset; + indirect = true; + break; + + case ORC_REG_BP_INDIRECT: + sp = state->bp + orc->sp_offset; + indirect = true; + break; + + case ORC_REG_R10: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg R10 at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->r10; + break; + + case ORC_REG_R13: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg R13 at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->r13; + break; + + case ORC_REG_DI: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg DI at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->di; + break; + + case ORC_REG_DX: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg DX at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->dx; + break; + + default: + orc_warn("unknown SP base reg %d for ip %p\n", + orc->sp_reg, (void *)state->ip); + goto done; + } + + if (indirect) { + if (!deref_stack_reg(state, sp, &sp)) + goto done; + } + + /* Find IP, SP and possibly regs: */ + switch (orc->type) { + case ORC_TYPE_CALL: + ip_p = sp - sizeof(long); + + if (!deref_stack_reg(state, ip_p, &state->ip)) + goto done; + + state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, + state->ip, (void *)ip_p); + + state->sp = sp; + state->regs = NULL; + state->signal = false; + break; + + case ORC_TYPE_REGS: + if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { + orc_warn("can't dereference registers at %p for ip %p\n", + (void *)sp, (void *)orig_ip); + goto done; + } + + state->regs = (struct pt_regs *)sp; + state->full_regs = true; + state->signal = true; + break; + + case ORC_TYPE_REGS_IRET: + if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { + orc_warn("can't dereference iret registers at %p for ip %p\n", + (void *)sp, (void *)orig_ip); + goto done; + } + + ptregs = container_of((void *)sp, struct pt_regs, ip); + if ((unsigned long)ptregs >= prev_sp && + on_stack(&state->stack_info, ptregs, REGS_SIZE)) { + state->regs = ptregs; + state->full_regs = false; + } else + state->regs = NULL; + + state->signal = true; + break; + + default: + orc_warn("unknown .orc_unwind entry type %d\n", orc->type); + break; + } + + /* Find BP: */ + switch (orc->bp_reg) { + case ORC_REG_UNDEFINED: + if (state->regs && state->full_regs) + state->bp = state->regs->bp; + break; + + case ORC_REG_PREV_SP: + if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp)) + goto done; + break; + + case ORC_REG_BP: + if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp)) + goto done; + break; + + default: + orc_warn("unknown BP base reg %d for ip %p\n", + orc->bp_reg, (void *)orig_ip); + goto done; + } + + /* Prevent a recursive loop due to bad ORC data: */ + if (state->stack_info.type == prev_type && + on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) && + state->sp <= prev_sp) { + orc_warn("stack going in the wrong direction? ip=%p\n", + (void *)orig_ip); + goto done; + } + + preempt_enable(); + return true; + +done: + preempt_enable(); + state->stack_info.type = STACK_TYPE_UNKNOWN; + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + state->task = task; + + /* + * Refuse to unwind the stack of a task while it's executing on another + * CPU. This check is racy, but that's ok: the unwinder has other + * checks to prevent it from going off the rails. + */ + if (task_on_another_cpu(task)) + goto done; + + if (regs) { + if (user_mode(regs)) + goto done; + + state->ip = regs->ip; + state->sp = kernel_stack_pointer(regs); + state->bp = regs->bp; + state->regs = regs; + state->full_regs = true; + state->signal = true; + + } else if (task == current) { + asm volatile("lea (%%rip), %0\n\t" + "mov %%rsp, %1\n\t" + "mov %%rbp, %2\n\t" + : "=r" (state->ip), "=r" (state->sp), + "=r" (state->bp)); + + } else { + struct inactive_task_frame *frame = (void *)task->thread.sp; + + state->sp = task->thread.sp; + state->bp = READ_ONCE_NOCHECK(frame->bp); + state->ip = READ_ONCE_NOCHECK(frame->ret_addr); + } + + if (get_stack_info((unsigned long *)state->sp, state->task, + &state->stack_info, &state->stack_mask)) + return; + + /* + * The caller can provide the address of the first frame directly + * (first_frame) or indirectly (regs->sp) to indicate which stack frame + * to start unwinding at. Skip ahead until we reach it. + */ + + /* When starting from regs, skip the regs frame: */ + if (regs) { + unwind_next_frame(state); + return; + } + + /* Otherwise, skip ahead to the user-specified starting frame: */ + while (!unwind_done(state) && + (!on_stack(&state->stack_info, first_frame, sizeof(long)) || + state->sp <= (unsigned long)first_frame)) + unwind_next_frame(state); + + return; + +done: + state->stack_info.type = STACK_TYPE_UNKNOWN; + return; +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index c8a3b61be0aa..f05f00acac89 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -24,6 +24,7 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/page_types.h> +#include <asm/orc_lookup.h> #include <asm/cache.h> #include <asm/boot.h> @@ -148,6 +149,8 @@ SECTIONS BUG_TABLE + ORC_UNWIND_TABLE + . = ALIGN(PAGE_SIZE); __vvar_page = .; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 2688c7dc5323..3ea624452f93 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -89,6 +89,5 @@ config KVM_MMU_AUDIT # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig -source drivers/lguest/Kconfig endif # VIRTUALIZATION diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9b1dd114956a..04d750813c9d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -108,7 +108,7 @@ module_param(dbg, bool, 0644); (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))) #define PT64_DIR_BASE_ADDR_MASK \ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) #define PT64_LVL_ADDR_MASK(level) \ @@ -126,7 +126,7 @@ module_param(dbg, bool, 0644); * PT32_LEVEL_BITS))) - 1)) #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ - | shadow_x_mask | shadow_nx_mask) + | shadow_x_mask | shadow_nx_mask | shadow_me_mask) #define ACC_EXEC_MASK 1 #define ACC_WRITE_MASK PT_WRITABLE_MASK @@ -186,6 +186,7 @@ static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static u64 __read_mostly shadow_mmio_value; static u64 __read_mostly shadow_present_mask; +static u64 __read_mostly shadow_me_mask; /* * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. @@ -349,7 +350,7 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) */ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask) + u64 acc_track_mask, u64 me_mask) { BUG_ON(!dirty_mask != !accessed_mask); BUG_ON(!accessed_mask && !acc_track_mask); @@ -362,6 +363,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, shadow_x_mask = x_mask; shadow_present_mask = p_mask; shadow_acc_track_mask = acc_track_mask; + shadow_me_mask = me_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -2433,7 +2435,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask; + shadow_user_mask | shadow_x_mask | shadow_me_mask; if (sp_ad_disabled(sp)) spte |= shadow_acc_track_value; @@ -2745,6 +2747,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, pte_access &= ~ACC_WRITE_MASK; spte |= (u64)pfn << PAGE_SHIFT; + spte |= shadow_me_mask; if (pte_access & ACC_WRITE_MASK) { @@ -4106,16 +4109,28 @@ void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { bool uses_nx = context->nx || context->base_role.smep_andnot_wp; + struct rsvd_bits_validate *shadow_zero_check; + int i; /* * Passing "true" to the last argument is okay; it adds a check * on bit 8 of the SPTEs which KVM doesn't use anyway. */ - __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, + shadow_zero_check = &context->shadow_zero_check; + __reset_rsvds_bits_mask(vcpu, shadow_zero_check, boot_cpu_data.x86_phys_bits, context->shadow_root_level, uses_nx, guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), true); + + if (!shadow_me_mask) + return; + + for (i = context->shadow_root_level; --i >= 0;) { + shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; + shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; + } + } EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); @@ -4133,17 +4148,29 @@ static void reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { + struct rsvd_bits_validate *shadow_zero_check; + int i; + + shadow_zero_check = &context->shadow_zero_check; + if (boot_cpu_is_amd()) - __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, + __reset_rsvds_bits_mask(vcpu, shadow_zero_check, boot_cpu_data.x86_phys_bits, context->shadow_root_level, false, boot_cpu_has(X86_FEATURE_GBPAGES), true, true); else - __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, + __reset_rsvds_bits_mask_ept(shadow_zero_check, boot_cpu_data.x86_phys_bits, false); + if (!shadow_me_mask) + return; + + for (i = context->shadow_root_level; --i >= 0;) { + shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; + shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; + } } /* diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index af256b786a70..8dbd8dbc83eb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1167,9 +1167,9 @@ static void avic_init_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb; struct kvm_arch *vm_data = &svm->vcpu.kvm->arch; - phys_addr_t bpa = page_to_phys(svm->avic_backing_page); - phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page); - phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page); + phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); + phys_addr_t lpa = __sme_set(page_to_phys(vm_data->avic_logical_id_table_page)); + phys_addr_t ppa = __sme_set(page_to_phys(vm_data->avic_physical_id_table_page)); vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; @@ -1232,8 +1232,8 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_MWAIT); } - control->iopm_base_pa = iopm_base; - control->msrpm_base_pa = __pa(svm->msrpm); + control->iopm_base_pa = __sme_set(iopm_base); + control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); control->int_ctl = V_INTR_MASKING_MASK; init_seg(&save->es); @@ -1377,9 +1377,9 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return -EINVAL; new_entry = READ_ONCE(*entry); - new_entry = (page_to_phys(svm->avic_backing_page) & - AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | - AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; + new_entry = __sme_set((page_to_phys(svm->avic_backing_page) & + AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | + AVIC_PHYSICAL_ID_ENTRY_VALID_MASK); WRITE_ONCE(*entry, new_entry); svm->avic_physical_id_cache = entry; @@ -1647,7 +1647,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb = page_address(page); clear_page(svm->vmcb); - svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; + svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT); svm->asid_generation = 0; init_vmcb(svm); @@ -1675,7 +1675,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); + __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); @@ -2330,7 +2330,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) u64 pdpte; int ret; - ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte, + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte, offset_in_page(cr3) + index * 8, 8); if (ret) return 0; @@ -2342,7 +2342,7 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->control.nested_cr3 = root; + svm->vmcb->control.nested_cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_NPT); svm_flush_tlb(vcpu); } @@ -2873,7 +2873,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) svm->nested.msrpm[p] = svm->msrpm[p] | value; } - svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); + svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm)); return true; } @@ -4506,7 +4506,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, irq.vector); *svm = to_svm(vcpu); - vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page); + vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page)); vcpu_info->vector = irq.vector; return 0; @@ -4557,7 +4557,8 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, struct amd_iommu_pi_data pi; /* Try to enable guest_mode in IRTE */ - pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK; + pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & + AVIC_HPA_MASK); pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id, svm->vcpu.vcpu_id); pi.is_guest_mode = true; @@ -5006,7 +5007,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.cr3 = root; + svm->vmcb->save.cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_CR); svm_flush_tlb(vcpu); } @@ -5015,7 +5016,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->control.nested_cr3 = root; + svm->vmcb->control.nested_cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_NPT); /* Also sync guest cr3 here in case we live migrate */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c6ef2940119b..70b90c0810d0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6556,7 +6556,7 @@ void vmx_enable_tdp(void) enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK, cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, - VMX_EPT_RWX_MASK); + VMX_EPT_RWX_MASK, 0ull); ept_set_mmio_spte_mask(); kvm_enable_tdp(); @@ -8779,7 +8779,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) vector = exit_intr_info & INTR_INFO_VECTOR_MASK; desc = (gate_desc *)vmx->host_idt_base + vector; - entry = gate_offset(*desc); + entry = gate_offset(desc); asm volatile( #ifdef CONFIG_X86_64 "mov %%" _ASM_SP ", %[sp]\n\t" diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 272320eb328c..ef5102f80497 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -54,6 +54,7 @@ #include <linux/kvm_irqfd.h> #include <linux/irqbypass.h> #include <linux/sched/stat.h> +#include <linux/mem_encrypt.h> #include <trace/events/kvm.h> @@ -6125,7 +6126,7 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, - PT_PRESENT_MASK, 0); + PT_PRESENT_MASK, 0, sme_me_mask); kvm_timer_init(); perf_register_guest_info_callbacks(&kvm_guest_cbs); diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig deleted file mode 100644 index 08f41caada45..000000000000 --- a/arch/x86/lguest/Kconfig +++ /dev/null @@ -1,14 +0,0 @@ -config LGUEST_GUEST - bool "Lguest guest support" - depends on X86_32 && PARAVIRT && PCI - select TTY - select VIRTUALIZATION - select VIRTIO - select VIRTIO_CONSOLE - help - Lguest is a tiny in-kernel hypervisor. Selecting this will - allow your kernel to boot under lguest. This option will increase - your kernel size by about 10k. If in doubt, say N. - - If you say Y here, make sure you say Y (or M) to the virtio block - and net drivers which lguest needs. diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile deleted file mode 100644 index 8f38d577a2fa..000000000000 --- a/arch/x86/lguest/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -obj-y := head_32.o boot.o -CFLAGS_boot.o := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c deleted file mode 100644 index 99472698c931..000000000000 --- a/arch/x86/lguest/boot.c +++ /dev/null @@ -1,1558 +0,0 @@ -/*P:010 - * A hypervisor allows multiple Operating Systems to run on a single machine. - * To quote David Wheeler: "Any problem in computer science can be solved with - * another layer of indirection." - * - * We keep things simple in two ways. First, we start with a normal Linux - * kernel and insert a module (lg.ko) which allows us to run other Linux - * kernels the same way we'd run processes. We call the first kernel the Host, - * and the others the Guests. The program which sets up and configures Guests - * (such as the example in tools/lguest/lguest.c) is called the Launcher. - * - * Secondly, we only run specially modified Guests, not normal kernels: setting - * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows - * how to be a Guest at boot time. This means that you can use the same kernel - * you boot normally (ie. as a Host) as a Guest. - * - * These Guests know that they cannot do privileged operations, such as disable - * interrupts, and that they have to ask the Host to do such things explicitly. - * This file consists of all the replacements for such low-level native - * hardware operations: these special Guest versions call the Host. - * - * So how does the kernel know it's a Guest? We'll see that later, but let's - * just say that we end up here where we replace the native functions various - * "paravirt" structures with our Guest versions, then boot like normal. -:*/ - -/* - * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ -#include <linux/kernel.h> -#include <linux/start_kernel.h> -#include <linux/string.h> -#include <linux/console.h> -#include <linux/screen_info.h> -#include <linux/irq.h> -#include <linux/interrupt.h> -#include <linux/clocksource.h> -#include <linux/clockchips.h> -#include <linux/lguest.h> -#include <linux/lguest_launcher.h> -#include <linux/virtio_console.h> -#include <linux/pm.h> -#include <linux/export.h> -#include <linux/pci.h> -#include <linux/virtio_pci.h> -#include <asm/acpi.h> -#include <asm/apic.h> -#include <asm/lguest.h> -#include <asm/paravirt.h> -#include <asm/param.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/desc.h> -#include <asm/setup.h> -#include <asm/e820/api.h> -#include <asm/mce.h> -#include <asm/io.h> -#include <asm/fpu/api.h> -#include <asm/stackprotector.h> -#include <asm/reboot.h> /* for struct machine_ops */ -#include <asm/kvm_para.h> -#include <asm/pci_x86.h> -#include <asm/pci-direct.h> - -/*G:010 - * Welcome to the Guest! - * - * The Guest in our tale is a simple creature: identical to the Host but - * behaving in simplified but equivalent ways. In particular, the Guest is the - * same kernel as the Host (or at least, built from the same source code). -:*/ - -struct lguest_data lguest_data = { - .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, - .noirq_iret = (u32)lguest_noirq_iret, - .kernel_address = PAGE_OFFSET, - .blocked_interrupts = { 1 }, /* Block timer interrupts */ - .syscall_vec = IA32_SYSCALL_VECTOR, -}; - -/*G:037 - * async_hcall() is pretty simple: I'm quite proud of it really. We have a - * ring buffer of stored hypercalls which the Host will run though next time we - * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall - * arguments, and a "hcall_status" word which is 0 if the call is ready to go, - * and 255 once the Host has finished with it. - * - * If we come around to a slot which hasn't been finished, then the table is - * full and we just make the hypercall directly. This has the nice side - * effect of causing the Host to run all the stored calls in the ring buffer - * which empties it for next time! - */ -static void async_hcall(unsigned long call, unsigned long arg1, - unsigned long arg2, unsigned long arg3, - unsigned long arg4) -{ - /* Note: This code assumes we're uniprocessor. */ - static unsigned int next_call; - unsigned long flags; - - /* - * Disable interrupts if not already disabled: we don't want an - * interrupt handler making a hypercall while we're already doing - * one! - */ - local_irq_save(flags); - if (lguest_data.hcall_status[next_call] != 0xFF) { - /* Table full, so do normal hcall which will flush table. */ - hcall(call, arg1, arg2, arg3, arg4); - } else { - lguest_data.hcalls[next_call].arg0 = call; - lguest_data.hcalls[next_call].arg1 = arg1; - lguest_data.hcalls[next_call].arg2 = arg2; - lguest_data.hcalls[next_call].arg3 = arg3; - lguest_data.hcalls[next_call].arg4 = arg4; - /* Arguments must all be written before we mark it to go */ - wmb(); - lguest_data.hcall_status[next_call] = 0; - if (++next_call == LHCALL_RING_SIZE) - next_call = 0; - } - local_irq_restore(flags); -} - -/*G:035 - * Notice the lazy_hcall() above, rather than hcall(). This is our first real - * optimization trick! - * - * When lazy_mode is set, it means we're allowed to defer all hypercalls and do - * them as a batch when lazy_mode is eventually turned off. Because hypercalls - * are reasonably expensive, batching them up makes sense. For example, a - * large munmap might update dozens of page table entries: that code calls - * paravirt_enter_lazy_mmu(), does the dozen updates, then calls - * lguest_leave_lazy_mode(). - * - * So, when we're in lazy mode, we call async_hcall() to store the call for - * future processing: - */ -static void lazy_hcall1(unsigned long call, unsigned long arg1) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, 0, 0, 0); - else - async_hcall(call, arg1, 0, 0, 0); -} - -/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ -static void lazy_hcall2(unsigned long call, - unsigned long arg1, - unsigned long arg2) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, 0, 0); - else - async_hcall(call, arg1, arg2, 0, 0); -} - -static void lazy_hcall3(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, arg3, 0); - else - async_hcall(call, arg1, arg2, arg3, 0); -} - -#ifdef CONFIG_X86_PAE -static void lazy_hcall4(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, arg3, arg4); - else - async_hcall(call, arg1, arg2, arg3, arg4); -} -#endif - -/*G:036 - * When lazy mode is turned off, we issue the do-nothing hypercall to - * flush any stored calls, and call the generic helper to reset the - * per-cpu lazy mode variable. - */ -static void lguest_leave_lazy_mmu_mode(void) -{ - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); - paravirt_leave_lazy_mmu(); -} - -/* - * We also catch the end of context switch; we enter lazy mode for much of - * that too, so again we need to flush here. - * - * (Technically, this is lazy CPU mode, and normally we're in lazy MMU - * mode, but unlike Xen, lguest doesn't care about the difference). - */ -static void lguest_end_context_switch(struct task_struct *next) -{ - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); - paravirt_end_context_switch(next); -} - -/*G:032 - * After that diversion we return to our first native-instruction - * replacements: four functions for interrupt control. - * - * The simplest way of implementing these would be to have "turn interrupts - * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow: - * these are by far the most commonly called functions of those we override. - * - * So instead we keep an "irq_enabled" field inside our "struct lguest_data", - * which the Guest can update with a single instruction. The Host knows to - * check there before it tries to deliver an interrupt. - */ - -/* - * save_flags() is expected to return the processor state (ie. "flags"). The - * flags word contains all kind of stuff, but in practice Linux only cares - * about the interrupt flag. Our "save_flags()" just returns that. - */ -asmlinkage __visible unsigned long lguest_save_fl(void) -{ - return lguest_data.irq_enabled; -} - -/* Interrupts go off... */ -asmlinkage __visible void lguest_irq_disable(void) -{ - lguest_data.irq_enabled = 0; -} - -/* - * Let's pause a moment. Remember how I said these are called so often? - * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to - * break some rules. In particular, these functions are assumed to save their - * own registers if they need to: normal C functions assume they can trash the - * eax register. To use normal C functions, we use - * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the - * C function, then restores it. - */ -PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl); -PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable); -/*:*/ - -/* These are in head_32.S */ -extern void lg_irq_enable(void); -extern void lg_restore_fl(unsigned long flags); - -/*M:003 - * We could be more efficient in our checking of outstanding interrupts, rather - * than using a branch. One way would be to put the "irq_enabled" field in a - * page by itself, and have the Host write-protect it when an interrupt comes - * in when irqs are disabled. There will then be a page fault as soon as - * interrupts are re-enabled. - * - * A better method is to implement soft interrupt disable generally for x86: - * instead of disabling interrupts, we set a flag. If an interrupt does come - * in, we then disable them for real. This is uncommon, so we could simply use - * a hypercall for interrupt control and not worry about efficiency. -:*/ - -/*G:034 - * The Interrupt Descriptor Table (IDT). - * - * The IDT tells the processor what to do when an interrupt comes in. Each - * entry in the table is a 64-bit descriptor: this holds the privilege level, - * address of the handler, and... well, who cares? The Guest just asks the - * Host to make the change anyway, because the Host controls the real IDT. - */ -static void lguest_write_idt_entry(gate_desc *dt, - int entrynum, const gate_desc *g) -{ - /* - * The gate_desc structure is 8 bytes long: we hand it to the Host in - * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors - * around like this; typesafety wasn't a big concern in Linux's early - * years. - */ - u32 *desc = (u32 *)g; - /* Keep the local copy up to date. */ - native_write_idt_entry(dt, entrynum, g); - /* Tell Host about this new entry. */ - hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0); -} - -/* - * Changing to a different IDT is very rare: we keep the IDT up-to-date every - * time it is written, so we can simply loop through all entries and tell the - * Host about them. - */ -static void lguest_load_idt(const struct desc_ptr *desc) -{ - unsigned int i; - struct desc_struct *idt = (void *)desc->address; - - for (i = 0; i < (desc->size+1)/8; i++) - hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0); -} - -/* - * The Global Descriptor Table. - * - * The Intel architecture defines another table, called the Global Descriptor - * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt" - * instruction, and then several other instructions refer to entries in the - * table. There are three entries which the Switcher needs, so the Host simply - * controls the entire thing and the Guest asks it to make changes using the - * LOAD_GDT hypercall. - * - * This is the exactly like the IDT code. - */ -static void lguest_load_gdt(const struct desc_ptr *desc) -{ - unsigned int i; - struct desc_struct *gdt = (void *)desc->address; - - for (i = 0; i < (desc->size+1)/8; i++) - hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0); -} - -/* - * For a single GDT entry which changes, we simply change our copy and - * then tell the host about it. - */ -static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, - const void *desc, int type) -{ - native_write_gdt_entry(dt, entrynum, desc, type); - /* Tell Host about this new entry. */ - hcall(LHCALL_LOAD_GDT_ENTRY, entrynum, - dt[entrynum].a, dt[entrynum].b, 0); -} - -/* - * There are three "thread local storage" GDT entries which change - * on every context switch (these three entries are how glibc implements - * __thread variables). As an optimization, we have a hypercall - * specifically for this case. - * - * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall - * which took a range of entries? - */ -static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) -{ - /* - * There's one problem which normal hardware doesn't have: the Host - * can't handle us removing entries we're currently using. So we clear - * the GS register here: if it's needed it'll be reloaded anyway. - */ - lazy_load_gs(0); - lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); -} - -/*G:038 - * That's enough excitement for now, back to ploughing through each of the - * different pv_ops structures (we're about 1/3 of the way through). - * - * This is the Local Descriptor Table, another weird Intel thingy. Linux only - * uses this for some strange applications like Wine. We don't do anything - * here, so they'll get an informative and friendly Segmentation Fault. - */ -static void lguest_set_ldt(const void *addr, unsigned entries) -{ -} - -/* - * This loads a GDT entry into the "Task Register": that entry points to a - * structure called the Task State Segment. Some comments scattered though the - * kernel code indicate that this used for task switching in ages past, along - * with blood sacrifice and astrology. - * - * Now there's nothing interesting in here that we don't get told elsewhere. - * But the native version uses the "ltr" instruction, which makes the Host - * complain to the Guest about a Segmentation Fault and it'll oops. So we - * override the native version with a do-nothing version. - */ -static void lguest_load_tr_desc(void) -{ -} - -/* - * The "cpuid" instruction is a way of querying both the CPU identity - * (manufacturer, model, etc) and its features. It was introduced before the - * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. - * As you might imagine, after a decade and a half this treatment, it is now a - * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. - * - * This instruction even it has its own Wikipedia entry. The Wikipedia entry - * has been translated into 6 languages. I am not making this up! - * - * We could get funky here and identify ourselves as "GenuineLguest", but - * instead we just use the real "cpuid" instruction. Then I pretty much turned - * off feature bits until the Guest booted. (Don't say that: you'll damage - * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is - * hardly future proof.) No one's listening! They don't like you anyway, - * parenthetic weirdo! - * - * Replacing the cpuid so we can turn features off is great for the kernel, but - * anyone (including userspace) can just use the raw "cpuid" instruction and - * the Host won't even notice since it isn't privileged. So we try not to get - * too worked up about it. - */ -static void lguest_cpuid(unsigned int *ax, unsigned int *bx, - unsigned int *cx, unsigned int *dx) -{ - int function = *ax; - - native_cpuid(ax, bx, cx, dx); - switch (function) { - /* - * CPUID 0 gives the highest legal CPUID number (and the ID string). - * We futureproof our code a little by sticking to known CPUID values. - */ - case 0: - if (*ax > 5) - *ax = 5; - break; - - /* - * CPUID 1 is a basic feature request. - * - * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 - * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. - */ - case 1: - *cx &= 0x00002201; - *dx &= 0x07808151; - /* - * The Host can do a nice optimization if it knows that the - * kernel mappings (addresses above 0xC0000000 or whatever - * PAGE_OFFSET is set to) haven't changed. But Linux calls - * flush_tlb_user() for both user and kernel mappings unless - * the Page Global Enable (PGE) feature bit is set. - */ - *dx |= 0x00002000; - /* - * We also lie, and say we're family id 5. 6 or greater - * leads to a rdmsr in early_init_intel which we can't handle. - * Family ID is returned as bits 8-12 in ax. - */ - *ax &= 0xFFFFF0FF; - *ax |= 0x00000500; - break; - - /* - * This is used to detect if we're running under KVM. We might be, - * but that's a Host matter, not us. So say we're not. - */ - case KVM_CPUID_SIGNATURE: - *bx = *cx = *dx = 0; - break; - - /* - * 0x80000000 returns the highest Extended Function, so we futureproof - * like we do above by limiting it to known fields. - */ - case 0x80000000: - if (*ax > 0x80000008) - *ax = 0x80000008; - break; - - /* - * PAE systems can mark pages as non-executable. Linux calls this the - * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced - * Virus Protection). We just switch it off here, since we don't - * support it. - */ - case 0x80000001: - *dx &= ~(1 << 20); - break; - } -} - -/* - * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. - * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother - * it. The Host needs to know when the Guest wants to change them, so we have - * a whole series of functions like read_cr0() and write_cr0(). - * - * We start with cr0. cr0 allows you to turn on and off all kinds of basic - * features, but the only cr0 bit that Linux ever used at runtime was the - * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8) - * - * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if - * the floating point unit is used. Which allows us to restore FPU state - * lazily after a task switch if we wanted to, but wouldn't a name like - * "FPUTRAP bit" be a little less cryptic? - * - * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore - * cr0. - */ -static void lguest_write_cr0(unsigned long val) -{ -} - -static unsigned long lguest_read_cr0(void) -{ - return 0; -} - -/* - * cr2 is the virtual address of the last page fault, which the Guest only ever - * reads. The Host kindly writes this into our "struct lguest_data", so we - * just read it out of there. - */ -static unsigned long lguest_read_cr2(void) -{ - return lguest_data.cr2; -} - -/* See lguest_set_pte() below. */ -static bool cr3_changed = false; -static unsigned long current_cr3; - -/* - * cr3 is the current toplevel pagetable page: the principle is the same as - * cr0. Keep a local copy, and tell the Host when it changes. - */ -static void lguest_write_cr3(unsigned long cr3) -{ - lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); - current_cr3 = cr3; - - /* These two page tables are simple, linear, and used during boot */ - if (cr3 != __pa_symbol(swapper_pg_dir) && - cr3 != __pa_symbol(initial_page_table)) - cr3_changed = true; -} - -static unsigned long lguest_read_cr3(void) -{ - return current_cr3; -} - -/* cr4 is used to enable and disable PGE, but we don't care. */ -static unsigned long lguest_read_cr4(void) -{ - return 0; -} - -static void lguest_write_cr4(unsigned long val) -{ -} - -/* - * Page Table Handling. - * - * Now would be a good time to take a rest and grab a coffee or similarly - * relaxing stimulant. The easy parts are behind us, and the trek gradually - * winds uphill from here. - * - * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU - * maps virtual addresses to physical addresses using "page tables". We could - * use one huge index of 1 million entries: each address is 4 bytes, so that's - * 1024 pages just to hold the page tables. But since most virtual addresses - * are unused, we use a two level index which saves space. The cr3 register - * contains the physical address of the top level "page directory" page, which - * contains physical addresses of up to 1024 second-level pages. Each of these - * second level pages contains up to 1024 physical addresses of actual pages, - * or Page Table Entries (PTEs). - * - * Here's a diagram, where arrows indicate physical addresses: - * - * cr3 ---> +---------+ - * | --------->+---------+ - * | | | PADDR1 | - * Mid-level | | PADDR2 | - * (PMD) page | | | - * | | Lower-level | - * | | (PTE) page | - * | | | | - * .... .... - * - * So to convert a virtual address to a physical address, we look up the top - * level, which points us to the second level, which gives us the physical - * address of that page. If the top level entry was not present, or the second - * level entry was not present, then the virtual address is invalid (we - * say "the page was not mapped"). - * - * Put another way, a 32-bit virtual address is divided up like so: - * - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>| - * Index into top Index into second Offset within page - * page directory page pagetable page - * - * Now, unfortunately, this isn't the whole story: Intel added Physical Address - * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). - * These are held in 64-bit page table entries, so we can now only fit 512 - * entries in a page, and the neat three-level tree breaks down. - * - * The result is a four level page table: - * - * cr3 --> [ 4 Upper ] - * [ Level ] - * [ Entries ] - * [(PUD Page)]---> +---------+ - * | --------->+---------+ - * | | | PADDR1 | - * Mid-level | | PADDR2 | - * (PMD) page | | | - * | | Lower-level | - * | | (PTE) page | - * | | | | - * .... .... - * - * - * And the virtual address is decoded as: - * - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| - * Index into Index into mid Index into lower Offset within page - * top entries directory page pagetable page - * - * It's too hard to switch between these two formats at runtime, so Linux only - * supports one or the other depending on whether CONFIG_X86_PAE is set. Many - * distributions turn it on, and not just for people with silly amounts of - * memory: the larger PTE entries allow room for the NX bit, which lets the - * kernel disable execution of pages and increase security. - * - * This was a problem for lguest, which couldn't run on these distributions; - * then Matias Zabaljauregui figured it all out and implemented it, and only a - * handful of puppies were crushed in the process! - * - * Back to our point: the kernel spends a lot of time changing both the - * top-level page directory and lower-level pagetable pages. The Guest doesn't - * know physical addresses, so while it maintains these page tables exactly - * like normal, it also needs to keep the Host informed whenever it makes a - * change: the Host will create the real page tables based on the Guests'. - */ - -/* - * The Guest calls this after it has set a second-level entry (pte), ie. to map - * a page into a process' address space. We tell the Host the toplevel and - * address this corresponds to. The Guest uses one pagetable per process, so - * we need to tell the Host which one we're changing (mm->pgd). - */ -static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ -#ifdef CONFIG_X86_PAE - /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ - lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, - ptep->pte_low, ptep->pte_high); -#else - lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); -#endif -} - -/* This is the "set and update" combo-meal-deal version. */ -static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) -{ - native_set_pte(ptep, pteval); - lguest_pte_update(mm, addr, ptep); -} - -/* - * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd - * to set a middle-level entry when PAE is activated. - * - * Again, we set the entry then tell the Host which page we changed, - * and the index of the entry we changed. - */ -#ifdef CONFIG_X86_PAE -static void lguest_set_pud(pud_t *pudp, pud_t pudval) -{ - native_set_pud(pudp, pudval); - - /* 32 bytes aligned pdpt address and the index. */ - lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0, - (__pa(pudp) & 0x1F) / sizeof(pud_t)); -} - -static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); -} -#else - -/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */ -static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); -} -#endif - -/* - * There are a couple of legacy places where the kernel sets a PTE, but we - * don't know the top level any more. This is useless for us, since we don't - * know which pagetable is changing or what address, so we just tell the Host - * to forget all of them. Fortunately, this is very rare. - * - * ... except in early boot when the kernel sets up the initial pagetables, - * which makes booting astonishingly slow: 48 seconds! So we don't even tell - * the Host anything changed until we've done the first real page table switch, - * which brings boot back to 4.3 seconds. - */ -static void lguest_set_pte(pte_t *ptep, pte_t pteval) -{ - native_set_pte(ptep, pteval); - if (cr3_changed) - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -#ifdef CONFIG_X86_PAE -/* - * With 64-bit PTE values, we need to be careful setting them: if we set 32 - * bits at a time, the hardware could see a weird half-set entry. These - * versions ensure we update all 64 bits at once. - */ -static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) -{ - native_set_pte_atomic(ptep, pte); - if (cr3_changed) - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - native_pte_clear(mm, addr, ptep); - lguest_pte_update(mm, addr, ptep); -} - -static void lguest_pmd_clear(pmd_t *pmdp) -{ - lguest_set_pmd(pmdp, __pmd(0)); -} -#endif - -/* - * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on - * native page table operations. On native hardware you can set a new page - * table entry whenever you want, but if you want to remove one you have to do - * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). - * - * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only - * called when a valid entry is written, not when it's removed (ie. marked not - * present). Instead, this is where we come when the Guest wants to remove a - * page table entry: we tell the Host to set that entry to 0 (ie. the present - * bit is zero). - */ -static void lguest_flush_tlb_single(unsigned long addr) -{ - /* Simply set it to zero: if it was not, it will fault back in. */ - lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0); -} - -/* - * This is what happens after the Guest has removed a large number of entries. - * This tells the Host that any of the page table entries for userspace might - * have changed, ie. virtual addresses below PAGE_OFFSET. - */ -static void lguest_flush_tlb_user(void) -{ - lazy_hcall1(LHCALL_FLUSH_TLB, 0); -} - -/* - * This is called when the kernel page tables have changed. That's not very - * common (unless the Guest is using highmem, which makes the Guest extremely - * slow), so it's worth separating this from the user flushing above. - */ -static void lguest_flush_tlb_kernel(void) -{ - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -/* - * The Unadvanced Programmable Interrupt Controller. - * - * This is an attempt to implement the simplest possible interrupt controller. - * I spent some time looking though routines like set_irq_chip_and_handler, - * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and - * I *think* this is as simple as it gets. - * - * We can tell the Host what interrupts we want blocked ready for using the - * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as - * simple as setting a bit. We don't actually "ack" interrupts as such, we - * just mask and unmask them. I wonder if we should be cleverer? - */ -static void disable_lguest_irq(struct irq_data *data) -{ - set_bit(data->irq, lguest_data.blocked_interrupts); -} - -static void enable_lguest_irq(struct irq_data *data) -{ - clear_bit(data->irq, lguest_data.blocked_interrupts); -} - -/* This structure describes the lguest IRQ controller. */ -static struct irq_chip lguest_irq_controller = { - .name = "lguest", - .irq_mask = disable_lguest_irq, - .irq_mask_ack = disable_lguest_irq, - .irq_unmask = enable_lguest_irq, -}; - -/* - * Interrupt descriptors are allocated as-needed, but low-numbered ones are - * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it - * tells us the irq is already used: other errors (ie. ENOMEM) we take - * seriously. - */ -static int lguest_setup_irq(unsigned int irq) -{ - struct irq_desc *desc; - int err; - - /* Returns -ve error or vector number. */ - err = irq_alloc_desc_at(irq, 0); - if (err < 0 && err != -EEXIST) - return err; - - /* - * Tell the Linux infrastructure that the interrupt is - * controlled by our level-based lguest interrupt controller. - */ - irq_set_chip_and_handler_name(irq, &lguest_irq_controller, - handle_level_irq, "level"); - - /* Some systems map "vectors" to interrupts weirdly. Not us! */ - desc = irq_to_desc(irq); - __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc); - return 0; -} - -static int lguest_enable_irq(struct pci_dev *dev) -{ - int err; - u8 line = 0; - - /* We literally use the PCI interrupt line as the irq number. */ - pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line); - err = lguest_setup_irq(line); - if (!err) - dev->irq = line; - return err; -} - -/* We don't do hotplug PCI, so this shouldn't be called. */ -static void lguest_disable_irq(struct pci_dev *dev) -{ - WARN_ON(1); -} - -/* - * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware - * interrupt (except 128, which is used for system calls). - */ -static void __init lguest_init_IRQ(void) -{ - unsigned int i; - - for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) { - if (i != IA32_SYSCALL_VECTOR) - set_intr_gate(i, irq_entries_start + - 8 * (i - FIRST_EXTERNAL_VECTOR)); - } - - /* - * This call is required to set up for 4k stacks, where we have - * separate stacks for hard and soft interrupts. - */ - irq_ctx_init(smp_processor_id()); -} - -/* - * Time. - * - * It would be far better for everyone if the Guest had its own clock, but - * until then the Host gives us the time on every interrupt. - */ -static void lguest_get_wallclock(struct timespec *now) -{ - *now = lguest_data.time; -} - -/* - * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us - * what speed it runs at, or 0 if it's unusable as a reliable clock source. - * This matches what we want here: if we return 0 from this function, the x86 - * TSC clock will give up and not register itself. - */ -static unsigned long lguest_tsc_khz(void) -{ - return lguest_data.tsc_khz; -} - -/* - * If we can't use the TSC, the kernel falls back to our lower-priority - * "lguest_clock", where we read the time value given to us by the Host. - */ -static u64 lguest_clock_read(struct clocksource *cs) -{ - unsigned long sec, nsec; - - /* - * Since the time is in two parts (seconds and nanoseconds), we risk - * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, - * and getting 99 and 0. As Linux tends to come apart under the stress - * of time travel, we must be careful: - */ - do { - /* First we read the seconds part. */ - sec = lguest_data.time.tv_sec; - /* - * This read memory barrier tells the compiler and the CPU that - * this can't be reordered: we have to complete the above - * before going on. - */ - rmb(); - /* Now we read the nanoseconds part. */ - nsec = lguest_data.time.tv_nsec; - /* Make sure we've done that. */ - rmb(); - /* Now if the seconds part has changed, try again. */ - } while (unlikely(lguest_data.time.tv_sec != sec)); - - /* Our lguest clock is in real nanoseconds. */ - return sec*1000000000ULL + nsec; -} - -/* This is the fallback clocksource: lower priority than the TSC clocksource. */ -static struct clocksource lguest_clock = { - .name = "lguest", - .rating = 200, - .read = lguest_clock_read, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -/* - * We also need a "struct clock_event_device": Linux asks us to set it to go - * off some time in the future. Actually, James Morris figured all this out, I - * just applied the patch. - */ -static int lguest_clockevent_set_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - /* FIXME: I don't think this can ever happen, but James tells me he had - * to put this code in. Maybe we should remove it now. Anyone? */ - if (delta < LG_CLOCK_MIN_DELTA) { - if (printk_ratelimit()) - printk(KERN_DEBUG "%s: small delta %lu ns\n", - __func__, delta); - return -ETIME; - } - - /* Please wake us this far in the future. */ - hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0); - return 0; -} - -static int lguest_clockevent_shutdown(struct clock_event_device *evt) -{ - /* A 0 argument shuts the clock down. */ - hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0); - return 0; -} - -/* This describes our primitive timer chip. */ -static struct clock_event_device lguest_clockevent = { - .name = "lguest", - .features = CLOCK_EVT_FEAT_ONESHOT, - .set_next_event = lguest_clockevent_set_next_event, - .set_state_shutdown = lguest_clockevent_shutdown, - .rating = INT_MAX, - .mult = 1, - .shift = 0, - .min_delta_ns = LG_CLOCK_MIN_DELTA, - .min_delta_ticks = LG_CLOCK_MIN_DELTA, - .max_delta_ns = LG_CLOCK_MAX_DELTA, - .max_delta_ticks = LG_CLOCK_MAX_DELTA, -}; - -/* - * This is the Guest timer interrupt handler (hardware interrupt 0). We just - * call the clockevent infrastructure and it does whatever needs doing. - */ -static void lguest_time_irq(struct irq_desc *desc) -{ - unsigned long flags; - - /* Don't interrupt us while this is running. */ - local_irq_save(flags); - lguest_clockevent.event_handler(&lguest_clockevent); - local_irq_restore(flags); -} - -/* - * At some point in the boot process, we get asked to set up our timing - * infrastructure. The kernel doesn't expect timer interrupts before this, but - * we cleverly initialized the "blocked_interrupts" field of "struct - * lguest_data" so that timer interrupts were blocked until now. - */ -static void lguest_time_init(void) -{ - /* Set up the timer interrupt (0) to go to our simple timer routine */ - if (lguest_setup_irq(0) != 0) - panic("Could not set up timer irq"); - irq_set_handler(0, lguest_time_irq); - - clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); - - /* We can't set cpumask in the initializer: damn C limitations! Set it - * here and register our timer device. */ - lguest_clockevent.cpumask = cpumask_of(0); - clockevents_register_device(&lguest_clockevent); - - /* Finally, we unblock the timer interrupt. */ - clear_bit(0, lguest_data.blocked_interrupts); -} - -/* - * Miscellaneous bits and pieces. - * - * Here is an oddball collection of functions which the Guest needs for things - * to work. They're pretty simple. - */ - -/* - * The Guest needs to tell the Host what stack it expects traps to use. For - * native hardware, this is part of the Task State Segment mentioned above in - * lguest_load_tr_desc(), but to help hypervisors there's this special call. - * - * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data - * segment), the privilege level (we're privilege level 1, the Host is 0 and - * will not tolerate us trying to use that), the stack pointer, and the number - * of pages in the stack. - */ -static void lguest_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) -{ - lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, - THREAD_SIZE / PAGE_SIZE); - tss->x86_tss.sp0 = thread->sp0; -} - -/* Let's just say, I wouldn't do debugging under a Guest. */ -static unsigned long lguest_get_debugreg(int regno) -{ - /* FIXME: Implement */ - return 0; -} - -static void lguest_set_debugreg(int regno, unsigned long value) -{ - /* FIXME: Implement */ -} - -/* - * There are times when the kernel wants to make sure that no memory writes are - * caught in the cache (that they've all reached real hardware devices). This - * doesn't matter for the Guest which has virtual hardware. - * - * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush - * (clflush) instruction is available and the kernel uses that. Otherwise, it - * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction. - * Unlike clflush, wbinvd can only be run at privilege level 0. So we can - * ignore clflush, but replace wbinvd. - */ -static void lguest_wbinvd(void) -{ -} - -/* - * If the Guest expects to have an Advanced Programmable Interrupt Controller, - * we play dumb by ignoring writes and returning 0 for reads. So it's no - * longer Programmable nor Controlling anything, and I don't think 8 lines of - * code qualifies for Advanced. It will also never interrupt anything. It - * does, however, allow us to get through the Linux boot code. - */ -#ifdef CONFIG_X86_LOCAL_APIC -static void lguest_apic_write(u32 reg, u32 v) -{ -} - -static u32 lguest_apic_read(u32 reg) -{ - return 0; -} - -static u64 lguest_apic_icr_read(void) -{ - return 0; -} - -static void lguest_apic_icr_write(u32 low, u32 id) -{ - /* Warn to see if there's any stray references */ - WARN_ON(1); -} - -static void lguest_apic_wait_icr_idle(void) -{ - return; -} - -static u32 lguest_apic_safe_wait_icr_idle(void) -{ - return 0; -} - -static void set_lguest_basic_apic_ops(void) -{ - apic->read = lguest_apic_read; - apic->write = lguest_apic_write; - apic->icr_read = lguest_apic_icr_read; - apic->icr_write = lguest_apic_icr_write; - apic->wait_icr_idle = lguest_apic_wait_icr_idle; - apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle; -}; -#endif - -/* STOP! Until an interrupt comes in. */ -static void lguest_safe_halt(void) -{ - hcall(LHCALL_HALT, 0, 0, 0, 0); -} - -/* - * The SHUTDOWN hypercall takes a string to describe what's happening, and - * an argument which says whether this to restart (reboot) the Guest or not. - * - * Note that the Host always prefers that the Guest speak in physical addresses - * rather than virtual addresses, so we use __pa() here. - */ -static void lguest_power_off(void) -{ - hcall(LHCALL_SHUTDOWN, __pa("Power down"), - LGUEST_SHUTDOWN_POWEROFF, 0, 0); -} - -/* - * Panicing. - * - * Don't. But if you did, this is what happens. - */ -static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) -{ - hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0); - /* The hcall won't return, but to keep gcc happy, we're "done". */ - return NOTIFY_DONE; -} - -static struct notifier_block paniced = { - .notifier_call = lguest_panic -}; - -/* Setting up memory is fairly easy. */ -static __init char *lguest_memory_setup(void) -{ - /* - * The Linux bootloader header contains an "e820" memory map: the - * Launcher populated the first entry with our memory limit. - */ - e820__range_add(boot_params.e820_table[0].addr, - boot_params.e820_table[0].size, - boot_params.e820_table[0].type); - - /* This string is for the boot messages. */ - return "LGUEST"; -} - -/* Offset within PCI config space of BAR access capability. */ -static int console_cfg_offset = 0; -static int console_access_cap; - -/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */ -static void set_cfg_window(u32 cfg_offset, u32 off) -{ - write_pci_config_byte(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, bar), - 0); - write_pci_config(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, length), - 4); - write_pci_config(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, offset), - off); -} - -static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val) -{ - /* - * We could set this up once, then leave it; nothing else in the * - * kernel should touch these registers. But if it went wrong, that - * would be a horrible bug to find. - */ - set_cfg_window(cfg_offset, off); - write_pci_config(0, 1, 0, - cfg_offset + sizeof(struct virtio_pci_cap), val); -} - -static void probe_pci_console(void) -{ - u8 cap, common_cap = 0, device_cap = 0; - u32 device_len; - - /* Avoid recursive printk into here. */ - console_cfg_offset = -1; - - if (!early_pci_allowed()) { - printk(KERN_ERR "lguest: early PCI access not allowed!\n"); - return; - } - - /* We expect a console PCI device at BUS0, slot 1. */ - if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) { - printk(KERN_ERR "lguest: PCI device is %#x!\n", - read_pci_config(0, 1, 0, 0)); - return; - } - - /* Find the capabilities we need (must be in bar0) */ - cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST); - while (cap) { - u8 vndr = read_pci_config_byte(0, 1, 0, cap); - if (vndr == PCI_CAP_ID_VNDR) { - u8 type, bar; - - type = read_pci_config_byte(0, 1, 0, - cap + offsetof(struct virtio_pci_cap, cfg_type)); - bar = read_pci_config_byte(0, 1, 0, - cap + offsetof(struct virtio_pci_cap, bar)); - - switch (type) { - case VIRTIO_PCI_CAP_DEVICE_CFG: - if (bar == 0) - device_cap = cap; - break; - case VIRTIO_PCI_CAP_PCI_CFG: - console_access_cap = cap; - break; - } - } - cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT); - } - if (!device_cap || !console_access_cap) { - printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n", - common_cap, device_cap, console_access_cap); - return; - } - - /* - * Note that we can't check features, until we've set the DRIVER - * status bit. We don't want to do that until we have a real driver, - * so we just check that the device-specific config has room for - * emerg_wr. If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE - * it should ignore the access. - */ - device_len = read_pci_config(0, 1, 0, - device_cap + offsetof(struct virtio_pci_cap, length)); - if (device_len < (offsetof(struct virtio_console_config, emerg_wr) - + sizeof(u32))) { - printk(KERN_ERR "lguest: console missing emerg_wr field\n"); - return; - } - - console_cfg_offset = read_pci_config(0, 1, 0, - device_cap + offsetof(struct virtio_pci_cap, offset)); - printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n"); -} - -/* - * We will eventually use the virtio console device to produce console output, - * but before that is set up we use the virtio PCI console's backdoor mmio - * access and the "emergency" write facility (which is legal even before the - * device is configured). - */ -static __init int early_put_chars(u32 vtermno, const char *buf, int count) -{ - /* If we couldn't find PCI console, forget it. */ - if (console_cfg_offset < 0) - return count; - - if (unlikely(!console_cfg_offset)) { - probe_pci_console(); - if (console_cfg_offset < 0) - return count; - } - - write_bar_via_cfg(console_access_cap, - console_cfg_offset - + offsetof(struct virtio_console_config, emerg_wr), - buf[0]); - return 1; -} - -/* - * Rebooting also tells the Host we're finished, but the RESTART flag tells the - * Launcher to reboot us. - */ -static void lguest_restart(char *reason) -{ - hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0); -} - -/*G:050 - * Patching (Powerfully Placating Performance Pedants) - * - * We have already seen that pv_ops structures let us replace simple native - * instructions with calls to the appropriate back end all throughout the - * kernel. This allows the same kernel to run as a Guest and as a native - * kernel, but it's slow because of all the indirect branches. - * - * Remember that David Wheeler quote about "Any problem in computer science can - * be solved with another layer of indirection"? The rest of that quote is - * "... But that usually will create another problem." This is the first of - * those problems. - * - * Our current solution is to allow the paravirt back end to optionally patch - * over the indirect calls to replace them with something more efficient. We - * patch two of the simplest of the most commonly called functions: disable - * interrupts and save interrupts. We usually have 6 or 10 bytes to patch - * into: the Guest versions of these operations are small enough that we can - * fit comfortably. - * - * First we need assembly templates of each of the patchable Guest operations, - * and these are in head_32.S. - */ - -/*G:060 We construct a table from the assembler templates: */ -static const struct lguest_insns -{ - const char *start, *end; -} lguest_insns[] = { - [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, - [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, -}; - -/* - * Now our patch routine is fairly simple (based on the native one in - * paravirt.c). If we have a replacement, we copy it in and return how much of - * the available space we used. - */ -static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, - unsigned long addr, unsigned len) -{ - unsigned int insn_len; - - /* Don't do anything special if we don't have a replacement */ - if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) - return paravirt_patch_default(type, clobber, ibuf, addr, len); - - insn_len = lguest_insns[type].end - lguest_insns[type].start; - - /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ - if (len < insn_len) - return paravirt_patch_default(type, clobber, ibuf, addr, len); - - /* Copy in our instructions. */ - memcpy(ibuf, lguest_insns[type].start, insn_len); - return insn_len; -} - -/*G:029 - * Once we get to lguest_init(), we know we're a Guest. The various - * pv_ops structures in the kernel provide points for (almost) every routine we - * have to override to avoid privileged instructions. - */ -__init void lguest_init(void) -{ - /* We're under lguest. */ - pv_info.name = "lguest"; - /* We're running at privilege level 1, not 0 as normal. */ - pv_info.kernel_rpl = 1; - /* Everyone except Xen runs with this set. */ - pv_info.shared_kernel_pmd = 1; - - /* - * We set up all the lguest overrides for sensitive operations. These - * are detailed with the operations themselves. - */ - - /* Interrupt-related operations */ - pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl); - pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); - pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable); - pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); - pv_irq_ops.safe_halt = lguest_safe_halt; - - /* Setup operations */ - pv_init_ops.patch = lguest_patch; - - /* Intercepts of various CPU instructions */ - pv_cpu_ops.load_gdt = lguest_load_gdt; - pv_cpu_ops.cpuid = lguest_cpuid; - pv_cpu_ops.load_idt = lguest_load_idt; - pv_cpu_ops.iret = lguest_iret; - pv_cpu_ops.load_sp0 = lguest_load_sp0; - pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; - pv_cpu_ops.set_ldt = lguest_set_ldt; - pv_cpu_ops.load_tls = lguest_load_tls; - pv_cpu_ops.get_debugreg = lguest_get_debugreg; - pv_cpu_ops.set_debugreg = lguest_set_debugreg; - pv_cpu_ops.read_cr0 = lguest_read_cr0; - pv_cpu_ops.write_cr0 = lguest_write_cr0; - pv_cpu_ops.read_cr4 = lguest_read_cr4; - pv_cpu_ops.write_cr4 = lguest_write_cr4; - pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; - pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; - pv_cpu_ops.wbinvd = lguest_wbinvd; - pv_cpu_ops.start_context_switch = paravirt_start_context_switch; - pv_cpu_ops.end_context_switch = lguest_end_context_switch; - - /* Pagetable management */ - pv_mmu_ops.write_cr3 = lguest_write_cr3; - pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; - pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; - pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; - pv_mmu_ops.set_pte = lguest_set_pte; - pv_mmu_ops.set_pte_at = lguest_set_pte_at; - pv_mmu_ops.set_pmd = lguest_set_pmd; -#ifdef CONFIG_X86_PAE - pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic; - pv_mmu_ops.pte_clear = lguest_pte_clear; - pv_mmu_ops.pmd_clear = lguest_pmd_clear; - pv_mmu_ops.set_pud = lguest_set_pud; -#endif - pv_mmu_ops.read_cr2 = lguest_read_cr2; - pv_mmu_ops.read_cr3 = lguest_read_cr3; - pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; - pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; - pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu; - pv_mmu_ops.pte_update = lguest_pte_update; - -#ifdef CONFIG_X86_LOCAL_APIC - /* APIC read/write intercepts */ - set_lguest_basic_apic_ops(); -#endif - - x86_init.resources.memory_setup = lguest_memory_setup; - x86_init.irqs.intr_init = lguest_init_IRQ; - x86_init.timers.timer_init = lguest_time_init; - x86_platform.calibrate_tsc = lguest_tsc_khz; - x86_platform.get_wallclock = lguest_get_wallclock; - - /* - * Now is a good time to look at the implementations of these functions - * before returning to the rest of lguest_init(). - */ - - /*G:070 - * Now we've seen all the paravirt_ops, we return to - * lguest_init() where the rest of the fairly chaotic boot setup - * occurs. - */ - - /* - * The stack protector is a weird thing where gcc places a canary - * value on the stack and then checks it on return. This file is - * compiled with -fno-stack-protector it, so we got this far without - * problems. The value of the canary is kept at offset 20 from the - * %gs register, so we need to set that up before calling C functions - * in other files. - */ - setup_stack_canary_segment(0); - - /* - * We could just call load_stack_canary_segment(), but we might as well - * call switch_to_new_gdt() which loads the whole table and sets up the - * per-cpu segment descriptor register %fs as well. - */ - switch_to_new_gdt(0); - - /* - * The Host<->Guest Switcher lives at the top of our address space, and - * the Host told us how big it is when we made LGUEST_INIT hypercall: - * it put the answer in lguest_data.reserve_mem - */ - reserve_top_address(lguest_data.reserve_mem); - - /* Hook in our special panic hypercall code. */ - atomic_notifier_chain_register(&panic_notifier_list, &paniced); - - /* - * This is messy CPU setup stuff which the native boot code does before - * start_kernel, so we have to do, too: - */ - cpu_detect(&new_cpu_data); - /* head.S usually sets up the first capability word, so do it here. */ - new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); - - /* Math is always hard! */ - set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); - - /* We don't have features. We have puppies! Puppies! */ -#ifdef CONFIG_X86_MCE - mca_cfg.disabled = true; -#endif -#ifdef CONFIG_ACPI - acpi_disabled = 1; -#endif - - /* - * We set the preferred console to "hvc". This is the "hypervisor - * virtual console" driver written by the PowerPC people, which we also - * adapted for lguest's use. - */ - add_preferred_console("hvc", 0, NULL); - - /* Register our very early console. */ - virtio_cons_early_init(early_put_chars); - - /* Don't let ACPI try to control our PCI interrupts. */ - disable_acpi(); - - /* We control them ourselves, by overriding these two hooks. */ - pcibios_enable_irq = lguest_enable_irq; - pcibios_disable_irq = lguest_disable_irq; - - /* - * Last of all, we set the power management poweroff hook to point to - * the Guest routine to power off, and the reboot hook to our restart - * routine. - */ - pm_power_off = lguest_power_off; - machine_ops.restart = lguest_restart; - - /* - * Now we're set up, call i386_start_kernel() in head32.c and we proceed - * to boot as normal. It never returns. - */ - i386_start_kernel(); -} -/* - * This marks the end of stage II of our journey, The Guest. - * - * It is now time for us to explore the layer of virtual drivers and complete - * our understanding of the Guest in "make Drivers". - */ diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S deleted file mode 100644 index d5ae63f5ec5d..000000000000 --- a/arch/x86/lguest/head_32.S +++ /dev/null @@ -1,192 +0,0 @@ -#include <linux/linkage.h> -#include <linux/lguest.h> -#include <asm/lguest_hcall.h> -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> -#include <asm/processor-flags.h> - -/*G:020 - - * Our story starts with the bzImage: booting starts at startup_32 in - * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real - * kernel in place and then jumps into it: startup_32 in - * arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi - * register, which is created by the bootloader (the Launcher in our case). - * - * The startup_32 function does very little: it clears the uninitialized global - * C variables which we expect to be zero (ie. BSS) and then copies the boot - * header and kernel command line somewhere safe, and populates some initial - * page tables. Finally it checks the 'hardware_subarch' field. This was - * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's - * assigned number), then it calls us here. - * - * WARNING: be very careful here! We're running at addresses equal to physical - * addresses (around 0), not above PAGE_OFFSET as most code expects - * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any - * data without remembering to subtract __PAGE_OFFSET! - * - * The .section line puts this code in .init.text so it will be discarded after - * boot. - */ -.section .init.text, "ax", @progbits -ENTRY(lguest_entry) - /* - * We make the "initialization" hypercall now to tell the Host where - * our lguest_data struct is. - */ - movl $LHCALL_LGUEST_INIT, %eax - movl $lguest_data - __PAGE_OFFSET, %ebx - int $LGUEST_TRAP_ENTRY - - /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */ - movl $LHCALL_NEW_PGTABLE, %eax - movl $(initial_page_table - __PAGE_OFFSET), %ebx - int $LGUEST_TRAP_ENTRY - - /* Set up the initial stack so we can run C code. */ - movl $(init_thread_union+THREAD_SIZE),%esp - - /* Jumps are relative: we're running __PAGE_OFFSET too low. */ - jmp lguest_init+__PAGE_OFFSET - -/*G:055 - * We create a macro which puts the assembler code between lgstart_ and lgend_ - * markers. These templates are put in the .text section: they can't be - * discarded after boot as we may need to patch modules, too. - */ -.text -#define LGUEST_PATCH(name, insns...) \ - lgstart_##name: insns; lgend_##name:; \ - .globl lgstart_##name; .globl lgend_##name - -LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) -LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) - -/*G:033 - * But using those wrappers is inefficient (we'll see why that doesn't matter - * for save_fl and irq_disable later). If we write our routines carefully in - * assembler, we can avoid clobbering any registers and avoid jumping through - * the wrapper functions. - * - * I skipped over our first piece of assembler, but this one is worth studying - * in a bit more detail so I'll describe in easy stages. First, the routine to - * enable interrupts: - */ -ENTRY(lg_irq_enable) - /* - * The reverse of irq_disable, this sets lguest_data.irq_enabled to - * X86_EFLAGS_IF (ie. "Interrupts enabled"). - */ - movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled - /* - * But now we need to check if the Host wants to know: there might have - * been interrupts waiting to be delivered, in which case it will have - * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we - * jump to send_interrupts, otherwise we're done. - */ - cmpl $0, lguest_data+LGUEST_DATA_irq_pending - jnz send_interrupts - /* - * One cool thing about x86 is that you can do many things without using - * a register. In this case, the normal path hasn't needed to save or - * restore any registers at all! - */ - ret -send_interrupts: - /* - * OK, now we need a register: eax is used for the hypercall number, - * which is LHCALL_SEND_INTERRUPTS. - * - * We used not to bother with this pending detection at all, which was - * much simpler. Sooner or later the Host would realize it had to - * send us an interrupt. But that turns out to make performance 7 - * times worse on a simple tcp benchmark. So now we do this the hard - * way. - */ - pushl %eax - movl $LHCALL_SEND_INTERRUPTS, %eax - /* This is the actual hypercall trap. */ - int $LGUEST_TRAP_ENTRY - /* Put eax back the way we found it. */ - popl %eax - ret - -/* - * Finally, the "popf" or "restore flags" routine. The %eax register holds the - * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're - * enabling interrupts again, if it's 0 we're leaving them off. - */ -ENTRY(lg_restore_fl) - /* This is just "lguest_data.irq_enabled = flags;" */ - movl %eax, lguest_data+LGUEST_DATA_irq_enabled - /* - * Now, if the %eax value has enabled interrupts and - * lguest_data.irq_pending is set, we want to tell the Host so it can - * deliver any outstanding interrupts. Fortunately, both values will - * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" - * instruction will AND them together for us. If both are set, we - * jump to send_interrupts. - */ - testl lguest_data+LGUEST_DATA_irq_pending, %eax - jnz send_interrupts - /* Again, the normal path has used no extra registers. Clever, huh? */ - ret -/*:*/ - -/* These demark the EIP where host should never deliver interrupts. */ -.global lguest_noirq_iret - -/*M:004 - * When the Host reflects a trap or injects an interrupt into the Guest, it - * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, - * so the Guest iret logic does the right thing when restoring it. However, - * when the Host sets the Guest up for direct traps, such as system calls, the - * processor is the one to push eflags onto the stack, and the interrupt bit - * will be 1 (in reality, interrupts are always enabled in the Guest). - * - * This turns out to be harmless: the only trap which should happen under Linux - * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc - * regions), which has to be reflected through the Host anyway. If another - * trap *does* go off when interrupts are disabled, the Guest will panic, and - * we'll never get to this iret! -:*/ - -/*G:045 - * There is one final paravirt_op that the Guest implements, and glancing at it - * you can see why I left it to last. It's *cool*! It's in *assembler*! - * - * The "iret" instruction is used to return from an interrupt or trap. The - * stack looks like this: - * old address - * old code segment & privilege level - * old processor flags ("eflags") - * - * The "iret" instruction pops those values off the stack and restores them all - * at once. The only problem is that eflags includes the Interrupt Flag which - * the Guest can't change: the CPU will simply ignore it when we do an "iret". - * So we have to copy eflags from the stack to lguest_data.irq_enabled before - * we do the "iret". - * - * There are two problems with this: firstly, we can't clobber any registers - * and secondly, the whole thing needs to be atomic. The first problem - * is solved by using "push memory"/"pop memory" instruction pair for copying. - * - * The second is harder: copying eflags to lguest_data.irq_enabled will turn - * interrupts on before we're finished, so we could be interrupted before we - * return to userspace or wherever. Our solution to this is to tell the - * Host that it is *never* to interrupt us there, even if interrupts seem to be - * enabled. (It's not necessary to protect pop instruction, since - * data gets updated only after it completes, so we only need to protect - * one instruction, iret). - */ -ENTRY(lguest_iret) - pushl 2*4(%esp) - /* - * Note the %ss: segment prefix here. Normal data accesses use the - * "ds" segment, but that will have already been restored for whatever - * we're returning to (such as userspace): we can't trust it. The %ss: - * prefix makes sure we use the stack segment, which is still valid. - */ - popl %ss:lguest_data+LGUEST_DATA_irq_enabled -lguest_noirq_iret: - iret diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 5cc78bf57232..3261abb21ef4 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size, return 0; /* Buffer overrun */ } +/* + * Find a non-boolean option (i.e. option=argument). In accordance with + * standard Linux practice, if this option is repeated, this returns the + * last instance on the command line. + * + * @cmdline: the cmdline string + * @max_cmdline_size: the maximum size of cmdline + * @option: option string to look for + * @buffer: memory buffer to return the option argument + * @bufsize: size of the supplied memory buffer + * + * Returns the length of the argument (regardless of if it was + * truncated to fit in the buffer), or -1 on not found. + */ +static int +__cmdline_find_option(const char *cmdline, int max_cmdline_size, + const char *option, char *buffer, int bufsize) +{ + char c; + int pos = 0, len = -1; + const char *opptr = NULL; + char *bufptr = buffer; + enum { + st_wordstart = 0, /* Start of word/after whitespace */ + st_wordcmp, /* Comparing this word */ + st_wordskip, /* Miscompare, skip */ + st_bufcpy, /* Copying this to buffer */ + } state = st_wordstart; + + if (!cmdline) + return -1; /* No command line */ + + /* + * This 'pos' check ensures we do not overrun + * a non-NULL-terminated 'cmdline' + */ + while (pos++ < max_cmdline_size) { + c = *(char *)cmdline++; + if (!c) + break; + + switch (state) { + case st_wordstart: + if (myisspace(c)) + break; + + state = st_wordcmp; + opptr = option; + /* fall through */ + + case st_wordcmp: + if ((c == '=') && !*opptr) { + /* + * We matched all the way to the end of the + * option we were looking for, prepare to + * copy the argument. + */ + len = 0; + bufptr = buffer; + state = st_bufcpy; + break; + } else if (c == *opptr++) { + /* + * We are currently matching, so continue + * to the next character on the cmdline. + */ + break; + } + state = st_wordskip; + /* fall through */ + + case st_wordskip: + if (myisspace(c)) + state = st_wordstart; + break; + + case st_bufcpy: + if (myisspace(c)) { + state = st_wordstart; + } else { + /* + * Increment len, but don't overrun the + * supplied buffer and leave room for the + * NULL terminator. + */ + if (++len < bufsize) + *bufptr++ = c; + } + break; + } + } + + if (bufsize) + *bufptr = '\0'; + + return len; +} + int cmdline_find_option_bool(const char *cmdline, const char *option) { return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); } + +int cmdline_find_option(const char *cmdline, const char *option, char *buffer, + int bufsize) +{ + return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, + buffer, bufsize); +} diff --git a/arch/x86/math-emu/div_Xsig.S b/arch/x86/math-emu/div_Xsig.S index f77ba3058b31..066996dba6a2 100644 --- a/arch/x86/math-emu/div_Xsig.S +++ b/arch/x86/math-emu/div_Xsig.S @@ -363,3 +363,4 @@ L_bugged_2: pop %ebx jmp L_exit #endif /* PARANOID */ +ENDPROC(div_Xsig) diff --git a/arch/x86/math-emu/div_small.S b/arch/x86/math-emu/div_small.S index 47099628fa4c..2c71527bd917 100644 --- a/arch/x86/math-emu/div_small.S +++ b/arch/x86/math-emu/div_small.S @@ -44,4 +44,4 @@ ENTRY(FPU_div_small) leave ret - +ENDPROC(FPU_div_small) diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 0203baefb5c0..d4a7df2205b8 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -147,7 +147,7 @@ void math_emulate(struct math_emu_info *info) } code_descriptor = FPU_get_ldt_descriptor(FPU_CS); - if (SEG_D_SIZE(code_descriptor)) { + if (code_descriptor.d) { /* The above test may be wrong, the book is not clear */ /* Segmented 32 bit protected mode */ addr_modes.default_mode = SEG32; @@ -155,11 +155,10 @@ void math_emulate(struct math_emu_info *info) /* 16 bit protected mode */ addr_modes.default_mode = PM16; } - FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor); - code_limit = code_base - + (SEG_LIMIT(code_descriptor) + - 1) * SEG_GRANULARITY(code_descriptor) - - 1; + FPU_EIP += code_base = seg_get_base(&code_descriptor); + code_limit = seg_get_limit(&code_descriptor) + 1; + code_limit *= seg_get_granularity(&code_descriptor); + code_limit += code_base - 1; if (code_limit < code_base) code_limit = 0xffffffff; } diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index a179254a5122..699f329f1d40 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -34,17 +34,43 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg) return ret; } -#define SEG_D_SIZE(x) ((x).b & (3 << 21)) -#define SEG_G_BIT(x) ((x).b & (1 << 23)) -#define SEG_GRANULARITY(x) (((x).b & (1 << 23)) ? 4096 : 1) -#define SEG_286_MODE(x) ((x).b & ( 0xff000000 | 0xf0000 | (1 << 23))) -#define SEG_BASE_ADDR(s) (((s).b & 0xff000000) \ - | (((s).b & 0xff) << 16) | ((s).a >> 16)) -#define SEG_LIMIT(s) (((s).b & 0xff0000) | ((s).a & 0xffff)) -#define SEG_EXECUTE_ONLY(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 11)) -#define SEG_WRITE_PERM(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 9)) -#define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \ - == (1 << 10)) +#define SEG_TYPE_WRITABLE (1U << 1) +#define SEG_TYPE_EXPANDS_DOWN (1U << 2) +#define SEG_TYPE_EXECUTE (1U << 3) +#define SEG_TYPE_EXPAND_MASK (SEG_TYPE_EXPANDS_DOWN | SEG_TYPE_EXECUTE) +#define SEG_TYPE_EXECUTE_MASK (SEG_TYPE_WRITABLE | SEG_TYPE_EXECUTE) + +static inline unsigned long seg_get_base(struct desc_struct *d) +{ + unsigned long base = (unsigned long)d->base2 << 24; + + return base | ((unsigned long)d->base1 << 16) | d->base0; +} + +static inline unsigned long seg_get_limit(struct desc_struct *d) +{ + return ((unsigned long)d->limit1 << 16) | d->limit0; +} + +static inline unsigned long seg_get_granularity(struct desc_struct *d) +{ + return d->g ? 4096 : 1; +} + +static inline bool seg_expands_down(struct desc_struct *d) +{ + return (d->type & SEG_TYPE_EXPAND_MASK) == SEG_TYPE_EXPANDS_DOWN; +} + +static inline bool seg_execute_only(struct desc_struct *d) +{ + return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_EXECUTE; +} + +static inline bool seg_writable(struct desc_struct *d) +{ + return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE; +} #define I387 (¤t->thread.fpu.state) #define FPU_info (I387->soft.info) diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index b8ef9f9d2ffc..c48967c6a0e2 100644 --- a/arch/x86/math-emu/get_address.c +++ b/arch/x86/math-emu/get_address.c @@ -159,17 +159,18 @@ static long pm_address(u_char FPU_modrm, u_char segment, } descriptor = FPU_get_ldt_descriptor(addr->selector); - base_address = SEG_BASE_ADDR(descriptor); + base_address = seg_get_base(&descriptor); address = base_address + offset; - limit = base_address - + (SEG_LIMIT(descriptor) + 1) * SEG_GRANULARITY(descriptor) - 1; + limit = seg_get_limit(&descriptor) + 1; + limit *= seg_get_granularity(&descriptor); + limit += base_address - 1; if (limit < base_address) limit = 0xffffffff; - if (SEG_EXPAND_DOWN(descriptor)) { - if (SEG_G_BIT(descriptor)) + if (seg_expands_down(&descriptor)) { + if (descriptor.g) { seg_top = 0xffffffff; - else { + } else { seg_top = base_address + (1 << 20); if (seg_top < base_address) seg_top = 0xffffffff; @@ -182,8 +183,8 @@ static long pm_address(u_char FPU_modrm, u_char segment, (address > limit) || (address < base_address) ? 0 : ((limit - address) >= 254 ? 255 : limit - address + 1); } - if (SEG_EXECUTE_ONLY(descriptor) || - (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT))) { + if (seg_execute_only(&descriptor) || + (!seg_writable(&descriptor) && (FPU_modrm & FPU_WRITE_BIT))) { access_limit = 0; } return address; diff --git a/arch/x86/math-emu/mul_Xsig.S b/arch/x86/math-emu/mul_Xsig.S index 717785a53eb4..22e0631bb85a 100644 --- a/arch/x86/math-emu/mul_Xsig.S +++ b/arch/x86/math-emu/mul_Xsig.S @@ -62,6 +62,7 @@ ENTRY(mul32_Xsig) popl %esi leave ret +ENDPROC(mul32_Xsig) ENTRY(mul64_Xsig) @@ -114,6 +115,7 @@ ENTRY(mul64_Xsig) popl %esi leave ret +ENDPROC(mul64_Xsig) @@ -173,4 +175,4 @@ ENTRY(mul_Xsig_Xsig) popl %esi leave ret - +ENDPROC(mul_Xsig_Xsig) diff --git a/arch/x86/math-emu/polynom_Xsig.S b/arch/x86/math-emu/polynom_Xsig.S index 17315c89ff3d..a9aaf414135d 100644 --- a/arch/x86/math-emu/polynom_Xsig.S +++ b/arch/x86/math-emu/polynom_Xsig.S @@ -133,3 +133,4 @@ L_accum_done: popl %esi leave ret +ENDPROC(polynomial_Xsig) diff --git a/arch/x86/math-emu/reg_norm.S b/arch/x86/math-emu/reg_norm.S index 8b6352efceef..53ac1a343c69 100644 --- a/arch/x86/math-emu/reg_norm.S +++ b/arch/x86/math-emu/reg_norm.S @@ -94,6 +94,7 @@ L_overflow: call arith_overflow pop %ebx jmp L_exit +ENDPROC(FPU_normalize) @@ -145,3 +146,4 @@ L_exit_nuo_zero: popl %ebx leave ret +ENDPROC(FPU_normalize_nuo) diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S index d1d4e48b4f67..41af5b208d88 100644 --- a/arch/x86/math-emu/reg_round.S +++ b/arch/x86/math-emu/reg_round.S @@ -706,3 +706,5 @@ L_exception_exit: mov $-1,%eax jmp fpu_reg_round_special_exit #endif /* PARANOID */ + +ENDPROC(FPU_round) diff --git a/arch/x86/math-emu/reg_u_add.S b/arch/x86/math-emu/reg_u_add.S index 47c4c2434d85..3b1bc5e9b2f6 100644 --- a/arch/x86/math-emu/reg_u_add.S +++ b/arch/x86/math-emu/reg_u_add.S @@ -165,3 +165,4 @@ L_exit: leave ret #endif /* PARANOID */ +ENDPROC(FPU_u_add) diff --git a/arch/x86/math-emu/reg_u_div.S b/arch/x86/math-emu/reg_u_div.S index cc00654b6f9a..796eb5ab921b 100644 --- a/arch/x86/math-emu/reg_u_div.S +++ b/arch/x86/math-emu/reg_u_div.S @@ -469,3 +469,5 @@ L_exit: leave ret #endif /* PARANOID */ + +ENDPROC(FPU_u_div) diff --git a/arch/x86/math-emu/reg_u_mul.S b/arch/x86/math-emu/reg_u_mul.S index 973f12af97df..6196f68cf3c1 100644 --- a/arch/x86/math-emu/reg_u_mul.S +++ b/arch/x86/math-emu/reg_u_mul.S @@ -146,3 +146,4 @@ L_exit: ret #endif /* PARANOID */ +ENDPROC(FPU_u_mul) diff --git a/arch/x86/math-emu/reg_u_sub.S b/arch/x86/math-emu/reg_u_sub.S index 1b6c24801d22..d115b900919a 100644 --- a/arch/x86/math-emu/reg_u_sub.S +++ b/arch/x86/math-emu/reg_u_sub.S @@ -270,3 +270,4 @@ L_exit: popl %esi leave ret +ENDPROC(FPU_u_sub) diff --git a/arch/x86/math-emu/round_Xsig.S b/arch/x86/math-emu/round_Xsig.S index bbe0e87718e4..87c99749a495 100644 --- a/arch/x86/math-emu/round_Xsig.S +++ b/arch/x86/math-emu/round_Xsig.S @@ -78,7 +78,7 @@ L_exit: popl %ebx leave ret - +ENDPROC(round_Xsig) @@ -138,4 +138,4 @@ L_n_exit: popl %ebx leave ret - +ENDPROC(norm_Xsig) diff --git a/arch/x86/math-emu/shr_Xsig.S b/arch/x86/math-emu/shr_Xsig.S index 31cdd118e918..c8552edeec75 100644 --- a/arch/x86/math-emu/shr_Xsig.S +++ b/arch/x86/math-emu/shr_Xsig.S @@ -85,3 +85,4 @@ L_more_than_95: popl %esi leave ret +ENDPROC(shr_Xsig) diff --git a/arch/x86/math-emu/wm_shrx.S b/arch/x86/math-emu/wm_shrx.S index 518428317985..340dd6897f85 100644 --- a/arch/x86/math-emu/wm_shrx.S +++ b/arch/x86/math-emu/wm_shrx.S @@ -92,6 +92,7 @@ L_more_than_95: popl %esi leave ret +ENDPROC(FPU_shrx) /*---------------------------------------------------------------------------+ @@ -202,3 +203,4 @@ Ls_more_than_95: popl %esi leave ret +ENDPROC(FPU_shrxs) diff --git a/arch/x86/math-emu/wm_sqrt.S b/arch/x86/math-emu/wm_sqrt.S index d258f59564e1..695afae38fdf 100644 --- a/arch/x86/math-emu/wm_sqrt.S +++ b/arch/x86/math-emu/wm_sqrt.S @@ -468,3 +468,4 @@ sqrt_more_prec_large: /* Our estimate is too large */ movl $0x7fffff00,%eax jmp sqrt_round_result +ENDPROC(wm_sqrt) diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 0fbdcb64f9f8..72bf8c01c6e3 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -39,3 +39,5 @@ obj-$(CONFIG_X86_INTEL_MPX) += mpx.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 0470826d2bdc..5e3ac6fe6c9e 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -13,12 +13,12 @@ */ #include <linux/debugfs.h> +#include <linux/kasan.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/seq_file.h> -#include <asm/kasan.h> #include <asm/pgtable.h> /* @@ -138,7 +138,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) { pgprotval_t pr = pgprot_val(prot); static const char * const level_name[] = - { "cr3", "pgd", "pud", "pmd", "pte" }; + { "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; if (!pgprot_val(prot)) { /* Not present */ @@ -162,12 +162,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) pt_dump_cont_printf(m, dmsg, " "); /* Bit 7 has a different meaning on level 3 vs 4 */ - if (level <= 3 && pr & _PAGE_PSE) + if (level <= 4 && pr & _PAGE_PSE) pt_dump_cont_printf(m, dmsg, "PSE "); else pt_dump_cont_printf(m, dmsg, " "); - if ((level == 4 && pr & _PAGE_PAT) || - ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) + if ((level == 5 && pr & _PAGE_PAT) || + ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE)) pt_dump_cont_printf(m, dmsg, "PAT "); else pt_dump_cont_printf(m, dmsg, " "); @@ -188,11 +188,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) */ static unsigned long normalize_addr(unsigned long u) { -#ifdef CONFIG_X86_64 - return (signed long)(u << 16) >> 16; -#else - return u; -#endif + int shift; + if (!IS_ENABLED(CONFIG_X86_64)) + return u; + + shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); + return (signed long)(u << shift) >> shift; } /* @@ -297,32 +298,62 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, for (i = 0; i < PTRS_PER_PTE; i++) { prot = pte_flags(*start); st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); - note_page(m, st, __pgprot(prot), 4); + note_page(m, st, __pgprot(prot), 5); start++; } } +#ifdef CONFIG_KASAN + +/* + * This is an optimization for KASAN=y case. Since all kasan page tables + * eventually point to the kasan_zero_page we could call note_page() + * right away without walking through lower level page tables. This saves + * us dozens of seconds (minutes for 5-level config) while checking for + * W+X mapping or reading kernel_page_tables debugfs file. + */ +static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, + void *pt) +{ + if (__pa(pt) == __pa(kasan_zero_pmd) || +#ifdef CONFIG_X86_5LEVEL + __pa(pt) == __pa(kasan_zero_p4d) || +#endif + __pa(pt) == __pa(kasan_zero_pud)) { + pgprotval_t prot = pte_flags(kasan_zero_pte[0]); + note_page(m, st, __pgprot(prot), 5); + return true; + } + return false; +} +#else +static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, + void *pt) +{ + return false; +} +#endif #if PTRS_PER_PMD > 1 static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) { int i; - pmd_t *start; + pmd_t *start, *pmd_start; pgprotval_t prot; - start = (pmd_t *)pud_page_vaddr(addr); + pmd_start = start = (pmd_t *)pud_page_vaddr(addr); for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { if (pmd_large(*start) || !pmd_present(*start)) { prot = pmd_flags(*start); - note_page(m, st, __pgprot(prot), 3); - } else { + note_page(m, st, __pgprot(prot), 4); + } else if (!kasan_page_table(m, st, pmd_start)) { walk_pte_level(m, st, *start, P + i * PMD_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 3); + note_page(m, st, __pgprot(0), 4); start++; } } @@ -335,39 +366,27 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, #if PTRS_PER_PUD > 1 -/* - * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y - * KASAN fills page tables with the same values. Since there is no - * point in checking page table more than once we just skip repeated - * entries. This saves us dozens of seconds during boot. - */ -static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx) -{ - return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud)); -} - static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P) { int i; - pud_t *start; + pud_t *start, *pud_start; pgprotval_t prot; pud_t *prev_pud = NULL; - start = (pud_t *)p4d_page_vaddr(addr); + pud_start = start = (pud_t *)p4d_page_vaddr(addr); for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); - if (!pud_none(*start) && - !pud_already_checked(prev_pud, start, st->check_wx)) { + if (!pud_none(*start)) { if (pud_large(*start) || !pud_present(*start)) { prot = pud_flags(*start); - note_page(m, st, __pgprot(prot), 2); - } else { + note_page(m, st, __pgprot(prot), 3); + } else if (!kasan_page_table(m, st, pud_start)) { walk_pmd_level(m, st, *start, P + i * PUD_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 2); + note_page(m, st, __pgprot(0), 3); prev_pud = start; start++; @@ -385,10 +404,10 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) { int i; - p4d_t *start; + p4d_t *start, *p4d_start; pgprotval_t prot; - start = (p4d_t *)pgd_page_vaddr(addr); + p4d_start = start = (p4d_t *)pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_P4D; i++) { st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); @@ -396,7 +415,7 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, if (p4d_large(*start) || !p4d_present(*start)) { prot = p4d_flags(*start); note_page(m, st, __pgprot(prot), 2); - } else { + } else if (!kasan_page_table(m, st, p4d_start)) { walk_pud_level(m, st, *start, P + i * P4D_LEVEL_MULT); } diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 0ea8afcb929c..c076f710de4c 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -36,6 +36,48 @@ bool ex_handler_fault(const struct exception_table_entry *fixup, } EXPORT_SYMBOL_GPL(ex_handler_fault); +/* + * Handler for UD0 exception following a failed test against the + * result of a refcount inc/dec/add/sub. + */ +bool ex_handler_refcount(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + /* First unconditionally saturate the refcount. */ + *(int *)regs->cx = INT_MIN / 2; + + /* + * Strictly speaking, this reports the fixup destination, not + * the fault location, and not the actually overflowing + * instruction, which is the instruction before the "js", but + * since that instruction could be a variety of lengths, just + * report the location after the overflow, which should be close + * enough for finding the overflow, as it's at least back in + * the function, having returned from .text.unlikely. + */ + regs->ip = ex_fixup_addr(fixup); + + /* + * This function has been called because either a negative refcount + * value was seen by any of the refcount functions, or a zero + * refcount value was seen by refcount_dec(). + * + * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result + * wrapped around) will be set. Additionally, seeing the refcount + * reach 0 will set ZF (Zero Flag: result was zero). In each of + * these cases we want a report, since it's a boundary condition. + * + */ + if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) { + bool zero = regs->flags & X86_EFLAGS_ZF; + + refcount_error_report(regs, zero ? "hit zero" : "overflow"); + } + + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_refcount); + bool ex_handler_ext(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { @@ -142,7 +184,7 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) * undefined. I'm not sure which CPUs do this, but at least * the 486 DX works this way. */ - if ((regs->cs & 0xFFFF) != __KERNEL_CS) + if (regs->cs != __KERNEL_CS) goto fail; /* diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2a1fa10c6a98..b836a7274e12 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -396,14 +396,18 @@ static void dump_pagetable(unsigned long address) pte_t *pte; #ifdef CONFIG_X86_PAE - printk("*pdpt = %016Lx ", pgd_val(*pgd)); + pr_info("*pdpt = %016Lx ", pgd_val(*pgd)); if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; +#define pr_pde pr_cont +#else +#define pr_pde pr_info #endif p4d = p4d_offset(pgd, address); pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); - printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); + pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); +#undef pr_pde /* * We must not directly access the pte in the highpte @@ -415,9 +419,9 @@ static void dump_pagetable(unsigned long address) goto out; pte = pte_offset_kernel(pmd, address); - printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); + pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); out: - printk("\n"); + pr_cont("\n"); } #else /* CONFIG_X86_64: */ @@ -565,7 +569,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(pgd)) goto bad; - printk("PGD %lx ", pgd_val(*pgd)); + pr_info("PGD %lx ", pgd_val(*pgd)); if (!pgd_present(*pgd)) goto out; @@ -574,7 +578,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(p4d)) goto bad; - printk("P4D %lx ", p4d_val(*p4d)); + pr_cont("P4D %lx ", p4d_val(*p4d)); if (!p4d_present(*p4d) || p4d_large(*p4d)) goto out; @@ -582,7 +586,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(pud)) goto bad; - printk("PUD %lx ", pud_val(*pud)); + pr_cont("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_large(*pud)) goto out; @@ -590,7 +594,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(pmd)) goto bad; - printk("PMD %lx ", pmd_val(*pmd)); + pr_cont("PMD %lx ", pmd_val(*pmd)); if (!pmd_present(*pmd) || pmd_large(*pmd)) goto out; @@ -598,12 +602,12 @@ static void dump_pagetable(unsigned long address) if (bad_address(pte)) goto bad; - printk("PTE %lx", pte_val(*pte)); + pr_cont("PTE %lx", pte_val(*pte)); out: - printk("\n"); + pr_cont("\n"); return; bad: - printk("BAD\n"); + pr_info("BAD\n"); } #endif /* CONFIG_X86_64 */ @@ -1254,10 +1258,6 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. - * - * This function must have noinline because both callers - * {,trace_}do_page_fault() have notrace on. Having this an actual function - * guarantees there's a function trace entry. */ static noinline void __do_page_fault(struct pt_regs *regs, unsigned long error_code, @@ -1490,27 +1490,6 @@ good_area: } NOKPROBE_SYMBOL(__do_page_fault); -dotraplinkage void notrace -do_page_fault(struct pt_regs *regs, unsigned long error_code) -{ - unsigned long address = read_cr2(); /* Get the faulting address */ - enum ctx_state prev_state; - - /* - * We must have this function tagged with __kprobes, notrace and call - * read_cr2() before calling anything else. To avoid calling any kind - * of tracing machinery before we've observed the CR2 value. - * - * exception_{enter,exit}() contain all sorts of tracepoints. - */ - - prev_state = exception_enter(); - __do_page_fault(regs, error_code, address); - exception_exit(prev_state); -} -NOKPROBE_SYMBOL(do_page_fault); - -#ifdef CONFIG_TRACING static nokprobe_inline void trace_page_fault_entries(unsigned long address, struct pt_regs *regs, unsigned long error_code) @@ -1521,22 +1500,24 @@ trace_page_fault_entries(unsigned long address, struct pt_regs *regs, trace_page_fault_kernel(address, regs, error_code); } +/* + * We must have this function blacklisted from kprobes, tagged with notrace + * and call read_cr2() before calling anything else. To avoid calling any + * kind of tracing machinery before we've observed the CR2 value. + * + * exception_{enter,exit}() contains all sorts of tracepoints. + */ dotraplinkage void notrace -trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) +do_page_fault(struct pt_regs *regs, unsigned long error_code) { - /* - * The exception_enter and tracepoint processing could - * trigger another page faults (user space callchain - * reading) and destroy the original cr2 value, so read - * the faulting address now. - */ - unsigned long address = read_cr2(); + unsigned long address = read_cr2(); /* Get the faulting address */ enum ctx_state prev_state; prev_state = exception_enter(); - trace_page_fault_entries(address, regs, error_code); + if (trace_pagefault_enabled()) + trace_page_fault_entries(address, regs, error_code); + __do_page_fault(regs, error_code, address); exception_exit(prev_state); } -NOKPROBE_SYMBOL(trace_do_page_fault); -#endif /* CONFIG_TRACING */ +NOKPROBE_SYMBOL(do_page_fault); diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 2824607df108..6d06cf33e3de 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -18,6 +18,7 @@ #include <asm/tlbflush.h> #include <asm/pgalloc.h> #include <asm/elf.h> +#include <asm/mpx.h> #if 0 /* This is just for testing */ struct page * @@ -85,25 +86,38 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, info.flags = 0; info.length = len; info.low_limit = get_mmap_base(1); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + */ info.high_limit = in_compat_syscall() ? - tasksize_32bit() : tasksize_64bit(); + task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW); + info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); } static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, - unsigned long addr0, unsigned long len, + unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct vm_unmapped_area_info info; - unsigned long addr; info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = get_mmap_base(0); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + */ + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); @@ -118,7 +132,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE; + info.high_limit = TASK_SIZE_LOW; addr = vm_unmapped_area(&info); } @@ -135,6 +149,11 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (len & ~huge_page_mask(h)) return -EINVAL; + + addr = mpx_unmapped_area_check(addr, len, flags); + if (IS_ERR_VALUE(addr)) + return addr; + if (len > TASK_SIZE) return -ENOMEM; diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index adab1595f4bd..31cea988fa36 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -51,7 +51,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, if (!pmd) return -ENOMEM; ident_pmd_init(info, pmd, addr, next); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag)); } return 0; @@ -79,7 +79,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, if (!pud) return -ENOMEM; ident_pud_init(info, pud, addr, next); - set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag)); } return 0; @@ -93,6 +93,10 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, unsigned long next; int result; + /* Set the default pagetable flags if not supplied */ + if (!info->kernpg_flag) + info->kernpg_flag = _KERNPG_TABLE; + for (; addr < end; addr = next) { pgd_t *pgd = pgd_page + pgd_index(addr); p4d_t *p4d; @@ -116,14 +120,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, if (result) return result; if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag)); } else { /* * With p4d folded, pgd is equal to p4d. * The pgd entry has to point to the pud page table in this case. */ pud_t *pud = pud_offset(p4d, 0); - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag)); } } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bf3f1065d6ad..7777ccc0e9f9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -815,7 +815,7 @@ void __init zone_sizes_init(void) DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { .loaded_mm = &init_mm, - .state = 0, + .next_asid = 1, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; EXPORT_SYMBOL_GPL(cpu_tlbstate); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 4c1b5fd0c7ad..34f0e1847dd6 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -13,6 +13,8 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mmiotrace.h> +#include <linux/mem_encrypt.h> +#include <linux/efi.h> #include <asm/set_memory.h> #include <asm/e820/api.h> @@ -21,6 +23,7 @@ #include <asm/tlbflush.h> #include <asm/pgalloc.h> #include <asm/pat.h> +#include <asm/setup.h> #include "physaddr.h" @@ -106,12 +109,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, } /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (is_ISA_range(phys_addr, last_addr)) - return (__force void __iomem *)phys_to_virt(phys_addr); - - /* * Don't allow anybody to remap normal RAM that we're using.. */ pfn = phys_addr >> PAGE_SHIFT; @@ -340,13 +337,17 @@ void iounmap(volatile void __iomem *addr) return; /* - * __ioremap special-cases the PCI/ISA range by not instantiating a - * vm_area and by simply returning an address into the kernel mapping - * of ISA space. So handle that here. + * The PCI/ISA range special-casing was removed from __ioremap() + * so this check, in theory, can be removed. However, there are + * cases where iounmap() is called for addresses not obtained via + * ioremap() (vga16fb for example). Add a warning so that these + * cases can be caught and fixed. */ if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) && - (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) + (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) { + WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n"); return; + } addr = (volatile void __iomem *) (PAGE_MASK & (unsigned long __force)addr); @@ -399,12 +400,10 @@ void *xlate_dev_mem_ptr(phys_addr_t phys) unsigned long offset = phys & ~PAGE_MASK; void *vaddr; - /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ - if (page_is_ram(start >> PAGE_SHIFT)) - return __va(phys); + /* memremap() maps if RAM, otherwise falls back to ioremap() */ + vaddr = memremap(start, PAGE_SIZE, MEMREMAP_WB); - vaddr = ioremap_cache(start, PAGE_SIZE); - /* Only add the offset on success and return NULL if the ioremap() failed: */ + /* Only add the offset on success and return NULL if memremap() failed */ if (vaddr) vaddr += offset; @@ -413,11 +412,263 @@ void *xlate_dev_mem_ptr(phys_addr_t phys) void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) { - if (page_is_ram(phys >> PAGE_SHIFT)) - return; + memunmap((void *)((unsigned long)addr & PAGE_MASK)); +} + +/* + * Examine the physical address to determine if it is an area of memory + * that should be mapped decrypted. If the memory is not part of the + * kernel usable area it was accessed and created decrypted, so these + * areas should be mapped decrypted. And since the encryption key can + * change across reboots, persistent memory should also be mapped + * decrypted. + */ +static bool memremap_should_map_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + int is_pmem; + + /* + * Check if the address is part of a persistent memory region. + * This check covers areas added by E820, EFI and ACPI. + */ + is_pmem = region_intersects(phys_addr, size, IORESOURCE_MEM, + IORES_DESC_PERSISTENT_MEMORY); + if (is_pmem != REGION_DISJOINT) + return true; + + /* + * Check if the non-volatile attribute is set for an EFI + * reserved area. + */ + if (efi_enabled(EFI_BOOT)) { + switch (efi_mem_type(phys_addr)) { + case EFI_RESERVED_TYPE: + if (efi_mem_attributes(phys_addr) & EFI_MEMORY_NV) + return true; + break; + default: + break; + } + } + + /* Check if the address is outside kernel usable area */ + switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) { + case E820_TYPE_RESERVED: + case E820_TYPE_ACPI: + case E820_TYPE_NVS: + case E820_TYPE_UNUSABLE: + case E820_TYPE_PRAM: + return true; + default: + break; + } + + return false; +} + +/* + * Examine the physical address to determine if it is EFI data. Check + * it against the boot params structure and EFI tables and memory types. + */ +static bool memremap_is_efi_data(resource_size_t phys_addr, + unsigned long size) +{ + u64 paddr; + + /* Check if the address is part of EFI boot/runtime data */ + if (!efi_enabled(EFI_BOOT)) + return false; + + paddr = boot_params.efi_info.efi_memmap_hi; + paddr <<= 32; + paddr |= boot_params.efi_info.efi_memmap; + if (phys_addr == paddr) + return true; + + paddr = boot_params.efi_info.efi_systab_hi; + paddr <<= 32; + paddr |= boot_params.efi_info.efi_systab; + if (phys_addr == paddr) + return true; + + if (efi_is_table_address(phys_addr)) + return true; + + switch (efi_mem_type(phys_addr)) { + case EFI_BOOT_SERVICES_DATA: + case EFI_RUNTIME_SERVICES_DATA: + return true; + default: + break; + } + + return false; +} + +/* + * Examine the physical address to determine if it is boot data by checking + * it against the boot params setup_data chain. + */ +static bool memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + struct setup_data *data; + u64 paddr, paddr_next; + + paddr = boot_params.hdr.setup_data; + while (paddr) { + unsigned int len; + + if (phys_addr == paddr) + return true; + + data = memremap(paddr, sizeof(*data), + MEMREMAP_WB | MEMREMAP_DEC); + + paddr_next = data->next; + len = data->len; + + memunmap(data); + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) + return true; + + paddr = paddr_next; + } + + return false; +} + +/* + * Examine the physical address to determine if it is boot data by checking + * it against the boot params setup_data chain (early boot version). + */ +static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + struct setup_data *data; + u64 paddr, paddr_next; + + paddr = boot_params.hdr.setup_data; + while (paddr) { + unsigned int len; + + if (phys_addr == paddr) + return true; + + data = early_memremap_decrypted(paddr, sizeof(*data)); + + paddr_next = data->next; + len = data->len; + + early_memunmap(data, sizeof(*data)); + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) + return true; + + paddr = paddr_next; + } + + return false; +} + +/* + * Architecture function to determine if RAM remap is allowed. By default, a + * RAM remap will map the data as encrypted. Determine if a RAM remap should + * not be done so that the data will be mapped decrypted. + */ +bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, + unsigned long flags) +{ + if (!sme_active()) + return true; + + if (flags & MEMREMAP_ENC) + return true; + + if (flags & MEMREMAP_DEC) + return false; + + if (memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size) || + memremap_should_map_decrypted(phys_addr, size)) + return false; + + return true; +} + +/* + * Architecture override of __weak function to adjust the protection attributes + * used when remapping memory. By default, early_memremap() will map the data + * as encrypted. Determine if an encrypted mapping should not be done and set + * the appropriate protection attributes. + */ +pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, + unsigned long size, + pgprot_t prot) +{ + if (!sme_active()) + return prot; + + if (early_memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size) || + memremap_should_map_decrypted(phys_addr, size)) + prot = pgprot_decrypted(prot); + else + prot = pgprot_encrypted(prot); + + return prot; +} + +bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size) +{ + return arch_memremap_can_ram_remap(phys_addr, size, 0); +} + +#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT +/* Remap memory with encryption */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC); +} + +/* + * Remap memory with encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; + + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP); +} + +/* Remap memory without encryption */ +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC); +} + +/* + * Remap memory without encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; - iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP); } +#endif /* CONFIG_ARCH_USE_MEMREMAP_PROT */ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 02c9d7553409..bc84b73684b7 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -11,8 +11,8 @@ #include <asm/e820/types.h> #include <asm/tlbflush.h> #include <asm/sections.h> +#include <asm/pgtable.h> -extern pgd_t early_top_pgt[PTRS_PER_PGD]; extern struct range pfn_mapped[E820_MAX_ENTRIES]; static int __init map_range(struct range *range) @@ -87,7 +87,7 @@ static struct notifier_block kasan_die_notifier = { void __init kasan_early_init(void) { int i; - pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; + pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC; pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; @@ -153,7 +153,7 @@ void __init kasan_init(void) */ memset(kasan_zero_page, 0, PAGE_SIZE); for (i = 0; i < PTRS_PER_PTE; i++) { - pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC); set_pte(&kasan_zero_pte[i], pte); } /* Flush TLBs again to be sure that write protection applied. */ diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c new file mode 100644 index 000000000000..0fbd09269757 --- /dev/null +++ b/arch/x86/mm/mem_encrypt.c @@ -0,0 +1,593 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/dma-mapping.h> +#include <linux/swiotlb.h> +#include <linux/mem_encrypt.h> + +#include <asm/tlbflush.h> +#include <asm/fixmap.h> +#include <asm/setup.h> +#include <asm/bootparam.h> +#include <asm/set_memory.h> +#include <asm/cacheflush.h> +#include <asm/sections.h> +#include <asm/processor-flags.h> +#include <asm/msr.h> +#include <asm/cmdline.h> + +static char sme_cmdline_arg[] __initdata = "mem_encrypt"; +static char sme_cmdline_on[] __initdata = "on"; +static char sme_cmdline_off[] __initdata = "off"; + +/* + * Since SME related variables are set early in the boot process they must + * reside in the .data section so as not to be zeroed out when the .bss + * section is later cleared. + */ +unsigned long sme_me_mask __section(.data) = 0; +EXPORT_SYMBOL_GPL(sme_me_mask); + +/* Buffer used for early in-place encryption by BSP, no locking needed */ +static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); + +/* + * This routine does not change the underlying encryption setting of the + * page(s) that map this memory. It assumes that eventually the memory is + * meant to be accessed as either encrypted or decrypted but the contents + * are currently not in the desired state. + * + * This routine follows the steps outlined in the AMD64 Architecture + * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place. + */ +static void __init __sme_early_enc_dec(resource_size_t paddr, + unsigned long size, bool enc) +{ + void *src, *dst; + size_t len; + + if (!sme_me_mask) + return; + + local_flush_tlb(); + wbinvd(); + + /* + * There are limited number of early mapping slots, so map (at most) + * one page at time. + */ + while (size) { + len = min_t(size_t, sizeof(sme_early_buffer), size); + + /* + * Create mappings for the current and desired format of + * the memory. Use a write-protected mapping for the source. + */ + src = enc ? early_memremap_decrypted_wp(paddr, len) : + early_memremap_encrypted_wp(paddr, len); + + dst = enc ? early_memremap_encrypted(paddr, len) : + early_memremap_decrypted(paddr, len); + + /* + * If a mapping can't be obtained to perform the operation, + * then eventual access of that area in the desired mode + * will cause a crash. + */ + BUG_ON(!src || !dst); + + /* + * Use a temporary buffer, of cache-line multiple size, to + * avoid data corruption as documented in the APM. + */ + memcpy(sme_early_buffer, src, len); + memcpy(dst, sme_early_buffer, len); + + early_memunmap(dst, len); + early_memunmap(src, len); + + paddr += len; + size -= len; + } +} + +void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, true); +} + +void __init sme_early_decrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, false); +} + +static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size, + bool map) +{ + unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET; + pmdval_t pmd_flags, pmd; + + /* Use early_pmd_flags but remove the encryption mask */ + pmd_flags = __sme_clr(early_pmd_flags); + + do { + pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0; + __early_make_pgtable((unsigned long)vaddr, pmd); + + vaddr += PMD_SIZE; + paddr += PMD_SIZE; + size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE; + } while (size); + + __native_flush_tlb(); +} + +void __init sme_unmap_bootdata(char *real_mode_data) +{ + struct boot_params *boot_data; + unsigned long cmdline_paddr; + + if (!sme_active()) + return; + + /* Get the command line address before unmapping the real_mode_data */ + boot_data = (struct boot_params *)real_mode_data; + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); + + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false); + + if (!cmdline_paddr) + return; + + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false); +} + +void __init sme_map_bootdata(char *real_mode_data) +{ + struct boot_params *boot_data; + unsigned long cmdline_paddr; + + if (!sme_active()) + return; + + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true); + + /* Get the command line address after mapping the real_mode_data */ + boot_data = (struct boot_params *)real_mode_data; + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); + + if (!cmdline_paddr) + return; + + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true); +} + +void __init sme_early_init(void) +{ + unsigned int i; + + if (!sme_me_mask) + return; + + early_pmd_flags = __sme_set(early_pmd_flags); + + __supported_pte_mask = __sme_set(__supported_pte_mask); + + /* Update the protection map with memory encryption mask */ + for (i = 0; i < ARRAY_SIZE(protection_map); i++) + protection_map[i] = pgprot_encrypted(protection_map[i]); +} + +/* Architecture __weak replacement functions */ +void __init mem_encrypt_init(void) +{ + if (!sme_me_mask) + return; + + /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ + swiotlb_update_mem_attributes(); + + pr_info("AMD Secure Memory Encryption (SME) active\n"); +} + +void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) +{ + WARN(PAGE_ALIGN(size) != size, + "size is not page-aligned (%#lx)\n", size); + + /* Make the SWIOTLB buffer area decrypted */ + set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT); +} + +static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start, + unsigned long end) +{ + unsigned long pgd_start, pgd_end, pgd_size; + pgd_t *pgd_p; + + pgd_start = start & PGDIR_MASK; + pgd_end = end & PGDIR_MASK; + + pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1); + pgd_size *= sizeof(pgd_t); + + pgd_p = pgd_base + pgd_index(start); + + memset(pgd_p, 0, pgd_size); +} + +#define PGD_FLAGS _KERNPG_TABLE_NOENC +#define P4D_FLAGS _KERNPG_TABLE_NOENC +#define PUD_FLAGS _KERNPG_TABLE_NOENC +#define PMD_FLAGS (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) + +static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area, + unsigned long vaddr, pmdval_t pmd_val) +{ + pgd_t *pgd_p; + p4d_t *p4d_p; + pud_t *pud_p; + pmd_t *pmd_p; + + pgd_p = pgd_base + pgd_index(vaddr); + if (native_pgd_val(*pgd_p)) { + if (IS_ENABLED(CONFIG_X86_5LEVEL)) + p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); + else + pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); + } else { + pgd_t pgd; + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_p = pgtable_area; + memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); + pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; + + pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS); + } else { + pud_p = pgtable_area; + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); + pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; + + pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS); + } + native_set_pgd(pgd_p, pgd); + } + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_p += p4d_index(vaddr); + if (native_p4d_val(*p4d_p)) { + pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK); + } else { + p4d_t p4d; + + pud_p = pgtable_area; + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); + pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; + + p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS); + native_set_p4d(p4d_p, p4d); + } + } + + pud_p += pud_index(vaddr); + if (native_pud_val(*pud_p)) { + if (native_pud_val(*pud_p) & _PAGE_PSE) + goto out; + + pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK); + } else { + pud_t pud; + + pmd_p = pgtable_area; + memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); + pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; + + pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS); + native_set_pud(pud_p, pud); + } + + pmd_p += pmd_index(vaddr); + if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) + native_set_pmd(pmd_p, native_make_pmd(pmd_val)); + +out: + return pgtable_area; +} + +static unsigned long __init sme_pgtable_calc(unsigned long len) +{ + unsigned long p4d_size, pud_size, pmd_size; + unsigned long total; + + /* + * Perform a relatively simplistic calculation of the pagetable + * entries that are needed. That mappings will be covered by 2MB + * PMD entries so we can conservatively calculate the required + * number of P4D, PUD and PMD structures needed to perform the + * mappings. Incrementing the count for each covers the case where + * the addresses cross entries. + */ + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; + p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; + pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } else { + p4d_size = 0; + pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } + pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1; + pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; + + total = p4d_size + pud_size + pmd_size; + + /* + * Now calculate the added pagetable structures needed to populate + * the new pagetables. + */ + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; + p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; + pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } else { + p4d_size = 0; + pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } + pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE; + pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; + + total += p4d_size + pud_size + pmd_size; + + return total; +} + +void __init sme_encrypt_kernel(void) +{ + unsigned long workarea_start, workarea_end, workarea_len; + unsigned long execute_start, execute_end, execute_len; + unsigned long kernel_start, kernel_end, kernel_len; + unsigned long pgtable_area_len; + unsigned long paddr, pmd_flags; + unsigned long decrypted_base; + void *pgtable_area; + pgd_t *pgd; + + if (!sme_active()) + return; + + /* + * Prepare for encrypting the kernel by building new pagetables with + * the necessary attributes needed to encrypt the kernel in place. + * + * One range of virtual addresses will map the memory occupied + * by the kernel as encrypted. + * + * Another range of virtual addresses will map the memory occupied + * by the kernel as decrypted and write-protected. + * + * The use of write-protect attribute will prevent any of the + * memory from being cached. + */ + + /* Physical addresses gives us the identity mapped virtual addresses */ + kernel_start = __pa_symbol(_text); + kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); + kernel_len = kernel_end - kernel_start; + + /* Set the encryption workarea to be immediately after the kernel */ + workarea_start = kernel_end; + + /* + * Calculate required number of workarea bytes needed: + * executable encryption area size: + * stack page (PAGE_SIZE) + * encryption routine page (PAGE_SIZE) + * intermediate copy buffer (PMD_PAGE_SIZE) + * pagetable structures for the encryption of the kernel + * pagetable structures for workarea (in case not currently mapped) + */ + execute_start = workarea_start; + execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE; + execute_len = execute_end - execute_start; + + /* + * One PGD for both encrypted and decrypted mappings and a set of + * PUDs and PMDs for each of the encrypted and decrypted mappings. + */ + pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; + pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; + + /* PUDs and PMDs needed in the current pagetables for the workarea */ + pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); + + /* + * The total workarea includes the executable encryption area and + * the pagetable area. + */ + workarea_len = execute_len + pgtable_area_len; + workarea_end = workarea_start + workarea_len; + + /* + * Set the address to the start of where newly created pagetable + * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable + * structures are created when the workarea is added to the current + * pagetables and when the new encrypted and decrypted kernel + * mappings are populated. + */ + pgtable_area = (void *)execute_end; + + /* + * Make sure the current pagetable structure has entries for + * addressing the workarea. + */ + pgd = (pgd_t *)native_read_cr3_pa(); + paddr = workarea_start; + while (paddr < workarea_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr, + paddr + PMD_FLAGS); + + paddr += PMD_PAGE_SIZE; + } + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); + + /* + * A new pagetable structure is being built to allow for the kernel + * to be encrypted. It starts with an empty PGD that will then be + * populated with new PUDs and PMDs as the encrypted and decrypted + * kernel mappings are created. + */ + pgd = pgtable_area; + memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD); + pgtable_area += sizeof(*pgd) * PTRS_PER_PGD; + + /* Add encrypted kernel (identity) mappings */ + pmd_flags = PMD_FLAGS | _PAGE_ENC; + paddr = kernel_start; + while (paddr < kernel_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr, + paddr + pmd_flags); + + paddr += PMD_PAGE_SIZE; + } + + /* + * A different PGD index/entry must be used to get different + * pagetable entries for the decrypted mapping. Choose the next + * PGD index and convert it to a virtual address to be used as + * the base of the mapping. + */ + decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); + decrypted_base <<= PGDIR_SHIFT; + + /* Add decrypted, write-protected kernel (non-identity) mappings */ + pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT); + paddr = kernel_start; + while (paddr < kernel_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr + decrypted_base, + paddr + pmd_flags); + + paddr += PMD_PAGE_SIZE; + } + + /* Add decrypted workarea mappings to both kernel mappings */ + paddr = workarea_start; + while (paddr < workarea_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr, + paddr + PMD_FLAGS); + + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr + decrypted_base, + paddr + PMD_FLAGS); + + paddr += PMD_PAGE_SIZE; + } + + /* Perform the encryption */ + sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, + kernel_len, workarea_start, (unsigned long)pgd); + + /* + * At this point we are running encrypted. Remove the mappings for + * the decrypted areas - all that is needed for this is to remove + * the PGD entry/entries. + */ + sme_clear_pgd(pgd, kernel_start + decrypted_base, + kernel_end + decrypted_base); + + sme_clear_pgd(pgd, workarea_start + decrypted_base, + workarea_end + decrypted_base); + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); +} + +void __init __nostackprotector sme_enable(struct boot_params *bp) +{ + const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; + unsigned int eax, ebx, ecx, edx; + bool active_by_default; + unsigned long me_mask; + char buffer[16]; + u64 msr; + + /* Check for the SME support leaf */ + eax = 0x80000000; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax < 0x8000001f) + return; + + /* + * Check for the SME feature: + * CPUID Fn8000_001F[EAX] - Bit 0 + * Secure Memory Encryption support + * CPUID Fn8000_001F[EBX] - Bits 5:0 + * Pagetable bit position used to indicate encryption + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (!(eax & 1)) + return; + + me_mask = 1UL << (ebx & 0x3f); + + /* Check if SME is enabled */ + msr = __rdmsr(MSR_K8_SYSCFG); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + return; + + /* + * Fixups have not been applied to phys_base yet and we're running + * identity mapped, so we must obtain the address to the SME command + * line argument data using rip-relative addressing. + */ + asm ("lea sme_cmdline_arg(%%rip), %0" + : "=r" (cmdline_arg) + : "p" (sme_cmdline_arg)); + asm ("lea sme_cmdline_on(%%rip), %0" + : "=r" (cmdline_on) + : "p" (sme_cmdline_on)); + asm ("lea sme_cmdline_off(%%rip), %0" + : "=r" (cmdline_off) + : "p" (sme_cmdline_off)); + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) + active_by_default = true; + else + active_by_default = false; + + cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | + ((u64)bp->ext_cmd_line_ptr << 32)); + + cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)); + + if (!strncmp(buffer, cmdline_on, sizeof(buffer))) + sme_me_mask = me_mask; + else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) + sme_me_mask = 0; + else + sme_me_mask = active_by_default ? me_mask : 0; +} diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S new file mode 100644 index 000000000000..730e6d541df1 --- /dev/null +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -0,0 +1,149 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/pgtable.h> +#include <asm/page.h> +#include <asm/processor-flags.h> +#include <asm/msr-index.h> + + .text + .code64 +ENTRY(sme_encrypt_execute) + + /* + * Entry parameters: + * RDI - virtual address for the encrypted kernel mapping + * RSI - virtual address for the decrypted kernel mapping + * RDX - length of kernel + * RCX - virtual address of the encryption workarea, including: + * - stack page (PAGE_SIZE) + * - encryption routine page (PAGE_SIZE) + * - intermediate copy buffer (PMD_PAGE_SIZE) + * R8 - physcial address of the pagetables to use for encryption + */ + + push %rbp + movq %rsp, %rbp /* RBP now has original stack pointer */ + + /* Set up a one page stack in the non-encrypted memory area */ + movq %rcx, %rax /* Workarea stack page */ + leaq PAGE_SIZE(%rax), %rsp /* Set new stack pointer */ + addq $PAGE_SIZE, %rax /* Workarea encryption routine */ + + push %r12 + movq %rdi, %r10 /* Encrypted kernel */ + movq %rsi, %r11 /* Decrypted kernel */ + movq %rdx, %r12 /* Kernel length */ + + /* Copy encryption routine into the workarea */ + movq %rax, %rdi /* Workarea encryption routine */ + leaq __enc_copy(%rip), %rsi /* Encryption routine */ + movq $(.L__enc_copy_end - __enc_copy), %rcx /* Encryption routine length */ + rep movsb + + /* Setup registers for call */ + movq %r10, %rdi /* Encrypted kernel */ + movq %r11, %rsi /* Decrypted kernel */ + movq %r8, %rdx /* Pagetables used for encryption */ + movq %r12, %rcx /* Kernel length */ + movq %rax, %r8 /* Workarea encryption routine */ + addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */ + + call *%rax /* Call the encryption routine */ + + pop %r12 + + movq %rbp, %rsp /* Restore original stack pointer */ + pop %rbp + + ret +ENDPROC(sme_encrypt_execute) + +ENTRY(__enc_copy) +/* + * Routine used to encrypt kernel. + * This routine must be run outside of the kernel proper since + * the kernel will be encrypted during the process. So this + * routine is defined here and then copied to an area outside + * of the kernel where it will remain and run decrypted + * during execution. + * + * On entry the registers must be: + * RDI - virtual address for the encrypted kernel mapping + * RSI - virtual address for the decrypted kernel mapping + * RDX - address of the pagetables to use for encryption + * RCX - length of kernel + * R8 - intermediate copy buffer + * + * RAX - points to this routine + * + * The kernel will be encrypted by copying from the non-encrypted + * kernel space to an intermediate buffer and then copying from the + * intermediate buffer back to the encrypted kernel space. The physical + * addresses of the two kernel space mappings are the same which + * results in the kernel being encrypted "in place". + */ + /* Enable the new page tables */ + mov %rdx, %cr3 + + /* Flush any global TLBs */ + mov %cr4, %rdx + andq $~X86_CR4_PGE, %rdx + mov %rdx, %cr4 + orq $X86_CR4_PGE, %rdx + mov %rdx, %cr4 + + /* Set the PAT register PA5 entry to write-protect */ + push %rcx + movl $MSR_IA32_CR_PAT, %ecx + rdmsr + push %rdx /* Save original PAT value */ + andl $0xffff00ff, %edx /* Clear PA5 */ + orl $0x00000500, %edx /* Set PA5 to WP */ + wrmsr + pop %rdx /* RDX contains original PAT value */ + pop %rcx + + movq %rcx, %r9 /* Save kernel length */ + movq %rdi, %r10 /* Save encrypted kernel address */ + movq %rsi, %r11 /* Save decrypted kernel address */ + + wbinvd /* Invalidate any cache entries */ + + /* Copy/encrypt 2MB at a time */ +1: + movq %r11, %rsi /* Source - decrypted kernel */ + movq %r8, %rdi /* Dest - intermediate copy buffer */ + movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ + rep movsb + + movq %r8, %rsi /* Source - intermediate copy buffer */ + movq %r10, %rdi /* Dest - encrypted kernel */ + movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ + rep movsb + + addq $PMD_PAGE_SIZE, %r11 + addq $PMD_PAGE_SIZE, %r10 + subq $PMD_PAGE_SIZE, %r9 /* Kernel length decrement */ + jnz 1b /* Kernel length not zero? */ + + /* Restore PAT register */ + push %rdx /* Save original PAT value */ + movl $MSR_IA32_CR_PAT, %ecx + rdmsr + pop %rdx /* Restore original PAT value */ + wrmsr + + ret +.L__enc_copy_end: +ENDPROC(__enc_copy) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index a88cfbfbd078..a99679826846 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -37,21 +37,21 @@ struct va_alignment __read_mostly va_align = { .flags = -1, }; -unsigned long tasksize_32bit(void) +unsigned long task_size_32bit(void) { return IA32_PAGE_OFFSET; } -unsigned long tasksize_64bit(void) +unsigned long task_size_64bit(int full_addr_space) { - return TASK_SIZE_MAX; + return full_addr_space ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW; } static unsigned long stack_maxrandom_size(unsigned long task_size) { unsigned long max = 0; if (current->flags & PF_RANDOMIZE) { - max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit()); + max = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit()); max <<= PAGE_SHIFT; } @@ -141,7 +141,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->get_unmapped_area = arch_get_unmapped_area_topdown; arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, - arch_rnd(mmap64_rnd_bits), tasksize_64bit()); + arch_rnd(mmap64_rnd_bits), task_size_64bit(0)); #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES /* @@ -151,7 +151,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * mmap_base, the compat syscall uses mmap_compat_base. */ arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, - arch_rnd(mmap32_rnd_bits), tasksize_32bit()); + arch_rnd(mmap32_rnd_bits), task_size_32bit()); #endif } diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 1c34b767c84c..9ceaa955d2ba 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -355,10 +355,19 @@ int mpx_enable_management(void) */ bd_base = mpx_get_bounds_dir(); down_write(&mm->mmap_sem); + + /* MPX doesn't support addresses above 47 bits yet. */ + if (find_vma(mm, DEFAULT_MAP_WINDOW)) { + pr_warn_once("%s (%d): MPX cannot handle addresses " + "above 47-bits. Disabling.", + current->comm, current->pid); + ret = -ENXIO; + goto out; + } mm->context.bd_addr = bd_base; if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR) ret = -ENXIO; - +out: up_write(&mm->mmap_sem); return ret; } @@ -1030,3 +1039,25 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, if (ret) force_sig(SIGSEGV, current); } + +/* MPX cannot handle addresses above 47 bits yet. */ +unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, + unsigned long flags) +{ + if (!kernel_managing_mpx_tables(current->mm)) + return addr; + if (addr + len <= DEFAULT_MAP_WINDOW) + return addr; + if (flags & MAP_FIXED) + return -ENOMEM; + + /* + * Requested len is larger than the whole area we're allowed to map in. + * Resetting hinting address wouldn't do much good -- fail early. + */ + if (len > DEFAULT_MAP_WINDOW) + return -ENOMEM; + + /* Look for unmap area within DEFAULT_MAP_WINDOW */ + return 0; +} diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index a8f90ce3dedf..d805162e6045 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -75,13 +75,15 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, /* * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr - * to max_addr. The return value is the number of nodes allocated. + * to max_addr. + * + * Returns zero on success or negative on error. */ static int __init split_nodes_interleave(struct numa_meminfo *ei, struct numa_meminfo *pi, u64 addr, u64 max_addr, int nr_nodes) { - nodemask_t physnode_mask = NODE_MASK_NONE; + nodemask_t physnode_mask = numa_nodes_parsed; u64 size; int big; int nid = 0; @@ -116,9 +118,6 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, return -1; } - for (i = 0; i < pi->nr_blks; i++) - node_set(pi->blk[i].nid, physnode_mask); - /* * Continue to fill physical nodes with fake nodes until there is no * memory left on any of them. @@ -200,13 +199,15 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) /* * Sets up fake nodes of `size' interleaved over physical nodes ranging from - * `addr' to `max_addr'. The return value is the number of nodes allocated. + * `addr' to `max_addr'. + * + * Returns zero on success or negative on error. */ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, struct numa_meminfo *pi, u64 addr, u64 max_addr, u64 size) { - nodemask_t physnode_mask = NODE_MASK_NONE; + nodemask_t physnode_mask = numa_nodes_parsed; u64 min_size; int nid = 0; int i, ret; @@ -231,9 +232,6 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, } size &= FAKE_NODE_MIN_HASH_MASK; - for (i = 0; i < pi->nr_blks; i++) - node_set(pi->blk[i].nid, physnode_mask); - /* * Fill physical nodes with fake nodes of size until there is no memory * left on any of them. @@ -280,6 +278,22 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, return 0; } +int __init setup_emu2phys_nid(int *dfl_phys_nid) +{ + int i, max_emu_nid = 0; + + *dfl_phys_nid = NUMA_NO_NODE; + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { + if (emu_nid_to_phys[i] != NUMA_NO_NODE) { + max_emu_nid = i; + if (*dfl_phys_nid == NUMA_NO_NODE) + *dfl_phys_nid = emu_nid_to_phys[i]; + } + } + + return max_emu_nid; +} + /** * numa_emulation - Emulate NUMA nodes * @numa_meminfo: NUMA configuration to massage @@ -376,23 +390,18 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) * Determine the max emulated nid and the default phys nid to use * for unmapped nodes. */ - max_emu_nid = 0; - dfl_phys_nid = NUMA_NO_NODE; - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { - if (emu_nid_to_phys[i] != NUMA_NO_NODE) { - max_emu_nid = i; - if (dfl_phys_nid == NUMA_NO_NODE) - dfl_phys_nid = emu_nid_to_phys[i]; - } - } - if (dfl_phys_nid == NUMA_NO_NODE) { - pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); - goto no_emu; - } + max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); /* commit */ *numa_meminfo = ei; + /* Make sure numa_nodes_parsed only contains emulated nodes */ + nodes_clear(numa_nodes_parsed); + for (i = 0; i < ARRAY_SIZE(ei.blk); i++) + if (ei.blk[i].start != ei.blk[i].end && + ei.blk[i].nid != NUMA_NO_NODE) + node_set(ei.blk[i].nid, numa_nodes_parsed); + /* * Transform __apicid_to_node table to use emulated nids by * reverse-mapping phys_nid. The maps should always exist but fall diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 757b0bcdf712..dfb7d657cf43 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1775,6 +1775,70 @@ int set_memory_4k(unsigned long addr, int numpages) __pgprot(0), 1, 0, NULL); } +static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) +{ + struct cpa_data cpa; + unsigned long start; + int ret; + + /* Nothing to do if the SME is not active */ + if (!sme_active()) + return 0; + + /* Should not be working on unaligned addresses */ + if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) + addr &= PAGE_MASK; + + start = addr; + + memset(&cpa, 0, sizeof(cpa)); + cpa.vaddr = &addr; + cpa.numpages = numpages; + cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0); + cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC); + cpa.pgd = init_mm.pgd; + + /* Must avoid aliasing mappings in the highmem code */ + kmap_flush_unused(); + vm_unmap_aliases(); + + /* + * Before changing the encryption attribute, we need to flush caches. + */ + if (static_cpu_has(X86_FEATURE_CLFLUSH)) + cpa_flush_range(start, numpages, 1); + else + cpa_flush_all(1); + + ret = __change_page_attr_set_clr(&cpa, 1); + + /* + * After changing the encryption attribute, we need to flush TLBs + * again in case any speculative TLB caching occurred (but no need + * to flush caches again). We could just use cpa_flush_all(), but + * in case TLB flushing gets optimized in the cpa_flush_range() + * path use the same logic as above. + */ + if (static_cpu_has(X86_FEATURE_CLFLUSH)) + cpa_flush_range(start, numpages, 0); + else + cpa_flush_all(0); + + return ret; +} + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + return __set_memory_enc_dec(addr, numpages, true); +} +EXPORT_SYMBOL_GPL(set_memory_encrypted); + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + return __set_memory_enc_dec(addr, numpages, false); +} +EXPORT_SYMBOL_GPL(set_memory_decrypted); + int set_pages_uc(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); @@ -2020,6 +2084,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, if (!(page_flags & _PAGE_RW)) cpa.mask_clr = __pgprot(_PAGE_RW); + if (!(page_flags & _PAGE_ENC)) + cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); + cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); retval = __change_page_attr_set_clr(&cpa, 0); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 45979502f64b..fe7d57a8fb60 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -293,7 +293,7 @@ void init_cache_modes(void) * pat_init - Initialize PAT MSR and PAT table * * This function initializes PAT MSR and PAT table with an OS-defined value - * to enable additional cache attributes, WC and WT. + * to enable additional cache attributes, WC, WT and WP. * * This function must be called on all CPUs using the specific sequence of * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this @@ -352,7 +352,7 @@ void pat_init(void) * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS * 011 3 UC : _PAGE_CACHE_MODE_UC * 100 4 WB : Reserved - * 101 5 WC : Reserved + * 101 5 WP : _PAGE_CACHE_MODE_WP * 110 6 UC-: Reserved * 111 7 WT : _PAGE_CACHE_MODE_WT * @@ -360,7 +360,7 @@ void pat_init(void) * corresponding types in the presence of PAT errata. */ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | - PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT); + PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT); } if (!boot_cpu_done) { @@ -744,6 +744,9 @@ EXPORT_SYMBOL(arch_io_free_memtype_wc); pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { + if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size)) + vma_prot = pgprot_decrypted(vma_prot); + return vma_prot; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 508a708eb9a6..218834a3e9ad 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -56,7 +56,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); - tlb_remove_page(tlb, pte); + tlb_remove_table(tlb, pte); } #if CONFIG_PGTABLE_LEVELS > 2 @@ -72,21 +72,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) tlb->need_flush_all = 1; #endif pgtable_pmd_page_dtor(page); - tlb_remove_page(tlb, page); + tlb_remove_table(tlb, page); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(pud)); + tlb_remove_table(tlb, virt_to_page(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(p4d)); + tlb_remove_table(tlb, virt_to_page(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 014d07a80053..ce104b962a17 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,6 +28,42 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ +atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); + +static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, + u16 *new_asid, bool *need_flush) +{ + u16 asid; + + if (!static_cpu_has(X86_FEATURE_PCID)) { + *new_asid = 0; + *need_flush = true; + return; + } + + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { + if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != + next->context.ctx_id) + continue; + + *new_asid = asid; + *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < + next_tlb_gen); + return; + } + + /* + * We don't currently own an ASID slot on this CPU. + * Allocate a slot. + */ + *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; + if (*new_asid >= TLB_NR_DYN_ASIDS) { + *new_asid = 0; + this_cpu_write(cpu_tlbstate.next_asid, 1); + } + *need_flush = true; +} + void leave_mm(int cpu) { struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); @@ -43,12 +79,11 @@ void leave_mm(int cpu) if (loaded_mm == &init_mm) return; - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - BUG(); + /* Warn if we're not lazy. */ + WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))); switch_mm(NULL, &init_mm, NULL); } -EXPORT_SYMBOL_GPL(leave_mm); void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) @@ -63,115 +98,219 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { - unsigned cpu = smp_processor_id(); struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); + u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + unsigned cpu = smp_processor_id(); + u64 next_tlb_gen; /* - * NB: The scheduler will call us with prev == next when - * switching from lazy TLB mode to normal mode if active_mm - * isn't changing. When this happens, there is no guarantee - * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next. + * NB: The scheduler will call us with prev == next when switching + * from lazy TLB mode to normal mode if active_mm isn't changing. + * When this happens, we don't assume that CR3 (and hence + * cpu_tlbstate.loaded_mm) matches next. * * NB: leave_mm() calls us with prev == NULL and tsk == NULL. */ - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + /* We don't want flush_tlb_func_* to run concurrently with us. */ + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) + WARN_ON_ONCE(!irqs_disabled()); + + /* + * Verify that CR3 is what we think it is. This will catch + * hypothetical buggy code that directly switches to swapper_pg_dir + * without going through leave_mm() / switch_mm_irqs_off() or that + * does something like write_cr3(read_cr3_pa()). + */ + VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid)); if (real_prev == next) { - /* - * There's nothing to do: we always keep the per-mm control - * regs in sync with cpu_tlbstate.loaded_mm. Just - * sanity-check mm_cpumask. - */ - if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next)))) - cpumask_set_cpu(cpu, mm_cpumask(next)); - return; - } + VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != + next->context.ctx_id); + + if (cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * There's nothing to do: we weren't lazy, and we + * aren't changing our mm. We don't need to flush + * anything, nor do we need to update CR3, CR4, or + * LDTR. + */ + return; + } + + /* Resume remote flushes and then read tlb_gen. */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + + if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) < + next_tlb_gen) { + /* + * Ideally, we'd have a flush_tlb() variant that + * takes the known CR3 value as input. This would + * be faster on Xen PV and on hypothetical CPUs + * on which INVPCID is fast. + */ + this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, + next_tlb_gen); + write_cr3(__sme_pa(next->pgd) | prev_asid); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, + TLB_FLUSH_ALL); + } - if (IS_ENABLED(CONFIG_VMAP_STACK)) { /* - * If our current stack is in vmalloc space and isn't - * mapped in the new pgd, we'll double-fault. Forcibly - * map it. + * We just exited lazy mode, which means that CR4 and/or LDTR + * may be stale. (Changes to the required CR4 and LDTR states + * are not reflected in tlb_gen.) */ - unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); - - pgd_t *pgd = next->pgd + stack_pgd_index; - - if (unlikely(pgd_none(*pgd))) - set_pgd(pgd, init_mm.pgd[stack_pgd_index]); - } + } else { + u16 new_asid; + bool need_flush; + + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + /* + * If our current stack is in vmalloc space and isn't + * mapped in the new pgd, we'll double-fault. Forcibly + * map it. + */ + unsigned int index = pgd_index(current_stack_pointer()); + pgd_t *pgd = next->pgd + index; + + if (unlikely(pgd_none(*pgd))) + set_pgd(pgd, init_mm.pgd[index]); + } - this_cpu_write(cpu_tlbstate.loaded_mm, next); + /* Stop remote flushes for the previous mm */ + if (cpumask_test_cpu(cpu, mm_cpumask(real_prev))) + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); - WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); - cpumask_set_cpu(cpu, mm_cpumask(next)); + VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); - /* - * Re-load page tables. - * - * This logic has an ordering constraint: - * - * CPU 0: Write to a PTE for 'next' - * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. - * CPU 1: set bit 1 in next's mm_cpumask - * CPU 1: load from the PTE that CPU 0 writes (implicit) - * - * We need to prevent an outcome in which CPU 1 observes - * the new PTE value and CPU 0 observes bit 1 clear in - * mm_cpumask. (If that occurs, then the IPI will never - * be sent, and CPU 0's TLB will contain a stale entry.) - * - * The bad outcome can occur if either CPU's load is - * reordered before that CPU's store, so both CPUs must - * execute full barriers to prevent this from happening. - * - * Thus, switch_mm needs a full barrier between the - * store to mm_cpumask and any operation that could load - * from next->pgd. TLB fills are special and can happen - * due to instruction fetches or for no reason at all, - * and neither LOCK nor MFENCE orders them. - * Fortunately, load_cr3() is serializing and gives the - * ordering guarantee we need. - */ - load_cr3(next->pgd); - - /* - * This gets called via leave_mm() in the idle path where RCU - * functions differently. Tracing normally uses RCU, so we have to - * call the tracepoint specially here. - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + /* + * Start remote flushes and then read tlb_gen. + */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + + choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + write_cr3(__sme_pa(next->pgd) | new_asid); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, + TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ + write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); + } - /* Stop flush ipis for the previous mm */ - WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && - real_prev != &init_mm); - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + } - /* Load per-mm CR4 and LDTR state */ load_mm_cr4(next); switch_ldt(real_prev, next); } +/* + * flush_tlb_func_common()'s memory ordering requirement is that any + * TLB fills that happen after we flush the TLB are ordered after we + * read active_mm's tlb_gen. We don't need any explicit barriers + * because all x86 flush operations are serializing and the + * atomic64_read operation won't be reordered by the compiler. + */ static void flush_tlb_func_common(const struct flush_tlb_info *f, bool local, enum tlb_flush_reason reason) { + /* + * We have three different tlb_gen values in here. They are: + * + * - mm_tlb_gen: the latest generation. + * - local_tlb_gen: the generation that this CPU has already caught + * up to. + * - f->new_tlb_gen: the generation that the requester of the flush + * wants us to catch up to. + */ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); + u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); + /* This code cannot presently handle being reentered. */ VM_WARN_ON(!irqs_disabled()); - if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) { - leave_mm(smp_processor_id()); + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != + loaded_mm->context.ctx_id); + + if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { + /* + * We're in lazy mode -- don't flush. We can get here on + * remote flushes due to races and on local flushes if a + * kernel thread coincidentally flushes the mm it's lazily + * still using. + */ return; } - if (f->end == TLB_FLUSH_ALL) { - local_flush_tlb(); - if (local) - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - trace_tlb_flush(reason, TLB_FLUSH_ALL); - } else { + if (unlikely(local_tlb_gen == mm_tlb_gen)) { + /* + * There's nothing to do: we're already up to date. This can + * happen if two concurrent flushes happen -- the first flush to + * be handled can catch us all the way up, leaving no work for + * the second flush. + */ + trace_tlb_flush(reason, 0); + return; + } + + WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); + WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); + + /* + * If we get to this point, we know that our TLB is out of date. + * This does not strictly imply that we need to flush (it's + * possible that f->new_tlb_gen <= local_tlb_gen), but we're + * going to need to flush in the very near future, so we might + * as well get it over with. + * + * The only question is whether to do a full or partial flush. + * + * We do a partial flush if requested and two extra conditions + * are met: + * + * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that + * we've always done all needed flushes to catch up to + * local_tlb_gen. If, for example, local_tlb_gen == 2 and + * f->new_tlb_gen == 3, then we know that the flush needed to bring + * us up to date for tlb_gen 3 is the partial flush we're + * processing. + * + * As an example of why this check is needed, suppose that there + * are two concurrent flushes. The first is a full flush that + * changes context.tlb_gen from 1 to 2. The second is a partial + * flush that changes context.tlb_gen from 2 to 3. If they get + * processed on this CPU in reverse order, we'll see + * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. + * If we were to use __flush_tlb_single() and set local_tlb_gen to + * 3, we'd be break the invariant: we'd update local_tlb_gen above + * 1 without the full flush that's needed for tlb_gen 2. + * + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation. + * Partial TLB flushes are not all that much cheaper than full TLB + * flushes, so it seems unlikely that it would be a performance win + * to do a partial flush if that won't bring our TLB fully up to + * date. By doing a full flush instead, we can increase + * local_tlb_gen all the way to mm_tlb_gen and we can probably + * avoid another flush in the very near future. + */ + if (f->end != TLB_FLUSH_ALL && + f->new_tlb_gen == local_tlb_gen + 1 && + f->new_tlb_gen == mm_tlb_gen) { + /* Partial flush */ unsigned long addr; unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; + addr = f->start; while (addr < f->end) { __flush_tlb_single(addr); @@ -180,7 +319,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, if (local) count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); trace_tlb_flush(reason, nr_pages); + } else { + /* Full flush. */ + local_flush_tlb(); + if (local) + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + trace_tlb_flush(reason, TLB_FLUSH_ALL); } + + /* Both paths above update our state to mm_tlb_gen. */ + this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); } static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) @@ -214,6 +362,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (info->end - info->start) >> PAGE_SHIFT); if (is_uv_system()) { + /* + * This whole special case is confused. UV has a "Broadcast + * Assist Unit", which seems to be a fancy way to send IPIs. + * Back when x86 used an explicit TLB flush IPI, UV was + * optimized to use its own mechanism. These days, x86 uses + * smp_call_function_many(), but UV still uses a manual IPI, + * and that IPI's action is out of date -- it does a manual + * flush instead of calling flush_tlb_func_remote(). This + * means that the percpu tlb_gen variables won't be updated + * and we'll do pointless flushes on future context switches. + * + * Rather than hooking native_flush_tlb_others() here, I think + * that UV should be updated so that smp_call_function_many(), + * etc, are optimal on UV. + */ unsigned int cpu; cpu = smp_processor_id(); @@ -250,8 +413,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, cpu = get_cpu(); - /* Synchronize with switch_mm. */ - smp_mb(); + /* This is also a barrier that synchronizes with switch_mm(). */ + info.new_tlb_gen = inc_mm_tlb_gen(mm); /* Should we flush just the requested range? */ if ((end != TLB_FLUSH_ALL) && @@ -273,6 +436,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), &info); + put_cpu(); } @@ -281,8 +445,6 @@ static void do_flush_tlb_all(void *info) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) - leave_mm(smp_processor_id()); } void flush_tlb_all(void) @@ -335,6 +497,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) flush_tlb_others(&batch->cpumask, &info); + cpumask_clear(&batch->cpumask); put_cpu(); diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index dbe2132b0ed4..7a5350d08cef 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -674,7 +674,7 @@ int pcibios_add_device(struct pci_dev *dev) pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = ioremap(pa_data, sizeof(*rom)); + data = memremap(pa_data, sizeof(*rom), MEMREMAP_WB); if (!data) return -ENOMEM; @@ -693,7 +693,7 @@ int pcibios_add_device(struct pci_dev *dev) } } pa_data = data->next; - iounmap(data); + memunmap(data); } set_dma_domain_ops(dev); set_dev_domain_options(dev); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f084d8718ac4..6217b23e85f6 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -1035,12 +1035,12 @@ void __init efi_enter_virtual_mode(void) /* * Convenience functions to obtain memory types and attributes */ -u32 efi_mem_type(unsigned long phys_addr) +int efi_mem_type(unsigned long phys_addr) { efi_memory_desc_t *md; if (!efi_enabled(EFI_MEMMAP)) - return 0; + return -ENOTSUPP; for_each_efi_memory_desc(md) { if ((md->phys_addr <= phys_addr) && @@ -1048,7 +1048,7 @@ u32 efi_mem_type(unsigned long phys_addr) (md->num_pages << EFI_PAGE_SHIFT)))) return md->type; } - return 0; + return -EINVAL; } static int __init arch_parse_efi_cmdline(char *str) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 9bf72f5bfedb..12e83888e5b9 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -327,7 +327,7 @@ virt_to_phys_or_null_size(void *va, unsigned long size) int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { - unsigned long pfn, text; + unsigned long pfn, text, pf; struct page *page; unsigned npages; pgd_t *pgd; @@ -335,7 +335,12 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) if (efi_enabled(EFI_OLD_MEMMAP)) return 0; - efi_scratch.efi_pgt = (pgd_t *)__pa(efi_pgd); + /* + * Since the PGD is encrypted, set the encryption mask so that when + * this value is loaded into cr3 the PGD will be decrypted during + * the pagetable walk. + */ + efi_scratch.efi_pgt = (pgd_t *)__sme_pa(efi_pgd); pgd = efi_pgd; /* @@ -345,7 +350,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * phys_efi_set_virtual_address_map(). */ pfn = pa_memmap >> PAGE_SHIFT; - if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, _PAGE_NX | _PAGE_RW)) { + pf = _PAGE_NX | _PAGE_RW | _PAGE_ENC; + if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, pf)) { pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap); return 1; } @@ -388,7 +394,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) text = __pa(_text); pfn = text >> PAGE_SHIFT; - if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, _PAGE_RW)) { + pf = _PAGE_RW | _PAGE_ENC; + if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, pf)) { pr_err("Failed to map kernel text 1:1\n"); return 1; } diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index cd4be19c36dc..1f71980fc5e0 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -1,6 +1,7 @@ #include <linux/io.h> #include <linux/slab.h> #include <linux/memblock.h> +#include <linux/mem_encrypt.h> #include <asm/set_memory.h> #include <asm/pgtable.h> @@ -59,6 +60,13 @@ static void __init setup_real_mode(void) base = (unsigned char *)real_mode_header; + /* + * If SME is active, the trampoline area will need to be in + * decrypted memory in order to bring up other processors + * successfully. + */ + set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT); + memcpy(base, real_mode_blob, size); phys_base = __pa(base); @@ -100,6 +108,10 @@ static void __init setup_real_mode(void) trampoline_cr4_features = &trampoline_header->cr4; *trampoline_cr4_features = mmu_cr4_features; + trampoline_header->flags = 0; + if (sme_active()) + trampoline_header->flags |= TH_FLAGS_SME_ACTIVE; + trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); trampoline_pgd[0] = trampoline_pgd_entry.pgd; trampoline_pgd[511] = init_top_pgt[511].pgd; diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index dac7b20d2f9d..614fd7064d0a 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -30,6 +30,7 @@ #include <asm/msr.h> #include <asm/segment.h> #include <asm/processor-flags.h> +#include <asm/realmode.h> #include "realmode.h" .text @@ -92,6 +93,28 @@ ENTRY(startup_32) movl %edx, %fs movl %edx, %gs + /* + * Check for memory encryption support. This is a safety net in + * case BIOS hasn't done the necessary step of setting the bit in + * the MSR for this AP. If SME is active and we've gotten this far + * then it is safe for us to set the MSR bit and continue. If we + * don't we'll eventually crash trying to execute encrypted + * instructions. + */ + bt $TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags + jnc .Ldone + movl $MSR_K8_SYSCFG, %ecx + rdmsr + bts $MSR_K8_SYSCFG_MEM_ENCRYPT_BIT, %eax + jc .Ldone + + /* + * Memory encryption is enabled but the SME enable bit for this + * CPU has has not been set. It is safe to set it, so do so. + */ + wrmsr +.Ldone: + movl pa_tr_cr4, %eax movl %eax, %cr4 # Enable PAE mode @@ -147,6 +170,7 @@ GLOBAL(trampoline_header) tr_start: .space 8 GLOBAL(tr_efer) .space 8 GLOBAL(tr_cr4) .space 4 + GLOBAL(tr_flags) .space 4 END(trampoline_header) #include "trampoline_common.S" diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 027987638e98..1ecd419811a2 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -17,6 +17,9 @@ config XEN_PV bool "Xen PV guest support" default y depends on XEN + # XEN_PV is not ready to work with 5-level paging. + # Changes to hypervisor are also required. + depends on !X86_5LEVEL select XEN_HAVE_PVMMU select XEN_HAVE_VPMU help @@ -75,4 +78,6 @@ config XEN_DEBUG_FS config XEN_PVH bool "Support for running as a PVH guest" depends on XEN && XEN_PVHVM && ACPI + # Pre-built page tables are not ready to handle 5-level paging. + depends on !X86_5LEVEL def_bool n diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 811e4ddb3f37..ae2a2e2d6362 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -263,6 +263,13 @@ static void __init xen_init_capabilities(void) setup_clear_cpu_cap(X86_FEATURE_MTRR); setup_clear_cpu_cap(X86_FEATURE_ACC); setup_clear_cpu_cap(X86_FEATURE_X2APIC); + setup_clear_cpu_cap(X86_FEATURE_SME); + + /* + * Xen PV would need some work to support PCID: CR3 handling as well + * as xen_flush_tlb_others() would need updating. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); if (!xen_initial_domain()) setup_clear_cpu_cap(X86_FEATURE_ACPI); @@ -494,7 +501,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) static inline bool desc_equal(const struct desc_struct *d1, const struct desc_struct *d2) { - return d1->a == d2->a && d1->b == d2->b; + return !memcmp(d1, d2, sizeof(*d1)); } static void load_TLS_descriptor(struct thread_struct *t, @@ -579,59 +586,91 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, preempt_enable(); } +#ifdef CONFIG_X86_64 +struct trap_array_entry { + void (*orig)(void); + void (*xen)(void); + bool ist_okay; +}; + +static struct trap_array_entry trap_array[] = { + { debug, xen_xendebug, true }, + { int3, xen_xenint3, true }, + { double_fault, xen_double_fault, true }, +#ifdef CONFIG_X86_MCE + { machine_check, xen_machine_check, true }, +#endif + { nmi, xen_nmi, true }, + { overflow, xen_overflow, false }, +#ifdef CONFIG_IA32_EMULATION + { entry_INT80_compat, xen_entry_INT80_compat, false }, +#endif + { page_fault, xen_page_fault, false }, + { divide_error, xen_divide_error, false }, + { bounds, xen_bounds, false }, + { invalid_op, xen_invalid_op, false }, + { device_not_available, xen_device_not_available, false }, + { coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false }, + { invalid_TSS, xen_invalid_TSS, false }, + { segment_not_present, xen_segment_not_present, false }, + { stack_segment, xen_stack_segment, false }, + { general_protection, xen_general_protection, false }, + { spurious_interrupt_bug, xen_spurious_interrupt_bug, false }, + { coprocessor_error, xen_coprocessor_error, false }, + { alignment_check, xen_alignment_check, false }, + { simd_coprocessor_error, xen_simd_coprocessor_error, false }, +}; + +static bool get_trap_addr(void **addr, unsigned int ist) +{ + unsigned int nr; + bool ist_okay = false; + + /* + * Replace trap handler addresses by Xen specific ones. + * Check for known traps using IST and whitelist them. + * The debugger ones are the only ones we care about. + * Xen will handle faults like double_fault, * so we should never see + * them. Warn if there's an unexpected IST-using fault handler. + */ + for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) { + struct trap_array_entry *entry = trap_array + nr; + + if (*addr == entry->orig) { + *addr = entry->xen; + ist_okay = entry->ist_okay; + break; + } + } + + if (WARN_ON(ist != 0 && !ist_okay)) + return false; + + return true; +} +#endif + static int cvt_gate_to_trap(int vector, const gate_desc *val, struct trap_info *info) { unsigned long addr; - if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) + if (val->bits.type != GATE_TRAP && val->bits.type != GATE_INTERRUPT) return 0; info->vector = vector; - addr = gate_offset(*val); + addr = gate_offset(val); #ifdef CONFIG_X86_64 - /* - * Look for known traps using IST, and substitute them - * appropriately. The debugger ones are the only ones we care - * about. Xen will handle faults like double_fault, - * so we should never see them. Warn if - * there's an unexpected IST-using fault handler. - */ - if (addr == (unsigned long)debug) - addr = (unsigned long)xen_debug; - else if (addr == (unsigned long)int3) - addr = (unsigned long)xen_int3; - else if (addr == (unsigned long)stack_segment) - addr = (unsigned long)xen_stack_segment; - else if (addr == (unsigned long)double_fault) { - /* Don't need to handle these */ + if (!get_trap_addr((void **)&addr, val->bits.ist)) return 0; -#ifdef CONFIG_X86_MCE - } else if (addr == (unsigned long)machine_check) { - /* - * when xen hypervisor inject vMCE to guest, - * use native mce handler to handle it - */ - ; -#endif - } else if (addr == (unsigned long)nmi) - /* - * Use the native version as well. - */ - ; - else { - /* Some other trap using IST? */ - if (WARN_ON(val->ist != 0)) - return 0; - } #endif /* CONFIG_X86_64 */ info->address = addr; - info->cs = gate_segment(*val); - info->flags = val->dpl; + info->cs = gate_segment(val); + info->flags = val->bits.dpl; /* interrupt gates clear IF */ - if (val->type == GATE_INTERRUPT) + if (val->bits.type == GATE_INTERRUPT) info->flags |= 1 << 2; return 1; @@ -981,59 +1020,6 @@ void __ref xen_setup_vcpu_info_placement(void) } } -static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, - unsigned long addr, unsigned len) -{ - char *start, *end, *reloc; - unsigned ret; - - start = end = reloc = NULL; - -#define SITE(op, x) \ - case PARAVIRT_PATCH(op.x): \ - if (xen_have_vcpu_info_placement) { \ - start = (char *)xen_##x##_direct; \ - end = xen_##x##_direct_end; \ - reloc = xen_##x##_direct_reloc; \ - } \ - goto patch_site - - switch (type) { - SITE(pv_irq_ops, irq_enable); - SITE(pv_irq_ops, irq_disable); - SITE(pv_irq_ops, save_fl); - SITE(pv_irq_ops, restore_fl); -#undef SITE - - patch_site: - if (start == NULL || (end-start) > len) - goto default_patch; - - ret = paravirt_patch_insns(insnbuf, len, start, end); - - /* Note: because reloc is assigned from something that - appears to be an array, gcc assumes it's non-null, - but doesn't know its relationship with start and - end. */ - if (reloc > start && reloc < end) { - int reloc_off = reloc - start; - long *relocp = (long *)(insnbuf + reloc_off); - long delta = start - (char *)addr; - - *relocp += delta; - } - break; - - default_patch: - default: - ret = paravirt_patch_default(type, clobbers, insnbuf, - addr, len); - break; - } - - return ret; -} - static const struct pv_info xen_info __initconst = { .shared_kernel_pmd = 0, @@ -1043,10 +1029,6 @@ static const struct pv_info xen_info __initconst = { .name = "Xen", }; -static const struct pv_init_ops xen_init_ops __initconst = { - .patch = xen_patch, -}; - static const struct pv_cpu_ops xen_cpu_ops __initconst = { .cpuid = xen_cpuid, @@ -1244,7 +1226,7 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - pv_init_ops = xen_init_ops; + pv_init_ops.patch = paravirt_patch_default; pv_cpu_ops = xen_cpu_ops; x86_platform.get_nmi_reason = xen_get_nmi_reason; diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 33e92955e09d..d4eff5676cfa 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -123,9 +123,6 @@ static const struct pv_irq_ops xen_irq_ops __initconst = { .safe_halt = xen_safe_halt, .halt = xen_halt, -#ifdef CONFIG_X86_64 - .adjust_exception_frame = xen_adjust_exception_frame, -#endif }; void __init xen_init_irq_ops(void) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index cab28cf2cffb..e437714750f8 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1005,14 +1005,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm) /* Get the "official" set of cpus referring to our pagetable. */ if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { for_each_online_cpu(cpu) { - if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) - && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) + if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) continue; smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1); } return; } - cpumask_copy(mask, mm_cpumask(mm)); /* * It's possible that a vcpu may have a stale reference to our @@ -1021,6 +1019,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm) * look at its actual current cr3 value, and force it to flush * if needed. */ + cpumask_clear(mask); for_each_online_cpu(cpu) { if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) cpumask_set_cpu(cpu, mask); diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index eff224df813f..dcd31fa39b5d 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -1,14 +1,8 @@ /* - * Asm versions of Xen pv-ops, suitable for either direct use or - * inlining. The inline versions are the same as the direct-use - * versions, with the pre- and post-amble chopped off. - * - * This code is encoded for size rather than absolute efficiency, with - * a view to being able to inline as much as possible. + * Asm versions of Xen pv-ops, suitable for direct use. * * We only bother with direct forms (ie, vcpu in percpu data) of the - * operations here; the indirect forms are better handled in C, since - * they're generally too large to inline anyway. + * operations here; the indirect forms are better handled in C. */ #include <asm/asm-offsets.h> @@ -16,7 +10,7 @@ #include <asm/processor-flags.h> #include <asm/frame.h> -#include "xen-asm.h" +#include <linux/linkage.h> /* * Enable events. This clears the event mask and tests the pending @@ -38,13 +32,11 @@ ENTRY(xen_irq_enable_direct) testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending jz 1f -2: call check_events + call check_events 1: -ENDPATCH(xen_irq_enable_direct) FRAME_END ret ENDPROC(xen_irq_enable_direct) - RELOC(xen_irq_enable_direct, 2b+1) /* @@ -53,10 +45,8 @@ ENDPATCH(xen_irq_enable_direct) */ ENTRY(xen_irq_disable_direct) movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask -ENDPATCH(xen_irq_disable_direct) ret - ENDPROC(xen_irq_disable_direct) - RELOC(xen_irq_disable_direct, 0) +ENDPROC(xen_irq_disable_direct) /* * (xen_)save_fl is used to get the current interrupt enable status. @@ -71,10 +61,8 @@ ENTRY(xen_save_fl_direct) testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask setz %ah addb %ah, %ah -ENDPATCH(xen_save_fl_direct) ret ENDPROC(xen_save_fl_direct) - RELOC(xen_save_fl_direct, 0) /* @@ -101,13 +89,11 @@ ENTRY(xen_restore_fl_direct) /* check for unmasked and pending */ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending jnz 1f -2: call check_events + call check_events 1: -ENDPATCH(xen_restore_fl_direct) FRAME_END ret ENDPROC(xen_restore_fl_direct) - RELOC(xen_restore_fl_direct, 2b+1) /* diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h deleted file mode 100644 index 465276467a47..000000000000 --- a/arch/x86/xen/xen-asm.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _XEN_XEN_ASM_H -#define _XEN_XEN_ASM_H - -#include <linux/linkage.h> - -#define RELOC(x, v) .globl x##_reloc; x##_reloc=v -#define ENDPATCH(x) .globl x##_end; x##_end=. - -/* Pseudo-flag used for virtual NMI, which we don't implement yet */ -#define XEN_EFLAGS_NMI 0x80000000 - -#endif diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index feb6d40a0860..1200e262a116 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -1,14 +1,8 @@ /* - * Asm versions of Xen pv-ops, suitable for either direct use or - * inlining. The inline versions are the same as the direct-use - * versions, with the pre- and post-amble chopped off. - * - * This code is encoded for size rather than absolute efficiency, with - * a view to being able to inline as much as possible. + * Asm versions of Xen pv-ops, suitable for direct use. * * We only bother with direct forms (ie, vcpu in pda) of the - * operations here; the indirect forms are better handled in C, since - * they're generally too large to inline anyway. + * operations here; the indirect forms are better handled in C. */ #include <asm/thread_info.h> @@ -18,21 +12,10 @@ #include <xen/interface/xen.h> -#include "xen-asm.h" +#include <linux/linkage.h> -/* - * Force an event check by making a hypercall, but preserve regs - * before making the call. - */ -check_events: - push %eax - push %ecx - push %edx - call xen_force_evtchn_callback - pop %edx - pop %ecx - pop %eax - ret +/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +#define XEN_EFLAGS_NMI 0x80000000 /* * This is run where a normal iret would be run, with the same stack setup: diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index c3df43141e70..dae2cc33afb5 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -1,14 +1,8 @@ /* - * Asm versions of Xen pv-ops, suitable for either direct use or - * inlining. The inline versions are the same as the direct-use - * versions, with the pre- and post-amble chopped off. - * - * This code is encoded for size rather than absolute efficiency, with - * a view to being able to inline as much as possible. + * Asm versions of Xen pv-ops, suitable for direct use. * * We only bother with direct forms (ie, vcpu in pda) of the - * operations here; the indirect forms are better handled in C, since - * they're generally too large to inline anyway. + * operations here; the indirect forms are better handled in C. */ #include <asm/errno.h> @@ -20,13 +14,44 @@ #include <xen/interface/xen.h> -#include "xen-asm.h" +#include <linux/linkage.h> + +.macro xen_pv_trap name +ENTRY(xen_\name) + pop %rcx + pop %r11 + jmp \name +END(xen_\name) +.endm -ENTRY(xen_adjust_exception_frame) - mov 8+0(%rsp), %rcx - mov 8+8(%rsp), %r11 - ret $16 -ENDPROC(xen_adjust_exception_frame) +xen_pv_trap divide_error +xen_pv_trap debug +xen_pv_trap xendebug +xen_pv_trap int3 +xen_pv_trap xenint3 +xen_pv_trap nmi +xen_pv_trap overflow +xen_pv_trap bounds +xen_pv_trap invalid_op +xen_pv_trap device_not_available +xen_pv_trap double_fault +xen_pv_trap coprocessor_segment_overrun +xen_pv_trap invalid_TSS +xen_pv_trap segment_not_present +xen_pv_trap stack_segment +xen_pv_trap general_protection +xen_pv_trap page_fault +xen_pv_trap spurious_interrupt_bug +xen_pv_trap coprocessor_error +xen_pv_trap alignment_check +#ifdef CONFIG_X86_MCE +xen_pv_trap machine_check +#endif /* CONFIG_X86_MCE */ +xen_pv_trap simd_coprocessor_error +#ifdef CONFIG_IA32_EMULATION +xen_pv_trap entry_INT80_compat +#endif +xen_pv_trap hypervisor_callback hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 /* @@ -46,9 +71,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 */ ENTRY(xen_iret) pushq $0 -1: jmp hypercall_iret -ENDPATCH(xen_iret) -RELOC(xen_iret, 1b+1) + jmp hypercall_iret ENTRY(xen_sysret64) /* @@ -65,9 +88,7 @@ ENTRY(xen_sysret64) pushq %rcx pushq $VGCF_in_syscall -1: jmp hypercall_iret -ENDPATCH(xen_sysret64) -RELOC(xen_sysret64, 1b+1) + jmp hypercall_iret /* * Xen handles syscall callbacks much like ordinary exceptions, which @@ -82,34 +103,47 @@ RELOC(xen_sysret64, 1b+1) * rip * r11 * rsp->rcx - * - * In all the entrypoints, we undo all that to make it look like a - * CPU-generated syscall/sysenter and jump to the normal entrypoint. */ -.macro undo_xen_syscall - mov 0*8(%rsp), %rcx - mov 1*8(%rsp), %r11 - mov 5*8(%rsp), %rsp -.endm - /* Normal 64-bit system call target */ ENTRY(xen_syscall_target) - undo_xen_syscall - jmp entry_SYSCALL_64_after_swapgs + popq %rcx + popq %r11 + + /* + * Neither Xen nor the kernel really knows what the old SS and + * CS were. The kernel expects __USER_DS and __USER_CS, so + * report those values even though Xen will guess its own values. + */ + movq $__USER_DS, 4*8(%rsp) + movq $__USER_CS, 1*8(%rsp) + + jmp entry_SYSCALL_64_after_hwframe ENDPROC(xen_syscall_target) #ifdef CONFIG_IA32_EMULATION /* 32-bit compat syscall target */ ENTRY(xen_syscall32_target) - undo_xen_syscall - jmp entry_SYSCALL_compat + popq %rcx + popq %r11 + + /* + * Neither Xen nor the kernel really knows what the old SS and + * CS were. The kernel expects __USER32_DS and __USER32_CS, so + * report those values even though Xen will guess its own values. + */ + movq $__USER32_DS, 4*8(%rsp) + movq $__USER32_CS, 1*8(%rsp) + + jmp entry_SYSCALL_compat_after_hwframe ENDPROC(xen_syscall32_target) /* 32-bit compat sysenter target */ ENTRY(xen_sysenter_target) - undo_xen_syscall + mov 0*8(%rsp), %rcx + mov 1*8(%rsp), %r11 + mov 5*8(%rsp), %rsp jmp entry_SYSENTER_compat ENDPROC(xen_sysenter_target) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 72a8e6adebe6..a7525e95d53f 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -58,7 +58,7 @@ ENTRY(hypercall_page) #else ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) /* Map the p2m table to a 512GB-aligned user address. */ - ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE) + ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad (PUD_SIZE * PTRS_PER_PUD)) #endif #ifdef CONFIG_XEN_PV ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 0d5004477db6..c8a6d224f7ed 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -129,23 +129,15 @@ static inline void __init xen_efi_init(void) } #endif -/* Declare an asm function, along with symbols needed to make it - inlineable */ -#define DECL_ASM(ret, name, ...) \ - __visible ret name(__VA_ARGS__); \ - extern char name##_end[] __visible; \ - extern char name##_reloc[] __visible - -DECL_ASM(void, xen_irq_enable_direct, void); -DECL_ASM(void, xen_irq_disable_direct, void); -DECL_ASM(unsigned long, xen_save_fl_direct, void); -DECL_ASM(void, xen_restore_fl_direct, unsigned long); +__visible void xen_irq_enable_direct(void); +__visible void xen_irq_disable_direct(void); +__visible unsigned long xen_save_fl_direct(void); +__visible void xen_restore_fl_direct(unsigned long); /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); __visible void xen_sysret32(void); __visible void xen_sysret64(void); -__visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); diff --git a/arch/xtensa/include/asm/futex.h b/arch/xtensa/include/asm/futex.h index b39531babec0..eaaf1ebcc7a4 100644 --- a/arch/xtensa/include/asm/futex.h +++ b/arch/xtensa/include/asm/futex.h @@ -44,18 +44,10 @@ : "r" (uaddr), "I" (-EFAULT), "r" (oparg) \ : "memory") -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; #if !XCHAL_HAVE_S32C1I return -ENOSYS; @@ -89,19 +81,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (ret) - return ret; + if (!ret) + *oval = oldval; - switch (cmp) { - case FUTEX_OP_CMP_EQ: return (oldval == cmparg); - case FUTEX_OP_CMP_NE: return (oldval != cmparg); - case FUTEX_OP_CMP_LT: return (oldval < cmparg); - case FUTEX_OP_CMP_GE: return (oldval >= cmparg); - case FUTEX_OP_CMP_LE: return (oldval <= cmparg); - case FUTEX_OP_CMP_GT: return (oldval > cmparg); - } - - return -ENOSYS; + return ret; } static inline int diff --git a/arch/xtensa/include/asm/spinlock.h b/arch/xtensa/include/asm/spinlock.h index a36221cf6363..3bb49681ee24 100644 --- a/arch/xtensa/include/asm/spinlock.h +++ b/arch/xtensa/include/asm/spinlock.h @@ -33,11 +33,6 @@ #define arch_spin_is_locked(x) ((x)->slock != 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, !VAL); -} - #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) static inline void arch_spin_lock(arch_spinlock_t *lock) diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c index 33bfa5270d95..08175df7a69e 100644 --- a/arch/xtensa/kernel/setup.c +++ b/arch/xtensa/kernel/setup.c @@ -273,8 +273,8 @@ void __init init_arch(bp_tag_t *bp_start) * Initialize system. Setup memory and reserve regions. */ -extern char _end; -extern char _stext; +extern char _end[]; +extern char _stext[]; extern char _WindowVectors_text_start; extern char _WindowVectors_text_end; extern char _DebugInterruptVector_literal_start; @@ -333,7 +333,7 @@ void __init setup_arch(char **cmdline_p) } #endif - mem_reserve(__pa(&_stext), __pa(&_end)); + mem_reserve(__pa(_stext), __pa(_end)); #ifdef CONFIG_VECTORS_OFFSET mem_reserve(__pa(&_WindowVectors_text_start), |