Diffstat (limited to 'arch')
359 files changed, 2897 insertions, 1863 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index ecfd3520b676..6b11c825fc36 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1054,6 +1054,29 @@ config VMAP_STACK backing virtual mappings with real shadow memory, and KASAN_VMALLOC must be enabled. +config HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET + def_bool n + help + An arch should select this symbol if it can support kernel stack + offset randomization with calls to add_random_kstack_offset() + during syscall entry and choose_random_kstack_offset() during + syscall exit. Careful removal of -fstack-protector-strong and + -fstack-protector should also be applied to the entry code and + closely examined, as the artificial stack bump looks like an array + to the compiler, so it will attempt to add canary checks regardless + of the static branch state. + +config RANDOMIZE_KSTACK_OFFSET_DEFAULT + bool "Randomize kernel stack offset on syscall entry" + depends on HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET + help + The kernel stack offset can be randomized (after pt_regs) by + roughly 5 bits of entropy, frustrating memory corruption + attacks that depend on stack address determinism or + cross-syscall address exposures. This feature is controlled + by kernel boot param "randomize_kstack_offset=on/off", and this + config chooses the default boot state. + config ARCH_OPTIONAL_KERNEL_RWX def_bool n diff --git a/arch/arc/boot/dts/haps_hs.dts b/arch/arc/boot/dts/haps_hs.dts index 60d578e2781f..76ad527a0847 100644 --- a/arch/arc/boot/dts/haps_hs.dts +++ b/arch/arc/boot/dts/haps_hs.dts @@ -16,7 +16,7 @@ memory { device_type = "memory"; /* CONFIG_LINUX_RAM_BASE needs to match low mem start */ - reg = <0x0 0x80000000 0x0 0x20000000 /* 512 MB low mem */ + reg = <0x0 0x80000000 0x0 0x40000000 /* 1 GB low mem */ 0x1 0x00000000 0x0 0x40000000>; /* 1 GB highmem */ }; diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c index a78d8f745a67..fdbe06c98895 100644 --- a/arch/arc/kernel/signal.c +++ b/arch/arc/kernel/signal.c @@ -96,7 +96,7 @@ stash_usr_regs(struct rt_sigframe __user *sf, struct pt_regs *regs, sizeof(sf->uc.uc_mcontext.regs.scratch)); err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(sigset_t)); - return err; + return err ? -EFAULT : 0; } static int restore_usr_regs(struct pt_regs *regs, struct rt_sigframe __user *sf) @@ -110,7 +110,7 @@ static int restore_usr_regs(struct pt_regs *regs, struct rt_sigframe __user *sf) &(sf->uc.uc_mcontext.regs.scratch), sizeof(sf->uc.uc_mcontext.regs.scratch)); if (err) - return err; + return -EFAULT; set_current_blocked(&set); regs->bta = uregs.scratch.bta; diff --git a/arch/arc/kernel/unwind.c b/arch/arc/kernel/unwind.c index 74ad4256022e..47bab67f8649 100644 --- a/arch/arc/kernel/unwind.c +++ b/arch/arc/kernel/unwind.c @@ -187,25 +187,26 @@ static void init_unwind_table(struct unwind_table *table, const char *name, const void *table_start, unsigned long table_size, const u8 *header_start, unsigned long header_size) { - const u8 *ptr = header_start + 4; - const u8 *end = header_start + header_size; - table->core.pc = (unsigned long)core_start; table->core.range = core_size; table->init.pc = (unsigned long)init_start; table->init.range = init_size; table->address = table_start; table->size = table_size; - - /* See if the linker provided table looks valid. 
*/ - if (header_size <= 4 - || header_start[0] != 1 - || (void *)read_pointer(&ptr, end, header_start[1]) != table_start - || header_start[2] == DW_EH_PE_omit - || read_pointer(&ptr, end, header_start[2]) <= 0 - || header_start[3] == DW_EH_PE_omit) - header_start = NULL; - + /* To avoid the pointer addition with NULL pointer.*/ + if (header_start != NULL) { + const u8 *ptr = header_start + 4; + const u8 *end = header_start + header_size; + /* See if the linker provided table looks valid. */ + if (header_size <= 4 + || header_start[0] != 1 + || (void *)read_pointer(&ptr, end, header_start[1]) + != table_start + || header_start[2] == DW_EH_PE_omit + || read_pointer(&ptr, end, header_start[2]) <= 0 + || header_start[3] == DW_EH_PE_omit) + header_start = NULL; + } table->hdrsz = header_size; smp_wmb(); table->header = header_start; diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 5da96f5df48f..2fae14857dcf 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1293,9 +1293,15 @@ config KASAN_SHADOW_OFFSET config NR_CPUS int "Maximum number of CPUs (2-32)" - range 2 32 + range 2 16 if DEBUG_KMAP_LOCAL + range 2 32 if !DEBUG_KMAP_LOCAL depends on SMP default "4" + help + The maximum number of CPUs that the kernel can support. + Up to 32 CPUs can be supported, or up to 16 if kmap_local() + debugging is enabled, which uses half of the per-CPU fixmap + slots as guard regions. config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" diff --git a/arch/arm/boot/dts/am33xx.dtsi b/arch/arm/boot/dts/am33xx.dtsi index 5b213a1e68bb..5e33d0e88f5b 100644 --- a/arch/arm/boot/dts/am33xx.dtsi +++ b/arch/arm/boot/dts/am33xx.dtsi @@ -40,6 +40,9 @@ ethernet1 = &cpsw_emac1; spi0 = &spi0; spi1 = &spi1; + mmc0 = &mmc1; + mmc1 = &mmc2; + mmc2 = &mmc3; }; cpus { diff --git a/arch/arm/boot/dts/armada-385-turris-omnia.dts b/arch/arm/boot/dts/armada-385-turris-omnia.dts index 646a06420c77..5bd6a66d2c2b 100644 --- a/arch/arm/boot/dts/armada-385-turris-omnia.dts +++ b/arch/arm/boot/dts/armada-385-turris-omnia.dts @@ -32,7 +32,8 @@ ranges = <MBUS_ID(0xf0, 0x01) 0 0xf1000000 0x100000 MBUS_ID(0x01, 0x1d) 0 0xfff00000 0x100000 MBUS_ID(0x09, 0x19) 0 0xf1100000 0x10000 - MBUS_ID(0x09, 0x15) 0 0xf1110000 0x10000>; + MBUS_ID(0x09, 0x15) 0 0xf1110000 0x10000 + MBUS_ID(0x0c, 0x04) 0 0xf1200000 0x100000>; internal-regs { @@ -389,6 +390,7 @@ phy1: ethernet-phy@1 { compatible = "ethernet-phy-ieee802.3-c22"; reg = <1>; + marvell,reg-init = <3 18 0 0x4985>; /* irq is connected to &pcawan pin 7 */ }; diff --git a/arch/arm/boot/dts/at91-sam9x60ek.dts b/arch/arm/boot/dts/at91-sam9x60ek.dts index 73b6b1f89de9..775ceb3acb6c 100644 --- a/arch/arm/boot/dts/at91-sam9x60ek.dts +++ b/arch/arm/boot/dts/at91-sam9x60ek.dts @@ -334,14 +334,6 @@ }; &pinctrl { - atmel,mux-mask = < - /* A B C */ - 0xFFFFFE7F 0xC0E0397F 0xEF00019D /* pioA */ - 0x03FFFFFF 0x02FC7E68 0x00780000 /* pioB */ - 0xffffffff 0xF83FFFFF 0xB800F3FC /* pioC */ - 0x003FFFFF 0x003F8000 0x00000000 /* pioD */ - >; - adc { pinctrl_adc_default: adc_default { atmel,pins = <AT91_PIOB 15 AT91_PERIPH_A AT91_PINCTRL_NONE>; diff --git a/arch/arm/boot/dts/at91-sama5d27_som1.dtsi b/arch/arm/boot/dts/at91-sama5d27_som1.dtsi index 1b1163858b1d..e3251f3e3eaa 100644 --- a/arch/arm/boot/dts/at91-sama5d27_som1.dtsi +++ b/arch/arm/boot/dts/at91-sama5d27_som1.dtsi @@ -84,8 +84,8 @@ pinctrl-0 = <&pinctrl_macb0_default>; phy-mode = "rmii"; - ethernet-phy@0 { - reg = <0x0>; + ethernet-phy@7 { + reg = <0x7>; interrupt-parent = <&pioA>; interrupts = <PIN_PD31 IRQ_TYPE_LEVEL_LOW>; pinctrl-names = 
"default"; diff --git a/arch/arm/boot/dts/bcm2711.dtsi b/arch/arm/boot/dts/bcm2711.dtsi index 462b1dfb0385..720beec54d61 100644 --- a/arch/arm/boot/dts/bcm2711.dtsi +++ b/arch/arm/boot/dts/bcm2711.dtsi @@ -308,14 +308,6 @@ #reset-cells = <1>; }; - bsc_intr: interrupt-controller@7ef00040 { - compatible = "brcm,bcm2711-l2-intc", "brcm,l2-intc"; - reg = <0x7ef00040 0x30>; - interrupts = <GIC_SPI 117 IRQ_TYPE_LEVEL_HIGH>; - interrupt-controller; - #interrupt-cells = <1>; - }; - aon_intr: interrupt-controller@7ef00100 { compatible = "brcm,bcm2711-l2-intc", "brcm,l2-intc"; reg = <0x7ef00100 0x30>; @@ -362,8 +354,6 @@ reg = <0x7ef04500 0x100>, <0x7ef00b00 0x300>; reg-names = "bsc", "auto-i2c"; clock-frequency = <97500>; - interrupt-parent = <&bsc_intr>; - interrupts = <0>; status = "disabled"; }; @@ -405,8 +395,6 @@ reg = <0x7ef09500 0x100>, <0x7ef05b00 0x300>; reg-names = "bsc", "auto-i2c"; clock-frequency = <97500>; - interrupt-parent = <&bsc_intr>; - interrupts = <1>; status = "disabled"; }; }; diff --git a/arch/arm/boot/dts/dra7-l4.dtsi b/arch/arm/boot/dts/dra7-l4.dtsi index 3bf90d9e3335..a294a02f2d23 100644 --- a/arch/arm/boot/dts/dra7-l4.dtsi +++ b/arch/arm/boot/dts/dra7-l4.dtsi @@ -1168,7 +1168,7 @@ }; }; - target-module@34000 { /* 0x48034000, ap 7 46.0 */ + timer3_target: target-module@34000 { /* 0x48034000, ap 7 46.0 */ compatible = "ti,sysc-omap4-timer", "ti,sysc"; reg = <0x34000 0x4>, <0x34010 0x4>; @@ -1195,7 +1195,7 @@ }; }; - target-module@36000 { /* 0x48036000, ap 9 4e.0 */ + timer4_target: target-module@36000 { /* 0x48036000, ap 9 4e.0 */ compatible = "ti,sysc-omap4-timer", "ti,sysc"; reg = <0x36000 0x4>, <0x36010 0x4>; diff --git a/arch/arm/boot/dts/dra7.dtsi b/arch/arm/boot/dts/dra7.dtsi index ce1194744f84..53d68786a61f 100644 --- a/arch/arm/boot/dts/dra7.dtsi +++ b/arch/arm/boot/dts/dra7.dtsi @@ -46,6 +46,7 @@ timer { compatible = "arm,armv7-timer"; + status = "disabled"; /* See ARM architected timer wrap erratum i940 */ interrupts = <GIC_PPI 13 (GIC_CPU_MASK_SIMPLE(2) | IRQ_TYPE_LEVEL_LOW)>, <GIC_PPI 14 (GIC_CPU_MASK_SIMPLE(2) | IRQ_TYPE_LEVEL_LOW)>, <GIC_PPI 11 (GIC_CPU_MASK_SIMPLE(2) | IRQ_TYPE_LEVEL_LOW)>, @@ -1241,3 +1242,22 @@ assigned-clock-parents = <&sys_32k_ck>; }; }; + +/* Local timers, see ARM architected timer wrap erratum i940 */ +&timer3_target { + ti,no-reset-on-init; + ti,no-idle; + timer@0 { + assigned-clocks = <&l4per_clkctrl DRA7_L4PER_TIMER3_CLKCTRL 24>; + assigned-clock-parents = <&timer_sys_clk_div>; + }; +}; + +&timer4_target { + ti,no-reset-on-init; + ti,no-idle; + timer@0 { + assigned-clocks = <&l4per_clkctrl DRA7_L4PER_TIMER4_CLKCTRL 24>; + assigned-clock-parents = <&timer_sys_clk_div>; + }; +}; diff --git a/arch/arm/boot/dts/imx6qdl-phytec-pfla02.dtsi b/arch/arm/boot/dts/imx6qdl-phytec-pfla02.dtsi index 7a1e53195785..f28a96fcf23e 100644 --- a/arch/arm/boot/dts/imx6qdl-phytec-pfla02.dtsi +++ b/arch/arm/boot/dts/imx6qdl-phytec-pfla02.dtsi @@ -433,6 +433,7 @@ pinctrl-0 = <&pinctrl_usdhc2>; cd-gpios = <&gpio1 4 GPIO_ACTIVE_LOW>; wp-gpios = <&gpio1 2 GPIO_ACTIVE_HIGH>; + vmmc-supply = <&vdd_sd1_reg>; status = "disabled"; }; @@ -442,5 +443,6 @@ &pinctrl_usdhc3_cdwp>; cd-gpios = <&gpio1 27 GPIO_ACTIVE_LOW>; wp-gpios = <&gpio1 29 GPIO_ACTIVE_HIGH>; + vmmc-supply = <&vdd_sd0_reg>; status = "disabled"; }; diff --git a/arch/arm/boot/dts/imx6ul-14x14-evk.dtsi b/arch/arm/boot/dts/imx6ul-14x14-evk.dtsi index c593597b2119..5a1e10def6ef 100644 --- a/arch/arm/boot/dts/imx6ul-14x14-evk.dtsi +++ b/arch/arm/boot/dts/imx6ul-14x14-evk.dtsi @@ -210,9 +210,6 @@ 
micrel,led-mode = <1>; clocks = <&clks IMX6UL_CLK_ENET_REF>; clock-names = "rmii-ref"; - reset-gpios = <&gpio_spi 1 GPIO_ACTIVE_LOW>; - reset-assert-us = <10000>; - reset-deassert-us = <100>; }; @@ -222,9 +219,6 @@ micrel,led-mode = <1>; clocks = <&clks IMX6UL_CLK_ENET2_REF>; clock-names = "rmii-ref"; - reset-gpios = <&gpio_spi 2 GPIO_ACTIVE_LOW>; - reset-assert-us = <10000>; - reset-deassert-us = <100>; }; }; }; @@ -243,6 +237,22 @@ status = "okay"; }; +&gpio_spi { + eth0-phy-hog { + gpio-hog; + gpios = <1 GPIO_ACTIVE_HIGH>; + output-high; + line-name = "eth0-phy"; + }; + + eth1-phy-hog { + gpio-hog; + gpios = <2 GPIO_ACTIVE_HIGH>; + output-high; + line-name = "eth1-phy"; + }; +}; + &i2c1 { clock-frequency = <100000>; pinctrl-names = "default"; diff --git a/arch/arm/boot/dts/imx6ull-myir-mys-6ulx-eval.dts b/arch/arm/boot/dts/imx6ull-myir-mys-6ulx-eval.dts index ecbb2cc5b9ab..79cc45728cd2 100644 --- a/arch/arm/boot/dts/imx6ull-myir-mys-6ulx-eval.dts +++ b/arch/arm/boot/dts/imx6ull-myir-mys-6ulx-eval.dts @@ -14,5 +14,6 @@ }; &gpmi { + fsl,use-minimum-ecc; status = "okay"; }; diff --git a/arch/arm/boot/dts/omap3.dtsi b/arch/arm/boot/dts/omap3.dtsi index 9dcae1f2bc99..c5b9da0d7e6c 100644 --- a/arch/arm/boot/dts/omap3.dtsi +++ b/arch/arm/boot/dts/omap3.dtsi @@ -24,6 +24,9 @@ i2c0 = &i2c1; i2c1 = &i2c2; i2c2 = &i2c3; + mmc0 = &mmc1; + mmc1 = &mmc2; + mmc2 = &mmc3; serial0 = &uart1; serial1 = &uart2; serial2 = &uart3; diff --git a/arch/arm/boot/dts/omap4.dtsi b/arch/arm/boot/dts/omap4.dtsi index 72e4f6481776..4a9f9496a867 100644 --- a/arch/arm/boot/dts/omap4.dtsi +++ b/arch/arm/boot/dts/omap4.dtsi @@ -22,6 +22,11 @@ i2c1 = &i2c2; i2c2 = &i2c3; i2c3 = &i2c4; + mmc0 = &mmc1; + mmc1 = &mmc2; + mmc2 = &mmc3; + mmc3 = &mmc4; + mmc4 = &mmc5; serial0 = &uart1; serial1 = &uart2; serial2 = &uart3; diff --git a/arch/arm/boot/dts/omap44xx-clocks.dtsi b/arch/arm/boot/dts/omap44xx-clocks.dtsi index 532868591107..1f1c04d8f472 100644 --- a/arch/arm/boot/dts/omap44xx-clocks.dtsi +++ b/arch/arm/boot/dts/omap44xx-clocks.dtsi @@ -770,14 +770,6 @@ ti,max-div = <2>; }; - sha2md5_fck: sha2md5_fck@15c8 { - #clock-cells = <0>; - compatible = "ti,gate-clock"; - clocks = <&l3_div_ck>; - ti,bit-shift = <1>; - reg = <0x15c8>; - }; - usb_phy_cm_clk32k: usb_phy_cm_clk32k@640 { #clock-cells = <0>; compatible = "ti,gate-clock"; diff --git a/arch/arm/boot/dts/omap5.dtsi b/arch/arm/boot/dts/omap5.dtsi index e025b7c9a357..ee821d0ab364 100644 --- a/arch/arm/boot/dts/omap5.dtsi +++ b/arch/arm/boot/dts/omap5.dtsi @@ -25,6 +25,11 @@ i2c2 = &i2c3; i2c3 = &i2c4; i2c4 = &i2c5; + mmc0 = &mmc1; + mmc1 = &mmc2; + mmc2 = &mmc3; + mmc3 = &mmc4; + mmc4 = &mmc5; serial0 = &uart1; serial1 = &uart2; serial2 = &uart3; diff --git a/arch/arm/boot/dts/sam9x60.dtsi b/arch/arm/boot/dts/sam9x60.dtsi index 84066c1298df..ec45ced3cde6 100644 --- a/arch/arm/boot/dts/sam9x60.dtsi +++ b/arch/arm/boot/dts/sam9x60.dtsi @@ -606,6 +606,15 @@ compatible = "microchip,sam9x60-pinctrl", "atmel,at91sam9x5-pinctrl", "atmel,at91rm9200-pinctrl", "simple-bus"; ranges = <0xfffff400 0xfffff400 0x800>; + /* mux-mask corresponding to sam9x60 SoC in TFBGA228L package */ + atmel,mux-mask = < + /* A B C */ + 0xffffffff 0xffe03fff 0xef00019d /* pioA */ + 0x03ffffff 0x02fc7e7f 0x00780000 /* pioB */ + 0xffffffff 0xffffffff 0xf83fffff /* pioC */ + 0x003fffff 0x003f8000 0x00000000 /* pioD */ + >; + pioA: gpio@fffff400 { compatible = "microchip,sam9x60-gpio", "atmel,at91sam9x5-gpio", "atmel,at91rm9200-gpio"; reg = <0xfffff400 0x200>; diff --git a/arch/arm/crypto/aes-cipher-core.S 
b/arch/arm/crypto/aes-cipher-core.S index 472e56d09eea..1da3f41359aa 100644 --- a/arch/arm/crypto/aes-cipher-core.S +++ b/arch/arm/crypto/aes-cipher-core.S @@ -99,28 +99,6 @@ __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op, \oldcpsr .endm - .macro __rev, out, in - .if __LINUX_ARM_ARCH__ < 6 - lsl t0, \in, #24 - and t1, \in, #0xff00 - and t2, \in, #0xff0000 - orr \out, t0, \in, lsr #24 - orr \out, \out, t1, lsl #8 - orr \out, \out, t2, lsr #8 - .else - rev \out, \in - .endif - .endm - - .macro __adrl, out, sym, c - .if __LINUX_ARM_ARCH__ < 7 - ldr\c \out, =\sym - .else - movw\c \out, #:lower16:\sym - movt\c \out, #:upper16:\sym - .endif - .endm - .macro do_crypt, round, ttab, ltab, bsz push {r3-r11, lr} @@ -133,10 +111,10 @@ ldr r7, [in, #12] #ifdef CONFIG_CPU_BIG_ENDIAN - __rev r4, r4 - __rev r5, r5 - __rev r6, r6 - __rev r7, r7 + rev_l r4, t0 + rev_l r5, t0 + rev_l r6, t0 + rev_l r7, t0 #endif eor r4, r4, r8 @@ -144,7 +122,7 @@ eor r6, r6, r10 eor r7, r7, r11 - __adrl ttab, \ttab + mov_l ttab, \ttab /* * Disable interrupts and prefetch the 1024-byte 'ft' or 'it' table into * L1 cache, assuming cacheline size >= 32. This is a hardening measure @@ -180,7 +158,7 @@ 2: .ifb \ltab add ttab, ttab, #1 .else - __adrl ttab, \ltab + mov_l ttab, \ltab // Prefetch inverse S-box for final round; see explanation above .set i, 0 .rept 256 / 64 @@ -194,10 +172,10 @@ \round r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b, rounds #ifdef CONFIG_CPU_BIG_ENDIAN - __rev r4, r4 - __rev r5, r5 - __rev r6, r6 - __rev r7, r7 + rev_l r4, t0 + rev_l r5, t0 + rev_l r6, t0 + rev_l r7, t0 #endif ldr out, [sp] diff --git a/arch/arm/crypto/blake2b-neon-glue.c b/arch/arm/crypto/blake2b-neon-glue.c index 34d73200e7fa..4b59d027ba4a 100644 --- a/arch/arm/crypto/blake2b-neon-glue.c +++ b/arch/arm/crypto/blake2b-neon-glue.c @@ -85,8 +85,8 @@ static int __init blake2b_neon_mod_init(void) static void __exit blake2b_neon_mod_exit(void) { - return crypto_unregister_shashes(blake2b_neon_algs, - ARRAY_SIZE(blake2b_neon_algs)); + crypto_unregister_shashes(blake2b_neon_algs, + ARRAY_SIZE(blake2b_neon_algs)); } module_init(blake2b_neon_mod_init); diff --git a/arch/arm/crypto/blake2s-core.S b/arch/arm/crypto/blake2s-core.S index bed897e9a181..86345751bbf3 100644 --- a/arch/arm/crypto/blake2s-core.S +++ b/arch/arm/crypto/blake2s-core.S @@ -8,6 +8,7 @@ */ #include <linux/linkage.h> +#include <asm/assembler.h> // Registers used to hold message words temporarily. There aren't // enough ARM registers to hold the whole message block, so we have to @@ -38,6 +39,23 @@ #endif .endm +.macro _le32_bswap a, tmp +#ifdef __ARMEB__ + rev_l \a, \tmp +#endif +.endm + +.macro _le32_bswap_8x a, b, c, d, e, f, g, h, tmp + _le32_bswap \a, \tmp + _le32_bswap \b, \tmp + _le32_bswap \c, \tmp + _le32_bswap \d, \tmp + _le32_bswap \e, \tmp + _le32_bswap \f, \tmp + _le32_bswap \g, \tmp + _le32_bswap \h, \tmp +.endm + // Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals. // (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two // columns/diagonals. 
s0-s1 are the word offsets to the message words the first @@ -180,8 +198,10 @@ ENTRY(blake2s_compress_arch) tst r1, #3 bne .Lcopy_block_misaligned ldmia r1!, {r2-r9} + _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14 stmia r12!, {r2-r9} ldmia r1!, {r2-r9} + _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14 stmia r12, {r2-r9} .Lcopy_block_done: str r1, [sp, #68] // Update message pointer @@ -268,6 +288,7 @@ ENTRY(blake2s_compress_arch) 1: #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS ldr r3, [r1], #4 + _le32_bswap r3, r4 #else ldrb r3, [r1, #0] ldrb r4, [r1, #1] diff --git a/arch/arm/crypto/chacha-scalar-core.S b/arch/arm/crypto/chacha-scalar-core.S index 2985b80a45b5..083fe1ab96d0 100644 --- a/arch/arm/crypto/chacha-scalar-core.S +++ b/arch/arm/crypto/chacha-scalar-core.S @@ -41,32 +41,15 @@ X14 .req r12 X15 .req r14 -.macro __rev out, in, t0, t1, t2 -.if __LINUX_ARM_ARCH__ >= 6 - rev \out, \in -.else - lsl \t0, \in, #24 - and \t1, \in, #0xff00 - and \t2, \in, #0xff0000 - orr \out, \t0, \in, lsr #24 - orr \out, \out, \t1, lsl #8 - orr \out, \out, \t2, lsr #8 -.endif -.endm - -.macro _le32_bswap x, t0, t1, t2 +.macro _le32_bswap_4x a, b, c, d, tmp #ifdef __ARMEB__ - __rev \x, \x, \t0, \t1, \t2 + rev_l \a, \tmp + rev_l \b, \tmp + rev_l \c, \tmp + rev_l \d, \tmp #endif .endm -.macro _le32_bswap_4x a, b, c, d, t0, t1, t2 - _le32_bswap \a, \t0, \t1, \t2 - _le32_bswap \b, \t0, \t1, \t2 - _le32_bswap \c, \t0, \t1, \t2 - _le32_bswap \d, \t0, \t1, \t2 -.endm - .macro __ldrd a, b, src, offset #if __LINUX_ARM_ARCH__ >= 6 ldrd \a, \b, [\src, #\offset] @@ -200,7 +183,7 @@ add X1, X1, r9 add X2, X2, r10 add X3, X3, r11 - _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10 + _le32_bswap_4x X0, X1, X2, X3, r8 ldmia r12!, {r8-r11} eor X0, X0, r8 eor X1, X1, r9 @@ -216,7 +199,7 @@ ldmia r12!, {X0-X3} add X6, r10, X6, ror #brot add X7, r11, X7, ror #brot - _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10 + _le32_bswap_4x X4, X5, X6, X7, r8 eor X4, X4, X0 eor X5, X5, X1 eor X6, X6, X2 @@ -231,7 +214,7 @@ add r1, r1, r9 // x9 add r6, r6, r10 // x10 add r7, r7, r11 // x11 - _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10 + _le32_bswap_4x r0, r1, r6, r7, r8 ldmia r12!, {r8-r11} eor r0, r0, r8 // x8 eor r1, r1, r9 // x9 @@ -245,7 +228,7 @@ add r3, r9, r3, ror #drot // x13 add r4, r10, r4, ror #drot // x14 add r5, r11, r5, ror #drot // x15 - _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11 + _le32_bswap_4x r2, r3, r4, r5, r9 ldr r9, [sp, #72] // load LEN eor r2, r2, r0 // x12 eor r3, r3, r1 // x13 @@ -301,7 +284,7 @@ add X1, X1, r9 add X2, X2, r10 add X3, X3, r11 - _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10 + _le32_bswap_4x X0, X1, X2, X3, r8 stmia r14!, {X0-X3} // Save keystream for x4-x7 @@ -311,7 +294,7 @@ add X5, r9, X5, ror #brot add X6, r10, X6, ror #brot add X7, r11, X7, ror #brot - _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10 + _le32_bswap_4x X4, X5, X6, X7, r8 add r8, sp, #64 stmia r14!, {X4-X7} @@ -323,7 +306,7 @@ add r1, r1, r9 // x9 add r6, r6, r10 // x10 add r7, r7, r11 // x11 - _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10 + _le32_bswap_4x r0, r1, r6, r7, r8 stmia r14!, {r0,r1,r6,r7} __ldrd r8, r9, sp, 144 __ldrd r10, r11, sp, 152 @@ -331,7 +314,7 @@ add r3, r9, r3, ror #drot // x13 add r4, r10, r4, ror #drot // x14 add r5, r11, r5, ror #drot // x15 - _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11 + _le32_bswap_4x r2, r3, r4, r5, r9 stmia r14, {r2-r5} // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN diff --git a/arch/arm/crypto/curve25519-core.S b/arch/arm/crypto/curve25519-core.S index be18af52e7dc..b697fa5d059a 100644 --- 
a/arch/arm/crypto/curve25519-core.S +++ b/arch/arm/crypto/curve25519-core.S @@ -10,8 +10,8 @@ #include <linux/linkage.h> .text -.fpu neon .arch armv7-a +.fpu neon .align 4 ENTRY(curve25519_neon) diff --git a/arch/arm/crypto/poly1305-glue.c b/arch/arm/crypto/poly1305-glue.c index 3023c1acfa19..c31bd8f7c092 100644 --- a/arch/arm/crypto/poly1305-glue.c +++ b/arch/arm/crypto/poly1305-glue.c @@ -29,7 +29,7 @@ void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit) static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); -void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) { poly1305_init_arm(&dctx->h, key); dctx->s[0] = get_unaligned_le32(key + 16); diff --git a/arch/arm/include/asm/paravirt.h b/arch/arm/include/asm/paravirt.h index cdbf02d9c1d4..95d5b0d625cd 100644 --- a/arch/arm/include/asm/paravirt.h +++ b/arch/arm/include/asm/paravirt.h @@ -3,23 +3,19 @@ #define _ASM_ARM_PARAVIRT_H #ifdef CONFIG_PARAVIRT +#include <linux/static_call_types.h> + struct static_key; extern struct static_key paravirt_steal_enabled; extern struct static_key paravirt_steal_rq_enabled; -struct pv_time_ops { - unsigned long long (*steal_clock)(int cpu); -}; - -struct paravirt_patch_template { - struct pv_time_ops time; -}; +u64 dummy_steal_clock(int cpu); -extern struct paravirt_patch_template pv_ops; +DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock); static inline u64 paravirt_steal_clock(int cpu) { - return pv_ops.time.steal_clock(cpu); + return static_call(pv_steal_clock)(cpu); } #endif diff --git a/arch/arm/kernel/paravirt.c b/arch/arm/kernel/paravirt.c index 4cfed91fe256..7dd9806369fb 100644 --- a/arch/arm/kernel/paravirt.c +++ b/arch/arm/kernel/paravirt.c @@ -9,10 +9,15 @@ #include <linux/export.h> #include <linux/jump_label.h> #include <linux/types.h> +#include <linux/static_call.h> #include <asm/paravirt.h> struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; -struct paravirt_patch_template pv_ops; -EXPORT_SYMBOL_GPL(pv_ops); +static u64 native_steal_clock(int cpu) +{ + return 0; +} + +DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); diff --git a/arch/arm/mach-footbridge/cats-pci.c b/arch/arm/mach-footbridge/cats-pci.c index 0b2fd7e2e9b4..90b1e9be430e 100644 --- a/arch/arm/mach-footbridge/cats-pci.c +++ b/arch/arm/mach-footbridge/cats-pci.c @@ -15,14 +15,14 @@ #include <asm/mach-types.h> /* cats host-specific stuff */ -static int irqmap_cats[] __initdata = { IRQ_PCI, IRQ_IN0, IRQ_IN1, IRQ_IN3 }; +static int irqmap_cats[] = { IRQ_PCI, IRQ_IN0, IRQ_IN1, IRQ_IN3 }; static u8 cats_no_swizzle(struct pci_dev *dev, u8 *pin) { return 0; } -static int __init cats_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +static int cats_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { if (dev->irq >= 255) return -1; /* not a valid interrupt. 
*/ diff --git a/arch/arm/mach-footbridge/ebsa285-pci.c b/arch/arm/mach-footbridge/ebsa285-pci.c index 6f28aaa9ca79..c3f280d08fa7 100644 --- a/arch/arm/mach-footbridge/ebsa285-pci.c +++ b/arch/arm/mach-footbridge/ebsa285-pci.c @@ -14,9 +14,9 @@ #include <asm/mach/pci.h> #include <asm/mach-types.h> -static int irqmap_ebsa285[] __initdata = { IRQ_IN3, IRQ_IN1, IRQ_IN0, IRQ_PCI }; +static int irqmap_ebsa285[] = { IRQ_IN3, IRQ_IN1, IRQ_IN0, IRQ_PCI }; -static int __init ebsa285_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +static int ebsa285_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { if (dev->vendor == PCI_VENDOR_ID_CONTAQ && dev->device == PCI_DEVICE_ID_CONTAQ_82C693) diff --git a/arch/arm/mach-footbridge/netwinder-pci.c b/arch/arm/mach-footbridge/netwinder-pci.c index 9473aa0305e5..e8304392074b 100644 --- a/arch/arm/mach-footbridge/netwinder-pci.c +++ b/arch/arm/mach-footbridge/netwinder-pci.c @@ -18,7 +18,7 @@ * We now use the slot ID instead of the device identifiers to select * which interrupt is routed where. */ -static int __init netwinder_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +static int netwinder_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { switch (slot) { case 0: /* host bridge */ diff --git a/arch/arm/mach-footbridge/personal-pci.c b/arch/arm/mach-footbridge/personal-pci.c index 4391e433a4b2..9d19aa98a663 100644 --- a/arch/arm/mach-footbridge/personal-pci.c +++ b/arch/arm/mach-footbridge/personal-pci.c @@ -14,13 +14,12 @@ #include <asm/mach/pci.h> #include <asm/mach-types.h> -static int irqmap_personal_server[] __initdata = { +static int irqmap_personal_server[] = { IRQ_IN0, IRQ_IN1, IRQ_IN2, IRQ_IN3, 0, 0, 0, IRQ_DOORBELLHOST, IRQ_DMA1, IRQ_DMA2, IRQ_PCI }; -static int __init personal_server_map_irq(const struct pci_dev *dev, u8 slot, - u8 pin) +static int personal_server_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { unsigned char line; diff --git a/arch/arm/mach-imx/avic.c b/arch/arm/mach-imx/avic.c index 322caa21bcb3..21bce4049cec 100644 --- a/arch/arm/mach-imx/avic.c +++ b/arch/arm/mach-imx/avic.c @@ -7,6 +7,7 @@ #include <linux/module.h> #include <linux/irq.h> #include <linux/irqdomain.h> +#include <linux/irqchip.h> #include <linux/io.h> #include <linux/of.h> #include <linux/of_address.h> @@ -162,7 +163,7 @@ static void __exception_irq_entry avic_handle_irq(struct pt_regs *regs) * interrupts. It registers the interrupt enable and disable functions * to the kernel for each interrupt source. 
*/ -void __init mxc_init_irq(void __iomem *irqbase) +static void __init mxc_init_irq(void __iomem *irqbase) { struct device_node *np; int irq_base; @@ -220,3 +221,16 @@ void __init mxc_init_irq(void __iomem *irqbase) printk(KERN_INFO "MXC IRQ initialized\n"); } + +static int __init imx_avic_init(struct device_node *node, + struct device_node *parent) +{ + void __iomem *avic_base; + + avic_base = of_iomap(node, 0); + BUG_ON(!avic_base); + mxc_init_irq(avic_base); + return 0; +} + +IRQCHIP_DECLARE(imx_avic, "fsl,avic", imx_avic_init); diff --git a/arch/arm/mach-imx/common.h b/arch/arm/mach-imx/common.h index 2b004cc4f95e..474dedb73bc7 100644 --- a/arch/arm/mach-imx/common.h +++ b/arch/arm/mach-imx/common.h @@ -22,7 +22,6 @@ void mx35_map_io(void); void imx21_init_early(void); void imx31_init_early(void); void imx35_init_early(void); -void mxc_init_irq(void __iomem *); void mx31_init_irq(void); void mx35_init_irq(void); void mxc_set_cpu_type(unsigned int type); diff --git a/arch/arm/mach-imx/mach-imx1.c b/arch/arm/mach-imx/mach-imx1.c index 32df3b8012f9..8eca92d66a2e 100644 --- a/arch/arm/mach-imx/mach-imx1.c +++ b/arch/arm/mach-imx/mach-imx1.c @@ -17,16 +17,6 @@ static void __init imx1_init_early(void) mxc_set_cpu_type(MXC_CPU_MX1); } -static void __init imx1_init_irq(void) -{ - void __iomem *avic_addr; - - avic_addr = ioremap(MX1_AVIC_ADDR, SZ_4K); - WARN_ON(!avic_addr); - - mxc_init_irq(avic_addr); -} - static const char * const imx1_dt_board_compat[] __initconst = { "fsl,imx1", NULL @@ -34,7 +24,6 @@ static const char * const imx1_dt_board_compat[] __initconst = { DT_MACHINE_START(IMX1_DT, "Freescale i.MX1 (Device Tree Support)") .init_early = imx1_init_early, - .init_irq = imx1_init_irq, .dt_compat = imx1_dt_board_compat, .restart = mxc_restart, MACHINE_END diff --git a/arch/arm/mach-imx/mach-imx25.c b/arch/arm/mach-imx/mach-imx25.c index 95de48a1aa7d..51927bd08aef 100644 --- a/arch/arm/mach-imx/mach-imx25.c +++ b/arch/arm/mach-imx/mach-imx25.c @@ -22,17 +22,6 @@ static void __init imx25_dt_init(void) imx_aips_allow_unprivileged_access("fsl,imx25-aips"); } -static void __init mx25_init_irq(void) -{ - struct device_node *np; - void __iomem *avic_base; - - np = of_find_compatible_node(NULL, NULL, "fsl,avic"); - avic_base = of_iomap(np, 0); - BUG_ON(!avic_base); - mxc_init_irq(avic_base); -} - static const char * const imx25_dt_board_compat[] __initconst = { "fsl,imx25", NULL @@ -42,6 +31,5 @@ DT_MACHINE_START(IMX25_DT, "Freescale i.MX25 (Device Tree Support)") .init_early = imx25_init_early, .init_machine = imx25_dt_init, .init_late = imx25_pm_init, - .init_irq = mx25_init_irq, .dt_compat = imx25_dt_board_compat, MACHINE_END diff --git a/arch/arm/mach-imx/mach-imx27.c b/arch/arm/mach-imx/mach-imx27.c index 262422a9c196..e325c9468105 100644 --- a/arch/arm/mach-imx/mach-imx27.c +++ b/arch/arm/mach-imx/mach-imx27.c @@ -56,17 +56,6 @@ static void __init imx27_init_early(void) mxc_set_cpu_type(MXC_CPU_MX27); } -static void __init mx27_init_irq(void) -{ - void __iomem *avic_base; - struct device_node *np; - - np = of_find_compatible_node(NULL, NULL, "fsl,avic"); - avic_base = of_iomap(np, 0); - BUG_ON(!avic_base); - mxc_init_irq(avic_base); -} - static const char * const imx27_dt_board_compat[] __initconst = { "fsl,imx27", NULL @@ -75,7 +64,6 @@ static const char * const imx27_dt_board_compat[] __initconst = { DT_MACHINE_START(IMX27_DT, "Freescale i.MX27 (Device Tree Support)") .map_io = mx27_map_io, .init_early = imx27_init_early, - .init_irq = mx27_init_irq, .init_late = imx27_pm_init, 
.dt_compat = imx27_dt_board_compat, MACHINE_END diff --git a/arch/arm/mach-imx/mach-imx31.c b/arch/arm/mach-imx/mach-imx31.c index dc69dfe600df..e9a1092b6093 100644 --- a/arch/arm/mach-imx/mach-imx31.c +++ b/arch/arm/mach-imx/mach-imx31.c @@ -14,6 +14,5 @@ static const char * const imx31_dt_board_compat[] __initconst = { DT_MACHINE_START(IMX31_DT, "Freescale i.MX31 (Device Tree Support)") .map_io = mx31_map_io, .init_early = imx31_init_early, - .init_irq = mx31_init_irq, .dt_compat = imx31_dt_board_compat, MACHINE_END diff --git a/arch/arm/mach-imx/mach-imx35.c b/arch/arm/mach-imx/mach-imx35.c index ec5c3068715c..0fc08218b77d 100644 --- a/arch/arm/mach-imx/mach-imx35.c +++ b/arch/arm/mach-imx/mach-imx35.c @@ -27,6 +27,5 @@ DT_MACHINE_START(IMX35_DT, "Freescale i.MX35 (Device Tree Support)") .l2c_aux_mask = ~0, .map_io = mx35_map_io, .init_early = imx35_init_early, - .init_irq = mx35_init_irq, .dt_compat = imx35_dt_board_compat, MACHINE_END diff --git a/arch/arm/mach-imx/mm-imx3.c b/arch/arm/mach-imx/mm-imx3.c index 5056438e5b42..28db97289ee8 100644 --- a/arch/arm/mach-imx/mm-imx3.c +++ b/arch/arm/mach-imx/mm-imx3.c @@ -109,18 +109,6 @@ void __init imx31_init_early(void) mx3_ccm_base = of_iomap(np, 0); BUG_ON(!mx3_ccm_base); } - -void __init mx31_init_irq(void) -{ - void __iomem *avic_base; - struct device_node *np; - - np = of_find_compatible_node(NULL, NULL, "fsl,imx31-avic"); - avic_base = of_iomap(np, 0); - BUG_ON(!avic_base); - - mxc_init_irq(avic_base); -} #endif /* ifdef CONFIG_SOC_IMX31 */ #ifdef CONFIG_SOC_IMX35 @@ -158,16 +146,4 @@ void __init imx35_init_early(void) mx3_ccm_base = of_iomap(np, 0); BUG_ON(!mx3_ccm_base); } - -void __init mx35_init_irq(void) -{ - void __iomem *avic_base; - struct device_node *np; - - np = of_find_compatible_node(NULL, NULL, "fsl,imx35-avic"); - avic_base = of_iomap(np, 0); - BUG_ON(!avic_base); - - mxc_init_irq(avic_base); -} #endif /* ifdef CONFIG_SOC_IMX35 */ diff --git a/arch/arm/mach-keystone/keystone.c b/arch/arm/mach-keystone/keystone.c index cd711bfc591f..2c647bdf8d25 100644 --- a/arch/arm/mach-keystone/keystone.c +++ b/arch/arm/mach-keystone/keystone.c @@ -65,7 +65,7 @@ static void __init keystone_init(void) static long long __init keystone_pv_fixup(void) { long long offset; - phys_addr_t mem_start, mem_end; + u64 mem_start, mem_end; mem_start = memblock_start_of_DRAM(); mem_end = memblock_end_of_DRAM(); @@ -78,7 +78,7 @@ static long long __init keystone_pv_fixup(void) if (mem_start < KEYSTONE_HIGH_PHYS_START || mem_end > KEYSTONE_HIGH_PHYS_END) { pr_crit("Invalid address space for memory (%08llx-%08llx)\n", - (u64)mem_start, (u64)mem_end); + mem_start, mem_end); return 0; } diff --git a/arch/arm/mach-omap1/ams-delta-fiq-handler.S b/arch/arm/mach-omap1/ams-delta-fiq-handler.S index 14a6c3eb3298..f745a65d3bd7 100644 --- a/arch/arm/mach-omap1/ams-delta-fiq-handler.S +++ b/arch/arm/mach-omap1/ams-delta-fiq-handler.S @@ -15,6 +15,7 @@ #include <linux/platform_data/gpio-omap.h> #include <asm/assembler.h> +#include <asm/irq.h> #include "ams-delta-fiq.h" #include "board-ams-delta.h" diff --git a/arch/arm/mach-omap2/board-generic.c b/arch/arm/mach-omap2/board-generic.c index 7290f033fd2d..1610c567a6a3 100644 --- a/arch/arm/mach-omap2/board-generic.c +++ b/arch/arm/mach-omap2/board-generic.c @@ -33,7 +33,7 @@ static void __init __maybe_unused omap_generic_init(void) } /* Clocks are needed early, see drivers/clocksource for the rest */ -void __init __maybe_unused omap_init_time_of(void) +static void __init __maybe_unused omap_init_time_of(void) { 
omap_clk_init(); timer_probe(); diff --git a/arch/arm/mach-omap2/omap-secure.c b/arch/arm/mach-omap2/omap-secure.c index f70d561f37f7..0659ab4cb0af 100644 --- a/arch/arm/mach-omap2/omap-secure.c +++ b/arch/arm/mach-omap2/omap-secure.c @@ -9,6 +9,7 @@ */ #include <linux/arm-smccc.h> +#include <linux/cpu_pm.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/io.h> @@ -20,6 +21,7 @@ #include "common.h" #include "omap-secure.h" +#include "soc.h" static phys_addr_t omap_secure_memblock_base; @@ -213,3 +215,40 @@ void __init omap_secure_init(void) { omap_optee_init_check(); } + +/* + * Dummy dispatcher call after core OSWR and MPU off. Updates the ROM return + * address after MMU has been re-enabled after CPU1 has been woken up again. + * Otherwise the ROM code will attempt to use the earlier physical return + * address that got set with MMU off when waking up CPU1. Only used on secure + * devices. + */ +static int cpu_notifier(struct notifier_block *nb, unsigned long cmd, void *v) +{ + switch (cmd) { + case CPU_CLUSTER_PM_EXIT: + omap_secure_dispatcher(OMAP4_PPA_SERVICE_0, + FLAG_START_CRITICAL, + 0, 0, 0, 0, 0); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block secure_notifier_block = { + .notifier_call = cpu_notifier, +}; + +static int __init secure_pm_init(void) +{ + if (omap_type() == OMAP2_DEVICE_TYPE_GP || !soc_is_omap44xx()) + return 0; + + cpu_pm_register_notifier(&secure_notifier_block); + + return 0; +} +omap_arch_initcall(secure_pm_init); diff --git a/arch/arm/mach-omap2/omap-secure.h b/arch/arm/mach-omap2/omap-secure.h index 4aaa95706d39..172069f31616 100644 --- a/arch/arm/mach-omap2/omap-secure.h +++ b/arch/arm/mach-omap2/omap-secure.h @@ -50,6 +50,7 @@ #define OMAP5_DRA7_MON_SET_ACR_INDEX 0x107 /* Secure PPA(Primary Protected Application) APIs */ +#define OMAP4_PPA_SERVICE_0 0x21 #define OMAP4_PPA_L2_POR_INDEX 0x23 #define OMAP4_PPA_CPU_ACTRL_SMP_INDEX 0x25 diff --git a/arch/arm/mach-omap2/pmic-cpcap.c b/arch/arm/mach-omap2/pmic-cpcap.c index 09076ad0576d..668dc84fd31e 100644 --- a/arch/arm/mach-omap2/pmic-cpcap.c +++ b/arch/arm/mach-omap2/pmic-cpcap.c @@ -246,10 +246,10 @@ int __init omap4_cpcap_init(void) omap_voltage_register_pmic(voltdm, &omap443x_max8952_mpu); if (of_machine_is_compatible("motorola,droid-bionic")) { - voltdm = voltdm_lookup("mpu"); + voltdm = voltdm_lookup("core"); omap_voltage_register_pmic(voltdm, &omap_cpcap_core); - voltdm = voltdm_lookup("mpu"); + voltdm = voltdm_lookup("iva"); omap_voltage_register_pmic(voltdm, &omap_cpcap_iva); } else { voltdm = voltdm_lookup("core"); diff --git a/arch/arm/mach-omap2/sr_device.c b/arch/arm/mach-omap2/sr_device.c index 62df666c2bd0..605925684b0a 100644 --- a/arch/arm/mach-omap2/sr_device.c +++ b/arch/arm/mach-omap2/sr_device.c @@ -88,34 +88,26 @@ static void __init sr_set_nvalues(struct omap_volt_data *volt_data, extern struct omap_sr_data omap_sr_pdata[]; -static int __init sr_dev_init(struct omap_hwmod *oh, void *user) +static int __init sr_init_by_name(const char *name, const char *voltdm) { struct omap_sr_data *sr_data = NULL; struct omap_volt_data *volt_data; - struct omap_smartreflex_dev_attr *sr_dev_attr; static int i; - if (!strncmp(oh->name, "smartreflex_mpu_iva", 20) || - !strncmp(oh->name, "smartreflex_mpu", 16)) + if (!strncmp(name, "smartreflex_mpu_iva", 20) || + !strncmp(name, "smartreflex_mpu", 16)) sr_data = &omap_sr_pdata[OMAP_SR_MPU]; - else if (!strncmp(oh->name, "smartreflex_core", 17)) + else if (!strncmp(name, "smartreflex_core", 17)) sr_data 
= &omap_sr_pdata[OMAP_SR_CORE]; - else if (!strncmp(oh->name, "smartreflex_iva", 16)) + else if (!strncmp(name, "smartreflex_iva", 16)) sr_data = &omap_sr_pdata[OMAP_SR_IVA]; if (!sr_data) { - pr_err("%s: Unknown instance %s\n", __func__, oh->name); + pr_err("%s: Unknown instance %s\n", __func__, name); return -EINVAL; } - sr_dev_attr = (struct omap_smartreflex_dev_attr *)oh->dev_attr; - if (!sr_dev_attr || !sr_dev_attr->sensor_voltdm_name) { - pr_err("%s: No voltage domain specified for %s. Cannot initialize\n", - __func__, oh->name); - goto exit; - } - - sr_data->name = oh->name; + sr_data->name = name; if (cpu_is_omap343x()) sr_data->ip_type = 1; else @@ -136,10 +128,10 @@ static int __init sr_dev_init(struct omap_hwmod *oh, void *user) } } - sr_data->voltdm = voltdm_lookup(sr_dev_attr->sensor_voltdm_name); + sr_data->voltdm = voltdm_lookup(voltdm); if (!sr_data->voltdm) { pr_err("%s: Unable to get voltage domain pointer for VDD %s\n", - __func__, sr_dev_attr->sensor_voltdm_name); + __func__, voltdm); goto exit; } @@ -160,6 +152,20 @@ exit: return 0; } +static int __init sr_dev_init(struct omap_hwmod *oh, void *user) +{ + struct omap_smartreflex_dev_attr *sr_dev_attr; + + sr_dev_attr = (struct omap_smartreflex_dev_attr *)oh->dev_attr; + if (!sr_dev_attr || !sr_dev_attr->sensor_voltdm_name) { + pr_err("%s: No voltage domain specified for %s. Cannot initialize\n", + __func__, oh->name); + return 0; + } + + return sr_init_by_name(oh->name, sr_dev_attr->sensor_voltdm_name); +} + /* * API to be called from board files to enable smartreflex * autocompensation at init. @@ -169,7 +175,42 @@ void __init omap_enable_smartreflex_on_init(void) sr_enable_on_init = true; } +static const char * const omap4_sr_instances[] = { + "mpu", + "iva", + "core", +}; + +static const char * const dra7_sr_instances[] = { + "mpu", + "core", +}; + int __init omap_devinit_smartreflex(void) { + const char * const *sr_inst = NULL; + int i, nr_sr = 0; + + if (soc_is_omap44xx()) { + sr_inst = omap4_sr_instances; + nr_sr = ARRAY_SIZE(omap4_sr_instances); + + } else if (soc_is_dra7xx()) { + sr_inst = dra7_sr_instances; + nr_sr = ARRAY_SIZE(dra7_sr_instances); + } + + if (nr_sr) { + const char *name, *voltdm; + + for (i = 0; i < nr_sr; i++) { + name = kasprintf(GFP_KERNEL, "smartreflex_%s", sr_inst[i]); + voltdm = sr_inst[i]; + sr_init_by_name(name, voltdm); + } + + return 0; + } + return omap_hwmod_for_each_by_class("smartreflex", sr_dev_init, NULL); } diff --git a/arch/arm/mach-pxa/mainstone.c b/arch/arm/mach-pxa/mainstone.c index d1010ec26e9f..d237bd030238 100644 --- a/arch/arm/mach-pxa/mainstone.c +++ b/arch/arm/mach-pxa/mainstone.c @@ -502,16 +502,20 @@ static inline void mainstone_init_keypad(void) {} #endif static int mst_pcmcia0_irqs[11] = { - [0 ... 10] = -1, + [0 ... 4] = -1, [5] = MAINSTONE_S0_CD_IRQ, + [6 ... 7] = -1, [8] = MAINSTONE_S0_STSCHG_IRQ, + [9] = -1, [10] = MAINSTONE_S0_IRQ, }; static int mst_pcmcia1_irqs[11] = { - [0 ... 10] = -1, + [0 ... 4] = -1, [5] = MAINSTONE_S1_CD_IRQ, + [6 ... 
7] = -1, [8] = MAINSTONE_S1_STSCHG_IRQ, + [9] = -1, [10] = MAINSTONE_S1_IRQ, }; diff --git a/arch/arm/mach-pxa/pxa_cplds_irqs.c b/arch/arm/mach-pxa/pxa_cplds_irqs.c index 45c19ca96f7a..ec0d9b094744 100644 --- a/arch/arm/mach-pxa/pxa_cplds_irqs.c +++ b/arch/arm/mach-pxa/pxa_cplds_irqs.c @@ -147,22 +147,20 @@ static int cplds_probe(struct platform_device *pdev) } irq_set_irq_wake(fpga->irq, 1); - fpga->irqdomain = irq_domain_add_linear(pdev->dev.of_node, - CPLDS_NB_IRQ, - &cplds_irq_domain_ops, fpga); + if (base_irq) + fpga->irqdomain = irq_domain_add_legacy(pdev->dev.of_node, + CPLDS_NB_IRQ, + base_irq, 0, + &cplds_irq_domain_ops, + fpga); + else + fpga->irqdomain = irq_domain_add_linear(pdev->dev.of_node, + CPLDS_NB_IRQ, + &cplds_irq_domain_ops, + fpga); if (!fpga->irqdomain) return -ENODEV; - if (base_irq) { - ret = irq_create_strict_mappings(fpga->irqdomain, base_irq, 0, - CPLDS_NB_IRQ); - if (ret) { - dev_err(&pdev->dev, "couldn't create the irq mapping %d..%d\n", - base_irq, base_irq + CPLDS_NB_IRQ); - return ret; - } - } - return 0; } diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index a25b660c3017..c1e12aab67b8 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -387,8 +387,7 @@ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot) pte_t *pte = pte_offset_fixmap(pmd_off_k(vaddr), vaddr); /* Make sure fixmap region does not exceed available allocation. */ - BUILD_BUG_ON(FIXADDR_START + (__end_of_fixed_addresses * PAGE_SIZE) > - FIXADDR_END); + BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) < FIXADDR_START); BUG_ON(idx >= __end_of_fixed_addresses); /* we only support device mappings until pgprot_kernel has been set */ diff --git a/arch/arm/mm/pmsa-v7.c b/arch/arm/mm/pmsa-v7.c index 88950e41a3a9..59d916ccdf25 100644 --- a/arch/arm/mm/pmsa-v7.c +++ b/arch/arm/mm/pmsa-v7.c @@ -235,6 +235,7 @@ void __init pmsav7_adjust_lowmem_bounds(void) phys_addr_t mem_end; phys_addr_t reg_start, reg_end; unsigned int mem_max_regions; + bool first = true; int num; u64 i; @@ -263,7 +264,7 @@ #endif for_each_mem_range(i, &reg_start, &reg_end) { - if (i == 0) { + if (first) { phys_addr_t phys_offset = PHYS_OFFSET; /* @@ -275,6 +276,7 @@ mem_start = reg_start; mem_end = reg_end; specified_mem_size = mem_end - mem_start; + first = false; } else { /* * memblock auto merges contiguous blocks, remove diff --git a/arch/arm/mm/pmsa-v8.c b/arch/arm/mm/pmsa-v8.c index 2de019f7503e..8359748a19a1 100644 --- a/arch/arm/mm/pmsa-v8.c +++ b/arch/arm/mm/pmsa-v8.c @@ -95,10 +95,11 @@ void __init pmsav8_adjust_lowmem_bounds(void) { phys_addr_t mem_end; phys_addr_t reg_start, reg_end; + bool first = true; u64 i; for_each_mem_range(i, &reg_start, &reg_end) { - if (i == 0) { + if (first) { phys_addr_t phys_offset = PHYS_OFFSET; /* @@ -107,6 +108,7 @@ if (reg_start != phys_offset) panic("First memory bank must be contiguous from PHYS_OFFSET"); mem_end = reg_end; + first = false; } else { /* * memblock auto merges contiguous blocks, remove diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c index c4b49b322e8a..f5f790c6e5f8 100644 --- a/arch/arm/probes/uprobes/core.c +++ b/arch/arm/probes/uprobes/core.c @@ -204,7 +204,7 @@ unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) static struct undef_hook uprobes_arm_break_hook = { .instr_mask = 0x0fffffff, .instr_val = (UPROBE_SWBP_ARM_INSN & 0x0fffffff), - .cpsr_mask = MODE_MASK, + .cpsr_mask 
= (PSR_T_BIT | MODE_MASK), .cpsr_val = USR_MODE, .fn = uprobe_trap_handler, }; @@ -212,7 +212,7 @@ static struct undef_hook uprobes_arm_break_hook = { static struct undef_hook uprobes_arm_ss_hook = { .instr_mask = 0x0fffffff, .instr_val = (UPROBE_SS_ARM_INSN & 0x0fffffff), - .cpsr_mask = MODE_MASK, + .cpsr_mask = (PSR_T_BIT | MODE_MASK), .cpsr_val = USR_MODE, .fn = uprobe_trap_handler, }; diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5656e7aacd69..f1a032ed2274 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -146,6 +146,7 @@ config ARM64 select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT select HAVE_ARCH_PFN_VALID select HAVE_ARCH_PREL32_RELOCATIONS + select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_STACKLEAK select HAVE_ARCH_THREAD_STRUCT_WHITELIST @@ -810,6 +811,16 @@ config QCOM_FALKOR_ERRATUM_E1041 If unsure, say Y. +config NVIDIA_CARMEL_CNP_ERRATUM + bool "NVIDIA Carmel CNP: CNP on Carmel semantically different than ARM cores" + default y + help + If CNP is enabled on Carmel cores, non-sharable TLBIs on a core will not + invalidate shared TLB entries installed by a different core, as it would + on standard ARM cores. + + If unsure, say Y. + config SOCIONEXT_SYNQUACER_PREITS bool "Socionext Synquacer: Workaround for GICv3 pre-ITS" default y @@ -1396,10 +1407,13 @@ config ARM64_PAN config AS_HAS_LDAPR def_bool $(as-instr,.arch_extension rcpc) +config AS_HAS_LSE_ATOMICS + def_bool $(as-instr,.arch_extension lse) + config ARM64_LSE_ATOMICS bool default ARM64_USE_LSE_ATOMICS - depends on $(as-instr,.arch_extension lse) + depends on AS_HAS_LSE_ATOMICS config ARM64_USE_LSE_ATOMICS bool "Atomic instructions" @@ -1656,6 +1670,7 @@ config ARM64_MTE default y depends on ARM64_AS_HAS_MTE && ARM64_TAGGED_ADDR_ABI depends on AS_HAS_ARMV8_5 + depends on AS_HAS_LSE_ATOMICS # Required for tag checking in the uaccess routines depends on ARM64_PAN select ARCH_USES_HIGH_VMA_FLAGS diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-lts.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-lts.dts index 437ffe3628a5..596a25907432 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-lts.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-lts.dts @@ -19,3 +19,7 @@ }; }; }; + +&mmc0 { + broken-cd; /* card detect is broken on *some* boards */ +}; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi index 3402cec87035..df62044ff7a7 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi @@ -34,7 +34,7 @@ vmmc-supply = <®_dcdc1>; disable-wp; bus-width = <4>; - cd-gpios = <&pio 5 6 GPIO_ACTIVE_LOW>; /* PF6 */ + cd-gpios = <&pio 5 6 GPIO_ACTIVE_HIGH>; /* PF6 push-pull switch */ status = "okay"; }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts index 4f4755152fce..b5808047d6e4 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts @@ -289,10 +289,6 @@ vcc-pm-supply = <®_aldo1>; }; -&rtc { - clocks = <&ext_osc32k>; -}; - &spdif { status = "okay"; }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi index 49e979794094..af8b7d0ef750 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi @@ -995,9 +995,9 @@ compatible = 
"allwinner,sun8i-a23-rsb"; reg = <0x07083000 0x400>; interrupts = <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&r_ccu 13>; + clocks = <&r_ccu CLK_R_APB2_RSB>; clock-frequency = <3000000>; - resets = <&r_ccu 7>; + resets = <&r_ccu RST_R_APB2_RSB>; pinctrl-names = "default"; pinctrl-0 = <&r_rsb_pins>; status = "disabled"; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi index 7de6b376d792..9058cfa4980f 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi @@ -198,6 +198,7 @@ ranges = <0x0 0x00 0x1700000 0x100000>; reg = <0x00 0x1700000 0x0 0x100000>; interrupts = <GIC_SPI 75 IRQ_TYPE_LEVEL_HIGH>; + dma-coherent; sec_jr0: jr@10000 { compatible = "fsl,sec-v5.4-job-ring", diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi index 5a8a1dc4262d..28c51e521cb2 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi @@ -348,6 +348,7 @@ ranges = <0x0 0x00 0x1700000 0x100000>; reg = <0x00 0x1700000 0x0 0x100000>; interrupts = <0 75 0x4>; + dma-coherent; sec_jr0: jr@10000 { compatible = "fsl,sec-v5.4-job-ring", diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi index 1d6dfd189c7f..39458305e333 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi @@ -354,6 +354,7 @@ ranges = <0x0 0x00 0x1700000 0x100000>; reg = <0x00 0x1700000 0x0 0x100000>; interrupts = <GIC_SPI 75 IRQ_TYPE_LEVEL_HIGH>; + dma-coherent; sec_jr0: jr@10000 { compatible = "fsl,sec-v5.4-job-ring", diff --git a/arch/arm64/boot/dts/freescale/imx8mm-pinfunc.h b/arch/arm64/boot/dts/freescale/imx8mm-pinfunc.h index 5ccc4cc91959..a003e6af3353 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-pinfunc.h +++ b/arch/arm64/boot/dts/freescale/imx8mm-pinfunc.h @@ -124,7 +124,7 @@ #define MX8MM_IOMUXC_SD1_CMD_USDHC1_CMD 0x0A4 0x30C 0x000 0x0 0x0 #define MX8MM_IOMUXC_SD1_CMD_GPIO2_IO1 0x0A4 0x30C 0x000 0x5 0x0 #define MX8MM_IOMUXC_SD1_DATA0_USDHC1_DATA0 0x0A8 0x310 0x000 0x0 0x0 -#define MX8MM_IOMUXC_SD1_DATA0_GPIO2_IO2 0x0A8 0x31 0x000 0x5 0x0 +#define MX8MM_IOMUXC_SD1_DATA0_GPIO2_IO2 0x0A8 0x310 0x000 0x5 0x0 #define MX8MM_IOMUXC_SD1_DATA1_USDHC1_DATA1 0x0AC 0x314 0x000 0x0 0x0 #define MX8MM_IOMUXC_SD1_DATA1_GPIO2_IO3 0x0AC 0x314 0x000 0x5 0x0 #define MX8MM_IOMUXC_SD1_DATA2_USDHC1_DATA2 0x0B0 0x318 0x000 0x0 0x0 diff --git a/arch/arm64/boot/dts/freescale/imx8mp-phyboard-pollux-rdk.dts b/arch/arm64/boot/dts/freescale/imx8mp-phyboard-pollux-rdk.dts index 0e1a6d953389..122c95ddad30 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-phyboard-pollux-rdk.dts +++ b/arch/arm64/boot/dts/freescale/imx8mp-phyboard-pollux-rdk.dts @@ -35,7 +35,7 @@ &i2c2 { clock-frequency = <400000>; - pinctrl-names = "default"; + pinctrl-names = "default", "gpio"; pinctrl-0 = <&pinctrl_i2c2>; pinctrl-1 = <&pinctrl_i2c2_gpio>; sda-gpios = <&gpio5 17 (GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN)>; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-phycore-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-phycore-som.dtsi index 44a8c2337cee..f3965ec5b31d 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-phycore-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-phycore-som.dtsi @@ -67,7 +67,7 @@ &i2c1 { clock-frequency = <400000>; - pinctrl-names = "default"; + pinctrl-names = "default", "gpio"; pinctrl-0 = <&pinctrl_i2c1>; pinctrl-1 = 
<&pinctrl_i2c1_gpio>; sda-gpios = <&gpio5 15 (GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN)>; diff --git a/arch/arm64/boot/dts/freescale/imx8mq-pinfunc.h b/arch/arm64/boot/dts/freescale/imx8mq-pinfunc.h index b94b02080a34..68e8fa172974 100644 --- a/arch/arm64/boot/dts/freescale/imx8mq-pinfunc.h +++ b/arch/arm64/boot/dts/freescale/imx8mq-pinfunc.h @@ -130,7 +130,7 @@ #define MX8MQ_IOMUXC_SD1_CMD_USDHC1_CMD 0x0A4 0x30C 0x000 0x0 0x0 #define MX8MQ_IOMUXC_SD1_CMD_GPIO2_IO1 0x0A4 0x30C 0x000 0x5 0x0 #define MX8MQ_IOMUXC_SD1_DATA0_USDHC1_DATA0 0x0A8 0x310 0x000 0x0 0x0 -#define MX8MQ_IOMUXC_SD1_DATA0_GPIO2_IO2 0x0A8 0x31 0x000 0x5 0x0 +#define MX8MQ_IOMUXC_SD1_DATA0_GPIO2_IO2 0x0A8 0x310 0x000 0x5 0x0 #define MX8MQ_IOMUXC_SD1_DATA1_USDHC1_DATA1 0x0AC 0x314 0x000 0x0 0x0 #define MX8MQ_IOMUXC_SD1_DATA1_GPIO2_IO3 0x0AC 0x314 0x000 0x5 0x0 #define MX8MQ_IOMUXC_SD1_DATA2_USDHC1_DATA2 0x0B0 0x318 0x000 0x0 0x0 diff --git a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts index d239ab70ed99..53e817c5f6f3 100644 --- a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts +++ b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts @@ -1,7 +1,7 @@ // SPDX-License-Identifier: (GPL-2.0+ OR MIT) /* * Device Tree file for CZ.NIC Turris Mox Board - * 2019 by Marek Behun <marek.behun@nic.cz> + * 2019 by Marek Behún <kabel@kernel.org> */ /dts-v1/; diff --git a/arch/arm64/boot/dts/marvell/armada-cp11x.dtsi b/arch/arm64/boot/dts/marvell/armada-cp11x.dtsi index 64179a372ecf..c6f5df2deccf 100644 --- a/arch/arm64/boot/dts/marvell/armada-cp11x.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-cp11x.dtsi @@ -310,9 +310,11 @@ }; CP11X_LABEL(sata0): sata@540000 { - compatible = "marvell,armada-8k-ahci"; + compatible = "marvell,armada-8k-ahci", + "generic-ahci"; reg = <0x540000 0x30000>; dma-coherent; + interrupts = <107 IRQ_TYPE_LEVEL_HIGH>; clocks = <&CP11X_LABEL(clk) 1 15>, <&CP11X_LABEL(clk) 1 16>; #address-cells = <1>; @@ -320,12 +322,10 @@ status = "disabled"; sata-port@0 { - interrupts = <109 IRQ_TYPE_LEVEL_HIGH>; reg = <0>; }; sata-port@1 { - interrupts = <107 IRQ_TYPE_LEVEL_HIGH>; reg = <1>; }; }; diff --git a/arch/arm64/boot/dts/nvidia/tegra186-p2771-0000.dts b/arch/arm64/boot/dts/nvidia/tegra186-p2771-0000.dts index 9f5f5e1fa82e..683743f81849 100644 --- a/arch/arm64/boot/dts/nvidia/tegra186-p2771-0000.dts +++ b/arch/arm64/boot/dts/nvidia/tegra186-p2771-0000.dts @@ -10,7 +10,7 @@ model = "NVIDIA Jetson TX2 Developer Kit"; compatible = "nvidia,p2771-0000", "nvidia,tegra186"; - aconnect { + aconnect@2900000 { status = "okay"; dma-controller@2930000 { diff --git a/arch/arm64/boot/dts/nvidia/tegra186-p3310.dtsi b/arch/arm64/boot/dts/nvidia/tegra186-p3310.dtsi index fd9177447711..fcd71bfc6707 100644 --- a/arch/arm64/boot/dts/nvidia/tegra186-p3310.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra186-p3310.dtsi @@ -23,7 +23,7 @@ }; chosen { - bootargs = "earlycon console=ttyS0,115200n8"; + bootargs = "earlycon console=ttyS0,115200n8 fw_devlink=on"; stdout-path = "serial0:115200n8"; }; diff --git a/arch/arm64/boot/dts/nvidia/tegra186.dtsi b/arch/arm64/boot/dts/nvidia/tegra186.dtsi index 02b26b39cedc..9f75bbf00cf7 100644 --- a/arch/arm64/boot/dts/nvidia/tegra186.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra186.dtsi @@ -73,7 +73,7 @@ snps,rxpbl = <8>; }; - aconnect { + aconnect@2900000 { compatible = "nvidia,tegra186-aconnect", "nvidia,tegra210-aconnect"; clocks = <&bpmp TEGRA186_CLK_APE>, diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts 
b/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts index 2888efc42ba1..d618f197a1d3 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts +++ b/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts @@ -651,6 +651,8 @@ reg = <0x1a>; interrupt-parent = <&gpio>; interrupts = <TEGRA194_MAIN_GPIO(S, 5) GPIO_ACTIVE_HIGH>; + clocks = <&bpmp TEGRA194_CLK_AUD_MCLK>; + clock-names = "mclk"; realtek,jd-src = <2>; sound-name-prefix = "CVB-RT"; @@ -658,7 +660,6 @@ rt5658_ep: endpoint { remote-endpoint = <&i2s1_dap_ep>; mclk-fs = <256>; - clocks = <&bpmp TEGRA194_CLK_AUD_MCLK>; }; }; }; diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p3668-0000.dtsi b/arch/arm64/boot/dts/nvidia/tegra194-p3668-0000.dtsi index 7da3d48cb410..14da4206ea66 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194-p3668-0000.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra194-p3668-0000.dtsi @@ -5,6 +5,10 @@ model = "NVIDIA Jetson Xavier NX (SD-card)"; compatible = "nvidia,p3668-0000", "nvidia,tegra194"; + aliases { + mmc0 = "/bus@0/mmc@3400000"; + }; + bus@0 { /* SDMMC1 (SD/MMC) */ mmc@3400000 { diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p3668-0001.dtsi b/arch/arm64/boot/dts/nvidia/tegra194-p3668-0001.dtsi index b7808648cfe4..f5a9ebbfb12f 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194-p3668-0001.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra194-p3668-0001.dtsi @@ -5,6 +5,10 @@ model = "NVIDIA Jetson Xavier NX (eMMC)"; compatible = "nvidia,p3668-0001", "nvidia,tegra194"; + aliases { + mmc0 = "/bus@0/mmc@3460000"; + }; + bus@0 { /* SDMMC4 (eMMC) */ mmc@3460000 { diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi b/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi index 4f12721c332b..f16b0aa8a374 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi @@ -14,7 +14,6 @@ i2c5 = "/bus@0/i2c@31c0000"; i2c6 = "/bus@0/i2c@c250000"; i2c7 = "/bus@0/i2c@31e0000"; - mmc0 = "/bus@0/mmc@3460000"; rtc0 = "/bpmp/i2c/pmic@3c"; rtc1 = "/bus@0/rtc@c2a0000"; serial0 = &tcu; diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S index bbdb54702aa7..247011356d11 100644 --- a/arch/arm64/crypto/aes-modes.S +++ b/arch/arm64/crypto/aes-modes.S @@ -359,6 +359,7 @@ ST5( mov v4.16b, vctr.16b ) ins vctr.d[0], x8 /* apply carry to N counter blocks for N := x12 */ + cbz x12, 2f adr x16, 1f sub x16, x16, x12, lsl #3 br x16 diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c index 683de671741a..9c3d86e397bf 100644 --- a/arch/arm64/crypto/poly1305-glue.c +++ b/arch/arm64/crypto/poly1305-glue.c @@ -25,7 +25,7 @@ asmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce); static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); -void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) { poly1305_init_arm64(&dctx->h, key); dctx->s[0] = get_unaligned_le32(key + 16); diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h index 5df500dcc627..8a078fc662ac 100644 --- a/arch/arm64/include/asm/alternative-macros.h +++ b/arch/arm64/include/asm/alternative-macros.h @@ -97,9 +97,9 @@ .popsection .subsection 1 663: \insn2 -664: .previous - .org . - (664b-663b) + (662b-661b) +664: .org . - (664b-663b) + (662b-661b) .org . - (662b-661b) + (664b-663b) + .previous .endif .endm @@ -169,11 +169,11 @@ */ .macro alternative_endif 664: + .org . - (664b-663b) + (662b-661b) + .org . 
- (662b-661b) + (664b-663b) .if .Lasm_alt_mode==0 .previous .endif - .org . - (664b-663b) + (662b-661b) - .org . - (662b-661b) + (664b-663b) .endm /* diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h index 93a161b3bf3f..dc52b733675d 100644 --- a/arch/arm64/include/asm/checksum.h +++ b/arch/arm64/include/asm/checksum.h @@ -37,7 +37,7 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) } while (--n > 0); sum += ((sum >> 32) | (sum << 32)); - return csum_fold((__force u32)(sum >> 32)); + return csum_fold((__force __wsum)(sum >> 32)); } #define ip_fast_csum ip_fast_csum diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index b77d997b173b..c40f2490cd7b 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -66,7 +66,8 @@ #define ARM64_WORKAROUND_1508412 58 #define ARM64_HAS_LDAPR 59 #define ARM64_KVM_PROTECTED_MODE 60 +#define ARM64_WORKAROUND_NVIDIA_CARMEL_CNP 61 -#define ARM64_NCAPS 61 +#define ARM64_NCAPS 62 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 4e90c2debf70..94d4025acc0b 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -278,6 +278,7 @@ #define CPTR_EL2_DEFAULT CPTR_EL2_RES1 /* Hyp Debug Configuration Register bits */ +#define MDCR_EL2_TTRF (1 << 19) #define MDCR_EL2_TPMS (1 << 14) #define MDCR_EL2_E2PB_MASK (UL(0x3)) #define MDCR_EL2_E2PB_SHIFT (UL(12)) diff --git a/arch/arm64/include/asm/paravirt.h b/arch/arm64/include/asm/paravirt.h index cf3a0fd7c1a7..9aa193e0e8f2 100644 --- a/arch/arm64/include/asm/paravirt.h +++ b/arch/arm64/include/asm/paravirt.h @@ -3,23 +3,19 @@ #define _ASM_ARM64_PARAVIRT_H #ifdef CONFIG_PARAVIRT +#include <linux/static_call_types.h> + struct static_key; extern struct static_key paravirt_steal_enabled; extern struct static_key paravirt_steal_rq_enabled; -struct pv_time_ops { - unsigned long long (*steal_clock)(int cpu); -}; - -struct paravirt_patch_template { - struct pv_time_ops time; -}; +u64 dummy_steal_clock(int cpu); -extern struct paravirt_patch_template pv_ops; +DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock); static inline u64 paravirt_steal_clock(int cpu) { - return pv_ops.time.steal_clock(cpu); + return static_call(pv_steal_clock)(cpu); } int __init pv_time_init(void); diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index ca2cd75d3286..efc10e9041a0 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -251,6 +251,8 @@ unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_switch_to(struct task_struct *prev, struct task_struct *next); +asmlinkage void arm64_preempt_schedule_irq(void); + #define task_pt_regs(p) \ ((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 9f4e3b266f21..6623c99f0984 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -55,6 +55,8 @@ void arch_setup_new_exec(void); #define arch_setup_new_exec arch_setup_new_exec void arch_release_task_struct(struct task_struct *tsk); +int arch_dup_task_struct(struct task_struct *dst, + struct task_struct *src); #endif diff --git a/arch/arm64/include/asm/word-at-a-time.h b/arch/arm64/include/asm/word-at-a-time.h index 3333950b5909..ea487218db79 100644 --- a/arch/arm64/include/asm/word-at-a-time.h +++ 
b/arch/arm64/include/asm/word-at-a-time.h @@ -53,7 +53,7 @@ static inline unsigned long find_zero(unsigned long mask) */ static inline unsigned long load_unaligned_zeropad(const void *addr) { - unsigned long ret, offset; + unsigned long ret, tmp; /* Load word from unaligned pointer addr */ asm( @@ -61,9 +61,9 @@ static inline unsigned long load_unaligned_zeropad(const void *addr) "2:\n" " .pushsection .fixup,\"ax\"\n" " .align 2\n" - "3: and %1, %2, #0x7\n" - " bic %2, %2, #0x7\n" - " ldr %0, [%2]\n" + "3: bic %1, %2, #0x7\n" + " ldr %0, [%1]\n" + " and %1, %2, #0x7\n" " lsl %1, %1, #0x3\n" #ifndef __AARCH64EB__ " lsr %0, %0, %1\n" @@ -73,7 +73,7 @@ static inline unsigned long load_unaligned_zeropad(const void *addr) " b 2b\n" " .popsection\n" _ASM_EXTABLE(1b, 3b) - : "=&r" (ret), "=&r" (offset) + : "=&r" (ret), "=&r" (tmp) : "r" (addr), "Q" (*(unsigned long *)addr)); return ret; diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index ed65576ce710..6cc97730790e 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -9,6 +9,11 @@ CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_insn.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) +# Remove stack protector to avoid triggering unneeded stack canary +# checks due to randomize_kstack_offset. +CFLAGS_REMOVE_syscall.o = -fstack-protector -fstack-protector-strong +CFLAGS_syscall.o += -fno-stack-protector + # Object file lists. obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ entry-common.o entry-fpsimd.o process.o ptrace.o \ diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 506a1cd37973..e2c20c036442 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -526,6 +526,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = { 1, 0), }, #endif +#ifdef CONFIG_NVIDIA_CARMEL_CNP_ERRATUM + { + /* NVIDIA Carmel */ + .desc = "NVIDIA Carmel CNP erratum", + .capability = ARM64_WORKAROUND_NVIDIA_CARMEL_CNP, + ERRATA_MIDR_ALL_VERSIONS(MIDR_NVIDIA_CARMEL), + }, +#endif { } }; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 066030717a4c..e5281e1c8f1d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -383,7 +383,6 @@ static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = { * of support. */ S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_PMUVER_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_TRACEVER_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_DEBUGVER_SHIFT, 4, 0x6), ARM64_FTR_END, }; @@ -1321,7 +1320,10 @@ has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope) * may share TLB entries with a CPU stuck in the crashed * kernel. */ - if (is_kdump_kernel()) + if (is_kdump_kernel()) + return false; + + if (cpus_have_const_cap(ARM64_WORKAROUND_NVIDIA_CARMEL_CNP)) return false; return has_cpuid_feature(entry, scope); diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 77605aec25fe..51fcf99d5351 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -353,7 +353,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) * with the CLIDR_EL1 fields to avoid triggering false warnings * when there is a mismatch across the CPUs. Keep track of the * effective value of the CTR_EL0 in our internal records for - * acurate sanity check and feature enablement. + * accurate sanity check and feature enablement. 
*/ info->reg_ctr = read_cpuid_effective_cachetype(); info->reg_dczid = read_cpuid(DCZID_EL0); diff --git a/arch/arm64/kernel/crash_dump.c b/arch/arm64/kernel/crash_dump.c index e6e284265f19..58303a9ec32c 100644 --- a/arch/arm64/kernel/crash_dump.c +++ b/arch/arm64/kernel/crash_dump.c @@ -64,5 +64,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) { memcpy(buf, phys_to_virt((phys_addr_t)*ppos), count); + *ppos += count; + return count; } diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index a31a0a713c85..6acfc5e6b5e0 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -148,16 +148,18 @@ alternative_cb_end .endm /* Check for MTE asynchronous tag check faults */ - .macro check_mte_async_tcf, flgs, tmp + .macro check_mte_async_tcf, tmp, ti_flags #ifdef CONFIG_ARM64_MTE + .arch_extension lse alternative_if_not ARM64_MTE b 1f alternative_else_nop_endif mrs_s \tmp, SYS_TFSRE0_EL1 tbz \tmp, #SYS_TFSR_EL1_TF0_SHIFT, 1f /* Asynchronous TCF occurred for TTBR0 access, set the TI flag */ - orr \flgs, \flgs, #_TIF_MTE_ASYNC_FAULT - str \flgs, [tsk, #TSK_TI_FLAGS] + mov \tmp, #_TIF_MTE_ASYNC_FAULT + add \ti_flags, tsk, #TSK_TI_FLAGS + stset \tmp, [\ti_flags] msr_s SYS_TFSRE0_EL1, xzr 1: #endif @@ -244,7 +246,7 @@ alternative_else_nop_endif disable_step_tsk x19, x20 /* Check for asynchronous tag check faults in user space */ - check_mte_async_tcf x19, x22 + check_mte_async_tcf x22, x23 apply_ssbd 1, x22, x23 ptrauth_keys_install_kernel tsk, x20, x22, x23 diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c index c07d7a034941..75fed4460407 100644 --- a/arch/arm64/kernel/paravirt.c +++ b/arch/arm64/kernel/paravirt.c @@ -18,6 +18,7 @@ #include <linux/reboot.h> #include <linux/slab.h> #include <linux/types.h> +#include <linux/static_call.h> #include <asm/paravirt.h> #include <asm/pvclock-abi.h> @@ -26,8 +27,12 @@ struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; -struct paravirt_patch_template pv_ops; -EXPORT_SYMBOL_GPL(pv_ops); +static u64 native_steal_clock(int cpu) +{ + return 0; +} + +DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); struct pv_time_stolen_time_region { struct pvclock_vcpu_stolen_time *kaddr; @@ -45,7 +50,7 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); /* return stolen time in ns by asking the hypervisor */ -static u64 pv_steal_clock(int cpu) +static u64 para_steal_clock(int cpu) { struct pv_time_stolen_time_region *reg; @@ -150,7 +155,7 @@ int __init pv_time_init(void) if (ret) return ret; - pv_ops.time.steal_clock = pv_steal_clock; + static_call_update(pv_steal_clock, para_steal_clock); static_key_slow_inc(¶virt_steal_enabled); if (steal_acc) diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 66aac2881ba8..85645b2b0c7a 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -267,10 +267,12 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr) if (!instruction_pointer(regs)) BUG(); - if (kcb->kprobe_status == KPROBE_REENTER) + if (kcb->kprobe_status == KPROBE_REENTER) { restore_previous_kprobe(kcb); - else + } else { + kprobes_restore_local_irqflag(kcb, regs); reset_current_kprobe(); + } break; case KPROBE_HIT_ACTIVE: diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 325c83b1a24d..6e60aa3b5ea9 100644 --- a/arch/arm64/kernel/process.c +++ 
b/arch/arm64/kernel/process.c @@ -57,6 +57,8 @@ #include <asm/processor.h> #include <asm/pointer_auth.h> #include <asm/stacktrace.h> +#include <asm/switch_to.h> +#include <asm/system_misc.h> #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) #include <linux/stackprotector.h> diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 5bfd9b87f85d..4ea9392f86e0 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -134,7 +134,7 @@ SYM_FUNC_START(_cpu_resume) */ bl cpu_do_resume -#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK +#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK) mov x0, sp bl kasan_unpoison_task_stack_below #endif diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index ad20981dfda4..d55bdfb7789c 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -194,8 +194,9 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) #ifdef CONFIG_STACKTRACE -void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, - struct task_struct *task, struct pt_regs *regs) +noinline void arch_stack_walk(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task, + struct pt_regs *regs) { struct stackframe frame; @@ -203,8 +204,8 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, start_backtrace(&frame, regs->regs[29], regs->pc); else if (task == current) start_backtrace(&frame, - (unsigned long)__builtin_frame_address(0), - (unsigned long)arch_stack_walk); + (unsigned long)__builtin_frame_address(1), + (unsigned long)__builtin_return_address(0)); else start_backtrace(&frame, thread_saved_fp(task), thread_saved_pc(task)); diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index b9cf12b271d7..263d6c1a525f 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -5,6 +5,7 @@ #include <linux/errno.h> #include <linux/nospec.h> #include <linux/ptrace.h> +#include <linux/randomize_kstack.h> #include <linux/syscalls.h> #include <asm/daifflags.h> @@ -43,6 +44,8 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, { long ret; + add_random_kstack_offset(); + if (scno < sc_nr) { syscall_fn_t syscall_fn; syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; @@ -55,6 +58,19 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, ret = lower_32_bits(ret); regs->regs[0] = ret; + + /* + * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), + * but not enough for arm64 stack utilization comfort. To keep + * reasonable stack head room, reduce the maximum offset to 9 bits. + * + * The actual entropy will be further reduced by the compiler when + * applying stack alignment constraints: the AAPCS mandates a + * 16-byte (i.e. 4-bit) aligned SP at function boundaries. + * + * The resulting 5 bits of entropy is seen in SP[8:4]. 
+ */ + choose_random_kstack_offset(get_random_int() & 0x1FF); } static inline bool has_syscall_work(unsigned long flags) diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 7a7e425616b5..dbc890511631 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -89,6 +89,7 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) * - Debug ROM Address (MDCR_EL2_TDRA) * - OS related registers (MDCR_EL2_TDOSA) * - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB) + * - Self-hosted Trace Filter controls (MDCR_EL2_TTRF) * * Additionally, KVM only traps guest accesses to the debug registers if * the guest is not actively using them (see the KVM_ARM64_DEBUG_DIRTY @@ -112,6 +113,7 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK; vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM | MDCR_EL2_TPMS | + MDCR_EL2_TTRF | MDCR_EL2_TPMCR | MDCR_EL2_TDRA | MDCR_EL2_TDOSA); diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index ee3682b9873c..39f8f7f9227c 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -429,6 +429,13 @@ u64 __vgic_v3_get_gic_config(void) if (has_vhe()) flags = local_daif_save(); + /* + * Table 11-2 "Permitted ICC_SRE_ELx.SRE settings" indicates + * that to be able to set ICC_SRE_EL1.SRE to 0, all the + * interrupt overrides must be set. You've got to love this. + */ + sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO); + isb(); write_gicreg(0, ICC_SRE_EL1); isb(); @@ -436,6 +443,8 @@ u64 __vgic_v3_get_gic_config(void) write_gicreg(sre, ICC_SRE_EL1); isb(); + sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0); + isb(); if (has_vhe()) local_daif_restore(flags); diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index 15a6c98ee92f..2f1b156021a6 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -86,7 +86,7 @@ static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, } break; case GICD_TYPER2: - if (kvm_vgic_global_state.has_gicv4_1) + if (kvm_vgic_global_state.has_gicv4_1 && gic_cpuif_has_vsgi()) value = GICD_TYPER2_nASSGIcap; break; case GICD_IIDR: @@ -119,7 +119,7 @@ static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu, dist->enabled = val & GICD_CTLR_ENABLE_SS_G1; /* Not a GICv4.1? No HW SGIs */ - if (!kvm_vgic_global_state.has_gicv4_1) + if (!kvm_vgic_global_state.has_gicv4_1 || !gic_cpuif_has_vsgi()) val &= ~GICD_CTLR_nASSGIreq; /* Dist stays enabled? nASSGIreq is RO */ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 7484ea4f6ba0..5d9550fdb9cf 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1448,6 +1448,22 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size) struct range arch_get_mappable_range(void) { struct range mhp_range; + u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual)); + u64 end_linear_pa = __pa(PAGE_END - 1); + + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + /* + * Check for a wrap, it is possible because of randomized linear + * mapping the start physical address is actually bigger than + * the end physical address. In this case set start to zero + * because [0, end_linear_pa] range must still be able to cover + * all addressable physical addresses. 
+ */ + if (start_linear_pa > end_linear_pa) + start_linear_pa = 0; + } + + WARN_ON(start_linear_pa > end_linear_pa); /* * Linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)] @@ -1455,8 +1471,9 @@ struct range arch_get_mappable_range(void) * range which can be mapped inside this linear mapping range, must * also be derived from its end points. */ - mhp_range.start = __pa(_PAGE_OFFSET(vabits_actual)); - mhp_range.end = __pa(PAGE_END - 1); + mhp_range.start = start_linear_pa; + mhp_range.end = end_linear_pa; + return mhp_range; } diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 34e91224adc3..8de5b987edb9 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -314,7 +314,7 @@ config FORCE_MAX_ZONEORDER int "Maximum zone order" default "11" -config RAM_BASE +config DRAM_BASE hex "DRAM start addr (the same with memory-section in dts)" default 0x0 diff --git a/arch/csky/include/asm/page.h b/arch/csky/include/asm/page.h index 3b91fc3cf36f..ed7451478b1b 100644 --- a/arch/csky/include/asm/page.h +++ b/arch/csky/include/asm/page.h @@ -28,7 +28,7 @@ #define SSEG_SIZE 0x20000000 #define LOWMEM_LIMIT (SSEG_SIZE * 2) -#define PHYS_OFFSET_OFFSET (CONFIG_RAM_BASE & (SSEG_SIZE - 1)) +#define PHYS_OFFSET_OFFSET (CONFIG_DRAM_BASE & (SSEG_SIZE - 1)) #ifndef __ASSEMBLY__ diff --git a/arch/ia64/configs/generic_defconfig b/arch/ia64/configs/generic_defconfig index ca0d596c800d..8916a2850c48 100644 --- a/arch/ia64/configs/generic_defconfig +++ b/arch/ia64/configs/generic_defconfig @@ -55,8 +55,6 @@ CONFIG_CHR_DEV_SG=m CONFIG_SCSI_FC_ATTRS=y CONFIG_SCSI_SYM53C8XX_2=y CONFIG_SCSI_QLOGIC_1280=y -CONFIG_ATA=y -CONFIG_ATA_PIIX=y CONFIG_SATA_VITESSE=y CONFIG_MD=y CONFIG_BLK_DEV_MD=m diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h index b3aa46090101..08179135905c 100644 --- a/arch/ia64/include/asm/ptrace.h +++ b/arch/ia64/include/asm/ptrace.h @@ -54,8 +54,7 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs) { - /* FIXME: should this be bspstore + nr_dirty regs? */ - return regs->ar_bspstore; + return regs->r12; } static inline int is_syscall_success(struct pt_regs *regs) @@ -79,11 +78,6 @@ static inline long regs_return_value(struct pt_regs *regs) unsigned long __ip = instruction_pointer(regs); \ (__ip & ~3UL) + ((__ip & 3UL) << 2); \ }) -/* - * Why not default? Because user_stack_pointer() on ia64 gives register - * stack backing store instead... 
- */ -#define current_user_stack_pointer() (current_pt_regs()->r12) /* given a pointer to a task_struct, return the user's pt_regs */ # define task_pt_regs(t) (((struct pt_regs *) ((char *) (t) + IA64_STK_OFFSET)) - 1) diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c index 8b5b8e6bc9d9..dd5bfed52031 100644 --- a/arch/ia64/kernel/err_inject.c +++ b/arch/ia64/kernel/err_inject.c @@ -59,7 +59,7 @@ show_##name(struct device *dev, struct device_attribute *attr, \ char *buf) \ { \ u32 cpu=dev->id; \ - return sprintf(buf, "%lx\n", name[cpu]); \ + return sprintf(buf, "%llx\n", name[cpu]); \ } #define store(name) \ @@ -86,9 +86,9 @@ store_call_start(struct device *dev, struct device_attribute *attr, #ifdef ERR_INJ_DEBUG printk(KERN_DEBUG "pal_mc_err_inject for cpu%d:\n", cpu); - printk(KERN_DEBUG "err_type_info=%lx,\n", err_type_info[cpu]); - printk(KERN_DEBUG "err_struct_info=%lx,\n", err_struct_info[cpu]); - printk(KERN_DEBUG "err_data_buffer=%lx, %lx, %lx.\n", + printk(KERN_DEBUG "err_type_info=%llx,\n", err_type_info[cpu]); + printk(KERN_DEBUG "err_struct_info=%llx,\n", err_struct_info[cpu]); + printk(KERN_DEBUG "err_data_buffer=%llx, %llx, %llx.\n", err_data_buffer[cpu].data1, err_data_buffer[cpu].data2, err_data_buffer[cpu].data3); @@ -117,8 +117,8 @@ store_call_start(struct device *dev, struct device_attribute *attr, #ifdef ERR_INJ_DEBUG printk(KERN_DEBUG "Returns: status=%d,\n", (int)status[cpu]); - printk(KERN_DEBUG "capabilities=%lx,\n", capabilities[cpu]); - printk(KERN_DEBUG "resources=%lx\n", resources[cpu]); + printk(KERN_DEBUG "capabilities=%llx,\n", capabilities[cpu]); + printk(KERN_DEBUG "resources=%llx\n", resources[cpu]); #endif return size; } @@ -131,7 +131,7 @@ show_virtual_to_phys(struct device *dev, struct device_attribute *attr, char *buf) { unsigned int cpu=dev->id; - return sprintf(buf, "%lx\n", phys_addr[cpu]); + return sprintf(buf, "%llx\n", phys_addr[cpu]); } static ssize_t @@ -145,7 +145,7 @@ store_virtual_to_phys(struct device *dev, struct device_attribute *attr, ret = get_user_pages_fast(virt_addr, 1, FOLL_WRITE, NULL); if (ret<=0) { #ifdef ERR_INJ_DEBUG - printk("Virtual address %lx is not existing.\n",virt_addr); + printk("Virtual address %llx is not existing.\n", virt_addr); #endif return -EINVAL; } @@ -163,7 +163,7 @@ show_err_data_buffer(struct device *dev, { unsigned int cpu=dev->id; - return sprintf(buf, "%lx, %lx, %lx\n", + return sprintf(buf, "%llx, %llx, %llx\n", err_data_buffer[cpu].data1, err_data_buffer[cpu].data2, err_data_buffer[cpu].data3); @@ -178,13 +178,13 @@ store_err_data_buffer(struct device *dev, int ret; #ifdef ERR_INJ_DEBUG - printk("write err_data_buffer=[%lx,%lx,%lx] on cpu%d\n", + printk("write err_data_buffer=[%llx,%llx,%llx] on cpu%d\n", err_data_buffer[cpu].data1, err_data_buffer[cpu].data2, err_data_buffer[cpu].data3, cpu); #endif - ret=sscanf(buf, "%lx, %lx, %lx", + ret = sscanf(buf, "%llx, %llx, %llx", &err_data_buffer[cpu].data1, &err_data_buffer[cpu].data2, &err_data_buffer[cpu].data3); diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index d4cae2fc69ca..adf6521525f4 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1824,7 +1824,7 @@ ia64_mca_cpu_init(void *cpu_data) data = mca_bootmem(); first_time = 0; } else - data = (void *)__get_free_pages(GFP_KERNEL, + data = (void *)__get_free_pages(GFP_ATOMIC, get_order(sz)); if (!data) panic("Could not allocate MCA memory for cpu %d\n", diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 03b3a02375ff..c310b4c99fb3 
100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -95,7 +95,7 @@ static int __init build_node_maps(unsigned long start, unsigned long len, * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been * called yet. Note that node 0 will also count all non-existent cpus. */ -static int __meminit early_nr_cpus_node(int node) +static int early_nr_cpus_node(int node) { int cpu, n = 0; @@ -110,7 +110,7 @@ static int __meminit early_nr_cpus_node(int node) * compute_pernodesize - compute size of pernode data * @node: the node id. */ -static unsigned long __meminit compute_pernodesize(int node) +static unsigned long compute_pernodesize(int node) { unsigned long pernodesize = 0, cpus; @@ -367,7 +367,7 @@ static void __init reserve_pernode_space(void) } } -static void __meminit scatter_node_data(void) +static void scatter_node_data(void) { pg_data_t **dst; int node; diff --git a/arch/m68k/include/asm/page_mm.h b/arch/m68k/include/asm/page_mm.h index 9e8f0cc30a2c..2411ea9ef578 100644 --- a/arch/m68k/include/asm/page_mm.h +++ b/arch/m68k/include/asm/page_mm.h @@ -167,7 +167,7 @@ static inline __attribute_const__ int __virt_to_node_shift(void) ((__p) - pgdat->node_mem_map) + pgdat->node_start_pfn; \ }) #else -#define ARCH_PFN_OFFSET (m68k_memory[0].addr) +#define ARCH_PFN_OFFSET (m68k_memory[0].addr >> PAGE_SHIFT) #include <asm-generic/memory_model.h> #endif diff --git a/arch/mips/crypto/poly1305-glue.c b/arch/mips/crypto/poly1305-glue.c index fc881b46d911..bc6110fb98e0 100644 --- a/arch/mips/crypto/poly1305-glue.c +++ b/arch/mips/crypto/poly1305-glue.c @@ -17,7 +17,7 @@ asmlinkage void poly1305_init_mips(void *state, const u8 *key); asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit); asmlinkage void poly1305_emit_mips(void *state, u8 *digest, const u32 *nonce); -void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) { poly1305_init_mips(&dctx->h, key); dctx->s[0] = get_unaligned_le32(key + 16); diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 279be0153f8b..23a140327a0b 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -43,7 +43,7 @@ #include <asm/prom.h> #ifdef CONFIG_MIPS_ELF_APPENDED_DTB -const char __section(".appended_dtb") __appended_dtb[0x100000]; +char __section(".appended_dtb") __appended_dtb[0x100000]; #endif /* CONFIG_MIPS_ELF_APPENDED_DTB */ struct cpuinfo_mips cpu_data[NR_CPUS] __read_mostly; diff --git a/arch/mips/netlogic/common/irq.c b/arch/mips/netlogic/common/irq.c index cf33dd8a487e..c25a2ce5e29f 100644 --- a/arch/mips/netlogic/common/irq.c +++ b/arch/mips/netlogic/common/irq.c @@ -276,10 +276,6 @@ asmlinkage void plat_irq_dispatch(void) } #ifdef CONFIG_CPU_XLP -static const struct irq_domain_ops xlp_pic_irq_domain_ops = { - .xlate = irq_domain_xlate_onetwocell, -}; - static int __init xlp_of_pic_init(struct device_node *node, struct device_node *parent) { @@ -324,7 +320,7 @@ static int __init xlp_of_pic_init(struct device_node *node, xlp_pic_domain = irq_domain_add_legacy(node, n_picirqs, nlm_irq_to_xirq(socid, PIC_IRQ_BASE), PIC_IRQ_BASE, - &xlp_pic_irq_domain_ops, NULL); + &irq_domain_simple_ops, NULL); if (xlp_pic_domain == NULL) { pr_err("PIC %pOFn: Creating legacy domain failed!\n", node); return -EINVAL; diff --git a/arch/nds32/mm/cacheflush.c b/arch/nds32/mm/cacheflush.c index 6eb98a7ad27d..ad5344ef5d33 100644 --- a/arch/nds32/mm/cacheflush.c +++ 
b/arch/nds32/mm/cacheflush.c @@ -238,7 +238,7 @@ void flush_dcache_page(struct page *page) { struct address_space *mapping; - mapping = page_mapping(page); + mapping = page_mapping_file(page); if (mapping && !mapping_mapped(mapping)) set_bit(PG_dcache_dirty, &page->flags); else { diff --git a/arch/parisc/include/asm/cmpxchg.h b/arch/parisc/include/asm/cmpxchg.h index cf5ee9b0b393..84ee232278a6 100644 --- a/arch/parisc/include/asm/cmpxchg.h +++ b/arch/parisc/include/asm/cmpxchg.h @@ -72,7 +72,7 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new_, int size) #endif case 4: return __cmpxchg_u32((unsigned int *)ptr, (unsigned int)old, (unsigned int)new_); - case 1: return __cmpxchg_u8((u8 *)ptr, (u8)old, (u8)new_); + case 1: return __cmpxchg_u8((u8 *)ptr, old & 0xff, new_ & 0xff); } __cmpxchg_called_with_bad_pointer(); return old; diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index 11ece0d07374..b5fbcd2c1780 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -272,7 +272,6 @@ on downward growing arches, it looks like this: regs->gr[23] = 0; \ } while(0) -struct task_struct; struct mm_struct; /* Free all resources held by a thread. */ diff --git a/arch/parisc/math-emu/fpu.h b/arch/parisc/math-emu/fpu.h index 853c19c03828..dec951d40286 100644 --- a/arch/parisc/math-emu/fpu.h +++ b/arch/parisc/math-emu/fpu.h @@ -5,34 +5,10 @@ * Floating-point emulation code * Copyright (C) 2001 Hewlett-Packard (Paul Bame) <bame@debian.org> */ -/* - * BEGIN_DESC - * - * File: - * @(#) pa/fp/fpu.h $Revision: 1.1 $ - * - * Purpose: - * <<please update with a synopis of the functionality provided by this file>> - * - * - * END_DESC -*/ - -#ifdef __NO_PA_HDRS - PA header file -- do not include this header file for non-PA builds. -#endif - #ifndef _MACHINE_FPU_INCLUDED /* allows multiple inclusion */ #define _MACHINE_FPU_INCLUDED -#if 0 -#ifndef _SYS_STDSYMS_INCLUDED -# include <sys/stdsyms.h> -#endif /* _SYS_STDSYMS_INCLUDED */ -#include <machine/pdc/pdc_rqsts.h> -#endif - #define PA83_FPU_FLAG 0x00000001 #define PA89_FPU_FLAG 0x00000002 #define PA2_0_FPU_FLAG 0x00000010 @@ -43,21 +19,19 @@ #define COPR_FP 0x00000080 /* Floating point -- Coprocessor 0 */ #define SFU_MPY_DIVIDE 0x00008000 /* Multiply/Divide __ SFU 0 */ - #define EM_FPU_TYPE_OFFSET 272 /* version of EMULATION software for COPR,0,0 instruction */ #define EMULATION_VERSION 4 /* - * The only was to differeniate between TIMEX and ROLEX (or PCX-S and PCX-T) - * is thorough the potential type field from the PDC_MODEL call. The - * following flags are used at assist this differeniation. + * The only way to differentiate between TIMEX and ROLEX (or PCX-S and PCX-T) + * is through the potential type field from the PDC_MODEL call. + * The following flags are used to assist this differentiation. */ #define ROLEX_POTENTIAL_KEY_FLAGS PDC_MODEL_CPU_KEY_WORD_TO_IO #define TIMEX_POTENTIAL_KEY_FLAGS (PDC_MODEL_CPU_KEY_QUAD_STORE | \ PDC_MODEL_CPU_KEY_RECIP_SQRT) - #endif /* ! 
_MACHINE_FPU_INCLUDED */ diff --git a/arch/powerpc/crypto/sha1-spe-glue.c b/arch/powerpc/crypto/sha1-spe-glue.c index b1e577cbf00c..88e8ea73bfa7 100644 --- a/arch/powerpc/crypto/sha1-spe-glue.c +++ b/arch/powerpc/crypto/sha1-spe-glue.c @@ -107,7 +107,7 @@ static int ppc_spe_sha1_update(struct shash_desc *desc, const u8 *data, src += bytes; len -= bytes; - }; + } memcpy((char *)sctx->buffer, src, len); return 0; diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 6084fa499aa3..f66b63e81c3b 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -191,3 +191,7 @@ $(obj)/prom_init_check: $(src)/prom_init_check.sh $(obj)/prom_init.o FORCE targets += prom_init_check clean-files := vmlinux.lds + +# Force dependency (incbin is bad) +$(obj)/vdso32_wrapper.o : $(obj)/vdso32/vdso32.so.dbg +$(obj)/vdso64_wrapper.o : $(obj)/vdso64/vdso64.so.dbg diff --git a/arch/powerpc/kernel/ptrace/Makefile b/arch/powerpc/kernel/ptrace/Makefile index 8ebc11d1168d..77abd1a5a508 100644 --- a/arch/powerpc/kernel/ptrace/Makefile +++ b/arch/powerpc/kernel/ptrace/Makefile @@ -6,11 +6,11 @@ CFLAGS_ptrace-view.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' obj-y += ptrace.o ptrace-view.o -obj-$(CONFIG_PPC_FPU_REGS) += ptrace-fpu.o +obj-y += ptrace-fpu.o obj-$(CONFIG_COMPAT) += ptrace32.o obj-$(CONFIG_VSX) += ptrace-vsx.o ifneq ($(CONFIG_VSX),y) -obj-$(CONFIG_PPC_FPU_REGS) += ptrace-novsx.o +obj-y += ptrace-novsx.o endif obj-$(CONFIG_ALTIVEC) += ptrace-altivec.o obj-$(CONFIG_SPE) += ptrace-spe.o diff --git a/arch/powerpc/kernel/ptrace/ptrace-decl.h b/arch/powerpc/kernel/ptrace/ptrace-decl.h index 3487f2c9735c..eafe5f0f6289 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-decl.h +++ b/arch/powerpc/kernel/ptrace/ptrace-decl.h @@ -165,22 +165,8 @@ int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data); extern const struct user_regset_view user_ppc_native_view; /* ptrace-fpu */ -#ifdef CONFIG_PPC_FPU_REGS int ptrace_get_fpr(struct task_struct *child, int index, unsigned long *data); int ptrace_put_fpr(struct task_struct *child, int index, unsigned long data); -#else -static inline int -ptrace_get_fpr(struct task_struct *child, int index, unsigned long *data) -{ - return -EIO; -} - -static inline int -ptrace_put_fpr(struct task_struct *child, int index, unsigned long data) -{ - return -EIO; -} -#endif /* ptrace-(no)adv */ void ppc_gethwdinfo(struct ppc_debug_info *dbginfo); diff --git a/arch/powerpc/kernel/ptrace/ptrace-fpu.c b/arch/powerpc/kernel/ptrace/ptrace-fpu.c index 8301cb52dd99..5dca19361316 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-fpu.c +++ b/arch/powerpc/kernel/ptrace/ptrace-fpu.c @@ -8,32 +8,42 @@ int ptrace_get_fpr(struct task_struct *child, int index, unsigned long *data) { +#ifdef CONFIG_PPC_FPU_REGS unsigned int fpidx = index - PT_FPR0; +#endif if (index > PT_FPSCR) return -EIO; +#ifdef CONFIG_PPC_FPU_REGS flush_fp_to_thread(child); if (fpidx < (PT_FPSCR - PT_FPR0)) memcpy(data, &child->thread.TS_FPR(fpidx), sizeof(long)); else *data = child->thread.fp_state.fpscr; +#else + *data = 0; +#endif return 0; } int ptrace_put_fpr(struct task_struct *child, int index, unsigned long data) { +#ifdef CONFIG_PPC_FPU_REGS unsigned int fpidx = index - PT_FPR0; +#endif if (index > PT_FPSCR) return -EIO; +#ifdef CONFIG_PPC_FPU_REGS flush_fp_to_thread(child); if (fpidx < (PT_FPSCR - PT_FPR0)) memcpy(&child->thread.TS_FPR(fpidx), &data, sizeof(long)); else child->thread.fp_state.fpscr = data; +#endif return 0; } diff --git a/arch/powerpc/kernel/ptrace/ptrace-novsx.c 
b/arch/powerpc/kernel/ptrace/ptrace-novsx.c index b3b36835658a..7433f3db979a 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-novsx.c +++ b/arch/powerpc/kernel/ptrace/ptrace-novsx.c @@ -21,12 +21,16 @@ int fpr_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { +#ifdef CONFIG_PPC_FPU_REGS BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) != offsetof(struct thread_fp_state, fpr[32])); flush_fp_to_thread(target); return membuf_write(&to, &target->thread.fp_state, 33 * sizeof(u64)); +#else + return membuf_write(&to, &empty_zero_page, 33 * sizeof(u64)); +#endif } /* @@ -46,6 +50,7 @@ int fpr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { +#ifdef CONFIG_PPC_FPU_REGS BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) != offsetof(struct thread_fp_state, fpr[32])); @@ -53,4 +58,7 @@ int fpr_set(struct task_struct *target, const struct user_regset *regset, return user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.fp_state, 0, -1); +#else + return 0; +#endif } diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c index 2bad8068f598..6ccffc65ac97 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-view.c +++ b/arch/powerpc/kernel/ptrace/ptrace-view.c @@ -522,13 +522,11 @@ static const struct user_regset native_regsets[] = { .size = sizeof(long), .align = sizeof(long), .regset_get = gpr_get, .set = gpr_set }, -#ifdef CONFIG_PPC_FPU_REGS [REGSET_FPR] = { .core_note_type = NT_PRFPREG, .n = ELF_NFPREG, .size = sizeof(double), .align = sizeof(double), .regset_get = fpr_get, .set = fpr_set }, -#endif #ifdef CONFIG_ALTIVEC [REGSET_VMX] = { .core_note_type = NT_PPC_VMX, .n = 34, diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 75ee918a120a..f651b992fe01 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -775,7 +775,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset, else prepare_save_user_regs(1); - if (!user_write_access_begin(frame, sizeof(*frame))) + if (!user_access_begin(frame, sizeof(*frame))) goto badframe; /* Put the siginfo & fill in most of the ucontext */ @@ -809,17 +809,15 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset, unsafe_put_user(PPC_INST_ADDI + __NR_rt_sigreturn, &mctx->mc_pad[0], failed); unsafe_put_user(PPC_INST_SC, &mctx->mc_pad[1], failed); + asm("dcbst %y0; sync; icbi %y0; sync" :: "Z" (mctx->mc_pad[0])); } unsafe_put_sigset_t(&frame->uc.uc_sigmask, oldset, failed); - user_write_access_end(); + user_access_end(); if (copy_siginfo_to_user(&frame->info, &ksig->info)) goto badframe; - if (tramp == (unsigned long)mctx->mc_pad) - flush_icache_range(tramp, tramp + 2 * sizeof(unsigned long)); - regs->link = tramp; #ifdef CONFIG_PPC_FPU_REGS @@ -844,7 +842,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset, return 0; failed: - user_write_access_end(); + user_access_end(); badframe: signal_fault(tsk, regs, "handle_rt_signal32", frame); @@ -879,7 +877,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset, else prepare_save_user_regs(1); - if (!user_write_access_begin(frame, sizeof(*frame))) + if (!user_access_begin(frame, sizeof(*frame))) goto badframe; sc = (struct sigcontext __user *) &frame->sctx; @@ -908,11 +906,9 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset, /* Set up the sigreturn trampoline: li r0,sigret; sc */ unsafe_put_user(PPC_INST_ADDI + 
__NR_sigreturn, &mctx->mc_pad[0], failed); unsafe_put_user(PPC_INST_SC, &mctx->mc_pad[1], failed); + asm("dcbst %y0; sync; icbi %y0; sync" :: "Z" (mctx->mc_pad[0])); } - user_write_access_end(); - - if (tramp == (unsigned long)mctx->mc_pad) - flush_icache_range(tramp, tramp + 2 * sizeof(unsigned long)); + user_access_end(); regs->link = tramp; @@ -935,7 +931,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset, return 0; failed: - user_write_access_end(); + user_access_end(); badframe: signal_fault(tsk, regs, "handle_signal32", frame); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 764170fdb0f7..3805519a6469 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -887,7 +887,8 @@ static long pSeries_lpar_hpte_updatepp(unsigned long slot, want_v = hpte_encode_avpn(vpn, psize, ssize); - flags = (newpp & 7) | H_AVPN; + flags = (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO)) | H_AVPN; + flags |= (newpp & HPTE_R_KEY_HI) >> 48; if (mmu_has_feature(MMU_FTR_KERNEL_RO)) /* Move pp0 into bit 8 (IBM 55) */ flags |= (newpp & HPTE_R_PP0) >> 55; diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index ea4d6a660e0d..e83e0891272d 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -452,12 +452,28 @@ static int do_suspend(void) return ret; } +/** + * struct pseries_suspend_info - State shared between CPUs for join/suspend. + * @counter: Threads are to increment this upon resuming from suspend + * or if an error is received from H_JOIN. The thread which performs + * the first increment (i.e. sets it to 1) is responsible for + * waking the other threads. + * @done: False if join/suspend is in progress. True if the operation is + * complete (successful or not). + */ +struct pseries_suspend_info { + atomic_t counter; + bool done; +}; + static int do_join(void *arg) { - atomic_t *counter = arg; + struct pseries_suspend_info *info = arg; + atomic_t *counter = &info->counter; long hvrc; int ret; +retry: /* Must ensure MSR.EE off for H_JOIN. */ hard_irq_disable(); hvrc = plpar_hcall_norets(H_JOIN); @@ -473,8 +489,20 @@ static int do_join(void *arg) case H_SUCCESS: /* * The suspend is complete and this cpu has received a - * prod. + * prod, or we've received a stray prod from unrelated + * code (e.g. paravirt spinlocks) and we need to join + * again. + * + * This barrier orders the return from H_JOIN above vs + * the load of info->done. It pairs with the barrier + * in the wakeup/prod path below. */ + smp_mb(); + if (READ_ONCE(info->done) == false) { + pr_info_ratelimited("premature return from H_JOIN on CPU %i, retrying", + smp_processor_id()); + goto retry; + } ret = 0; break; case H_BAD_MODE: @@ -488,6 +516,13 @@ static int do_join(void *arg) if (atomic_inc_return(counter) == 1) { pr_info("CPU %u waking all threads\n", smp_processor_id()); + WRITE_ONCE(info->done, true); + /* + * This barrier orders the store to info->done vs subsequent + * H_PRODs to wake the other CPUs. It pairs with the barrier + * in the H_SUCCESS case above. 
+ */ + smp_mb(); prod_others(); } /* @@ -535,11 +570,16 @@ static int pseries_suspend(u64 handle) int ret; while (true) { - atomic_t counter = ATOMIC_INIT(0); + struct pseries_suspend_info info; unsigned long vasi_state; int vasi_err; - ret = stop_machine(do_join, &counter, cpu_online_mask); + info = (struct pseries_suspend_info) { + .counter = ATOMIC_INIT(0), + .done = false, + }; + + ret = stop_machine(do_join, &info, cpu_online_mask); if (ret == 0) break; /* diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 87d7b52f278f..4515a10c5d22 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -153,7 +153,7 @@ config ARCH_FLATMEM_ENABLE config ARCH_SPARSEMEM_ENABLE def_bool y depends on MMU - select SPARSEMEM_STATIC if 32BIT && SPARSMEM + select SPARSEMEM_STATIC if 32BIT && SPARSEMEM select SPARSEMEM_VMEMMAP_ENABLE if 64BIT config ARCH_SELECT_MEMORY_MODEL @@ -314,7 +314,7 @@ endchoice # Common NUMA Features config NUMA bool "NUMA Memory Allocation and Scheduler Support" - depends on SMP + depends on SMP && MMU select GENERIC_ARCH_NUMA select OF_NUMA select ARCH_SUPPORTS_NUMA_BALANCING diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index 824b2c9da75b..f944062c9d99 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -306,7 +306,9 @@ do { \ * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and @x must be assignable - * to the result of dereferencing @ptr. + * to the result of dereferencing @ptr. The value of @x is copied to avoid + * re-ordering where @x is evaluated inside the block that enables user-space + * access (thus bypassing user space protection if @x is a function). * * Caller must check the pointer with access_ok() before calling this * function. @@ -316,12 +318,13 @@ do { \ #define __put_user(x, ptr) \ ({ \ __typeof__(*(ptr)) __user *__gu_ptr = (ptr); \ + __typeof__(*__gu_ptr) __val = (x); \ long __pu_err = 0; \ \ __chk_user_ptr(__gu_ptr); \ \ __enable_user_access(); \ - __put_user_nocheck(x, __gu_ptr, __pu_err); \ + __put_user_nocheck(__val, __gu_ptr, __pu_err); \ __disable_user_access(); \ \ __pu_err; \ diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 744f3209c48d..83095faa680e 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -130,6 +130,9 @@ skip_context_tracking: */ andi t0, s1, SR_PIE beqz t0, 1f + /* kprobes, entered via ebreak, must have interrupts disabled. 
*/ + li t0, EXC_BREAKPOINT + beq s4, t0, 1f #ifdef CONFIG_TRACE_IRQFLAGS call trace_hardirqs_on #endif @@ -447,6 +450,7 @@ ENDPROC(__switch_to) #endif .section ".rodata" + .align LGREG /* Exception vector table */ ENTRY(excp_vect_table) RISCV_PTR do_trap_insn_misaligned diff --git a/arch/riscv/kernel/probes/ftrace.c b/arch/riscv/kernel/probes/ftrace.c index 17ca5e923bb0..aab85a82f419 100644 --- a/arch/riscv/kernel/probes/ftrace.c +++ b/arch/riscv/kernel/probes/ftrace.c @@ -9,10 +9,16 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; struct pt_regs *regs; struct kprobe_ctlblk *kcb; + int bit; + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) - return; + goto out; regs = ftrace_get_regs(fregs); kcb = get_kprobe_ctlblk(); @@ -45,6 +51,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, */ __this_cpu_write(current_kprobe, NULL); } +out: + preempt_enable_notrace(); + ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 3f893c9d9d85..2b3e0cb90d78 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -14,7 +14,7 @@ #include <asm/stacktrace.h> -register const unsigned long sp_in_global __asm__("sp"); +register unsigned long sp_in_global __asm__("sp"); #ifdef CONFIG_FRAME_POINTER diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 0879b5df11b9..1357abf79570 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -178,6 +178,7 @@ asmlinkage __visible void do_trap_break(struct pt_regs *regs) else die(regs, "Kernel BUG"); } +NOKPROBE_SYMBOL(do_trap_break); #ifdef CONFIG_GENERIC_BUG int is_valid_bugaddr(unsigned long pc) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 8f17519208c7..c5dbd55cbf7c 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -328,3 +328,4 @@ good_area: } return; } +NOKPROBE_SYMBOL(do_page_fault); diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 4f85c6d0ddf8..937d13ce9ab8 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -216,7 +216,7 @@ void __init kasan_init(void) break; kasan_populate(kasan_mem_to_shadow(start), kasan_mem_to_shadow(end)); - }; + } for (i = 0; i < PTRS_PER_PTE; i++) set_pte(&kasan_early_shadow_pte[i], diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h index ee056f4a4fa3..2b543163d90a 100644 --- a/arch/s390/include/asm/stacktrace.h +++ b/arch/s390/include/asm/stacktrace.h @@ -12,6 +12,7 @@ enum stack_type { STACK_TYPE_IRQ, STACK_TYPE_NODAT, STACK_TYPE_RESTART, + STACK_TYPE_MCCK, }; struct stack_info { diff --git a/arch/s390/include/asm/vdso/data.h b/arch/s390/include/asm/vdso/data.h index 7b3cdb4a5f48..73ee89142666 100644 --- a/arch/s390/include/asm/vdso/data.h +++ b/arch/s390/include/asm/vdso/data.h @@ -6,7 +6,7 @@ #include <vdso/datapage.h> struct arch_vdso_data { - __u64 tod_steering_delta; + __s64 tod_steering_delta; __u64 tod_steering_end; }; diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c index af013b4244d3..2da027359798 100644 --- a/arch/s390/kernel/cpcmd.c +++ b/arch/s390/kernel/cpcmd.c @@ -37,10 +37,12 @@ static int diag8_noresponse(int cmdlen) static int diag8_response(int cmdlen, char *response, int *rlen) { + unsigned long _cmdlen = cmdlen | 
0x40000000L; + unsigned long _rlen = *rlen; register unsigned long reg2 asm ("2") = (addr_t) cpcmd_buf; register unsigned long reg3 asm ("3") = (addr_t) response; - register unsigned long reg4 asm ("4") = cmdlen | 0x40000000L; - register unsigned long reg5 asm ("5") = *rlen; + register unsigned long reg4 asm ("4") = _cmdlen; + register unsigned long reg5 asm ("5") = _rlen; asm volatile( " diag %2,%0,0x8\n" diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 0dc4b258b98d..db1bc00229ca 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -79,6 +79,15 @@ static bool in_nodat_stack(unsigned long sp, struct stack_info *info) return in_stack(sp, info, STACK_TYPE_NODAT, top - THREAD_SIZE, top); } +static bool in_mcck_stack(unsigned long sp, struct stack_info *info) +{ + unsigned long frame_size, top; + + frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); + top = S390_lowcore.mcck_stack + frame_size; + return in_stack(sp, info, STACK_TYPE_MCCK, top - THREAD_SIZE, top); +} + static bool in_restart_stack(unsigned long sp, struct stack_info *info) { unsigned long frame_size, top; @@ -108,7 +117,8 @@ int get_stack_info(unsigned long sp, struct task_struct *task, /* Check per-cpu stacks */ if (!in_irq_stack(sp, info) && !in_nodat_stack(sp, info) && - !in_restart_stack(sp, info)) + !in_restart_stack(sp, info) && + !in_mcck_stack(sp, info)) goto unknown; recursion_check: diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index c10b9f31eef7..12de7a9c85b3 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -401,15 +401,13 @@ ENTRY(\name) brasl %r14,.Lcleanup_sie_int #endif 0: CHECK_STACK __LC_SAVE_AREA_ASYNC - lgr %r11,%r15 aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - stg %r11,__SF_BACKCHAIN(%r15) j 2f 1: BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP lctlg %c1,%c1,__LC_KERNEL_ASCE lg %r15,__LC_KERNEL_STACK - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) -2: la %r11,STACK_FRAME_OVERHEAD(%r15) +2: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) # clear user controlled registers to prevent speculative use xgr %r0,%r0 @@ -445,6 +443,7 @@ INT_HANDLER io_int_handler,__LC_IO_OLD_PSW,do_io_irq * Load idle PSW. 
*/ ENTRY(psw_idle) + stg %r14,(__SF_GPRS+8*8)(%r15) stg %r3,__SF_EMPTY(%r15) larl %r1,psw_idle_exit stg %r1,__SF_EMPTY+8(%r15) diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 601c21791338..714269e10eec 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -174,7 +174,7 @@ void noinstr do_ext_irq(struct pt_regs *regs) memcpy(®s->int_code, &S390_lowcore.ext_cpu_addr, 4); regs->int_parm = S390_lowcore.ext_params; - regs->int_parm_long = *(unsigned long *)S390_lowcore.ext_params2; + regs->int_parm_long = S390_lowcore.ext_params2; from_idle = !user_mode(regs) && regs->psw.addr == (unsigned long)psw_idle_exit; if (from_idle) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 60da976eee6f..72134f9f6ff5 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -354,7 +354,7 @@ static int __init stack_realloc(void) if (!new) panic("Couldn't allocate machine check stack"); WRITE_ONCE(S390_lowcore.mcck_stack, new + STACK_INIT_OFFSET); - memblock_free(old, THREAD_SIZE); + memblock_free_late(old, THREAD_SIZE); return 0; } early_initcall(stack_realloc); diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 7f1266c24f6b..101477b3e263 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -24,12 +24,6 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, } } -/* - * This function returns an error if it detects any unreliable features of the - * stack. Otherwise it guarantees that the stack trace is reliable. - * - * If the task is not 'current', the caller *must* ensure the task is inactive. - */ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task) { diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 165da961f901..326cb8f75f58 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -80,10 +80,12 @@ void __init time_early_init(void) { struct ptff_qto qto; struct ptff_qui qui; + int cs; /* Initialize TOD steering parameters */ tod_steering_end = tod_clock_base.tod; - vdso_data->arch_data.tod_steering_end = tod_steering_end; + for (cs = 0; cs < CS_BASES; cs++) + vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; if (!test_facility(28)) return; @@ -366,6 +368,7 @@ static void clock_sync_global(unsigned long delta) { unsigned long now, adj; struct ptff_qto qto; + int cs; /* Fixup the monotonic sched clock. */ tod_clock_base.eitod += delta; @@ -381,7 +384,10 @@ static void clock_sync_global(unsigned long delta) panic("TOD clock sync offset %li is too large to drift\n", tod_steering_delta); tod_steering_end = now + (abs(tod_steering_delta) << 15); - vdso_data->arch_data.tod_steering_end = tod_steering_end; + for (cs = 0; cs < CS_BASES; cs++) { + vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; + vdso_data[cs].arch_data.tod_steering_delta = tod_steering_delta; + } /* Update LPAR offset. 
*/ if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2792879d398e..f3db131be563 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -165,6 +165,7 @@ config X86 select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64 select HAVE_ARCH_USERFAULTFD_WP if X86_64 && USERFAULTFD select HAVE_ARCH_VMAP_STACK if X86_64 + select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_ASM_MODVERSIONS select HAVE_CMPXCHG_DOUBLE @@ -571,6 +572,7 @@ config X86_UV depends on X86_EXTENDED_PLATFORM depends on NUMA depends on EFI + depends on KEXEC_CORE depends on X86_X2APIC depends on PCI help @@ -777,6 +779,7 @@ if HYPERVISOR_GUEST config PARAVIRT bool "Enable paravirtualization code" + depends on HAVE_STATIC_CALL help This changes the kernel so it can modify itself when it is run under a hypervisor, potentially improving performance significantly @@ -1406,7 +1409,7 @@ config HIGHMEM4G config HIGHMEM64G bool "64GB" - depends on !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6 + depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6 select X86_PAE help Select this if you have a 32-bit processor and more than 4 @@ -1518,6 +1521,7 @@ config AMD_MEM_ENCRYPT select ARCH_USE_MEMREMAP_PROT select ARCH_HAS_FORCE_DMA_UNENCRYPTED select INSTRUCTION_DECODER + select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS help Say yes to enable support for the encryption of system memory. This requires an AMD processor that supports Secure Memory @@ -1931,6 +1935,7 @@ config X86_SGX depends on CRYPTO_SHA256=y select SRCU select MMU_NOTIFIER + select NUMA_KEEP_MEMINFO if NUMA help Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions that can be used by applications to set aside private regions of code diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 2d6d5a28c3bf..78faf9c7e3ae 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -27,12 +27,13 @@ endif REALMODE_CFLAGS := -m16 -g -Os -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ - -mno-mmx -mno-sse + -mno-mmx -mno-sse $(call cc-option,-fcf-protection=none) REALMODE_CFLAGS += -ffreestanding REALMODE_CFLAGS += -fno-stack-protector REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -Wno-address-of-packed-member) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4)) +REALMODE_CFLAGS += $(CLANG_FLAGS) export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index e0bc3988c3fa..6e5522aebbbd 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -46,6 +46,7 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS # Disable relocation relaxation in case the link is not PIE. KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h +KBUILD_CFLAGS += $(CLANG_FLAGS) # sev-es.c indirectly inludes inat-table.h which is generated during # compilation and stored in $(objtree). 
Add the directory to the includes so diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S index c4bb0f9363f5..95a223b3e56a 100644 --- a/arch/x86/boot/compressed/efi_thunk_64.S +++ b/arch/x86/boot/compressed/efi_thunk_64.S @@ -5,7 +5,7 @@ * Early support for invoking 32-bit EFI services from a 64-bit kernel. * * Because this thunking occurs before ExitBootServices() we have to - * restore the firmware's 32-bit GDT before we make EFI serivce calls, + * restore the firmware's 32-bit GDT before we make EFI service calls, * since the firmware's 32-bit IDT is still currently installed and it * needs to be able to service interrupts. * diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index e94874f4bbc1..a2347ded77ea 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -34,6 +34,7 @@ #include <asm/asm-offsets.h> #include <asm/bootparam.h> #include <asm/desc_defs.h> +#include <asm/trapnr.h> #include "pgtable.h" /* @@ -107,9 +108,19 @@ SYM_FUNC_START(startup_32) movl %eax, %gs movl %eax, %ss -/* setup a stack and make sure cpu supports long mode. */ + /* Setup a stack and load CS from current GDT */ leal rva(boot_stack_end)(%ebp), %esp + pushl $__KERNEL32_CS + leal rva(1f)(%ebp), %eax + pushl %eax + lretl +1: + + /* Setup Exception handling for SEV-ES */ + call startup32_load_idt + + /* Make sure cpu supports long mode. */ call verify_cpu testl %eax, %eax jnz .Lno_longmode @@ -172,11 +183,21 @@ SYM_FUNC_START(startup_32) */ call get_sev_encryption_bit xorl %edx, %edx +#ifdef CONFIG_AMD_MEM_ENCRYPT testl %eax, %eax jz 1f subl $32, %eax /* Encryption bit is always above bit 31 */ bts %eax, %edx /* Set encryption mask for page tables */ + /* + * Mark SEV as active in sev_status so that startup32_check_sev_cbit() + * will do a check. The sev_status memory will be fully initialized + * with the contents of MSR_AMD_SEV_STATUS later in + * set_sev_encryption_mask(). For now it is sufficient to know that SEV + * is active. + */ + movl $1, rva(sev_status)(%ebp) 1: +#endif /* Initialize Page tables to 0 */ leal rva(pgtable)(%ebx), %edi @@ -231,7 +252,7 @@ SYM_FUNC_START(startup_32) /* * Setup for the jump to 64bit mode * - * When the jump is performend we will be in long mode but + * When the jump is performed we will be in long mode but * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 * (and in turn EFER.LMA = 1). To jump into 64bit mode we use * the new gdt/idt that has __KERNEL_CS with CS.L = 1. 
@@ -261,6 +282,9 @@ SYM_FUNC_START(startup_32) movl %esi, %edx 1: #endif + /* Check if the C-bit position is correct when SEV is active */ + call startup32_check_sev_cbit + pushl $__KERNEL_CS pushl %eax @@ -694,6 +718,19 @@ SYM_DATA_START(boot_idt) .endr SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end) +#ifdef CONFIG_AMD_MEM_ENCRYPT +SYM_DATA_START(boot32_idt_desc) + .word boot32_idt_end - boot32_idt - 1 + .long 0 +SYM_DATA_END(boot32_idt_desc) + .balign 8 +SYM_DATA_START(boot32_idt) + .rept 32 + .quad 0 + .endr +SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end) +#endif + #ifdef CONFIG_EFI_STUB SYM_DATA(image_offset, .long 0) #endif @@ -786,6 +823,137 @@ SYM_DATA_START_LOCAL(loaded_image_proto) SYM_DATA_END(loaded_image_proto) #endif +#ifdef CONFIG_AMD_MEM_ENCRYPT + __HEAD + .code32 +/* + * Write an IDT entry into boot32_idt + * + * Parameters: + * + * %eax: Handler address + * %edx: Vector number + * + * Physical offset is expected in %ebp + */ +SYM_FUNC_START(startup32_set_idt_entry) + push %ebx + push %ecx + + /* IDT entry address to %ebx */ + leal rva(boot32_idt)(%ebp), %ebx + shl $3, %edx + addl %edx, %ebx + + /* Build IDT entry, lower 4 bytes */ + movl %eax, %edx + andl $0x0000ffff, %edx # Target code segment offset [15:0] + movl $__KERNEL32_CS, %ecx # Target code segment selector + shl $16, %ecx + orl %ecx, %edx + + /* Store lower 4 bytes to IDT */ + movl %edx, (%ebx) + + /* Build IDT entry, upper 4 bytes */ + movl %eax, %edx + andl $0xffff0000, %edx # Target code segment offset [31:16] + orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate + + /* Store upper 4 bytes to IDT */ + movl %edx, 4(%ebx) + + pop %ecx + pop %ebx + ret +SYM_FUNC_END(startup32_set_idt_entry) +#endif + +SYM_FUNC_START(startup32_load_idt) +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* #VC handler */ + leal rva(startup32_vc_handler)(%ebp), %eax + movl $X86_TRAP_VC, %edx + call startup32_set_idt_entry + + /* Load IDT */ + leal rva(boot32_idt)(%ebp), %eax + movl %eax, rva(boot32_idt_desc+2)(%ebp) + lidt rva(boot32_idt_desc)(%ebp) +#endif + ret +SYM_FUNC_END(startup32_load_idt) + +/* + * Check for the correct C-bit position when the startup_32 boot-path is used. + * + * The check makes use of the fact that all memory is encrypted when paging is + * disabled. The function creates 64 bits of random data using the RDRAND + * instruction. RDRAND is mandatory for SEV guests, so always available. If the + * hypervisor violates that the kernel will crash right here. + * + * The 64 bits of random data are stored to a memory location and at the same + * time kept in the %eax and %ebx registers. Since encryption is always active + * when paging is off the random data will be stored encrypted in main memory. + * + * Then paging is enabled. When the C-bit position is correct all memory is + * still mapped encrypted and comparing the register values with memory will + * succeed. An incorrect C-bit position will map all memory unencrypted, so that + * the compare will use the encrypted random data and fail. + */ +SYM_FUNC_START(startup32_check_sev_cbit) +#ifdef CONFIG_AMD_MEM_ENCRYPT + pushl %eax + pushl %ebx + pushl %ecx + pushl %edx + + /* Check for non-zero sev_status */ + movl rva(sev_status)(%ebp), %eax + testl %eax, %eax + jz 4f + + /* + * Get two 32-bit random values - Don't bail out if RDRAND fails + * because it is better to prevent forward progress if no random value + * can be gathered. 
+ */ +1: rdrand %eax + jnc 1b +2: rdrand %ebx + jnc 2b + + /* Store to memory and keep it in the registers */ + movl %eax, rva(sev_check_data)(%ebp) + movl %ebx, rva(sev_check_data+4)(%ebp) + + /* Enable paging to see if encryption is active */ + movl %cr0, %edx /* Backup %cr0 in %edx */ + movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */ + movl %ecx, %cr0 + + cmpl %eax, rva(sev_check_data)(%ebp) + jne 3f + cmpl %ebx, rva(sev_check_data+4)(%ebp) + jne 3f + + movl %edx, %cr0 /* Restore previous %cr0 */ + + jmp 4f + +3: /* Check failed - hlt the machine */ + hlt + jmp 3b + +4: + popl %edx + popl %ecx + popl %ebx + popl %eax +#endif + ret +SYM_FUNC_END(startup32_check_sev_cbit) + /* * Stack and heap for uncompression */ diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c index 804a502ee0d2..9b93567d663a 100644 --- a/arch/x86/boot/compressed/idt_64.c +++ b/arch/x86/boot/compressed/idt_64.c @@ -52,3 +52,17 @@ void load_stage2_idt(void) load_boot_idt(&boot_idt_desc); } + +void cleanup_exception_handling(void) +{ + /* + * Flush GHCB from cache and map it encrypted again when running as + * SEV-ES guest. + */ + sev_es_shutdown_ghcb(); + + /* Set a null-idt, disabling #PF and #VC handling */ + boot_idt_desc.size = 0; + boot_idt_desc.address = 0; + load_boot_idt(&boot_idt_desc); +} diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index b92fffbe761f..e36690778497 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -639,9 +639,9 @@ static bool process_mem_region(struct mem_vector *region, if (slot_area_index == MAX_SLOT_AREA) { debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n"); - return 1; + return true; } - return 0; + return false; } #if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI) diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index aa561795efd1..c1e81a848b2a 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -23,12 +23,6 @@ SYM_FUNC_START(get_sev_encryption_bit) push %ecx push %edx - /* Check if running under a hypervisor */ - movl $1, %eax - cpuid - bt $31, %ecx /* Check the hypervisor bit */ - jnc .Lno_sev - movl $0x80000000, %eax /* CPUID to check the highest leaf */ cpuid cmpl $0x8000001f, %eax /* See if 0x8000001f is available */ @@ -67,10 +61,132 @@ SYM_FUNC_START(get_sev_encryption_bit) ret SYM_FUNC_END(get_sev_encryption_bit) +/** + * sev_es_req_cpuid - Request a CPUID value from the Hypervisor using + * the GHCB MSR protocol + * + * @%eax: Register to request (0=EAX, 1=EBX, 2=ECX, 3=EDX) + * @%edx: CPUID Function + * + * Returns 0 in %eax on success, non-zero on failure + * %edx returns CPUID value on success + */ +SYM_CODE_START_LOCAL(sev_es_req_cpuid) + shll $30, %eax + orl $0x00000004, %eax + movl $MSR_AMD64_SEV_ES_GHCB, %ecx + wrmsr + rep; vmmcall # VMGEXIT + rdmsr + + /* Check response */ + movl %eax, %ecx + andl $0x3ffff000, %ecx # Bits [12-29] MBZ + jnz 2f + + /* Check return code */ + andl $0xfff, %eax + cmpl $5, %eax + jne 2f + + /* All good - return success */ + xorl %eax, %eax +1: + ret +2: + movl $-1, %eax + jmp 1b +SYM_CODE_END(sev_es_req_cpuid) + +SYM_CODE_START(startup32_vc_handler) + pushl %eax + pushl %ebx + pushl %ecx + pushl %edx + + /* Keep CPUID function in %ebx */ + movl %eax, %ebx + + /* Check if error-code == SVM_EXIT_CPUID */ + cmpl $0x72, 16(%esp) + jne .Lfail + + movl $0, %eax # Request CPUID[fn].EAX + movl %ebx, %edx 
# CPUID fn + call sev_es_req_cpuid # Call helper + testl %eax, %eax # Check return code + jnz .Lfail + movl %edx, 12(%esp) # Store result + + movl $1, %eax # Request CPUID[fn].EBX + movl %ebx, %edx # CPUID fn + call sev_es_req_cpuid # Call helper + testl %eax, %eax # Check return code + jnz .Lfail + movl %edx, 8(%esp) # Store result + + movl $2, %eax # Request CPUID[fn].ECX + movl %ebx, %edx # CPUID fn + call sev_es_req_cpuid # Call helper + testl %eax, %eax # Check return code + jnz .Lfail + movl %edx, 4(%esp) # Store result + + movl $3, %eax # Request CPUID[fn].EDX + movl %ebx, %edx # CPUID fn + call sev_es_req_cpuid # Call helper + testl %eax, %eax # Check return code + jnz .Lfail + movl %edx, 0(%esp) # Store result + + /* + * Sanity check CPUID results from the Hypervisor. See comment in + * do_vc_no_ghcb() for more details on why this is necessary. + */ + + /* Fail if SEV leaf not available in CPUID[0x80000000].EAX */ + cmpl $0x80000000, %ebx + jne .Lcheck_sev + cmpl $0x8000001f, 12(%esp) + jb .Lfail + jmp .Ldone + +.Lcheck_sev: + /* Fail if SEV bit not set in CPUID[0x8000001f].EAX[1] */ + cmpl $0x8000001f, %ebx + jne .Ldone + btl $1, 12(%esp) + jnc .Lfail + +.Ldone: + popl %edx + popl %ecx + popl %ebx + popl %eax + + /* Remove error code */ + addl $4, %esp + + /* Jump over CPUID instruction */ + addl $2, (%esp) + + iret +.Lfail: + /* Send terminate request to Hypervisor */ + movl $0x100, %eax + xorl %edx, %edx + movl $MSR_AMD64_SEV_ES_GHCB, %ecx + wrmsr + rep; vmmcall + + /* If request fails, go to hlt loop */ + hlt + jmp .Lfail +SYM_CODE_END(startup32_vc_handler) + .code64 #include "../../kernel/sev_verify_cbit.S" - SYM_FUNC_START(set_sev_encryption_mask) #ifdef CONFIG_AMD_MEM_ENCRYPT push %rbp diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 267e7f93050e..dde042f64cca 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -430,8 +430,6 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, error("Destination address too large"); #endif #ifndef CONFIG_RELOCATABLE - if ((unsigned long)output != LOAD_PHYSICAL_ADDR) - error("Destination address does not match LOAD_PHYSICAL_ADDR"); if (virt_addr != LOAD_PHYSICAL_ADDR) error("Destination virtual address changed when not relocatable"); #endif @@ -443,11 +441,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, handle_relocations(output, output_len, virt_addr); debug_putstr("done.\nBooting the kernel.\n"); - /* - * Flush GHCB from cache and map it encrypted again when running as - * SEV-ES guest. 
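[Aside] The sev_es_req_cpuid/startup32_vc_handler assembly above implements the GHCB MSR protocol for CPUID: request code 0x004 plus the register index in the low MSR half, the CPUID function in the high half, a VMGEXIT, then a check for response code 0x005 with bits 12-29 clear and the value in the high half. A rough C sketch of the same flow; the wrmsr_ghcb/rdmsr_ghcb/vmgexit helpers are assumed for illustration and are not real kernel APIs:

#include <stdint.h>

extern void wrmsr_ghcb(uint32_t lo, uint32_t hi);	/* assumed: write MSR_AMD64_SEV_ES_GHCB */
extern void vmgexit(void);				/* assumed: rep; vmmcall */
extern void rdmsr_ghcb(uint32_t *lo, uint32_t *hi);	/* assumed: read the response back */

static int ghcb_msr_cpuid(uint32_t fn, uint32_t reg_idx, uint32_t *value)
{
	uint32_t lo = (reg_idx << 30) | 0x004;	/* CPUID request, register index 0-3 */
	uint32_t hi = fn;			/* CPUID function to query */

	wrmsr_ghcb(lo, hi);
	vmgexit();
	rdmsr_ghcb(&lo, &hi);

	if (lo & 0x3ffff000)			/* bits 12-29 must be zero */
		return -1;
	if ((lo & 0xfff) != 0x005)		/* 0x005 = CPUID response */
		return -1;

	*value = hi;				/* requested register value */
	return 0;
}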
- */ - sev_es_shutdown_ghcb(); + /* Disable exception handling before booting the kernel */ + cleanup_exception_handling(); return output; } diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 901ea5ebec22..e5612f035498 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -155,6 +155,12 @@ extern pteval_t __default_kernel_pte_mask; extern gate_desc boot_idt[BOOT_IDT_ENTRIES]; extern struct desc_ptr boot_idt_desc; +#ifdef CONFIG_X86_64 +void cleanup_exception_handling(void); +#else +static inline void cleanup_exception_handling(void) { } +#endif + /* IDT Entry Points */ void boot_page_fault(void); void boot_stage1_vc(void); diff --git a/arch/x86/boot/compressed/sev-es.c b/arch/x86/boot/compressed/sev-es.c index 27826c265aab..d904bd56b3e3 100644 --- a/arch/x86/boot/compressed/sev-es.c +++ b/arch/x86/boot/compressed/sev-es.c @@ -200,14 +200,8 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) } finish: - if (result == ES_OK) { + if (result == ES_OK) vc_finish_insn(&ctxt); - } else if (result != ES_RETRY) { - /* - * For now, just halt the machine. That makes debugging easier, - * later we just call sev_es_terminate() here. - */ - while (true) - asm volatile("hlt\n"); - } + else if (result != ES_RETRY) + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); } diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index 7c4c7b2fbf05..98cf3b4e4c9f 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -24,7 +24,7 @@ /* * Copyright 2012 Xyratex Technology Limited * - * Wrappers for kernel crypto shash api to pclmulqdq crc32 imlementation. + * Wrappers for kernel crypto shash api to pclmulqdq crc32 implementation. 
*/ #include <linux/init.h> #include <linux/module.h> diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c index 5af8021b98ce..6706b6cb1d0f 100644 --- a/arch/x86/crypto/curve25519-x86_64.c +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -114,11 +114,11 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2) ); } -/* Computes the field substraction of two field elements */ +/* Computes the field subtraction of two field elements */ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2) { asm volatile( - /* Compute the raw substraction of f1-f2 */ + /* Compute the raw subtraction of f1-f2 */ " movq 0(%1), %%r8;" " subq 0(%2), %%r8;" " movq 8(%1), %%r9;" @@ -135,7 +135,7 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2) " mov $38, %%rcx;" " cmovc %%rcx, %%rax;" - /* Step 2: Substract carry*38 from the original difference */ + /* Step 2: Subtract carry*38 from the original difference */ " sub %%rax, %%r8;" " sbb $0, %%r9;" " sbb $0, %%r10;" diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 646da46e8d10..1dfb8af48a3c 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -16,7 +16,7 @@ #include <asm/simd.h> asmlinkage void poly1305_init_x86_64(void *ctx, - const u8 key[POLY1305_KEY_SIZE]); + const u8 key[POLY1305_BLOCK_SIZE]); asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, const size_t len, const u32 padbit); asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], @@ -81,7 +81,7 @@ static void convert_to_base2_64(void *ctx) state->is_base2_26 = 0; } -static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE]) +static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE]) { poly1305_init_x86_64(ctx, key); } @@ -129,7 +129,7 @@ static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], poly1305_emit_avx(ctx, mac, nonce); } -void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) { poly1305_simd_init(&dctx->h, key); dctx->s[0] = get_unaligned_le32(&key[16]); diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S index fc23552afe37..bca4cea757ce 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S @@ -88,7 +88,7 @@ /* * Combined G1 & G2 function. Reordered with help of rotates to have moves - * at begining. + * at beginning. */ #define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \ /* G1,1 && G2,1 */ \ diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 03725696397c..3507cf2064f1 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -117,7 +117,7 @@ static bool is_blacklisted_cpu(void) * storing blocks in 64bit registers to allow three blocks to * be processed parallel. Parallel operation then allows gaining * more performance than was trade off, on out-of-order CPUs. - * However Atom does not benefit from this parallellism and + * However Atom does not benefit from this parallelism and * should be blacklisted. 
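[Aside] The curve25519 fsub() hunk above folds a borrow out of the raw 256-bit subtraction back into the result by subtracting carry*38, using the fact that 2^255 ≡ 19 and hence 2^256 ≡ 38 (mod p) for p = 2^255 - 19. A portable C sketch of the same idea, assuming GCC/Clang's unsigned __int128 (illustrative only, not the kernel code):

#include <stdint.h>

/* Value = l[0] + 2^64*l[1] + 2^128*l[2] + 2^192*l[3], kept only loosely reduced mod p. */
static void fsub_sketch(uint64_t out[4], const uint64_t f1[4], const uint64_t f2[4])
{
	unsigned __int128 acc;
	uint64_t borrow = 0, fold;
	int i;

	/* Step 1: raw 256-bit subtraction, remembering the final borrow. */
	for (i = 0; i < 4; i++) {
		acc = (unsigned __int128)f1[i] - f2[i] - borrow;
		out[i] = (uint64_t)acc;
		borrow = (uint64_t)(acc >> 64) & 1;
	}

	/* Step 2: a borrow means the result wrapped by +2^256 ≡ +38 (mod p),
	 * so subtract borrow * 38 and propagate any new borrow. */
	fold = borrow * 38;
	for (i = 0; i < 4; i++) {
		acc = (unsigned __int128)out[i] - fold;
		out[i] = (uint64_t)acc;
		fold = (uint64_t)(acc >> 64) & 1;
	}

	/* Step 3: an (extremely rare) second borrow is folded the same way;
	 * the low limb is now large enough that this cannot underflow. */
	out[0] -= fold * 38;
}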
*/ return true; diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 4efd39aacb9f..7b2542b13ebd 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -38,6 +38,7 @@ #ifdef CONFIG_X86_64 __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) { + add_random_kstack_offset(); nr = syscall_enter_from_user_mode(regs, nr); instrumentation_begin(); @@ -83,6 +84,7 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { unsigned int nr = syscall_32_enter(regs); + add_random_kstack_offset(); /* * Subtlety here: if ptrace pokes something larger than 2^32-1 into * orig_ax, the unsigned int return value truncates it. This may @@ -102,6 +104,7 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) unsigned int nr = syscall_32_enter(regs); int res; + add_random_kstack_offset(); /* * This cannot use syscall_enter_from_user_mode() as it has to * fetch EBP before invoking any of the syscall entry work diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index df8c017e6161..ff0034740900 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -40,7 +40,7 @@ #include <asm/processor-flags.h> #include <asm/irq_vectors.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/asm.h> #include <asm/smap.h> #include <asm/frame.h> @@ -209,7 +209,7 @@ * * Lets build a 5 entry IRET frame after that, such that struct pt_regs * is complete and in particular regs->sp is correct. This gives us - * the original 6 enties as gap: + * the original 6 entries as gap: * * 14*4(%esp) - <previous context> * 13*4(%esp) - gap / flags @@ -430,7 +430,7 @@ * will soon execute iret and the tracer was already set to * the irqstate after the IRET: */ - DISABLE_INTERRUPTS(CLBR_ANY) + cli lss (%esp), %esp /* switch to espfix segment */ .Lend_\@: #endif /* CONFIG_X86_ESPFIX32 */ @@ -1077,7 +1077,7 @@ restore_all_switch_stack: * when returning from IPI handler and when returning from * scheduler to user-space. */ - INTERRUPT_RETURN + iret .section .fixup, "ax" SYM_CODE_START(asm_iret_error) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 400908dff42e..a16a5294d55f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -305,7 +305,7 @@ SYM_CODE_END(ret_from_fork) .macro DEBUG_ENTRY_ASSERT_IRQS_OFF #ifdef CONFIG_DEBUG_ENTRY pushq %rax - SAVE_FLAGS(CLBR_RAX) + SAVE_FLAGS testl $X86_EFLAGS_IF, %eax jz .Lokay_\@ ud2 @@ -511,7 +511,7 @@ SYM_CODE_START(\asmsym) /* * No need to switch back to the IST stack. The current stack is either * identical to the stack in the IRET frame or the VC fall-back stack, - * so it is definitly mapped even with PTI enabled. + * so it is definitely mapped even with PTI enabled. */ jmp paranoid_exit diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 2d0f3d8bcc25..edfe9780f6d1 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -218,7 +218,7 @@ int main(int argc, char **argv) /* * Figure out the struct name. If we're writing to a .so file, - * generate raw output insted. + * generate raw output instead. 
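[Aside] The add_random_kstack_offset() calls added to the syscall entry points above consume per-CPU entropy that choose_random_kstack_offset() banks at syscall exit (see the arch/x86/include/asm/entry-common.h hunk later in this patch), so the offset used by one syscall cannot be probed and reused within that same syscall. A simplified sketch of the mechanism, with illustrative names and mask; the real helpers live in include/linux/randomize_kstack.h:

DEFINE_PER_CPU(u32, kstack_offset_sketch);

/* Syscall entry: move the stack down by the previously banked offset. */
#define add_random_kstack_offset_sketch() do {				\
	u32 off = raw_cpu_read(kstack_offset_sketch) & 0x3ff;		\
	u8 *pad = __builtin_alloca(off);				\
	asm volatile("" : : "r"(pad) : "memory"); /* keep the alloca alive */ \
} while (0)

/* Syscall exit: bank fresh entropy for the next syscall on this CPU. */
#define choose_random_kstack_offset_sketch(rand)	\
	raw_cpu_write(kstack_offset_sketch, (u32)(rand))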
*/ name = strdup(argv[3]); namelen = strlen(name); diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index de1fff7188aa..6ddd7a937b3e 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -6,7 +6,7 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> .text .globl __kernel_vsyscall @@ -29,7 +29,7 @@ __kernel_vsyscall: * anyone with an AMD CPU, for example). Nonetheless, we try to keep * it working approximately as well as it ever worked. * - * This link may eludicate some of the history: + * This link may elucidate some of the history: * https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7 * personally, I find it hard to understand what's going on there. * diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 825e829ffff1..235a5794296a 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -358,7 +358,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) mmap_write_lock(mm); /* * Check if we have already mapped vdso blob - fail to prevent - * abusing from userspace install_speciall_mapping, which may + * abusing from userspace install_special_mapping, which may * not do accounting and rlimit right. * We could search vma near context.vdso, but it's a slowpath, * so let's explicitly check all VMAs to be completely sure. diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S index 86a0e94f68df..99dafac992e2 100644 --- a/arch/x86/entry/vdso/vsgx.S +++ b/arch/x86/entry/vdso/vsgx.S @@ -137,7 +137,7 @@ SYM_FUNC_START(__vdso_sgx_enter_enclave) /* * If the return from callback is zero or negative, return immediately, - * else re-execute ENCLU with the postive return value interpreted as + * else re-execute ENCLU with the positive return value interpreted as * the requested ENCLU function. */ cmp $0, %eax diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 2c1791c4a518..9687a8aef01c 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -623,7 +623,7 @@ static void amd_pmu_disable_all(void) /* * Check each counter for overflow and wait for it to be reset by the * NMI if it has overflowed. This relies on the fact that all active - * counters are always enabled when this function is caled and + * counters are always enabled when this function is called and * ARCH_PERFMON_EVENTSEL_INT is always set. */ for (idx = 0; idx < x86_pmu.num_counters; idx++) { diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h index 0e5c036fd7be..e6493a67f1c6 100644 --- a/arch/x86/events/amd/iommu.h +++ b/arch/x86/events/amd/iommu.h @@ -17,7 +17,7 @@ #define IOMMU_PC_DEVID_MATCH_REG 0x20 #define IOMMU_PC_COUNTER_REPORT_REG 0x28 -/* maximun specified bank/counters */ +/* maximum specified bank/counters */ #define PC_MAX_SPEC_BNKS 64 #define PC_MAX_SPEC_CNTRS 16 diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 18df17129695..4c31cae4707e 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -765,7 +765,7 @@ struct perf_sched { }; /* - * Initialize interator that runs through all events and counters. + * Initialize iterator that runs through all events and counters. 
*/ static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, int num, int wmin, int wmax, int gpmax) diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 731dd8d0dbb1..6320d2cfd9d3 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -594,7 +594,7 @@ static __init int bts_init(void) * we cannot use the user mapping since it will not be available * if we're not running the owning process. * - * With PTI we can't use the kernal map either, because its not + * With PTI we can't use the kernel map either, because its not * there when we run userspace. * * For now, disable this driver when using PTI. diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 37ce38403cb8..3fd69bd5fa6e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -137,7 +137,7 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly = FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */ + INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMPTY */ INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */ INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ @@ -2186,7 +2186,7 @@ static void intel_pmu_enable_all(int added) * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either * in sequence on the same PMC or on different PMCs. * - * In practise it appears some of these events do in fact count, and + * In practice it appears some of these events do in fact count, and * we need to program all 4 events. */ static void intel_pmu_nhm_workaround(void) @@ -2435,7 +2435,7 @@ static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx) /* * The metric is reported as an 8bit integer fraction - * suming up to 0xff. + * summing up to 0xff. * slots-in-metric = (Metric / 0xff) * slots */ val = (metric >> ((idx - INTEL_PMC_IDX_METRIC_BASE) * 8)) & 0xff; @@ -2776,7 +2776,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) * processing loop coming after that the function, otherwise * phony regular samples may be generated in the sampling buffer * not marked with the EXACT tag. Another possibility is to have - * one PEBS event and at least one non-PEBS event whic hoverflows + * one PEBS event and at least one non-PEBS event which overflows * while PEBS has armed. In this case, bit 62 of GLOBAL_STATUS will * not be set, yet the overflow status bit for the PEBS counter will * be on Skylake. 
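[Aside] The 8-bit fraction comment fixed above is easiest to see with numbers: each metric byte expresses a share of 0xff, so with 1000 total slots a metric byte of 0x40 accounts for roughly 1000 * 0x40 / 0xff, i.e. about 250 slots. A small sketch of the extraction and scaling (illustrative; a real implementation would guard the 64-bit multiply against overflow):

#include <stdint.h>

static inline uint64_t metric_slots_sketch(uint64_t metrics, uint64_t slots, int byte_idx)
{
	uint64_t frac = (metrics >> (byte_idx * 8)) & 0xff;	/* one metric byte */

	return slots * frac / 0xff;	/* slots-in-metric = (Metric / 0xff) * slots */
}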
@@ -2824,7 +2824,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) } /* - * Intel Perf mertrics + * Intel Perf metrics */ if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) { handled++; @@ -4516,7 +4516,7 @@ static const struct x86_cpu_desc isolation_ucodes[] = { INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 3, 0x07000009), INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 4, 0x0f000009), INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 5, 0x0e000002), - INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 1, 0x0b000014), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 5, 0x00000000), @@ -4594,7 +4594,7 @@ static bool check_msr(unsigned long msr, u64 mask) /* * Disable the check for real HW, so we don't - * mess with potentionaly enabled registers: + * mess with potentially enabled registers: */ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) return true; @@ -4659,7 +4659,7 @@ static __init void intel_arch_events_quirk(void) { int bit; - /* disable event that reported as not presend by cpuid */ + /* disable event that reported as not present by cpuid */ for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; pr_warn("CPUID marked event: \'%s\' unavailable\n", diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index d32b302719fe..5aabb0e2964a 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1805,7 +1805,7 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count) * * [-period, 0] * - * the difference between two consequtive reads is: + * the difference between two consecutive reads is: * * A) value2 - value1; * when no overflows have happened in between, diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 21890dacfcfe..acb04ef3da3f 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1198,7 +1198,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) /* * The LBR logs any address in the IP, even if the IP just * faulted. This means userspace can control the from address. - * Ensure we don't blindy read any address by validating it is + * Ensure we don't blindly read any address by validating it is * a known text address. 
*/ if (kernel_text_address(from)) { diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index a4cc66005ce8..971dffe0b77d 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -24,7 +24,7 @@ struct p4_event_bind { unsigned int escr_msr[2]; /* ESCR MSR for this event */ unsigned int escr_emask; /* valid ESCR EventMask bits */ unsigned int shared; /* event is shared across threads */ - char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ + char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */ }; struct p4_pebs_bind { @@ -45,7 +45,7 @@ struct p4_pebs_bind { * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of * event configuration to find out which values are to be * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT - * resgisters + * registers */ static struct p4_pebs_bind p4_pebs_bind_map[] = { P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001), @@ -1313,7 +1313,7 @@ static __initconst const struct x86_pmu p4_pmu = { .get_event_constraints = x86_get_event_constraints, /* * IF HT disabled we may need to use all - * ARCH_P4_MAX_CCCR counters simulaneously + * ARCH_P4_MAX_CCCR counters simultaneously * though leave it restricted at moment assuming * HT is on */ diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index e94af4a54d0d..915847655c06 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -362,7 +362,7 @@ static bool pt_event_valid(struct perf_event *event) /* * Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config - * clears the assomption that BranchEn must always be enabled, + * clears the assumption that BranchEn must always be enabled, * as was the case with the first implementation of PT. * If this bit is not set, the legacy behavior is preserved * for compatibility with the older userspace. diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index b79951d0707c..4bba0491068c 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -280,17 +280,17 @@ * | [63] | 00h | VALID - When set, indicates the CPU bus * numbers have been initialized. (RO) * |[62:48]| --- | Reserved - * |[47:40]| 00h | BUS_NUM_5 — Return the bus number BIOS assigned + * |[47:40]| 00h | BUS_NUM_5 - Return the bus number BIOS assigned * CPUBUSNO(5). (RO) - * |[39:32]| 00h | BUS_NUM_4 — Return the bus number BIOS assigned + * |[39:32]| 00h | BUS_NUM_4 - Return the bus number BIOS assigned * CPUBUSNO(4). (RO) - * |[31:24]| 00h | BUS_NUM_3 — Return the bus number BIOS assigned + * |[31:24]| 00h | BUS_NUM_3 - Return the bus number BIOS assigned * CPUBUSNO(3). (RO) - * |[23:16]| 00h | BUS_NUM_2 — Return the bus number BIOS assigned + * |[23:16]| 00h | BUS_NUM_2 - Return the bus number BIOS assigned * CPUBUSNO(2). (RO) - * |[15:8] | 00h | BUS_NUM_1 — Return the bus number BIOS assigned + * |[15:8] | 00h | BUS_NUM_1 - Return the bus number BIOS assigned * CPUBUSNO(1). (RO) - * | [7:0] | 00h | BUS_NUM_0 — Return the bus number BIOS assigned + * | [7:0] | 00h | BUS_NUM_0 - Return the bus number BIOS assigned * CPUBUSNO(0). 
(RO) */ #define SKX_MSR_CPU_BUS_NUMBER 0x300 @@ -1159,7 +1159,6 @@ enum { SNBEP_PCI_QPI_PORT0_FILTER, SNBEP_PCI_QPI_PORT1_FILTER, BDX_PCI_QPI_PORT2_FILTER, - HSWEP_PCI_PCU_3, }; static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) @@ -2857,22 +2856,33 @@ static struct intel_uncore_type *hswep_msr_uncores[] = { NULL, }; -void hswep_uncore_cpu_init(void) +#define HSWEP_PCU_DID 0x2fc0 +#define HSWEP_PCU_CAPID4_OFFET 0x94 +#define hswep_get_chop(_cap) (((_cap) >> 6) & 0x3) + +static bool hswep_has_limit_sbox(unsigned int device) { - int pkg = boot_cpu_data.logical_proc_id; + struct pci_dev *dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, NULL); + u32 capid4; + + if (!dev) + return false; + + pci_read_config_dword(dev, HSWEP_PCU_CAPID4_OFFET, &capid4); + if (!hswep_get_chop(capid4)) + return true; + return false; +} + +void hswep_uncore_cpu_init(void) +{ if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; /* Detect 6-8 core systems with only two SBOXes */ - if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) { - u32 capid4; - - pci_read_config_dword(uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3], - 0x94, &capid4); - if (((capid4 >> 6) & 0x3) == 0) - hswep_uncore_sbox.num_boxes = 2; - } + if (hswep_has_limit_sbox(HSWEP_PCU_DID)) + hswep_uncore_sbox.num_boxes = 2; uncore_msr_uncores = hswep_msr_uncores; } @@ -3135,11 +3145,6 @@ static const struct pci_device_id hswep_uncore_pci_ids[] = { .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, SNBEP_PCI_QPI_PORT1_FILTER), }, - { /* PCU.3 (for Capability registers) */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - HSWEP_PCI_PCU_3), - }, { /* end: all zeroes */ } }; @@ -3231,27 +3236,18 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = { EVENT_CONSTRAINT_END }; +#define BDX_PCU_DID 0x6fc0 + void bdx_uncore_cpu_init(void) { - int pkg = topology_phys_to_logical_pkg(boot_cpu_data.phys_proc_id); - if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; uncore_msr_uncores = bdx_msr_uncores; - /* BDX-DE doesn't have SBOX */ - if (boot_cpu_data.x86_model == 86) { - uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; /* Detect systems with no SBOXes */ - } else if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) { - struct pci_dev *pdev; - u32 capid4; - - pdev = uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]; - pci_read_config_dword(pdev, 0x94, &capid4); - if (((capid4 >> 6) & 0x3) == 0) - bdx_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; - } + if ((boot_cpu_data.x86_model == 86) || hswep_has_limit_sbox(BDX_PCU_DID)) + uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; + hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints; } @@ -3472,11 +3468,6 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = { .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, BDX_PCI_QPI_PORT2_FILTER), }, - { /* PCU.3 (for Capability registers) */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fc0), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - HSWEP_PCI_PCU_3), - }, { /* end: all zeroes */ } }; diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c index e68827e604ad..949d845c922b 100644 --- a/arch/x86/events/zhaoxin/core.c +++ b/arch/x86/events/zhaoxin/core.c @@ -494,7 +494,7 @@ static __init void zhaoxin_arch_events_quirk(void) { int bit; - /* disable event that reported as not presend by cpuid */ + /* disable 
event that reported as not present by cpuid */ for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) { zx_pmon_event_map[zx_arch_events_map[bit].id] = 0; pr_warn("CPUID marked event: \'%s\' unavailable\n", diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index b81047dec1da..e7b94f636cc1 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -162,7 +162,7 @@ EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation); static inline bool hv_reenlightenment_available(void) { /* - * Check for required features and priviliges to make TSC frequency + * Check for required features and privileges to make TSC frequency * change notifications work. */ return ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && @@ -292,7 +292,7 @@ static int hv_suspend(void) /* * Reset the hypercall page as it is going to be invalidated - * accross hibernation. Setting hv_hypercall_pg to NULL ensures + * across hibernation. Setting hv_hypercall_pg to NULL ensures * that any subsequent hypercall operation fails safely instead of * crashing due to an access of an invalid page. The hypercall page * pointer is restored on resume. diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h index 62da760d6d5a..cd7b14322035 100644 --- a/arch/x86/include/asm/agp.h +++ b/arch/x86/include/asm/agp.h @@ -9,7 +9,7 @@ * Functions to keep the agpgart mappings coherent with the MMU. The * GART gives the CPU a physical alias of pages in memory. The alias * region is mapped uncacheable. Make sure there are no conflicting - * mappings with different cachability attributes for the same + * mappings with different cacheability attributes for the same * page. This avoids data corruption on some CPUs. */ diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h deleted file mode 100644 index 464034db299f..000000000000 --- a/arch/x86/include/asm/alternative-asm.h +++ /dev/null @@ -1,114 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_ALTERNATIVE_ASM_H -#define _ASM_X86_ALTERNATIVE_ASM_H - -#ifdef __ASSEMBLY__ - -#include <asm/asm.h> - -#ifdef CONFIG_SMP - .macro LOCK_PREFIX -672: lock - .pushsection .smp_locks,"a" - .balign 4 - .long 672b - . - .popsection - .endm -#else - .macro LOCK_PREFIX - .endm -#endif - -/* - * objtool annotation to ignore the alternatives and only consider the original - * instruction(s). - */ -.macro ANNOTATE_IGNORE_ALTERNATIVE - .Lannotate_\@: - .pushsection .discard.ignore_alts - .long .Lannotate_\@ - . - .popsection -.endm - -/* - * Issue one struct alt_instr descriptor entry (need to put it into - * the section .altinstructions, see below). This entry contains - * enough information for the alternatives patching code to patch an - * instruction. See apply_alternatives(). - */ -.macro altinstruction_entry orig alt feature orig_len alt_len pad_len - .long \orig - . - .long \alt - . - .word \feature - .byte \orig_len - .byte \alt_len - .byte \pad_len -.endm - -/* - * Define an alternative between two instructions. If @feature is - * present, early code in apply_alternatives() replaces @oldinstr with - * @newinstr. ".skip" directive takes care of proper instruction padding - * in case @newinstr is longer than @oldinstr. 
- */ -.macro ALTERNATIVE oldinstr, newinstr, feature -140: - \oldinstr -141: - .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 -142: - - .pushsection .altinstructions,"a" - altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b - .popsection - - .pushsection .altinstr_replacement,"ax" -143: - \newinstr -144: - .popsection -.endm - -#define old_len 141b-140b -#define new_len1 144f-143f -#define new_len2 145f-144f - -/* - * gas compatible max based on the idea from: - * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax - * - * The additional "-" is needed because gas uses a "true" value of -1. - */ -#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) - - -/* - * Same as ALTERNATIVE macro above but for two alternatives. If CPU - * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has - * @feature2, it replaces @oldinstr with @feature2. - */ -.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 -140: - \oldinstr -141: - .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \ - (alt_max_short(new_len1, new_len2) - (old_len)),0x90 -142: - - .pushsection .altinstructions,"a" - altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b - altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b - .popsection - - .pushsection .altinstr_replacement,"ax" -143: - \newinstr1 -144: - \newinstr2 -145: - .popsection -.endm - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_X86_ALTERNATIVE_ASM_H */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 13adca37c99a..17b36090d448 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -2,13 +2,17 @@ #ifndef _ASM_X86_ALTERNATIVE_H #define _ASM_X86_ALTERNATIVE_H -#ifndef __ASSEMBLY__ - #include <linux/types.h> -#include <linux/stddef.h> #include <linux/stringify.h> #include <asm/asm.h> +#define ALTINSTR_FLAG_INV (1 << 15) +#define ALT_NOT(feat) ((feat) | ALTINSTR_FLAG_INV) + +#ifndef __ASSEMBLY__ + +#include <linux/stddef.h> + /* * Alternative inline assembly for SMP. * @@ -150,7 +154,7 @@ static inline int alternatives_text_reserved(void *start, void *end) " .byte " alt_rlen(num) "\n" /* replacement len */ \ " .byte " alt_pad_len "\n" /* pad len */ -#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \ +#define ALTINSTR_REPLACEMENT(newinstr, num) /* replacement */ \ "# ALT: replacement " #num "\n" \ b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n" @@ -161,7 +165,7 @@ static inline int alternatives_text_reserved(void *start, void *end) ALTINSTR_ENTRY(feature, 1) \ ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ - ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ + ALTINSTR_REPLACEMENT(newinstr, 1) \ ".popsection\n" #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ @@ -171,10 +175,15 @@ static inline int alternatives_text_reserved(void *start, void *end) ALTINSTR_ENTRY(feature2, 2) \ ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ - ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ - ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ + ALTINSTR_REPLACEMENT(newinstr1, 1) \ + ALTINSTR_REPLACEMENT(newinstr2, 2) \ ".popsection\n" +/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. 
*/ +#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \ + ALTERNATIVE_2(oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \ + newinstr_yes, feature) + #define ALTERNATIVE_3(oldinsn, newinsn1, feat1, newinsn2, feat2, newinsn3, feat3) \ OLDINSTR_3(oldinsn, 1, 2, 3) \ ".pushsection .altinstructions,\"a\"\n" \ @@ -183,9 +192,9 @@ static inline int alternatives_text_reserved(void *start, void *end) ALTINSTR_ENTRY(feat3, 3) \ ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ - ALTINSTR_REPLACEMENT(newinsn1, feat1, 1) \ - ALTINSTR_REPLACEMENT(newinsn2, feat2, 2) \ - ALTINSTR_REPLACEMENT(newinsn3, feat3, 3) \ + ALTINSTR_REPLACEMENT(newinsn1, 1) \ + ALTINSTR_REPLACEMENT(newinsn2, 2) \ + ALTINSTR_REPLACEMENT(newinsn3, 3) \ ".popsection\n" /* @@ -206,6 +215,9 @@ static inline int alternatives_text_reserved(void *start, void *end) #define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \ asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory") +#define alternative_ternary(oldinstr, feature, newinstr_yes, newinstr_no) \ + asm_inline volatile(ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) ::: "memory") + /* * Alternative inline assembly with input. * @@ -271,6 +283,116 @@ static inline int alternatives_text_reserved(void *start, void *end) */ #define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr +#else /* __ASSEMBLY__ */ + +#ifdef CONFIG_SMP + .macro LOCK_PREFIX +672: lock + .pushsection .smp_locks,"a" + .balign 4 + .long 672b - . + .popsection + .endm +#else + .macro LOCK_PREFIX + .endm +#endif + +/* + * objtool annotation to ignore the alternatives and only consider the original + * instruction(s). + */ +.macro ANNOTATE_IGNORE_ALTERNATIVE + .Lannotate_\@: + .pushsection .discard.ignore_alts + .long .Lannotate_\@ - . + .popsection +.endm + +/* + * Issue one struct alt_instr descriptor entry (need to put it into + * the section .altinstructions, see below). This entry contains + * enough information for the alternatives patching code to patch an + * instruction. See apply_alternatives(). + */ +.macro altinstruction_entry orig alt feature orig_len alt_len pad_len + .long \orig - . + .long \alt - . + .word \feature + .byte \orig_len + .byte \alt_len + .byte \pad_len +.endm + +/* + * Define an alternative between two instructions. If @feature is + * present, early code in apply_alternatives() replaces @oldinstr with + * @newinstr. ".skip" directive takes care of proper instruction padding + * in case @newinstr is longer than @oldinstr. + */ +.macro ALTERNATIVE oldinstr, newinstr, feature +140: + \oldinstr +141: + .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr +144: + .popsection +.endm + +#define old_len 141b-140b +#define new_len1 144f-143f +#define new_len2 145f-144f + +/* + * gas compatible max based on the idea from: + * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + * + * The additional "-" is needed because gas uses a "true" value of -1. + */ +#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) + + +/* + * Same as ALTERNATIVE macro above but for two alternatives. If CPU + * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has + * @feature2, it replaces @oldinstr with @feature2. 
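[Aside] ALTERNATIVE_TERNARY() introduced above is just ALTERNATIVE_2() with the "no" replacement tied to X86_FEATURE_ALWAYS, so exactly one of the two replacements is always patched in and @oldinstr only runs before alternatives are applied. A usage sketch of the C-side alternative_ternary() wrapper, modeled on the classic 32-bit write-barrier pattern (a hypothetical helper, not something this patch adds):

static inline void wmb_sketch(void)
{
	alternative_ternary("",				/* placeholder until patched */
			    X86_FEATURE_XMM2,		/* SSE2 present? */
			    "sfence",			/* yes: use the real store fence */
			    "lock; addl $0,-4(%%esp)");	/* no: locked op as a barrier */
}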
+ */ +.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 +140: + \oldinstr +141: + .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \ + (alt_max_short(new_len1, new_len2) - (old_len)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b + altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr1 +144: + \newinstr2 +145: + .popsection +.endm + +/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */ +#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \ + ALTERNATIVE_2 oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \ + newinstr_yes, feature + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 4d4ec5cbdc51..94fbe6ae7431 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -22,7 +22,7 @@ extern void __add_wrong_size(void) /* * Constants for operation sizes. On 32-bit, the 64-bit size it set to * -1 because sizeof will never return -1, thereby making those switch - * case statements guaranteeed dead code which the compiler will + * case statements guaranteed dead code which the compiler will * eliminate, and allowing the "missing symbol in the default case" to * indicate a usage error. */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 1728d4ce5730..16a51e7288d5 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -8,6 +8,7 @@ #include <asm/asm.h> #include <linux/bitops.h> +#include <asm/alternative.h> enum cpuid_leafs { @@ -175,39 +176,15 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); */ static __always_inline bool _static_cpu_has(u16 bit) { - asm_volatile_goto("1: jmp 6f\n" - "2:\n" - ".skip -(((5f-4f) - (2b-1b)) > 0) * " - "((5f-4f) - (2b-1b)),0x90\n" - "3:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 4f - .\n" /* repl offset */ - " .word %P[always]\n" /* always replace */ - " .byte 3b - 1b\n" /* src len */ - " .byte 5f - 4f\n" /* repl len */ - " .byte 3b - 2b\n" /* pad len */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "4: jmp %l[t_no]\n" - "5:\n" - ".previous\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 0\n" /* no replacement */ - " .word %P[feature]\n" /* feature bit */ - " .byte 3b - 1b\n" /* src len */ - " .byte 0\n" /* repl len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .altinstr_aux,\"ax\"\n" - "6:\n" - " testb %[bitnum],%[cap_byte]\n" - " jnz %l[t_yes]\n" - " jmp %l[t_no]\n" - ".previous\n" + asm_volatile_goto( + ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]") + ".section .altinstr_aux,\"ax\"\n" + "6:\n" + " testb %[bitnum],%[cap_byte]\n" + " jnz %l[t_yes]\n" + " jmp %l[t_no]\n" + ".previous\n" : : [feature] "i" (bit), - [always] "i" (X86_FEATURE_ALWAYS), [bitnum] "i" (1 << (bit & 7)), [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3]) : : t_yes, t_no); diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index faec3d92d09b..624116562086 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -236,6 +236,8 @@ #define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ #define 
X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ +#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */ +#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ @@ -290,6 +292,8 @@ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ #define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ +#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ +#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 9224d40cdefe..7d7500806af8 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -283,12 +283,12 @@ extern u32 elf_hwcap2; * * The decision process for determining the results are: * - * CPU: | lacks NX* | has NX, ia32 | has NX, x86_64 | - * ELF: | | | | + * CPU: | lacks NX* | has NX, ia32 | has NX, x86_64 | + * ELF: | | | | * ---------------------|------------|------------------|----------------| - * missing PT_GNU_STACK | exec-all | exec-all | exec-none | - * PT_GNU_STACK == RWX | exec-stack | exec-stack | exec-stack | - * PT_GNU_STACK == RW | exec-none | exec-none | exec-none | + * missing PT_GNU_STACK | exec-all | exec-all | exec-none | + * PT_GNU_STACK == RWX | exec-stack | exec-stack | exec-stack | + * PT_GNU_STACK == RW | exec-none | exec-none | exec-none | * * exec-all : all PROT_READ user mappings are executable, except when * backed by files on a noexec-filesystem. diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 2b87b191b3b8..14ebd2196569 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_ENTRY_COMMON_H #define _ASM_X86_ENTRY_COMMON_H +#include <linux/randomize_kstack.h> #include <linux/user-return-notifier.h> #include <asm/nospec-branch.h> @@ -70,6 +71,21 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, */ current_thread_info()->status &= ~(TS_COMPAT | TS_I386_REGS_POKED); #endif + + /* + * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), + * but not enough for x86 stack utilization comfort. To keep + * reasonable stack head room, reduce the maximum offset to 8 bits. + * + * The actual entropy will be further reduced by the compiler when + * applying stack alignment constraints (see cc_stack_align4/8 in + * arch/x86/Makefile), which will remove the 3 (x86_64) or 2 (ia32) + * low bits from any entropy chosen here. + * + * Therefore, final stack offset entropy will be 5 (x86_64) or + * 6 (ia32) bits. 
+ */ + choose_random_kstack_offset(rdtsc() & 0xFF); } #define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 5eb3bdf36a41..e35e342673c7 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -547,7 +547,7 @@ SYM_CODE_END(spurious_entries_start) /* * Dummy trap number so the low level ASM macro vector number checks do not * match which results in emitting plain IDTENTRY stubs without bells and - * whistels. + * whistles. */ #define X86_TRAP_OTHER 0xFFFF diff --git a/arch/x86/include/asm/intel_pconfig.h b/arch/x86/include/asm/intel_pconfig.h index 3cb002b1d0f9..994638ef171b 100644 --- a/arch/x86/include/asm/intel_pconfig.h +++ b/arch/x86/include/asm/intel_pconfig.h @@ -38,7 +38,7 @@ enum pconfig_leaf { #define MKTME_INVALID_ENC_ALG 4 #define MKTME_DEVICE_BUSY 5 -/* Hardware requires the structure to be 256 byte alinged. Otherwise #GP(0). */ +/* Hardware requires the structure to be 256 byte aligned. Otherwise #GP(0). */ struct mktme_key_program { u16 keyid; u32 keyid_ctrl; diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 423b788f495e..ebe8d2ea44fe 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -3,7 +3,7 @@ #define _ASM_X86_INTEL_PT_H #define PT_CPUID_LEAVES 2 -#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ +#define PT_CPUID_REGS_NUM 4 /* number of registers (eax, ebx, ecx, edx) */ enum pt_capabilities { PT_CAP_max_subleaf = 0, diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index d726459d08e5..841a5d104afa 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -159,7 +159,7 @@ static inline void *phys_to_virt(phys_addr_t address) /* * ISA I/O bus memory addresses are 1:1 with the physical address. * However, we truncate the address to unsigned int to avoid undesirable - * promitions in legacy drivers. + * promotions in legacy drivers. */ static inline unsigned int isa_virt_to_bus(volatile void *address) { diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 9b2a0ff76c73..562854c60808 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -190,7 +190,7 @@ /* * Macro to invoke __do_softirq on the irq stack. This is only called from - * task context when bottom halfs are about to be reenabled and soft + * task context when bottom halves are about to be reenabled and soft * interrupts are pending to be processed. The interrupt stack cannot be in * use here. 
*/ diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 144d70ea4393..c5ce9845c999 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -109,18 +109,13 @@ static __always_inline unsigned long arch_local_irq_save(void) } #else -#define ENABLE_INTERRUPTS(x) sti -#define DISABLE_INTERRUPTS(x) cli - #ifdef CONFIG_X86_64 #ifdef CONFIG_DEBUG_ENTRY -#define SAVE_FLAGS(x) pushfq; popq %rax +#define SAVE_FLAGS pushfq; popq %rax #endif #define INTERRUPT_RETURN jmp native_iret -#else -#define INTERRUPT_RETURN iret #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h index 97bbb4a9083a..05b48b33baf0 100644 --- a/arch/x86/include/asm/kfence.h +++ b/arch/x86/include/asm/kfence.h @@ -56,8 +56,13 @@ static inline bool kfence_protect_page(unsigned long addr, bool protect) else set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); - /* Flush this CPU's TLB. */ + /* + * Flush this CPU's TLB, assuming whoever did the allocation/free is + * likely to continue running on this CPU. + */ + preempt_disable(); flush_tlb_one_kernel(addr); + preempt_enable(); return true; } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3768819693e5..10eca9e8f7f6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1488,7 +1488,7 @@ extern u64 kvm_mce_cap_supported; /* * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing * userspace I/O) to indicate that the emulation context - * should be resued as is, i.e. skip initialization of + * should be reused as is, i.e. skip initialization of * emulation context, instruction fetch and decode. * * EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware. @@ -1513,7 +1513,7 @@ extern u64 kvm_mce_cap_supported; * * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware * backdoor emulation, which is opt in via module param. - * VMware backoor emulation handles select instructions + * VMware backdoor emulation handles select instructions * and reinjects the #GP for all other cases. 
* * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index ccf60a809a17..e7be720062a8 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -63,7 +63,7 @@ typedef int (*hyperv_fill_flush_list_func)( static __always_inline void hv_setup_sched_clock(void *sched_clock) { #ifdef CONFIG_PARAVIRT - pv_ops.time.sched_clock = sched_clock; + paravirt_set_sched_clock(sched_clock); #endif } diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 32c496fb11b6..fe335d8c1676 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -629,8 +629,6 @@ #define MSR_IA32_APICBASE_ENABLE (1<<11) #define MSR_IA32_APICBASE_BASE (0xfffff<<12) -#define MSR_IA32_TSCDEADLINE 0x000006e0 - #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index cb9ad6b73973..c14fb80b9a07 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -7,7 +7,6 @@ #include <linux/objtool.h> #include <asm/alternative.h> -#include <asm/alternative-asm.h> #include <asm/cpufeatures.h> #include <asm/msr-index.h> #include <asm/unwind_hints.h> @@ -33,7 +32,7 @@ /* * Google experimented with loop-unrolling and this turned out to be - * the optimal version — two calls, each with their own speculation + * the optimal version - two calls, each with their own speculation * trap should their return address end up getting used, in a loop. */ #define __FILL_RETURN_BUFFER(reg, nr, sp) \ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 4abf110e2243..43992e5c52c2 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -15,11 +15,20 @@ #include <linux/bug.h> #include <linux/types.h> #include <linux/cpumask.h> +#include <linux/static_call_types.h> #include <asm/frame.h> -static inline unsigned long long paravirt_sched_clock(void) +u64 dummy_steal_clock(int cpu); +u64 dummy_sched_clock(void); + +DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock); +DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock); + +void paravirt_set_sched_clock(u64 (*func)(void)); + +static inline u64 paravirt_sched_clock(void) { - return PVOP_CALL0(unsigned long long, time.sched_clock); + return static_call(pv_sched_clock)(); } struct static_key; @@ -33,9 +42,13 @@ bool pv_is_native_vcpu_is_preempted(void); static inline u64 paravirt_steal_clock(int cpu) { - return PVOP_CALL1(u64, time.steal_clock, cpu); + return static_call(pv_steal_clock)(cpu); } +#ifdef CONFIG_PARAVIRT_SPINLOCKS +void __init paravirt_set_cap(void); +#endif + /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { @@ -122,7 +135,9 @@ static inline void write_cr0(unsigned long x) static inline unsigned long read_cr2(void) { - return PVOP_CALLEE0(unsigned long, mmu.read_cr2); + return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2, + "mov %%cr2, %%rax;", + ALT_NOT(X86_FEATURE_XENPV)); } static inline void write_cr2(unsigned long x) @@ -132,12 +147,14 @@ static inline void write_cr2(unsigned long x) static inline unsigned long __read_cr3(void) { - return PVOP_CALL0(unsigned long, mmu.read_cr3); + return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3, + "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV)); } static inline void write_cr3(unsigned long x) { - PVOP_VCALL1(mmu.write_cr3, x); + 
PVOP_ALT_VCALL1(mmu.write_cr3, x, + "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV)); } static inline void __write_cr4(unsigned long x) @@ -157,7 +174,7 @@ static inline void halt(void) static inline void wbinvd(void) { - PVOP_VCALL0(cpu.wbinvd); + PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV)); } static inline u64 paravirt_read_msr(unsigned msr) @@ -371,22 +388,28 @@ static inline void paravirt_release_p4d(unsigned long pfn) static inline pte_t __pte(pteval_t val) { - return (pte_t) { PVOP_CALLEE1(pteval_t, mmu.make_pte, val) }; + return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val, + "mov %%rdi, %%rax", + ALT_NOT(X86_FEATURE_XENPV)) }; } static inline pteval_t pte_val(pte_t pte) { - return PVOP_CALLEE1(pteval_t, mmu.pte_val, pte.pte); + return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte, + "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); } static inline pgd_t __pgd(pgdval_t val) { - return (pgd_t) { PVOP_CALLEE1(pgdval_t, mmu.make_pgd, val) }; + return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val, + "mov %%rdi, %%rax", + ALT_NOT(X86_FEATURE_XENPV)) }; } static inline pgdval_t pgd_val(pgd_t pgd) { - return PVOP_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd); + return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd, + "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION @@ -419,12 +442,15 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) static inline pmd_t __pmd(pmdval_t val) { - return (pmd_t) { PVOP_CALLEE1(pmdval_t, mmu.make_pmd, val) }; + return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val, + "mov %%rdi, %%rax", + ALT_NOT(X86_FEATURE_XENPV)) }; } static inline pmdval_t pmd_val(pmd_t pmd) { - return PVOP_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd); + return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd, + "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); } static inline void set_pud(pud_t *pudp, pud_t pud) @@ -436,14 +462,16 @@ static inline pud_t __pud(pudval_t val) { pudval_t ret; - ret = PVOP_CALLEE1(pudval_t, mmu.make_pud, val); + ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val, + "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); return (pud_t) { ret }; } static inline pudval_t pud_val(pud_t pud) { - return PVOP_CALLEE1(pudval_t, mmu.pud_val, pud.pud); + return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud, + "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); } static inline void pud_clear(pud_t *pudp) @@ -462,14 +490,17 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) static inline p4d_t __p4d(p4dval_t val) { - p4dval_t ret = PVOP_CALLEE1(p4dval_t, mmu.make_p4d, val); + p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val, + "mov %%rdi, %%rax", + ALT_NOT(X86_FEATURE_XENPV)); return (p4d_t) { ret }; } static inline p4dval_t p4d_val(p4d_t p4d) { - return PVOP_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d); + return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d, + "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); } static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) @@ -556,7 +587,9 @@ static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock) { - PVOP_VCALLEE1(lock.queued_spin_unlock, lock); + PVOP_ALT_VCALLEE1(lock.queued_spin_unlock, lock, + "movb $0, (%%" _ASM_ARG1 ");", + ALT_NOT(X86_FEATURE_PVUNLOCK)); } static __always_inline void pv_wait(u8 *ptr, u8 val) @@ -571,7 +604,9 @@ static __always_inline void pv_kick(int cpu) static __always_inline bool pv_vcpu_is_preempted(long cpu) { - 
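Editorial note on the PVOP_ALT_CALLEE* conversions above: each one lets the alternatives code replace the indirect pv_ops call with an inline native sequence (often just the identity move "mov %rdi, %rax") whenever X86_FEATURE_XENPV is absent. The stand-alone C model below illustrates only that boot-time choice between an inlined identity path and a hypervisor callback; it is not the kernel's patching machinery, and the bit set in xen_make_pte_model() is purely hypothetical.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t pteval_t;		/* stand-in for the kernel type */

static bool running_on_xen_pv;		/* models X86_FEATURE_XENPV */

/* Models the Xen pv_ops hook; the bit it sets is purely hypothetical. */
static pteval_t xen_make_pte_model(pteval_t val)
{
	return val | (1ULL << 63);
}

static inline pteval_t make_pte_model(pteval_t val)
{
	/* Native case: the "alternative" collapses to the identity move. */
	if (!running_on_xen_pv)
		return val;
	return xen_make_pte_model(val);
}

int main(void)
{
	running_on_xen_pv = false;
	printf("native: %#llx\n", (unsigned long long)make_pte_model(0x1000));
	running_on_xen_pv = true;
	printf("xen pv: %#llx\n", (unsigned long long)make_pte_model(0x1000));
	return 0;
}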
return PVOP_CALLEE1(bool, lock.vcpu_is_preempted, cpu); + return PVOP_ALT_CALLEE1(bool, lock.vcpu_is_preempted, cpu, + "xor %%" _ASM_AX ", %%" _ASM_AX ";", + ALT_NOT(X86_FEATURE_VCPUPREEMPT)); } void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock); @@ -645,17 +680,18 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); #ifdef CONFIG_PARAVIRT_XXL static inline notrace unsigned long arch_local_save_flags(void) { - return PVOP_CALLEE0(unsigned long, irq.save_fl); + return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;", + ALT_NOT(X86_FEATURE_XENPV)); } static inline notrace void arch_local_irq_disable(void) { - PVOP_VCALLEE0(irq.irq_disable); + PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV)); } static inline notrace void arch_local_irq_enable(void) { - PVOP_VCALLEE0(irq.irq_enable); + PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV)); } static inline notrace unsigned long arch_local_irq_save(void) @@ -700,84 +736,27 @@ extern void default_banner(void); .popsection -#define COND_PUSH(set, mask, reg) \ - .if ((~(set)) & mask); push %reg; .endif -#define COND_POP(set, mask, reg) \ - .if ((~(set)) & mask); pop %reg; .endif - #ifdef CONFIG_X86_64 - -#define PV_SAVE_REGS(set) \ - COND_PUSH(set, CLBR_RAX, rax); \ - COND_PUSH(set, CLBR_RCX, rcx); \ - COND_PUSH(set, CLBR_RDX, rdx); \ - COND_PUSH(set, CLBR_RSI, rsi); \ - COND_PUSH(set, CLBR_RDI, rdi); \ - COND_PUSH(set, CLBR_R8, r8); \ - COND_PUSH(set, CLBR_R9, r9); \ - COND_PUSH(set, CLBR_R10, r10); \ - COND_PUSH(set, CLBR_R11, r11) -#define PV_RESTORE_REGS(set) \ - COND_POP(set, CLBR_R11, r11); \ - COND_POP(set, CLBR_R10, r10); \ - COND_POP(set, CLBR_R9, r9); \ - COND_POP(set, CLBR_R8, r8); \ - COND_POP(set, CLBR_RDI, rdi); \ - COND_POP(set, CLBR_RSI, rsi); \ - COND_POP(set, CLBR_RDX, rdx); \ - COND_POP(set, CLBR_RCX, rcx); \ - COND_POP(set, CLBR_RAX, rax) +#ifdef CONFIG_PARAVIRT_XXL #define PARA_PATCH(off) ((off) / 8) #define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8) #define PARA_INDIRECT(addr) *addr(%rip) -#else -#define PV_SAVE_REGS(set) \ - COND_PUSH(set, CLBR_EAX, eax); \ - COND_PUSH(set, CLBR_EDI, edi); \ - COND_PUSH(set, CLBR_ECX, ecx); \ - COND_PUSH(set, CLBR_EDX, edx) -#define PV_RESTORE_REGS(set) \ - COND_POP(set, CLBR_EDX, edx); \ - COND_POP(set, CLBR_ECX, ecx); \ - COND_POP(set, CLBR_EDI, edi); \ - COND_POP(set, CLBR_EAX, eax) - -#define PARA_PATCH(off) ((off) / 4) -#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .long, 4) -#define PARA_INDIRECT(addr) *%cs:addr -#endif -#ifdef CONFIG_PARAVIRT_XXL #define INTERRUPT_RETURN \ - PARA_SITE(PARA_PATCH(PV_CPU_iret), \ - ANNOTATE_RETPOLINE_SAFE; \ - jmp PARA_INDIRECT(pv_ops+PV_CPU_iret);) - -#define DISABLE_INTERRUPTS(clobbers) \ - PARA_SITE(PARA_PATCH(PV_IRQ_irq_disable), \ - PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_ops+PV_IRQ_irq_disable); \ - PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) - -#define ENABLE_INTERRUPTS(clobbers) \ - PARA_SITE(PARA_PATCH(PV_IRQ_irq_enable), \ - PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_ops+PV_IRQ_irq_enable); \ - PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) -#endif + ANNOTATE_RETPOLINE_SAFE; \ + ALTERNATIVE_TERNARY("jmp *paravirt_iret(%rip);", \ + X86_FEATURE_XENPV, "jmp xen_iret;", "jmp native_iret;") -#ifdef CONFIG_X86_64 -#ifdef CONFIG_PARAVIRT_XXL #ifdef CONFIG_DEBUG_ENTRY -#define SAVE_FLAGS(clobbers) \ - PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), 
\ - PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); \ - PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) +.macro PARA_IRQ_save_fl + PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), + ANNOTATE_RETPOLINE_SAFE; + call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);) +.endm + +#define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \ + ALT_NOT(X86_FEATURE_XENPV) #endif #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ @@ -800,5 +779,11 @@ static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { } #endif + +#ifndef CONFIG_PARAVIRT_SPINLOCKS +static inline void paravirt_set_cap(void) +{ +} +#endif #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PARAVIRT_H */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index de87087d3bde..ae692c3194e9 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -3,7 +3,6 @@ #define _ASM_X86_PARAVIRT_TYPES_H /* Bitmask of what can be clobbered: usually at least eax. */ -#define CLBR_NONE 0 #define CLBR_EAX (1 << 0) #define CLBR_ECX (1 << 1) #define CLBR_EDX (1 << 2) @@ -15,7 +14,6 @@ #define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) #define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) -#define CLBR_SCRATCH (0) #else #define CLBR_RAX CLBR_EAX #define CLBR_RCX CLBR_ECX @@ -32,12 +30,9 @@ #define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ CLBR_RCX | CLBR_R8 | CLBR_R9) #define CLBR_RET_REG (CLBR_RAX) -#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) #endif /* X86_64 */ -#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) - #ifndef __ASSEMBLY__ #include <asm/desc_defs.h> @@ -73,19 +68,6 @@ struct pv_info { const char *name; }; -struct pv_init_ops { - /* - * Patch may replace one of the defined code sequences with - * arbitrary code, subject to the same register constraints. - * This generally means the code is not free to clobber any - * registers other than EAX. The patch function should return - * the number of bytes of code generated, as we nop pad the - * rest in generic code. - */ - unsigned (*patch)(u8 type, void *insn_buff, - unsigned long addr, unsigned len); -} __no_randomize_layout; - #ifdef CONFIG_PARAVIRT_XXL struct pv_lazy_ops { /* Set deferred update mode, used for batching operations. */ @@ -95,11 +77,6 @@ struct pv_lazy_ops { } __no_randomize_layout; #endif -struct pv_time_ops { - unsigned long long (*sched_clock)(void); - unsigned long long (*steal_clock)(int cpu); -} __no_randomize_layout; - struct pv_cpu_ops { /* hooks for various privileged instructions */ void (*io_delay)(void); @@ -156,10 +133,6 @@ struct pv_cpu_ops { u64 (*read_pmc)(int counter); - /* Normal iret. Jump to this with the standard iret stack - frame set up. */ - void (*iret)(void); - void (*start_context_switch)(struct task_struct *prev); void (*end_context_switch)(struct task_struct *next); #endif @@ -290,8 +263,6 @@ struct pv_lock_ops { * number for each function using the offset which we use to indicate * what to patch. 
*/ struct paravirt_patch_template { - struct pv_init_ops init; - struct pv_time_ops time; struct pv_cpu_ops cpu; struct pv_irq_ops irq; struct pv_mmu_ops mmu; @@ -300,6 +271,7 @@ struct paravirt_patch_template { extern struct pv_info pv_info; extern struct paravirt_patch_template pv_ops; +extern void (*paravirt_iret)(void); #define PARAVIRT_PATCH(x) \ (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) @@ -331,11 +303,7 @@ extern struct paravirt_patch_template pv_ops; /* Simple instruction patching code. */ #define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t" -unsigned paravirt_patch_ident_64(void *insn_buff, unsigned len); -unsigned paravirt_patch_default(u8 type, void *insn_buff, unsigned long addr, unsigned len); -unsigned paravirt_patch_insns(void *insn_buff, unsigned len, const char *start, const char *end); - -unsigned native_patch(u8 type, void *insn_buff, unsigned long addr, unsigned len); +unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len); int paravirt_disable_iospace(void); @@ -371,7 +339,7 @@ int paravirt_disable_iospace(void); * on the stack. All caller-save registers (eax,edx,ecx) are expected * to be modified (either clobbered or used for return values). * X86_64, on the other hand, already specifies a register-based calling - * conventions, returning at %rax, with parameteres going on %rdi, %rsi, + * conventions, returning at %rax, with parameters going on %rdi, %rsi, * %rdx, and %rcx. Note that for this reason, x86_64 does not need any * special handling for dealing with 4 arguments, unlike i386. * However, x86_64 also have to clobber all caller saved registers, which @@ -414,11 +382,9 @@ int paravirt_disable_iospace(void); * makes sure the incoming and outgoing types are always correct. */ #ifdef CONFIG_X86_32 -#define PVOP_VCALL_ARGS \ +#define PVOP_CALL_ARGS \ unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; -#define PVOP_CALL_ARGS PVOP_VCALL_ARGS - #define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) #define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) #define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x)) @@ -434,12 +400,10 @@ int paravirt_disable_iospace(void); #define VEXTRA_CLOBBERS #else /* CONFIG_X86_64 */ /* [re]ax isn't an arg, but the return val */ -#define PVOP_VCALL_ARGS \ +#define PVOP_CALL_ARGS \ unsigned long __edi = __edi, __esi = __esi, \ __edx = __edx, __ecx = __ecx, __eax = __eax; -#define PVOP_CALL_ARGS PVOP_VCALL_ARGS - #define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) #define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) #define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) @@ -464,152 +428,138 @@ int paravirt_disable_iospace(void); #define PVOP_TEST_NULL(op) ((void)pv_ops.op) #endif -#define PVOP_RETMASK(rettype) \ +#define PVOP_RETVAL(rettype) \ ({ unsigned long __mask = ~0UL; \ + BUILD_BUG_ON(sizeof(rettype) > sizeof(unsigned long)); \ switch (sizeof(rettype)) { \ case 1: __mask = 0xffUL; break; \ case 2: __mask = 0xffffUL; break; \ case 4: __mask = 0xffffffffUL; break; \ default: break; \ } \ - __mask; \ + __mask & __eax; \ }) -#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ - pre, post, ...) \ +#define ____PVOP_CALL(ret, op, clbr, call_clbr, extra_clbr, ...) 
\ ({ \ - rettype __ret; \ PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ - /* This is 32-bit specific, but is okay in 64-bit */ \ - /* since this condition will never hold */ \ - if (sizeof(rettype) > sizeof(unsigned long)) { \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr, ASM_CALL_CONSTRAINT \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - __ret = (rettype)((((u64)__edx) << 32) | __eax); \ - } else { \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr, ASM_CALL_CONSTRAINT \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - __ret = (rettype)(__eax & PVOP_RETMASK(rettype)); \ - } \ - __ret; \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : call_clbr, ASM_CALL_CONSTRAINT \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + ret; \ }) -#define __PVOP_CALL(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \ - EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__) - -#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ - PVOP_CALLEE_CLOBBERS, , \ - pre, post, ##__VA_ARGS__) - - -#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \ +#define ____PVOP_ALT_CALL(ret, op, alt, cond, clbr, call_clbr, \ + extra_clbr, ...) \ ({ \ - PVOP_VCALL_ARGS; \ + PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ + asm volatile(ALTERNATIVE(paravirt_alt(PARAVIRT_CALL), \ + alt, cond) \ : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ + ret; \ }) -#define __PVOP_VCALL(op, pre, post, ...) \ - ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ - VEXTRA_CLOBBERS, \ - pre, post, ##__VA_ARGS__) +#define __PVOP_CALL(rettype, op, ...) \ + ____PVOP_CALL(PVOP_RETVAL(rettype), op, CLBR_ANY, \ + PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, ##__VA_ARGS__) + +#define __PVOP_ALT_CALL(rettype, op, alt, cond, ...) \ + ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, CLBR_ANY,\ + PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, \ + ##__VA_ARGS__) + +#define __PVOP_CALLEESAVE(rettype, op, ...) \ + ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, CLBR_RET_REG, \ + PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) + +#define __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, ...) \ + ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op.func, alt, cond, \ + CLBR_RET_REG, PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) + + +#define __PVOP_VCALL(op, ...) \ + (void)____PVOP_CALL(, op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ + VEXTRA_CLOBBERS, ##__VA_ARGS__) + +#define __PVOP_ALT_VCALL(op, alt, cond, ...) \ + (void)____PVOP_ALT_CALL(, op, alt, cond, CLBR_ANY, \ + PVOP_VCALL_CLOBBERS, VEXTRA_CLOBBERS, \ + ##__VA_ARGS__) -#define __PVOP_VCALLEESAVE(op, pre, post, ...) \ - ____PVOP_VCALL(op.func, CLBR_RET_REG, \ - PVOP_VCALLEE_CLOBBERS, , \ - pre, post, ##__VA_ARGS__) +#define __PVOP_VCALLEESAVE(op, ...) \ + (void)____PVOP_CALL(, op.func, CLBR_RET_REG, \ + PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__) +#define __PVOP_ALT_VCALLEESAVE(op, alt, cond, ...) 
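Editorial note on the PVOP_RETVAL()/____PVOP_CALL() rework above: the return value is now produced by masking the raw register value according to sizeof(rettype), and the old 32-bit split-register (edx:eax) path is gone behind a BUILD_BUG_ON. The stand-alone program below reproduces only that masking step with explicit sizes; it assumes an LP64 target so unsigned long is 64 bits wide.

#include <stddef.h>
#include <stdio.h>

/* Same mask selection as PVOP_RETVAL(), driven by an explicit size. */
static unsigned long pvop_retval_model(size_t size, unsigned long raw_eax)
{
	unsigned long mask = ~0UL;

	switch (size) {
	case 1: mask = 0xffUL; break;
	case 2: mask = 0xffffUL; break;
	case 4: mask = 0xffffffffUL; break;
	default: break;			/* 8 bytes: keep the whole register */
	}
	return raw_eax & mask;
}

int main(void)
{
	unsigned long raw = 0x1122334455667788UL;	/* assumes 64-bit long */

	printf("u8 : %#lx\n", pvop_retval_model(1, raw));
	printf("u16: %#lx\n", pvop_retval_model(2, raw));
	printf("u32: %#lx\n", pvop_retval_model(4, raw));
	printf("u64: %#lx\n", pvop_retval_model(8, raw));
	return 0;
}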
\ + (void)____PVOP_ALT_CALL(, op.func, alt, cond, CLBR_RET_REG, \ + PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__) #define PVOP_CALL0(rettype, op) \ - __PVOP_CALL(rettype, op, "", "") + __PVOP_CALL(rettype, op) #define PVOP_VCALL0(op) \ - __PVOP_VCALL(op, "", "") + __PVOP_VCALL(op) +#define PVOP_ALT_CALL0(rettype, op, alt, cond) \ + __PVOP_ALT_CALL(rettype, op, alt, cond) +#define PVOP_ALT_VCALL0(op, alt, cond) \ + __PVOP_ALT_VCALL(op, alt, cond) #define PVOP_CALLEE0(rettype, op) \ - __PVOP_CALLEESAVE(rettype, op, "", "") + __PVOP_CALLEESAVE(rettype, op) #define PVOP_VCALLEE0(op) \ - __PVOP_VCALLEESAVE(op, "", "") + __PVOP_VCALLEESAVE(op) +#define PVOP_ALT_CALLEE0(rettype, op, alt, cond) \ + __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond) +#define PVOP_ALT_VCALLEE0(op, alt, cond) \ + __PVOP_ALT_VCALLEESAVE(op, alt, cond) #define PVOP_CALL1(rettype, op, arg1) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) + __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1)) #define PVOP_VCALL1(op, arg1) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1)) + __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1)) +#define PVOP_ALT_VCALL1(op, arg1, alt, cond) \ + __PVOP_ALT_VCALL(op, alt, cond, PVOP_CALL_ARG1(arg1)) #define PVOP_CALLEE1(rettype, op, arg1) \ - __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) + __PVOP_CALLEESAVE(rettype, op, PVOP_CALL_ARG1(arg1)) #define PVOP_VCALLEE1(op, arg1) \ - __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1)) + __PVOP_VCALLEESAVE(op, PVOP_CALL_ARG1(arg1)) +#define PVOP_ALT_CALLEE1(rettype, op, arg1, alt, cond) \ + __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, PVOP_CALL_ARG1(arg1)) +#define PVOP_ALT_VCALLEE1(op, arg1, alt, cond) \ + __PVOP_ALT_VCALLEESAVE(op, alt, cond, PVOP_CALL_ARG1(arg1)) #define PVOP_CALL2(rettype, op, arg1, arg2) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) + __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2)) #define PVOP_VCALL2(op, arg1, arg2) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) - -#define PVOP_CALLEE2(rettype, op, arg1, arg2) \ - __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) -#define PVOP_VCALLEE2(op, arg1, arg2) \ - __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) - + __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2)) #define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1), \ PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) #define PVOP_VCALL3(op, arg1, arg2, arg3) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ + __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), \ PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) -/* This is the only difference in x86_64. 
We can make it much simpler */ -#ifdef CONFIG_X86_32 #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ __PVOP_CALL(rettype, op, \ - "push %[_arg4];", "lea 4(%%esp),%%esp;", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4))) -#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ - __PVOP_VCALL(op, \ - "push %[_arg4];", "lea 4(%%esp),%%esp;", \ - "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ - "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) -#else -#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ - __PVOP_CALL(rettype, op, "", "", \ PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ - __PVOP_VCALL(op, "", "", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -#endif /* Lazy mode for batching updates / context switch */ enum paravirt_lazy_mode { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a02c67291cfc..b1099f2d9800 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1244,7 +1244,7 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * - * dst - pointer to pgd range anwhere on a pgd page + * dst - pointer to pgd range anywhere on a pgd page * src - "" * count - the number of pgds to copy. * diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f1b9ed5efaa9..185142b84ebe 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -314,11 +314,6 @@ struct x86_hw_tss { struct x86_hw_tss { u32 reserved1; u64 sp0; - - /* - * We store cpu_current_top_of_stack in sp1 so it's always accessible. - * Linux does not use ring 1, so sp1 is not otherwise needed. - */ u64 sp1; /* @@ -426,12 +421,7 @@ struct irq_stack { char stack[IRQ_STACK_SIZE]; } __aligned(IRQ_STACK_SIZE); -#ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); -#else -/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */ -#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 -#endif #ifdef CONFIG_X86_64 struct fixed_percpu_data { @@ -527,7 +517,7 @@ struct thread_struct { struct io_bitmap *io_bitmap; /* - * IOPL. Priviledge level dependent I/O permission which is + * IOPL. Privilege level dependent I/O permission which is * emulated via the I/O bitmap to prevent user space from disabling * interrupts. */ diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index b6a9d51d1d79..8c5d1910a848 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -4,6 +4,8 @@ #include <asm/ldt.h> +struct task_struct; + /* misc architecture specific prototypes */ void syscall_init(void); diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 4352f08bfbb5..43fa081a1adb 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -8,8 +8,8 @@ /* * The set_memory_* API can be used to change various attributes of a virtual * address range. 
The attributes include: - * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack - * Executability : eXeutable, NoteXecutable + * Cacheability : UnCached, WriteCombining, WriteThrough, WriteBack + * Executability : eXecutable, NoteXecutable * Read/Write : ReadOnly, ReadWrite * Presence : NotPresent * Encryption : Encrypted, Decrypted diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 389d851a02c4..a12458a7a8d4 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -130,11 +130,6 @@ void *extend_brk(size_t size, size_t align); : : "i" (sz)); \ } -/* Helper for reserving space for arrays of things */ -#define RESERVE_BRK_ARRAY(type, name, entries) \ - type *name; \ - RESERVE_BRK(name, sizeof(type) * entries) - extern void probe_roms(void); #ifdef __i386__ diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/include/asm/sgx.h index dd7602c44c72..9c31e0ebc55b 100644 --- a/arch/x86/kernel/cpu/sgx/arch.h +++ b/arch/x86/include/asm/sgx.h @@ -2,15 +2,20 @@ /** * Copyright(c) 2016-20 Intel Corporation. * - * Contains data structures defined by the SGX architecture. Data structures - * defined by the Linux software stack should not be placed here. + * Intel Software Guard Extensions (SGX) support. */ -#ifndef _ASM_X86_SGX_ARCH_H -#define _ASM_X86_SGX_ARCH_H +#ifndef _ASM_X86_SGX_H +#define _ASM_X86_SGX_H #include <linux/bits.h> #include <linux/types.h> +/* + * This file contains both data structures defined by SGX architecture and Linux + * defined software data structures and functions. The two should not be mixed + * together for better readibility. The architectural definitions come first. + */ + /* The SGX specific CPUID function. */ #define SGX_CPUID 0x12 /* EPC enumeration. */ @@ -22,16 +27,36 @@ /* The bitmask for the EPC section type. */ #define SGX_CPUID_EPC_MASK GENMASK(3, 0) +enum sgx_encls_function { + ECREATE = 0x00, + EADD = 0x01, + EINIT = 0x02, + EREMOVE = 0x03, + EDGBRD = 0x04, + EDGBWR = 0x05, + EEXTEND = 0x06, + ELDU = 0x08, + EBLOCK = 0x09, + EPA = 0x0A, + EWB = 0x0B, + ETRACK = 0x0C, + EAUG = 0x0D, + EMODPR = 0x0E, + EMODT = 0x0F, +}; + /** * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not * been completed yet. + * %SGX_CHILD_PRESENT SECS has child pages present in the EPC. * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's * public key does not match IA32_SGXLEPUBKEYHASH. * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received */ enum sgx_return_code { SGX_NOT_TRACKED = 11, + SGX_CHILD_PRESENT = 13, SGX_INVALID_EINITTOKEN = 16, SGX_UNMASKED_EVENT = 128, }; @@ -271,7 +296,7 @@ struct sgx_pcmd { * @header1: constant byte string * @vendor: must be either 0x0000 or 0x8086 * @date: YYYYMMDD in BCD - * @header2: costant byte string + * @header2: constant byte string * @swdefined: software defined value */ struct sgx_sigstruct_header { @@ -335,4 +360,19 @@ struct sgx_sigstruct { #define SGX_LAUNCH_TOKEN_SIZE 304 -#endif /* _ASM_X86_SGX_ARCH_H */ +/* + * Do not put any hardware-defined SGX structure representations below this + * comment! 
+ */ + +#ifdef CONFIG_X86_SGX_KVM +int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, + int *trapnr); +int sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs, u64 *lepubkeyhash, int *trapnr); +#endif + +int sgx_set_attribute(unsigned long *allowed_attributes, + unsigned int attribute_fd); + +#endif /* _ASM_X86_SGX_H */ diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 0bc9b0895f33..d17b39893b79 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -11,6 +11,7 @@ #include <asm/nops.h> #include <asm/cpufeatures.h> +#include <asm/alternative.h> /* "Raw" instruction opcodes */ #define __ASM_CLAC ".byte 0x0f,0x01,0xca" @@ -18,8 +19,6 @@ #ifdef __ASSEMBLY__ -#include <asm/alternative-asm.h> - #ifdef CONFIG_X86_SMAP #define ASM_CLAC \ @@ -37,8 +36,6 @@ #else /* __ASSEMBLY__ */ -#include <asm/alternative.h> - #ifdef CONFIG_X86_SMAP static __always_inline void clac(void) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index c0538f82c9a2..630ff08532be 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -132,6 +132,7 @@ void native_play_dead(void); void play_dead_common(void); void wbinvd_on_cpu(int cpu); int wbinvd_on_all_cpus(void); +void cond_wakeup_cpu0(void); void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 9f69cc497f4b..b5f0d2ff47e4 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -71,12 +71,7 @@ static inline void update_task_stack(struct task_struct *task) else this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); #else - /* - * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That - * doesn't work on x86-32 because sp1 and - * cpu_current_top_of_stack have different values (because of - * the non-zero stack-padding on 32bit). - */ + /* Xen PV enters the kernel on the thread stack. */ if (static_cpu_has(X86_FEATURE_XENPV)) load_sp0(task_top_of_stack(task)); #endif diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h index a84333adeef2..80c08c7d5e72 100644 --- a/arch/x86/include/asm/syscall_wrapper.h +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -80,6 +80,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); } #define __COND_SYSCALL(abi, name) \ + __weak long __##abi##_##name(const struct pt_regs *__unused); \ __weak long __##abi##_##name(const struct pt_regs *__unused) \ { \ return sys_ni_syscall(); \ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 06b740bae431..de406d93b515 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -197,13 +197,7 @@ static inline int arch_within_stack_frames(const void * const stack, #endif } -#else /* !__ASSEMBLY__ */ - -#ifdef CONFIG_X86_64 -# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1) -#endif - -#endif +#endif /* !__ASSEMBLY__ */ /* * Thread-synchronous status. diff --git a/arch/x86/include/asm/uv/uv_geo.h b/arch/x86/include/asm/uv/uv_geo.h index f241451035fb..027a9258dbca 100644 --- a/arch/x86/include/asm/uv/uv_geo.h +++ b/arch/x86/include/asm/uv/uv_geo.h @@ -10,7 +10,7 @@ #ifndef _ASM_UV_GEO_H #define _ASM_UV_GEO_H -/* Type declaractions */ +/* Type declarations */ /* Size of a geoid_s structure (must be before decl. 
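Editorial note on the __COND_SYSCALL() hunk above: a prototype is added immediately before the __weak stub definition, presumably so that -Wmissing-prototypes stays quiet for symbols that are only referenced through the generated syscall tables. The stand-alone GCC/Clang illustration below uses a hypothetical symbol name and plain user-space types rather than the real pt_regs wrapper.

#include <errno.h>
#include <stdio.h>

/* Prototype first, as the reworked __COND_SYSCALL() now emits ... */
long __x64_sys_example(const void *regs);

/* ... then the weak stub that a real definition elsewhere would override. */
__attribute__((weak)) long __x64_sys_example(const void *regs)
{
	(void)regs;
	return -ENOSYS;		/* mirrors sys_ni_syscall() */
}

int main(void)
{
	printf("weak stub returns %ld\n", __x64_sys_example(NULL));
	return 0;
}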
of geoid_u) */ #define GEOID_SIZE 8 diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 5002f52be332..d3e3197917be 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -353,7 +353,7 @@ union uvh_apicid { * * Note there are NO leds on a UV system. This register is only * used by the system controller to monitor system-wide operation. - * There are 64 regs per node. With Nahelem cpus (2 cores per node, + * There are 64 regs per node. With Nehalem cpus (2 cores per node, * 8 cpus per core, 2 threads per cpu) there are 32 cpu threads on * a node. * diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 7068e4bb057d..1a162e559753 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -87,18 +87,6 @@ clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, #endif /* - * The maximum amount of extra memory compared to the base size. The - * main scaling factor is the size of struct page. At extreme ratios - * of base:extra, all the base memory can be filled with page - * structures for the extra memory, leaving no space for anything - * else. - * - * 10x seems like a reasonable balance between scaling flexibility and - * leaving a practically usable system. - */ -#define XEN_EXTRA_MEM_RATIO (10) - -/* * Helper functions to write or read unsigned long values to/from * memory, when the access may fault. */ diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 600a141c8805..b25d3f82c2f3 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -234,7 +234,7 @@ struct boot_params { * handling of page tables. * * These enums should only ever be used by x86 code, and the code that uses - * it should be well contained and compartamentalized. + * it should be well contained and compartmentalized. * * KVM and Xen HVM do not have a subarch as these are expected to follow * standard x86 boot entries. If there is a genuine need for "hypervisor" type @@ -252,7 +252,7 @@ struct boot_params { * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, * which start at asm startup_xen() entry point and later jump to the C * xen_start_kernel() entry point. Both domU and dom0 type of guests are - * currently supportd through this PV boot path. + * currently supported through this PV boot path. * @X86_SUBARCH_INTEL_MID: Used for Intel MID (Mobile Internet Device) platform * systems which do not have the PCI legacy interfaces. * @X86_SUBARCH_CE4100: Used for Intel CE media processor (CE4100) SoC diff --git a/arch/x86/include/uapi/asm/msgbuf.h b/arch/x86/include/uapi/asm/msgbuf.h index b3d0664fadc9..ac83e25bbf37 100644 --- a/arch/x86/include/uapi/asm/msgbuf.h +++ b/arch/x86/include/uapi/asm/msgbuf.h @@ -12,7 +12,7 @@ * The msqid64_ds structure for x86 architecture with x32 ABI. * * On x86-32 and x86-64 we can just use the generic definition, but - * x32 uses the same binary layout as x86_64, which is differnet + * x32 uses the same binary layout as x86_64, which is different * from other 32-bit architectures. 
*/ diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h index 9034f3007c4e..9690d6899ad9 100644 --- a/arch/x86/include/uapi/asm/sgx.h +++ b/arch/x86/include/uapi/asm/sgx.h @@ -152,7 +152,7 @@ struct sgx_enclave_run { * Most exceptions reported on ENCLU, including those that occur within the * enclave, are fixed up and reported synchronously instead of being delivered * via a standard signal. Debug Exceptions (#DB) and Breakpoints (#BP) are - * never fixed up and are always delivered via standard signals. On synchrously + * never fixed up and are always delivered via standard signals. On synchronously * reported exceptions, -EFAULT is returned and details about the exception are * recorded in @run.exception, the optional sgx_enclave_exception struct. * diff --git a/arch/x86/include/uapi/asm/shmbuf.h b/arch/x86/include/uapi/asm/shmbuf.h index f0305dc660c9..fce18eaa070c 100644 --- a/arch/x86/include/uapi/asm/shmbuf.h +++ b/arch/x86/include/uapi/asm/shmbuf.h @@ -9,7 +9,7 @@ * The shmid64_ds structure for x86 architecture with x32 ABI. * * On x86-32 and x86-64 we can just use the generic definition, but - * x32 uses the same binary layout as x86_64, which is differnet + * x32 uses the same binary layout as x86_64, which is different * from other 32-bit architectures. */ diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 844d60eb1882..d0d9b331d3a1 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -139,7 +139,7 @@ struct _fpstate_32 { * The 64-bit FPU frame. (FXSAVE format and later) * * Note1: If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then the structure is - * larger: 'struct _xstate'. Note that 'struct _xstate' embedds + * larger: 'struct _xstate'. Note that 'struct _xstate' embeds * 'struct _fpstate' so that you can always assume the _fpstate portion * exists so that you can check the magic value. * diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 2ddf08351f0b..0704c2a94272 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -35,7 +35,6 @@ KASAN_SANITIZE_sev-es.o := n KCSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD_test_nx.o := y -OBJECT_FILES_NON_STANDARD_paravirt_patch.o := y ifdef CONFIG_FRAME_POINTER OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y @@ -121,7 +120,7 @@ obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o -obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch.o +obj-$(CONFIG_PARAVIRT) += paravirt.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7bdc0239a943..e90310cbe73a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -830,7 +830,7 @@ int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) EXPORT_SYMBOL(acpi_unregister_ioapic); /** - * acpi_ioapic_registered - Check whether IOAPIC assoicatied with @gsi_base + * acpi_ioapic_registered - Check whether IOAPIC associated with @gsi_base * has been registered * @handle: ACPI handle of the IOAPIC device * @gsi_base: GSI base associated with the IOAPIC @@ -1554,10 +1554,18 @@ void __init acpi_boot_table_init(void) /* * Initialize the ACPI boot-time table parser. 
*/ - if (acpi_table_init()) { + if (acpi_locate_initial_tables()) disable_acpi(); - return; - } + else + acpi_reserve_initial_tables(); +} + +int __init early_acpi_boot_init(void) +{ + if (acpi_disabled) + return 1; + + acpi_table_init_complete(); acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); @@ -1570,18 +1578,9 @@ void __init acpi_boot_table_init(void) } else { printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); disable_acpi(); - return; + return 1; } } -} - -int __init early_acpi_boot_init(void) -{ - /* - * If acpi_disabled, bail out - */ - if (acpi_disabled) - return 1; /* * Process the Multiple APIC Description Table (MADT), if present @@ -1657,7 +1656,7 @@ static int __init parse_acpi(char *arg) else if (strcmp(arg, "noirq") == 0) { acpi_noirq_set(); } - /* "acpi=copy_dsdt" copys DSDT */ + /* "acpi=copy_dsdt" copies DSDT */ else if (strcmp(arg, "copy_dsdt") == 0) { acpi_gbl_copy_dsdt_locally = 1; } diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index cc1fea76aab0..3f85fcae450c 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -41,7 +41,7 @@ unsigned long acpi_get_wakeup_address(void) * x86_acpi_enter_sleep_state - enter sleep state * @state: Sleep state to enter. * - * Wrapper around acpi_enter_sleep_state() to be called by assmebly. + * Wrapper around acpi_enter_sleep_state() to be called by assembly. */ asmlinkage acpi_status __visible x86_acpi_enter_sleep_state(u8 state) { diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 56b6865afb2a..d5d8a352eafa 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -115,7 +115,7 @@ SYM_FUNC_START(do_suspend_lowlevel) movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 -#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK +#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK) /* * The suspend path may have poisoned some areas deeper in the stack, * which we now need to unpoison. diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 8d778e46725d..f810e6fececd 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -28,6 +28,7 @@ #include <asm/insn.h> #include <asm/io.h> #include <asm/fixmap.h> +#include <asm/paravirt.h> int __read_mostly alternatives_patched; @@ -388,21 +389,31 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, */ for (a = start; a < end; a++) { int insn_buff_sz = 0; + /* Mask away "NOT" flag bit for feature to test. */ + u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV; instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; BUG_ON(a->instrlen > sizeof(insn_buff)); - BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); - if (!boot_cpu_has(a->cpuid)) { + BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32); + + /* + * Patch if either: + * - feature is present + * - feature not present but ALTINSTR_FLAG_INV is set to mean, + * patch if feature is *NOT* present. + */ + if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) { if (a->padlen > 1) optimize_nops(a, instr); continue; } - DPRINTK("feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d", - a->cpuid >> 5, - a->cpuid & 0x1f, + DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d", + (a->cpuid & ALTINSTR_FLAG_INV) ? "!" 
: "", + feature >> 5, + feature & 0x1f, instr, instr, a->instrlen, replacement, a->replacementlen, a->padlen); @@ -605,7 +616,7 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start, BUG_ON(p->len > MAX_PATCH_LEN); /* prep the buffer with the original instructions */ memcpy(insn_buff, p->instr, p->len); - used = pv_ops.init.patch(p->type, insn_buff, (unsigned long)p->instr, p->len); + used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len); BUG_ON(used > p->len); @@ -723,6 +734,33 @@ void __init alternative_instructions(void) * patching. */ + /* + * Paravirt patching and alternative patching can be combined to + * replace a function call with a short direct code sequence (e.g. + * by setting a constant return value instead of doing that in an + * external function). + * In order to make this work the following sequence is required: + * 1. set (artificial) features depending on used paravirt + * functions which can later influence alternative patching + * 2. apply paravirt patching (generally replacing an indirect + * function call with a direct one) + * 3. apply alternative patching (e.g. replacing a direct function + * call with a custom code sequence) + * Doing paravirt patching after alternative patching would clobber + * the optimization of the custom code with a function call again. + */ + paravirt_set_cap(); + + /* + * First patch paravirt functions, such that we overwrite the indirect + * call with the direct call. + */ + apply_paravirt(__parainstructions, __parainstructions_end); + + /* + * Then patch alternatives, such that those paravirt calls that are in + * alternatives can be overwritten by their immediate fragments. + */ apply_alternatives(__alt_instructions, __alt_instructions_end); #ifdef CONFIG_SMP @@ -741,8 +779,6 @@ void __init alternative_instructions(void) } #endif - apply_paravirt(__parainstructions, __parainstructions_end); - restart_nmi(); alternatives_patched = 1; } diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index b4396952c9a6..09083094eb57 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Shared support code for AMD K8 northbridges and derivates. + * Shared support code for AMD K8 northbridges and derivatives. * Copyright 2006 Andi Kleen, SUSE Labs. */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4f26700f314d..4a39fb429f15 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -619,7 +619,7 @@ static void setup_APIC_timer(void) if (this_cpu_has(X86_FEATURE_ARAT)) { lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; - /* Make LAPIC timer preferrable over percpu HPET */ + /* Make LAPIC timer preferable over percpu HPET */ lapic_clockevent.rating = 150; } @@ -666,7 +666,7 @@ void lapic_update_tsc_freq(void) * In this functions we calibrate APIC bus clocks to the external timer. * * We want to do the calibration only once since we want to have local timer - * irqs syncron. CPUs connected by the same APIC bus have the very same bus + * irqs synchronous. CPUs connected by the same APIC bus have the very same bus * frequency. * * This was previously done by reading the PIT/HPET and waiting for a wrap @@ -1532,7 +1532,7 @@ static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) * Most probably by now the CPU has serviced that pending interrupt and it * might not have done the ack_APIC_irq() because it thought, interrupt * came from i8259 as ExtInt. 
LAPIC did not get EOI so it does not clear - * the ISR bit and cpu thinks it has already serivced the interrupt. Hence + * the ISR bit and cpu thinks it has already serviced the interrupt. Hence * a vector might get locked. It was noticed for timer irq (vector * 0x31). Issue an extra EOI to clear ISR. * @@ -1657,7 +1657,7 @@ static void setup_local_APIC(void) */ /* * Actually disabling the focus CPU check just makes the hang less - * frequent as it makes the interrupt distributon model be more + * frequent as it makes the interrupt distribution model be more * like LRU than MRU (the short-term load is more even across CPUs). */ @@ -1875,7 +1875,7 @@ static __init void try_to_enable_x2apic(int remap_mode) /* * Without IR, all CPUs can be addressed by IOAPIC/MSI only - * in physical mode, and CPUs with an APIC ID that cannnot + * in physical mode, and CPUs with an APIC ID that cannot * be addressed must not be brought online. */ x2apic_set_max_apicid(apic_limit); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 73ff4dd426a8..d5c691a3208b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -928,7 +928,7 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) /* * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger - * and polarity attirbutes. So allow the first user to reprogram the + * and polarity attributes. So allow the first user to reprogram the * pin with real trigger and polarity attributes. */ if (irq < nr_legacy_irqs() && data->count == 1) { @@ -994,7 +994,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, /* * Legacy ISA IRQ has already been allocated, just add pin to - * the pin list assoicated with this IRQ and program the IOAPIC + * the pin list associated with this IRQ and program the IOAPIC * entry. The IOAPIC entry */ if (irq_data && irq_data->parent_data) { @@ -1752,7 +1752,7 @@ static inline void ioapic_finish_move(struct irq_data *data, bool moveit) * with masking the ioapic entry and then polling until * Remote IRR was clear before reprogramming the * ioapic I don't trust the Remote IRR bit to be - * completey accurate. + * completely accurate. * * However there appears to be no other way to plug * this race, so if the Remote IRR bit is not @@ -1830,7 +1830,7 @@ static void ioapic_ack_level(struct irq_data *irq_data) /* * Tail end of clearing remote IRR bit (either by delivering the EOI * message via io-apic EOI register write or simulating it using - * mask+edge followed by unnask+level logic) manually when the + * mask+edge followed by unmask+level logic) manually when the * level triggered interrupt is seen as the edge triggered interrupt * at the cpu. */ diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 3c9c7492252f..6dbdc7c22bb7 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -543,6 +543,14 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1) return -ENOSYS; + /* + * Catch any attempt to touch the cascade interrupt on a PIC + * equipped system. 
+ */ + if (WARN_ON_ONCE(info->flags & X86_IRQ_ALLOC_LEGACY && + virq == PIC_CASCADE_IR)) + return -EINVAL; + for (i = 0; i < nr_irqs; i++) { irqd = irq_domain_get_irq_data(domain, virq + i); BUG_ON(!irqd); @@ -745,6 +753,11 @@ void __init lapic_assign_system_vectors(void) /* Mark the preallocated legacy interrupts */ for (i = 0; i < nr_legacy_irqs(); i++) { + /* + * Don't touch the cascade interrupt. It's unusable + * on PIC equipped machines. See the large comment + * in the IO/APIC code. + */ if (i != PIC_CASCADE_IR) irq_matrix_assign(vector_matrix, ISA_IRQ_VECTOR(i)); } @@ -1045,7 +1058,7 @@ void irq_force_complete_move(struct irq_desc *desc) * * But in case of cpu hotplug this should be a non issue * because if the affinity update happens right before all - * cpus rendevouz in stop machine, there is no way that the + * cpus rendezvous in stop machine, there is no way that the * interrupt can be blocked on the target cpu because all cpus * loops first with interrupts enabled in stop machine, so the * old vector is not yet cleaned up when the interrupt fires. @@ -1054,7 +1067,7 @@ void irq_force_complete_move(struct irq_desc *desc) * of the interrupt on the apic/system bus would be delayed * beyond the point where the target cpu disables interrupts * in stop machine. I doubt that it can happen, but at least - * there is a theroretical chance. Virtualization might be + * there is a theoretical chance. Virtualization might be * able to expose this, but AFAICT the IOAPIC emulation is not * as stupid as the real hardware. * diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 52bc217ca8c3..f5a48e66e4f5 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -369,6 +369,15 @@ static int __init early_get_arch_type(void) return ret; } +/* UV system found, check which APIC MODE BIOS already selected */ +static void __init early_set_apic_mode(void) +{ + if (x2apic_enabled()) + uv_system_type = UV_X2APIC; + else + uv_system_type = UV_LEGACY_APIC; +} + static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) { /* Save OEM_ID passed from ACPI MADT */ @@ -404,11 +413,12 @@ static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) else uv_hubless_system |= 0x8; - /* Copy APIC type */ + /* Copy OEM Table ID */ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n", oem_id, oem_table_id, uv_system_type, uv_hubless_system); + return 0; } @@ -453,6 +463,7 @@ static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) early_set_hub_type(); /* Other UV setup functions */ + early_set_apic_mode(); early_get_pnodeid(); early_get_apic_socketid_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; @@ -472,29 +483,14 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) if (uv_set_system_type(_oem_id, _oem_table_id) == 0) return 0; - /* Save and Decode OEM Table ID */ + /* Save for display of the OEM Table ID */ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); - /* This is the most common hardware variant, x2apic mode */ - if (!strcmp(oem_table_id, "UVX")) - uv_system_type = UV_X2APIC; - - /* Only used for very small systems, usually 1 chassis, legacy mode */ - else if (!strcmp(oem_table_id, "UVL")) - uv_system_type = UV_LEGACY_APIC; - - else - goto badbios; - pr_info("UV: OEM IDs %s/%s, System/UVType %d/0x%x, HUB RevID %d\n", oem_id, oem_table_id, uv_system_type, 
is_uv(UV_ANY), uv_min_hub_revision_id); return 0; - -badbios: - pr_err("UV: UVarchtype:%s not supported\n", uv_archtype); - BUG(); } enum uv_system_type get_uv_system_type(void) @@ -1671,6 +1667,9 @@ static __init int uv_system_init_hubless(void) if (rc < 0) return rc; + /* Set section block size for current node memory */ + set_block_size(); + /* Create user access node */ if (rc >= 0) uv_setup_proc_files(1); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 660270359d39..241dda687eb9 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -94,7 +94,7 @@ * Remove APM dependencies in arch/i386/kernel/process.c * Remove APM dependencies in drivers/char/sysrq.c * Reset time across standby. - * Allow more inititialisation on SMP. + * Allow more initialisation on SMP. * Remove CONFIG_APM_POWER_OFF and make it boot time * configurable (default on). * Make debug only a boot time parameter (remove APM_DEBUG). @@ -766,7 +766,7 @@ static int apm_driver_version(u_short *val) * not cleared until it is acknowledged. * * Additional information is returned in the info pointer, providing - * that APM 1.2 is in use. If no messges are pending the value 0x80 + * that APM 1.2 is in use. If no messages are pending the value 0x80 * is returned (No power management events pending). */ static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) @@ -1025,7 +1025,7 @@ static int apm_enable_power_management(int enable) * status which gives the rough battery status, and current power * source. The bat value returned give an estimate as a percentage * of life and a status value for the battery. The estimated life - * if reported is a lifetime in secodnds/minutes at current powwer + * if reported is a lifetime in seconds/minutes at current power * consumption. */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 60b9f42ce3c1..ecd3fd6993d1 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -61,13 +61,6 @@ static void __used common(void) OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); #endif -#ifdef CONFIG_PARAVIRT_XXL - BLANK(); - OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable); - OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable); - OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret); -#endif - #ifdef CONFIG_XEN BLANK(); OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 3ca9be482a9e..d66af2950e06 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -877,7 +877,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) static int __cache_amd_cpumap_setup(unsigned int cpu, int index, struct _cpuid4_info_regs *base) { - struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cpu_cacheinfo *this_cpu_ci; struct cacheinfo *this_leaf; int i, sibling; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1a4e260b9027..99e1656b326e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -482,7 +482,7 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c) if (pk) pk->pkru = init_pkru_value; /* - * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE + * Setting X86_CR4_PKE will cause the X86_FEATURE_OSPKE * cpuid bit to be set. We need to ensure that we * update that bit in this CPU's "cpu_info". 
*/ @@ -1404,7 +1404,7 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c) * where GS is unused by the prev and next threads. * * Since neither vendor documents this anywhere that I can see, - * detect it directly instead of hardcoding the choice by + * detect it directly instead of hard-coding the choice by * vendor. * * I've designated AMD's behavior as the "bug" because it's @@ -1748,6 +1748,8 @@ DEFINE_PER_CPU(bool, hardirq_stack_inuse); DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); +DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; + /* May not be marked __init: used by software suspend */ void syscall_init(void) { diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 42af31b64c2c..defda61f372d 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -72,6 +72,9 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, + { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, + { X86_FEATURE_SGX1, X86_FEATURE_SGX }, + { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, {} }; diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 1d9b8aaea06c..7227c15299d0 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -291,7 +291,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) mark_tsc_unstable("cyrix 5510/5520 detected"); } #endif - c->x86_cache_size = 16; /* Yep 16K integrated cache thats it */ + c->x86_cache_size = 16; /* Yep 16K integrated cache that's it */ /* GXm supports extended cpuid levels 'ala' AMD */ if (c->cpuid_level == 2) { diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 3b1b01f2b248..da696eb4821a 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -93,15 +93,9 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) } #endif /* CONFIG_X86_VMX_FEATURE_NAMES */ -static void clear_sgx_caps(void) -{ - setup_clear_cpu_cap(X86_FEATURE_SGX); - setup_clear_cpu_cap(X86_FEATURE_SGX_LC); -} - static int __init nosgx(char *str) { - clear_sgx_caps(); + setup_clear_cpu_cap(X86_FEATURE_SGX); return 0; } @@ -110,23 +104,30 @@ early_param("nosgx", nosgx); void init_ia32_feat_ctl(struct cpuinfo_x86 *c) { + bool enable_sgx_kvm = false, enable_sgx_driver = false; bool tboot = tboot_enabled(); - bool enable_sgx; + bool enable_vmx; u64 msr; if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { clear_cpu_cap(c, X86_FEATURE_VMX); - clear_sgx_caps(); + clear_cpu_cap(c, X86_FEATURE_SGX); return; } - /* - * Enable SGX if and only if the kernel supports SGX and Launch Control - * is supported, i.e. disable SGX if the LE hash MSRs can't be written. - */ - enable_sgx = cpu_has(c, X86_FEATURE_SGX) && - cpu_has(c, X86_FEATURE_SGX_LC) && - IS_ENABLED(CONFIG_X86_SGX); + enable_vmx = cpu_has(c, X86_FEATURE_VMX) && + IS_ENABLED(CONFIG_KVM_INTEL); + + if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) { + /* + * Separate out SGX driver enabling from KVM. This allows KVM + * guests to use SGX even if the kernel SGX driver refuses to + * use it. This happens if flexible Launch Control is not + * available. 
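Editorial note on the cpuid_deps[] additions above (SGX_LC depends on SGX, SGX1 on SGX, SGX2 on SGX1): the point of the table is that clearing a feature must also clear everything that depends on it, transitively. The stand-alone sketch below models that walk with a tiny hard-coded table and recursion; the real code walks the same kind of table over X86_FEATURE_* bits.

#include <stdio.h>

enum { FEAT_SGX, FEAT_SGX_LC, FEAT_SGX1, FEAT_SGX2, NFEAT };

static const char *feat_name[NFEAT] = { "SGX", "SGX_LC", "SGX1", "SGX2" };
static int feat_enabled[NFEAT] = { 1, 1, 1, 1 };

/* Each feature's parent, mirroring the new cpuid_deps[] rows; -1 = none. */
static const int feat_parent[NFEAT] = {
	[FEAT_SGX]    = -1,
	[FEAT_SGX_LC] = FEAT_SGX,
	[FEAT_SGX1]   = FEAT_SGX,
	[FEAT_SGX2]   = FEAT_SGX1,
};

static void clear_feature(int feat)
{
	feat_enabled[feat] = 0;
	/* Clear everything that transitively depends on it. */
	for (int f = 0; f < NFEAT; f++)
		if (feat_parent[f] == feat && feat_enabled[f])
			clear_feature(f);
}

int main(void)
{
	clear_feature(FEAT_SGX);	/* e.g. the "nosgx" boot parameter */
	for (int f = 0; f < NFEAT; f++)
		printf("%-6s %s\n", feat_name[f], feat_enabled[f] ? "on" : "off");
	return 0;
}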
+ */ + enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC); + enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM); + } if (msr & FEAT_CTL_LOCKED) goto update_caps; @@ -142,15 +143,18 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector * for the kernel, e.g. using VMX to hide malicious code. */ - if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) { + if (enable_vmx) { msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; if (tboot) msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX; } - if (enable_sgx) - msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED; + if (enable_sgx_kvm || enable_sgx_driver) { + msr |= FEAT_CTL_SGX_ENABLED; + if (enable_sgx_driver) + msr |= FEAT_CTL_SGX_LC_ENABLED; + } wrmsrl(MSR_IA32_FEAT_CTL, msr); @@ -173,10 +177,29 @@ update_caps: } update_sgx: - if (!(msr & FEAT_CTL_SGX_ENABLED) || - !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) { - if (enable_sgx) - pr_err_once("SGX disabled by BIOS\n"); - clear_sgx_caps(); + if (!(msr & FEAT_CTL_SGX_ENABLED)) { + if (enable_sgx_kvm || enable_sgx_driver) + pr_err_once("SGX disabled by BIOS.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX); + return; + } + + /* + * VMX feature bit may be cleared due to being disabled in BIOS, + * in which case SGX virtualization cannot be supported either. + */ + if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) { + pr_err_once("SGX virtualization disabled due to lack of VMX.\n"); + enable_sgx_kvm = 0; + } + + if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) { + if (!enable_sgx_kvm) { + pr_err_once("SGX Launch Control is locked. Disable SGX.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX); + } else { + pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX_LC); + } } } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index e25e52bafeb6..fe0bec14d7ec 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -301,7 +301,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) * The operating system must reload CR3 to cause the TLB to be flushed" * * As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h - * should be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE + * should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE * to be modified. */ if (c->x86 == 5 && c->x86_model == 9) { diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 7962355436da..bf7fe87a7e88 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -529,7 +529,7 @@ static void mce_irq_work_cb(struct irq_work *entry) * Check if the address reported by the CPU is in a format we can parse. * It would be possible to add code for most other cases, but all would * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses up to page granuality for now. + * parser). So only support physical addresses up to page granularity for now. 
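Editorial note on the init_ia32_feat_ctl() rework above: SGX for the native driver (which needs Launch Control) and SGX for KVM guests (which needs VMX plus CONFIG_X86_SGX_KVM) are now enabled independently, and only the driver case sets the LC bit in MSR_IA32_FEAT_CTL. The stand-alone helper below reproduces just that bit selection; the FEAT_CTL_* bit positions are copied from msr-index.h as I read them and matter only for the printout.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX	(1ULL << 2)
#define FEAT_CTL_SGX_LC_ENABLED			(1ULL << 17)
#define FEAT_CTL_SGX_ENABLED			(1ULL << 18)

/* Reproduces only the bit selection from the reworked init_ia32_feat_ctl(). */
static uint64_t feat_ctl_bits(bool enable_vmx, bool enable_sgx_kvm,
			      bool enable_sgx_driver)
{
	uint64_t msr = 0;

	if (enable_vmx)
		msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;

	if (enable_sgx_kvm || enable_sgx_driver) {
		msr |= FEAT_CTL_SGX_ENABLED;
		if (enable_sgx_driver)		/* only the driver needs LC */
			msr |= FEAT_CTL_SGX_LC_ENABLED;
	}
	return msr;
}

int main(void)
{
	printf("KVM-only SGX : %#llx\n",
	       (unsigned long long)feat_ctl_bits(true, true, false));
	printf("driver + KVM : %#llx\n",
	       (unsigned long long)feat_ctl_bits(true, true, true));
	return 0;
}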
*/ int mce_usable_address(struct mce *m) { diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 7b360731fc2d..4e86d97f9653 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -74,6 +74,7 @@ MCE_INJECT_SET(status); MCE_INJECT_SET(misc); MCE_INJECT_SET(addr); MCE_INJECT_SET(synd); +MCE_INJECT_SET(ipid); #define MCE_INJECT_GET(reg) \ static int inj_##reg##_get(void *data, u64 *val) \ @@ -88,11 +89,13 @@ MCE_INJECT_GET(status); MCE_INJECT_GET(misc); MCE_INJECT_GET(addr); MCE_INJECT_GET(synd); +MCE_INJECT_GET(ipid); DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(ipid_fops, inj_ipid_get, inj_ipid_set, "%llx\n"); static void setup_inj_struct(struct mce *m) { @@ -629,6 +632,8 @@ static const char readme_msg[] = "\t is present in hardware. \n" "\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" "\t APIC interrupt handler to handle the error. \n" +"\n" +"ipid:\t IPID (AMD-specific)\n" "\n"; static ssize_t @@ -652,6 +657,7 @@ static struct dfs_node { { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "ipid", .fops = &ipid_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 83df991314c5..55ffa84d30d6 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -142,7 +142,7 @@ static struct severity { MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR) ), MCESEV( - KEEP, "Non signalled machine check", + KEEP, "Non signaled machine check", SER, BITCLR(MCI_STATUS_S) ), diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index b935e1b5f115..6a6318e9590c 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -629,16 +629,16 @@ static ssize_t reload_store(struct device *dev, if (val != 1) return size; - tmp_ret = microcode_ops->request_microcode_fw(bsp, µcode_pdev->dev, true); - if (tmp_ret != UCODE_NEW) - return size; - get_online_cpus(); ret = check_online_cpus(); if (ret) goto put; + tmp_ret = microcode_ops->request_microcode_fw(bsp, µcode_pdev->dev, true); + if (tmp_ret != UCODE_NEW) + goto put; + mutex_lock(µcode_mutex); ret = microcode_reload_late(); mutex_unlock(µcode_mutex); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index e88bc296afca..415bc05d3dc7 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -197,7 +197,7 @@ static unsigned char hv_get_nmi_reason(void) #ifdef CONFIG_X86_LOCAL_APIC /* * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes - * it dificult to process CHANNELMSG_UNLOAD in case of crash. Handle + * it difficult to process CHANNELMSG_UNLOAD in case of crash. Handle * unknown NMI on the first CPU which gets it. 
*/ static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs) @@ -428,7 +428,7 @@ static void __init ms_hyperv_init_platform(void) /* * Hyper-V doesn't provide irq remapping for IO-APIC. To enable x2apic, - * set x2apic destination mode to physcial mode when x2apic is available + * set x2apic destination mode to physical mode when x2apic is available * and Hyper-V IOMMU driver makes sure cpus assigned with IO-APIC irqs * have 8-bit APIC id. */ diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 9231640782fa..0c3b372318b7 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -434,7 +434,7 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, state->range_sizek = sizek - second_sizek; } -/* Mininum size of mtrr block that can take hole: */ +/* Minimum size of mtrr block that can take hole: */ static u64 mtrr_chunk_size __initdata = (256ULL<<20); static int __init parse_mtrr_chunk_size_opt(char *p) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 28c8a23aa42e..a76694bffe86 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -799,7 +799,7 @@ void mtrr_ap_init(void) * * This routine is called in two cases: * - * 1. very earily time of software resume, when there absolutely + * 1. very early time of software resume, when there absolutely * isn't mtrr entry changes; * * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 698bb26aeb6e..23001ae03e82 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -192,7 +192,7 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz * - * Probe by trying to write the first of the L3 cach mask registers + * Probe by trying to write the first of the L3 cache mask registers * and checking that the bits stick. Max CLOSids is always 4 and max cbm length * is always 20 on hsw server parts. The minimum cache bitmask length * allowed for HSW server is always 2 bits. Hardcode all of them. diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 7ac31210e452..dbeaa8409313 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -387,7 +387,7 @@ void mon_event_count(void *info) * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so * that: * - * current bandwdith(cur_bw) < user specified bandwidth(user_bw) + * current bandwidth(cur_bw) < user specified bandwidth(user_bw) * * This uses the MBM counters to measure the bandwidth and MBA throttle * MSRs to control the bandwidth for a particular rdtgrp. It builds on the @@ -397,7 +397,7 @@ void mon_event_count(void *info) * timer. Having 1s interval makes the calculation of bandwidth simpler. * * Although MBA's goal is to restrict the bandwidth to a maximum, there may - * be a need to increase the bandwidth to avoid uncecessarily restricting + * be a need to increase the bandwidth to avoid unnecessarily restricting * the L2 <-> L3 traffic. 
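The software controller described above is, at its core, a once-per-second feedback loop: compare the MBM-measured bandwidth against the user's target and nudge the MBA throttle percentage in the right direction. A heavily compressed, standalone sketch of that idea; the granularity, bounds, and variable names are invented, and the real update path is considerably smarter about bandwidth deltas:

#include <stdio.h>

/* One adjustment step of a hypothetical MBA-style controller. */
static unsigned int adjust_throttle(unsigned int cur_msr_val,
				    unsigned long cur_bw,
				    unsigned long user_bw,
				    unsigned int granularity)
{
	if (cur_bw > user_bw && cur_msr_val > granularity)
		return cur_msr_val - granularity;	/* throttle harder */
	if (cur_bw < user_bw && cur_msr_val < 100)
		return cur_msr_val + granularity;	/* relax the throttle */
	return cur_msr_val;
}

int main(void)
{
	unsigned int msr = 100;		/* percent of bandwidth allowed */
	unsigned long bw[] = { 900, 850, 700, 590, 610 }, user_bw = 600;

	for (int i = 0; i < 5; i++) {
		msr = adjust_throttle(msr, bw[i], user_bw, 10);
		printf("sample %d: cur_bw=%lu -> throttle=%u%%\n", i, bw[i], msr);
	}
	return 0;
}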
* * Since MBA controls the L2 external bandwidth where as MBM measures the @@ -480,7 +480,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) /* * Delta values are updated dynamically package wise for each - * rdtgrp everytime the throttle MSR changes value. + * rdtgrp every time the throttle MSR changes value. * * This is because (1)the increase in bandwidth is not perfectly * linear and only "approximately" linear even when the hardware diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index e916646adc69..935af2ac6b1a 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -1307,7 +1307,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) * If the thread does not get on the CPU for whatever * reason and the process which sets up the region is * interrupted then this will leave the thread in runnable - * state and once it gets on the CPU it will derefence + * state and once it gets on the CPU it will dereference * the cleared, but not freed, plr struct resulting in an * empty pseudo-locking loop. */ @@ -1391,7 +1391,7 @@ out: * group is removed from user space via a "rmdir" from userspace or the * unmount of the resctrl filesystem. On removal the resource group does * not go back to pseudo-locksetup mode before it is removed, instead it is - * removed directly. There is thus assymmetry with the creation where the + * removed directly. There is thus asymmetry with the creation where the * &struct pseudo_lock_region is removed here while it was not created in * rdtgroup_pseudo_lock_create(). * diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index f9190adc52cb..01fd30e7829d 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * User interface for Resource Alloction in Resource Director Technology(RDT) + * User interface for Resource Allocation in Resource Director Technology(RDT) * * Copyright (C) 2016 Intel Corporation * @@ -294,7 +294,7 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, /* * This is safe against resctrl_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call - * from update_closid_rmid() is proteced against __switch_to() because + * from update_closid_rmid() is protected against __switch_to() because * preemption is disabled. */ static void update_cpu_closid_rmid(void *info) @@ -2555,7 +2555,7 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, /* * This creates a directory mon_data which contains the monitored data. * - * mon_data has one directory for each domain whic are named + * mon_data has one directory for each domain which are named * in the format mon_<domain_name>_<domain_id>. 
For ex: A mon_data * with L3 domain looks as below: * ./mon_data: diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 972ec3bfa9c0..21d1f062895a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -36,6 +36,8 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, + { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, + { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile index 91d3dc784a29..9c1656779b2a 100644 --- a/arch/x86/kernel/cpu/sgx/Makefile +++ b/arch/x86/kernel/cpu/sgx/Makefile @@ -3,3 +3,4 @@ obj-y += \ encl.o \ ioctl.o \ main.o +obj-$(CONFIG_X86_SGX_KVM) += virt.o diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 8ce6d8371cfb..aa9b8b868867 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -136,10 +136,6 @@ static const struct file_operations sgx_encl_fops = { .get_unmapped_area = sgx_get_unmapped_area, }; -const struct file_operations sgx_provision_fops = { - .owner = THIS_MODULE, -}; - static struct miscdevice sgx_dev_enclave = { .minor = MISC_DYNAMIC_MINOR, .name = "sgx_enclave", @@ -147,13 +143,6 @@ static struct miscdevice sgx_dev_enclave = { .fops = &sgx_encl_fops, }; -static struct miscdevice sgx_dev_provision = { - .minor = MISC_DYNAMIC_MINOR, - .name = "sgx_provision", - .nodename = "sgx_provision", - .fops = &sgx_provision_fops, -}; - int __init sgx_drv_init(void) { unsigned int eax, ebx, ecx, edx; @@ -187,11 +176,5 @@ int __init sgx_drv_init(void) if (ret) return ret; - ret = misc_register(&sgx_dev_provision); - if (ret) { - misc_deregister(&sgx_dev_enclave); - return ret; - } - return 0; } diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 7449ef33f081..3be203297988 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -7,7 +7,7 @@ #include <linux/shmem_fs.h> #include <linux/suspend.h> #include <linux/sched/mm.h> -#include "arch.h" +#include <asm/sgx.h> #include "encl.h" #include "encls.h" #include "sgx.h" @@ -78,7 +78,7 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, ret = __sgx_encl_eldu(encl_page, epc_page, secs_page); if (ret) { - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); return ERR_PTR(ret); } @@ -404,7 +404,7 @@ void sgx_encl_release(struct kref *ref) if (sgx_unmark_page_reclaimable(entry->epc_page)) continue; - sgx_free_epc_page(entry->epc_page); + sgx_encl_free_epc_page(entry->epc_page); encl->secs_child_cnt--; entry->epc_page = NULL; } @@ -415,7 +415,7 @@ void sgx_encl_release(struct kref *ref) xa_destroy(&encl->page_array); if (!encl->secs_child_cnt && encl->secs.epc_page) { - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; } @@ -423,7 +423,7 @@ void sgx_encl_release(struct kref *ref) va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, list); list_del(&va_page->list); - sgx_free_epc_page(va_page->epc_page); + sgx_encl_free_epc_page(va_page->epc_page); kfree(va_page); } @@ -686,7 +686,7 @@ struct sgx_epc_page *sgx_alloc_va_page(void) 
ret = __epa(sgx_get_epc_virt_addr(epc_page)); if (ret) { WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret); - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); return ERR_PTR(-EFAULT); } @@ -735,3 +735,24 @@ bool sgx_va_page_full(struct sgx_va_page *va_page) return slot == SGX_VA_SLOT_COUNT; } + +/** + * sgx_encl_free_epc_page - free an EPC page assigned to an enclave + * @page: EPC page to be freed + * + * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and + * only upon success, it puts the page back to free page list. Otherwise, it + * gives a WARNING to indicate page is leaked. + */ +void sgx_encl_free_epc_page(struct sgx_epc_page *page) +{ + int ret; + + WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); + + ret = __eremove(sgx_get_epc_virt_addr(page)); + if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret)) + return; + + sgx_free_epc_page(page); +} diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index d8d30ccbef4c..6e74f85b6264 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -115,5 +115,6 @@ struct sgx_epc_page *sgx_alloc_va_page(void); unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); bool sgx_va_page_full(struct sgx_va_page *va_page); +void sgx_encl_free_epc_page(struct sgx_epc_page *page); #endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index 443188fe7e70..9b204843b78d 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -11,21 +11,6 @@ #include <asm/traps.h> #include "sgx.h" -enum sgx_encls_function { - ECREATE = 0x00, - EADD = 0x01, - EINIT = 0x02, - EREMOVE = 0x03, - EDGBRD = 0x04, - EDGBWR = 0x05, - EEXTEND = 0x06, - ELDU = 0x08, - EBLOCK = 0x09, - EPA = 0x0A, - EWB = 0x0B, - ETRACK = 0x0C, -}; - /** * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr * @@ -55,6 +40,19 @@ enum sgx_encls_function { } while (0); \ } +/* + * encls_faulted() - Check if an ENCLS leaf faulted given an error code + * @ret: the return value of an ENCLS leaf function call + * + * Return: + * - true: ENCLS leaf faulted. + * - false: Otherwise. + */ +static inline bool encls_faulted(int ret) +{ + return ret & ENCLS_FAULT_FLAG; +} + /** * encls_failed() - Check if an ENCLS function failed * @ret: the return value of an ENCLS function call @@ -65,7 +63,7 @@ enum sgx_encls_function { */ static inline bool encls_failed(int ret) { - if (ret & ENCLS_FAULT_FLAG) + if (encls_faulted(ret)) return ENCLS_TRAPNR(ret) != X86_TRAP_PF; return !!ret; diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 90a5caf76939..83df20e3e633 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -2,6 +2,7 @@ /* Copyright(c) 2016-20 Intel Corporation. 
*/ #include <asm/mman.h> +#include <asm/sgx.h> #include <linux/mman.h> #include <linux/delay.h> #include <linux/file.h> @@ -47,7 +48,7 @@ static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) encl->page_cnt--; if (va_page) { - sgx_free_epc_page(va_page->epc_page); + sgx_encl_free_epc_page(va_page->epc_page); list_del(&va_page->list); kfree(va_page); } @@ -117,7 +118,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) return 0; err_out: - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; err_out_backing: @@ -365,7 +366,7 @@ err_out_unlock: mmap_read_unlock(current->mm); err_out_free: - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); kfree(encl_page); return ret; @@ -495,7 +496,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, void *token) { u64 mrsigner[4]; - int i, j, k; + int i, j; void *addr; int ret; @@ -544,8 +545,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, preempt_disable(); - for (k = 0; k < 4; k++) - wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]); + sgx_update_lepubkeyhash(mrsigner); ret = __einit(sigstruct, token, addr); @@ -568,7 +568,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, } } - if (ret & ENCLS_FAULT_FLAG) { + if (encls_faulted(ret)) { if (encls_failed(ret)) ENCLS_WARN(ret, "EINIT"); @@ -604,7 +604,6 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) { struct sgx_sigstruct *sigstruct; struct sgx_enclave_init init_arg; - struct page *initp_page; void *token; int ret; @@ -615,11 +614,15 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) if (copy_from_user(&init_arg, arg, sizeof(init_arg))) return -EFAULT; - initp_page = alloc_page(GFP_KERNEL); - if (!initp_page) + /* + * 'sigstruct' must be on a page boundary and 'token' on a 512 byte + * boundary. kmalloc() will give this alignment when allocating + * PAGE_SIZE bytes. + */ + sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!sigstruct) return -ENOMEM; - sigstruct = kmap(initp_page); token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2); memset(token, 0, SGX_LAUNCH_TOKEN_SIZE); @@ -645,8 +648,7 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) ret = sgx_encl_init(encl, sigstruct, token); out: - kunmap(initp_page); - __free_page(initp_page); + kfree(sigstruct); return ret; } @@ -665,24 +667,11 @@ out: static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg) { struct sgx_enclave_provision params; - struct file *file; if (copy_from_user(¶ms, arg, sizeof(params))) return -EFAULT; - file = fget(params.fd); - if (!file) - return -EINVAL; - - if (file->f_op != &sgx_provision_fops) { - fput(file); - return -EINVAL; - } - - encl->attributes_mask |= SGX_ATTR_PROVISIONKEY; - - fput(file); - return 0; + return sgx_set_attribute(&encl->attributes_mask, params.fd); } long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 8df81a3ed945..63d3de02bbcc 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -1,14 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-20 Intel Corporation. 
*/ +#include <linux/file.h> #include <linux/freezer.h> #include <linux/highmem.h> #include <linux/kthread.h> +#include <linux/miscdevice.h> #include <linux/pagemap.h> #include <linux/ratelimit.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> +#include <asm/sgx.h> #include "driver.h" #include "encl.h" #include "encls.h" @@ -23,42 +26,58 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); * with sgx_reclaimer_lock acquired. */ static LIST_HEAD(sgx_active_page_list); - static DEFINE_SPINLOCK(sgx_reclaimer_lock); +/* The free page list lock protected variables prepend the lock. */ +static unsigned long sgx_nr_free_pages; + +/* Nodes with one or more EPC sections. */ +static nodemask_t sgx_numa_mask; + +/* + * Array with one list_head for each possible NUMA node. Each + * list contains all the sgx_epc_section's which are on that + * node. + */ +static struct sgx_numa_node *sgx_numa_nodes; + +static LIST_HEAD(sgx_dirty_page_list); + /* - * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS - * pages whose child pages blocked EREMOVE. + * Reset post-kexec EPC pages to the uninitialized state. The pages are removed + * from the input list, and made available for the page allocator. SECS pages + * prepending their children in the input list are left intact. */ -static void sgx_sanitize_section(struct sgx_epc_section *section) +static void __sgx_sanitize_pages(struct list_head *dirty_page_list) { struct sgx_epc_page *page; LIST_HEAD(dirty); int ret; - /* init_laundry_list is thread-local, no need for a lock: */ - while (!list_empty(§ion->init_laundry_list)) { + /* dirty_page_list is thread-local, no need for a lock: */ + while (!list_empty(dirty_page_list)) { if (kthread_should_stop()) return; - /* needed for access to ->page_list: */ - spin_lock(§ion->lock); - - page = list_first_entry(§ion->init_laundry_list, - struct sgx_epc_page, list); + page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); ret = __eremove(sgx_get_epc_virt_addr(page)); - if (!ret) - list_move(&page->list, §ion->page_list); - else + if (!ret) { + /* + * page is now sanitized. Make it available via the SGX + * page allocator: + */ + list_del(&page->list); + sgx_free_epc_page(page); + } else { + /* The page is not yet clean - move to the dirty list. */ list_move_tail(&page->list, &dirty); - - spin_unlock(§ion->lock); + } cond_resched(); } - list_splice(&dirty, §ion->init_laundry_list); + list_splice(&dirty, dirty_page_list); } static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page) @@ -195,10 +214,10 @@ static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) /* * Swap page to the regular memory transformed to the blocked state by using - * EBLOCK, which means that it can no loger be referenced (no new TLB entries). + * EBLOCK, which means that it can no longer be referenced (no new TLB entries). * * The first trial just tries to write the page assuming that some other thread - * has reset the count for threads inside the enlave by using ETRACK, and + * has reset the count for threads inside the enclave by using ETRACK, and * previous thread count has been zeroed out. The second trial calls ETRACK * before EWB. If that fails we kick all the HW threads out, and then do EWB, * which should be guaranteed the succeed. 
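The escalation described in the comment above can be read as a three-step ladder. An illustrative standalone sketch, with stub functions standing in for EWB, ETRACK, and the IPI that kicks hardware threads out of the enclave:

#include <stdio.h>

/* Stubs standing in for the real ENCLS leaves and the IPI kick. */
static int try_ewb(int attempt)		{ return attempt < 3 ? -1 : 0; }
static void etrack(void)		{ puts("ETRACK"); }
static void kick_hw_threads(void)	{ puts("IPI: kick enclave threads"); }

static void reclaim_page(void)
{
	if (!try_ewb(1))		/* 1st try: assume ETRACK was already done */
		return;

	etrack();			/* 2nd try: do our own ETRACK first */
	if (!try_ewb(2))
		return;

	kick_hw_threads();		/* last resort: no thread can still be inside */
	if (try_ewb(3))
		puts("unexpected: EWB failed after kicking threads");
}

int main(void)
{
	reclaim_page();
	return 0;
}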
@@ -278,7 +297,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_ewb(encl->secs.epc_page, &secs_backing); - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; sgx_encl_put_backing(&secs_backing, true); @@ -308,6 +327,7 @@ static void sgx_reclaim_pages(void) struct sgx_epc_section *section; struct sgx_encl_page *encl_page; struct sgx_epc_page *epc_page; + struct sgx_numa_node *node; pgoff_t page_index; int cnt = 0; int ret; @@ -379,50 +399,33 @@ skip: epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; section = &sgx_epc_sections[epc_page->section]; - spin_lock(§ion->lock); - list_add_tail(&epc_page->list, §ion->page_list); - section->free_cnt++; - spin_unlock(§ion->lock); - } -} - -static unsigned long sgx_nr_free_pages(void) -{ - unsigned long cnt = 0; - int i; - - for (i = 0; i < sgx_nr_epc_sections; i++) - cnt += sgx_epc_sections[i].free_cnt; + node = section->node; - return cnt; + spin_lock(&node->lock); + list_add_tail(&epc_page->list, &node->free_page_list); + sgx_nr_free_pages++; + spin_unlock(&node->lock); + } } static bool sgx_should_reclaim(unsigned long watermark) { - return sgx_nr_free_pages() < watermark && - !list_empty(&sgx_active_page_list); + return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list); } static int ksgxd(void *p) { - int i; - set_freezable(); /* * Sanitize pages in order to recover from kexec(). The 2nd pass is * required for SECS pages, whose child pages blocked EREMOVE. */ - for (i = 0; i < sgx_nr_epc_sections; i++) - sgx_sanitize_section(&sgx_epc_sections[i]); - - for (i = 0; i < sgx_nr_epc_sections; i++) { - sgx_sanitize_section(&sgx_epc_sections[i]); + __sgx_sanitize_pages(&sgx_dirty_page_list); + __sgx_sanitize_pages(&sgx_dirty_page_list); - /* Should never happen. */ - if (!list_empty(&sgx_epc_sections[i].init_laundry_list)) - WARN(1, "EPC section %d has unsanitized pages.\n", i); - } + /* sanity check: */ + WARN_ON(!list_empty(&sgx_dirty_page_list)); while (!kthread_should_stop()) { if (try_to_freeze()) @@ -454,45 +457,56 @@ static bool __init sgx_page_reclaimer_init(void) return true; } -static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section) +static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) { - struct sgx_epc_page *page; + struct sgx_numa_node *node = &sgx_numa_nodes[nid]; + struct sgx_epc_page *page = NULL; - spin_lock(§ion->lock); + spin_lock(&node->lock); - if (list_empty(§ion->page_list)) { - spin_unlock(§ion->lock); + if (list_empty(&node->free_page_list)) { + spin_unlock(&node->lock); return NULL; } - page = list_first_entry(§ion->page_list, struct sgx_epc_page, list); + page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); list_del_init(&page->list); - section->free_cnt--; + sgx_nr_free_pages--; + + spin_unlock(&node->lock); - spin_unlock(§ion->lock); return page; } /** * __sgx_alloc_epc_page() - Allocate an EPC page * - * Iterate through EPC sections and borrow a free EPC page to the caller. When a - * page is no longer needed it must be released with sgx_free_epc_page(). + * Iterate through NUMA nodes and reserve ia free EPC page to the caller. Start + * from the NUMA node, where the caller is executing. * * Return: - * an EPC page, - * -errno on error + * - an EPC page: A borrowed EPC pages were available. + * - NULL: Out of EPC pages. 
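The allocation policy documented above, and implemented in __sgx_alloc_epc_page() just below, is local node first, then round-robin over the remaining nodes that have EPC. A standalone sketch with plain arrays in place of the nodemask and per-node free lists (all names here are illustrative):

#include <stdio.h>

#define NR_NODES 4

/* Free-page counts stand in for the per-node free_page_list. */
static int free_pages[NR_NODES]   = { 0, 0, 3, 1 };
static int node_has_epc[NR_NODES] = { 0, 1, 1, 1 };

static int alloc_from_node(int nid)
{
	if (node_has_epc[nid] && free_pages[nid] > 0) {
		free_pages[nid]--;
		return nid;		/* the "page" came from this node */
	}
	return -1;
}

static int alloc_page(int current_nid)
{
	int nid = current_nid;

	if (alloc_from_node(current_nid) >= 0)
		return current_nid;

	/* Fall back to the other nodes, wrapping around like next_node_in(). */
	for (;;) {
		nid = (nid + 1) % NR_NODES;
		if (nid == current_nid)
			return -1;	/* out of EPC everywhere */
		if (alloc_from_node(nid) >= 0)
			return nid;
	}
}

int main(void)
{
	printf("allocated from node %d\n", alloc_page(1));	/* node 1 empty -> 2 */
	return 0;
}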
*/ struct sgx_epc_page *__sgx_alloc_epc_page(void) { - struct sgx_epc_section *section; struct sgx_epc_page *page; - int i; + int nid_of_current = numa_node_id(); + int nid = nid_of_current; - for (i = 0; i < sgx_nr_epc_sections; i++) { - section = &sgx_epc_sections[i]; + if (node_isset(nid_of_current, sgx_numa_mask)) { + page = __sgx_alloc_epc_page_from_node(nid_of_current); + if (page) + return page; + } + + /* Fall back to the non-local NUMA nodes: */ + while (true) { + nid = next_node_in(nid, sgx_numa_mask); + if (nid == nid_of_current) + break; - page = __sgx_alloc_epc_page_from_section(section); + page = __sgx_alloc_epc_page_from_node(nid); if (page) return page; } @@ -598,23 +612,22 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim) * sgx_free_epc_page() - Free an EPC page * @page: an EPC page * - * Call EREMOVE for an EPC page and insert it back to the list of free pages. + * Put the EPC page back to the list of free pages. It's the caller's + * responsibility to make sure that the page is in uninitialized state. In other + * words, do EREMOVE, EWB or whatever operation is necessary before calling + * this function. */ void sgx_free_epc_page(struct sgx_epc_page *page) { struct sgx_epc_section *section = &sgx_epc_sections[page->section]; - int ret; + struct sgx_numa_node *node = section->node; - WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); + spin_lock(&node->lock); - ret = __eremove(sgx_get_epc_virt_addr(page)); - if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret)) - return; + list_add_tail(&page->list, &node->free_page_list); + sgx_nr_free_pages++; - spin_lock(§ion->lock); - list_add_tail(&page->list, §ion->page_list); - section->free_cnt++; - spin_unlock(§ion->lock); + spin_unlock(&node->lock); } static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, @@ -635,18 +648,14 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, } section->phys_addr = phys_addr; - spin_lock_init(§ion->lock); - INIT_LIST_HEAD(§ion->page_list); - INIT_LIST_HEAD(§ion->init_laundry_list); for (i = 0; i < nr_pages; i++) { section->pages[i].section = index; section->pages[i].flags = 0; section->pages[i].owner = NULL; - list_add_tail(§ion->pages[i].list, §ion->init_laundry_list); + list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); } - section->free_cnt = nr_pages; return true; } @@ -665,8 +674,13 @@ static bool __init sgx_page_cache_init(void) { u32 eax, ebx, ecx, edx, type; u64 pa, size; + int nid; int i; + sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL); + if (!sgx_numa_nodes) + return false; + for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) { cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx); @@ -689,6 +703,21 @@ static bool __init sgx_page_cache_init(void) break; } + nid = numa_map_to_online_node(phys_to_target_node(pa)); + if (nid == NUMA_NO_NODE) { + /* The physical address is already printed above. */ + pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n"); + nid = 0; + } + + if (!node_isset(nid, sgx_numa_mask)) { + spin_lock_init(&sgx_numa_nodes[nid].lock); + INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list); + node_set(nid, sgx_numa_mask); + } + + sgx_epc_sections[i].node = &sgx_numa_nodes[nid]; + sgx_nr_epc_sections++; } @@ -700,6 +729,67 @@ static bool __init sgx_page_cache_init(void) return true; } +/* + * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller. 
+ * Bare-metal driver requires to update them to hash of enclave's signer + * before EINIT. KVM needs to update them to guest's virtual MSR values + * before doing EINIT from guest. + */ +void sgx_update_lepubkeyhash(u64 *lepubkeyhash) +{ + int i; + + WARN_ON_ONCE(preemptible()); + + for (i = 0; i < 4; i++) + wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); +} + +const struct file_operations sgx_provision_fops = { + .owner = THIS_MODULE, +}; + +static struct miscdevice sgx_dev_provision = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_provision", + .nodename = "sgx_provision", + .fops = &sgx_provision_fops, +}; + +/** + * sgx_set_attribute() - Update allowed attributes given file descriptor + * @allowed_attributes: Pointer to allowed enclave attributes + * @attribute_fd: File descriptor for specific attribute + * + * Append enclave attribute indicated by file descriptor to allowed + * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by + * /dev/sgx_provision is supported. + * + * Return: + * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes + * -EINVAL: Invalid, or not supported file descriptor + */ +int sgx_set_attribute(unsigned long *allowed_attributes, + unsigned int attribute_fd) +{ + struct file *file; + + file = fget(attribute_fd); + if (!file) + return -EINVAL; + + if (file->f_op != &sgx_provision_fops) { + fput(file); + return -EINVAL; + } + + *allowed_attributes |= SGX_ATTR_PROVISIONKEY; + + fput(file); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_set_attribute); + static int __init sgx_init(void) { int ret; @@ -716,12 +806,28 @@ static int __init sgx_init(void) goto err_page_cache; } - ret = sgx_drv_init(); + ret = misc_register(&sgx_dev_provision); if (ret) goto err_kthread; + /* + * Always try to initialize the native *and* KVM drivers. + * The KVM driver is less picky than the native one and + * can function if the native one is not supported on the + * current system or fails to initialize. + * + * Error out only if both fail to initialize. + */ + ret = sgx_drv_init(); + + if (sgx_vepc_init() && ret) + goto err_provision; + return 0; +err_provision: + misc_deregister(&sgx_dev_provision); + err_kthread: kthread_stop(ksgxd_tsk); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 5fa42d143feb..4628acec0009 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -8,11 +8,15 @@ #include <linux/rwsem.h> #include <linux/types.h> #include <asm/asm.h> -#include "arch.h" +#include <asm/sgx.h> #undef pr_fmt #define pr_fmt(fmt) "sgx: " fmt +#define EREMOVE_ERROR_MESSAGE \ + "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \ + "Refer to Documentation/x86/sgx.rst for more information." + #define SGX_MAX_EPC_SECTIONS 8 #define SGX_EEXTEND_BLOCK_SIZE 256 #define SGX_NR_TO_SCAN 16 @@ -30,28 +34,25 @@ struct sgx_epc_page { }; /* + * Contains the tracking data for NUMA nodes having EPC pages. Most importantly, + * the free page list local to the node is stored here. + */ +struct sgx_numa_node { + struct list_head free_page_list; + spinlock_t lock; +}; + +/* * The firmware can define multiple chunks of EPC to the different areas of the * physical memory e.g. for memory areas of the each node. This structure is * used to store EPC pages for one EPC section and virtual memory area where * the pages have been mapped. - * - * 'lock' must be held before accessing 'page_list' or 'free_cnt'. 
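sgx_set_attribute() above is what ends up behind the enclave provision ioctl (SGX_IOC_ENCLAVE_PROVISION in the uapi header): userspace proves it may request the PROVISIONKEY attribute by passing an open file descriptor for /dev/sgx_provision. A minimal userspace sketch, assuming the uapi <asm/sgx.h> from a kernel carrying this patch and leaving out most error handling:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <asm/sgx.h>	/* SGX_IOC_ENCLAVE_PROVISION, struct sgx_enclave_provision */

int main(void)
{
	struct sgx_enclave_provision params = { 0 };
	int encl_fd, prov_fd;

	encl_fd = open("/dev/sgx_enclave", O_RDWR);
	prov_fd = open("/dev/sgx_provision", O_RDWR);
	if (encl_fd < 0 || prov_fd < 0) {
		perror("open");
		return 1;
	}

	/* Allow this enclave to request SGX_ATTR_PROVISIONKEY. */
	params.fd = prov_fd;
	if (ioctl(encl_fd, SGX_IOC_ENCLAVE_PROVISION, &params)) {
		perror("SGX_IOC_ENCLAVE_PROVISION");
		return 1;
	}

	puts("PROVISIONKEY attribute allowed for this enclave");
	return 0;
}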
*/ struct sgx_epc_section { unsigned long phys_addr; void *virt_addr; struct sgx_epc_page *pages; - - spinlock_t lock; - struct list_head page_list; - unsigned long free_cnt; - - /* - * Pages which need EREMOVE run on them before they can be - * used. Only safe to be accessed in ksgxd and init code. - * Not protected by locks. - */ - struct list_head init_laundry_list; + struct sgx_numa_node *node; }; extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; @@ -83,4 +84,15 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page); int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); +#ifdef CONFIG_X86_SGX_KVM +int __init sgx_vepc_init(void); +#else +static inline int __init sgx_vepc_init(void) +{ + return -ENODEV; +} +#endif + +void sgx_update_lepubkeyhash(u64 *lepubkeyhash); + #endif /* _X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c new file mode 100644 index 000000000000..6ad165a5c0cc --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -0,0 +1,376 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Device driver to expose SGX enclave memory to KVM guests. + * + * Copyright(c) 2021 Intel Corporation. + */ + +#include <linux/miscdevice.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> +#include <linux/slab.h> +#include <linux/xarray.h> +#include <asm/sgx.h> +#include <uapi/asm/sgx.h> + +#include "encls.h" +#include "sgx.h" + +struct sgx_vepc { + struct xarray page_array; + struct mutex lock; +}; + +/* + * Temporary SECS pages that cannot be EREMOVE'd due to having child in other + * virtual EPC instances, and the lock to protect it. + */ +static struct mutex zombie_secs_pages_lock; +static struct list_head zombie_secs_pages; + +static int __sgx_vepc_fault(struct sgx_vepc *vepc, + struct vm_area_struct *vma, unsigned long addr) +{ + struct sgx_epc_page *epc_page; + unsigned long index, pfn; + int ret; + + WARN_ON(!mutex_is_locked(&vepc->lock)); + + /* Calculate index of EPC page in virtual EPC's page_array */ + index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start); + + epc_page = xa_load(&vepc->page_array, index); + if (epc_page) + return 0; + + epc_page = sgx_alloc_epc_page(vepc, false); + if (IS_ERR(epc_page)) + return PTR_ERR(epc_page); + + ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL)); + if (ret) + goto err_free; + + pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page)); + + ret = vmf_insert_pfn(vma, addr, pfn); + if (ret != VM_FAULT_NOPAGE) { + ret = -EFAULT; + goto err_delete; + } + + return 0; + +err_delete: + xa_erase(&vepc->page_array, index); +err_free: + sgx_free_epc_page(epc_page); + return ret; +} + +static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct sgx_vepc *vepc = vma->vm_private_data; + int ret; + + mutex_lock(&vepc->lock); + ret = __sgx_vepc_fault(vepc, vma, vmf->address); + mutex_unlock(&vepc->lock); + + if (!ret) + return VM_FAULT_NOPAGE; + + if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) { + mmap_read_unlock(vma->vm_mm); + return VM_FAULT_RETRY; + } + + return VM_FAULT_SIGBUS; +} + +static const struct vm_operations_struct sgx_vepc_vm_ops = { + .fault = sgx_vepc_fault, +}; + +static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct sgx_vepc *vepc = file->private_data; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma->vm_ops = &sgx_vepc_vm_ops; + /* 
Don't copy VMA in fork() */ + vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY; + vma->vm_private_data = vepc; + + return 0; +} + +static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) +{ + int ret; + + /* + * Take a previously guest-owned EPC page and return it to the + * general EPC page pool. + * + * Guests can not be trusted to have left this page in a good + * state, so run EREMOVE on the page unconditionally. In the + * case that a guest properly EREMOVE'd this page, a superfluous + * EREMOVE is harmless. + */ + ret = __eremove(sgx_get_epc_virt_addr(epc_page)); + if (ret) { + /* + * Only SGX_CHILD_PRESENT is expected, which is because of + * EREMOVE'ing an SECS still with child, in which case it can + * be handled by EREMOVE'ing the SECS again after all pages in + * virtual EPC have been EREMOVE'd. See comments in below in + * sgx_vepc_release(). + * + * The user of virtual EPC (KVM) needs to guarantee there's no + * logical processor is still running in the enclave in guest, + * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be + * handled here. + */ + WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE, + ret, ret); + return ret; + } + + sgx_free_epc_page(epc_page); + + return 0; +} + +static int sgx_vepc_release(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc = file->private_data; + struct sgx_epc_page *epc_page, *tmp, *entry; + unsigned long index; + + LIST_HEAD(secs_pages); + + xa_for_each(&vepc->page_array, index, entry) { + /* + * Remove all normal, child pages. sgx_vepc_free_page() + * will fail if EREMOVE fails, but this is OK and expected on + * SECS pages. Those can only be EREMOVE'd *after* all their + * child pages. Retries below will clean them up. + */ + if (sgx_vepc_free_page(entry)) + continue; + + xa_erase(&vepc->page_array, index); + } + + /* + * Retry EREMOVE'ing pages. This will clean up any SECS pages that + * only had children in this 'epc' area. + */ + xa_for_each(&vepc->page_array, index, entry) { + epc_page = entry; + /* + * An EREMOVE failure here means that the SECS page still + * has children. But, since all children in this 'sgx_vepc' + * have been removed, the SECS page must have a child on + * another instance. + */ + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + + xa_erase(&vepc->page_array, index); + } + + /* + * SECS pages are "pinned" by child pages, and "unpinned" once all + * children have been EREMOVE'd. A child page in this instance + * may have pinned an SECS page encountered in an earlier release(), + * creating a zombie. Since some children were EREMOVE'd above, + * try to EREMOVE all zombies in the hopes that one was unpinned. + */ + mutex_lock(&zombie_secs_pages_lock); + list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) { + /* + * Speculatively remove the page from the list of zombies, + * if the page is successfully EREMOVE'd it will be added to + * the list of free pages. If EREMOVE fails, throw the page + * on the local list, which will be spliced on at the end. 
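An illustrative standalone model of why the two xa_for_each() passes above are sufficient when every child lives in the same instance: EREMOVE on an SECS page only succeeds once its child count reaches zero, and the first pass removes exactly those children. Whatever still fails after the retry is what ends up parked on the zombie list. The data layout below is invented for the demo, not the kernel's:

#include <stdio.h>

#define NR_PAGES 4

/* Toy EPC model: eremove succeeds only once a page has no children left. */
static int children[NR_PAGES] = { 2, 0, 0, 0 };	/* page 0 is the SECS */
static int parent[NR_PAGES]   = { -1, 0, 0, -1 };
static int removed[NR_PAGES];

static int eremove(int p)
{
	if (children[p])
		return -1;			/* SGX_CHILD_PRESENT */
	removed[p] = 1;
	if (parent[p] >= 0)
		children[parent[p]]--;
	return 0;
}

int main(void)
{
	int pass, p, left;

	/* Two passes, like sgx_vepc_release(): children first, then SECS. */
	for (pass = 0; pass < 2; pass++)
		for (p = 0; p < NR_PAGES; p++)
			if (!removed[p] && eremove(p))
				printf("pass %d: page %d still has children\n",
				       pass, p);

	for (left = 0, p = 0; p < NR_PAGES; p++)
		left += !removed[p];
	printf("pages left for the zombie list: %d\n", left);
	return 0;
}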
+ */ + list_del(&epc_page->list); + + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + } + + if (!list_empty(&secs_pages)) + list_splice_tail(&secs_pages, &zombie_secs_pages); + mutex_unlock(&zombie_secs_pages_lock); + + kfree(vepc); + + return 0; +} + +static int sgx_vepc_open(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc; + + vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL); + if (!vepc) + return -ENOMEM; + mutex_init(&vepc->lock); + xa_init(&vepc->page_array); + + file->private_data = vepc; + + return 0; +} + +static const struct file_operations sgx_vepc_fops = { + .owner = THIS_MODULE, + .open = sgx_vepc_open, + .release = sgx_vepc_release, + .mmap = sgx_vepc_mmap, +}; + +static struct miscdevice sgx_vepc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_vepc", + .nodename = "sgx_vepc", + .fops = &sgx_vepc_fops, +}; + +int __init sgx_vepc_init(void) +{ + /* SGX virtualization requires KVM to work */ + if (!cpu_feature_enabled(X86_FEATURE_VMX)) + return -ENODEV; + + INIT_LIST_HEAD(&zombie_secs_pages); + mutex_init(&zombie_secs_pages_lock); + + return misc_register(&sgx_vepc_dev); +} + +/** + * sgx_virt_ecreate() - Run ECREATE on behalf of guest + * @pageinfo: Pointer to PAGEINFO structure + * @secs: Userspace pointer to SECS page + * @trapnr: trap number injected to guest in case of ECREATE error + * + * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose + * of enforcing policies of guest's enclaves, and return the trap number + * which should be injected to guest in case of any ECREATE error. + * + * Return: + * - 0: ECREATE was successful. + * - <0: on error. + */ +int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, + int *trapnr) +{ + int ret; + + /* + * @secs is an untrusted, userspace-provided address. It comes from + * KVM and is assumed to be a valid pointer which points somewhere in + * userspace. This can fault and call SGX or other fault handlers when + * userspace mapping @secs doesn't exist. + * + * Add a WARN() to make sure @secs is already valid userspace pointer + * from caller (KVM), who should already have handled invalid pointer + * case (for instance, made by malicious guest). All other checks, + * such as alignment of @secs, are deferred to ENCLS itself. + */ + if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __ecreate(pageinfo, (void *)secs); + __uaccess_end(); + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + /* ECREATE doesn't return an error code, it faults or succeeds. */ + WARN_ON_ONCE(ret); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_virt_ecreate); + +static int __sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs) +{ + int ret; + + /* + * Make sure all userspace pointers from caller (KVM) are valid. + * All other checks deferred to ENCLS itself. Also see comment + * for @secs in sgx_virt_ecreate(). 
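On the consumer side, the device registered by sgx_vepc_init() above is intentionally simple: a VMM (in practice KVM userspace such as QEMU) opens /dev/sgx_vepc and creates a shared mapping, and sgx_vepc_fault() populates it with EPC pages on demand. A minimal userspace sketch; the size and protection flags are illustrative, and sgx_vepc_mmap() above rejects anything that is not MAP_SHARED:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t size = 16 * 4096;	/* 16 EPC pages, for illustration */
	void *epc;
	int fd;

	fd = open("/dev/sgx_vepc", O_RDWR);
	if (fd < 0) {
		perror("open /dev/sgx_vepc");
		return 1;
	}

	/* Must be MAP_SHARED; the driver rejects private mappings. */
	epc = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (epc == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	printf("virtual EPC mapped at %p\n", epc);
	munmap(epc, size);
	close(fd);
	return 0;
}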
+ */ +#define SGX_EINITTOKEN_SIZE 304 + if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) || + !access_ok(token, SGX_EINITTOKEN_SIZE) || + !access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __einit((void *)sigstruct, (void *)token, (void *)secs); + __uaccess_end(); + + return ret; +} + +/** + * sgx_virt_einit() - Run EINIT on behalf of guest + * @sigstruct: Userspace pointer to SIGSTRUCT structure + * @token: Userspace pointer to EINITTOKEN structure + * @secs: Userspace pointer to SECS page + * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values + * @trapnr: trap number injected to guest in case of EINIT error + * + * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available + * in host, SGX driver may rewrite the hardware values at wish, therefore KVM + * needs to update hardware values to guest's virtual MSR values in order to + * ensure EINIT is executed with expected hardware values. + * + * Return: + * - 0: EINIT was successful. + * - <0: on error. + */ +int sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs, u64 *lepubkeyhash, int *trapnr) +{ + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) { + ret = __sgx_virt_einit(sigstruct, token, secs); + } else { + preempt_disable(); + + sgx_update_lepubkeyhash(lepubkeyhash); + + ret = __sgx_virt_einit(sigstruct, token, secs); + preempt_enable(); + } + + /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */ + if (ret == -EINVAL) + return ret; + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sgx_virt_einit); diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 8678864ce712..132a2de44d2f 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -30,7 +30,7 @@ EXPORT_SYMBOL(__max_die_per_package); #ifdef CONFIG_SMP /* - * Check if given CPUID extended toplogy "leaf" is implemented + * Check if given CPUID extended topology "leaf" is implemented */ static int check_extended_topology_leaf(int leaf) { @@ -44,7 +44,7 @@ static int check_extended_topology_leaf(int leaf) return 0; } /* - * Return best CPUID Extended Toplogy Leaf supported + * Return best CPUID Extended Topology Leaf supported */ static int detect_extended_topology_leaf(struct cpuinfo_x86 *c) { diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index c6ede3b3d302..c04b933f48d3 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -27,6 +27,7 @@ #include <linux/clocksource.h> #include <linux/cpu.h> #include <linux/reboot.h> +#include <linux/static_call.h> #include <asm/div64.h> #include <asm/x86_init.h> #include <asm/hypervisor.h> @@ -336,11 +337,11 @@ static void __init vmware_paravirt_ops_setup(void) vmware_cyc2ns_setup(); if (vmw_sched_clock) - pv_ops.time.sched_clock = vmware_sched_clock; + paravirt_set_sched_clock(vmware_sched_clock); if (vmware_is_stealclock_available()) { has_steal_clock = true; - pv_ops.time.steal_clock = vmware_steal_clock; + static_call_update(pv_steal_clock, vmware_steal_clock); /* We use reboot notifier only to disable steal clock */ register_reboot_notifier(&vmware_pv_reboot_nb); @@ -378,6 +379,8 @@ static void __init vmware_set_capabilities(void) { setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + if (vmware_tsc_khz) + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); if (vmware_hypercall_mode == 
CPUID_VMWARE_FEATURES_ECX_VMCALL) setup_force_cpu_cap(X86_FEATURE_VMCALL); else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index a8f3af257e26..b1deacbeb266 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -337,7 +337,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) struct crash_memmap_data cmd; struct crash_mem *cmem; - cmem = vzalloc(sizeof(struct crash_mem)); + cmem = vzalloc(struct_size(cmem, ranges, 1)); if (!cmem) return -ENOMEM; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 22aad412f965..f74cb7da9557 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -793,7 +793,7 @@ core_initcall(e820__register_nvs_regions); #endif /* - * Allocate the requested number of bytes with the requsted alignment + * Allocate the requested number of bytes with the requested alignment * and return (the physical address) to the caller. Also register this * range in the 'kexec' E820 table as a reserved range. * diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 683749b80ae2..a85c64000218 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -253,7 +253,7 @@ static bool xfeature_enabled(enum xfeature xfeature) static void __init setup_xstate_features(void) { u32 eax, ebx, ecx, edx, i; - /* start at the beginnning of the "extended state" */ + /* start at the beginning of the "extended state" */ unsigned int last_good_offset = offsetof(struct xregs_state, extended_state_area); /* diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 5e9beb77cafd..18be44163a50 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -104,7 +104,7 @@ static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr) static bool __head check_la57_support(unsigned long physaddr) { /* - * 5-level paging is detected and enabled at kernel decomression + * 5-level paging is detected and enabled at kernel decompression * stage. Only check if it has been enabled there. */ if (!(native_read_cr4() & X86_CR4_LA57)) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index ee1a283f8e96..d552f177eca0 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -245,7 +245,7 @@ static const __initconst struct idt_data ist_idts[] = { * after that. * * Note, that X86_64 cannot install the real #PF handler in - * idt_setup_early_traps() because the memory intialization needs the #PF + * idt_setup_early_traps() because the memory initialization needs the #PF * handler from the early_idt_handler_array to initialize the early page * tables. */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 58aa712973ac..e28f6a5d14f1 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -338,7 +338,7 @@ void fixup_irqs(void) irq_migrate_all_off_this_cpu(); /* - * We can remove mdelay() and then send spuriuous interrupts to + * We can remove mdelay() and then send spurious interrupts to * new cpu targets for all the irqs that were handled previously by * this cpu. While it works, I have seen spurious interrupt messages * (nothing wrong but still...). 
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index ff7878df96b4..3a43a2dee658 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -17,7 +17,7 @@ * Updated by: Tom Rini <trini@kernel.crashing.org> * Updated by: Jason Wessel <jason.wessel@windriver.com> * Modified for 386 by Jim Kingdon, Cygnus Support. - * Origianl kgdb, compatibility with 2.1.xx kernel by + * Original kgdb, compatibility with 2.1.xx kernel by * David Grothe <dave@gcom.com> * Integrated into 2.2.5 kernel by Tigran Aivazian <tigran@sco.com> * X86_64 changes from Andi Kleen's patch merged by Jim Houston @@ -642,7 +642,7 @@ void kgdb_arch_late(void) struct perf_event **pevent; /* - * Pre-allocate the hw breakpoint structions in the non-atomic + * Pre-allocate the hw breakpoint instructions in the non-atomic * portion of kgdb because this operation requires mutexs to * complete. */ diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 51c7f5271aee..596de2f6d3a5 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -12,7 +12,7 @@ #include "common.h" -/* Ftrace callback handler for kprobes -- called under preepmt disabled */ +/* Ftrace callback handler for kprobes -- called under preempt disabled */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 78bb0fae3982..172c947240b9 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -650,7 +650,7 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { has_steal_clock = 1; - pv_ops.time.steal_clock = kvm_steal_clock; + static_call_update(pv_steal_clock, kvm_steal_clock); } if (pv_tlb_flush_supported()) { diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1fc0962c89c0..d37ed4e1d033 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -106,7 +106,7 @@ static inline void kvm_sched_clock_init(bool stable) if (!stable) clear_sched_clock_stable(); kvm_sched_clock_offset = kvm_clock_read(); - pv_ops.time.sched_clock = kvm_sched_clock_read; + paravirt_set_sched_clock(kvm_sched_clock_read); pr_info("kvm-clock: using sched offset of %llu cycles", kvm_sched_clock_offset); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index a29a44a98e5b..f01cd9a08155 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -260,7 +260,7 @@ static void set_idt(void *newidt, u16 limit) { struct desc_ptr curidt; - /* x86-64 supports unaliged loads & stores */ + /* x86-64 supports unaligned loads & stores */ curidt.size = limit; curidt.address = (unsigned long)newidt; diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 4f75d0cf6305..9e1ea99ad9df 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -32,3 +32,12 @@ bool pv_is_native_vcpu_is_preempted(void) return pv_ops.lock.vcpu_is_preempted.func == __raw_callee_save___native_vcpu_is_preempted; } + +void __init paravirt_set_cap(void) +{ + if (!pv_is_native_spin_unlock()) + setup_force_cpu_cap(X86_FEATURE_PVUNLOCK); + + if (!pv_is_native_vcpu_is_preempted()) + setup_force_cpu_cap(X86_FEATURE_VCPUPREEMPT); +} diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c60222ab8ab9..d0730264786b 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ 
-14,6 +14,7 @@ #include <linux/highmem.h> #include <linux/kprobes.h> #include <linux/pgtable.h> +#include <linux/static_call.h> #include <asm/bug.h> #include <asm/paravirt.h> @@ -52,7 +53,10 @@ void __init default_banner(void) } /* Undefined instruction for dealing with missing ops pointers. */ -static const unsigned char ud2a[] = { 0x0f, 0x0b }; +static void paravirt_BUG(void) +{ + BUG(); +} struct branch { unsigned char opcode; @@ -85,25 +89,6 @@ u64 notrace _paravirt_ident_64(u64 x) { return x; } - -static unsigned paravirt_patch_jmp(void *insn_buff, const void *target, - unsigned long addr, unsigned len) -{ - struct branch *b = insn_buff; - unsigned long delta = (unsigned long)target - (addr+5); - - if (len < 5) { -#ifdef CONFIG_RETPOLINE - WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr); -#endif - return len; /* call too long for patch site */ - } - - b->opcode = 0xe9; /* jmp */ - b->delta = delta; - - return 5; -} #endif DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); @@ -114,8 +99,8 @@ void __init native_pv_lock_init(void) static_branch_disable(&virt_spin_lock_key); } -unsigned paravirt_patch_default(u8 type, void *insn_buff, - unsigned long addr, unsigned len) +unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, + unsigned int len) { /* * Neat trick to map patch type back to the call within the @@ -125,20 +110,10 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff, unsigned ret; if (opfunc == NULL) - /* If there's no function, patch it with a ud2a (BUG) */ - ret = paravirt_patch_insns(insn_buff, len, ud2a, ud2a+sizeof(ud2a)); + /* If there's no function, patch it with paravirt_BUG() */ + ret = paravirt_patch_call(insn_buff, paravirt_BUG, addr, len); else if (opfunc == _paravirt_nop) ret = 0; - -#ifdef CONFIG_PARAVIRT_XXL - /* identity functions just return their single argument */ - else if (opfunc == _paravirt_ident_64) - ret = paravirt_patch_ident_64(insn_buff, len); - - else if (type == PARAVIRT_PATCH(cpu.iret)) - /* If operation requires a jmp, then jmp */ - ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len); -#endif else /* Otherwise call the function. */ ret = paravirt_patch_call(insn_buff, opfunc, addr, len); @@ -146,19 +121,6 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff, return ret; } -unsigned paravirt_patch_insns(void *insn_buff, unsigned len, - const char *start, const char *end) -{ - unsigned insn_len = end - start; - - /* Alternative instruction is too large for the patch site and we cannot continue: */ - BUG_ON(insn_len > len || start == NULL); - - memcpy(insn_buff, start, insn_len); - - return insn_len; -} - struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; @@ -167,6 +129,14 @@ static u64 native_steal_clock(int cpu) return 0; } +DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); +DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock); + +void paravirt_set_sched_clock(u64 (*func)(void)) +{ + static_call_update(pv_sched_clock, func); +} + /* These are in entry.S */ extern void native_iret(void); @@ -269,13 +239,6 @@ struct pv_info pv_info = { #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) struct paravirt_patch_template pv_ops = { - /* Init ops. */ - .init.patch = native_patch, - - /* Time ops. */ - .time.sched_clock = native_sched_clock, - .time.steal_clock = native_steal_clock, - /* Cpu ops. 
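The DEFINE_STATIC_CALL() lines and paravirt_set_sched_clock() added above are the plumbing that the vmware, kvm, and kvmclock hunks in this patch hook into with static_call_update(). As a reminder of the shape of that API, a hedged kernel-only sketch with made-up names (it builds as kernel code, for example an in-tree module, not as a standalone program):

#include <linux/module.h>
#include <linux/static_call.h>

static u64 my_default_steal_clock(int cpu)
{
	return 0;
}

/* Key plus default target; call sites jump here until someone updates it. */
DEFINE_STATIC_CALL(my_steal_clock, my_default_steal_clock);

static u64 my_hv_steal_clock(int cpu)
{
	return 42;	/* a real backend would read per-cpu steal time */
}

static int __init my_init(void)
{
	u64 before = static_call(my_steal_clock)(0);

	/* Patch every call site to jump straight to the new function. */
	static_call_update(my_steal_clock, my_hv_steal_clock);

	pr_info("steal_clock sketch: before=%llu after=%llu\n",
		before, static_call(my_steal_clock)(0));
	return 0;
}
module_init(my_init);
MODULE_LICENSE("GPL");

Compared with the old pv_ops.time function pointers, call sites using static_call() take a direct branch that gets patched once at update time, which is the point of the conversion in this patch.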
*/ .cpu.io_delay = native_io_delay, @@ -308,8 +271,6 @@ struct paravirt_patch_template pv_ops = { .cpu.load_sp0 = native_load_sp0, - .cpu.iret = native_iret, - #ifdef CONFIG_X86_IOPL_IOPERM .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap, .cpu.update_io_bitmap = native_tss_update_io_bitmap, @@ -414,6 +375,8 @@ struct paravirt_patch_template pv_ops = { NOKPROBE_SYMBOL(native_get_debugreg); NOKPROBE_SYMBOL(native_set_debugreg); NOKPROBE_SYMBOL(native_load_idt); + +void (*paravirt_iret)(void) = native_iret; #endif EXPORT_SYMBOL(pv_ops); diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c deleted file mode 100644 index abd27ec67397..000000000000 --- a/arch/x86/kernel/paravirt_patch.c +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/stringify.h> - -#include <asm/paravirt.h> -#include <asm/asm-offsets.h> - -#define PSTART(d, m) \ - patch_data_##d.m - -#define PEND(d, m) \ - (PSTART(d, m) + sizeof(patch_data_##d.m)) - -#define PATCH(d, m, insn_buff, len) \ - paravirt_patch_insns(insn_buff, len, PSTART(d, m), PEND(d, m)) - -#define PATCH_CASE(ops, m, data, insn_buff, len) \ - case PARAVIRT_PATCH(ops.m): \ - return PATCH(data, ops##_##m, insn_buff, len) - -#ifdef CONFIG_PARAVIRT_XXL -struct patch_xxl { - const unsigned char irq_irq_disable[1]; - const unsigned char irq_irq_enable[1]; - const unsigned char irq_save_fl[2]; - const unsigned char mmu_read_cr2[3]; - const unsigned char mmu_read_cr3[3]; - const unsigned char mmu_write_cr3[3]; - const unsigned char cpu_wbinvd[2]; - const unsigned char mov64[3]; -}; - -static const struct patch_xxl patch_data_xxl = { - .irq_irq_disable = { 0xfa }, // cli - .irq_irq_enable = { 0xfb }, // sti - .irq_save_fl = { 0x9c, 0x58 }, // pushf; pop %[re]ax - .mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax - .mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax - .mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3 - .cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd - .mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax -}; - -unsigned int paravirt_patch_ident_64(void *insn_buff, unsigned int len) -{ - return PATCH(xxl, mov64, insn_buff, len); -} -# endif /* CONFIG_PARAVIRT_XXL */ - -#ifdef CONFIG_PARAVIRT_SPINLOCKS -struct patch_lock { - unsigned char queued_spin_unlock[3]; - unsigned char vcpu_is_preempted[2]; -}; - -static const struct patch_lock patch_data_lock = { - .vcpu_is_preempted = { 0x31, 0xc0 }, // xor %eax, %eax - -# ifdef CONFIG_X86_64 - .queued_spin_unlock = { 0xc6, 0x07, 0x00 }, // movb $0, (%rdi) -# else - .queued_spin_unlock = { 0xc6, 0x00, 0x00 }, // movb $0, (%eax) -# endif -}; -#endif /* CONFIG_PARAVIRT_SPINLOCKS */ - -unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr, - unsigned int len) -{ - switch (type) { - -#ifdef CONFIG_PARAVIRT_XXL - PATCH_CASE(irq, save_fl, xxl, insn_buff, len); - PATCH_CASE(irq, irq_enable, xxl, insn_buff, len); - PATCH_CASE(irq, irq_disable, xxl, insn_buff, len); - - PATCH_CASE(mmu, read_cr2, xxl, insn_buff, len); - PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len); - PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len); - - PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len); -#endif - -#ifdef CONFIG_PARAVIRT_SPINLOCKS - case PARAVIRT_PATCH(lock.queued_spin_unlock): - if (pv_is_native_spin_unlock()) - return PATCH(lock, queued_spin_unlock, insn_buff, len); - break; - - case PARAVIRT_PATCH(lock.vcpu_is_preempted): - if (pv_is_native_vcpu_is_preempted()) - return PATCH(lock, vcpu_is_preempted, insn_buff, len); - break; -#endif - 
default: - break; - } - - return paravirt_patch_default(type, insn_buff, addr, len); -} diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9c214d7085a4..43cbfc84153a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -63,14 +63,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { */ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, - /* - * .sp1 is cpu_current_top_of_stack. The init task never - * runs user code, but cpu_current_top_of_stack should still - * be well defined before the first context switch. - */ +#ifdef CONFIG_X86_32 .sp1 = TOP_OF_INIT_STACK, -#ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, #endif @@ -451,7 +446,7 @@ void speculative_store_bypass_ht_init(void) * First HT sibling to come up on the core. Link shared state of * the first HT sibling to itself. The siblings on the same core * which come up later will see the shared state pointer and link - * themself to the state of this CPU. + * themselves to the state of this CPU. */ st->shared_state = st; } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 11065dc03f5b..eda37df016f0 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -89,7 +89,7 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) /* * Assumption here is that last_value, a global accumulator, always goes * forward. If we are less than that, we should not be much smaller. - * We assume there is an error marging we're inside, and then the correction + * We assume there is an error margin we're inside, and then the correction * does not sacrifice accuracy. * * For reads: global may have changed between test and return, diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 94b33885f8d2..f469153eca8a 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -107,7 +107,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) * - Write protect disabled * - No task switch * - Don't do FP software emulation. - * - Proctected mode enabled + * - Protected mode enabled */ movl %cr0, %eax andl $~(X86_CR0_PG | X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %eax diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index a4d9a261425b..c53271aebb64 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -121,7 +121,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) * - Write protect disabled * - No task switch * - Don't do FP software emulation. - * - Proctected mode enabled + * - Protected mode enabled */ movq %cr0, %rax andq $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d883176ef2ce..69757fac7462 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -65,7 +65,7 @@ RESERVE_BRK(dmi_alloc, 65536); /* * Range of the BSS area. The size of the BSS area is determined - * at link time, with RESERVE_BRK*() facility reserving additional + * at link time, with RESERVE_BRK() facility reserving additional * chunks. */ unsigned long _brk_start = (unsigned long)__brk_base; @@ -633,11 +633,16 @@ static void __init trim_snb_memory(void) printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n"); /* - * Reserve all memory below the 1 MB mark that has not - * already been reserved. 
+ * SandyBridge integrated graphics devices have a bug that prevents + * them from accessing certain memory ranges, namely anything below + * 1M and in the pages listed in bad_pages[] above. + * + * To avoid these pages being ever accessed by SNB gfx devices + * reserve all memory below the 1 MB mark and bad_pages that have + * not already been reserved at boot time. */ memblock_reserve(0, 1<<20); - + for (i = 0; i < ARRAY_SIZE(bad_pages); i++) { if (memblock_reserve(bad_pages[i], PAGE_SIZE)) printk(KERN_WARNING "failed to reserve 0x%08lx\n", @@ -645,18 +650,6 @@ static void __init trim_snb_memory(void) } } -/* - * Here we put platform-specific memory range workarounds, i.e. - * memory known to be corrupt or otherwise in need to be reserved on - * specific platforms. - * - * If this gets used more widely it could use a real dispatch mechanism. - */ -static void __init trim_platform_memory_ranges(void) -{ - trim_snb_memory(); -} - static void __init trim_bios_range(void) { /* @@ -725,11 +718,41 @@ static int __init parse_reservelow(char *p) early_param("reservelow", parse_reservelow); -static void __init trim_low_memory_range(void) +static void __init early_reserve_memory(void) { + /* + * Reserve the memory occupied by the kernel between _text and + * __end_of_kernel_reserve symbols. Any kernel sections after the + * __end_of_kernel_reserve symbol must be explicitly reserved with a + * separate memblock_reserve() or they will be discarded. + */ + memblock_reserve(__pa_symbol(_text), + (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); + + /* + * The first 4Kb of memory is a BIOS owned area, but generally it is + * not listed as such in the E820 table. + * + * Reserve the first memory page and typically some additional + * memory (64KiB by default) since some BIOSes are known to corrupt + * low memory. See the Kconfig help text for X86_RESERVE_LOW. + * + * In addition, make sure page 0 is always reserved because on + * systems with L1TF its contents can be leaked to user processes. + */ memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); + + early_reserve_initrd(); + + if (efi_enabled(EFI_BOOT)) + efi_memblock_x86_reserve_range(); + + memblock_x86_reserve_range_setup_data(); + + reserve_ibft_region(); + reserve_bios_regions(); } - + /* * Dump out kernel offset information on panic. */ @@ -764,29 +787,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) void __init setup_arch(char **cmdline_p) { - /* - * Reserve the memory occupied by the kernel between _text and - * __end_of_kernel_reserve symbols. Any kernel sections after the - * __end_of_kernel_reserve symbol must be explicitly reserved with a - * separate memblock_reserve() or they will be discarded. - */ - memblock_reserve(__pa_symbol(_text), - (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); - - /* - * Make sure page 0 is always reserved because on systems with - * L1TF its contents can be leaked to user processes. - */ - memblock_reserve(0, PAGE_SIZE); - - early_reserve_initrd(); - - /* - * At this point everything still needed from the boot loader - * or BIOS or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. 
- */ - #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); @@ -910,8 +910,18 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); - if (efi_enabled(EFI_BOOT)) - efi_memblock_x86_reserve_range(); + /* + * Do some memory reservations *before* memory is added to + * memblock, so memblock allocations won't overwrite it. + * Do it after early param, so we could get (unlikely) panic from + * serial. + * + * After this point everything still needed from the boot loader or + * firmware or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + early_reserve_memory(); + #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux @@ -938,9 +948,6 @@ void __init setup_arch(char **cmdline_p) x86_report_nx(); - /* after early param, so could get panic from serial */ - memblock_x86_reserve_range_setup_data(); - if (acpi_mps_check()) { #ifdef CONFIG_X86_LOCAL_APIC disable_apic = 1; @@ -1032,14 +1039,12 @@ void __init setup_arch(char **cmdline_p) */ find_smp_config(); - reserve_ibft_region(); - early_alloc_pgt_buf(); /* * Need to conclude brk, before e820__memblock_setup() - * it could use memblock_find_in_range, could overlap with - * brk area. + * it could use memblock_find_in_range, could overlap with + * brk area. */ reserve_brk(); @@ -1054,8 +1059,6 @@ void __init setup_arch(char **cmdline_p) */ sev_setup_arch(); - reserve_bios_regions(); - efi_fake_memmap(); efi_find_mirror(); efi_esrt_init(); @@ -1081,8 +1084,12 @@ void __init setup_arch(char **cmdline_p) reserve_real_mode(); - trim_platform_memory_ranges(); - trim_low_memory_range(); + /* + * Reserving memory causing GPU hangs on Sandy Bridge integrated + * graphics devices should be done after we allocated memory under + * 1M for the real mode trampoline. + */ + trim_snb_memory(); init_mem_mapping(); @@ -1129,6 +1136,8 @@ void __init setup_arch(char **cmdline_p) reserve_initrd(); acpi_table_upgrade(); + /* Look for ACPI tables and reserve memory occupied by them. */ + acpi_boot_table_init(); vsmp_init(); @@ -1136,11 +1145,6 @@ void __init setup_arch(char **cmdline_p) early_platform_quirks(); - /* - * Parse the ACPI tables for possible boot-time SMP configuration. - */ - acpi_boot_table_init(); - early_acpi_boot_init(); initmem_init(); diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c index cdc04d091242..0aa9f13efd57 100644 --- a/arch/x86/kernel/sev-es-shared.c +++ b/arch/x86/kernel/sev-es-shared.c @@ -24,7 +24,7 @@ static bool __init sev_es_check_cpu_features(void) return true; } -static void sev_es_terminate(unsigned int reason) +static void __noreturn sev_es_terminate(unsigned int reason) { u64 val = GHCB_SEV_TERMINATE; @@ -186,7 +186,6 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) * make it accessible to the hypervisor. * * In particular, check for: - * - Hypervisor CPUID bit * - Availability of CPUID leaf 0x8000001f * - SEV CPUID bit. * @@ -194,10 +193,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) * can't be checked here. 
*/ - if ((fn == 1 && !(regs->cx & BIT(31)))) - /* Hypervisor bit */ - goto fail; - else if (fn == 0x80000000 && (regs->ax < 0x8000001f)) + if (fn == 0x80000000 && (regs->ax < 0x8000001f)) /* SEV leaf check */ goto fail; else if ((fn == 0x8000001f && !(regs->ax & BIT(1)))) @@ -210,12 +206,8 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) return; fail: - sev_es_wr_ghcb_msr(GHCB_SEV_TERMINATE); - VMGEXIT(); - - /* Shouldn't get here - if we do halt the machine */ - while (true) - asm volatile("hlt\n"); + /* Terminate the guest */ + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); } static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c index 04a780abb512..26f5479a97a8 100644 --- a/arch/x86/kernel/sev-es.c +++ b/arch/x86/kernel/sev-es.c @@ -137,29 +137,41 @@ static __always_inline bool on_vc_stack(struct pt_regs *regs) } /* - * This function handles the case when an NMI is raised in the #VC exception - * handler entry code. In this case, the IST entry for #VC must be adjusted, so - * that any subsequent #VC exception will not overwrite the stack contents of the - * interrupted #VC handler. + * This function handles the case when an NMI is raised in the #VC + * exception handler entry code, before the #VC handler has switched off + * its IST stack. In this case, the IST entry for #VC must be adjusted, + * so that any nested #VC exception will not overwrite the stack + * contents of the interrupted #VC handler. * * The IST entry is adjusted unconditionally so that it can be also be - * unconditionally adjusted back in sev_es_ist_exit(). Otherwise a nested - * sev_es_ist_exit() call may adjust back the IST entry too early. + * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a + * nested sev_es_ist_exit() call may adjust back the IST entry too + * early. + * + * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run + * on the NMI IST stack, as they are only called from NMI handling code + * right now. */ void noinstr __sev_es_ist_enter(struct pt_regs *regs) { unsigned long old_ist, new_ist; /* Read old IST entry */ - old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); - /* Make room on the IST stack */ + /* + * If NMI happened while on the #VC IST stack, set the new IST + * value below regs->sp, so that the interrupted stack frame is + * not overwritten by subsequent #VC exceptions. + */ if (on_vc_stack(regs)) - new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist); - else - new_ist = old_ist - sizeof(old_ist); + new_ist = regs->sp; - /* Store old IST entry */ + /* + * Reserve additional 8 bytes and store old IST value so this + * adjustment can be unrolled in __sev_es_ist_exit(). + */ + new_ist -= sizeof(old_ist); *(unsigned long *)new_ist = old_ist; /* Set new IST entry */ @@ -277,7 +289,7 @@ static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) return ES_EXCEPTION; } - insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE - res, 1); + insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE, 1); insn_get_length(&ctxt->insn); } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index f306e85a08a6..a06cb107c0e8 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -492,7 +492,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, * SS descriptor, but we do need SS to be valid. 
It's possible * that the old SS is entirely bogus -- this can happen if the * signal we're trying to deliver is #GP or #SS caused by a bad - * SS value. We also have a compatbility issue here: DOSEMU + * SS value. We also have a compatibility issue here: DOSEMU * relies on the contents of the SS register indicating the * SS value at the time of the signal, even though that code in * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index eff4ce3b10da..06db901fabe8 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -67,7 +67,7 @@ * 5AP. symmetric IO mode (normal Linux operation) not affected. * 'noapic' mode has vector 0xf filled out properly. * 6AP. 'noapic' mode might be affected - fixed in later steppings - * 7AP. We do not assume writes to the LVT deassering IRQs + * 7AP. We do not assume writes to the LVT deasserting IRQs * 8AP. We do not enable low power mode (deep sleep) during MP bootup * 9AP. We do not use mixed mode * @@ -204,7 +204,7 @@ static void native_stop_other_cpus(int wait) } /* * Don't wait longer than 10 ms if the caller didn't - * reqeust it. If wait is true, the machine hangs here if + * request it. If wait is true, the machine hangs here if * one or more CPUs do not reach shutdown state. */ timeout = USEC_PER_MSEC * 10; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 02813a7f3a7c..1e2050c4f94a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1407,7 +1407,7 @@ void __init calculate_max_logical_packages(void) int ncpus; /* - * Today neither Intel nor AMD support heterogenous systems so + * Today neither Intel nor AMD support heterogeneous systems so * extrapolate the boot cpu's data to all packages. */ ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); @@ -1659,13 +1659,17 @@ void play_dead_common(void) local_irq_disable(); } -static bool wakeup_cpu0(void) +/** + * cond_wakeup_cpu0 - Wake up CPU0 if needed. + * + * If NMI wants to wake up CPU0, start CPU0. + */ +void cond_wakeup_cpu0(void) { if (smp_processor_id() == 0 && enable_start_cpu0) - return true; - - return false; + start_cpu0(); } +EXPORT_SYMBOL_GPL(cond_wakeup_cpu0); /* * We need to flush the caches before going to sleep, lest we have @@ -1734,11 +1738,8 @@ static inline void mwait_play_dead(void) __monitor(mwait_ptr, 0, 0); mb(); __mwait(eax, 0); - /* - * If NMI wants to wake up CPU0, start CPU0. - */ - if (wakeup_cpu0()) - start_cpu0(); + + cond_wakeup_cpu0(); } } @@ -1749,11 +1750,8 @@ void hlt_play_dead(void) while (1) { native_halt(); - /* - * If NMI wants to wake up CPU0, start CPU0. - */ - if (wakeup_cpu0()) - start_cpu0(); + + cond_wakeup_cpu0(); } } diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 8627fda8d993..15b058eefc4e 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -29,12 +29,6 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, } } -/* - * This function returns an error if it detects any unreliable features of the - * stack. Otherwise it guarantees that the stack trace is reliable. - * - * If the task is not 'current', the caller *must* ensure the task is inactive. 
- */ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task) { diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c index 653b7f617b61..8a56a6d80098 100644 --- a/arch/x86/kernel/sysfb_efi.c +++ b/arch/x86/kernel/sysfb_efi.c @@ -10,7 +10,7 @@ * EFI Quirks * Several EFI systems do not correctly advertise their boot framebuffers. * Hence, we use this static table of known broken machines and fix up the - * information so framebuffer drivers can load corectly. + * information so framebuffer drivers can load correctly. */ #include <linux/dmi.h> diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 4c09ba110204..f9af561c3cd4 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -49,6 +49,30 @@ bool tboot_enabled(void) return tboot != NULL; } +/* noinline to prevent gcc from warning about dereferencing constant fixaddr */ +static noinline __init bool check_tboot_version(void) +{ + if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { + pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr); + return false; + } + + if (tboot->version < 5) { + pr_warn("tboot version is invalid: %u\n", tboot->version); + return false; + } + + pr_info("found shared page at phys addr 0x%llx:\n", + boot_params.tboot_addr); + pr_debug("version: %d\n", tboot->version); + pr_debug("log_addr: 0x%08x\n", tboot->log_addr); + pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); + pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); + pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); + + return true; +} + void __init tboot_probe(void) { /* Look for valid page-aligned address for shared page. */ @@ -66,25 +90,9 @@ void __init tboot_probe(void) /* Map and check for tboot UUID. */ set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); - tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); - if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { - pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr); + tboot = (void *)fix_to_virt(FIX_TBOOT_BASE); + if (!check_tboot_version()) tboot = NULL; - return; - } - if (tboot->version < 5) { - pr_warn("tboot version is invalid: %u\n", tboot->version); - tboot = NULL; - return; - } - - pr_info("found shared page at phys addr 0x%llx:\n", - boot_params.tboot_addr); - pr_debug("version: %d\n", tboot->version); - pr_debug("log_addr: 0x%08x\n", tboot->log_addr); - pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); - pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); - pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); } static pgd_t *tboot_pg_dir; diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index f5477eab5692..bd83748e2bde 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -113,7 +113,7 @@ int arch_register_cpu(int num) * Two known BSP/CPU0 dependencies: Resume from suspend/hibernate * depends on BSP. PIC interrupts depend on BSP. * - * If the BSP depencies are under control, one can tell kernel to + * If the BSP dependencies are under control, one can tell kernel to * enable BSP hotplug. This basically adds a control file and * one can attempt to offline BSP. */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7bb94a6edc04..f577d07fbd43 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -395,7 +395,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) /* * Adjust our frame so that we return straight to the #GP * vector with the expected RSP value. 
This is safe because - * we won't enable interupts or schedule before we invoke + * we won't enable interrupts or schedule before we invoke * general_protection, so nothing will clobber the stack * frame we just set up. * @@ -556,7 +556,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) tsk->thread.trap_nr = X86_TRAP_GP; if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0)) - return; + goto exit; show_signal(tsk, SIGSEGV, "", desc, regs, error_code); force_sig(SIGSEGV); @@ -1061,7 +1061,7 @@ static void math_error(struct pt_regs *regs, int trapnr) goto exit; if (fixup_vdso_exception(regs, trapnr, 0, 0)) - return; + goto exit; force_sig_fault(SIGFPE, si_code, (void __user *)uprobe_get_trap_addr(regs)); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index f70dffc2771f..57ec01192180 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -14,6 +14,7 @@ #include <linux/percpu.h> #include <linux/timex.h> #include <linux/static_key.h> +#include <linux/static_call.h> #include <asm/hpet.h> #include <asm/timer.h> @@ -254,7 +255,7 @@ unsigned long long sched_clock(void) bool using_native_sched_clock(void) { - return pv_ops.time.sched_clock == native_sched_clock; + return static_call_query(pv_sched_clock) == native_sched_clock; } #else unsigned long long @@ -739,7 +740,7 @@ static unsigned long pit_hpet_ptimer_calibrate_cpu(void) * 2) Reference counter. If available we use the HPET or the * PMTIMER as a reference to check the sanity of that value. * We use separate TSC readouts and check inside of the - * reference read for any possible disturbance. We dicard + * reference read for any possible disturbance. We discard * disturbed values here as well. We do that around the PIT * calibration delay loop as we have to wait for a certain * amount of time anyway. @@ -1079,7 +1080,7 @@ static void tsc_resume(struct clocksource *cs) * very small window right after one CPU updated cycle_last under * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which * is smaller than the cycle_last reference value due to a TSC which - * is slighty behind. This delta is nowhere else observable, but in + * is slightly behind. This delta is nowhere else observable, but in * that case it results in a forward time jump in the range of hours * due to the unsigned delta calculation of the time keeping core * code, which is necessary to support wrapping clocksources like pm @@ -1264,7 +1265,7 @@ EXPORT_SYMBOL(convert_art_to_tsc); * corresponding clocksource * @cycles: System counter value * @cs: Clocksource corresponding to system counter value. Used - * by timekeeping code to verify comparibility of two cycle + * by timekeeping code to verify comparability of two cycle * values. */ diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 3d3c761eb74a..50a4515fe0ad 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -472,7 +472,7 @@ retry: /* * Add the result to the previous adjustment value. * - * The adjustement value is slightly off by the overhead of the + * The adjustment value is slightly off by the overhead of the * sync mechanism (observed values are ~200 TSC cycles), but this * really depends on CPU, node distance and frequency. So * compensating for this is hard to get right. 
Experiments show diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index f6225bf22c02..fac1daae7994 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -272,7 +272,7 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, * by whether the operand is a register or a memory location. * If operand is a register, return as many bytes as the operand * size. If operand is memory, return only the two least - * siginificant bytes. + * significant bytes. */ if (X86_MODRM_MOD(insn->modrm.value) == 3) *data_size = insn->opnd_bytes; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index a788d5120d4d..f6b93a35ce14 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -84,6 +84,18 @@ config KVM_INTEL To compile this as a module, choose M here: the module will be called kvm-intel. +config X86_SGX_KVM + bool "Software Guard eXtensions (SGX) Virtualization" + depends on X86_SGX && KVM_INTEL + help + + Enables KVM guests to create SGX enclaves. + + This includes support to expose "raw" unreclaimable enclave memory to + guests via a device node, e.g. /dev/sgx_vepc. + + If unsure, say N. + config KVM_AMD tristate "KVM for AMD processors support" depends on KVM diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 1b4766fe1de2..eafc4d601f25 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-y += -Iarch/x86/kvm +ccflags-y += -I $(srctree)/arch/x86/kvm ccflags-$(CONFIG_KVM_WERROR) += -Werror ifeq ($(CONFIG_FRAME_POINTER),y) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6bd2f8b830e4..c02466a1410b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1033,7 +1033,7 @@ EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); * - Centaur: 0xc0000000 - 0xcfffffff * * The Hypervisor class is further subdivided into sub-classes that each act as - * their own indepdent class associated with a 0x100 byte range. E.g. if Qemu + * their own independent class associated with a 0x100 byte range. E.g. if Qemu * is advertising support for both HyperV and KVM, the resulting Hypervisor * CPUID sub-classes are: * diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f7970ba6219f..cdd2a2b6550e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3222,7 +3222,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, } /* - * Now load segment descriptors. If fault happenes at this stage + * Now load segment descriptors. If fault happens at this stage * it is handled in a context of new task */ ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 8a4de3f12820..d5b72a08e566 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -269,7 +269,7 @@ int kvm_set_routing_entry(struct kvm *kvm, const struct kvm_irq_routing_entry *ue) { /* We can't check irqchip_in_kernel() here as some callers are - * currently inititalizing the irqchip. Other callers should therefore + * currently initializing the irqchip. Other callers should therefore * check kvm_arch_can_set_irq_routing() before calling this function. 
*/ switch (ue->type) { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d75524bc8423..62b1729277ef 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4961,7 +4961,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, /* * No need to care whether allocation memory is successful - * or not since pte prefetch is skiped if it does not have + * or not since pte prefetch is skipped if it does not have * enough objects in the cache. */ mmu_topup_memory_caches(vcpu, true); @@ -5884,6 +5884,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) struct kvm_mmu_page *sp; unsigned int ratio; LIST_HEAD(invalid_list); + bool flush = false; ulong to_zap; rcu_idx = srcu_read_lock(&kvm->srcu); @@ -5905,19 +5906,19 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) lpage_disallowed_link); WARN_ON_ONCE(!sp->lpage_disallowed); if (is_tdp_mmu_page(sp)) { - kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, - sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level)); + flush |= kvm_tdp_mmu_zap_sp(kvm, sp); } else { kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); WARN_ON_ONCE(sp->lpage_disallowed); } if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - kvm_mmu_commit_zap_page(kvm, &invalid_list); + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); cond_resched_rwlock_write(&kvm->mmu_lock); + flush = false; } } - kvm_mmu_commit_zap_page(kvm, &invalid_list); + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 1f6f98c76bdf..360983865398 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -59,7 +59,7 @@ struct kvm_mmu_page { #ifdef CONFIG_X86_64 bool tdp_mmu_page; - /* Used for freeing the page asyncronously if it is a TDP MMU page. */ + /* Used for freeing the page asynchronously if it is a TDP MMU page. */ struct rcu_head rcu_head; #endif }; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 462b1f71c77f..34207b874886 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -86,7 +86,7 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield); + gfn_t start, gfn_t end, bool can_yield, bool flush); void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) { @@ -99,7 +99,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) list_del(&root->link); - zap_gfn_range(kvm, root, 0, max_gfn, false); + zap_gfn_range(kvm, root, 0, max_gfn, false, false); free_page((unsigned long)root->spt); kmem_cache_free(mmu_page_header_cache, root); @@ -404,7 +404,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * If this warning were to trigger it would indicate that there was a * missing MMU notifier or a race with some notifier handler. * A present, leaf SPTE should never be directly replaced with another - * present leaf SPTE pointing to a differnt PFN. A notifier handler + * present leaf SPTE pointing to a different PFN. A notifier handler * should be zapping the SPTE before the main MM's page table is * changed, or the SPTE should be zeroed, and the TLBs flushed by the * thread before replacement. 
@@ -418,7 +418,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, /* * Crash the host to prevent error propagation and guest data - * courruption. + * corruption. */ BUG(); } @@ -529,7 +529,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, /* * No other thread can overwrite the removed SPTE as they * must either wait on the MMU lock or use - * tdp_mmu_set_spte_atomic which will not overrite the + * tdp_mmu_set_spte_atomic which will not overwrite the * special removed SPTE value. No bookkeeping is needed * here since the SPTE is going from non-present * to non-present. @@ -668,20 +668,21 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, * scheduler needs the CPU or there is contention on the MMU lock. If this * function cannot yield, it will not release the MMU lock or reschedule and * the caller must ensure it does not supply too large a GFN range, or the - * operation can cause a soft lockup. + * operation can cause a soft lockup. Note, in some use cases a flush may be + * required by prior actions. Ensure the pending flush is performed prior to + * yielding. */ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield) + gfn_t start, gfn_t end, bool can_yield, bool flush) { struct tdp_iter iter; - bool flush_needed = false; rcu_read_lock(); tdp_root_for_each_pte(iter, root, start, end) { if (can_yield && - tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) { - flush_needed = false; + tdp_mmu_iter_cond_resched(kvm, &iter, flush)) { + flush = false; continue; } @@ -699,11 +700,11 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, continue; tdp_mmu_set_spte(kvm, &iter, 0); - flush_needed = true; + flush = true; } rcu_read_unlock(); - return flush_needed; + return flush; } /* @@ -712,13 +713,14 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. 
*/ -bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end) +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, + bool can_yield) { struct kvm_mmu_page *root; bool flush = false; for_each_tdp_mmu_root_yield_safe(kvm, root) - flush |= zap_gfn_range(kvm, root, start, end, true); + flush = zap_gfn_range(kvm, root, start, end, can_yield, flush); return flush; } @@ -930,7 +932,7 @@ static int zap_gfn_range_hva_wrapper(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, unsigned long unused) { - return zap_gfn_range(kvm, root, start, end, false); + return zap_gfn_range(kvm, root, start, end, false, false); } int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 3b761c111bff..31096ece9b14 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -8,7 +8,29 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); -bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end); +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, + bool can_yield); +static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, + gfn_t end) +{ + return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true); +} +static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level); + + /* + * Don't allow yielding, as the caller may have a flush pending. Note, + * if mmu_lock is held for write, zapping will never yield in this case, + * but explicitly disallow it for safety. The TDP MMU does not yield + * until it has made forward progress (steps sideways), and when zapping + * a single shadow page that it's guaranteed to see (thus the mmu_lock + * requirement), its "step sideways" will always step beyond the bounds + * of the shadow page's gfn range and stop iterating before yielding. + */ + lockdep_assert_held_write(&kvm->mmu_lock); + return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false); +} void kvm_tdp_mmu_zap_all(struct kvm *kvm); int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 7b30bc967af3..67e753edfa22 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -103,7 +103,7 @@ static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu, /* returns general purpose PMC with the specified MSR. Note that it can be * used for both PERFCTRn and EVNTSELn; that is why it accepts base as a - * paramenter to tell them apart. + * parameter to tell them apart. */ static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, u32 base) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 78bdcfac4e40..3e55674098be 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -727,7 +727,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) struct amd_svm_iommu_ir *ir; /** - * In some cases, the existing irte is updaed and re-set, + * In some cases, the existing irte is updated and re-set, * so we need to check here if it's already been * added * to the ir_list. */ @@ -838,7 +838,7 @@ int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * Here, we setup with legacy mode in the following cases: * 1. When cannot target interrupt to a specific vcpu. * 2. Unsetting posted interrupt. - * 3. 
APIC virtialization is disabled for the vcpu. + * 3. APIC virtualization is disabled for the vcpu. * 4. IRQ has incompatible delivery mode (SMI, INIT, etc) */ if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 35891d9a1099..fb204eaa8bb3 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -246,11 +246,18 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control) return true; } -static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) +static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12) { struct kvm_vcpu *vcpu = &svm->vcpu; bool vmcb12_lma; + /* + * FIXME: these should be done after copying the fields, + * to avoid TOC/TOU races. For these save area checks + * the possible damage is limited since kvm_set_cr0 and + * kvm_set_cr4 handle failure; EFER_SVME is an exception + * so it is force-set later in nested_prepare_vmcb_save. + */ if ((vmcb12->save.efer & EFER_SVME) == 0) return false; @@ -271,7 +278,7 @@ static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) return false; - return nested_vmcb_check_controls(&vmcb12->control); + return true; } static void load_nested_vmcb_control(struct vcpu_svm *svm, @@ -396,7 +403,14 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) svm->vmcb->save.gdtr = vmcb12->save.gdtr; svm->vmcb->save.idtr = vmcb12->save.idtr; kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); - svm_set_efer(&svm->vcpu, vmcb12->save.efer); + + /* + * Force-set EFER_SVME even though it is checked earlier on the + * VMCB12, because the guest can flip the bit between the check + * and now. Clearing EFER_SVME would call svm_free_nested. + */ + svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME); + svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2; @@ -468,7 +482,6 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, svm->nested.vmcb12_gpa = vmcb12_gpa; - load_nested_vmcb_control(svm, &vmcb12->control); nested_prepare_vmcb_control(svm); nested_prepare_vmcb_save(svm, vmcb12); @@ -515,7 +528,10 @@ int nested_svm_vmrun(struct vcpu_svm *svm) if (WARN_ON_ONCE(!svm->nested.initialized)) return -EINVAL; - if (!nested_vmcb_checks(svm, vmcb12)) { + load_nested_vmcb_control(svm, &vmcb12->control); + + if (!nested_vmcb_check_save(svm, vmcb12) || + !nested_vmcb_check_controls(&svm->nested.ctl)) { vmcb12->control.exit_code = SVM_EXIT_ERR; vmcb12->control.exit_code_hi = 0; vmcb12->control.exit_info_1 = 0; @@ -1209,6 +1225,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, */ if (!(save->cr0 & X86_CR0_PG)) goto out_free; + if (!(save->efer & EFER_SVME)) + goto out_free; /* * All checks done, we can enter guest mode. 
L1 control fields diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 035da07500e8..fdf587f19c5f 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -98,6 +98,8 @@ static enum index msr_to_index(u32 msr) static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, enum pmu_type type) { + struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); + switch (msr) { case MSR_F15H_PERF_CTL0: case MSR_F15H_PERF_CTL1: @@ -105,6 +107,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, case MSR_F15H_PERF_CTL3: case MSR_F15H_PERF_CTL4: case MSR_F15H_PERF_CTL5: + if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + return NULL; + fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: if (type != PMU_TYPE_EVNTSEL) return NULL; @@ -115,6 +120,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, case MSR_F15H_PERF_CTR3: case MSR_F15H_PERF_CTR4: case MSR_F15H_PERF_CTR5: + if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + return NULL; + fallthrough; case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: if (type != PMU_TYPE_COUNTER) return NULL; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 874ea309279f..2b27a9452403 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2082,7 +2082,7 @@ void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu) hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400); hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - /* PKRU is restored on VMEXIT, save the curent host value */ + /* PKRU is restored on VMEXIT, save the current host value */ hostsa->pkru = read_pkru(); /* MSR_IA32_XSS is restored on VMEXIT, save the currnet host value */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 58a45bb139f8..6dad89248312 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4400,7 +4400,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int i * * This happens because CPU microcode reading instruction bytes * uses a special opcode which attempts to read data using CPL=0 - * priviledges. The microcode reads CS:RIP and if it hits a SMAP + * privileges. The microcode reads CS:RIP and if it hits a SMAP * fault, it gives up and returns no instruction bytes. * * Detection: diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bcca0b80e0d0..1e069aac7410 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3537,7 +3537,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * snapshot restore (migration). * * In this flow, it is assumed that vmcs12 cache was - * trasferred as part of captured nVMX state and should + * transferred as part of captured nVMX state and should * therefore not be read from guest memory (which may not * exist on destination host yet). */ diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 4831bc44ce66..459748680daf 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -10,7 +10,7 @@ #include "vmx.h" /* - * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we + * We maintain a per-CPU linked-list of vCPU, so in wakeup_handler() we * can find which vCPU should be waken up. 
*/ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 32cf8287d4a7..bcbf0d2139e9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1529,7 +1529,7 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) /* * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that - * utilize encodings marked reserved will casue a #GP fault. + * utilize encodings marked reserved will cause a #GP fault. */ value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) && @@ -2761,7 +2761,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); /* - * Update real mode segment cache. It may be not up-to-date if sement + * Update real mode segment cache. It may be not up-to-date if segment * register was written while vcpu was in a guest mode. */ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); @@ -6027,19 +6027,19 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) exit_reason.basic != EXIT_REASON_PML_FULL && exit_reason.basic != EXIT_REASON_APIC_ACCESS && exit_reason.basic != EXIT_REASON_TASK_SWITCH)) { + int ndata = 3; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; - vcpu->run->internal.ndata = 3; vcpu->run->internal.data[0] = vectoring_info; vcpu->run->internal.data[1] = exit_reason.full; vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) { - vcpu->run->internal.ndata++; - vcpu->run->internal.data[3] = + vcpu->run->internal.data[ndata++] = vmcs_read64(GUEST_PHYSICAL_ADDRESS); } - vcpu->run->internal.data[vcpu->run->internal.ndata++] = - vcpu->arch.last_vmentry_cpu; + vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu; + vcpu->run->internal.ndata = ndata; return 0; } @@ -7252,7 +7252,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; - /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabircEn can be set */ + /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fe806e894212..efc7a82ab140 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -156,9 +156,9 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); /* * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables - * adaptive tuning starting from default advancment of 1000ns. '0' disables + * adaptive tuning starting from default advancement of 1000ns. '0' disables * advancement entirely. Any other value is used as-is and disables adaptive - * tuning, i.e. allows priveleged userspace to set an exact advancement time. + * tuning, i.e. allows privileged userspace to set an exact advancement time. */ static int __read_mostly lapic_timer_advance_ns = -1; module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR); @@ -271,8 +271,7 @@ static struct kmem_cache *x86_emulator_cache; * When called, it means the previous get/set msr reached an invalid msr. * Return true if we want to ignore/silent this failed msr access. 
*/ -static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, - u64 data, bool write) +static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write) { const char *op = write ? "wrmsr" : "rdmsr"; @@ -1288,7 +1287,7 @@ static const u32 emulated_msrs_all[] = { MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, MSR_IA32_TSC_ADJUST, - MSR_IA32_TSCDEADLINE, + MSR_IA32_TSC_DEADLINE, MSR_IA32_ARCH_CAPABILITIES, MSR_IA32_PERF_CAPABILITIES, MSR_IA32_MISC_ENABLE, @@ -1373,7 +1372,7 @@ static u64 kvm_get_arch_capabilities(void) /* * If nx_huge_pages is enabled, KVM's shadow paging will ensure that * the nested hypervisor runs with NX huge pages. If it is not, - * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other + * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other * L1 guests, so it need not worry about its own (L2) guests. */ data |= ARCH_CAP_PSCHANGE_MC_NO; @@ -1445,7 +1444,7 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) if (r == KVM_MSR_RET_INVALID) { /* Unconditionally clear the output for simplicity */ *data = 0; - if (kvm_msr_ignored_check(vcpu, index, 0, false)) + if (kvm_msr_ignored_check(index, 0, false)) r = 0; } @@ -1620,7 +1619,7 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, int ret = __kvm_set_msr(vcpu, index, data, host_initiated); if (ret == KVM_MSR_RET_INVALID) - if (kvm_msr_ignored_check(vcpu, index, data, true)) + if (kvm_msr_ignored_check(index, data, true)) ret = 0; return ret; @@ -1658,7 +1657,7 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, if (ret == KVM_MSR_RET_INVALID) { /* Unconditionally clear *data for simplicity */ *data = 0; - if (kvm_msr_ignored_check(vcpu, index, 0, false)) + if (kvm_msr_ignored_check(index, 0, false)) ret = 0; } @@ -1850,7 +1849,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) ret = EXIT_FASTPATH_EXIT_HANDLED; } break; - case MSR_IA32_TSCDEADLINE: + case MSR_IA32_TSC_DEADLINE: data = kvm_read_edx_eax(vcpu); if (!handle_fastpath_set_tscdeadline(vcpu, data)) { kvm_skip_emulated_instruction(vcpu); @@ -2329,7 +2328,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) kvm_vcpu_write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); - spin_lock(&kvm->arch.pvclock_gtod_sync_lock); + spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags); if (!matched) { kvm->arch.nr_vcpus_matched_tsc = 0; } else if (!already_matched) { @@ -2337,7 +2336,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) } kvm_track_tsc_matching(vcpu); - spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags); } static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, @@ -2559,13 +2558,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) int i; struct kvm_vcpu *vcpu; struct kvm_arch *ka = &kvm->arch; + unsigned long flags; kvm_hv_invalidate_tsc_page(kvm); - spin_lock(&ka->pvclock_gtod_sync_lock); kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); pvclock_update_vm_gtod_copy(kvm); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -2573,8 +2575,6 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); - - 
spin_unlock(&ka->pvclock_gtod_sync_lock); #endif } @@ -2582,17 +2582,18 @@ u64 get_kvmclock_ns(struct kvm *kvm) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; + unsigned long flags; u64 ret; - spin_lock(&ka->pvclock_gtod_sync_lock); + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); if (!ka->use_master_clock) { - spin_unlock(&ka->pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); return get_kvmclock_base_ns() + ka->kvmclock_offset; } hv_clock.tsc_timestamp = ka->master_cycle_now; hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - spin_unlock(&ka->pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* both __this_cpu_read() and rdtsc() should be on the same cpu */ get_cpu(); @@ -2686,13 +2687,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * If the host uses TSC clock, then passthrough TSC as stable * to the guest. */ - spin_lock(&ka->pvclock_gtod_sync_lock); + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); use_master_clock = ka->use_master_clock; if (use_master_clock) { host_tsc = ka->master_cycle_now; kernel_ns = ka->master_kernel_ns; } - spin_unlock(&ka->pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); @@ -3086,7 +3087,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_set_apic_base(vcpu, msr_info); case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_write(vcpu, msr, data); - case MSR_IA32_TSCDEADLINE: + case MSR_IA32_TSC_DEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); break; case MSR_IA32_TSC_ADJUST: @@ -3448,7 +3449,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); - case MSR_IA32_TSCDEADLINE: + case MSR_IA32_TSC_DEADLINE: msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); break; case MSR_IA32_TSC_ADJUST: @@ -4024,7 +4025,6 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { struct kvm_host_map map; struct kvm_steal_time *st; - int idx; if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -4032,15 +4032,9 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (vcpu->arch.st.preempted) return; - /* - * Take the srcu lock as memslots will be accessed to check the gfn - * cache generation against the memslots generation. - */ - idx = srcu_read_lock(&vcpu->kvm->srcu); - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, &vcpu->arch.st.cache, true)) - goto out; + return; st = map.hva + offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); @@ -4048,20 +4042,25 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); - -out: - srcu_read_unlock(&vcpu->kvm->srcu, idx); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + int idx; + if (vcpu->preempted && !vcpu->arch.guest_state_protected) vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu); + /* + * Take the srcu lock as memslots will be accessed to check the gfn + * cache generation against the memslots generation. 
+ */ + idx = srcu_read_lock(&vcpu->kvm->srcu); if (kvm_xen_msr_enabled(vcpu->kvm)) kvm_xen_runstate_set_preempted(vcpu); else kvm_steal_time_set_preempted(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); static_call(kvm_x86_vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); @@ -5726,6 +5725,7 @@ set_pit2_out: } #endif case KVM_SET_CLOCK: { + struct kvm_arch *ka = &kvm->arch; struct kvm_clock_data user_ns; u64 now_ns; @@ -5744,8 +5744,22 @@ set_pit2_out: * pvclock_update_vm_gtod_copy(). */ kvm_gen_update_masterclock(kvm); - now_ns = get_kvmclock_ns(kvm); - kvm->arch.kvmclock_offset += user_ns.clock - now_ns; + + /* + * This pairs with kvm_guest_time_update(): when masterclock is + * in use, we use master_kernel_ns + kvmclock_offset to set + * unsigned 'system_time' so if we use get_kvmclock_ns() (which + * is slightly ahead) here we risk going negative on unsigned + * 'system_time' when 'user_ns.clock' is very small. + */ + spin_lock_irq(&ka->pvclock_gtod_sync_lock); + if (kvm->arch.use_master_clock) + now_ns = ka->master_kernel_ns; + else + now_ns = get_kvmclock_base_ns(); + ka->kvmclock_offset = user_ns.clock - now_ns; + spin_unlock_irq(&ka->pvclock_gtod_sync_lock); + kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); break; } @@ -7724,6 +7738,7 @@ static void kvm_hyperv_tsc_notifier(void) struct kvm *kvm; struct kvm_vcpu *vcpu; int cpu; + unsigned long flags; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) @@ -7739,17 +7754,15 @@ static void kvm_hyperv_tsc_notifier(void) list_for_each_entry(kvm, &vm_list, vm_list) { struct kvm_arch *ka = &kvm->arch; - spin_lock(&ka->pvclock_gtod_sync_lock); - + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); pvclock_update_vm_gtod_copy(kvm); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); kvm_for_each_vcpu(cpu, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); kvm_for_each_vcpu(cpu, vcpu, kvm) kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); - - spin_unlock(&ka->pvclock_gtod_sync_lock); } mutex_unlock(&kvm_lock); } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 39eb04887141..9035e34aa156 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -250,7 +250,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); -void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 get_kvmclock_ns(struct kvm *kvm); int kvm_read_guest_virt(struct kvm_vcpu *vcpu, diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index 3b6544111ac9..16bc9130e7a5 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -6,7 +6,7 @@ */ #include <linux/linkage.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> /* if you want SMP support, implement these with real spinlocks */ .macro LOCK reg diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index 1c5c81c16b06..ce6935690766 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -6,7 +6,7 @@ */ #include <linux/linkage.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> .macro read64 reg movl %ebx, %eax diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 2402d4c489d2..db4b4f9197c7 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -3,7 +3,7 @@ #include <linux/linkage.h> #include <asm/cpufeatures.h> 
-#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/export.h> /* diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 77b9b2a3b5c8..57b79c577496 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -11,7 +11,7 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/asm.h> #include <asm/smap.h> #include <asm/export.h> diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index bb0b3fe1e0a0..2bf07e18e38c 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -232,7 +232,7 @@ static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off) * resolve_seg_reg() - obtain segment register index * @insn: Instruction with operands * @regs: Register values as seen when entering kernel mode - * @regoff: Operand offset, in pt_regs, used to deterimine segment register + * @regoff: Operand offset, in pt_regs, used to determine segment register * * Determine the segment register associated with the operands and, if * applicable, prefixes and the instruction pointed by @insn. @@ -517,7 +517,7 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, * @insn: Instruction containing ModRM byte * @regs: Register values as seen when entering kernel mode * @offs1: Offset of the first operand register - * @offs2: Offset of the second opeand register, if applicable + * @offs2: Offset of the second operand register, if applicable * * Obtain the offset, in pt_regs, of the registers indicated by the ModRM byte * in @insn. This function is to be used with 16-bit address encodings. The @@ -576,7 +576,7 @@ static int get_reg_offset_16(struct insn *insn, struct pt_regs *regs, * If ModRM.mod is 0 and ModRM.rm is 110b, then we use displacement- * only addressing. This means that no registers are involved in * computing the effective address. Thus, ensure that the first - * register offset is invalild. The second register offset is already + * register offset is invalid. The second register offset is already * invalid under the aforementioned conditions. */ if ((X86_MODRM_MOD(insn->modrm.value) == 0) && diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 1e299ac73c86..1cc9da6e29c7 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -4,7 +4,7 @@ #include <linux/linkage.h> #include <asm/errno.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/export.h> .pushsection .noinstr.text, "ax" diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 41902fe8b859..64801010d312 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -8,7 +8,7 @@ */ #include <linux/linkage.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/export.h> #undef memmove diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 0bfd26e4ca9e..9827ae267f96 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -3,7 +3,7 @@ #include <linux/linkage.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/export.h> /* diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c index 419365c48b2a..cc5f4ea943d3 100644 --- a/arch/x86/lib/mmx_32.c +++ b/arch/x86/lib/mmx_32.c @@ -14,7 +14,7 @@ * tested so far for any MMX solution figured. 
* * 22/09/2000 - Arjan van de Ven - * Improved for non-egineering-sample Athlons + * Improved for non-engineering-sample Athlons * */ #include <linux/hardirq.h> diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c index 75a0915b0d01..40bbe56bde32 100644 --- a/arch/x86/lib/msr-smp.c +++ b/arch/x86/lib/msr-smp.c @@ -252,7 +252,7 @@ static void __wrmsr_safe_regs_on_cpu(void *info) rv->err = wrmsr_safe_regs(rv->regs); } -int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { int err; struct msr_regs_info rv; @@ -265,7 +265,7 @@ int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) } EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu); -int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { int err; struct msr_regs_info rv; diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 3bd905e10ee2..b09cd2ad426c 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -36,7 +36,7 @@ EXPORT_SYMBOL(msrs_free); * argument @m. * */ -int msr_read(u32 msr, struct msr *m) +static int msr_read(u32 msr, struct msr *m) { int err; u64 val; @@ -54,7 +54,7 @@ int msr_read(u32 msr, struct msr *m) * @msr: MSR to write * @m: value to write */ -int msr_write(u32 msr, struct msr *m) +static int msr_write(u32 msr, struct msr *m) { return wrmsrl_safe(msr, m->q); } diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index f6fb1d218dcc..6bb74b5c238c 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -4,7 +4,7 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/export.h> #include <asm/nospec-branch.h> #include <asm/unwind_hints.h> diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c index 4a9887851ad8..990d847ae902 100644 --- a/arch/x86/math-emu/fpu_trig.c +++ b/arch/x86/math-emu/fpu_trig.c @@ -547,7 +547,7 @@ static void frndint_(FPU_REG *st0_ptr, u_char st0_tag) single_arg_error(st0_ptr, st0_tag); } -static int fsin(FPU_REG *st0_ptr, u_char tag) +static int f_sin(FPU_REG *st0_ptr, u_char tag) { u_char arg_sign = getsign(st0_ptr); @@ -608,6 +608,11 @@ static int fsin(FPU_REG *st0_ptr, u_char tag) } } +static void fsin(FPU_REG *st0_ptr, u_char tag) +{ + f_sin(st0_ptr, tag); +} + static int f_cos(FPU_REG *st0_ptr, u_char tag) { u_char st0_sign; @@ -724,7 +729,7 @@ static void fsincos(FPU_REG *st0_ptr, u_char st0_tag) } reg_copy(st0_ptr, &arg); - if (!fsin(st0_ptr, st0_tag)) { + if (!f_sin(st0_ptr, st0_tag)) { push(); FPU_copy_to_reg0(&arg, st0_tag); f_cos(&st(0), st0_tag); @@ -1635,7 +1640,7 @@ void FPU_triga(void) } static FUNC_ST0 const trig_table_b[] = { - fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0) fsin, fcos + fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, fsin, fcos }; void FPU_trigb(void) diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c index fe6246ff9887..7ca6417c0c8d 100644 --- a/arch/x86/math-emu/reg_ld_str.c +++ b/arch/x86/math-emu/reg_ld_str.c @@ -964,7 +964,7 @@ int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d) /* The return value (in eax) is zero if the result is exact, if bits are changed due to rounding, truncation, etc, then a non-zero value is returned */ -/* Overflow is signalled by a non-zero return value (in eax). +/* Overflow is signaled by a non-zero return value (in eax). 
In the case of overflow, the returned significand always has the largest possible value */ int FPU_round_to_int(FPU_REG *r, u_char tag) diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S index 11a1f798451b..4a9fc3cc5a4d 100644 --- a/arch/x86/math-emu/reg_round.S +++ b/arch/x86/math-emu/reg_round.S @@ -575,7 +575,7 @@ Normalise_result: #ifdef PECULIAR_486 /* * This implements a special feature of 80486 behaviour. - * Underflow will be signalled even if the number is + * Underflow will be signaled even if the number is * not a denormal after rounding. * This difference occurs only for masked underflow, and not * in the unmasked case. diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a73347e2cdfc..1c548ad00752 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1497,7 +1497,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) * userspace task is trying to access some valid (from guest's point of * view) memory which is not currently mapped by the host (e.g. the * memory is swapped out). Note, the corresponding "page ready" event - * which is injected when the memory becomes available, is delived via + * which is injected when the memory becomes available, is delivered via * an interrupt mechanism and not a #PF exception * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()). * @@ -1523,7 +1523,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) * * In case the fault hit a RCU idle region the conditional entry * code reenabled RCU to avoid subsequent wreckage which helps - * debugability. + * debuggability. */ state = irqentry_enter(regs); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index dd694fb93916..fbf41dd142ca 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -29,7 +29,7 @@ /* * We need to define the tracepoints somewhere, and tlb.c - * is only compied when SMP=y. + * is only compiled when SMP=y. */ #define CREATE_TRACE_POINTS #include <trace/events/tlb.h> @@ -756,7 +756,7 @@ void __init init_mem_mapping(void) #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { - /* can we preseve max_low_pfn ?*/ + /* can we preserve max_low_pfn ?*/ max_low_pfn = max_pfn; } #else @@ -939,7 +939,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) { /* * end could be not aligned, and We can not align that, - * decompresser could be confused by aligned initrd_end + * decompressor could be confused by aligned initrd_end * We already reserve the end partial page before in * - i386_start_kernel() * - x86_64_start_kernel() diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b5a3fa4033d3..55247451ba85 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -172,7 +172,7 @@ static void sync_global_pgds_l4(unsigned long start, unsigned long end) /* * With folded p4d, pgd_none() is always false, we need to - * handle synchonization on p4d level. + * handle synchronization on p4d level. */ MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref)); p4d_ref = p4d_offset(pgd_ref, addr); @@ -986,7 +986,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) { /* * Do not free direct mapping pages since they were - * freed when offlining, or simplely not in use. + * freed when offlining, or simply not in use. 
*/ if (!direct) free_pagetable(pte_page(*pte), 0); @@ -1004,7 +1004,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, * * If we are not removing the whole page, it means * other page structs in this page are being used and - * we canot remove them. So fill the unused page_structs + * we cannot remove them. So fill the unused page_structs * with 0xFD, and remove the page when it is wholly * filled with 0xFD. */ diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 6e6b39710e5f..557f0fe25dff 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -96,7 +96,7 @@ void __init kernel_randomize_memory(void) memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) + CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; - /* Adapt phyiscal memory region size based on available memory */ + /* Adapt physical memory region size based on available memory */ if (memory_tb < kaslr_regions[0].size_tb) kaslr_regions[0].size_tb = memory_tb; diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index be020a7bc414..d3efbc5b3449 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Support for MMIO probes. - * Benfit many code from kprobes + * Benefit many code from kprobes * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>. * 2007 Alexander Eichner * 2008 Pekka Paalanen <pq@iki.fi> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 4b01f7dbaf30..f633f9e23b8f 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -19,6 +19,7 @@ #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/dma-mapping.h> +#include <linux/virtio_config.h> #include <asm/tlbflush.h> #include <asm/fixmap.h> @@ -262,7 +263,7 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) if (pgprot_val(old_prot) == pgprot_val(new_prot)) return; - pa = pfn << page_level_shift(level); + pa = pfn << PAGE_SHIFT; size = page_level_size(level); /* @@ -484,3 +485,8 @@ void __init mem_encrypt_init(void) print_mem_encrypt_feature_info(); } +int arch_has_restricted_virtio_memory_access(void) +{ + return sev_active(); +} +EXPORT_SYMBOL_GPL(arch_has_restricted_virtio_memory_access); diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index 7a84fc8bc5c3..17d292b7072f 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -27,7 +27,7 @@ SYM_FUNC_START(sme_encrypt_execute) * - stack page (PAGE_SIZE) * - encryption routine page (PAGE_SIZE) * - intermediate copy buffer (PMD_PAGE_SIZE) - * R8 - physcial address of the pagetables to use for encryption + * R8 - physical address of the pagetables to use for encryption */ push %rbp diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 6c5eb6f3f14f..a19374d26101 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -503,14 +503,10 @@ void __init sme_enable(struct boot_params *bp) #define AMD_SME_BIT BIT(0) #define AMD_SEV_BIT BIT(1) - /* - * Set the feature mask (SME or SEV) based on whether we are - * running under a hypervisor. - */ - eax = 1; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT; + + /* Check the SEV MSR whether SEV or SME is enabled */ + sev_status = __rdmsr(MSR_AMD64_SEV); + feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? 
AMD_SEV_BIT : AMD_SME_BIT; /* * Check for the SME/SEV feature: @@ -530,19 +526,26 @@ void __init sme_enable(struct boot_params *bp) /* Check if memory encryption is enabled */ if (feature_mask == AMD_SME_BIT) { + /* + * No SME if Hypervisor bit is set. This check is here to + * prevent a guest from trying to enable SME. For running as a + * KVM guest the MSR_K8_SYSCFG will be sufficient, but there + * might be other hypervisors which emulate that MSR as non-zero + * or even pass it through to the guest. + * A malicious hypervisor can still trick a guest into this + * path, but there is no way to protect against that. + */ + eax = 1; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (ecx & BIT(31)) + return; + /* For SME, check the SYSCFG MSR */ msr = __rdmsr(MSR_K8_SYSCFG); if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) return; } else { - /* For SEV, check the SEV MSR */ - msr = __rdmsr(MSR_AMD64_SEV); - if (!(msr & MSR_AMD64_SEV_ENABLED)) - return; - - /* Save SEV_STATUS to avoid reading MSR again */ - sev_status = msr; - /* SEV state cannot be controlled by a command line option */ sme_me_mask = me_mask; sev_enabled = true; diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index ca311aaa67b8..3112ca7786ed 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -695,7 +695,7 @@ int memtype_free(u64 start, u64 end) /** - * lookup_memtype - Looksup the memory type for a physical address + * lookup_memtype - Looks up the memory type for a physical address * @paddr: physical address of which memory type needs to be looked up * * Only to be called when PAT is enabled @@ -800,6 +800,7 @@ void memtype_free_io(resource_size_t start, resource_size_t end) memtype_free(start, end); } +#ifdef CONFIG_X86_PAT int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size) { enum page_cache_mode type = _PAGE_CACHE_MODE_WC; @@ -813,6 +814,7 @@ void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) memtype_free_io(start, start + size); } EXPORT_SYMBOL(arch_io_free_memtype_wc); +#endif pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 16f878c26667..427980617557 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -680,7 +680,7 @@ pmd_t *lookup_pmd_address(unsigned long address) * end up in this kind of memory, for instance. * * This could be optimized, but it is only intended to be - * used at inititalization time, and keeping it + * used at initialization time, and keeping it * unoptimized should increase the testing coverage for * the more obscure platforms. */ diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 8873ed1438a9..a2332eef66e9 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -128,7 +128,7 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | /* * Called from the FPU code when creating a fresh set of FPU * registers. This is called from a very specific context where - * we know the FPU regstiers are safe for use and we can use PKRU + * we know the FPU registers are safe for use and we can use PKRU * directly. */ void copy_init_pkru_to_fpregs(void) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 1aab92930569..5d5c7bb50ce9 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -361,7 +361,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end, * global, so set it as global in both copies. 
Note: * the X86_FEATURE_PGE check is not _required_ because * the CPU ignores _PAGE_GLOBAL when PGE is not - * supported. The check keeps consistentency with + * supported. The check keeps consistency with * code that only set this bit when supported. */ if (boot_cpu_has(X86_FEATURE_PGE)) @@ -440,10 +440,9 @@ static void __init pti_clone_user_shared(void) for_each_possible_cpu(cpu) { /* - * The SYSCALL64 entry code needs to be able to find the - * thread stack and needs one word of scratch space in which - * to spill a register. All of this lives in the TSS, in - * the sp1 and sp2 slots. + * The SYSCALL64 entry code needs one word of scratch space + * in which to spill a register. It lives in the sp2 slot + * of the CPU's TSS. * * This is done for all possible CPUs during boot to ensure * that it's propagated to all mms. @@ -512,7 +511,7 @@ static void pti_clone_entry_text(void) static inline bool pti_kernel_image_global_ok(void) { /* - * Systems with PCIDs get litlle benefit from global + * Systems with PCIDs get little benefit from global * kernel text and are not worth the downsides. */ if (cpu_feature_enabled(X86_FEATURE_PCID)) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 569ac1d57f55..98f269560d40 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -106,7 +106,7 @@ static inline u16 kern_pcid(u16 asid) #ifdef CONFIG_PAGE_TABLE_ISOLATION /* - * Make sure that the dynamic ASID space does not confict with the + * Make sure that the dynamic ASID space does not conflict with the * bit we are using to switch between user and kernel ASIDs. */ BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); @@ -736,7 +736,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * 3, we'd be break the invariant: we'd update local_tlb_gen above * 1 without the full flush that's needed for tlb_gen 2. * - * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation. + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization. * Partial TLB flushes are not all that much cheaper than full TLB * flushes, so it seems unlikely that it would be a performance win * to do a partial flush if that won't bring our TLB fully up to @@ -876,7 +876,7 @@ static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, static inline void put_flush_tlb_info(void) { #ifdef CONFIG_DEBUG_VM - /* Complete reentrency prevention checks */ + /* Complete reentrancy prevention checks */ barrier(); this_cpu_dec(flush_tlb_info_idx); #endif diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 6926d0ca6c71..220e72434f3c 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1556,7 +1556,7 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ if (is_imm8(jmp_offset)) { if (jmp_padding) { /* To keep the jmp_offset valid, the extra bytes are - * padded before the jump insn, so we substract the + * padded before the jump insn, so we subtract the * 2 bytes of jmp_cond insn from INSN_SZ_DIFF. * * If the previous pass already emits an imm8 @@ -1631,7 +1631,7 @@ emit_jmp: if (jmp_padding) { /* To avoid breaking jmp_offset, the extra bytes * are padded before the actual jmp insn, so - * 2 bytes is substracted from INSN_SZ_DIFF. + * 2 bytes is subtracted from INSN_SZ_DIFF. * * If the previous pass already emits an imm8 * jmp, there is nothing to pad (0 byte). 
@@ -1689,7 +1689,16 @@ emit_jmp: } if (image) { - if (unlikely(proglen + ilen > oldproglen)) { + /* + * When populating the image, assert that: + * + * i) We do not write beyond the allocated space, and + * ii) addrs[i] did not change from the prior run, in order + * to validate assumptions made for computing branch + * displacements. + */ + if (unlikely(proglen + ilen > oldproglen || + proglen + ilen != addrs[i])) { pr_err("bpf_jit: fatal error\n"); return -EFAULT; } @@ -1936,7 +1945,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, * add rsp, 8 // skip eth_type_trans's frame * ret // return to its caller */ -int arch_prepare_bpf_trampoline(void *image, void *image_end, +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_progs *tprogs, void *orig_call) @@ -1975,6 +1984,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, save_regs(m, &prog, nr_args, stack_size); + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_enter, prog)) { + ret = -EINVAL; + goto cleanup; + } + } + if (fentry->nr_progs) if (invoke_bpf(m, &prog, fentry, stack_size)) return -EINVAL; @@ -1993,8 +2011,7 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } if (flags & BPF_TRAMP_F_CALL_ORIG) { - if (fentry->nr_progs || fmod_ret->nr_progs) - restore_regs(m, &prog, nr_args, stack_size); + restore_regs(m, &prog, nr_args, stack_size); /* call original function */ if (emit_call(&prog, orig_call, prog)) { @@ -2003,6 +2020,9 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + im->ip_after_call = prog; + memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); + prog += X86_PATCH_SIZE; } if (fmod_ret->nr_progs) { @@ -2033,9 +2053,17 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, * the return value is only updated on the stack and still needs to be * restored to R0. 
*/ - if (flags & BPF_TRAMP_F_CALL_ORIG) + if (flags & BPF_TRAMP_F_CALL_ORIG) { + im->ip_epilogue = prog; + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_exit, prog)) { + ret = -EINVAL; + goto cleanup; + } /* restore original return value back into RAX */ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); + } EMIT1(0x5B); /* pop rbx */ EMIT1(0xC9); /* leave */ @@ -2225,7 +2253,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) padding = true; goto skip_init_addrs; } - addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); + addrs = kvmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); if (!addrs) { prog = orig_prog; goto out_addrs; @@ -2317,7 +2345,7 @@ out_image: if (image) bpf_prog_fill_jited_linfo(prog, addrs + 1); out_addrs: - kfree(addrs); + kvfree(addrs); kfree(jit_data); prog->aux->jit_data = NULL; } diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index d17b67c69f89..6a99def7d315 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -2276,7 +2276,16 @@ notyet: } if (image) { - if (unlikely(proglen + ilen > oldproglen)) { + /* + * When populating the image, assert that: + * + * i) We do not write beyond the allocated space, and + * ii) addrs[i] did not change from the prior run, in order + * to validate assumptions made for computing branch + * displacements. + */ + if (unlikely(proglen + ilen > oldproglen || + proglen + ilen != addrs[i])) { pr_err("bpf_jit: fatal error\n"); return -EFAULT; } diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 0a0e168be1cb..02dc64625e64 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -375,7 +375,7 @@ static const struct dmi_system_id msi_k8t_dmi_table[] = { * The BIOS only gives options "DISABLED" and "AUTO". This code sets * the corresponding register-value to enable the soundcard. * - * The soundcard is only enabled, if the mainborad is identified + * The soundcard is only enabled, if the mainboard is identified * via DMI-tables and the soundcard is detected to be off. */ static void pci_fixup_msi_k8t_onboard_sound(struct pci_dev *dev) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 1b82d77019b1..df7b5477fc4f 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -195,7 +195,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) } /* - * Certain firmware versions are way too sentimential and still believe + * Certain firmware versions are way too sentimental and still believe * they are exclusive and unquestionable owners of the first physical page, * even though they explicitly mark it as EFI_CONVENTIONAL_MEMORY * (but then write-access it later during SetVirtualAddressMap()). @@ -457,7 +457,7 @@ void __init efi_dump_pagetable(void) * in a kernel thread and user context. Preemption needs to remain disabled * while the EFI-mm is borrowed. mmgrab()/mmdrop() is not used because the mm * can not change under us. - * It should be ensured that there are no concurent calls to this function. + * It should be ensured that there are no concurrent calls to this function. 
*/ void efi_enter_mm(void) { diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 67d93a243c35..7850111008a8 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -441,7 +441,7 @@ void __init efi_free_boot_services(void) * 1.4.4 with SGX enabled booting Linux via Fedora 24's * grub2-efi on a hard disk. (And no, I don't know why * this happened, but Linux should still try to boot rather - * panicing early.) + * panicking early.) */ rm_size = real_mode_size_needed(); if (rm_size && (start + rm_size) < (1<<20) && size >= rm_size) { @@ -726,7 +726,7 @@ void efi_crash_gracefully_on_page_fault(unsigned long phys_addr) * Buggy efi_reset_system() is handled differently from other EFI * Runtime Services as it doesn't use efi_rts_wq. Although, * native_machine_emergency_restart() says that machine_real_restart() - * could fail, it's better not to compilcate this fault handler + * could fail, it's better not to complicate this fault handler * because this case occurs *very* rarely and hence could be improved * on a need by basis. */ diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index 0286fe1b14b5..d3d456925b2a 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/** +/* * imr.c -- Intel Isolated Memory Region driver * * Copyright(c) 2013 Intel Corporation. @@ -551,7 +551,7 @@ static void __init imr_fixup_memmap(struct imr_device *idev) /* * Setup an unlocked IMR around the physical extent of the kernel - * from the beginning of the .text secton to the end of the + * from the beginning of the .text section to the end of the * .rodata section as one physically contiguous block. * * We don't round up @size since it is already PAGE_SIZE aligned. diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index 570e3062faac..761f3689f60a 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/** +/* * imr_selftest.c -- Intel Isolated Memory Region self-test driver * * Copyright(c) 2013 Intel Corporation. diff --git a/arch/x86/platform/intel/iosf_mbi.c b/arch/x86/platform/intel/iosf_mbi.c index 526f70f27c1c..fdd49d70b437 100644 --- a/arch/x86/platform/intel/iosf_mbi.c +++ b/arch/x86/platform/intel/iosf_mbi.c @@ -187,7 +187,7 @@ bool iosf_mbi_available(void) EXPORT_SYMBOL(iosf_mbi_available); /* - **************** P-Unit/kernel shared I2C bus arbritration **************** + **************** P-Unit/kernel shared I2C bus arbitration **************** * * Some Bay Trail and Cherry Trail devices have the P-Unit and us (the kernel) * share a single I2C bus to the PMIC. Below are helpers to arbitrate the @@ -493,7 +493,7 @@ static void iosf_sideband_debug_init(void) /* mcrx */ debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx); - /* mcr - initiates mailbox tranaction */ + /* mcr - initiates mailbox transaction */ debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops); } diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 85f4638764d6..994a229cb79f 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -27,7 +27,7 @@ static bool lid_wake_on_close; * wake-on-close. This is implemented as standard by the XO-1.5 DSDT. 
* * We provide here a sysfs attribute that will additionally enable - * wake-on-close behavior. This is useful (e.g.) when we oportunistically + * wake-on-close behavior. This is useful (e.g.) when we opportunistically * suspend with the display running; if the lid is then closed, we want to * wake up to turn the display off. * diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index 26d1f6693789..75e3319e8bee 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -131,7 +131,7 @@ void * __init prom_early_alloc(unsigned long size) const size_t chunk_size = max(PAGE_SIZE, size); /* - * To mimimize the number of allocations, grab at least + * To minimize the number of allocations, grab at least * PAGE_SIZE of memory (that's an arbitrary choice that's * fast enough on the platforms we care about while minimizing * wasted bootmem) and hand off chunks of it to callers. diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index d2ccadc247e6..66b317398b8a 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -30,10 +30,10 @@ * the boot start info structure. * - `cr0`: bit 0 (PE) must be set. All the other writeable bits are cleared. * - `cr4`: all bits are cleared. - * - `cs `: must be a 32-bit read/execute code segment with a base of ‘0’ - * and a limit of ‘0xFFFFFFFF’. The selector value is unspecified. + * - `cs `: must be a 32-bit read/execute code segment with a base of `0` + * and a limit of `0xFFFFFFFF`. The selector value is unspecified. * - `ds`, `es`: must be a 32-bit read/write data segment with a base of - * ‘0’ and a limit of ‘0xFFFFFFFF’. The selector values are all + * `0` and a limit of `0xFFFFFFFF`. The selector values are all * unspecified. * - `tr`: must be a 32-bit TSS (active) with a base of '0' and a limit * of '0x67'. 
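The pvh/head.S comment above spells out the register state a PVH direct-boot loader must hand over: flat 32-bit code and data segments with base `0` and limit `0xFFFFFFFF`, plus an active 32-bit TSS with a limit of `0x67`. A minimal, loader-side sketch of GDT entries meeting that contract could look like the following (illustrative only — the array name and ordering are assumptions, not part of head.S; the constants are the standard x86 descriptor encodings):

	#include <stdint.h>

	/* Hypothetical boot-loader GDT satisfying the PVH entry contract:
	 * access byte 0x9a = present, DPL0, execute/read code;
	 * 0x92 = present, DPL0, read/write data;
	 * 0x89 = available 32-bit TSS (type becomes "busy" once loaded with ltr).
	 * The 0xcf flag/limit nibbles select a 4 GiB, 32-bit segment, while the
	 * TSS keeps a byte-granular limit of 0x67 (104 bytes). */
	static const uint64_t pvh_boot_gdt[] = {
		0x0000000000000000ULL,	/* null descriptor                          */
		0x00cf9a000000ffffULL,	/* cs: base 0, limit 0xFFFFFFFF, 32-bit X/R */
		0x00cf92000000ffffULL,	/* ds/es: base 0, limit 0xFFFFFFFF, R/W     */
		0x0000890000000067ULL,	/* tr: 32-bit TSS, base 0, limit 0x67       */
	};

The selector values are left unspecified by the contract, so a loader may order these entries however it likes as long as cs, ds/es and tr point at descriptors of the shapes above.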
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index eafc530c8767..1e9ff28bc2e0 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -24,6 +24,7 @@ #include <asm/kdebug.h> #include <asm/local64.h> #include <asm/nmi.h> +#include <asm/reboot.h> #include <asm/traps.h> #include <asm/uv/uv.h> #include <asm/uv/uv_hub.h> @@ -91,6 +92,8 @@ static atomic_t uv_nmi_cpus_in_nmi = ATOMIC_INIT(-1); static atomic_t uv_nmi_slave_continue; static cpumask_var_t uv_nmi_cpu_mask; +static atomic_t uv_nmi_kexec_failed; + /* Values for uv_nmi_slave_continue */ #define SLAVE_CLEAR 0 #define SLAVE_CONTINUE 1 @@ -834,38 +837,35 @@ static void uv_nmi_touch_watchdogs(void) touch_nmi_watchdog(); } -static atomic_t uv_nmi_kexec_failed; - -#if defined(CONFIG_KEXEC_CORE) -static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) +static void uv_nmi_kdump(int cpu, int main, struct pt_regs *regs) { + /* Check if kdump kernel loaded for both main and secondary CPUs */ + if (!kexec_crash_image) { + if (main) + pr_err("UV: NMI error: kdump kernel not loaded\n"); + return; + } + /* Call crash to dump system state */ - if (master) { + if (main) { pr_emerg("UV: NMI executing crash_kexec on CPU%d\n", cpu); crash_kexec(regs); - pr_emerg("UV: crash_kexec unexpectedly returned, "); + pr_emerg("UV: crash_kexec unexpectedly returned\n"); atomic_set(&uv_nmi_kexec_failed, 1); - if (!kexec_crash_image) { - pr_cont("crash kernel not loaded\n"); - return; - } - pr_cont("kexec busy, stalling cpus while waiting\n"); - } - /* If crash exec fails the slaves should return, otherwise stall */ - while (atomic_read(&uv_nmi_kexec_failed) == 0) - mdelay(10); -} + } else { /* secondary */ -#else /* !CONFIG_KEXEC_CORE */ -static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) -{ - if (master) - pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n"); - atomic_set(&uv_nmi_kexec_failed, 1); + /* If kdump kernel fails, secondaries will exit this loop */ + while (atomic_read(&uv_nmi_kexec_failed) == 0) { + + /* Once shootdown cpus starts, they do not return */ + run_crash_ipi_callback(regs); + + mdelay(10); + } + } } -#endif /* !CONFIG_KEXEC_CORE */ #ifdef CONFIG_KGDB #ifdef CONFIG_KGDB_KDB @@ -889,7 +889,7 @@ static inline int uv_nmi_kdb_reason(void) * Call KGDB/KDB from NMI handler * * Note that if both KGDB and KDB are configured, then the action of 'kgdb' or - * 'kdb' has no affect on which is used. See the KGDB documention for further + * 'kdb' has no affect on which is used. See the KGDB documentation for further * information. */ static void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master) diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index db1378c6ff26..c9908bcdb249 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -321,7 +321,7 @@ int hibernate_resume_nonboot_cpu_disable(void) /* * When bsp_check() is called in hibernate and suspend, cpu hotplug - * is disabled already. So it's unnessary to handle race condition between + * is disabled already. So it's unnecessary to handle race condition between * cpumask query and cpu hotplug. */ static int bsp_check(void) diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 22fda7d99159..1be71ef5e4c4 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -103,7 +103,7 @@ static void __init setup_real_mode(void) *ptr += phys_base; } - /* Must be perfomed *after* relocation. */ + /* Must be performed *after* relocation. 
*/ trampoline_header = (struct trampoline_header *) __va(real_mode_header->trampoline_header); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index dc0a337f985b..4f18cd9eacd8 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1070,8 +1070,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .read_pmc = xen_read_pmc, - .iret = xen_iret, - .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, .load_gdt = xen_load_gdt, @@ -1233,8 +1231,8 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - pv_ops.init.patch = paravirt_patch_default; pv_ops.cpu = xen_cpu_ops; + paravirt_iret = xen_iret; xen_init_irq_ops(); /* diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index cf2ade864c30..1e28c880f642 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2410,7 +2410,7 @@ int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr, rmd.prot = prot; /* * We use the err_ptr to indicate if there we are doing a contiguous - * mapping or a discontigious mapping. + * mapping or a discontiguous mapping. */ rmd.contiguous = !err_ptr; rmd.no_translate = no_translate; diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 17d80f751fcb..ac06ca32e9ef 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -98,8 +98,8 @@ EXPORT_SYMBOL_GPL(xen_p2m_size); unsigned long xen_max_p2m_pfn __read_mostly; EXPORT_SYMBOL_GPL(xen_max_p2m_pfn); -#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT -#define P2M_LIMIT CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT +#ifdef CONFIG_XEN_MEMORY_HOTPLUG_LIMIT +#define P2M_LIMIT CONFIG_XEN_MEMORY_HOTPLUG_LIMIT #else #define P2M_LIMIT 0 #endif @@ -416,9 +416,6 @@ void __init xen_vmalloc_p2m_tree(void) xen_p2m_last_pfn = xen_max_p2m_pfn; p2m_limit = (phys_addr_t)P2M_LIMIT * 1024 * 1024 * 1024 / PAGE_SIZE; - if (!p2m_limit && IS_ENABLED(CONFIG_XEN_UNPOPULATED_ALLOC)) - p2m_limit = xen_start_info->nr_pages * XEN_EXTRA_MEM_RATIO; - vm.flags = VM_ALLOC; vm.size = ALIGN(sizeof(unsigned long) * max(xen_max_p2m_pfn, p2m_limit), PMD_SIZE * PMDS_PER_MID_PAGE); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1a3b75652fa4..8bfc10330107 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -59,6 +59,18 @@ static struct { } xen_remap_buf __initdata __aligned(PAGE_SIZE); static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; +/* + * The maximum amount of extra memory compared to the base size. The + * main scaling factor is the size of struct page. At extreme ratios + * of base:extra, all the base memory can be filled with page + * structures for the extra memory, leaving no space for anything + * else. + * + * 10x seems like a reasonable balance between scaling flexibility and + * leaving a practically usable system. + */ +#define EXTRA_MEM_RATIO (10) + static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB); static void __init xen_parse_512gb(void) @@ -778,13 +790,13 @@ char * __init xen_memory_setup(void) extra_pages += max_pages - max_pfn; /* - * Clamp the amount of extra memory to a XEN_EXTRA_MEM_RATIO + * Clamp the amount of extra memory to a EXTRA_MEM_RATIO * factor the base size. * * Make sure we have no memory above max_pages, as this area * isn't handled by the p2m management. 
*/ - extra_pages = min3(XEN_EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), + extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), extra_pages, max_pages - max_pfn); i = 0; addr = xen_e820_table.entries[0].addr; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 91f5b330dcc6..d9c945ee1100 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -379,11 +379,6 @@ void xen_timer_resume(void) } } -static const struct pv_time_ops xen_time_ops __initconst = { - .sched_clock = xen_sched_clock, - .steal_clock = xen_steal_clock, -}; - static struct pvclock_vsyscall_time_info *xen_clock __read_mostly; static u64 xen_clock_value_saved; @@ -525,17 +520,24 @@ static void __init xen_time_init(void) pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier); } -void __init xen_init_time_ops(void) +static void __init xen_init_time_common(void) { xen_sched_clock_offset = xen_clocksource_read(); - pv_ops.time = xen_time_ops; + static_call_update(pv_steal_clock, xen_steal_clock); + paravirt_set_sched_clock(xen_sched_clock); + + x86_platform.calibrate_tsc = xen_tsc_khz; + x86_platform.get_wallclock = xen_get_wallclock; +} + +void __init xen_init_time_ops(void) +{ + xen_init_time_common(); x86_init.timers.timer_init = xen_time_init; x86_init.timers.setup_percpu_clockev = x86_init_noop; x86_cpuinit.setup_percpu_clockev = x86_init_noop; - x86_platform.calibrate_tsc = xen_tsc_khz; - x86_platform.get_wallclock = xen_get_wallclock; /* Dom0 uses the native method to set the hardware RTC. */ if (!xen_initial_domain()) x86_platform.set_wallclock = xen_set_wallclock; @@ -569,13 +571,11 @@ void __init xen_hvm_init_time_ops(void) return; } - xen_sched_clock_offset = xen_clocksource_read(); - pv_ops.time = xen_time_ops; + xen_init_time_common(); + x86_init.timers.setup_percpu_clockev = xen_time_init; x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents; - x86_platform.calibrate_tsc = xen_tsc_khz; - x86_platform.get_wallclock = xen_get_wallclock; x86_platform.set_wallclock = xen_set_wallclock; } #endif diff --git a/arch/xtensa/kernel/coprocessor.S b/arch/xtensa/kernel/coprocessor.S index c426b846beef..45cc0ae0af6f 100644 --- a/arch/xtensa/kernel/coprocessor.S +++ b/arch/xtensa/kernel/coprocessor.S @@ -100,37 +100,6 @@ LOAD_CP_REGS_TAB(7) /* - * coprocessor_flush(struct thread_info*, index) - * a2 a3 - * - * Save coprocessor registers for coprocessor 'index'. - * The register values are saved to or loaded from the coprocessor area - * inside the task_info structure. - * - * Note that this function doesn't update the coprocessor_owner information! - * - */ - -ENTRY(coprocessor_flush) - - /* reserve 4 bytes on stack to save a0 */ - abi_entry(4) - - s32i a0, a1, 0 - movi a0, .Lsave_cp_regs_jump_table - addx8 a3, a3, a0 - l32i a4, a3, 4 - l32i a3, a3, 0 - add a2, a2, a4 - beqz a3, 1f - callx0 a3 -1: l32i a0, a1, 0 - - abi_ret(4) - -ENDPROC(coprocessor_flush) - -/* * Entry condition: * * a0: trashed, original value saved on stack (PT_AREG0) @@ -245,6 +214,39 @@ ENTRY(fast_coprocessor) ENDPROC(fast_coprocessor) + .text + +/* + * coprocessor_flush(struct thread_info*, index) + * a2 a3 + * + * Save coprocessor registers for coprocessor 'index'. + * The register values are saved to or loaded from the coprocessor area + * inside the task_info structure. + * + * Note that this function doesn't update the coprocessor_owner information! 
+ * + */ + +ENTRY(coprocessor_flush) + + /* reserve 4 bytes on stack to save a0 */ + abi_entry(4) + + s32i a0, a1, 0 + movi a0, .Lsave_cp_regs_jump_table + addx8 a3, a3, a0 + l32i a4, a3, 4 + l32i a3, a3, 0 + add a2, a2, a4 + beqz a3, 1f + callx0 a3 +1: l32i a0, a1, 0 + + abi_ret(4) + +ENDPROC(coprocessor_flush) + .data ENTRY(coprocessor_owner) diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 7666408ce12a..95a74890c7e9 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -112,8 +112,11 @@ good_area: */ fault = handle_mm_fault(vma, address, flags, regs); - if (fault_signal_pending(fault, regs)) + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + goto bad_page_fault; return; + } if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) |