diff options
Diffstat (limited to 'arch')
469 files changed, 3924 insertions, 3024 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 2bb30673d8e6..ecfd3520b676 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -632,13 +632,12 @@ config HAS_LTO_CLANG def_bool y # Clang >= 11: https://github.com/ClangBuiltLinux/linux/issues/510 depends on CC_IS_CLANG && CLANG_VERSION >= 110000 && LD_IS_LLD - depends on $(success,test $(LLVM) -eq 1) depends on $(success,test $(LLVM_IAS) -eq 1) depends on $(success,$(NM) --help | head -n 1 | grep -qi llvm) depends on $(success,$(AR) --help | head -n 1 | grep -qi llvm) depends on ARCH_SUPPORTS_LTO_CLANG depends on !FTRACE_MCOUNT_USE_RECORDMCOUNT - depends on !KASAN + depends on !KASAN || KASAN_HW_TAGS depends on !GCOV_KERNEL help The compiler and Kconfig options support building with Clang's diff --git a/arch/arc/boot/dts/haps_hs.dts b/arch/arc/boot/dts/haps_hs.dts index 60d578e2781f..76ad527a0847 100644 --- a/arch/arc/boot/dts/haps_hs.dts +++ b/arch/arc/boot/dts/haps_hs.dts @@ -16,7 +16,7 @@ memory { device_type = "memory"; /* CONFIG_LINUX_RAM_BASE needs to match low mem start */ - reg = <0x0 0x80000000 0x0 0x20000000 /* 512 MB low mem */ + reg = <0x0 0x80000000 0x0 0x40000000 /* 1 GB low mem */ 0x1 0x00000000 0x0 0x40000000>; /* 1 GB highmem */ }; diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c index a78d8f745a67..fdbe06c98895 100644 --- a/arch/arc/kernel/signal.c +++ b/arch/arc/kernel/signal.c @@ -96,7 +96,7 @@ stash_usr_regs(struct rt_sigframe __user *sf, struct pt_regs *regs, sizeof(sf->uc.uc_mcontext.regs.scratch)); err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(sigset_t)); - return err; + return err ? -EFAULT : 0; } static int restore_usr_regs(struct pt_regs *regs, struct rt_sigframe __user *sf) @@ -110,7 +110,7 @@ static int restore_usr_regs(struct pt_regs *regs, struct rt_sigframe __user *sf) &(sf->uc.uc_mcontext.regs.scratch), sizeof(sf->uc.uc_mcontext.regs.scratch)); if (err) - return err; + return -EFAULT; set_current_blocked(&set); regs->bta = uregs.scratch.bta; diff --git a/arch/arc/kernel/unwind.c b/arch/arc/kernel/unwind.c index 74ad4256022e..47bab67f8649 100644 --- a/arch/arc/kernel/unwind.c +++ b/arch/arc/kernel/unwind.c @@ -187,25 +187,26 @@ static void init_unwind_table(struct unwind_table *table, const char *name, const void *table_start, unsigned long table_size, const u8 *header_start, unsigned long header_size) { - const u8 *ptr = header_start + 4; - const u8 *end = header_start + header_size; - table->core.pc = (unsigned long)core_start; table->core.range = core_size; table->init.pc = (unsigned long)init_start; table->init.range = init_size; table->address = table_start; table->size = table_size; - - /* See if the linker provided table looks valid. */ - if (header_size <= 4 - || header_start[0] != 1 - || (void *)read_pointer(&ptr, end, header_start[1]) != table_start - || header_start[2] == DW_EH_PE_omit - || read_pointer(&ptr, end, header_start[2]) <= 0 - || header_start[3] == DW_EH_PE_omit) - header_start = NULL; - + /* To avoid the pointer addition with NULL pointer.*/ + if (header_start != NULL) { + const u8 *ptr = header_start + 4; + const u8 *end = header_start + header_size; + /* See if the linker provided table looks valid. */ + if (header_size <= 4 + || header_start[0] != 1 + || (void *)read_pointer(&ptr, end, header_start[1]) + != table_start + || header_start[2] == DW_EH_PE_omit + || read_pointer(&ptr, end, header_start[2]) <= 0 + || header_start[3] == DW_EH_PE_omit) + header_start = NULL; + } table->hdrsz = header_size; smp_wmb(); table->header = header_start; diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 5da96f5df48f..2fae14857dcf 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1293,9 +1293,15 @@ config KASAN_SHADOW_OFFSET config NR_CPUS int "Maximum number of CPUs (2-32)" - range 2 32 + range 2 16 if DEBUG_KMAP_LOCAL + range 2 32 if !DEBUG_KMAP_LOCAL depends on SMP default "4" + help + The maximum number of CPUs that the kernel can support. + Up to 32 CPUs can be supported, or up to 16 if kmap_local() + debugging is enabled, which uses half of the per-CPU fixmap + slots as guard regions. config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" diff --git a/arch/arm/boot/dts/am33xx.dtsi b/arch/arm/boot/dts/am33xx.dtsi index 5b213a1e68bb..5e33d0e88f5b 100644 --- a/arch/arm/boot/dts/am33xx.dtsi +++ b/arch/arm/boot/dts/am33xx.dtsi @@ -40,6 +40,9 @@ ethernet1 = &cpsw_emac1; spi0 = &spi0; spi1 = &spi1; + mmc0 = &mmc1; + mmc1 = &mmc2; + mmc2 = &mmc3; }; cpus { diff --git a/arch/arm/boot/dts/armada-385-turris-omnia.dts b/arch/arm/boot/dts/armada-385-turris-omnia.dts index 646a06420c77..5bd6a66d2c2b 100644 --- a/arch/arm/boot/dts/armada-385-turris-omnia.dts +++ b/arch/arm/boot/dts/armada-385-turris-omnia.dts @@ -32,7 +32,8 @@ ranges = <MBUS_ID(0xf0, 0x01) 0 0xf1000000 0x100000 MBUS_ID(0x01, 0x1d) 0 0xfff00000 0x100000 MBUS_ID(0x09, 0x19) 0 0xf1100000 0x10000 - MBUS_ID(0x09, 0x15) 0 0xf1110000 0x10000>; + MBUS_ID(0x09, 0x15) 0 0xf1110000 0x10000 + MBUS_ID(0x0c, 0x04) 0 0xf1200000 0x100000>; internal-regs { @@ -389,6 +390,7 @@ phy1: ethernet-phy@1 { compatible = "ethernet-phy-ieee802.3-c22"; reg = <1>; + marvell,reg-init = <3 18 0 0x4985>; /* irq is connected to &pcawan pin 7 */ }; diff --git a/arch/arm/boot/dts/at91-sam9x60ek.dts b/arch/arm/boot/dts/at91-sam9x60ek.dts index 73b6b1f89de9..775ceb3acb6c 100644 --- a/arch/arm/boot/dts/at91-sam9x60ek.dts +++ b/arch/arm/boot/dts/at91-sam9x60ek.dts @@ -334,14 +334,6 @@ }; &pinctrl { - atmel,mux-mask = < - /* A B C */ - 0xFFFFFE7F 0xC0E0397F 0xEF00019D /* pioA */ - 0x03FFFFFF 0x02FC7E68 0x00780000 /* pioB */ - 0xffffffff 0xF83FFFFF 0xB800F3FC /* pioC */ - 0x003FFFFF 0x003F8000 0x00000000 /* pioD */ - >; - adc { pinctrl_adc_default: adc_default { atmel,pins = <AT91_PIOB 15 AT91_PERIPH_A AT91_PINCTRL_NONE>; diff --git a/arch/arm/boot/dts/at91-sama5d27_som1.dtsi b/arch/arm/boot/dts/at91-sama5d27_som1.dtsi index 1b1163858b1d..e3251f3e3eaa 100644 --- a/arch/arm/boot/dts/at91-sama5d27_som1.dtsi +++ b/arch/arm/boot/dts/at91-sama5d27_som1.dtsi @@ -84,8 +84,8 @@ pinctrl-0 = <&pinctrl_macb0_default>; phy-mode = "rmii"; - ethernet-phy@0 { - reg = <0x0>; + ethernet-phy@7 { + reg = <0x7>; interrupt-parent = <&pioA>; interrupts = <PIN_PD31 IRQ_TYPE_LEVEL_LOW>; pinctrl-names = "default"; diff --git a/arch/arm/boot/dts/bcm2711.dtsi b/arch/arm/boot/dts/bcm2711.dtsi index 462b1dfb0385..720beec54d61 100644 --- a/arch/arm/boot/dts/bcm2711.dtsi +++ b/arch/arm/boot/dts/bcm2711.dtsi @@ -308,14 +308,6 @@ #reset-cells = <1>; }; - bsc_intr: interrupt-controller@7ef00040 { - compatible = "brcm,bcm2711-l2-intc", "brcm,l2-intc"; - reg = <0x7ef00040 0x30>; - interrupts = <GIC_SPI 117 IRQ_TYPE_LEVEL_HIGH>; - interrupt-controller; - #interrupt-cells = <1>; - }; - aon_intr: interrupt-controller@7ef00100 { compatible = "brcm,bcm2711-l2-intc", "brcm,l2-intc"; reg = <0x7ef00100 0x30>; @@ -362,8 +354,6 @@ reg = <0x7ef04500 0x100>, <0x7ef00b00 0x300>; reg-names = "bsc", "auto-i2c"; clock-frequency = <97500>; - interrupt-parent = <&bsc_intr>; - interrupts = <0>; status = "disabled"; }; @@ -405,8 +395,6 @@ reg = <0x7ef09500 0x100>, <0x7ef05b00 0x300>; reg-names = "bsc", "auto-i2c"; clock-frequency = <97500>; - interrupt-parent = <&bsc_intr>; - interrupts = <1>; status = "disabled"; }; }; diff --git a/arch/arm/boot/dts/imx6qdl-phytec-pfla02.dtsi b/arch/arm/boot/dts/imx6qdl-phytec-pfla02.dtsi index 7a1e53195785..f28a96fcf23e 100644 --- a/arch/arm/boot/dts/imx6qdl-phytec-pfla02.dtsi +++ b/arch/arm/boot/dts/imx6qdl-phytec-pfla02.dtsi @@ -433,6 +433,7 @@ pinctrl-0 = <&pinctrl_usdhc2>; cd-gpios = <&gpio1 4 GPIO_ACTIVE_LOW>; wp-gpios = <&gpio1 2 GPIO_ACTIVE_HIGH>; + vmmc-supply = <&vdd_sd1_reg>; status = "disabled"; }; @@ -442,5 +443,6 @@ &pinctrl_usdhc3_cdwp>; cd-gpios = <&gpio1 27 GPIO_ACTIVE_LOW>; wp-gpios = <&gpio1 29 GPIO_ACTIVE_HIGH>; + vmmc-supply = <&vdd_sd0_reg>; status = "disabled"; }; diff --git a/arch/arm/boot/dts/imx6ul-14x14-evk.dtsi b/arch/arm/boot/dts/imx6ul-14x14-evk.dtsi index c593597b2119..5a1e10def6ef 100644 --- a/arch/arm/boot/dts/imx6ul-14x14-evk.dtsi +++ b/arch/arm/boot/dts/imx6ul-14x14-evk.dtsi @@ -210,9 +210,6 @@ micrel,led-mode = <1>; clocks = <&clks IMX6UL_CLK_ENET_REF>; clock-names = "rmii-ref"; - reset-gpios = <&gpio_spi 1 GPIO_ACTIVE_LOW>; - reset-assert-us = <10000>; - reset-deassert-us = <100>; }; @@ -222,9 +219,6 @@ micrel,led-mode = <1>; clocks = <&clks IMX6UL_CLK_ENET2_REF>; clock-names = "rmii-ref"; - reset-gpios = <&gpio_spi 2 GPIO_ACTIVE_LOW>; - reset-assert-us = <10000>; - reset-deassert-us = <100>; }; }; }; @@ -243,6 +237,22 @@ status = "okay"; }; +&gpio_spi { + eth0-phy-hog { + gpio-hog; + gpios = <1 GPIO_ACTIVE_HIGH>; + output-high; + line-name = "eth0-phy"; + }; + + eth1-phy-hog { + gpio-hog; + gpios = <2 GPIO_ACTIVE_HIGH>; + output-high; + line-name = "eth1-phy"; + }; +}; + &i2c1 { clock-frequency = <100000>; pinctrl-names = "default"; diff --git a/arch/arm/boot/dts/imx6ull-myir-mys-6ulx-eval.dts b/arch/arm/boot/dts/imx6ull-myir-mys-6ulx-eval.dts index ecbb2cc5b9ab..79cc45728cd2 100644 --- a/arch/arm/boot/dts/imx6ull-myir-mys-6ulx-eval.dts +++ b/arch/arm/boot/dts/imx6ull-myir-mys-6ulx-eval.dts @@ -14,5 +14,6 @@ }; &gpmi { + fsl,use-minimum-ecc; status = "okay"; }; diff --git a/arch/arm/boot/dts/omap3.dtsi b/arch/arm/boot/dts/omap3.dtsi index 9dcae1f2bc99..c5b9da0d7e6c 100644 --- a/arch/arm/boot/dts/omap3.dtsi +++ b/arch/arm/boot/dts/omap3.dtsi @@ -24,6 +24,9 @@ i2c0 = &i2c1; i2c1 = &i2c2; i2c2 = &i2c3; + mmc0 = &mmc1; + mmc1 = &mmc2; + mmc2 = &mmc3; serial0 = &uart1; serial1 = &uart2; serial2 = &uart3; diff --git a/arch/arm/boot/dts/omap4.dtsi b/arch/arm/boot/dts/omap4.dtsi index 72e4f6481776..4a9f9496a867 100644 --- a/arch/arm/boot/dts/omap4.dtsi +++ b/arch/arm/boot/dts/omap4.dtsi @@ -22,6 +22,11 @@ i2c1 = &i2c2; i2c2 = &i2c3; i2c3 = &i2c4; + mmc0 = &mmc1; + mmc1 = &mmc2; + mmc2 = &mmc3; + mmc3 = &mmc4; + mmc4 = &mmc5; serial0 = &uart1; serial1 = &uart2; serial2 = &uart3; diff --git a/arch/arm/boot/dts/omap44xx-clocks.dtsi b/arch/arm/boot/dts/omap44xx-clocks.dtsi index 532868591107..1f1c04d8f472 100644 --- a/arch/arm/boot/dts/omap44xx-clocks.dtsi +++ b/arch/arm/boot/dts/omap44xx-clocks.dtsi @@ -770,14 +770,6 @@ ti,max-div = <2>; }; - sha2md5_fck: sha2md5_fck@15c8 { - #clock-cells = <0>; - compatible = "ti,gate-clock"; - clocks = <&l3_div_ck>; - ti,bit-shift = <1>; - reg = <0x15c8>; - }; - usb_phy_cm_clk32k: usb_phy_cm_clk32k@640 { #clock-cells = <0>; compatible = "ti,gate-clock"; diff --git a/arch/arm/boot/dts/omap5.dtsi b/arch/arm/boot/dts/omap5.dtsi index e025b7c9a357..ee821d0ab364 100644 --- a/arch/arm/boot/dts/omap5.dtsi +++ b/arch/arm/boot/dts/omap5.dtsi @@ -25,6 +25,11 @@ i2c2 = &i2c3; i2c3 = &i2c4; i2c4 = &i2c5; + mmc0 = &mmc1; + mmc1 = &mmc2; + mmc2 = &mmc3; + mmc3 = &mmc4; + mmc4 = &mmc5; serial0 = &uart1; serial1 = &uart2; serial2 = &uart3; diff --git a/arch/arm/boot/dts/sam9x60.dtsi b/arch/arm/boot/dts/sam9x60.dtsi index 84066c1298df..ec45ced3cde6 100644 --- a/arch/arm/boot/dts/sam9x60.dtsi +++ b/arch/arm/boot/dts/sam9x60.dtsi @@ -606,6 +606,15 @@ compatible = "microchip,sam9x60-pinctrl", "atmel,at91sam9x5-pinctrl", "atmel,at91rm9200-pinctrl", "simple-bus"; ranges = <0xfffff400 0xfffff400 0x800>; + /* mux-mask corresponding to sam9x60 SoC in TFBGA228L package */ + atmel,mux-mask = < + /* A B C */ + 0xffffffff 0xffe03fff 0xef00019d /* pioA */ + 0x03ffffff 0x02fc7e7f 0x00780000 /* pioB */ + 0xffffffff 0xffffffff 0xf83fffff /* pioC */ + 0x003fffff 0x003f8000 0x00000000 /* pioD */ + >; + pioA: gpio@fffff400 { compatible = "microchip,sam9x60-gpio", "atmel,at91sam9x5-gpio", "atmel,at91rm9200-gpio"; reg = <0xfffff400 0x200>; diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S index 472e56d09eea..1da3f41359aa 100644 --- a/arch/arm/crypto/aes-cipher-core.S +++ b/arch/arm/crypto/aes-cipher-core.S @@ -99,28 +99,6 @@ __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op, \oldcpsr .endm - .macro __rev, out, in - .if __LINUX_ARM_ARCH__ < 6 - lsl t0, \in, #24 - and t1, \in, #0xff00 - and t2, \in, #0xff0000 - orr \out, t0, \in, lsr #24 - orr \out, \out, t1, lsl #8 - orr \out, \out, t2, lsr #8 - .else - rev \out, \in - .endif - .endm - - .macro __adrl, out, sym, c - .if __LINUX_ARM_ARCH__ < 7 - ldr\c \out, =\sym - .else - movw\c \out, #:lower16:\sym - movt\c \out, #:upper16:\sym - .endif - .endm - .macro do_crypt, round, ttab, ltab, bsz push {r3-r11, lr} @@ -133,10 +111,10 @@ ldr r7, [in, #12] #ifdef CONFIG_CPU_BIG_ENDIAN - __rev r4, r4 - __rev r5, r5 - __rev r6, r6 - __rev r7, r7 + rev_l r4, t0 + rev_l r5, t0 + rev_l r6, t0 + rev_l r7, t0 #endif eor r4, r4, r8 @@ -144,7 +122,7 @@ eor r6, r6, r10 eor r7, r7, r11 - __adrl ttab, \ttab + mov_l ttab, \ttab /* * Disable interrupts and prefetch the 1024-byte 'ft' or 'it' table into * L1 cache, assuming cacheline size >= 32. This is a hardening measure @@ -180,7 +158,7 @@ 2: .ifb \ltab add ttab, ttab, #1 .else - __adrl ttab, \ltab + mov_l ttab, \ltab // Prefetch inverse S-box for final round; see explanation above .set i, 0 .rept 256 / 64 @@ -194,10 +172,10 @@ \round r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b, rounds #ifdef CONFIG_CPU_BIG_ENDIAN - __rev r4, r4 - __rev r5, r5 - __rev r6, r6 - __rev r7, r7 + rev_l r4, t0 + rev_l r5, t0 + rev_l r6, t0 + rev_l r7, t0 #endif ldr out, [sp] diff --git a/arch/arm/crypto/blake2b-neon-glue.c b/arch/arm/crypto/blake2b-neon-glue.c index 34d73200e7fa..4b59d027ba4a 100644 --- a/arch/arm/crypto/blake2b-neon-glue.c +++ b/arch/arm/crypto/blake2b-neon-glue.c @@ -85,8 +85,8 @@ static int __init blake2b_neon_mod_init(void) static void __exit blake2b_neon_mod_exit(void) { - return crypto_unregister_shashes(blake2b_neon_algs, - ARRAY_SIZE(blake2b_neon_algs)); + crypto_unregister_shashes(blake2b_neon_algs, + ARRAY_SIZE(blake2b_neon_algs)); } module_init(blake2b_neon_mod_init); diff --git a/arch/arm/crypto/blake2s-core.S b/arch/arm/crypto/blake2s-core.S index bed897e9a181..86345751bbf3 100644 --- a/arch/arm/crypto/blake2s-core.S +++ b/arch/arm/crypto/blake2s-core.S @@ -8,6 +8,7 @@ */ #include <linux/linkage.h> +#include <asm/assembler.h> // Registers used to hold message words temporarily. There aren't // enough ARM registers to hold the whole message block, so we have to @@ -38,6 +39,23 @@ #endif .endm +.macro _le32_bswap a, tmp +#ifdef __ARMEB__ + rev_l \a, \tmp +#endif +.endm + +.macro _le32_bswap_8x a, b, c, d, e, f, g, h, tmp + _le32_bswap \a, \tmp + _le32_bswap \b, \tmp + _le32_bswap \c, \tmp + _le32_bswap \d, \tmp + _le32_bswap \e, \tmp + _le32_bswap \f, \tmp + _le32_bswap \g, \tmp + _le32_bswap \h, \tmp +.endm + // Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals. // (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two // columns/diagonals. s0-s1 are the word offsets to the message words the first @@ -180,8 +198,10 @@ ENTRY(blake2s_compress_arch) tst r1, #3 bne .Lcopy_block_misaligned ldmia r1!, {r2-r9} + _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14 stmia r12!, {r2-r9} ldmia r1!, {r2-r9} + _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14 stmia r12, {r2-r9} .Lcopy_block_done: str r1, [sp, #68] // Update message pointer @@ -268,6 +288,7 @@ ENTRY(blake2s_compress_arch) 1: #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS ldr r3, [r1], #4 + _le32_bswap r3, r4 #else ldrb r3, [r1, #0] ldrb r4, [r1, #1] diff --git a/arch/arm/crypto/chacha-scalar-core.S b/arch/arm/crypto/chacha-scalar-core.S index 2985b80a45b5..083fe1ab96d0 100644 --- a/arch/arm/crypto/chacha-scalar-core.S +++ b/arch/arm/crypto/chacha-scalar-core.S @@ -41,32 +41,15 @@ X14 .req r12 X15 .req r14 -.macro __rev out, in, t0, t1, t2 -.if __LINUX_ARM_ARCH__ >= 6 - rev \out, \in -.else - lsl \t0, \in, #24 - and \t1, \in, #0xff00 - and \t2, \in, #0xff0000 - orr \out, \t0, \in, lsr #24 - orr \out, \out, \t1, lsl #8 - orr \out, \out, \t2, lsr #8 -.endif -.endm - -.macro _le32_bswap x, t0, t1, t2 +.macro _le32_bswap_4x a, b, c, d, tmp #ifdef __ARMEB__ - __rev \x, \x, \t0, \t1, \t2 + rev_l \a, \tmp + rev_l \b, \tmp + rev_l \c, \tmp + rev_l \d, \tmp #endif .endm -.macro _le32_bswap_4x a, b, c, d, t0, t1, t2 - _le32_bswap \a, \t0, \t1, \t2 - _le32_bswap \b, \t0, \t1, \t2 - _le32_bswap \c, \t0, \t1, \t2 - _le32_bswap \d, \t0, \t1, \t2 -.endm - .macro __ldrd a, b, src, offset #if __LINUX_ARM_ARCH__ >= 6 ldrd \a, \b, [\src, #\offset] @@ -200,7 +183,7 @@ add X1, X1, r9 add X2, X2, r10 add X3, X3, r11 - _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10 + _le32_bswap_4x X0, X1, X2, X3, r8 ldmia r12!, {r8-r11} eor X0, X0, r8 eor X1, X1, r9 @@ -216,7 +199,7 @@ ldmia r12!, {X0-X3} add X6, r10, X6, ror #brot add X7, r11, X7, ror #brot - _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10 + _le32_bswap_4x X4, X5, X6, X7, r8 eor X4, X4, X0 eor X5, X5, X1 eor X6, X6, X2 @@ -231,7 +214,7 @@ add r1, r1, r9 // x9 add r6, r6, r10 // x10 add r7, r7, r11 // x11 - _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10 + _le32_bswap_4x r0, r1, r6, r7, r8 ldmia r12!, {r8-r11} eor r0, r0, r8 // x8 eor r1, r1, r9 // x9 @@ -245,7 +228,7 @@ add r3, r9, r3, ror #drot // x13 add r4, r10, r4, ror #drot // x14 add r5, r11, r5, ror #drot // x15 - _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11 + _le32_bswap_4x r2, r3, r4, r5, r9 ldr r9, [sp, #72] // load LEN eor r2, r2, r0 // x12 eor r3, r3, r1 // x13 @@ -301,7 +284,7 @@ add X1, X1, r9 add X2, X2, r10 add X3, X3, r11 - _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10 + _le32_bswap_4x X0, X1, X2, X3, r8 stmia r14!, {X0-X3} // Save keystream for x4-x7 @@ -311,7 +294,7 @@ add X5, r9, X5, ror #brot add X6, r10, X6, ror #brot add X7, r11, X7, ror #brot - _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10 + _le32_bswap_4x X4, X5, X6, X7, r8 add r8, sp, #64 stmia r14!, {X4-X7} @@ -323,7 +306,7 @@ add r1, r1, r9 // x9 add r6, r6, r10 // x10 add r7, r7, r11 // x11 - _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10 + _le32_bswap_4x r0, r1, r6, r7, r8 stmia r14!, {r0,r1,r6,r7} __ldrd r8, r9, sp, 144 __ldrd r10, r11, sp, 152 @@ -331,7 +314,7 @@ add r3, r9, r3, ror #drot // x13 add r4, r10, r4, ror #drot // x14 add r5, r11, r5, ror #drot // x15 - _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11 + _le32_bswap_4x r2, r3, r4, r5, r9 stmia r14, {r2-r5} // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN diff --git a/arch/arm/crypto/curve25519-core.S b/arch/arm/crypto/curve25519-core.S index be18af52e7dc..b697fa5d059a 100644 --- a/arch/arm/crypto/curve25519-core.S +++ b/arch/arm/crypto/curve25519-core.S @@ -10,8 +10,8 @@ #include <linux/linkage.h> .text -.fpu neon .arch armv7-a +.fpu neon .align 4 ENTRY(curve25519_neon) diff --git a/arch/arm/crypto/poly1305-glue.c b/arch/arm/crypto/poly1305-glue.c index 3023c1acfa19..c31bd8f7c092 100644 --- a/arch/arm/crypto/poly1305-glue.c +++ b/arch/arm/crypto/poly1305-glue.c @@ -29,7 +29,7 @@ void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit) static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); -void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) { poly1305_init_arm(&dctx->h, key); dctx->s[0] = get_unaligned_le32(key + 16); diff --git a/arch/arm/include/asm/paravirt.h b/arch/arm/include/asm/paravirt.h index cdbf02d9c1d4..95d5b0d625cd 100644 --- a/arch/arm/include/asm/paravirt.h +++ b/arch/arm/include/asm/paravirt.h @@ -3,23 +3,19 @@ #define _ASM_ARM_PARAVIRT_H #ifdef CONFIG_PARAVIRT +#include <linux/static_call_types.h> + struct static_key; extern struct static_key paravirt_steal_enabled; extern struct static_key paravirt_steal_rq_enabled; -struct pv_time_ops { - unsigned long long (*steal_clock)(int cpu); -}; - -struct paravirt_patch_template { - struct pv_time_ops time; -}; +u64 dummy_steal_clock(int cpu); -extern struct paravirt_patch_template pv_ops; +DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock); static inline u64 paravirt_steal_clock(int cpu) { - return pv_ops.time.steal_clock(cpu); + return static_call(pv_steal_clock)(cpu); } #endif diff --git a/arch/arm/kernel/paravirt.c b/arch/arm/kernel/paravirt.c index 4cfed91fe256..7dd9806369fb 100644 --- a/arch/arm/kernel/paravirt.c +++ b/arch/arm/kernel/paravirt.c @@ -9,10 +9,15 @@ #include <linux/export.h> #include <linux/jump_label.h> #include <linux/types.h> +#include <linux/static_call.h> #include <asm/paravirt.h> struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; -struct paravirt_patch_template pv_ops; -EXPORT_SYMBOL_GPL(pv_ops); +static u64 native_steal_clock(int cpu) +{ + return 0; +} + +DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); diff --git a/arch/arm/mach-footbridge/cats-pci.c b/arch/arm/mach-footbridge/cats-pci.c index 0b2fd7e2e9b4..90b1e9be430e 100644 --- a/arch/arm/mach-footbridge/cats-pci.c +++ b/arch/arm/mach-footbridge/cats-pci.c @@ -15,14 +15,14 @@ #include <asm/mach-types.h> /* cats host-specific stuff */ -static int irqmap_cats[] __initdata = { IRQ_PCI, IRQ_IN0, IRQ_IN1, IRQ_IN3 }; +static int irqmap_cats[] = { IRQ_PCI, IRQ_IN0, IRQ_IN1, IRQ_IN3 }; static u8 cats_no_swizzle(struct pci_dev *dev, u8 *pin) { return 0; } -static int __init cats_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +static int cats_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { if (dev->irq >= 255) return -1; /* not a valid interrupt. */ diff --git a/arch/arm/mach-footbridge/ebsa285-pci.c b/arch/arm/mach-footbridge/ebsa285-pci.c index 6f28aaa9ca79..c3f280d08fa7 100644 --- a/arch/arm/mach-footbridge/ebsa285-pci.c +++ b/arch/arm/mach-footbridge/ebsa285-pci.c @@ -14,9 +14,9 @@ #include <asm/mach/pci.h> #include <asm/mach-types.h> -static int irqmap_ebsa285[] __initdata = { IRQ_IN3, IRQ_IN1, IRQ_IN0, IRQ_PCI }; +static int irqmap_ebsa285[] = { IRQ_IN3, IRQ_IN1, IRQ_IN0, IRQ_PCI }; -static int __init ebsa285_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +static int ebsa285_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { if (dev->vendor == PCI_VENDOR_ID_CONTAQ && dev->device == PCI_DEVICE_ID_CONTAQ_82C693) diff --git a/arch/arm/mach-footbridge/netwinder-pci.c b/arch/arm/mach-footbridge/netwinder-pci.c index 9473aa0305e5..e8304392074b 100644 --- a/arch/arm/mach-footbridge/netwinder-pci.c +++ b/arch/arm/mach-footbridge/netwinder-pci.c @@ -18,7 +18,7 @@ * We now use the slot ID instead of the device identifiers to select * which interrupt is routed where. */ -static int __init netwinder_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +static int netwinder_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { switch (slot) { case 0: /* host bridge */ diff --git a/arch/arm/mach-footbridge/personal-pci.c b/arch/arm/mach-footbridge/personal-pci.c index 4391e433a4b2..9d19aa98a663 100644 --- a/arch/arm/mach-footbridge/personal-pci.c +++ b/arch/arm/mach-footbridge/personal-pci.c @@ -14,13 +14,12 @@ #include <asm/mach/pci.h> #include <asm/mach-types.h> -static int irqmap_personal_server[] __initdata = { +static int irqmap_personal_server[] = { IRQ_IN0, IRQ_IN1, IRQ_IN2, IRQ_IN3, 0, 0, 0, IRQ_DOORBELLHOST, IRQ_DMA1, IRQ_DMA2, IRQ_PCI }; -static int __init personal_server_map_irq(const struct pci_dev *dev, u8 slot, - u8 pin) +static int personal_server_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { unsigned char line; diff --git a/arch/arm/mach-imx/avic.c b/arch/arm/mach-imx/avic.c index 322caa21bcb3..21bce4049cec 100644 --- a/arch/arm/mach-imx/avic.c +++ b/arch/arm/mach-imx/avic.c @@ -7,6 +7,7 @@ #include <linux/module.h> #include <linux/irq.h> #include <linux/irqdomain.h> +#include <linux/irqchip.h> #include <linux/io.h> #include <linux/of.h> #include <linux/of_address.h> @@ -162,7 +163,7 @@ static void __exception_irq_entry avic_handle_irq(struct pt_regs *regs) * interrupts. It registers the interrupt enable and disable functions * to the kernel for each interrupt source. */ -void __init mxc_init_irq(void __iomem *irqbase) +static void __init mxc_init_irq(void __iomem *irqbase) { struct device_node *np; int irq_base; @@ -220,3 +221,16 @@ void __init mxc_init_irq(void __iomem *irqbase) printk(KERN_INFO "MXC IRQ initialized\n"); } + +static int __init imx_avic_init(struct device_node *node, + struct device_node *parent) +{ + void __iomem *avic_base; + + avic_base = of_iomap(node, 0); + BUG_ON(!avic_base); + mxc_init_irq(avic_base); + return 0; +} + +IRQCHIP_DECLARE(imx_avic, "fsl,avic", imx_avic_init); diff --git a/arch/arm/mach-imx/common.h b/arch/arm/mach-imx/common.h index 2b004cc4f95e..474dedb73bc7 100644 --- a/arch/arm/mach-imx/common.h +++ b/arch/arm/mach-imx/common.h @@ -22,7 +22,6 @@ void mx35_map_io(void); void imx21_init_early(void); void imx31_init_early(void); void imx35_init_early(void); -void mxc_init_irq(void __iomem *); void mx31_init_irq(void); void mx35_init_irq(void); void mxc_set_cpu_type(unsigned int type); diff --git a/arch/arm/mach-imx/mach-imx1.c b/arch/arm/mach-imx/mach-imx1.c index 32df3b8012f9..8eca92d66a2e 100644 --- a/arch/arm/mach-imx/mach-imx1.c +++ b/arch/arm/mach-imx/mach-imx1.c @@ -17,16 +17,6 @@ static void __init imx1_init_early(void) mxc_set_cpu_type(MXC_CPU_MX1); } -static void __init imx1_init_irq(void) -{ - void __iomem *avic_addr; - - avic_addr = ioremap(MX1_AVIC_ADDR, SZ_4K); - WARN_ON(!avic_addr); - - mxc_init_irq(avic_addr); -} - static const char * const imx1_dt_board_compat[] __initconst = { "fsl,imx1", NULL @@ -34,7 +24,6 @@ static const char * const imx1_dt_board_compat[] __initconst = { DT_MACHINE_START(IMX1_DT, "Freescale i.MX1 (Device Tree Support)") .init_early = imx1_init_early, - .init_irq = imx1_init_irq, .dt_compat = imx1_dt_board_compat, .restart = mxc_restart, MACHINE_END diff --git a/arch/arm/mach-imx/mach-imx25.c b/arch/arm/mach-imx/mach-imx25.c index 95de48a1aa7d..51927bd08aef 100644 --- a/arch/arm/mach-imx/mach-imx25.c +++ b/arch/arm/mach-imx/mach-imx25.c @@ -22,17 +22,6 @@ static void __init imx25_dt_init(void) imx_aips_allow_unprivileged_access("fsl,imx25-aips"); } -static void __init mx25_init_irq(void) -{ - struct device_node *np; - void __iomem *avic_base; - - np = of_find_compatible_node(NULL, NULL, "fsl,avic"); - avic_base = of_iomap(np, 0); - BUG_ON(!avic_base); - mxc_init_irq(avic_base); -} - static const char * const imx25_dt_board_compat[] __initconst = { "fsl,imx25", NULL @@ -42,6 +31,5 @@ DT_MACHINE_START(IMX25_DT, "Freescale i.MX25 (Device Tree Support)") .init_early = imx25_init_early, .init_machine = imx25_dt_init, .init_late = imx25_pm_init, - .init_irq = mx25_init_irq, .dt_compat = imx25_dt_board_compat, MACHINE_END diff --git a/arch/arm/mach-imx/mach-imx27.c b/arch/arm/mach-imx/mach-imx27.c index 262422a9c196..e325c9468105 100644 --- a/arch/arm/mach-imx/mach-imx27.c +++ b/arch/arm/mach-imx/mach-imx27.c @@ -56,17 +56,6 @@ static void __init imx27_init_early(void) mxc_set_cpu_type(MXC_CPU_MX27); } -static void __init mx27_init_irq(void) -{ - void __iomem *avic_base; - struct device_node *np; - - np = of_find_compatible_node(NULL, NULL, "fsl,avic"); - avic_base = of_iomap(np, 0); - BUG_ON(!avic_base); - mxc_init_irq(avic_base); -} - static const char * const imx27_dt_board_compat[] __initconst = { "fsl,imx27", NULL @@ -75,7 +64,6 @@ static const char * const imx27_dt_board_compat[] __initconst = { DT_MACHINE_START(IMX27_DT, "Freescale i.MX27 (Device Tree Support)") .map_io = mx27_map_io, .init_early = imx27_init_early, - .init_irq = mx27_init_irq, .init_late = imx27_pm_init, .dt_compat = imx27_dt_board_compat, MACHINE_END diff --git a/arch/arm/mach-imx/mach-imx31.c b/arch/arm/mach-imx/mach-imx31.c index dc69dfe600df..e9a1092b6093 100644 --- a/arch/arm/mach-imx/mach-imx31.c +++ b/arch/arm/mach-imx/mach-imx31.c @@ -14,6 +14,5 @@ static const char * const imx31_dt_board_compat[] __initconst = { DT_MACHINE_START(IMX31_DT, "Freescale i.MX31 (Device Tree Support)") .map_io = mx31_map_io, .init_early = imx31_init_early, - .init_irq = mx31_init_irq, .dt_compat = imx31_dt_board_compat, MACHINE_END diff --git a/arch/arm/mach-imx/mach-imx35.c b/arch/arm/mach-imx/mach-imx35.c index ec5c3068715c..0fc08218b77d 100644 --- a/arch/arm/mach-imx/mach-imx35.c +++ b/arch/arm/mach-imx/mach-imx35.c @@ -27,6 +27,5 @@ DT_MACHINE_START(IMX35_DT, "Freescale i.MX35 (Device Tree Support)") .l2c_aux_mask = ~0, .map_io = mx35_map_io, .init_early = imx35_init_early, - .init_irq = mx35_init_irq, .dt_compat = imx35_dt_board_compat, MACHINE_END diff --git a/arch/arm/mach-imx/mm-imx3.c b/arch/arm/mach-imx/mm-imx3.c index 5056438e5b42..28db97289ee8 100644 --- a/arch/arm/mach-imx/mm-imx3.c +++ b/arch/arm/mach-imx/mm-imx3.c @@ -109,18 +109,6 @@ void __init imx31_init_early(void) mx3_ccm_base = of_iomap(np, 0); BUG_ON(!mx3_ccm_base); } - -void __init mx31_init_irq(void) -{ - void __iomem *avic_base; - struct device_node *np; - - np = of_find_compatible_node(NULL, NULL, "fsl,imx31-avic"); - avic_base = of_iomap(np, 0); - BUG_ON(!avic_base); - - mxc_init_irq(avic_base); -} #endif /* ifdef CONFIG_SOC_IMX31 */ #ifdef CONFIG_SOC_IMX35 @@ -158,16 +146,4 @@ void __init imx35_init_early(void) mx3_ccm_base = of_iomap(np, 0); BUG_ON(!mx3_ccm_base); } - -void __init mx35_init_irq(void) -{ - void __iomem *avic_base; - struct device_node *np; - - np = of_find_compatible_node(NULL, NULL, "fsl,imx35-avic"); - avic_base = of_iomap(np, 0); - BUG_ON(!avic_base); - - mxc_init_irq(avic_base); -} #endif /* ifdef CONFIG_SOC_IMX35 */ diff --git a/arch/arm/mach-keystone/keystone.c b/arch/arm/mach-keystone/keystone.c index cd711bfc591f..2c647bdf8d25 100644 --- a/arch/arm/mach-keystone/keystone.c +++ b/arch/arm/mach-keystone/keystone.c @@ -65,7 +65,7 @@ static void __init keystone_init(void) static long long __init keystone_pv_fixup(void) { long long offset; - phys_addr_t mem_start, mem_end; + u64 mem_start, mem_end; mem_start = memblock_start_of_DRAM(); mem_end = memblock_end_of_DRAM(); @@ -78,7 +78,7 @@ static long long __init keystone_pv_fixup(void) if (mem_start < KEYSTONE_HIGH_PHYS_START || mem_end > KEYSTONE_HIGH_PHYS_END) { pr_crit("Invalid address space for memory (%08llx-%08llx)\n", - (u64)mem_start, (u64)mem_end); + mem_start, mem_end); return 0; } diff --git a/arch/arm/mach-omap1/ams-delta-fiq-handler.S b/arch/arm/mach-omap1/ams-delta-fiq-handler.S index 14a6c3eb3298..f745a65d3bd7 100644 --- a/arch/arm/mach-omap1/ams-delta-fiq-handler.S +++ b/arch/arm/mach-omap1/ams-delta-fiq-handler.S @@ -15,6 +15,7 @@ #include <linux/platform_data/gpio-omap.h> #include <asm/assembler.h> +#include <asm/irq.h> #include "ams-delta-fiq.h" #include "board-ams-delta.h" diff --git a/arch/arm/mach-omap2/board-generic.c b/arch/arm/mach-omap2/board-generic.c index 7290f033fd2d..1610c567a6a3 100644 --- a/arch/arm/mach-omap2/board-generic.c +++ b/arch/arm/mach-omap2/board-generic.c @@ -33,7 +33,7 @@ static void __init __maybe_unused omap_generic_init(void) } /* Clocks are needed early, see drivers/clocksource for the rest */ -void __init __maybe_unused omap_init_time_of(void) +static void __init __maybe_unused omap_init_time_of(void) { omap_clk_init(); timer_probe(); diff --git a/arch/arm/mach-omap2/omap-secure.c b/arch/arm/mach-omap2/omap-secure.c index f70d561f37f7..0659ab4cb0af 100644 --- a/arch/arm/mach-omap2/omap-secure.c +++ b/arch/arm/mach-omap2/omap-secure.c @@ -9,6 +9,7 @@ */ #include <linux/arm-smccc.h> +#include <linux/cpu_pm.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/io.h> @@ -20,6 +21,7 @@ #include "common.h" #include "omap-secure.h" +#include "soc.h" static phys_addr_t omap_secure_memblock_base; @@ -213,3 +215,40 @@ void __init omap_secure_init(void) { omap_optee_init_check(); } + +/* + * Dummy dispatcher call after core OSWR and MPU off. Updates the ROM return + * address after MMU has been re-enabled after CPU1 has been woken up again. + * Otherwise the ROM code will attempt to use the earlier physical return + * address that got set with MMU off when waking up CPU1. Only used on secure + * devices. + */ +static int cpu_notifier(struct notifier_block *nb, unsigned long cmd, void *v) +{ + switch (cmd) { + case CPU_CLUSTER_PM_EXIT: + omap_secure_dispatcher(OMAP4_PPA_SERVICE_0, + FLAG_START_CRITICAL, + 0, 0, 0, 0, 0); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block secure_notifier_block = { + .notifier_call = cpu_notifier, +}; + +static int __init secure_pm_init(void) +{ + if (omap_type() == OMAP2_DEVICE_TYPE_GP || !soc_is_omap44xx()) + return 0; + + cpu_pm_register_notifier(&secure_notifier_block); + + return 0; +} +omap_arch_initcall(secure_pm_init); diff --git a/arch/arm/mach-omap2/omap-secure.h b/arch/arm/mach-omap2/omap-secure.h index 4aaa95706d39..172069f31616 100644 --- a/arch/arm/mach-omap2/omap-secure.h +++ b/arch/arm/mach-omap2/omap-secure.h @@ -50,6 +50,7 @@ #define OMAP5_DRA7_MON_SET_ACR_INDEX 0x107 /* Secure PPA(Primary Protected Application) APIs */ +#define OMAP4_PPA_SERVICE_0 0x21 #define OMAP4_PPA_L2_POR_INDEX 0x23 #define OMAP4_PPA_CPU_ACTRL_SMP_INDEX 0x25 diff --git a/arch/arm/mach-omap2/pmic-cpcap.c b/arch/arm/mach-omap2/pmic-cpcap.c index 09076ad0576d..668dc84fd31e 100644 --- a/arch/arm/mach-omap2/pmic-cpcap.c +++ b/arch/arm/mach-omap2/pmic-cpcap.c @@ -246,10 +246,10 @@ int __init omap4_cpcap_init(void) omap_voltage_register_pmic(voltdm, &omap443x_max8952_mpu); if (of_machine_is_compatible("motorola,droid-bionic")) { - voltdm = voltdm_lookup("mpu"); + voltdm = voltdm_lookup("core"); omap_voltage_register_pmic(voltdm, &omap_cpcap_core); - voltdm = voltdm_lookup("mpu"); + voltdm = voltdm_lookup("iva"); omap_voltage_register_pmic(voltdm, &omap_cpcap_iva); } else { voltdm = voltdm_lookup("core"); diff --git a/arch/arm/mach-omap2/sr_device.c b/arch/arm/mach-omap2/sr_device.c index 62df666c2bd0..605925684b0a 100644 --- a/arch/arm/mach-omap2/sr_device.c +++ b/arch/arm/mach-omap2/sr_device.c @@ -88,34 +88,26 @@ static void __init sr_set_nvalues(struct omap_volt_data *volt_data, extern struct omap_sr_data omap_sr_pdata[]; -static int __init sr_dev_init(struct omap_hwmod *oh, void *user) +static int __init sr_init_by_name(const char *name, const char *voltdm) { struct omap_sr_data *sr_data = NULL; struct omap_volt_data *volt_data; - struct omap_smartreflex_dev_attr *sr_dev_attr; static int i; - if (!strncmp(oh->name, "smartreflex_mpu_iva", 20) || - !strncmp(oh->name, "smartreflex_mpu", 16)) + if (!strncmp(name, "smartreflex_mpu_iva", 20) || + !strncmp(name, "smartreflex_mpu", 16)) sr_data = &omap_sr_pdata[OMAP_SR_MPU]; - else if (!strncmp(oh->name, "smartreflex_core", 17)) + else if (!strncmp(name, "smartreflex_core", 17)) sr_data = &omap_sr_pdata[OMAP_SR_CORE]; - else if (!strncmp(oh->name, "smartreflex_iva", 16)) + else if (!strncmp(name, "smartreflex_iva", 16)) sr_data = &omap_sr_pdata[OMAP_SR_IVA]; if (!sr_data) { - pr_err("%s: Unknown instance %s\n", __func__, oh->name); + pr_err("%s: Unknown instance %s\n", __func__, name); return -EINVAL; } - sr_dev_attr = (struct omap_smartreflex_dev_attr *)oh->dev_attr; - if (!sr_dev_attr || !sr_dev_attr->sensor_voltdm_name) { - pr_err("%s: No voltage domain specified for %s. Cannot initialize\n", - __func__, oh->name); - goto exit; - } - - sr_data->name = oh->name; + sr_data->name = name; if (cpu_is_omap343x()) sr_data->ip_type = 1; else @@ -136,10 +128,10 @@ static int __init sr_dev_init(struct omap_hwmod *oh, void *user) } } - sr_data->voltdm = voltdm_lookup(sr_dev_attr->sensor_voltdm_name); + sr_data->voltdm = voltdm_lookup(voltdm); if (!sr_data->voltdm) { pr_err("%s: Unable to get voltage domain pointer for VDD %s\n", - __func__, sr_dev_attr->sensor_voltdm_name); + __func__, voltdm); goto exit; } @@ -160,6 +152,20 @@ exit: return 0; } +static int __init sr_dev_init(struct omap_hwmod *oh, void *user) +{ + struct omap_smartreflex_dev_attr *sr_dev_attr; + + sr_dev_attr = (struct omap_smartreflex_dev_attr *)oh->dev_attr; + if (!sr_dev_attr || !sr_dev_attr->sensor_voltdm_name) { + pr_err("%s: No voltage domain specified for %s. Cannot initialize\n", + __func__, oh->name); + return 0; + } + + return sr_init_by_name(oh->name, sr_dev_attr->sensor_voltdm_name); +} + /* * API to be called from board files to enable smartreflex * autocompensation at init. @@ -169,7 +175,42 @@ void __init omap_enable_smartreflex_on_init(void) sr_enable_on_init = true; } +static const char * const omap4_sr_instances[] = { + "mpu", + "iva", + "core", +}; + +static const char * const dra7_sr_instances[] = { + "mpu", + "core", +}; + int __init omap_devinit_smartreflex(void) { + const char * const *sr_inst = NULL; + int i, nr_sr = 0; + + if (soc_is_omap44xx()) { + sr_inst = omap4_sr_instances; + nr_sr = ARRAY_SIZE(omap4_sr_instances); + + } else if (soc_is_dra7xx()) { + sr_inst = dra7_sr_instances; + nr_sr = ARRAY_SIZE(dra7_sr_instances); + } + + if (nr_sr) { + const char *name, *voltdm; + + for (i = 0; i < nr_sr; i++) { + name = kasprintf(GFP_KERNEL, "smartreflex_%s", sr_inst[i]); + voltdm = sr_inst[i]; + sr_init_by_name(name, voltdm); + } + + return 0; + } + return omap_hwmod_for_each_by_class("smartreflex", sr_dev_init, NULL); } diff --git a/arch/arm/mach-pxa/mainstone.c b/arch/arm/mach-pxa/mainstone.c index d1010ec26e9f..d237bd030238 100644 --- a/arch/arm/mach-pxa/mainstone.c +++ b/arch/arm/mach-pxa/mainstone.c @@ -502,16 +502,20 @@ static inline void mainstone_init_keypad(void) {} #endif static int mst_pcmcia0_irqs[11] = { - [0 ... 10] = -1, + [0 ... 4] = -1, [5] = MAINSTONE_S0_CD_IRQ, + [6 ... 7] = -1, [8] = MAINSTONE_S0_STSCHG_IRQ, + [9] = -1, [10] = MAINSTONE_S0_IRQ, }; static int mst_pcmcia1_irqs[11] = { - [0 ... 10] = -1, + [0 ... 4] = -1, [5] = MAINSTONE_S1_CD_IRQ, + [6 ... 7] = -1, [8] = MAINSTONE_S1_STSCHG_IRQ, + [9] = -1, [10] = MAINSTONE_S1_IRQ, }; diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index a25b660c3017..c1e12aab67b8 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -387,8 +387,7 @@ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot) pte_t *pte = pte_offset_fixmap(pmd_off_k(vaddr), vaddr); /* Make sure fixmap region does not exceed available allocation. */ - BUILD_BUG_ON(FIXADDR_START + (__end_of_fixed_addresses * PAGE_SIZE) > - FIXADDR_END); + BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) < FIXADDR_START); BUG_ON(idx >= __end_of_fixed_addresses); /* we only support device mappings until pgprot_kernel has been set */ diff --git a/arch/arm/mm/pmsa-v7.c b/arch/arm/mm/pmsa-v7.c index 88950e41a3a9..59d916ccdf25 100644 --- a/arch/arm/mm/pmsa-v7.c +++ b/arch/arm/mm/pmsa-v7.c @@ -235,6 +235,7 @@ void __init pmsav7_adjust_lowmem_bounds(void) phys_addr_t mem_end; phys_addr_t reg_start, reg_end; unsigned int mem_max_regions; + bool first = true; int num; u64 i; @@ -263,7 +264,7 @@ void __init pmsav7_adjust_lowmem_bounds(void) #endif for_each_mem_range(i, ®_start, ®_end) { - if (i == 0) { + if (first) { phys_addr_t phys_offset = PHYS_OFFSET; /* @@ -275,6 +276,7 @@ void __init pmsav7_adjust_lowmem_bounds(void) mem_start = reg_start; mem_end = reg_end; specified_mem_size = mem_end - mem_start; + first = false; } else { /* * memblock auto merges contiguous blocks, remove diff --git a/arch/arm/mm/pmsa-v8.c b/arch/arm/mm/pmsa-v8.c index 2de019f7503e..8359748a19a1 100644 --- a/arch/arm/mm/pmsa-v8.c +++ b/arch/arm/mm/pmsa-v8.c @@ -95,10 +95,11 @@ void __init pmsav8_adjust_lowmem_bounds(void) { phys_addr_t mem_end; phys_addr_t reg_start, reg_end; + bool first = true; u64 i; for_each_mem_range(i, ®_start, ®_end) { - if (i == 0) { + if (first) { phys_addr_t phys_offset = PHYS_OFFSET; /* @@ -107,6 +108,7 @@ void __init pmsav8_adjust_lowmem_bounds(void) if (reg_start != phys_offset) panic("First memory bank must be contiguous from PHYS_OFFSET"); mem_end = reg_end; + first = false; } else { /* * memblock auto merges contiguous blocks, remove diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c index c4b49b322e8a..f5f790c6e5f8 100644 --- a/arch/arm/probes/uprobes/core.c +++ b/arch/arm/probes/uprobes/core.c @@ -204,7 +204,7 @@ unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) static struct undef_hook uprobes_arm_break_hook = { .instr_mask = 0x0fffffff, .instr_val = (UPROBE_SWBP_ARM_INSN & 0x0fffffff), - .cpsr_mask = MODE_MASK, + .cpsr_mask = (PSR_T_BIT | MODE_MASK), .cpsr_val = USR_MODE, .fn = uprobe_trap_handler, }; @@ -212,7 +212,7 @@ static struct undef_hook uprobes_arm_break_hook = { static struct undef_hook uprobes_arm_ss_hook = { .instr_mask = 0x0fffffff, .instr_val = (UPROBE_SS_ARM_INSN & 0x0fffffff), - .cpsr_mask = MODE_MASK, + .cpsr_mask = (PSR_T_BIT | MODE_MASK), .cpsr_val = USR_MODE, .fn = uprobe_trap_handler, }; diff --git a/arch/arm/xen/p2m.c b/arch/arm/xen/p2m.c index acb464547a54..84a1cea1f43b 100644 --- a/arch/arm/xen/p2m.c +++ b/arch/arm/xen/p2m.c @@ -11,6 +11,7 @@ #include <xen/xen.h> #include <xen/interface/memory.h> +#include <xen/grant_table.h> #include <xen/page.h> #include <xen/swiotlb-xen.h> @@ -109,7 +110,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, map_ops[i].status = GNTST_general_error; unmap.host_addr = map_ops[i].host_addr, unmap.handle = map_ops[i].handle; - map_ops[i].handle = ~0; + map_ops[i].handle = INVALID_GRANT_HANDLE; if (map_ops[i].flags & GNTMAP_device_map) unmap.dev_bus_addr = map_ops[i].dev_bus_addr; else @@ -130,7 +131,6 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, return 0; } -EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_unmap_grant_ref *kunmap_ops, @@ -145,7 +145,6 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, return 0; } -EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); bool __set_phys_to_machine_multi(unsigned long pfn, unsigned long mfn, unsigned long nr_pages) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1f212b47a48a..dfdc3e0af5e1 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -810,6 +810,16 @@ config QCOM_FALKOR_ERRATUM_E1041 If unsure, say Y. +config NVIDIA_CARMEL_CNP_ERRATUM + bool "NVIDIA Carmel CNP: CNP on Carmel semantically different than ARM cores" + default y + help + If CNP is enabled on Carmel cores, non-sharable TLBIs on a core will not + invalidate shared TLB entries installed by a different core, as it would + on standard ARM cores. + + If unsure, say Y. + config SOCIONEXT_SYNQUACER_PREITS bool "Socionext Synquacer: Workaround for GICv3 pre-ITS" default y @@ -1055,8 +1065,6 @@ config HW_PERF_EVENTS config SYS_SUPPORTS_HUGETLBFS def_bool y -config ARCH_WANT_HUGE_PMD_SHARE - config ARCH_HAS_CACHE_LINE_SIZE def_bool y @@ -1157,8 +1165,8 @@ config XEN config FORCE_MAX_ZONEORDER int - default "14" if (ARM64_64K_PAGES && TRANSPARENT_HUGEPAGE) - default "12" if (ARM64_16K_PAGES && TRANSPARENT_HUGEPAGE) + default "14" if ARM64_64K_PAGES + default "12" if ARM64_16K_PAGES default "11" help The kernel memory allocator divides physically contiguous memory @@ -1398,10 +1406,13 @@ config ARM64_PAN config AS_HAS_LDAPR def_bool $(as-instr,.arch_extension rcpc) +config AS_HAS_LSE_ATOMICS + def_bool $(as-instr,.arch_extension lse) + config ARM64_LSE_ATOMICS bool default ARM64_USE_LSE_ATOMICS - depends on $(as-instr,.arch_extension lse) + depends on AS_HAS_LSE_ATOMICS config ARM64_USE_LSE_ATOMICS bool "Atomic instructions" @@ -1658,6 +1669,7 @@ config ARM64_MTE default y depends on ARM64_AS_HAS_MTE && ARM64_TAGGED_ADDR_ABI depends on AS_HAS_ARMV8_5 + depends on AS_HAS_LSE_ATOMICS # Required for tag checking in the uaccess routines depends on ARM64_PAN select ARCH_USES_HIGH_VMA_FLAGS @@ -1855,12 +1867,6 @@ config CMDLINE_FROM_BOOTLOADER the boot loader doesn't provide any, the default kernel command string provided in CMDLINE will be used. -config CMDLINE_EXTEND - bool "Extend bootloader kernel arguments" - help - The command-line arguments provided by the boot loader will be - appended to the default kernel command string. - config CMDLINE_FORCE bool "Always use the default kernel command string" help diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-lts.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-lts.dts index 437ffe3628a5..596a25907432 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-lts.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-lts.dts @@ -19,3 +19,7 @@ }; }; }; + +&mmc0 { + broken-cd; /* card detect is broken on *some* boards */ +}; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi index 3402cec87035..df62044ff7a7 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi @@ -34,7 +34,7 @@ vmmc-supply = <®_dcdc1>; disable-wp; bus-width = <4>; - cd-gpios = <&pio 5 6 GPIO_ACTIVE_LOW>; /* PF6 */ + cd-gpios = <&pio 5 6 GPIO_ACTIVE_HIGH>; /* PF6 push-pull switch */ status = "okay"; }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts index 4f4755152fce..b5808047d6e4 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts @@ -289,10 +289,6 @@ vcc-pm-supply = <®_aldo1>; }; -&rtc { - clocks = <&ext_osc32k>; -}; - &spdif { status = "okay"; }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi index 49e979794094..af8b7d0ef750 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi @@ -995,9 +995,9 @@ compatible = "allwinner,sun8i-a23-rsb"; reg = <0x07083000 0x400>; interrupts = <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&r_ccu 13>; + clocks = <&r_ccu CLK_R_APB2_RSB>; clock-frequency = <3000000>; - resets = <&r_ccu 7>; + resets = <&r_ccu RST_R_APB2_RSB>; pinctrl-names = "default"; pinctrl-0 = <&r_rsb_pins>; status = "disabled"; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi index 7de6b376d792..9058cfa4980f 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi @@ -198,6 +198,7 @@ ranges = <0x0 0x00 0x1700000 0x100000>; reg = <0x00 0x1700000 0x0 0x100000>; interrupts = <GIC_SPI 75 IRQ_TYPE_LEVEL_HIGH>; + dma-coherent; sec_jr0: jr@10000 { compatible = "fsl,sec-v5.4-job-ring", diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi index 5a8a1dc4262d..28c51e521cb2 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi @@ -348,6 +348,7 @@ ranges = <0x0 0x00 0x1700000 0x100000>; reg = <0x00 0x1700000 0x0 0x100000>; interrupts = <0 75 0x4>; + dma-coherent; sec_jr0: jr@10000 { compatible = "fsl,sec-v5.4-job-ring", diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi index 1d6dfd189c7f..39458305e333 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi @@ -354,6 +354,7 @@ ranges = <0x0 0x00 0x1700000 0x100000>; reg = <0x00 0x1700000 0x0 0x100000>; interrupts = <GIC_SPI 75 IRQ_TYPE_LEVEL_HIGH>; + dma-coherent; sec_jr0: jr@10000 { compatible = "fsl,sec-v5.4-job-ring", diff --git a/arch/arm64/boot/dts/freescale/imx8mm-pinfunc.h b/arch/arm64/boot/dts/freescale/imx8mm-pinfunc.h index 5ccc4cc91959..a003e6af3353 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-pinfunc.h +++ b/arch/arm64/boot/dts/freescale/imx8mm-pinfunc.h @@ -124,7 +124,7 @@ #define MX8MM_IOMUXC_SD1_CMD_USDHC1_CMD 0x0A4 0x30C 0x000 0x0 0x0 #define MX8MM_IOMUXC_SD1_CMD_GPIO2_IO1 0x0A4 0x30C 0x000 0x5 0x0 #define MX8MM_IOMUXC_SD1_DATA0_USDHC1_DATA0 0x0A8 0x310 0x000 0x0 0x0 -#define MX8MM_IOMUXC_SD1_DATA0_GPIO2_IO2 0x0A8 0x31 0x000 0x5 0x0 +#define MX8MM_IOMUXC_SD1_DATA0_GPIO2_IO2 0x0A8 0x310 0x000 0x5 0x0 #define MX8MM_IOMUXC_SD1_DATA1_USDHC1_DATA1 0x0AC 0x314 0x000 0x0 0x0 #define MX8MM_IOMUXC_SD1_DATA1_GPIO2_IO3 0x0AC 0x314 0x000 0x5 0x0 #define MX8MM_IOMUXC_SD1_DATA2_USDHC1_DATA2 0x0B0 0x318 0x000 0x0 0x0 diff --git a/arch/arm64/boot/dts/freescale/imx8mp-phyboard-pollux-rdk.dts b/arch/arm64/boot/dts/freescale/imx8mp-phyboard-pollux-rdk.dts index 0e1a6d953389..122c95ddad30 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-phyboard-pollux-rdk.dts +++ b/arch/arm64/boot/dts/freescale/imx8mp-phyboard-pollux-rdk.dts @@ -35,7 +35,7 @@ &i2c2 { clock-frequency = <400000>; - pinctrl-names = "default"; + pinctrl-names = "default", "gpio"; pinctrl-0 = <&pinctrl_i2c2>; pinctrl-1 = <&pinctrl_i2c2_gpio>; sda-gpios = <&gpio5 17 (GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN)>; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-phycore-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-phycore-som.dtsi index 44a8c2337cee..f3965ec5b31d 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-phycore-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-phycore-som.dtsi @@ -67,7 +67,7 @@ &i2c1 { clock-frequency = <400000>; - pinctrl-names = "default"; + pinctrl-names = "default", "gpio"; pinctrl-0 = <&pinctrl_i2c1>; pinctrl-1 = <&pinctrl_i2c1_gpio>; sda-gpios = <&gpio5 15 (GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN)>; diff --git a/arch/arm64/boot/dts/freescale/imx8mq-pinfunc.h b/arch/arm64/boot/dts/freescale/imx8mq-pinfunc.h index b94b02080a34..68e8fa172974 100644 --- a/arch/arm64/boot/dts/freescale/imx8mq-pinfunc.h +++ b/arch/arm64/boot/dts/freescale/imx8mq-pinfunc.h @@ -130,7 +130,7 @@ #define MX8MQ_IOMUXC_SD1_CMD_USDHC1_CMD 0x0A4 0x30C 0x000 0x0 0x0 #define MX8MQ_IOMUXC_SD1_CMD_GPIO2_IO1 0x0A4 0x30C 0x000 0x5 0x0 #define MX8MQ_IOMUXC_SD1_DATA0_USDHC1_DATA0 0x0A8 0x310 0x000 0x0 0x0 -#define MX8MQ_IOMUXC_SD1_DATA0_GPIO2_IO2 0x0A8 0x31 0x000 0x5 0x0 +#define MX8MQ_IOMUXC_SD1_DATA0_GPIO2_IO2 0x0A8 0x310 0x000 0x5 0x0 #define MX8MQ_IOMUXC_SD1_DATA1_USDHC1_DATA1 0x0AC 0x314 0x000 0x0 0x0 #define MX8MQ_IOMUXC_SD1_DATA1_GPIO2_IO3 0x0AC 0x314 0x000 0x5 0x0 #define MX8MQ_IOMUXC_SD1_DATA2_USDHC1_DATA2 0x0B0 0x318 0x000 0x0 0x0 diff --git a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts index d239ab70ed99..53e817c5f6f3 100644 --- a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts +++ b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts @@ -1,7 +1,7 @@ // SPDX-License-Identifier: (GPL-2.0+ OR MIT) /* * Device Tree file for CZ.NIC Turris Mox Board - * 2019 by Marek Behun <marek.behun@nic.cz> + * 2019 by Marek Behún <kabel@kernel.org> */ /dts-v1/; diff --git a/arch/arm64/boot/dts/marvell/armada-cp11x.dtsi b/arch/arm64/boot/dts/marvell/armada-cp11x.dtsi index 64179a372ecf..c6f5df2deccf 100644 --- a/arch/arm64/boot/dts/marvell/armada-cp11x.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-cp11x.dtsi @@ -310,9 +310,11 @@ }; CP11X_LABEL(sata0): sata@540000 { - compatible = "marvell,armada-8k-ahci"; + compatible = "marvell,armada-8k-ahci", + "generic-ahci"; reg = <0x540000 0x30000>; dma-coherent; + interrupts = <107 IRQ_TYPE_LEVEL_HIGH>; clocks = <&CP11X_LABEL(clk) 1 15>, <&CP11X_LABEL(clk) 1 16>; #address-cells = <1>; @@ -320,12 +322,10 @@ status = "disabled"; sata-port@0 { - interrupts = <109 IRQ_TYPE_LEVEL_HIGH>; reg = <0>; }; sata-port@1 { - interrupts = <107 IRQ_TYPE_LEVEL_HIGH>; reg = <1>; }; }; diff --git a/arch/arm64/boot/dts/nvidia/tegra186-p2771-0000.dts b/arch/arm64/boot/dts/nvidia/tegra186-p2771-0000.dts index 9f5f5e1fa82e..683743f81849 100644 --- a/arch/arm64/boot/dts/nvidia/tegra186-p2771-0000.dts +++ b/arch/arm64/boot/dts/nvidia/tegra186-p2771-0000.dts @@ -10,7 +10,7 @@ model = "NVIDIA Jetson TX2 Developer Kit"; compatible = "nvidia,p2771-0000", "nvidia,tegra186"; - aconnect { + aconnect@2900000 { status = "okay"; dma-controller@2930000 { diff --git a/arch/arm64/boot/dts/nvidia/tegra186-p3310.dtsi b/arch/arm64/boot/dts/nvidia/tegra186-p3310.dtsi index fd9177447711..fcd71bfc6707 100644 --- a/arch/arm64/boot/dts/nvidia/tegra186-p3310.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra186-p3310.dtsi @@ -23,7 +23,7 @@ }; chosen { - bootargs = "earlycon console=ttyS0,115200n8"; + bootargs = "earlycon console=ttyS0,115200n8 fw_devlink=on"; stdout-path = "serial0:115200n8"; }; diff --git a/arch/arm64/boot/dts/nvidia/tegra186.dtsi b/arch/arm64/boot/dts/nvidia/tegra186.dtsi index 02b26b39cedc..9f75bbf00cf7 100644 --- a/arch/arm64/boot/dts/nvidia/tegra186.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra186.dtsi @@ -73,7 +73,7 @@ snps,rxpbl = <8>; }; - aconnect { + aconnect@2900000 { compatible = "nvidia,tegra186-aconnect", "nvidia,tegra210-aconnect"; clocks = <&bpmp TEGRA186_CLK_APE>, diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts b/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts index 2888efc42ba1..d618f197a1d3 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts +++ b/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts @@ -651,6 +651,8 @@ reg = <0x1a>; interrupt-parent = <&gpio>; interrupts = <TEGRA194_MAIN_GPIO(S, 5) GPIO_ACTIVE_HIGH>; + clocks = <&bpmp TEGRA194_CLK_AUD_MCLK>; + clock-names = "mclk"; realtek,jd-src = <2>; sound-name-prefix = "CVB-RT"; @@ -658,7 +660,6 @@ rt5658_ep: endpoint { remote-endpoint = <&i2s1_dap_ep>; mclk-fs = <256>; - clocks = <&bpmp TEGRA194_CLK_AUD_MCLK>; }; }; }; diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p3668-0000.dtsi b/arch/arm64/boot/dts/nvidia/tegra194-p3668-0000.dtsi index 7da3d48cb410..14da4206ea66 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194-p3668-0000.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra194-p3668-0000.dtsi @@ -5,6 +5,10 @@ model = "NVIDIA Jetson Xavier NX (SD-card)"; compatible = "nvidia,p3668-0000", "nvidia,tegra194"; + aliases { + mmc0 = "/bus@0/mmc@3400000"; + }; + bus@0 { /* SDMMC1 (SD/MMC) */ mmc@3400000 { diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p3668-0001.dtsi b/arch/arm64/boot/dts/nvidia/tegra194-p3668-0001.dtsi index b7808648cfe4..f5a9ebbfb12f 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194-p3668-0001.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra194-p3668-0001.dtsi @@ -5,6 +5,10 @@ model = "NVIDIA Jetson Xavier NX (eMMC)"; compatible = "nvidia,p3668-0001", "nvidia,tegra194"; + aliases { + mmc0 = "/bus@0/mmc@3460000"; + }; + bus@0 { /* SDMMC4 (eMMC) */ mmc@3460000 { diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi b/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi index 4f12721c332b..f16b0aa8a374 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi @@ -14,7 +14,6 @@ i2c5 = "/bus@0/i2c@31c0000"; i2c6 = "/bus@0/i2c@c250000"; i2c7 = "/bus@0/i2c@31e0000"; - mmc0 = "/bus@0/mmc@3460000"; rtc0 = "/bpmp/i2c/pmic@3c"; rtc1 = "/bus@0/rtc@c2a0000"; serial0 = &tcu; diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S index bbdb54702aa7..247011356d11 100644 --- a/arch/arm64/crypto/aes-modes.S +++ b/arch/arm64/crypto/aes-modes.S @@ -359,6 +359,7 @@ ST5( mov v4.16b, vctr.16b ) ins vctr.d[0], x8 /* apply carry to N counter blocks for N := x12 */ + cbz x12, 2f adr x16, 1f sub x16, x16, x12, lsl #3 br x16 diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c index 683de671741a..9c3d86e397bf 100644 --- a/arch/arm64/crypto/poly1305-glue.c +++ b/arch/arm64/crypto/poly1305-glue.c @@ -25,7 +25,7 @@ asmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce); static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); -void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) { poly1305_init_arm64(&dctx->h, key); dctx->s[0] = get_unaligned_le32(key + 16); diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h index 5df500dcc627..8a078fc662ac 100644 --- a/arch/arm64/include/asm/alternative-macros.h +++ b/arch/arm64/include/asm/alternative-macros.h @@ -97,9 +97,9 @@ .popsection .subsection 1 663: \insn2 -664: .previous - .org . - (664b-663b) + (662b-661b) +664: .org . - (664b-663b) + (662b-661b) .org . - (662b-661b) + (664b-663b) + .previous .endif .endm @@ -169,11 +169,11 @@ */ .macro alternative_endif 664: + .org . - (664b-663b) + (662b-661b) + .org . - (662b-661b) + (664b-663b) .if .Lasm_alt_mode==0 .previous .endif - .org . - (664b-663b) + (662b-661b) - .org . - (662b-661b) + (664b-663b) .endm /* diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h index 93a161b3bf3f..dc52b733675d 100644 --- a/arch/arm64/include/asm/checksum.h +++ b/arch/arm64/include/asm/checksum.h @@ -37,7 +37,7 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) } while (--n > 0); sum += ((sum >> 32) | (sum << 32)); - return csum_fold((__force u32)(sum >> 32)); + return csum_fold((__force __wsum)(sum >> 32)); } #define ip_fast_csum ip_fast_csum diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index b77d997b173b..c40f2490cd7b 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -66,7 +66,8 @@ #define ARM64_WORKAROUND_1508412 58 #define ARM64_HAS_LDAPR 59 #define ARM64_KVM_PROTECTED_MODE 60 +#define ARM64_WORKAROUND_NVIDIA_CARMEL_CNP 61 -#define ARM64_NCAPS 61 +#define ARM64_NCAPS 62 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 4e90c2debf70..94d4025acc0b 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -278,6 +278,7 @@ #define CPTR_EL2_DEFAULT CPTR_EL2_RES1 /* Hyp Debug Configuration Register bits */ +#define MDCR_EL2_TTRF (1 << 19) #define MDCR_EL2_TPMS (1 << 14) #define MDCR_EL2_E2PB_MASK (UL(0x3)) #define MDCR_EL2_E2PB_SHIFT (UL(12)) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 22d933e9b59e..a7ab84f781f7 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -47,10 +47,10 @@ #define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2 #define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3 #define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4 -#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_vmid 5 +#define __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context 5 #define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6 #define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_ich_vtr_el2 8 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_gic_config 8 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr 9 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr 10 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs 11 @@ -183,16 +183,16 @@ DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs); #define __bp_harden_hyp_vecs CHOOSE_HYP_SYM(__bp_harden_hyp_vecs) extern void __kvm_flush_vm_context(void); +extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu); extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, int level); extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu); -extern void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu); extern void __kvm_timer_set_cntvoff(u64 cntvoff); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); -extern u64 __vgic_v3_get_ich_vtr_el2(void); +extern u64 __vgic_v3_get_gic_config(void); extern u64 __vgic_v3_read_vmcr(void); extern void __vgic_v3_write_vmcr(u32 vmcr); extern void __vgic_v3_init_lrs(void); diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index c0450828378b..32ae676236b6 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -83,6 +83,11 @@ void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt); void __debug_switch_to_guest(struct kvm_vcpu *vcpu); void __debug_switch_to_host(struct kvm_vcpu *vcpu); +#ifdef __KVM_NVHE_HYPERVISOR__ +void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu); +void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu); +#endif + void __fpsimd_save_state(struct user_fpsimd_state *fp_regs); void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); @@ -97,7 +102,8 @@ bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt); void __noreturn hyp_panic(void); #ifdef __KVM_NVHE_HYPERVISOR__ -void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); +void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr, + u64 elr, u64 par); #endif #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index c759faf7a1ff..0aabc3be9a75 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -328,6 +328,11 @@ static inline void *phys_to_virt(phys_addr_t x) #define ARCH_PFN_OFFSET ((unsigned long)PHYS_PFN_OFFSET) #if !defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_DEBUG_VIRTUAL) +#define page_to_virt(x) ({ \ + __typeof__(x) __page = x; \ + void *__addr = __va(page_to_phys(__page)); \ + (void *)__tag_set((const void *)__addr, page_kasan_tag(__page));\ +}) #define virt_to_page(x) pfn_to_page(virt_to_pfn(x)) #else #define page_to_virt(x) ({ \ diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 70ce8c1d2b07..bd02e99b1a4c 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -63,23 +63,6 @@ static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm) extern u64 idmap_t0sz; extern u64 idmap_ptrs_per_pgd; -static inline bool __cpu_uses_extended_idmap(void) -{ - if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52)) - return false; - - return unlikely(idmap_t0sz != TCR_T0SZ(VA_BITS)); -} - -/* - * True if the extended ID map requires an extra level of translation table - * to be configured. - */ -static inline bool __cpu_uses_extended_idmap_level(void) -{ - return ARM64_HW_PGTABLE_LEVELS(64 - idmap_t0sz) > CONFIG_PGTABLE_LEVELS; -} - /* * Ensure TCR.T0SZ is set to the provided value. */ diff --git a/arch/arm64/include/asm/paravirt.h b/arch/arm64/include/asm/paravirt.h index cf3a0fd7c1a7..9aa193e0e8f2 100644 --- a/arch/arm64/include/asm/paravirt.h +++ b/arch/arm64/include/asm/paravirt.h @@ -3,23 +3,19 @@ #define _ASM_ARM64_PARAVIRT_H #ifdef CONFIG_PARAVIRT +#include <linux/static_call_types.h> + struct static_key; extern struct static_key paravirt_steal_enabled; extern struct static_key paravirt_steal_rq_enabled; -struct pv_time_ops { - unsigned long long (*steal_clock)(int cpu); -}; - -struct paravirt_patch_template { - struct pv_time_ops time; -}; +u64 dummy_steal_clock(int cpu); -extern struct paravirt_patch_template pv_ops; +DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock); static inline u64 paravirt_steal_clock(int cpu) { - return pv_ops.time.steal_clock(cpu); + return static_call(pv_steal_clock)(cpu); } int __init pv_time_init(void); diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 046be789fbb4..9a65fb528110 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -66,7 +66,6 @@ extern bool arm64_use_ng_mappings; #define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) #define PAGE_KERNEL __pgprot(PROT_NORMAL) -#define PAGE_KERNEL_TAGGED __pgprot(PROT_NORMAL_TAGGED) #define PAGE_KERNEL_RO __pgprot((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY) #define PAGE_KERNEL_ROX __pgprot((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY) #define PAGE_KERNEL_EXEC __pgprot(PROT_NORMAL & ~PTE_PXN) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index e17b96d0e4b5..47027796c2f9 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -486,6 +486,9 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN) #define pgprot_device(prot) \ __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_PXN | PTE_UXN) +#define pgprot_tagged(prot) \ + __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_TAGGED)) +#define pgprot_mhp pgprot_tagged /* * DMA allocations for non-coherent devices use what the Arm architecture calls * "Normal non-cacheable" memory, which permits speculation, unaligned accesses diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index ca2cd75d3286..efc10e9041a0 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -251,6 +251,8 @@ unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_switch_to(struct task_struct *prev, struct task_struct *next); +asmlinkage void arm64_preempt_schedule_irq(void); + #define task_pt_regs(p) \ ((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index dfd4edbfe360..d4a5fca984c3 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -796,6 +796,11 @@ #define ID_AA64MMFR0_PARANGE_48 0x5 #define ID_AA64MMFR0_PARANGE_52 0x6 +#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_DEFAULT 0x0 +#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE 0x1 +#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_MIN 0x2 +#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_MAX 0x7 + #ifdef CONFIG_ARM64_PA_BITS_52 #define ID_AA64MMFR0_PARANGE_MAX ID_AA64MMFR0_PARANGE_52 #else @@ -961,14 +966,17 @@ #define ID_PFR1_PROGMOD_SHIFT 0 #if defined(CONFIG_ARM64_4K_PAGES) -#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN4_SHIFT -#define ID_AA64MMFR0_TGRAN_SUPPORTED ID_AA64MMFR0_TGRAN4_SUPPORTED +#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN4_SHIFT +#define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN4_SUPPORTED +#define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX 0x7 #elif defined(CONFIG_ARM64_16K_PAGES) -#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN16_SHIFT -#define ID_AA64MMFR0_TGRAN_SUPPORTED ID_AA64MMFR0_TGRAN16_SUPPORTED +#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN16_SHIFT +#define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN16_SUPPORTED +#define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX 0xF #elif defined(CONFIG_ARM64_64K_PAGES) -#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN64_SHIFT -#define ID_AA64MMFR0_TGRAN_SUPPORTED ID_AA64MMFR0_TGRAN64_SUPPORTED +#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN64_SHIFT +#define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN64_SUPPORTED +#define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX 0x7 #endif #define MVFR2_FPMISC_SHIFT 4 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 9f4e3b266f21..6623c99f0984 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -55,6 +55,8 @@ void arch_setup_new_exec(void); #define arch_setup_new_exec arch_setup_new_exec void arch_release_task_struct(struct task_struct *tsk); +int arch_dup_task_struct(struct task_struct *dst, + struct task_struct *src); #endif diff --git a/arch/arm64/include/asm/word-at-a-time.h b/arch/arm64/include/asm/word-at-a-time.h index 3333950b5909..ea487218db79 100644 --- a/arch/arm64/include/asm/word-at-a-time.h +++ b/arch/arm64/include/asm/word-at-a-time.h @@ -53,7 +53,7 @@ static inline unsigned long find_zero(unsigned long mask) */ static inline unsigned long load_unaligned_zeropad(const void *addr) { - unsigned long ret, offset; + unsigned long ret, tmp; /* Load word from unaligned pointer addr */ asm( @@ -61,9 +61,9 @@ static inline unsigned long load_unaligned_zeropad(const void *addr) "2:\n" " .pushsection .fixup,\"ax\"\n" " .align 2\n" - "3: and %1, %2, #0x7\n" - " bic %2, %2, #0x7\n" - " ldr %0, [%2]\n" + "3: bic %1, %2, #0x7\n" + " ldr %0, [%1]\n" + " and %1, %2, #0x7\n" " lsl %1, %1, #0x3\n" #ifndef __AARCH64EB__ " lsr %0, %0, %1\n" @@ -73,7 +73,7 @@ static inline unsigned long load_unaligned_zeropad(const void *addr) " b 2b\n" " .popsection\n" _ASM_EXTABLE(1b, 3b) - : "=&r" (ret), "=&r" (offset) + : "=&r" (ret), "=&r" (tmp) : "r" (addr), "Q" (*(unsigned long *)addr)); return ret; diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 506a1cd37973..e2c20c036442 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -526,6 +526,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = { 1, 0), }, #endif +#ifdef CONFIG_NVIDIA_CARMEL_CNP_ERRATUM + { + /* NVIDIA Carmel */ + .desc = "NVIDIA Carmel CNP erratum", + .capability = ARM64_WORKAROUND_NVIDIA_CARMEL_CNP, + ERRATA_MIDR_ALL_VERSIONS(MIDR_NVIDIA_CARMEL), + }, +#endif { } }; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 066030717a4c..e5281e1c8f1d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -383,7 +383,6 @@ static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = { * of support. */ S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_PMUVER_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_TRACEVER_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_DEBUGVER_SHIFT, 4, 0x6), ARM64_FTR_END, }; @@ -1321,7 +1320,10 @@ has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope) * may share TLB entries with a CPU stuck in the crashed * kernel. */ - if (is_kdump_kernel()) + if (is_kdump_kernel()) + return false; + + if (cpus_have_const_cap(ARM64_WORKAROUND_NVIDIA_CARMEL_CNP)) return false; return has_cpuid_feature(entry, scope); diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 77605aec25fe..51fcf99d5351 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -353,7 +353,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) * with the CLIDR_EL1 fields to avoid triggering false warnings * when there is a mismatch across the CPUs. Keep track of the * effective value of the CTR_EL0 in our internal records for - * acurate sanity check and feature enablement. + * accurate sanity check and feature enablement. */ info->reg_ctr = read_cpuid_effective_cachetype(); info->reg_dczid = read_cpuid(DCZID_EL0); diff --git a/arch/arm64/kernel/crash_dump.c b/arch/arm64/kernel/crash_dump.c index e6e284265f19..58303a9ec32c 100644 --- a/arch/arm64/kernel/crash_dump.c +++ b/arch/arm64/kernel/crash_dump.c @@ -64,5 +64,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) { memcpy(buf, phys_to_virt((phys_addr_t)*ppos), count); + *ppos += count; + return count; } diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index a31a0a713c85..6acfc5e6b5e0 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -148,16 +148,18 @@ alternative_cb_end .endm /* Check for MTE asynchronous tag check faults */ - .macro check_mte_async_tcf, flgs, tmp + .macro check_mte_async_tcf, tmp, ti_flags #ifdef CONFIG_ARM64_MTE + .arch_extension lse alternative_if_not ARM64_MTE b 1f alternative_else_nop_endif mrs_s \tmp, SYS_TFSRE0_EL1 tbz \tmp, #SYS_TFSR_EL1_TF0_SHIFT, 1f /* Asynchronous TCF occurred for TTBR0 access, set the TI flag */ - orr \flgs, \flgs, #_TIF_MTE_ASYNC_FAULT - str \flgs, [tsk, #TSK_TI_FLAGS] + mov \tmp, #_TIF_MTE_ASYNC_FAULT + add \ti_flags, tsk, #TSK_TI_FLAGS + stset \tmp, [\ti_flags] msr_s SYS_TFSRE0_EL1, xzr 1: #endif @@ -244,7 +246,7 @@ alternative_else_nop_endif disable_step_tsk x19, x20 /* Check for asynchronous tag check faults in user space */ - check_mte_async_tcf x19, x22 + check_mte_async_tcf x22, x23 apply_ssbd 1, x22, x23 ptrauth_keys_install_kernel tsk, x20, x22, x23 diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 66b0e0b66e31..840bda1869e9 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -319,7 +319,7 @@ SYM_FUNC_START_LOCAL(__create_page_tables) */ adrp x5, __idmap_text_end clz x5, x5 - cmp x5, TCR_T0SZ(VA_BITS) // default T0SZ small enough? + cmp x5, TCR_T0SZ(VA_BITS_MIN) // default T0SZ small enough? b.ge 1f // .. then skip VA range extension adr_l x6, idmap_t0sz @@ -655,8 +655,10 @@ SYM_FUNC_END(__secondary_too_slow) SYM_FUNC_START(__enable_mmu) mrs x2, ID_AA64MMFR0_EL1 ubfx x2, x2, #ID_AA64MMFR0_TGRAN_SHIFT, 4 - cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED - b.ne __no_granule_support + cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MIN + b.lt __no_granule_support + cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MAX + b.gt __no_granule_support update_early_cpu_boot_status 0, x2, x3 adrp x2, idmap_pg_dir phys_to_ttbr x1, x1 diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index dffb16682330..83f1c4b92095 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -163,33 +163,36 @@ static __init void __parse_cmdline(const char *cmdline, bool parse_aliases) } while (1); } -static __init void parse_cmdline(void) +static __init const u8 *get_bootargs_cmdline(void) { - if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { - const u8 *prop; - void *fdt; - int node; + const u8 *prop; + void *fdt; + int node; - fdt = get_early_fdt_ptr(); - if (!fdt) - goto out; + fdt = get_early_fdt_ptr(); + if (!fdt) + return NULL; - node = fdt_path_offset(fdt, "/chosen"); - if (node < 0) - goto out; + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + return NULL; - prop = fdt_getprop(fdt, node, "bootargs", NULL); - if (!prop) - goto out; + prop = fdt_getprop(fdt, node, "bootargs", NULL); + if (!prop) + return NULL; - __parse_cmdline(prop, true); + return strlen(prop) ? prop : NULL; +} - if (!IS_ENABLED(CONFIG_CMDLINE_EXTEND)) - return; - } +static __init void parse_cmdline(void) +{ + const u8 *prop = get_bootargs_cmdline(); -out: - __parse_cmdline(CONFIG_CMDLINE, true); + if (IS_ENABLED(CONFIG_CMDLINE_FORCE) || !prop) + __parse_cmdline(CONFIG_CMDLINE, true); + + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && prop) + __parse_cmdline(prop, true); } /* Keep checkers quiet */ diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 23f1a557bd9f..5aa9ed1e9ec6 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -101,6 +101,9 @@ KVM_NVHE_ALIAS(__stop___kvm_ex_table); /* Array containing bases of nVHE per-CPU memory regions. */ KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base); +/* PMU available static key */ +KVM_NVHE_ALIAS(kvm_arm_pmu_available); + #endif /* CONFIG_KVM */ #endif /* __ARM64_KERNEL_IMAGE_VARS_H */ diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c index c07d7a034941..75fed4460407 100644 --- a/arch/arm64/kernel/paravirt.c +++ b/arch/arm64/kernel/paravirt.c @@ -18,6 +18,7 @@ #include <linux/reboot.h> #include <linux/slab.h> #include <linux/types.h> +#include <linux/static_call.h> #include <asm/paravirt.h> #include <asm/pvclock-abi.h> @@ -26,8 +27,12 @@ struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; -struct paravirt_patch_template pv_ops; -EXPORT_SYMBOL_GPL(pv_ops); +static u64 native_steal_clock(int cpu) +{ + return 0; +} + +DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); struct pv_time_stolen_time_region { struct pvclock_vcpu_stolen_time *kaddr; @@ -45,7 +50,7 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); /* return stolen time in ns by asking the hypervisor */ -static u64 pv_steal_clock(int cpu) +static u64 para_steal_clock(int cpu) { struct pv_time_stolen_time_region *reg; @@ -150,7 +155,7 @@ int __init pv_time_init(void) if (ret) return ret; - pv_ops.time.steal_clock = pv_steal_clock; + static_call_update(pv_steal_clock, para_steal_clock); static_key_slow_inc(¶virt_steal_enabled); if (steal_acc) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 7d2318f80955..4658fcf88c2b 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -460,7 +460,7 @@ static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx) return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx)); } -static inline u32 armv8pmu_read_evcntr(int idx) +static inline u64 armv8pmu_read_evcntr(int idx) { u32 counter = ARMV8_IDX_TO_COUNTER(idx); diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 66aac2881ba8..85645b2b0c7a 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -267,10 +267,12 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr) if (!instruction_pointer(regs)) BUG(); - if (kcb->kprobe_status == KPROBE_REENTER) + if (kcb->kprobe_status == KPROBE_REENTER) { restore_previous_kprobe(kcb); - else + } else { + kprobes_restore_local_irqflag(kcb, regs); reset_current_kprobe(); + } break; case KPROBE_HIT_ACTIVE: diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 325c83b1a24d..6e60aa3b5ea9 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -57,6 +57,8 @@ #include <asm/processor.h> #include <asm/pointer_auth.h> #include <asm/stacktrace.h> +#include <asm/switch_to.h> +#include <asm/system_misc.h> #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) #include <linux/stackprotector.h> diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 5bfd9b87f85d..4ea9392f86e0 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -134,7 +134,7 @@ SYM_FUNC_START(_cpu_resume) */ bl cpu_do_resume -#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK +#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK) mov x0, sp bl kasan_unpoison_task_stack_below #endif diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index ad20981dfda4..d55bdfb7789c 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -194,8 +194,9 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) #ifdef CONFIG_STACKTRACE -void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, - struct task_struct *task, struct pt_regs *regs) +noinline void arch_stack_walk(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task, + struct pt_regs *regs) { struct stackframe frame; @@ -203,8 +204,8 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, start_backtrace(&frame, regs->regs[29], regs->pc); else if (task == current) start_backtrace(&frame, - (unsigned long)__builtin_frame_address(0), - (unsigned long)arch_stack_walk); + (unsigned long)__builtin_frame_address(1), + (unsigned long)__builtin_return_address(0)); else start_backtrace(&frame, thread_saved_fp(task), thread_saved_pc(task)); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index fc4c95dd2d26..7f06ba76698d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -385,11 +385,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) last_ran = this_cpu_ptr(mmu->last_vcpu_ran); /* + * We guarantee that both TLBs and I-cache are private to each + * vcpu. If detecting that a vcpu from the same VM has + * previously run on the same physical CPU, call into the + * hypervisor code to nuke the relevant contexts. + * * We might get preempted before the vCPU actually runs, but * over-invalidation doesn't affect correctness. */ if (*last_ran != vcpu->vcpu_id) { - kvm_call_hyp(__kvm_tlb_flush_local_vmid, mmu); + kvm_call_hyp(__kvm_flush_cpu_context, mmu); *last_ran = vcpu->vcpu_id; } diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 7a7e425616b5..dbc890511631 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -89,6 +89,7 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) * - Debug ROM Address (MDCR_EL2_TDRA) * - OS related registers (MDCR_EL2_TDOSA) * - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB) + * - Self-hosted Trace Filter controls (MDCR_EL2_TTRF) * * Additionally, KVM only traps guest accesses to the debug registers if * the guest is not actively using them (see the KVM_ARM64_DEBUG_DIRTY @@ -112,6 +113,7 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK; vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM | MDCR_EL2_TPMS | + MDCR_EL2_TTRF | MDCR_EL2_TPMCR | MDCR_EL2_TDRA | MDCR_EL2_TDOSA); diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index b0afad7a99c6..e831d3dfd50d 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -85,8 +85,10 @@ SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL) // If the hyp context is loaded, go straight to hyp_panic get_loaded_vcpu x0, x1 - cbz x0, hyp_panic + cbnz x0, 1f + b hyp_panic +1: // The hyp context is saved so make sure it is restored to allow // hyp_panic to run at hyp and, subsequently, panic to run in the host. // This makes use of __guest_exit to avoid duplication but sets the @@ -94,7 +96,7 @@ SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL) // current state is saved to the guest context but it will only be // accurate if the guest had been completely restored. adr_this_cpu x0, kvm_hyp_ctxt, x1 - adr x1, hyp_panic + adr_l x1, hyp_panic str x1, [x0, #CPU_XREG_OFFSET(30)] get_vcpu_ptr x1, x0 @@ -146,7 +148,7 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // Now restore the hyp regs restore_callee_saved_regs x2 - set_loaded_vcpu xzr, x1, x2 + set_loaded_vcpu xzr, x2, x3 alternative_if ARM64_HAS_RAS_EXTN // If we have the RAS extensions we can consume a pending error diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 54f4860cd87c..6c1f51f25eb3 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -90,15 +90,18 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) * counter, which could make a PMXEVCNTR_EL0 access UNDEF at * EL1 instead of being trapped to EL2. */ - write_sysreg(0, pmselr_el0); - write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); + if (kvm_arm_support_pmu_v3()) { + write_sysreg(0, pmselr_el0); + write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); + } write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); } static inline void __deactivate_traps_common(void) { write_sysreg(0, hstr_el2); - write_sysreg(0, pmuserenr_el0); + if (kvm_arm_support_pmu_v3()) + write_sysreg(0, pmuserenr_el0); } static inline void ___activate_traps(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index 91a711aa8382..f401724f12ef 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -58,16 +58,24 @@ static void __debug_restore_spe(u64 pmscr_el1) write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1); } -void __debug_switch_to_guest(struct kvm_vcpu *vcpu) +void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) { /* Disable and flush SPE data generation */ __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1); +} + +void __debug_switch_to_guest(struct kvm_vcpu *vcpu) +{ __debug_switch_to_guest_common(vcpu); } -void __debug_switch_to_host(struct kvm_vcpu *vcpu) +void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) { __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); +} + +void __debug_switch_to_host(struct kvm_vcpu *vcpu) +{ __debug_switch_to_host_common(vcpu); } diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 6585a7cbbc56..5d94584840cc 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -71,7 +71,8 @@ SYM_FUNC_START(__host_enter) SYM_FUNC_END(__host_enter) /* - * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); + * void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr, + * u64 elr, u64 par); */ SYM_FUNC_START(__hyp_do_panic) /* Prepare and exit to the host's panic funciton. */ @@ -82,9 +83,11 @@ SYM_FUNC_START(__hyp_do_panic) hyp_kimg_va lr, x6 msr elr_el2, lr - /* Set the panic format string. Use the, now free, LR as scratch. */ - ldr lr, =__hyp_panic_string - hyp_kimg_va lr, x6 + mov x29, x0 + + /* Load the format string into x0 and arguments into x1-7 */ + ldr x0, =__hyp_panic_string + hyp_kimg_va x0, x6 /* Load the format arguments into x1-7. */ mov x6, x3 @@ -94,9 +97,7 @@ SYM_FUNC_START(__hyp_do_panic) mrs x5, hpfar_el2 /* Enter the host, conditionally restoring the host context. */ - cmp x0, xzr - mov x0, lr - b.eq __host_enter_without_restoring + cbz x29, __host_enter_without_restoring b __host_enter_for_panic SYM_FUNC_END(__hyp_do_panic) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index f012f8665ecc..936328207bde 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -46,11 +46,11 @@ static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) __kvm_tlb_flush_vmid(kern_hyp_va(mmu)); } -static void handle___kvm_tlb_flush_local_vmid(struct kvm_cpu_context *host_ctxt) +static void handle___kvm_flush_cpu_context(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); - __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu)); + __kvm_flush_cpu_context(kern_hyp_va(mmu)); } static void handle___kvm_timer_set_cntvoff(struct kvm_cpu_context *host_ctxt) @@ -67,9 +67,9 @@ static void handle___kvm_enable_ssbs(struct kvm_cpu_context *host_ctxt) write_sysreg_el2(tmp, SYS_SCTLR); } -static void handle___vgic_v3_get_ich_vtr_el2(struct kvm_cpu_context *host_ctxt) +static void handle___vgic_v3_get_gic_config(struct kvm_cpu_context *host_ctxt) { - cpu_reg(host_ctxt, 1) = __vgic_v3_get_ich_vtr_el2(); + cpu_reg(host_ctxt, 1) = __vgic_v3_get_gic_config(); } static void handle___vgic_v3_read_vmcr(struct kvm_cpu_context *host_ctxt) @@ -115,10 +115,10 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_flush_vm_context), HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), HANDLE_FUNC(__kvm_tlb_flush_vmid), - HANDLE_FUNC(__kvm_tlb_flush_local_vmid), + HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), HANDLE_FUNC(__kvm_enable_ssbs), - HANDLE_FUNC(__vgic_v3_get_ich_vtr_el2), + HANDLE_FUNC(__vgic_v3_get_gic_config), HANDLE_FUNC(__vgic_v3_read_vmcr), HANDLE_FUNC(__vgic_v3_write_vmcr), HANDLE_FUNC(__vgic_v3_init_lrs), diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index f3d0e9eca56c..68ab6b4d5141 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -192,6 +192,14 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) pmu_switch_needed = __pmu_switch_to_guest(host_ctxt); __sysreg_save_state_nvhe(host_ctxt); + /* + * We must flush and disable the SPE buffer for nVHE, as + * the translation regime(EL1&0) is going to be loaded with + * that of the guest. And we must do this before we change the + * translation regime to EL2 (via MDCR_EL2_E2PB == 0) and + * before we load guest Stage1. + */ + __debug_save_host_buffers_nvhe(vcpu); __adjust_pc(vcpu); @@ -234,11 +242,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) __fpsimd_save_fpexc32(vcpu); + __debug_switch_to_host(vcpu); /* * This must come after restoring the host sysregs, since a non-VHE * system may enable SPE here and make use of the TTBRs. */ - __debug_switch_to_host(vcpu); + __debug_restore_host_buffers_nvhe(vcpu); if (pmu_switch_needed) __pmu_switch_to_host(host_ctxt); @@ -257,7 +266,6 @@ void __noreturn hyp_panic(void) u64 spsr = read_sysreg_el2(SYS_SPSR); u64 elr = read_sysreg_el2(SYS_ELR); u64 par = read_sysreg_par(); - bool restore_host = true; struct kvm_cpu_context *host_ctxt; struct kvm_vcpu *vcpu; @@ -271,7 +279,7 @@ void __noreturn hyp_panic(void) __sysreg_restore_state_nvhe(host_ctxt); } - __hyp_do_panic(restore_host, spsr, elr, par); + __hyp_do_panic(host_ctxt, spsr, elr, par); unreachable(); } diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index fbde89a2c6e8..229b06748c20 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -123,7 +123,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) __tlb_switch_to_host(&cxt); } -void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) +void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) { struct tlb_inv_context cxt; @@ -131,6 +131,7 @@ void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) __tlb_switch_to_guest(mmu, &cxt); __tlbi(vmalle1); + asm volatile("ic iallu"); dsb(nsh); isb(); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 4d177ce1d536..926fc07074f5 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -223,6 +223,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, goto out; if (!table) { + data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level)); data->addr += kvm_granule_size(level); goto out; } diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 80406f463c28..39f8f7f9227c 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -405,9 +405,54 @@ void __vgic_v3_init_lrs(void) __gic_v3_set_lr(0, i); } -u64 __vgic_v3_get_ich_vtr_el2(void) +/* + * Return the GIC CPU configuration: + * - [31:0] ICH_VTR_EL2 + * - [62:32] RES0 + * - [63] MMIO (GICv2) capable + */ +u64 __vgic_v3_get_gic_config(void) { - return read_gicreg(ICH_VTR_EL2); + u64 val, sre = read_gicreg(ICC_SRE_EL1); + unsigned long flags = 0; + + /* + * To check whether we have a MMIO-based (GICv2 compatible) + * CPU interface, we need to disable the system register + * view. To do that safely, we have to prevent any interrupt + * from firing (which would be deadly). + * + * Note that this only makes sense on VHE, as interrupts are + * already masked for nVHE as part of the exception entry to + * EL2. + */ + if (has_vhe()) + flags = local_daif_save(); + + /* + * Table 11-2 "Permitted ICC_SRE_ELx.SRE settings" indicates + * that to be able to set ICC_SRE_EL1.SRE to 0, all the + * interrupt overrides must be set. You've got to love this. + */ + sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO); + isb(); + write_gicreg(0, ICC_SRE_EL1); + isb(); + + val = read_gicreg(ICC_SRE_EL1); + + write_gicreg(sre, ICC_SRE_EL1); + isb(); + sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0); + isb(); + + if (has_vhe()) + local_daif_restore(flags); + + val = (val & ICC_SRE_EL1_SRE) ? 0 : (1ULL << 63); + val |= read_gicreg(ICH_VTR_EL2); + + return val; } u64 __vgic_v3_read_vmcr(void) diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index fd7895945bbc..66f17349f0c3 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -127,7 +127,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) __tlb_switch_to_host(&cxt); } -void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) +void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) { struct tlb_inv_context cxt; @@ -135,6 +135,7 @@ void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) __tlb_switch_to_guest(mmu, &cxt); __tlbi(vmalle1); + asm volatile("ic iallu"); dsb(nsh); isb(); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 77cb2d28f2a4..8711894db8c2 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1312,8 +1312,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * Prevent userspace from creating a memory region outside of the IPA * space addressable by the KVM guest IPA space. */ - if (memslot->base_gfn + memslot->npages >= - (kvm_phys_size(kvm) >> PAGE_SHIFT)) + if ((memslot->base_gfn + memslot->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT)) return -EFAULT; mmap_read_lock(current->mm); diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c index d45b8b9a4415..739164324afe 100644 --- a/arch/arm64/kvm/perf.c +++ b/arch/arm64/kvm/perf.c @@ -11,6 +11,8 @@ #include <asm/kvm_emulate.h> +DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available); + static int kvm_is_in_guest(void) { return kvm_get_running_vcpu() != NULL; @@ -48,6 +50,14 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { int kvm_perf_init(void) { + /* + * Check if HW_PERF_EVENTS are supported by checking the number of + * hardware performance counters. This could ensure the presence of + * a physical PMU and CONFIG_PERF_EVENT is selected. + */ + if (IS_ENABLED(CONFIG_ARM_PMU) && perf_num_counters() > 0) + static_branch_enable(&kvm_arm_pmu_available); + return perf_register_guest_info_callbacks(&kvm_guest_cbs); } diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index e9ec08b0b070..e32c6e139a09 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -823,16 +823,6 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) return val & mask; } -bool kvm_arm_support_pmu_v3(void) -{ - /* - * Check if HW_PERF_EVENTS are supported by checking the number of - * hardware performance counters. This could ensure the presence of - * a physical PMU and CONFIG_PERF_EVENT is selected. - */ - return (perf_num_counters() > 0); -} - int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) { if (!kvm_vcpu_has_pmu(vcpu)) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 47f3f035f3ea..bd354cd45d28 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -311,23 +311,24 @@ int kvm_set_ipa_limit(void) } switch (cpuid_feature_extract_unsigned_field(mmfr0, tgran_2)) { - default: - case 1: + case ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE: kvm_err("PAGE_SIZE not supported at Stage-2, giving up\n"); return -EINVAL; - case 0: + case ID_AA64MMFR0_TGRAN_2_SUPPORTED_DEFAULT: kvm_debug("PAGE_SIZE supported at Stage-2 (default)\n"); break; - case 2: + case ID_AA64MMFR0_TGRAN_2_SUPPORTED_MIN ... ID_AA64MMFR0_TGRAN_2_SUPPORTED_MAX: kvm_debug("PAGE_SIZE supported at Stage-2 (advertised)\n"); break; + default: + kvm_err("Unsupported value for TGRAN_2, giving up\n"); + return -EINVAL; } kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange); - WARN(kvm_ipa_limit < KVM_PHYS_SHIFT, - "KVM IPA Size Limit (%d bits) is smaller than default size\n", - kvm_ipa_limit); - kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit); + kvm_info("IPA Size Limit: %d bits%s\n", kvm_ipa_limit, + ((kvm_ipa_limit < KVM_PHYS_SHIFT) ? + " (Reduced IPA size, limited VM/VMM compatibility)" : "")); return 0; } @@ -356,6 +357,11 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) return -EINVAL; } else { phys_shift = KVM_PHYS_SHIFT; + if (phys_shift > kvm_ipa_limit) { + pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", + current->comm); + return -EINVAL; + } } mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 52915b342351..6f530925a231 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -574,9 +574,13 @@ early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); */ int vgic_v3_probe(const struct gic_kvm_info *info) { - u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2); + u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config); + bool has_v2; int ret; + has_v2 = ich_vtr_el2 >> 63; + ich_vtr_el2 = (u32)ich_vtr_el2; + /* * The ListRegs field is 5 bits, but there is an architectural * maximum of 16 list registers. Just ignore bit 4... @@ -594,13 +598,15 @@ int vgic_v3_probe(const struct gic_kvm_info *info) gicv4_enable ? "en" : "dis"); } + kvm_vgic_global_state.vcpu_base = 0; + if (!info->vcpu.start) { kvm_info("GICv3: no GICV resource entry\n"); - kvm_vgic_global_state.vcpu_base = 0; + } else if (!has_v2) { + pr_warn(FW_BUG "CPU interface incapable of MMIO access\n"); } else if (!PAGE_ALIGNED(info->vcpu.start)) { pr_warn("GICV physical address 0x%llx not page aligned\n", (unsigned long long)info->vcpu.start); - kvm_vgic_global_state.vcpu_base = 0; } else { kvm_vgic_global_state.vcpu_base = info->vcpu.start; kvm_vgic_global_state.can_emulate_gicv2 = true; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 0ace5e68efba..3685e12aba9b 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -219,17 +219,40 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) int pfn_valid(unsigned long pfn) { - phys_addr_t addr = pfn << PAGE_SHIFT; + phys_addr_t addr = PFN_PHYS(pfn); - if ((addr >> PAGE_SHIFT) != pfn) + /* + * Ensure the upper PAGE_SHIFT bits are clear in the + * pfn. Else it might lead to false positives when + * some of the upper bits are set, but the lower bits + * match a valid pfn. + */ + if (PHYS_PFN(addr) != pfn) return 0; #ifdef CONFIG_SPARSEMEM +{ + struct mem_section *ms; + if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; - if (!valid_section(__pfn_to_section(pfn))) + ms = __pfn_to_section(pfn); + if (!valid_section(ms)) return 0; + + /* + * ZONE_DEVICE memory does not have the memblock entries. + * memblock_is_map_memory() check for ZONE_DEVICE based + * addresses will always fail. Even the normal hotplugged + * memory will never have MEMBLOCK_NOMAP flag set in their + * memblock entries. Skip memblock search for all non early + * memory sections covering all of hotplug memory including + * both normal and ZONE_DEVICE based. + */ + if (!early_section(ms)) + return pfn_section_valid(ms, pfn); +} #endif return memblock_is_map_memory(addr); } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 3802cfbdd20d..5d9550fdb9cf 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -40,7 +40,7 @@ #define NO_BLOCK_MAPPINGS BIT(0) #define NO_CONT_MAPPINGS BIT(1) -u64 idmap_t0sz = TCR_T0SZ(VA_BITS); +u64 idmap_t0sz = TCR_T0SZ(VA_BITS_MIN); u64 idmap_ptrs_per_pgd = PTRS_PER_PGD; u64 __section(".mmuoff.data.write") vabits_actual; @@ -512,7 +512,8 @@ static void __init map_mem(pgd_t *pgdp) * if MTE is present. Otherwise, it has the same attributes as * PAGE_KERNEL. */ - __map_memblock(pgdp, start, end, PAGE_KERNEL_TAGGED, flags); + __map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL), + flags); } /* @@ -1447,6 +1448,22 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size) struct range arch_get_mappable_range(void) { struct range mhp_range; + u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual)); + u64 end_linear_pa = __pa(PAGE_END - 1); + + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + /* + * Check for a wrap, it is possible because of randomized linear + * mapping the start physical address is actually bigger than + * the end physical address. In this case set start to zero + * because [0, end_linear_pa] range must still be able to cover + * all addressable physical addresses. + */ + if (start_linear_pa > end_linear_pa) + start_linear_pa = 0; + } + + WARN_ON(start_linear_pa > end_linear_pa); /* * Linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)] @@ -1454,8 +1471,9 @@ struct range arch_get_mappable_range(void) * range which can be mapped inside this linear mapping range, must * also be derived from its end points. */ - mhp_range.start = __pa(_PAGE_OFFSET(vabits_actual)); - mhp_range.end = __pa(PAGE_END - 1); + mhp_range.start = start_linear_pa; + mhp_range.end = end_linear_pa; + return mhp_range; } diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 34e91224adc3..8de5b987edb9 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -314,7 +314,7 @@ config FORCE_MAX_ZONEORDER int "Maximum zone order" default "11" -config RAM_BASE +config DRAM_BASE hex "DRAM start addr (the same with memory-section in dts)" default 0x0 diff --git a/arch/csky/include/asm/page.h b/arch/csky/include/asm/page.h index 3b91fc3cf36f..ed7451478b1b 100644 --- a/arch/csky/include/asm/page.h +++ b/arch/csky/include/asm/page.h @@ -28,7 +28,7 @@ #define SSEG_SIZE 0x20000000 #define LOWMEM_LIMIT (SSEG_SIZE * 2) -#define PHYS_OFFSET_OFFSET (CONFIG_RAM_BASE & (SSEG_SIZE - 1)) +#define PHYS_OFFSET_OFFSET (CONFIG_DRAM_BASE & (SSEG_SIZE - 1)) #ifndef __ASSEMBLY__ diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c index ae2b1c7b3b5c..ef2bb9bd9605 100644 --- a/arch/csky/kernel/probes/ftrace.c +++ b/arch/csky/kernel/probes/ftrace.c @@ -9,7 +9,7 @@ int arch_check_ftrace_location(struct kprobe *p) return 0; } -/* Ftrace callback handler for kprobes -- called under preepmt disabed */ +/* Ftrace callback handler for kprobes -- called under preepmt disabled */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { diff --git a/arch/ia64/configs/generic_defconfig b/arch/ia64/configs/generic_defconfig index ca0d596c800d..8916a2850c48 100644 --- a/arch/ia64/configs/generic_defconfig +++ b/arch/ia64/configs/generic_defconfig @@ -55,8 +55,6 @@ CONFIG_CHR_DEV_SG=m CONFIG_SCSI_FC_ATTRS=y CONFIG_SCSI_SYM53C8XX_2=y CONFIG_SCSI_QLOGIC_1280=y -CONFIG_ATA=y -CONFIG_ATA_PIIX=y CONFIG_SATA_VITESSE=y CONFIG_MD=y CONFIG_BLK_DEV_MD=m diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h index b3aa46090101..08179135905c 100644 --- a/arch/ia64/include/asm/ptrace.h +++ b/arch/ia64/include/asm/ptrace.h @@ -54,8 +54,7 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs) { - /* FIXME: should this be bspstore + nr_dirty regs? */ - return regs->ar_bspstore; + return regs->r12; } static inline int is_syscall_success(struct pt_regs *regs) @@ -79,11 +78,6 @@ static inline long regs_return_value(struct pt_regs *regs) unsigned long __ip = instruction_pointer(regs); \ (__ip & ~3UL) + ((__ip & 3UL) << 2); \ }) -/* - * Why not default? Because user_stack_pointer() on ia64 gives register - * stack backing store instead... - */ -#define current_user_stack_pointer() (current_pt_regs()->r12) /* given a pointer to a task_struct, return the user's pt_regs */ # define task_pt_regs(t) (((struct pt_regs *) ((char *) (t) + IA64_STK_OFFSET)) - 1) diff --git a/arch/ia64/include/asm/syscall.h b/arch/ia64/include/asm/syscall.h index 6c6f16e409a8..0d23c0049301 100644 --- a/arch/ia64/include/asm/syscall.h +++ b/arch/ia64/include/asm/syscall.h @@ -32,7 +32,7 @@ static inline void syscall_rollback(struct task_struct *task, static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { - return regs->r10 == -1 ? regs->r8:0; + return regs->r10 == -1 ? -regs->r8:0; } static inline long syscall_get_return_value(struct task_struct *task, diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c index 8b5b8e6bc9d9..dd5bfed52031 100644 --- a/arch/ia64/kernel/err_inject.c +++ b/arch/ia64/kernel/err_inject.c @@ -59,7 +59,7 @@ show_##name(struct device *dev, struct device_attribute *attr, \ char *buf) \ { \ u32 cpu=dev->id; \ - return sprintf(buf, "%lx\n", name[cpu]); \ + return sprintf(buf, "%llx\n", name[cpu]); \ } #define store(name) \ @@ -86,9 +86,9 @@ store_call_start(struct device *dev, struct device_attribute *attr, #ifdef ERR_INJ_DEBUG printk(KERN_DEBUG "pal_mc_err_inject for cpu%d:\n", cpu); - printk(KERN_DEBUG "err_type_info=%lx,\n", err_type_info[cpu]); - printk(KERN_DEBUG "err_struct_info=%lx,\n", err_struct_info[cpu]); - printk(KERN_DEBUG "err_data_buffer=%lx, %lx, %lx.\n", + printk(KERN_DEBUG "err_type_info=%llx,\n", err_type_info[cpu]); + printk(KERN_DEBUG "err_struct_info=%llx,\n", err_struct_info[cpu]); + printk(KERN_DEBUG "err_data_buffer=%llx, %llx, %llx.\n", err_data_buffer[cpu].data1, err_data_buffer[cpu].data2, err_data_buffer[cpu].data3); @@ -117,8 +117,8 @@ store_call_start(struct device *dev, struct device_attribute *attr, #ifdef ERR_INJ_DEBUG printk(KERN_DEBUG "Returns: status=%d,\n", (int)status[cpu]); - printk(KERN_DEBUG "capabilities=%lx,\n", capabilities[cpu]); - printk(KERN_DEBUG "resources=%lx\n", resources[cpu]); + printk(KERN_DEBUG "capabilities=%llx,\n", capabilities[cpu]); + printk(KERN_DEBUG "resources=%llx\n", resources[cpu]); #endif return size; } @@ -131,7 +131,7 @@ show_virtual_to_phys(struct device *dev, struct device_attribute *attr, char *buf) { unsigned int cpu=dev->id; - return sprintf(buf, "%lx\n", phys_addr[cpu]); + return sprintf(buf, "%llx\n", phys_addr[cpu]); } static ssize_t @@ -145,7 +145,7 @@ store_virtual_to_phys(struct device *dev, struct device_attribute *attr, ret = get_user_pages_fast(virt_addr, 1, FOLL_WRITE, NULL); if (ret<=0) { #ifdef ERR_INJ_DEBUG - printk("Virtual address %lx is not existing.\n",virt_addr); + printk("Virtual address %llx is not existing.\n", virt_addr); #endif return -EINVAL; } @@ -163,7 +163,7 @@ show_err_data_buffer(struct device *dev, { unsigned int cpu=dev->id; - return sprintf(buf, "%lx, %lx, %lx\n", + return sprintf(buf, "%llx, %llx, %llx\n", err_data_buffer[cpu].data1, err_data_buffer[cpu].data2, err_data_buffer[cpu].data3); @@ -178,13 +178,13 @@ store_err_data_buffer(struct device *dev, int ret; #ifdef ERR_INJ_DEBUG - printk("write err_data_buffer=[%lx,%lx,%lx] on cpu%d\n", + printk("write err_data_buffer=[%llx,%llx,%llx] on cpu%d\n", err_data_buffer[cpu].data1, err_data_buffer[cpu].data2, err_data_buffer[cpu].data3, cpu); #endif - ret=sscanf(buf, "%lx, %lx, %lx", + ret = sscanf(buf, "%llx, %llx, %llx", &err_data_buffer[cpu].data1, &err_data_buffer[cpu].data2, &err_data_buffer[cpu].data3); diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index d4cae2fc69ca..adf6521525f4 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1824,7 +1824,7 @@ ia64_mca_cpu_init(void *cpu_data) data = mca_bootmem(); first_time = 0; } else - data = (void *)__get_free_pages(GFP_KERNEL, + data = (void *)__get_free_pages(GFP_ATOMIC, get_order(sz)); if (!data) panic("Could not allocate MCA memory for cpu %d\n", diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index c3490ee2daa5..e14f5653393a 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -2013,27 +2013,39 @@ static void syscall_get_set_args_cb(struct unw_frame_info *info, void *data) { struct syscall_get_set_args *args = data; struct pt_regs *pt = args->regs; - unsigned long *krbs, cfm, ndirty; + unsigned long *krbs, cfm, ndirty, nlocals, nouts; int i, count; if (unw_unwind_to_user(info) < 0) return; + /* + * We get here via a few paths: + * - break instruction: cfm is shared with caller. + * syscall args are in out= regs, locals are non-empty. + * - epsinstruction: cfm is set by br.call + * locals don't exist. + * + * For both cases argguments are reachable in cfm.sof - cfm.sol. + * CFM: [ ... | sor: 17..14 | sol : 13..7 | sof : 6..0 ] + */ cfm = pt->cr_ifs; + nlocals = (cfm >> 7) & 0x7f; /* aka sol */ + nouts = (cfm & 0x7f) - nlocals; /* aka sof - sol */ krbs = (unsigned long *)info->task + IA64_RBS_OFFSET/8; ndirty = ia64_rse_num_regs(krbs, krbs + (pt->loadrs >> 19)); count = 0; if (in_syscall(pt)) - count = min_t(int, args->n, cfm & 0x7f); + count = min_t(int, args->n, nouts); + /* Iterate over outs. */ for (i = 0; i < count; i++) { + int j = ndirty + nlocals + i + args->i; if (args->rw) - *ia64_rse_skip_regs(krbs, ndirty + i + args->i) = - args->args[i]; + *ia64_rse_skip_regs(krbs, j) = args->args[i]; else - args->args[i] = *ia64_rse_skip_regs(krbs, - ndirty + i + args->i); + args->args[i] = *ia64_rse_skip_regs(krbs, j); } if (!args->rw) { diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 03b3a02375ff..c310b4c99fb3 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -95,7 +95,7 @@ static int __init build_node_maps(unsigned long start, unsigned long len, * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been * called yet. Note that node 0 will also count all non-existent cpus. */ -static int __meminit early_nr_cpus_node(int node) +static int early_nr_cpus_node(int node) { int cpu, n = 0; @@ -110,7 +110,7 @@ static int __meminit early_nr_cpus_node(int node) * compute_pernodesize - compute size of pernode data * @node: the node id. */ -static unsigned long __meminit compute_pernodesize(int node) +static unsigned long compute_pernodesize(int node) { unsigned long pernodesize = 0, cpus; @@ -367,7 +367,7 @@ static void __init reserve_pernode_space(void) } } -static void __meminit scatter_node_data(void) +static void scatter_node_data(void) { pg_data_t **dst; int node; diff --git a/arch/m68k/include/asm/page_mm.h b/arch/m68k/include/asm/page_mm.h index 7f5912af2a52..2411ea9ef578 100644 --- a/arch/m68k/include/asm/page_mm.h +++ b/arch/m68k/include/asm/page_mm.h @@ -167,11 +167,11 @@ static inline __attribute_const__ int __virt_to_node_shift(void) ((__p) - pgdat->node_mem_map) + pgdat->node_start_pfn; \ }) #else -#define ARCH_PFN_OFFSET (m68k_memory[0].addr) +#define ARCH_PFN_OFFSET (m68k_memory[0].addr >> PAGE_SHIFT) #include <asm-generic/memory_model.h> #endif -#define virt_addr_valid(kaddr) ((void *)(kaddr) >= (void *)PAGE_OFFSET && (void *)(kaddr) < high_memory) +#define virt_addr_valid(kaddr) ((unsigned long)(kaddr) >= PAGE_OFFSET && (unsigned long)(kaddr) < (unsigned long)high_memory) #define pfn_valid(pfn) virt_addr_valid(pfn_to_virt(pfn)) #endif /* __ASSEMBLY__ */ diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index 6bbe52025de3..8d0f862ee9d7 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -30,8 +30,8 @@ extern unsigned long memory_end; #define page_to_pfn(page) virt_to_pfn(page_to_virt(page)) #define pfn_valid(pfn) ((pfn) < max_mapnr) -#define virt_addr_valid(kaddr) (((void *)(kaddr) >= (void *)PAGE_OFFSET) && \ - ((void *)(kaddr) < (void *)memory_end)) +#define virt_addr_valid(kaddr) (((unsigned long)(kaddr) >= PAGE_OFFSET) && \ + ((unsigned long)(kaddr) < memory_end)) #endif /* __ASSEMBLY__ */ diff --git a/arch/mips/boot/compressed/decompress.c b/arch/mips/boot/compressed/decompress.c index e3946b06e840..3d70d15ada28 100644 --- a/arch/mips/boot/compressed/decompress.c +++ b/arch/mips/boot/compressed/decompress.c @@ -14,6 +14,7 @@ #include <asm/addrspace.h> #include <asm/unaligned.h> +#include <asm-generic/vmlinux.lds.h> /* * These two variables specify the free mem region @@ -120,6 +121,13 @@ void decompress_kernel(unsigned long boot_heap_start) /* last four bytes is always image size in little endian */ image_size = get_unaligned_le32((void *)&__image_end - 4); + /* The device tree's address must be properly aligned */ + image_size = ALIGN(image_size, STRUCT_ALIGNMENT); + + puts("Copy device tree to address "); + puthex(VMLINUX_LOAD_ADDRESS_ULL + image_size); + puts("\n"); + /* copy dtb to where the booted kernel will expect it */ memcpy((void *)VMLINUX_LOAD_ADDRESS_ULL + image_size, __appended_dtb, dtb_size); diff --git a/arch/mips/crypto/Makefile b/arch/mips/crypto/Makefile index 8e1deaf00e0c..5e4105cccf9f 100644 --- a/arch/mips/crypto/Makefile +++ b/arch/mips/crypto/Makefile @@ -12,8 +12,8 @@ AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o poly1305-mips-y := poly1305-core.o poly1305-glue.o -perlasm-flavour-$(CONFIG_CPU_MIPS32) := o32 -perlasm-flavour-$(CONFIG_CPU_MIPS64) := 64 +perlasm-flavour-$(CONFIG_32BIT) := o32 +perlasm-flavour-$(CONFIG_64BIT) := 64 quiet_cmd_perlasm = PERLASM $@ cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@) diff --git a/arch/mips/crypto/poly1305-glue.c b/arch/mips/crypto/poly1305-glue.c index fc881b46d911..bc6110fb98e0 100644 --- a/arch/mips/crypto/poly1305-glue.c +++ b/arch/mips/crypto/poly1305-glue.c @@ -17,7 +17,7 @@ asmlinkage void poly1305_init_mips(void *state, const u8 *key); asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit); asmlinkage void poly1305_emit_mips(void *state, u8 *digest, const u32 *nonce); -void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) { poly1305_init_mips(&dctx->h, key); dctx->s[0] = get_unaligned_le32(key + 16); diff --git a/arch/mips/include/asm/traps.h b/arch/mips/include/asm/traps.h index 6aa8f126a43d..b710e76c9c65 100644 --- a/arch/mips/include/asm/traps.h +++ b/arch/mips/include/asm/traps.h @@ -24,8 +24,11 @@ extern void (*board_ebase_setup)(void); extern void (*board_cache_error_setup)(void); extern int register_nmi_notifier(struct notifier_block *nb); +extern void reserve_exception_space(phys_addr_t addr, unsigned long size); extern char except_vec_nmi[]; +#define VECTORSPACING 0x100 /* for EI/VI mode */ + #define nmi_notifier(fn, pri) \ ({ \ static struct notifier_block fn##_nb = { \ diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index 9a89637b4ecf..b71892064f27 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -26,6 +26,7 @@ #include <asm/elf.h> #include <asm/pgtable-bits.h> #include <asm/spram.h> +#include <asm/traps.h> #include <linux/uaccess.h> #include "fpu-probe.h" @@ -1628,6 +1629,7 @@ static inline void cpu_probe_broadcom(struct cpuinfo_mips *c, unsigned int cpu) c->cputype = CPU_BMIPS3300; __cpu_name[cpu] = "Broadcom BMIPS3300"; set_elf_platform(cpu, "bmips3300"); + reserve_exception_space(0x400, VECTORSPACING * 64); break; case PRID_IMP_BMIPS43XX: { int rev = c->processor_id & PRID_REV_MASK; @@ -1638,6 +1640,7 @@ static inline void cpu_probe_broadcom(struct cpuinfo_mips *c, unsigned int cpu) __cpu_name[cpu] = "Broadcom BMIPS4380"; set_elf_platform(cpu, "bmips4380"); c->options |= MIPS_CPU_RIXI; + reserve_exception_space(0x400, VECTORSPACING * 64); } else { c->cputype = CPU_BMIPS4350; __cpu_name[cpu] = "Broadcom BMIPS4350"; @@ -1654,6 +1657,7 @@ static inline void cpu_probe_broadcom(struct cpuinfo_mips *c, unsigned int cpu) __cpu_name[cpu] = "Broadcom BMIPS5000"; set_elf_platform(cpu, "bmips5000"); c->options |= MIPS_CPU_ULRI | MIPS_CPU_RIXI; + reserve_exception_space(0x1000, VECTORSPACING * 64); break; } } @@ -2133,6 +2137,8 @@ void cpu_probe(void) if (cpu == 0) __ua_limit = ~((1ull << cpu_vmbits) - 1); #endif + + reserve_exception_space(0, 0x1000); } void cpu_report(void) diff --git a/arch/mips/kernel/cpu-r3k-probe.c b/arch/mips/kernel/cpu-r3k-probe.c index abdbbe8c5a43..af654771918c 100644 --- a/arch/mips/kernel/cpu-r3k-probe.c +++ b/arch/mips/kernel/cpu-r3k-probe.c @@ -21,6 +21,7 @@ #include <asm/fpu.h> #include <asm/mipsregs.h> #include <asm/elf.h> +#include <asm/traps.h> #include "fpu-probe.h" @@ -158,6 +159,8 @@ void cpu_probe(void) cpu_set_fpu_opts(c); else cpu_set_nofpu_opts(c); + + reserve_exception_space(0, 0x400); } void cpu_report(void) diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 279be0153f8b..23a140327a0b 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -43,7 +43,7 @@ #include <asm/prom.h> #ifdef CONFIG_MIPS_ELF_APPENDED_DTB -const char __section(".appended_dtb") __appended_dtb[0x100000]; +char __section(".appended_dtb") __appended_dtb[0x100000]; #endif /* CONFIG_MIPS_ELF_APPENDED_DTB */ struct cpuinfo_mips cpu_data[NR_CPUS] __read_mostly; diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index e0352958e2f7..808b8b61ded1 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -2009,13 +2009,16 @@ void __noreturn nmi_exception_handler(struct pt_regs *regs) nmi_exit(); } -#define VECTORSPACING 0x100 /* for EI/VI mode */ - unsigned long ebase; EXPORT_SYMBOL_GPL(ebase); unsigned long exception_handlers[32]; unsigned long vi_handlers[64]; +void reserve_exception_space(phys_addr_t addr, unsigned long size) +{ + memblock_reserve(addr, size); +} + void __init *set_except_vector(int n, void *addr) { unsigned long handler = (unsigned long) addr; @@ -2367,10 +2370,7 @@ void __init trap_init(void) if (!cpu_has_mips_r2_r6) { ebase = CAC_BASE; - ebase_pa = virt_to_phys((void *)ebase); vec_size = 0x400; - - memblock_reserve(ebase_pa, vec_size); } else { if (cpu_has_veic || cpu_has_vint) vec_size = 0x200 + VECTORSPACING*64; diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index c1c345be04ff..1f98947fe715 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -145,6 +145,7 @@ SECTIONS } #ifdef CONFIG_MIPS_ELF_APPENDED_DTB + STRUCT_ALIGN(); .appended_dtb : AT(ADDR(.appended_dtb) - LOAD_OFFSET) { *(.appended_dtb) KEEP(*(.appended_dtb)) @@ -172,6 +173,11 @@ SECTIONS #endif #ifdef CONFIG_MIPS_RAW_APPENDED_DTB + .fill : { + FILL(0); + BYTE(0); + STRUCT_ALIGN(); + } __appended_dtb = .; /* leave space for appended DTB */ . += 0x100000; diff --git a/arch/nds32/mm/cacheflush.c b/arch/nds32/mm/cacheflush.c index 6eb98a7ad27d..ad5344ef5d33 100644 --- a/arch/nds32/mm/cacheflush.c +++ b/arch/nds32/mm/cacheflush.c @@ -238,7 +238,7 @@ void flush_dcache_page(struct page *page) { struct address_space *mapping; - mapping = page_mapping(page); + mapping = page_mapping_file(page); if (mapping && !mapping_mapped(mapping)) set_bit(PG_dcache_dirty, &page->flags); else { diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 4e53ac46e857..afc3b8d03572 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -203,9 +203,12 @@ config PREFETCH def_bool y depends on PA8X00 || PA7200 +config PARISC_HUGE_KERNEL + def_bool y if !MODULES || UBSAN || FTRACE || COMPILE_TEST + config MLONGCALLS - def_bool y if !MODULES || UBSAN || FTRACE - bool "Enable the -mlong-calls compiler option for big kernels" if MODULES && !UBSAN && !FTRACE + def_bool y if PARISC_HUGE_KERNEL + bool "Enable the -mlong-calls compiler option for big kernels" if !PARISC_HUGE_KERNEL depends on PA8X00 help If you configure the kernel to include many drivers built-in instead diff --git a/arch/parisc/include/asm/cmpxchg.h b/arch/parisc/include/asm/cmpxchg.h index cf5ee9b0b393..84ee232278a6 100644 --- a/arch/parisc/include/asm/cmpxchg.h +++ b/arch/parisc/include/asm/cmpxchg.h @@ -72,7 +72,7 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new_, int size) #endif case 4: return __cmpxchg_u32((unsigned int *)ptr, (unsigned int)old, (unsigned int)new_); - case 1: return __cmpxchg_u8((u8 *)ptr, (u8)old, (u8)new_); + case 1: return __cmpxchg_u8((u8 *)ptr, old & 0xff, new_ & 0xff); } __cmpxchg_called_with_bad_pointer(); return old; diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index 11ece0d07374..b5fbcd2c1780 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -272,7 +272,6 @@ on downward growing arches, it looks like this: regs->gr[23] = 0; \ } while(0) -struct task_struct; struct mm_struct; /* Free all resources held by a thread. */ diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c index 2127974982df..65de6c4c9354 100644 --- a/arch/parisc/kernel/ptrace.c +++ b/arch/parisc/kernel/ptrace.c @@ -567,8 +567,6 @@ static const struct user_regset_view user_parisc_native_view = { }; #ifdef CONFIG_64BIT -#include <linux/compat.h> - static int gpr32_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) diff --git a/arch/parisc/math-emu/fpu.h b/arch/parisc/math-emu/fpu.h index 853c19c03828..dec951d40286 100644 --- a/arch/parisc/math-emu/fpu.h +++ b/arch/parisc/math-emu/fpu.h @@ -5,34 +5,10 @@ * Floating-point emulation code * Copyright (C) 2001 Hewlett-Packard (Paul Bame) <bame@debian.org> */ -/* - * BEGIN_DESC - * - * File: - * @(#) pa/fp/fpu.h $Revision: 1.1 $ - * - * Purpose: - * <<please update with a synopis of the functionality provided by this file>> - * - * - * END_DESC -*/ - -#ifdef __NO_PA_HDRS - PA header file -- do not include this header file for non-PA builds. -#endif - #ifndef _MACHINE_FPU_INCLUDED /* allows multiple inclusion */ #define _MACHINE_FPU_INCLUDED -#if 0 -#ifndef _SYS_STDSYMS_INCLUDED -# include <sys/stdsyms.h> -#endif /* _SYS_STDSYMS_INCLUDED */ -#include <machine/pdc/pdc_rqsts.h> -#endif - #define PA83_FPU_FLAG 0x00000001 #define PA89_FPU_FLAG 0x00000002 #define PA2_0_FPU_FLAG 0x00000010 @@ -43,21 +19,19 @@ #define COPR_FP 0x00000080 /* Floating point -- Coprocessor 0 */ #define SFU_MPY_DIVIDE 0x00008000 /* Multiply/Divide __ SFU 0 */ - #define EM_FPU_TYPE_OFFSET 272 /* version of EMULATION software for COPR,0,0 instruction */ #define EMULATION_VERSION 4 /* - * The only was to differeniate between TIMEX and ROLEX (or PCX-S and PCX-T) - * is thorough the potential type field from the PDC_MODEL call. The - * following flags are used at assist this differeniation. + * The only way to differentiate between TIMEX and ROLEX (or PCX-S and PCX-T) + * is through the potential type field from the PDC_MODEL call. + * The following flags are used to assist this differentiation. */ #define ROLEX_POTENTIAL_KEY_FLAGS PDC_MODEL_CPU_KEY_WORD_TO_IO #define TIMEX_POTENTIAL_KEY_FLAGS (PDC_MODEL_CPU_KEY_QUAD_STORE | \ PDC_MODEL_CPU_KEY_RECIP_SQRT) - #endif /* ! _MACHINE_FPU_INCLUDED */ diff --git a/arch/powerpc/crypto/sha1-spe-glue.c b/arch/powerpc/crypto/sha1-spe-glue.c index b1e577cbf00c..88e8ea73bfa7 100644 --- a/arch/powerpc/crypto/sha1-spe-glue.c +++ b/arch/powerpc/crypto/sha1-spe-glue.c @@ -107,7 +107,7 @@ static int ppc_spe_sha1_update(struct shash_desc *desc, const u8 *data, src += bytes; len -= bytes; - }; + } memcpy((char *)sctx->buffer, src, len); return 0; diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index eacc9102c251..f1d029bf906e 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -73,9 +73,10 @@ void __patch_exception(int exc, unsigned long addr); #endif #define OP_RT_RA_MASK 0xffff0000UL -#define LIS_R2 0x3c020000UL -#define ADDIS_R2_R12 0x3c4c0000UL -#define ADDI_R2_R2 0x38420000UL +#define LIS_R2 (PPC_INST_ADDIS | __PPC_RT(R2)) +#define ADDIS_R2_R12 (PPC_INST_ADDIS | __PPC_RT(R2) | __PPC_RA(R12)) +#define ADDI_R2_R2 (PPC_INST_ADDI | __PPC_RT(R2) | __PPC_RA(R2)) + static inline unsigned long ppc_function_entry(void *func) { diff --git a/arch/powerpc/include/asm/cpu_has_feature.h b/arch/powerpc/include/asm/cpu_has_feature.h index 7897d16e0990..727d4b321937 100644 --- a/arch/powerpc/include/asm/cpu_has_feature.h +++ b/arch/powerpc/include/asm/cpu_has_feature.h @@ -7,7 +7,7 @@ #include <linux/bug.h> #include <asm/cputable.h> -static inline bool early_cpu_has_feature(unsigned long feature) +static __always_inline bool early_cpu_has_feature(unsigned long feature) { return !!((CPU_FTRS_ALWAYS & feature) || (CPU_FTRS_POSSIBLE & cur_cpu_spec->cpu_features & feature)); @@ -46,7 +46,7 @@ static __always_inline bool cpu_has_feature(unsigned long feature) return static_branch_likely(&cpu_feature_keys[i]); } #else -static inline bool cpu_has_feature(unsigned long feature) +static __always_inline bool cpu_has_feature(unsigned long feature) { return early_cpu_has_feature(feature); } diff --git a/arch/powerpc/include/asm/dcr-native.h b/arch/powerpc/include/asm/dcr-native.h index 7141ccea8c94..a92059964579 100644 --- a/arch/powerpc/include/asm/dcr-native.h +++ b/arch/powerpc/include/asm/dcr-native.h @@ -53,8 +53,8 @@ static inline void mtdcrx(unsigned int reg, unsigned int val) #define mfdcr(rn) \ ({unsigned int rval; \ if (__builtin_constant_p(rn) && rn < 1024) \ - asm volatile("mfdcr %0," __stringify(rn) \ - : "=r" (rval)); \ + asm volatile("mfdcr %0, %1" : "=r" (rval) \ + : "n" (rn)); \ else if (likely(cpu_has_feature(CPU_FTR_INDEXED_DCR))) \ rval = mfdcrx(rn); \ else \ @@ -64,8 +64,8 @@ static inline void mtdcrx(unsigned int reg, unsigned int val) #define mtdcr(rn, v) \ do { \ if (__builtin_constant_p(rn) && rn < 1024) \ - asm volatile("mtdcr " __stringify(rn) ",%0" \ - : : "r" (v)); \ + asm volatile("mtdcr %0, %1" \ + : : "n" (rn), "r" (v)); \ else if (likely(cpu_has_feature(CPU_FTR_INDEXED_DCR))) \ mtdcrx(rn, v); \ else \ diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index aedfba29e43a..e8d09a841373 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -410,7 +410,6 @@ DECLARE_INTERRUPT_HANDLER(altivec_assist_exception); DECLARE_INTERRUPT_HANDLER(CacheLockingException); DECLARE_INTERRUPT_HANDLER(SPEFloatingPointException); DECLARE_INTERRUPT_HANDLER(SPEFloatingPointRoundException); -DECLARE_INTERRUPT_HANDLER(unrecoverable_exception); DECLARE_INTERRUPT_HANDLER(WatchdogException); DECLARE_INTERRUPT_HANDLER(kernel_bad_stack); @@ -437,6 +436,8 @@ DECLARE_INTERRUPT_HANDLER_NMI(hmi_exception_realmode); DECLARE_INTERRUPT_HANDLER_ASYNC(TAUException); +void unrecoverable_exception(struct pt_regs *regs); + void replay_system_reset(void); void replay_soft_interrupts(void); diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 80b27f5d9648..607168b1aef4 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -228,7 +228,7 @@ enum { #define MMU_FTRS_ALWAYS 0 #endif -static inline bool early_mmu_has_feature(unsigned long feature) +static __always_inline bool early_mmu_has_feature(unsigned long feature) { if (MMU_FTRS_ALWAYS & feature) return true; @@ -286,7 +286,7 @@ static inline void mmu_feature_keys_init(void) } -static inline bool mmu_has_feature(unsigned long feature) +static __always_inline bool mmu_has_feature(unsigned long feature) { return early_mmu_has_feature(feature); } diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 975ba260006a..1499e928ea6a 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -195,7 +195,7 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) #define TRAP_FLAGS_MASK 0x11 #define TRAP(regs) ((regs)->trap & ~TRAP_FLAGS_MASK) #define FULL_REGS(regs) (((regs)->trap & 1) == 0) -#define SET_FULL_REGS(regs) ((regs)->trap |= 1) +#define SET_FULL_REGS(regs) ((regs)->trap &= ~1) #endif #define CHECK_FULL_REGS(regs) BUG_ON(!FULL_REGS(regs)) #define NV_REG_POISON 0xdeadbeefdeadbeefUL @@ -210,7 +210,7 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) #define TRAP_FLAGS_MASK 0x1F #define TRAP(regs) ((regs)->trap & ~TRAP_FLAGS_MASK) #define FULL_REGS(regs) (((regs)->trap & 1) == 0) -#define SET_FULL_REGS(regs) ((regs)->trap |= 1) +#define SET_FULL_REGS(regs) ((regs)->trap &= ~1) #define IS_CRITICAL_EXC(regs) (((regs)->trap & 2) != 0) #define IS_MCHECK_EXC(regs) (((regs)->trap & 4) != 0) #define IS_DEBUG_EXC(regs) (((regs)->trap & 8) != 0) diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index fdab93428372..9d1fbd8be1c7 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -71,6 +71,16 @@ static inline void disable_kernel_vsx(void) { msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX); } +#else +static inline void enable_kernel_vsx(void) +{ + BUILD_BUG(); +} + +static inline void disable_kernel_vsx(void) +{ + BUILD_BUG(); +} #endif #ifdef CONFIG_SPE diff --git a/arch/powerpc/include/asm/vio.h b/arch/powerpc/include/asm/vio.h index 0cf52746531b..721c0d6715ac 100644 --- a/arch/powerpc/include/asm/vio.h +++ b/arch/powerpc/include/asm/vio.h @@ -113,7 +113,7 @@ struct vio_driver { const char *name; const struct vio_device_id *id_table; int (*probe)(struct vio_dev *dev, const struct vio_device_id *id); - int (*remove)(struct vio_dev *dev); + void (*remove)(struct vio_dev *dev); /* A driver must have a get_desired_dma() function to * be loaded in a CMO environment if it uses DMA. */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 6084fa499aa3..f66b63e81c3b 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -191,3 +191,7 @@ $(obj)/prom_init_check: $(src)/prom_init_check.sh $(obj)/prom_init.o FORCE targets += prom_init_check clean-files := vmlinux.lds + +# Force dependency (incbin is bad) +$(obj)/vdso32_wrapper.o : $(obj)/vdso32/vdso32.so.dbg +$(obj)/vdso64_wrapper.o : $(obj)/vdso64/vdso64.so.dbg diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 60d3051a8bc8..8082b690e874 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -466,7 +466,7 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real) ld r10,PACAKMSR(r13) /* get MSR value for kernel */ /* MSR[RI] is clear iff using SRR regs */ - .if IHSRR == EXC_HV_OR_STD + .if IHSRR_IF_HVMODE BEGIN_FTR_SECTION xori r10,r10,MSR_RI END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S index 727fdab557c9..565e84e20a72 100644 --- a/arch/powerpc/kernel/head_book3s_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -457,11 +457,12 @@ InstructionTLBMiss: cmplw 0,r1,r3 #endif mfspr r2, SPRN_SDR1 - li r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC + li r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC | _PAGE_USER rlwinm r2, r2, 28, 0xfffff000 #ifdef CONFIG_MODULES bgt- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + li r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ #endif 112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ @@ -520,10 +521,11 @@ DataLoadTLBMiss: lis r1, TASK_SIZE@h /* check if kernel address */ cmplw 0,r1,r3 mfspr r2, SPRN_SDR1 - li r1, _PAGE_PRESENT | _PAGE_ACCESSED + li r1, _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_USER rlwinm r2, r2, 28, 0xfffff000 bgt- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + li r1, _PAGE_PRESENT | _PAGE_ACCESSED addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ 112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ @@ -597,10 +599,11 @@ DataStoreTLBMiss: lis r1, TASK_SIZE@h /* check if kernel address */ cmplw 0,r1,r3 mfspr r2, SPRN_SDR1 - li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED + li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_USER rlwinm r2, r2, 28, 0xfffff000 bgt- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ 112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index 398cd86b6ada..c475a229a42a 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -149,7 +149,7 @@ notrace long system_call_exception(long r3, long r4, long r5, * enabled when the interrupt handler returns (indicating a process-context / * synchronous interrupt) then irqs_enabled should be true. */ -static notrace inline bool __prep_irq_for_enabled_exit(bool clear_ri) +static notrace __always_inline bool __prep_irq_for_enabled_exit(bool clear_ri) { /* This must be done with RI=1 because tracing may touch vmaps */ trace_hardirqs_on(); @@ -436,7 +436,6 @@ again: return ret; } -void unrecoverable_exception(struct pt_regs *regs); void preempt_schedule_irq(void); notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsigned long msr) diff --git a/arch/powerpc/kernel/ptrace/Makefile b/arch/powerpc/kernel/ptrace/Makefile index 8ebc11d1168d..77abd1a5a508 100644 --- a/arch/powerpc/kernel/ptrace/Makefile +++ b/arch/powerpc/kernel/ptrace/Makefile @@ -6,11 +6,11 @@ CFLAGS_ptrace-view.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' obj-y += ptrace.o ptrace-view.o -obj-$(CONFIG_PPC_FPU_REGS) += ptrace-fpu.o +obj-y += ptrace-fpu.o obj-$(CONFIG_COMPAT) += ptrace32.o obj-$(CONFIG_VSX) += ptrace-vsx.o ifneq ($(CONFIG_VSX),y) -obj-$(CONFIG_PPC_FPU_REGS) += ptrace-novsx.o +obj-y += ptrace-novsx.o endif obj-$(CONFIG_ALTIVEC) += ptrace-altivec.o obj-$(CONFIG_SPE) += ptrace-spe.o diff --git a/arch/powerpc/kernel/ptrace/ptrace-decl.h b/arch/powerpc/kernel/ptrace/ptrace-decl.h index 3487f2c9735c..eafe5f0f6289 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-decl.h +++ b/arch/powerpc/kernel/ptrace/ptrace-decl.h @@ -165,22 +165,8 @@ int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data); extern const struct user_regset_view user_ppc_native_view; /* ptrace-fpu */ -#ifdef CONFIG_PPC_FPU_REGS int ptrace_get_fpr(struct task_struct *child, int index, unsigned long *data); int ptrace_put_fpr(struct task_struct *child, int index, unsigned long data); -#else -static inline int -ptrace_get_fpr(struct task_struct *child, int index, unsigned long *data) -{ - return -EIO; -} - -static inline int -ptrace_put_fpr(struct task_struct *child, int index, unsigned long data) -{ - return -EIO; -} -#endif /* ptrace-(no)adv */ void ppc_gethwdinfo(struct ppc_debug_info *dbginfo); diff --git a/arch/powerpc/kernel/ptrace/ptrace-fpu.c b/arch/powerpc/kernel/ptrace/ptrace-fpu.c index 8301cb52dd99..5dca19361316 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-fpu.c +++ b/arch/powerpc/kernel/ptrace/ptrace-fpu.c @@ -8,32 +8,42 @@ int ptrace_get_fpr(struct task_struct *child, int index, unsigned long *data) { +#ifdef CONFIG_PPC_FPU_REGS unsigned int fpidx = index - PT_FPR0; +#endif if (index > PT_FPSCR) return -EIO; +#ifdef CONFIG_PPC_FPU_REGS flush_fp_to_thread(child); if (fpidx < (PT_FPSCR - PT_FPR0)) memcpy(data, &child->thread.TS_FPR(fpidx), sizeof(long)); else *data = child->thread.fp_state.fpscr; +#else + *data = 0; +#endif return 0; } int ptrace_put_fpr(struct task_struct *child, int index, unsigned long data) { +#ifdef CONFIG_PPC_FPU_REGS unsigned int fpidx = index - PT_FPR0; +#endif if (index > PT_FPSCR) return -EIO; +#ifdef CONFIG_PPC_FPU_REGS flush_fp_to_thread(child); if (fpidx < (PT_FPSCR - PT_FPR0)) memcpy(&child->thread.TS_FPR(fpidx), &data, sizeof(long)); else child->thread.fp_state.fpscr = data; +#endif return 0; } diff --git a/arch/powerpc/kernel/ptrace/ptrace-novsx.c b/arch/powerpc/kernel/ptrace/ptrace-novsx.c index b3b36835658a..7433f3db979a 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-novsx.c +++ b/arch/powerpc/kernel/ptrace/ptrace-novsx.c @@ -21,12 +21,16 @@ int fpr_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { +#ifdef CONFIG_PPC_FPU_REGS BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) != offsetof(struct thread_fp_state, fpr[32])); flush_fp_to_thread(target); return membuf_write(&to, &target->thread.fp_state, 33 * sizeof(u64)); +#else + return membuf_write(&to, &empty_zero_page, 33 * sizeof(u64)); +#endif } /* @@ -46,6 +50,7 @@ int fpr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { +#ifdef CONFIG_PPC_FPU_REGS BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) != offsetof(struct thread_fp_state, fpr[32])); @@ -53,4 +58,7 @@ int fpr_set(struct task_struct *target, const struct user_regset *regset, return user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.fp_state, 0, -1); +#else + return 0; +#endif } diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c index 2bad8068f598..6ccffc65ac97 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-view.c +++ b/arch/powerpc/kernel/ptrace/ptrace-view.c @@ -522,13 +522,11 @@ static const struct user_regset native_regsets[] = { .size = sizeof(long), .align = sizeof(long), .regset_get = gpr_get, .set = gpr_set }, -#ifdef CONFIG_PPC_FPU_REGS [REGSET_FPR] = { .core_note_type = NT_PRFPREG, .n = ELF_NFPREG, .size = sizeof(double), .align = sizeof(double), .regset_get = fpr_get, .set = fpr_set }, -#endif #ifdef CONFIG_ALTIVEC [REGSET_VMX] = { .core_note_type = NT_PPC_VMX, .n = 34, diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 75ee918a120a..f651b992fe01 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -775,7 +775,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset, else prepare_save_user_regs(1); - if (!user_write_access_begin(frame, sizeof(*frame))) + if (!user_access_begin(frame, sizeof(*frame))) goto badframe; /* Put the siginfo & fill in most of the ucontext */ @@ -809,17 +809,15 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset, unsafe_put_user(PPC_INST_ADDI + __NR_rt_sigreturn, &mctx->mc_pad[0], failed); unsafe_put_user(PPC_INST_SC, &mctx->mc_pad[1], failed); + asm("dcbst %y0; sync; icbi %y0; sync" :: "Z" (mctx->mc_pad[0])); } unsafe_put_sigset_t(&frame->uc.uc_sigmask, oldset, failed); - user_write_access_end(); + user_access_end(); if (copy_siginfo_to_user(&frame->info, &ksig->info)) goto badframe; - if (tramp == (unsigned long)mctx->mc_pad) - flush_icache_range(tramp, tramp + 2 * sizeof(unsigned long)); - regs->link = tramp; #ifdef CONFIG_PPC_FPU_REGS @@ -844,7 +842,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset, return 0; failed: - user_write_access_end(); + user_access_end(); badframe: signal_fault(tsk, regs, "handle_rt_signal32", frame); @@ -879,7 +877,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset, else prepare_save_user_regs(1); - if (!user_write_access_begin(frame, sizeof(*frame))) + if (!user_access_begin(frame, sizeof(*frame))) goto badframe; sc = (struct sigcontext __user *) &frame->sctx; @@ -908,11 +906,9 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset, /* Set up the sigreturn trampoline: li r0,sigret; sc */ unsafe_put_user(PPC_INST_ADDI + __NR_sigreturn, &mctx->mc_pad[0], failed); unsafe_put_user(PPC_INST_SC, &mctx->mc_pad[1], failed); + asm("dcbst %y0; sync; icbi %y0; sync" :: "Z" (mctx->mc_pad[0])); } - user_write_access_end(); - - if (tramp == (unsigned long)mctx->mc_pad) - flush_icache_range(tramp, tramp + 2 * sizeof(unsigned long)); + user_access_end(); regs->link = tramp; @@ -935,7 +931,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset, return 0; failed: - user_write_access_end(); + user_access_end(); badframe: signal_fault(tsk, regs, "handle_signal32", frame); diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 1583fd1c6010..a44a30b0688c 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -2170,7 +2170,7 @@ DEFINE_INTERRUPT_HANDLER(SPEFloatingPointRoundException) * in the MSR is 0. This indicates that SRR0/1 are live, and that * we therefore lost state by taking this exception. */ -DEFINE_INTERRUPT_HANDLER(unrecoverable_exception) +void unrecoverable_exception(struct pt_regs *regs) { pr_emerg("Unrecoverable exception %lx at %lx (msr=%lx)\n", regs->trap, regs->nip, regs->msr); diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index a6e29f880e0e..d21d08140a5e 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -65,3 +65,14 @@ V_FUNCTION_END(__kernel_clock_getres) V_FUNCTION_BEGIN(__kernel_time) cvdso_call_time __c_kernel_time V_FUNCTION_END(__kernel_time) + +/* Routines for restoring integer registers, called by the compiler. */ +/* Called with r11 pointing to the stack header word of the caller of the */ +/* function, just beyond the end of the integer restore area. */ +_GLOBAL(_restgpr_31_x) +_GLOBAL(_rest32gpr_31_x) + lwz r0,4(r11) + lwz r31,-4(r11) + mtlr r0 + mr r1,r11 + blr diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index bb5c20d4ca91..c6aebc149d14 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -904,7 +904,7 @@ static nokprobe_inline int do_vsx_load(struct instruction_op *op, if (!address_ok(regs, ea, size) || copy_mem_in(mem, ea, size, regs)) return -EFAULT; - nr_vsx_regs = size / sizeof(__vector128); + nr_vsx_regs = max(1ul, size / sizeof(__vector128)); emulate_vsx_load(op, buf, mem, cross_endian); preempt_disable(); if (reg < 32) { @@ -951,7 +951,7 @@ static nokprobe_inline int do_vsx_store(struct instruction_op *op, if (!address_ok(regs, ea, size)) return -EFAULT; - nr_vsx_regs = size / sizeof(__vector128); + nr_vsx_regs = max(1ul, size / sizeof(__vector128)); preempt_disable(); if (reg < 32) { /* FP regs + extensions */ diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 6817331e22ff..766f064f00fb 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -222,7 +222,7 @@ static inline void perf_get_data_addr(struct perf_event *event, struct pt_regs * if (!(mmcra & MMCRA_SAMPLE_ENABLE) || sdar_valid) *addrp = mfspr(SPRN_SDAR); - if (is_kernel_addr(mfspr(SPRN_SDAR)) && perf_allow_kernel(&event->attr) != 0) + if (is_kernel_addr(mfspr(SPRN_SDAR)) && event->attr.exclude_kernel) *addrp = 0; } @@ -507,7 +507,7 @@ static void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events * * addresses, hence include a check before filtering code */ if (!(ppmu->flags & PPMU_ARCH_31) && - is_kernel_addr(addr) && perf_allow_kernel(&event->attr) != 0) + is_kernel_addr(addr) && event->attr.exclude_kernel) continue; /* Branches are read most recent first (ie. mfbhrb 0 is diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 764170fdb0f7..3805519a6469 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -887,7 +887,8 @@ static long pSeries_lpar_hpte_updatepp(unsigned long slot, want_v = hpte_encode_avpn(vpn, psize, ssize); - flags = (newpp & 7) | H_AVPN; + flags = (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO)) | H_AVPN; + flags |= (newpp & HPTE_R_KEY_HI) >> 48; if (mmu_has_feature(MMU_FTR_KERNEL_RO)) /* Move pp0 into bit 8 (IBM 55) */ flags |= (newpp & HPTE_R_PP0) >> 55; diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index ea4d6a660e0d..e83e0891272d 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -452,12 +452,28 @@ static int do_suspend(void) return ret; } +/** + * struct pseries_suspend_info - State shared between CPUs for join/suspend. + * @counter: Threads are to increment this upon resuming from suspend + * or if an error is received from H_JOIN. The thread which performs + * the first increment (i.e. sets it to 1) is responsible for + * waking the other threads. + * @done: False if join/suspend is in progress. True if the operation is + * complete (successful or not). + */ +struct pseries_suspend_info { + atomic_t counter; + bool done; +}; + static int do_join(void *arg) { - atomic_t *counter = arg; + struct pseries_suspend_info *info = arg; + atomic_t *counter = &info->counter; long hvrc; int ret; +retry: /* Must ensure MSR.EE off for H_JOIN. */ hard_irq_disable(); hvrc = plpar_hcall_norets(H_JOIN); @@ -473,8 +489,20 @@ static int do_join(void *arg) case H_SUCCESS: /* * The suspend is complete and this cpu has received a - * prod. + * prod, or we've received a stray prod from unrelated + * code (e.g. paravirt spinlocks) and we need to join + * again. + * + * This barrier orders the return from H_JOIN above vs + * the load of info->done. It pairs with the barrier + * in the wakeup/prod path below. */ + smp_mb(); + if (READ_ONCE(info->done) == false) { + pr_info_ratelimited("premature return from H_JOIN on CPU %i, retrying", + smp_processor_id()); + goto retry; + } ret = 0; break; case H_BAD_MODE: @@ -488,6 +516,13 @@ static int do_join(void *arg) if (atomic_inc_return(counter) == 1) { pr_info("CPU %u waking all threads\n", smp_processor_id()); + WRITE_ONCE(info->done, true); + /* + * This barrier orders the store to info->done vs subsequent + * H_PRODs to wake the other CPUs. It pairs with the barrier + * in the H_SUCCESS case above. + */ + smp_mb(); prod_others(); } /* @@ -535,11 +570,16 @@ static int pseries_suspend(u64 handle) int ret; while (true) { - atomic_t counter = ATOMIC_INIT(0); + struct pseries_suspend_info info; unsigned long vasi_state; int vasi_err; - ret = stop_machine(do_join, &counter, cpu_online_mask); + info = (struct pseries_suspend_info) { + .counter = ATOMIC_INIT(0), + .done = false, + }; + + ret = stop_machine(do_join, &info, cpu_online_mask); if (ret == 0) break; /* diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index b3ac2455faad..637300330507 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -4,6 +4,7 @@ * Copyright 2006-2007 Michael Ellerman, IBM Corp. */ +#include <linux/crash_dump.h> #include <linux/device.h> #include <linux/irq.h> #include <linux/msi.h> @@ -458,8 +459,28 @@ again: return hwirq; } - virq = irq_create_mapping_affinity(NULL, hwirq, - entry->affinity); + /* + * Depending on the number of online CPUs in the original + * kernel, it is likely for CPU #0 to be offline in a kdump + * kernel. The associated IRQs in the affinity mappings + * provided by irq_create_affinity_masks() are thus not + * started by irq_startup(), as per-design for managed IRQs. + * This can be a problem with multi-queue block devices driven + * by blk-mq : such a non-started IRQ is very likely paired + * with the single queue enforced by blk-mq during kdump (see + * blk_mq_alloc_tag_set()). This causes the device to remain + * silent and likely hangs the guest at some point. + * + * We don't really care for fine-grained affinity when doing + * kdump actually : simply ignore the pre-computed affinity + * masks in this case and let the default mask with all CPUs + * be used when creating the IRQ mappings. + */ + if (is_kdump_kernel()) + virq = irq_create_mapping(NULL, hwirq); + else + virq = irq_create_mapping_affinity(NULL, hwirq, + entry->affinity); if (!virq) { pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq); diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index b2797cfe4e2b..9cb4fc839fd5 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -1261,7 +1261,6 @@ static int vio_bus_remove(struct device *dev) struct vio_dev *viodev = to_vio_dev(dev); struct vio_driver *viodrv = to_vio_driver(dev->driver); struct device *devptr; - int ret = 1; /* * Hold a reference to the device after the remove function is called @@ -1270,13 +1269,13 @@ static int vio_bus_remove(struct device *dev) devptr = get_device(dev); if (viodrv->remove) - ret = viodrv->remove(viodev); + viodrv->remove(viodev); - if (!ret && firmware_has_feature(FW_FEATURE_CMO)) + if (firmware_has_feature(FW_FEATURE_CMO)) vio_cmo_bus_remove(viodev); put_device(devptr); - return ret; + return 0; } /** diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 85d626b8ce5e..4515a10c5d22 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -93,7 +93,6 @@ config RISCV select PCI_MSI if PCI select RISCV_INTC select RISCV_TIMER if RISCV_SBI - select SPARSEMEM_STATIC if 32BIT select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK @@ -154,7 +153,8 @@ config ARCH_FLATMEM_ENABLE config ARCH_SPARSEMEM_ENABLE def_bool y depends on MMU - select SPARSEMEM_VMEMMAP_ENABLE + select SPARSEMEM_STATIC if 32BIT && SPARSEMEM + select SPARSEMEM_VMEMMAP_ENABLE if 64BIT config ARCH_SELECT_MEMORY_MODEL def_bool ARCH_SPARSEMEM_ENABLE @@ -314,7 +314,7 @@ endchoice # Common NUMA Features config NUMA bool "NUMA Memory Allocation and Scheduler Support" - depends on SMP + depends on SMP && MMU select GENERIC_ARCH_NUMA select OF_NUMA select ARCH_SUPPORTS_NUMA_BALANCING diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs index 7efcece8896c..e1b2690b6e45 100644 --- a/arch/riscv/Kconfig.socs +++ b/arch/riscv/Kconfig.socs @@ -31,6 +31,8 @@ config SOC_CANAAN select SIFIVE_PLIC select ARCH_HAS_RESET_CONTROLLER select PINCTRL + select COMMON_CLK + select COMMON_CLK_K210 help This enables support for Canaan Kendryte K210 SoC platform hardware. diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h index 27e005fca584..2a652b0c987d 100644 --- a/arch/riscv/include/asm/asm-prototypes.h +++ b/arch/riscv/include/asm/asm-prototypes.h @@ -9,4 +9,20 @@ long long __lshrti3(long long a, int b); long long __ashrti3(long long a, int b); long long __ashlti3(long long a, int b); + +#define DECLARE_DO_ERROR_INFO(name) asmlinkage void name(struct pt_regs *regs) + +DECLARE_DO_ERROR_INFO(do_trap_unknown); +DECLARE_DO_ERROR_INFO(do_trap_insn_misaligned); +DECLARE_DO_ERROR_INFO(do_trap_insn_fault); +DECLARE_DO_ERROR_INFO(do_trap_insn_illegal); +DECLARE_DO_ERROR_INFO(do_trap_load_fault); +DECLARE_DO_ERROR_INFO(do_trap_load_misaligned); +DECLARE_DO_ERROR_INFO(do_trap_store_misaligned); +DECLARE_DO_ERROR_INFO(do_trap_store_fault); +DECLARE_DO_ERROR_INFO(do_trap_ecall_u); +DECLARE_DO_ERROR_INFO(do_trap_ecall_s); +DECLARE_DO_ERROR_INFO(do_trap_ecall_m); +DECLARE_DO_ERROR_INFO(do_trap_break); + #endif /* _ASM_RISCV_PROTOTYPES_H */ diff --git a/arch/riscv/include/asm/irq.h b/arch/riscv/include/asm/irq.h index 9807ad164015..e4c435509983 100644 --- a/arch/riscv/include/asm/irq.h +++ b/arch/riscv/include/asm/irq.h @@ -12,4 +12,6 @@ #include <asm-generic/irq.h> +extern void __init init_IRQ(void); + #endif /* _ASM_RISCV_IRQ_H */ diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 3a240037bde2..021ed64ee608 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -71,6 +71,7 @@ int riscv_of_processor_hartid(struct device_node *node); int riscv_of_parent_hartid(struct device_node *node); extern void riscv_fill_hwcap(void); +extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); #endif /* __ASSEMBLY__ */ diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h index cb4abb639e8d..09ad4e923510 100644 --- a/arch/riscv/include/asm/ptrace.h +++ b/arch/riscv/include/asm/ptrace.h @@ -119,6 +119,11 @@ extern int regs_query_register_offset(const char *name); extern unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n); +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, + unsigned long frame_pointer); +int do_syscall_trace_enter(struct pt_regs *regs); +void do_syscall_trace_exit(struct pt_regs *regs); + /** * regs_get_register() - get register value from its offset * @regs: pt_regs from which register value is gotten diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 99895d9c3bdd..d7027411dde8 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -51,10 +51,10 @@ enum sbi_ext_rfence_fid { SBI_EXT_RFENCE_REMOTE_FENCE_I = 0, SBI_EXT_RFENCE_REMOTE_SFENCE_VMA, SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID, - SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA, SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID, - SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA, + SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA, SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID, + SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA, }; enum sbi_ext_hsm_fid { diff --git a/arch/riscv/include/asm/timex.h b/arch/riscv/include/asm/timex.h index 81de51e6aa32..507cae273bc6 100644 --- a/arch/riscv/include/asm/timex.h +++ b/arch/riscv/include/asm/timex.h @@ -88,4 +88,6 @@ static inline int read_current_timer(unsigned long *timer_val) return 0; } +extern void time_init(void); + #endif /* _ASM_RISCV_TIMEX_H */ diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index 824b2c9da75b..f944062c9d99 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -306,7 +306,9 @@ do { \ * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and @x must be assignable - * to the result of dereferencing @ptr. + * to the result of dereferencing @ptr. The value of @x is copied to avoid + * re-ordering where @x is evaluated inside the block that enables user-space + * access (thus bypassing user space protection if @x is a function). * * Caller must check the pointer with access_ok() before calling this * function. @@ -316,12 +318,13 @@ do { \ #define __put_user(x, ptr) \ ({ \ __typeof__(*(ptr)) __user *__gu_ptr = (ptr); \ + __typeof__(*__gu_ptr) __val = (x); \ long __pu_err = 0; \ \ __chk_user_ptr(__gu_ptr); \ \ __enable_user_access(); \ - __put_user_nocheck(x, __gu_ptr, __pu_err); \ + __put_user_nocheck(__val, __gu_ptr, __pu_err); \ __disable_user_access(); \ \ __pu_err; \ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 3dc0abde988a..647a47f5484a 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -8,6 +8,7 @@ CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_patch.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_sbi.o = $(CC_FLAGS_FTRACE) endif +CFLAGS_syscall_table.o += $(call cc-option,-Wno-override-init,) extra-y += head.o extra-y += vmlinux.lds diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 744f3209c48d..83095faa680e 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -130,6 +130,9 @@ skip_context_tracking: */ andi t0, s1, SR_PIE beqz t0, 1f + /* kprobes, entered via ebreak, must have interrupts disabled. */ + li t0, EXC_BREAKPOINT + beq s4, t0, 1f #ifdef CONFIG_TRACE_IRQFLAGS call trace_hardirqs_on #endif @@ -447,6 +450,7 @@ ENDPROC(__switch_to) #endif .section ".rodata" + .align LGREG /* Exception vector table */ ENTRY(excp_vect_table) RISCV_PTR do_trap_insn_misaligned diff --git a/arch/riscv/kernel/probes/ftrace.c b/arch/riscv/kernel/probes/ftrace.c index e6372490aa0b..aab85a82f419 100644 --- a/arch/riscv/kernel/probes/ftrace.c +++ b/arch/riscv/kernel/probes/ftrace.c @@ -2,39 +2,47 @@ #include <linux/kprobes.h> -/* Ftrace callback handler for kprobes -- called under preepmt disabed */ +/* Ftrace callback handler for kprobes -- called under preepmt disabled */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct ftrace_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct kprobe *p; + struct pt_regs *regs; struct kprobe_ctlblk *kcb; + int bit; + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) - return; + goto out; + regs = ftrace_get_regs(fregs); kcb = get_kprobe_ctlblk(); if (kprobe_running()) { kprobes_inc_nmissed_count(p); } else { - unsigned long orig_ip = instruction_pointer(&(regs->regs)); + unsigned long orig_ip = instruction_pointer(regs); - instruction_pointer_set(&(regs->regs), ip); + instruction_pointer_set(regs, ip); __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; - if (!p->pre_handler || !p->pre_handler(p, &(regs->regs))) { + if (!p->pre_handler || !p->pre_handler(p, regs)) { /* * Emulate singlestep (and also recover regs->pc) * as if there is a nop */ - instruction_pointer_set(&(regs->regs), + instruction_pointer_set(regs, (unsigned long)p->addr + MCOUNT_INSN_SIZE); if (unlikely(p->post_handler)) { kcb->kprobe_status = KPROBE_HIT_SSDONE; - p->post_handler(p, &(regs->regs), 0); + p->post_handler(p, regs, 0); } - instruction_pointer_set(&(regs->regs), orig_ip); + instruction_pointer_set(regs, orig_ip); } /* @@ -43,6 +51,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, */ __this_cpu_write(current_kprobe, NULL); } +out: + preempt_enable_notrace(); + ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index a2ec18662fee..7e2c78e2ca6b 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -256,8 +256,7 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr) * normal page fault. */ regs->epc = (unsigned long) cur->addr; - if (!instruction_pointer(regs)) - BUG(); + BUG_ON(!instruction_pointer(regs)); if (kcb->kprobe_status == KPROBE_REENTER) restore_previous_kprobe(kcb); diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 6f728e731bed..f9cd57c9c67d 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -10,6 +10,7 @@ #include <linux/cpu.h> #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/sched/debug.h> #include <linux/sched/task_stack.h> #include <linux/tick.h> #include <linux/ptrace.h> diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index f4a7db3d309e..d3bf756321a5 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -116,7 +116,7 @@ void sbi_clear_ipi(void) EXPORT_SYMBOL(sbi_clear_ipi); /** - * sbi_set_timer_v01() - Program the timer for next timer event. + * __sbi_set_timer_v01() - Program the timer for next timer event. * @stime_value: The value after which next timer event should fire. * * Return: None diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index e85bacff1b50..f8f15332caa2 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -147,7 +147,8 @@ static void __init init_resources(void) bss_res.end = __pa_symbol(__bss_stop) - 1; bss_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - mem_res_sz = (memblock.memory.cnt + memblock.reserved.cnt) * sizeof(*mem_res); + /* + 1 as memblock_alloc() might increase memblock.reserved.cnt */ + mem_res_sz = (memblock.memory.cnt + memblock.reserved.cnt + 1) * sizeof(*mem_res); mem_res = memblock_alloc(mem_res_sz, SMP_CACHE_BYTES); if (!mem_res) panic("%s: Failed to allocate %zu bytes\n", __func__, mem_res_sz); diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 3f893c9d9d85..2b3e0cb90d78 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -14,7 +14,7 @@ #include <asm/stacktrace.h> -register const unsigned long sp_in_global __asm__("sp"); +register unsigned long sp_in_global __asm__("sp"); #ifdef CONFIG_FRAME_POINTER diff --git a/arch/riscv/kernel/time.c b/arch/riscv/kernel/time.c index 8a5cf99c0776..1b432264f7ef 100644 --- a/arch/riscv/kernel/time.c +++ b/arch/riscv/kernel/time.c @@ -9,6 +9,7 @@ #include <linux/delay.h> #include <asm/sbi.h> #include <asm/processor.h> +#include <asm/timex.h> unsigned long riscv_timebase; EXPORT_SYMBOL_GPL(riscv_timebase); diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 3ed2c23601a0..1357abf79570 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -17,6 +17,7 @@ #include <linux/module.h> #include <linux/irq.h> +#include <asm/asm-prototypes.h> #include <asm/bug.h> #include <asm/processor.h> #include <asm/ptrace.h> @@ -177,6 +178,7 @@ asmlinkage __visible void do_trap_break(struct pt_regs *regs) else die(regs, "Kernel BUG"); } +NOKPROBE_SYMBOL(do_trap_break); #ifdef CONFIG_GENERIC_BUG int is_valid_bugaddr(unsigned long pc) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 8f17519208c7..c5dbd55cbf7c 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -328,3 +328,4 @@ good_area: } return; } +NOKPROBE_SYMBOL(do_page_fault); diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 3fc18f469efb..937d13ce9ab8 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -155,7 +155,7 @@ static void __init kasan_populate(void *start, void *end) memset(start, KASAN_SHADOW_INIT, end - start); } -void __init kasan_shallow_populate(void *start, void *end) +static void __init kasan_shallow_populate(void *start, void *end) { unsigned long vaddr = (unsigned long)start & PAGE_MASK; unsigned long vend = PAGE_ALIGN((unsigned long)end); @@ -187,6 +187,8 @@ void __init kasan_shallow_populate(void *start, void *end) } vaddr += PAGE_SIZE; } + + local_flush_tlb_all(); } void __init kasan_init(void) @@ -214,7 +216,7 @@ void __init kasan_init(void) break; kasan_populate(kasan_mem_to_shadow(start), kasan_mem_to_shadow(end)); - }; + } for (i = 0; i < PTRS_PER_PTE; i++) set_pte(&kasan_early_shadow_pte[i], diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 02056b024091..dc0b69058ac4 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -275,9 +275,9 @@ CONFIG_IP_VS_DH=m CONFIG_IP_VS_SH=m CONFIG_IP_VS_SED=m CONFIG_IP_VS_NQ=m +CONFIG_IP_VS_TWOS=m CONFIG_IP_VS_FTP=m CONFIG_IP_VS_PE_SIP=m -CONFIG_NF_TABLES_IPV4=y CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_IP_NF_IPTABLES=m @@ -298,7 +298,6 @@ CONFIG_IP_NF_SECURITY=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NF_TABLES_IPV6=y CONFIG_NFT_FIB_IPV6=m CONFIG_IP6_NF_IPTABLES=m CONFIG_IP6_NF_MATCH_AH=m @@ -481,7 +480,6 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_AQUANTIA is not set # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_VENDOR_ATHEROS is not set -# CONFIG_NET_VENDOR_AURORA is not set # CONFIG_NET_VENDOR_BROADCOM is not set # CONFIG_NET_VENDOR_BROCADE is not set # CONFIG_NET_VENDOR_CADENCE is not set @@ -581,7 +579,6 @@ CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y CONFIG_VHOST_NET=m CONFIG_VHOST_VSOCK=m -# CONFIG_SURFACE_PLATFORMS is not set CONFIG_S390_CCW_IOMMU=y CONFIG_S390_AP_IOMMU=y CONFIG_EXT4_FS=y @@ -635,6 +632,7 @@ CONFIG_NTFS_RW=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_INODE64=y CONFIG_HUGETLBFS=y CONFIG_CONFIGFS_FS=m CONFIG_ECRYPT_FS=m @@ -714,12 +712,8 @@ CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_CRC32=m CONFIG_CRYPTO_BLAKE2S=m CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_RMD128=m CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_RMD256=m -CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m @@ -731,7 +725,6 @@ CONFIG_CRYPTO_CAST6=m CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m -CONFIG_CRYPTO_SALSA20=m CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_SM4=m @@ -796,12 +789,9 @@ CONFIG_DEBUG_OBJECTS_RCU_HEAD=y CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y CONFIG_SLUB_DEBUG_ON=y CONFIG_SLUB_STATS=y -CONFIG_DEBUG_KMEMLEAK=y -CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y CONFIG_DEBUG_STACK_USAGE=y CONFIG_DEBUG_VM=y CONFIG_DEBUG_VM_VMACACHE=y -CONFIG_DEBUG_VM_RB=y CONFIG_DEBUG_VM_PGFLAGS=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m @@ -838,6 +828,7 @@ CONFIG_BPF_KPROBE_OVERRIDE=y CONFIG_HIST_TRIGGERS=y CONFIG_FTRACE_STARTUP_TEST=y # CONFIG_EVENT_TRACE_STARTUP_TEST is not set +CONFIG_DEBUG_ENTRY=y CONFIG_NOTIFIER_ERROR_INJECTION=m CONFIG_NETDEV_NOTIFIER_ERROR_INJECT=m CONFIG_FAULT_INJECTION=y @@ -861,4 +852,3 @@ CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y CONFIG_TEST_BITOPS=m CONFIG_TEST_BPF=m -CONFIG_DEBUG_ENTRY=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index bac721a501da..320379da96d9 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -266,9 +266,9 @@ CONFIG_IP_VS_DH=m CONFIG_IP_VS_SH=m CONFIG_IP_VS_SED=m CONFIG_IP_VS_NQ=m +CONFIG_IP_VS_TWOS=m CONFIG_IP_VS_FTP=m CONFIG_IP_VS_PE_SIP=m -CONFIG_NF_TABLES_IPV4=y CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_IP_NF_IPTABLES=m @@ -289,7 +289,6 @@ CONFIG_IP_NF_SECURITY=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NF_TABLES_IPV6=y CONFIG_NFT_FIB_IPV6=m CONFIG_IP6_NF_IPTABLES=m CONFIG_IP6_NF_MATCH_AH=m @@ -473,7 +472,6 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_AQUANTIA is not set # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_VENDOR_ATHEROS is not set -# CONFIG_NET_VENDOR_AURORA is not set # CONFIG_NET_VENDOR_BROADCOM is not set # CONFIG_NET_VENDOR_BROCADE is not set # CONFIG_NET_VENDOR_CADENCE is not set @@ -573,7 +571,6 @@ CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y CONFIG_VHOST_NET=m CONFIG_VHOST_VSOCK=m -# CONFIG_SURFACE_PLATFORMS is not set CONFIG_S390_CCW_IOMMU=y CONFIG_S390_AP_IOMMU=y CONFIG_EXT4_FS=y @@ -623,6 +620,7 @@ CONFIG_NTFS_RW=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_INODE64=y CONFIG_HUGETLBFS=y CONFIG_CONFIGFS_FS=m CONFIG_ECRYPT_FS=m @@ -703,12 +701,8 @@ CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_CRC32=m CONFIG_CRYPTO_BLAKE2S=m CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_RMD128=m CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_RMD256=m -CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m @@ -720,7 +714,6 @@ CONFIG_CRYPTO_CAST6=m CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m -CONFIG_CRYPTO_SALSA20=m CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_SM4=m diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index acf982a2ae4c..76123a4b26ab 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -26,7 +26,6 @@ CONFIG_CRASH_DUMP=y # CONFIG_SECCOMP is not set # CONFIG_GCC_PLUGINS is not set CONFIG_PARTITION_ADVANCED=y -CONFIG_IBM_PARTITION=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_COMPACTION is not set # CONFIG_MIGRATION is not set @@ -61,11 +60,9 @@ CONFIG_RAW_DRIVER=y # CONFIG_HID is not set # CONFIG_VIRTIO_MENU is not set # CONFIG_VHOST_MENU is not set -# CONFIG_SURFACE_PLATFORMS is not set # CONFIG_IOMMU_SUPPORT is not set # CONFIG_DNOTIFY is not set # CONFIG_INOTIFY_USER is not set -CONFIG_CONFIGFS_FS=y # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set CONFIG_LSM="yama,loadpin,safesetid,integrity" diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h index b04f6a794cdf..5cea629c548e 100644 --- a/arch/s390/include/asm/idle.h +++ b/arch/s390/include/asm/idle.h @@ -14,12 +14,12 @@ struct s390_idle_data { seqcount_t seqcount; - unsigned long long idle_count; - unsigned long long idle_time; - unsigned long long clock_idle_enter; - unsigned long long clock_idle_exit; - unsigned long long timer_idle_enter; - unsigned long long timer_idle_exit; + unsigned long idle_count; + unsigned long idle_time; + unsigned long clock_idle_enter; + unsigned long clock_idle_exit; + unsigned long timer_idle_enter; + unsigned long timer_idle_exit; unsigned long mt_cycles_enter[8]; }; diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 053fe8b8dec7..a75d94a9bcb2 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -202,7 +202,7 @@ extern unsigned int s390_pci_no_rid; ----------------------------------------------------------------------------- */ /* Base stuff */ int zpci_create_device(u32 fid, u32 fh, enum zpci_state state); -void zpci_remove_device(struct zpci_dev *zdev); +void zpci_remove_device(struct zpci_dev *zdev, bool set_error); int zpci_enable_device(struct zpci_dev *); int zpci_disable_device(struct zpci_dev *); int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64); diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h index ee056f4a4fa3..2b543163d90a 100644 --- a/arch/s390/include/asm/stacktrace.h +++ b/arch/s390/include/asm/stacktrace.h @@ -12,6 +12,7 @@ enum stack_type { STACK_TYPE_IRQ, STACK_TYPE_NODAT, STACK_TYPE_RESTART, + STACK_TYPE_MCCK, }; struct stack_info { diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index c4e23e925665..f6326c6d2abe 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -98,10 +98,10 @@ extern unsigned char ptff_function_mask[16]; /* Query TOD offset result */ struct ptff_qto { - unsigned long long physical_clock; - unsigned long long tod_offset; - unsigned long long logical_tod_offset; - unsigned long long tod_epoch_difference; + unsigned long physical_clock; + unsigned long tod_offset; + unsigned long logical_tod_offset; + unsigned long tod_epoch_difference; } __packed; static inline int ptff_query(unsigned int nr) @@ -151,9 +151,9 @@ struct ptff_qui { rc; \ }) -static inline unsigned long long local_tick_disable(void) +static inline unsigned long local_tick_disable(void) { - unsigned long long old; + unsigned long old; old = S390_lowcore.clock_comparator; S390_lowcore.clock_comparator = clock_comparator_max; @@ -161,7 +161,7 @@ static inline unsigned long long local_tick_disable(void) return old; } -static inline void local_tick_enable(unsigned long long comp) +static inline void local_tick_enable(unsigned long comp) { S390_lowcore.clock_comparator = comp; set_clock_comparator(S390_lowcore.clock_comparator); @@ -169,9 +169,9 @@ static inline void local_tick_enable(unsigned long long comp) #define CLOCK_TICK_RATE 1193180 /* Underlying HZ */ -typedef unsigned long long cycles_t; +typedef unsigned long cycles_t; -static inline unsigned long long get_tod_clock(void) +static inline unsigned long get_tod_clock(void) { union tod_clock clk; @@ -179,10 +179,10 @@ static inline unsigned long long get_tod_clock(void) return clk.tod; } -static inline unsigned long long get_tod_clock_fast(void) +static inline unsigned long get_tod_clock_fast(void) { #ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES - unsigned long long clk; + unsigned long clk; asm volatile("stckf %0" : "=Q" (clk) : : "cc"); return clk; @@ -208,9 +208,9 @@ extern union tod_clock tod_clock_base; * Therefore preemption must be disabled, otherwise the returned * value is not guaranteed to be monotonic. */ -static inline unsigned long long get_tod_clock_monotonic(void) +static inline unsigned long get_tod_clock_monotonic(void) { - unsigned long long tod; + unsigned long tod; preempt_disable_notrace(); tod = get_tod_clock() - tod_clock_base.tod; @@ -237,7 +237,7 @@ static inline unsigned long long get_tod_clock_monotonic(void) * -> ns = (th * 125) + ((tl * 125) >> 9); * */ -static inline unsigned long long tod_to_ns(unsigned long long todval) +static inline unsigned long tod_to_ns(unsigned long todval) { return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9); } @@ -249,10 +249,10 @@ static inline unsigned long long tod_to_ns(unsigned long long todval) * * Returns: true if a is later than b */ -static inline int tod_after(unsigned long long a, unsigned long long b) +static inline int tod_after(unsigned long a, unsigned long b) { if (MACHINE_HAS_SCC) - return (long long) a > (long long) b; + return (long) a > (long) b; return a > b; } @@ -263,10 +263,10 @@ static inline int tod_after(unsigned long long a, unsigned long long b) * * Returns: true if a is later than b */ -static inline int tod_after_eq(unsigned long long a, unsigned long long b) +static inline int tod_after_eq(unsigned long a, unsigned long b) { if (MACHINE_HAS_SCC) - return (long long) a >= (long long) b; + return (long) a >= (long) b; return a >= b; } diff --git a/arch/s390/include/asm/vdso/data.h b/arch/s390/include/asm/vdso/data.h index 7b3cdb4a5f48..73ee89142666 100644 --- a/arch/s390/include/asm/vdso/data.h +++ b/arch/s390/include/asm/vdso/data.h @@ -6,7 +6,7 @@ #include <vdso/datapage.h> struct arch_vdso_data { - __u64 tod_steering_delta; + __s64 tod_steering_delta; __u64 tod_steering_end; }; diff --git a/arch/s390/include/uapi/asm/perf_cpum_cf_diag.h b/arch/s390/include/uapi/asm/hwctrset.h index 3d8284b95f87..3d8284b95f87 100644 --- a/arch/s390/include/uapi/asm/perf_cpum_cf_diag.h +++ b/arch/s390/include/uapi/asm/hwctrset.h diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c index af013b4244d3..2da027359798 100644 --- a/arch/s390/kernel/cpcmd.c +++ b/arch/s390/kernel/cpcmd.c @@ -37,10 +37,12 @@ static int diag8_noresponse(int cmdlen) static int diag8_response(int cmdlen, char *response, int *rlen) { + unsigned long _cmdlen = cmdlen | 0x40000000L; + unsigned long _rlen = *rlen; register unsigned long reg2 asm ("2") = (addr_t) cpcmd_buf; register unsigned long reg3 asm ("3") = (addr_t) response; - register unsigned long reg4 asm ("4") = cmdlen | 0x40000000L; - register unsigned long reg5 asm ("5") = *rlen; + register unsigned long reg4 asm ("4") = _cmdlen; + register unsigned long reg5 asm ("5") = _rlen; asm volatile( " diag %2,%0,0x8\n" diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 0dc4b258b98d..db1bc00229ca 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -79,6 +79,15 @@ static bool in_nodat_stack(unsigned long sp, struct stack_info *info) return in_stack(sp, info, STACK_TYPE_NODAT, top - THREAD_SIZE, top); } +static bool in_mcck_stack(unsigned long sp, struct stack_info *info) +{ + unsigned long frame_size, top; + + frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); + top = S390_lowcore.mcck_stack + frame_size; + return in_stack(sp, info, STACK_TYPE_MCCK, top - THREAD_SIZE, top); +} + static bool in_restart_stack(unsigned long sp, struct stack_info *info) { unsigned long frame_size, top; @@ -108,7 +117,8 @@ int get_stack_info(unsigned long sp, struct task_struct *task, /* Check per-cpu stacks */ if (!in_irq_stack(sp, info) && !in_nodat_stack(sp, info) && - !in_restart_stack(sp, info)) + !in_restart_stack(sp, info) && + !in_mcck_stack(sp, info)) goto unknown; recursion_check: diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index c10b9f31eef7..12de7a9c85b3 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -401,15 +401,13 @@ ENTRY(\name) brasl %r14,.Lcleanup_sie_int #endif 0: CHECK_STACK __LC_SAVE_AREA_ASYNC - lgr %r11,%r15 aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - stg %r11,__SF_BACKCHAIN(%r15) j 2f 1: BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP lctlg %c1,%c1,__LC_KERNEL_ASCE lg %r15,__LC_KERNEL_STACK - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) -2: la %r11,STACK_FRAME_OVERHEAD(%r15) +2: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) # clear user controlled registers to prevent speculative use xgr %r0,%r0 @@ -445,6 +443,7 @@ INT_HANDLER io_int_handler,__LC_IO_OLD_PSW,do_io_irq * Load idle PSW. */ ENTRY(psw_idle) + stg %r14,(__SF_GPRS+8*8)(%r15) stg %r3,__SF_EMPTY(%r15) larl %r1,psw_idle_exit stg %r1,__SF_EMPTY+8(%r15) diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 812073ea073e..4bf1ee293f2b 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -47,7 +47,7 @@ void account_idle_time_irq(void) void arch_cpu_idle(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); - unsigned long long idle_time; + unsigned long idle_time; unsigned long psw_mask; /* Wait for external, I/O or machine check interrupt. */ @@ -73,7 +73,7 @@ static ssize_t show_idle_count(struct device *dev, struct device_attribute *attr, char *buf) { struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id); - unsigned long long idle_count; + unsigned long idle_count; unsigned int seq; do { @@ -82,14 +82,14 @@ static ssize_t show_idle_count(struct device *dev, if (READ_ONCE(idle->clock_idle_enter)) idle_count++; } while (read_seqcount_retry(&idle->seqcount, seq)); - return sprintf(buf, "%llu\n", idle_count); + return sprintf(buf, "%lu\n", idle_count); } DEVICE_ATTR(idle_count, 0444, show_idle_count, NULL); static ssize_t show_idle_time(struct device *dev, struct device_attribute *attr, char *buf) { - unsigned long long now, idle_time, idle_enter, idle_exit, in_idle; + unsigned long now, idle_time, idle_enter, idle_exit, in_idle; struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id); unsigned int seq; @@ -109,14 +109,14 @@ static ssize_t show_idle_time(struct device *dev, } } idle_time += in_idle; - return sprintf(buf, "%llu\n", idle_time >> 12); + return sprintf(buf, "%lu\n", idle_time >> 12); } DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL); u64 arch_cpu_idle_time(int cpu) { struct s390_idle_data *idle = &per_cpu(s390_idle, cpu); - unsigned long long now, idle_enter, idle_exit, in_idle; + unsigned long now, idle_enter, idle_exit, in_idle; unsigned int seq; do { diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 601c21791338..714269e10eec 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -174,7 +174,7 @@ void noinstr do_ext_irq(struct pt_regs *regs) memcpy(®s->int_code, &S390_lowcore.ext_cpu_addr, 4); regs->int_parm = S390_lowcore.ext_params; - regs->int_parm_long = *(unsigned long *)S390_lowcore.ext_params2; + regs->int_parm_long = S390_lowcore.ext_params2; from_idle = !user_mode(regs) && regs->psw.addr == (unsigned long)psw_idle_exit; if (from_idle) diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 0eb1d1cc53a8..b3beef64d3d4 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -269,7 +269,7 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) case CPUMF_CTR_SET_MAX: /* The counter could not be associated to a counter set */ return -EINVAL; - }; + } /* Initialize for using the CPU-measurement counter facility */ if (!atomic_inc_not_zero(&num_events)) { diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c index db4877bbb9aa..2e3e7edbe3a0 100644 --- a/arch/s390/kernel/perf_cpum_cf_diag.c +++ b/arch/s390/kernel/perf_cpum_cf_diag.c @@ -26,12 +26,10 @@ #include <asm/timex.h> #include <asm/debug.h> -#include <asm/perf_cpum_cf_diag.h> +#include <asm/hwctrset.h> #define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ -#define CF_DIAG_MIN_INTERVAL 60 /* Minimum counter set read */ /* interval in seconds */ -static unsigned long cf_diag_interval = CF_DIAG_MIN_INTERVAL; static unsigned int cf_diag_cpu_speed; static debug_info_t *cf_diag_dbg; @@ -729,7 +727,6 @@ static DEFINE_MUTEX(cf_diag_ctrset_mutex); static struct cf_diag_ctrset { unsigned long ctrset; /* Bit mask of counter set to read */ cpumask_t mask; /* CPU mask to read from */ - time64_t lastread; /* Epoch counter set last read */ } cf_diag_ctrset; static void cf_diag_ctrset_clear(void) @@ -866,27 +863,16 @@ static int cf_diag_all_read(unsigned long arg) { struct cf_diag_call_on_cpu_parm p; cpumask_var_t mask; - time64_t now; - int rc = 0; + int rc; debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__); if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; - now = ktime_get_seconds(); - if (cf_diag_ctrset.lastread + cf_diag_interval > now) { - debug_sprintf_event(cf_diag_dbg, 5, "%s now %lld " - " lastread %lld\n", __func__, now, - cf_diag_ctrset.lastread); - rc = -EAGAIN; - goto out; - } else { - cf_diag_ctrset.lastread = now; - } + p.sets = cf_diag_ctrset.ctrset; cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask); on_each_cpu_mask(mask, cf_diag_cpu_read, &p, 1); rc = cf_diag_all_copy(arg, mask); -out: free_cpumask_var(mask); debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d\n", __func__, rc); return rc; @@ -982,7 +968,7 @@ static int cf_diag_all_start(void) */ static size_t cf_diag_needspace(unsigned int sets) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events); size_t bytes = 0; int i; @@ -998,6 +984,7 @@ static size_t cf_diag_needspace(unsigned int sets) sizeof(((struct s390_ctrset_cpudata *)0)->no_sets)); debug_sprintf_event(cf_diag_dbg, 5, "%s bytes %ld\n", __func__, bytes); + put_cpu_ptr(&cpu_cf_events); return bytes; } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 60da976eee6f..72134f9f6ff5 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -354,7 +354,7 @@ static int __init stack_realloc(void) if (!new) panic("Couldn't allocate machine check stack"); WRITE_ONCE(S390_lowcore.mcck_stack, new + STACK_INIT_OFFSET); - memblock_free(old, THREAD_SIZE); + memblock_free_late(old, THREAD_SIZE); return 0; } early_initcall(stack_realloc); diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 7f1266c24f6b..101477b3e263 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -24,12 +24,6 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, } } -/* - * This function returns an error if it detects any unreliable features of the - * stack. Otherwise it guarantees that the stack trace is reliable. - * - * If the task is not 'current', the caller *must* ensure the task is inactive. - */ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task) { diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 06bcfa636638..326cb8f75f58 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -68,10 +68,10 @@ EXPORT_SYMBOL(s390_epoch_delta_notifier); unsigned char ptff_function_mask[16]; -static unsigned long long lpar_offset; -static unsigned long long initial_leap_seconds; -static unsigned long long tod_steering_end; -static long long tod_steering_delta; +static unsigned long lpar_offset; +static unsigned long initial_leap_seconds; +static unsigned long tod_steering_end; +static long tod_steering_delta; /* * Get time offsets with PTFF @@ -80,10 +80,12 @@ void __init time_early_init(void) { struct ptff_qto qto; struct ptff_qui qui; + int cs; /* Initialize TOD steering parameters */ tod_steering_end = tod_clock_base.tod; - vdso_data->arch_data.tod_steering_end = tod_steering_end; + for (cs = 0; cs < CS_BASES; cs++) + vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; if (!test_facility(28)) return; @@ -96,7 +98,7 @@ void __init time_early_init(void) /* get initial leap seconds */ if (ptff_query(PTFF_QUI) && ptff(&qui, sizeof(qui), PTFF_QUI) == 0) - initial_leap_seconds = (unsigned long long) + initial_leap_seconds = (unsigned long) ((long) qui.old_leap * 4096000000L); } @@ -222,7 +224,7 @@ void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, static u64 read_tod_clock(struct clocksource *cs) { - unsigned long long now, adj; + unsigned long now, adj; preempt_disable(); /* protect from changes to steering parameters */ now = get_tod_clock(); @@ -362,10 +364,11 @@ static inline int check_sync_clock(void) * Apply clock delta to the global data structures. * This is called once on the CPU that performed the clock sync. */ -static void clock_sync_global(unsigned long long delta) +static void clock_sync_global(unsigned long delta) { unsigned long now, adj; struct ptff_qto qto; + int cs; /* Fixup the monotonic sched clock. */ tod_clock_base.eitod += delta; @@ -378,10 +381,13 @@ static void clock_sync_global(unsigned long long delta) -(adj >> 15) : (adj >> 15); tod_steering_delta += delta; if ((abs(tod_steering_delta) >> 48) != 0) - panic("TOD clock sync offset %lli is too large to drift\n", + panic("TOD clock sync offset %li is too large to drift\n", tod_steering_delta); tod_steering_end = now + (abs(tod_steering_delta) << 15); - vdso_data->arch_data.tod_steering_end = tod_steering_end; + for (cs = 0; cs < CS_BASES; cs++) { + vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; + vdso_data[cs].arch_data.tod_steering_delta = tod_steering_delta; + } /* Update LPAR offset. */ if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) @@ -394,7 +400,7 @@ static void clock_sync_global(unsigned long long delta) * Apply clock delta to the per-CPU data structures of this CPU. * This is called for each online CPU after the call to clock_sync_global. */ -static void clock_sync_local(unsigned long long delta) +static void clock_sync_local(unsigned long delta) { /* Add the delta to the clock comparator. */ if (S390_lowcore.clock_comparator != clock_comparator_max) { @@ -418,7 +424,7 @@ static void __init time_init_wq(void) struct clock_sync_data { atomic_t cpus; int in_sync; - unsigned long long clock_delta; + unsigned long clock_delta; }; /* @@ -538,7 +544,7 @@ static int stpinfo_valid(void) static int stp_sync_clock(void *data) { struct clock_sync_data *sync = data; - unsigned long long clock_delta, flags; + u64 clock_delta, flags; static int first; int rc; @@ -720,8 +726,8 @@ static ssize_t ctn_id_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid()) - ret = sprintf(buf, "%016llx\n", - *(unsigned long long *) stp_info.ctnid); + ret = sprintf(buf, "%016lx\n", + *(unsigned long *) stp_info.ctnid); mutex_unlock(&stp_mutex); return ret; } @@ -794,7 +800,7 @@ static ssize_t leap_seconds_scheduled_show(struct device *dev, if (!stzi.lsoib.p) return sprintf(buf, "0,0\n"); - return sprintf(buf, "%llu,%d\n", + return sprintf(buf, "%lu,%d\n", tod_to_ns(stzi.lsoib.nlsout - TOD_UNIX_EPOCH) / NSEC_PER_SEC, stzi.lsoib.nlso - stzi.lsoib.also); } diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index e7ce447651b9..bfcc327acc6b 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -76,8 +76,6 @@ static void cpu_group_map(cpumask_t *dst, struct mask_info *info, unsigned int c } info = info->next; } - if (cpumask_empty(&mask)) - cpumask_copy(&mask, cpumask_of(cpu)); break; case TOPOLOGY_MODE_PACKAGE: cpumask_copy(&mask, cpu_present_mask); diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 73c7afcc0527..f216a1b2f825 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -214,7 +214,7 @@ void vtime_flush(struct task_struct *tsk) avg_steal = S390_lowcore.avg_steal_timer / 2; if ((s64) steal > 0) { S390_lowcore.steal_timer = 0; - account_steal_time(steal); + account_steal_time(cputime_to_nsecs(steal)); avg_steal += steal; } S390_lowcore.avg_steal_timer = avg_steal; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index e3183bd05910..d548d60caed2 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -1287,7 +1287,7 @@ static u64 __calculate_sltime(struct kvm_vcpu *vcpu) /* already expired? */ if (cputm >> 63) return 0; - return min(sltime, tod_to_ns(cputm)); + return min_t(u64, sltime, tod_to_ns(cputm)); } } else if (cpu_timer_interrupts_enabled(vcpu)) { sltime = kvm_s390_get_cpu_timer(vcpu); diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 600881d894dd..91064077526d 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -682,16 +682,36 @@ int zpci_disable_device(struct zpci_dev *zdev) } EXPORT_SYMBOL_GPL(zpci_disable_device); -void zpci_remove_device(struct zpci_dev *zdev) +/* zpci_remove_device - Removes the given zdev from the PCI core + * @zdev: the zdev to be removed from the PCI core + * @set_error: if true the device's error state is set to permanent failure + * + * Sets a zPCI device to a configured but offline state; the zPCI + * device is still accessible through its hotplug slot and the zPCI + * API but is removed from the common code PCI bus, making it + * no longer available to drivers. + */ +void zpci_remove_device(struct zpci_dev *zdev, bool set_error) { struct zpci_bus *zbus = zdev->zbus; struct pci_dev *pdev; + if (!zdev->zbus->bus) + return; + pdev = pci_get_slot(zbus->bus, zdev->devfn); if (pdev) { - if (pdev->is_virtfn) - return zpci_iov_remove_virtfn(pdev, zdev->vfn); + if (set_error) + pdev->error_state = pci_channel_io_perm_failure; + if (pdev->is_virtfn) { + zpci_iov_remove_virtfn(pdev, zdev->vfn); + /* balance pci_get_slot */ + pci_dev_put(pdev); + return; + } pci_stop_and_remove_bus_device_locked(pdev); + /* balance pci_get_slot */ + pci_dev_put(pdev); } } @@ -765,7 +785,7 @@ void zpci_release_device(struct kref *kref) struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref); if (zdev->zbus->bus) - zpci_remove_device(zdev); + zpci_remove_device(zdev, false); switch (zdev->state) { case ZPCI_FN_STATE_ONLINE: diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index b4162da4e8a2..ac0c65cdd69d 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -76,13 +76,10 @@ void zpci_event_error(void *data) static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) { struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); - struct pci_dev *pdev = NULL; enum zpci_state state; + struct pci_dev *pdev; int ret; - if (zdev && zdev->zbus->bus) - pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn); - zpci_err("avail CCDF:\n"); zpci_err_hex(ccdf, sizeof(*ccdf)); @@ -124,8 +121,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) case 0x0303: /* Deconfiguration requested */ if (!zdev) break; - if (pdev) - zpci_remove_device(zdev); + zpci_remove_device(zdev, false); ret = zpci_disable_device(zdev); if (ret) @@ -140,12 +136,10 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) case 0x0304: /* Configured -> Standby|Reserved */ if (!zdev) break; - if (pdev) { - /* Give the driver a hint that the function is - * already unusable. */ - pdev->error_state = pci_channel_io_perm_failure; - zpci_remove_device(zdev); - } + /* Give the driver a hint that the function is + * already unusable. + */ + zpci_remove_device(zdev, true); zdev->fh = ccdf->fh; zpci_disable_device(zdev); diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig index 148f44b33890..12a4fb0bd52a 100644 --- a/arch/sparc/configs/sparc64_defconfig +++ b/arch/sparc/configs/sparc64_defconfig @@ -93,7 +93,7 @@ CONFIG_NETDEVICES=y CONFIG_NET_ETHERNET=y CONFIG_MII=m CONFIG_SUNLANCE=m -CONFIG_HAPPYMEAL=m +CONFIG_HAPPYMEAL=y CONFIG_SUNGEM=m CONFIG_SUNVNET=m CONFIG_LDMVSW=m @@ -234,9 +234,7 @@ CONFIG_CRYPTO_TWOFISH=m CONFIG_CRC16=m CONFIG_LIBCRC32C=m CONFIG_VCC=m -CONFIG_ATA=y CONFIG_PATA_CMD64X=y -CONFIG_HAPPYMEAL=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y CONFIG_DEVTMPFS=y diff --git a/arch/sparc/include/asm/elf_64.h b/arch/sparc/include/asm/elf_64.h index 7e078bc73ef5..8fb09eec8c3e 100644 --- a/arch/sparc/include/asm/elf_64.h +++ b/arch/sparc/include/asm/elf_64.h @@ -8,7 +8,6 @@ #include <asm/ptrace.h> #include <asm/processor.h> -#include <asm/extable_64.h> #include <asm/spitfire.h> #include <asm/adi.h> diff --git a/arch/sparc/include/asm/extable_64.h b/arch/sparc/include/asm/extable.h index 5a0171907b7e..554a9dc376fc 100644 --- a/arch/sparc/include/asm/extable_64.h +++ b/arch/sparc/include/asm/extable.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_EXTABLE64_H -#define __ASM_EXTABLE64_H +#ifndef __ASM_EXTABLE_H +#define __ASM_EXTABLE_H /* * The exception table consists of pairs of addresses: the first is the * address of an instruction that is allowed to fault, and the second is diff --git a/arch/sparc/include/asm/processor_32.h b/arch/sparc/include/asm/processor_32.h index 3c4bc2189092..b6242f7771e9 100644 --- a/arch/sparc/include/asm/processor_32.h +++ b/arch/sparc/include/asm/processor_32.h @@ -50,16 +50,12 @@ struct thread_struct { unsigned long fsr; unsigned long fpqdepth; struct fpq fpqueue[16]; - unsigned long flags; mm_segment_t current_ds; }; -#define SPARC_FLAG_KTHREAD 0x1 /* task is a kernel thread */ -#define SPARC_FLAG_UNALIGNED 0x2 /* is allowed to do unaligned accesses */ - #define INIT_THREAD { \ - .flags = SPARC_FLAG_KTHREAD, \ .current_ds = KERNEL_DS, \ + .kregs = (struct pt_regs *)(init_stack+THREAD_SIZE)-1 \ } /* Do necessary setup to start up a newly executed thread. */ diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index 42cd4cd3892e..8047a9caab2f 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -118,6 +118,7 @@ struct thread_info { .task = &tsk, \ .current_ds = ASI_P, \ .preempt_count = INIT_PREEMPT_COUNT, \ + .kregs = (struct pt_regs *)(init_stack+THREAD_SIZE)-1 \ } /* how to get the thread information struct from C */ diff --git a/arch/sparc/include/asm/uaccess.h b/arch/sparc/include/asm/uaccess.h index dd85bc2c2cad..390094200fc4 100644 --- a/arch/sparc/include/asm/uaccess.h +++ b/arch/sparc/include/asm/uaccess.h @@ -1,6 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef ___ASM_SPARC_UACCESS_H #define ___ASM_SPARC_UACCESS_H + +#include <asm/extable.h> + #if defined(__sparc__) && defined(__arch64__) #include <asm/uaccess_64.h> #else diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h index 0a2d3ebc4bb8..4a12346bb69c 100644 --- a/arch/sparc/include/asm/uaccess_32.h +++ b/arch/sparc/include/asm/uaccess_32.h @@ -13,9 +13,6 @@ #include <asm/processor.h> -#define ARCH_HAS_SORT_EXTABLE -#define ARCH_HAS_SEARCH_EXTABLE - /* Sparc is not segmented, however we need to be able to fool access_ok() * when doing system calls from kernel mode legitimately. * @@ -40,36 +37,6 @@ #define __access_ok(addr, size) (__user_ok((addr) & get_fs().seg, (size))) #define access_ok(addr, size) __access_ok((unsigned long)(addr), size) -/* - * The exception table consists of pairs of addresses: the first is the - * address of an instruction that is allowed to fault, and the second is - * the address at which the program should continue. No registers are - * modified, so it is entirely up to the continuation code to figure out - * what to do. - * - * All the routines below use bits of fixup code that are out of line - * with the main instruction path. This means when everything is well, - * we don't even have to jump over them. Further, they do not intrude - * on our cache or tlb entries. - * - * There is a special way how to put a range of potentially faulting - * insns (like twenty ldd/std's with now intervening other instructions) - * You specify address of first in insn and 0 in fixup and in the next - * exception_table_entry you specify last potentially faulting insn + 1 - * and in fixup the routine which should handle the fault. - * That fixup code will get - * (faulting_insn_address - first_insn_in_the_range_address)/4 - * in %g2 (ie. index of the faulting instruction in the range). - */ - -struct exception_table_entry -{ - unsigned long insn, fixup; -}; - -/* Returns 0 if exception not found and fixup otherwise. */ -unsigned long search_extables_range(unsigned long addr, unsigned long *g2); - /* Uh, these should become the main single-value transfer routines.. * They automatically use the right size if we just have the right * pointer type.. @@ -252,12 +219,7 @@ static inline unsigned long __clear_user(void __user *addr, unsigned long size) unsigned long ret; __asm__ __volatile__ ( - ".section __ex_table,#alloc\n\t" - ".align 4\n\t" - ".word 1f,3\n\t" - ".previous\n\t" "mov %2, %%o1\n" - "1:\n\t" "call __bzero\n\t" " mov %1, %%o0\n\t" "mov %%o0, %0\n" diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h index 698cf69f74e9..30eb4c6414d1 100644 --- a/arch/sparc/include/asm/uaccess_64.h +++ b/arch/sparc/include/asm/uaccess_64.h @@ -10,7 +10,6 @@ #include <linux/string.h> #include <asm/asi.h> #include <asm/spitfire.h> -#include <asm/extable_64.h> #include <asm/processor.h> diff --git a/arch/sparc/kernel/head_32.S b/arch/sparc/kernel/head_32.S index be30c8d4cc73..6044b82b9767 100644 --- a/arch/sparc/kernel/head_32.S +++ b/arch/sparc/kernel/head_32.S @@ -515,7 +515,7 @@ continue_boot: /* I want a kernel stack NOW! */ set init_thread_union, %g1 - set (THREAD_SIZE - STACKFRAME_SZ), %g2 + set (THREAD_SIZE - STACKFRAME_SZ - TRACEREG_SZ), %g2 add %g1, %g2, %sp mov 0, %fp /* And for good luck */ diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S index c5ff2472b3d9..72a5bdc833ea 100644 --- a/arch/sparc/kernel/head_64.S +++ b/arch/sparc/kernel/head_64.S @@ -706,7 +706,7 @@ tlb_fixup_done: wr %g0, ASI_P, %asi mov 1, %g1 sllx %g1, THREAD_SHIFT, %g1 - sub %g1, (STACKFRAME_SZ + STACK_BIAS), %g1 + sub %g1, (STACKFRAME_SZ + STACK_BIAS + TRACEREG_SZ), %g1 add %g6, %g1, %sp /* Set per-cpu pointer initially to zero, this makes diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index b91e88058e0c..3b9794978e5b 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -216,16 +216,6 @@ void flush_thread(void) clear_thread_flag(TIF_USEDFPU); #endif } - - /* This task is no longer a kernel thread. */ - if (current->thread.flags & SPARC_FLAG_KTHREAD) { - current->thread.flags &= ~SPARC_FLAG_KTHREAD; - - /* We must fixup kregs as well. */ - /* XXX This was not fixed for ti for a while, worked. Unused? */ - current->thread.kregs = (struct pt_regs *) - (task_stack_page(current) + (THREAD_SIZE - TRACEREG_SZ)); - } } static inline struct sparc_stackf __user * @@ -313,7 +303,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, extern int nwindows; unsigned long psr; memset(new_stack, 0, STACKFRAME_SZ + TRACEREG_SZ); - p->thread.flags |= SPARC_FLAG_KTHREAD; p->thread.current_ds = KERNEL_DS; ti->kpc = (((unsigned long) ret_from_kernel_thread) - 0x8); childregs->u_regs[UREG_G1] = sp; /* function */ @@ -325,7 +314,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, } memcpy(new_stack, (char *)regs - STACKFRAME_SZ, STACKFRAME_SZ + TRACEREG_SZ); childregs->u_regs[UREG_FP] = sp; - p->thread.flags &= ~SPARC_FLAG_KTHREAD; p->thread.current_ds = USER_DS; ti->kpc = (((unsigned long) ret_from_fork) - 0x8); ti->kpsr = current->thread.fork_kpsr | PSR_PIL; diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c index eea43a1aef1b..c8e0dd99f370 100644 --- a/arch/sparc/kernel/setup_32.c +++ b/arch/sparc/kernel/setup_32.c @@ -266,7 +266,6 @@ static __init void leon_patch(void) } struct tt_entry *sparc_ttable; -static struct pt_regs fake_swapper_regs; /* Called from head_32.S - before we have setup anything * in the kernel. Be very careful with what you do here. @@ -363,8 +362,6 @@ void __init setup_arch(char **cmdline_p) (*(linux_dbvec->teach_debugger))(); } - init_task.thread.kregs = &fake_swapper_regs; - /* Run-time patch instructions to match the cpu model */ per_cpu_patch(); diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c index d87244197d5c..48abee4eee29 100644 --- a/arch/sparc/kernel/setup_64.c +++ b/arch/sparc/kernel/setup_64.c @@ -165,8 +165,6 @@ extern int root_mountflags; char reboot_command[COMMAND_LINE_SIZE]; -static struct pt_regs fake_swapper_regs = { { 0, }, 0, 0, 0, 0 }; - static void __init per_cpu_patch(void) { struct cpuid_patch_entry *p; @@ -661,8 +659,6 @@ void __init setup_arch(char **cmdline_p) rd_image_start = ram_flags & RAMDISK_IMAGE_START_MASK; #endif - task_thread_info(&init_task)->kregs = &fake_swapper_regs; - #ifdef CONFIG_IP_PNP if (!ic_set_manually) { phandle chosen = prom_finddevice("/chosen"); diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index d92e5eaa4c1d..a850dccd78ea 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -275,14 +275,13 @@ bool is_no_fault_exception(struct pt_regs *regs) asi = (regs->tstate >> 24); /* saved %asi */ else asi = (insn >> 5); /* immediate asi */ - if ((asi & 0xf2) == ASI_PNF) { - if (insn & 0x1000000) { /* op3[5:4]=3 */ - handle_ldf_stq(insn, regs); - return true; - } else if (insn & 0x200000) { /* op3[2], stores */ + if ((asi & 0xf6) == ASI_PNF) { + if (insn & 0x200000) /* op3[2], stores */ return false; - } - handle_ld_nf(insn, regs); + if (insn & 0x1000000) /* op3[5:4]=3 (fp) */ + handle_ldf_stq(insn, regs); + else + handle_ld_nf(insn, regs); return true; } } diff --git a/arch/sparc/kernel/unaligned_32.c b/arch/sparc/kernel/unaligned_32.c index 83db94c0b431..ef5c5207c9ff 100644 --- a/arch/sparc/kernel/unaligned_32.c +++ b/arch/sparc/kernel/unaligned_32.c @@ -16,6 +16,7 @@ #include <linux/uaccess.h> #include <linux/smp.h> #include <linux/perf_event.h> +#include <linux/extable.h> #include <asm/setup.h> @@ -213,10 +214,10 @@ static inline int ok_for_kernel(unsigned int insn) static void kernel_mna_trap_fault(struct pt_regs *regs, unsigned int insn) { - unsigned long g2 = regs->u_regs [UREG_G2]; - unsigned long fixup = search_extables_range(regs->pc, &g2); + const struct exception_table_entry *entry; - if (!fixup) { + entry = search_exception_tables(regs->pc); + if (!entry) { unsigned long address = compute_effective_address(regs, insn); if(address < PAGE_SIZE) { printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference in mna handler"); @@ -232,9 +233,8 @@ static void kernel_mna_trap_fault(struct pt_regs *regs, unsigned int insn) die_if_kernel("Oops", regs); /* Not reached */ } - regs->pc = fixup; + regs->pc = entry->fixup; regs->npc = regs->pc + 4; - regs->u_regs [UREG_G2] = g2; } asmlinkage void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn) @@ -274,103 +274,9 @@ asmlinkage void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn) } } -static inline int ok_for_user(struct pt_regs *regs, unsigned int insn, - enum direction dir) -{ - unsigned int reg; - int size = ((insn >> 19) & 3) == 3 ? 8 : 4; - - if ((regs->pc | regs->npc) & 3) - return 0; - - /* Must access_ok() in all the necessary places. */ -#define WINREG_ADDR(regnum) \ - ((void __user *)(((unsigned long *)regs->u_regs[UREG_FP])+(regnum))) - - reg = (insn >> 25) & 0x1f; - if (reg >= 16) { - if (!access_ok(WINREG_ADDR(reg - 16), size)) - return -EFAULT; - } - reg = (insn >> 14) & 0x1f; - if (reg >= 16) { - if (!access_ok(WINREG_ADDR(reg - 16), size)) - return -EFAULT; - } - if (!(insn & 0x2000)) { - reg = (insn & 0x1f); - if (reg >= 16) { - if (!access_ok(WINREG_ADDR(reg - 16), size)) - return -EFAULT; - } - } -#undef WINREG_ADDR - return 0; -} - -static void user_mna_trap_fault(struct pt_regs *regs, unsigned int insn) +asmlinkage void user_unaligned_trap(struct pt_regs *regs, unsigned int insn) { send_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)safe_compute_effective_address(regs, insn), 0, current); } - -asmlinkage void user_unaligned_trap(struct pt_regs *regs, unsigned int insn) -{ - enum direction dir; - - if(!(current->thread.flags & SPARC_FLAG_UNALIGNED) || - (((insn >> 30) & 3) != 3)) - goto kill_user; - dir = decode_direction(insn); - if(!ok_for_user(regs, insn, dir)) { - goto kill_user; - } else { - int err, size = decode_access_size(insn); - unsigned long addr; - - if(floating_point_load_or_store_p(insn)) { - printk("User FPU load/store unaligned unsupported.\n"); - goto kill_user; - } - - addr = compute_effective_address(regs, insn); - perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr); - switch(dir) { - case load: - err = do_int_load(fetch_reg_addr(((insn>>25)&0x1f), - regs), - size, (unsigned long *) addr, - decode_signedness(insn)); - break; - - case store: - err = do_int_store(((insn>>25)&0x1f), size, - (unsigned long *) addr, regs); - break; - - case both: - /* - * This was supported in 2.4. However, we question - * the value of SWAP instruction across word boundaries. - */ - printk("Unaligned SWAP unsupported.\n"); - err = -EFAULT; - break; - - default: - unaligned_panic("Impossible user unaligned trap."); - goto out; - } - if (err) - goto kill_user; - else - advance(regs); - goto out; - } - -kill_user: - user_mna_trap_fault(regs, insn); -out: - ; -} diff --git a/arch/sparc/lib/checksum_32.S b/arch/sparc/lib/checksum_32.S index 7488d130faf7..781e39b3c009 100644 --- a/arch/sparc/lib/checksum_32.S +++ b/arch/sparc/lib/checksum_32.S @@ -155,13 +155,6 @@ cpout: retl ! get outta here .text; \ .align 4 -#define EXT(start,end) \ - .section __ex_table,ALLOC; \ - .align 4; \ - .word start, 0, end, cc_fault; \ - .text; \ - .align 4 - /* This aligned version executes typically in 8.5 superscalar cycles, this * is the best I can do. I say 8.5 because the final add will pair with * the next ldd in the main unrolled loop. Thus the pipe is always full. @@ -169,20 +162,20 @@ cpout: retl ! get outta here * please check the fixup code below as well. */ #define CSUMCOPY_BIGCHUNK_ALIGNED(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \ - ldd [src + off + 0x00], t0; \ - ldd [src + off + 0x08], t2; \ + EX(ldd [src + off + 0x00], t0); \ + EX(ldd [src + off + 0x08], t2); \ addxcc t0, sum, sum; \ - ldd [src + off + 0x10], t4; \ + EX(ldd [src + off + 0x10], t4); \ addxcc t1, sum, sum; \ - ldd [src + off + 0x18], t6; \ + EX(ldd [src + off + 0x18], t6); \ addxcc t2, sum, sum; \ - std t0, [dst + off + 0x00]; \ + EX(std t0, [dst + off + 0x00]); \ addxcc t3, sum, sum; \ - std t2, [dst + off + 0x08]; \ + EX(std t2, [dst + off + 0x08]); \ addxcc t4, sum, sum; \ - std t4, [dst + off + 0x10]; \ + EX(std t4, [dst + off + 0x10]); \ addxcc t5, sum, sum; \ - std t6, [dst + off + 0x18]; \ + EX(std t6, [dst + off + 0x18]); \ addxcc t6, sum, sum; \ addxcc t7, sum, sum; @@ -191,39 +184,39 @@ cpout: retl ! get outta here * Viking MXCC into streaming mode. Ho hum... */ #define CSUMCOPY_BIGCHUNK(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \ - ldd [src + off + 0x00], t0; \ - ldd [src + off + 0x08], t2; \ - ldd [src + off + 0x10], t4; \ - ldd [src + off + 0x18], t6; \ - st t0, [dst + off + 0x00]; \ + EX(ldd [src + off + 0x00], t0); \ + EX(ldd [src + off + 0x08], t2); \ + EX(ldd [src + off + 0x10], t4); \ + EX(ldd [src + off + 0x18], t6); \ + EX(st t0, [dst + off + 0x00]); \ addxcc t0, sum, sum; \ - st t1, [dst + off + 0x04]; \ + EX(st t1, [dst + off + 0x04]); \ addxcc t1, sum, sum; \ - st t2, [dst + off + 0x08]; \ + EX(st t2, [dst + off + 0x08]); \ addxcc t2, sum, sum; \ - st t3, [dst + off + 0x0c]; \ + EX(st t3, [dst + off + 0x0c]); \ addxcc t3, sum, sum; \ - st t4, [dst + off + 0x10]; \ + EX(st t4, [dst + off + 0x10]); \ addxcc t4, sum, sum; \ - st t5, [dst + off + 0x14]; \ + EX(st t5, [dst + off + 0x14]); \ addxcc t5, sum, sum; \ - st t6, [dst + off + 0x18]; \ + EX(st t6, [dst + off + 0x18]); \ addxcc t6, sum, sum; \ - st t7, [dst + off + 0x1c]; \ + EX(st t7, [dst + off + 0x1c]); \ addxcc t7, sum, sum; /* Yuck, 6 superscalar cycles... */ #define CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1, t2, t3) \ - ldd [src - off - 0x08], t0; \ - ldd [src - off - 0x00], t2; \ + EX(ldd [src - off - 0x08], t0); \ + EX(ldd [src - off - 0x00], t2); \ addxcc t0, sum, sum; \ - st t0, [dst - off - 0x08]; \ + EX(st t0, [dst - off - 0x08]); \ addxcc t1, sum, sum; \ - st t1, [dst - off - 0x04]; \ + EX(st t1, [dst - off - 0x04]); \ addxcc t2, sum, sum; \ - st t2, [dst - off - 0x00]; \ + EX(st t2, [dst - off - 0x00]); \ addxcc t3, sum, sum; \ - st t3, [dst - off + 0x04]; + EX(st t3, [dst - off + 0x04]); /* Handle the end cruft code out of band for better cache patterns. */ cc_end_cruft: @@ -331,7 +324,6 @@ __csum_partial_copy_sparc_generic: CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) -10: EXT(5b, 10b) ! note for exception handling sub %g1, 128, %g1 ! detract from length addx %g0, %g7, %g7 ! add in last carry bit andcc %g1, 0xffffff80, %g0 ! more to csum? @@ -356,8 +348,7 @@ cctbl: CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x68,%g2,%g3,%g4,%g5) CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x28,%g2,%g3,%g4,%g5) CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x18,%g2,%g3,%g4,%g5) CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x08,%g2,%g3,%g4,%g5) -12: EXT(cctbl, 12b) ! note for exception table handling - addx %g0, %g7, %g7 +12: addx %g0, %g7, %g7 andcc %o3, 0xf, %g0 ! check for low bits set ccte: bne cc_end_cruft ! something left, handle it out of band andcc %o3, 8, %g0 ! begin checks for that code @@ -367,7 +358,6 @@ ccdbl: CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) -11: EXT(ccdbl, 11b) ! note for exception table handling sub %g1, 128, %g1 ! detract from length addx %g0, %g7, %g7 ! add in last carry bit andcc %g1, 0xffffff80, %g0 ! more to csum? diff --git a/arch/sparc/lib/copy_user.S b/arch/sparc/lib/copy_user.S index dc72f2b970b7..954572c78539 100644 --- a/arch/sparc/lib/copy_user.S +++ b/arch/sparc/lib/copy_user.S @@ -21,98 +21,134 @@ /* Work around cpp -rob */ #define ALLOC #alloc #define EXECINSTR #execinstr + +#define EX_ENTRY(l1, l2) \ + .section __ex_table,ALLOC; \ + .align 4; \ + .word l1, l2; \ + .text; + #define EX(x,y,a,b) \ 98: x,y; \ .section .fixup,ALLOC,EXECINSTR; \ .align 4; \ -99: ba fixupretl; \ - a, b, %g3; \ - .section __ex_table,ALLOC; \ - .align 4; \ - .word 98b, 99b; \ - .text; \ - .align 4 +99: retl; \ + a, b, %o0; \ + EX_ENTRY(98b, 99b) #define EX2(x,y,c,d,e,a,b) \ 98: x,y; \ .section .fixup,ALLOC,EXECINSTR; \ .align 4; \ 99: c, d, e; \ - ba fixupretl; \ - a, b, %g3; \ - .section __ex_table,ALLOC; \ - .align 4; \ - .word 98b, 99b; \ - .text; \ - .align 4 + retl; \ + a, b, %o0; \ + EX_ENTRY(98b, 99b) #define EXO2(x,y) \ 98: x, y; \ - .section __ex_table,ALLOC; \ - .align 4; \ - .word 98b, 97f; \ - .text; \ - .align 4 + EX_ENTRY(98b, 97f) -#define EXT(start,end,handler) \ - .section __ex_table,ALLOC; \ - .align 4; \ - .word start, 0, end, handler; \ - .text; \ - .align 4 +#define LD(insn, src, offset, reg, label) \ +98: insn [%src + (offset)], %reg; \ + .section .fixup,ALLOC,EXECINSTR; \ +99: ba label; \ + mov offset, %g5; \ + EX_ENTRY(98b, 99b) -/* Please do not change following macros unless you change logic used - * in .fixup at the end of this file as well - */ +#define ST(insn, dst, offset, reg, label) \ +98: insn %reg, [%dst + (offset)]; \ + .section .fixup,ALLOC,EXECINSTR; \ +99: ba label; \ + mov offset, %g5; \ + EX_ENTRY(98b, 99b) /* Both these macros have to start with exactly the same insn */ +/* left: g7 + (g1 % 128) - offset */ #define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \ - ldd [%src + (offset) + 0x00], %t0; \ - ldd [%src + (offset) + 0x08], %t2; \ - ldd [%src + (offset) + 0x10], %t4; \ - ldd [%src + (offset) + 0x18], %t6; \ - st %t0, [%dst + (offset) + 0x00]; \ - st %t1, [%dst + (offset) + 0x04]; \ - st %t2, [%dst + (offset) + 0x08]; \ - st %t3, [%dst + (offset) + 0x0c]; \ - st %t4, [%dst + (offset) + 0x10]; \ - st %t5, [%dst + (offset) + 0x14]; \ - st %t6, [%dst + (offset) + 0x18]; \ - st %t7, [%dst + (offset) + 0x1c]; - + LD(ldd, src, offset + 0x00, t0, bigchunk_fault) \ + LD(ldd, src, offset + 0x08, t2, bigchunk_fault) \ + LD(ldd, src, offset + 0x10, t4, bigchunk_fault) \ + LD(ldd, src, offset + 0x18, t6, bigchunk_fault) \ + ST(st, dst, offset + 0x00, t0, bigchunk_fault) \ + ST(st, dst, offset + 0x04, t1, bigchunk_fault) \ + ST(st, dst, offset + 0x08, t2, bigchunk_fault) \ + ST(st, dst, offset + 0x0c, t3, bigchunk_fault) \ + ST(st, dst, offset + 0x10, t4, bigchunk_fault) \ + ST(st, dst, offset + 0x14, t5, bigchunk_fault) \ + ST(st, dst, offset + 0x18, t6, bigchunk_fault) \ + ST(st, dst, offset + 0x1c, t7, bigchunk_fault) + +/* left: g7 + (g1 % 128) - offset */ #define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \ - ldd [%src + (offset) + 0x00], %t0; \ - ldd [%src + (offset) + 0x08], %t2; \ - ldd [%src + (offset) + 0x10], %t4; \ - ldd [%src + (offset) + 0x18], %t6; \ - std %t0, [%dst + (offset) + 0x00]; \ - std %t2, [%dst + (offset) + 0x08]; \ - std %t4, [%dst + (offset) + 0x10]; \ - std %t6, [%dst + (offset) + 0x18]; + LD(ldd, src, offset + 0x00, t0, bigchunk_fault) \ + LD(ldd, src, offset + 0x08, t2, bigchunk_fault) \ + LD(ldd, src, offset + 0x10, t4, bigchunk_fault) \ + LD(ldd, src, offset + 0x18, t6, bigchunk_fault) \ + ST(std, dst, offset + 0x00, t0, bigchunk_fault) \ + ST(std, dst, offset + 0x08, t2, bigchunk_fault) \ + ST(std, dst, offset + 0x10, t4, bigchunk_fault) \ + ST(std, dst, offset + 0x18, t6, bigchunk_fault) + .section .fixup,#alloc,#execinstr +bigchunk_fault: + sub %g7, %g5, %o0 + and %g1, 127, %g1 + retl + add %o0, %g1, %o0 + +/* left: offset + 16 + (g1 % 16) */ #define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \ - ldd [%src - (offset) - 0x10], %t0; \ - ldd [%src - (offset) - 0x08], %t2; \ - st %t0, [%dst - (offset) - 0x10]; \ - st %t1, [%dst - (offset) - 0x0c]; \ - st %t2, [%dst - (offset) - 0x08]; \ - st %t3, [%dst - (offset) - 0x04]; + LD(ldd, src, -(offset + 0x10), t0, lastchunk_fault) \ + LD(ldd, src, -(offset + 0x08), t2, lastchunk_fault) \ + ST(st, dst, -(offset + 0x10), t0, lastchunk_fault) \ + ST(st, dst, -(offset + 0x0c), t1, lastchunk_fault) \ + ST(st, dst, -(offset + 0x08), t2, lastchunk_fault) \ + ST(st, dst, -(offset + 0x04), t3, lastchunk_fault) -#define MOVE_HALFCHUNK(src, dst, offset, t0, t1, t2, t3) \ - lduh [%src + (offset) + 0x00], %t0; \ - lduh [%src + (offset) + 0x02], %t1; \ - lduh [%src + (offset) + 0x04], %t2; \ - lduh [%src + (offset) + 0x06], %t3; \ - sth %t0, [%dst + (offset) + 0x00]; \ - sth %t1, [%dst + (offset) + 0x02]; \ - sth %t2, [%dst + (offset) + 0x04]; \ - sth %t3, [%dst + (offset) + 0x06]; + .section .fixup,#alloc,#execinstr +lastchunk_fault: + and %g1, 15, %g1 + retl + sub %g1, %g5, %o0 +/* left: o3 + (o2 % 16) - offset */ +#define MOVE_HALFCHUNK(src, dst, offset, t0, t1, t2, t3) \ + LD(lduh, src, offset + 0x00, t0, halfchunk_fault) \ + LD(lduh, src, offset + 0x02, t1, halfchunk_fault) \ + LD(lduh, src, offset + 0x04, t2, halfchunk_fault) \ + LD(lduh, src, offset + 0x06, t3, halfchunk_fault) \ + ST(sth, dst, offset + 0x00, t0, halfchunk_fault) \ + ST(sth, dst, offset + 0x02, t1, halfchunk_fault) \ + ST(sth, dst, offset + 0x04, t2, halfchunk_fault) \ + ST(sth, dst, offset + 0x06, t3, halfchunk_fault) + +/* left: o3 + (o2 % 16) + offset + 2 */ #define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \ - ldub [%src - (offset) - 0x02], %t0; \ - ldub [%src - (offset) - 0x01], %t1; \ - stb %t0, [%dst - (offset) - 0x02]; \ - stb %t1, [%dst - (offset) - 0x01]; + LD(ldub, src, -(offset + 0x02), t0, halfchunk_fault) \ + LD(ldub, src, -(offset + 0x01), t1, halfchunk_fault) \ + ST(stb, dst, -(offset + 0x02), t0, halfchunk_fault) \ + ST(stb, dst, -(offset + 0x01), t1, halfchunk_fault) + + .section .fixup,#alloc,#execinstr +halfchunk_fault: + and %o2, 15, %o2 + sub %o3, %g5, %o3 + retl + add %o2, %o3, %o0 + +/* left: offset + 2 + (o2 % 2) */ +#define MOVE_LAST_SHORTCHUNK(src, dst, offset, t0, t1) \ + LD(ldub, src, -(offset + 0x02), t0, last_shortchunk_fault) \ + LD(ldub, src, -(offset + 0x01), t1, last_shortchunk_fault) \ + ST(stb, dst, -(offset + 0x02), t0, last_shortchunk_fault) \ + ST(stb, dst, -(offset + 0x01), t1, last_shortchunk_fault) + + .section .fixup,#alloc,#execinstr +last_shortchunk_fault: + and %o2, 1, %o2 + retl + sub %o2, %g5, %o0 .text .align 4 @@ -182,8 +218,6 @@ __copy_user: /* %o0=dst %o1=src %o2=len */ MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5) MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5) MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) -80: - EXT(5b, 80b, 50f) subcc %g7, 128, %g7 add %o1, 128, %o1 bne 5b @@ -201,7 +235,6 @@ __copy_user: /* %o0=dst %o1=src %o2=len */ jmpl %o5 + %lo(copy_user_table_end), %g0 add %o0, %g7, %o0 -copy_user_table: MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5) MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5) MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5) @@ -210,7 +243,6 @@ copy_user_table: MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5) MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5) copy_user_table_end: - EXT(copy_user_table, copy_user_table_end, 51f) be copy_user_last7 andcc %g1, 4, %g0 @@ -250,8 +282,6 @@ ldd_std: MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5) MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5) MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) -81: - EXT(ldd_std, 81b, 52f) subcc %g7, 128, %g7 add %o1, 128, %o1 bne ldd_std @@ -290,8 +320,6 @@ cannot_optimize: 10: MOVE_HALFCHUNK(o1, o0, 0x00, g2, g3, g4, g5) MOVE_HALFCHUNK(o1, o0, 0x08, g2, g3, g4, g5) -82: - EXT(10b, 82b, 53f) subcc %o3, 0x10, %o3 add %o1, 0x10, %o1 bne 10b @@ -308,8 +336,6 @@ byte_chunk: MOVE_SHORTCHUNK(o1, o0, -0x0c, g2, g3) MOVE_SHORTCHUNK(o1, o0, -0x0e, g2, g3) MOVE_SHORTCHUNK(o1, o0, -0x10, g2, g3) -83: - EXT(byte_chunk, 83b, 54f) subcc %o3, 0x10, %o3 add %o1, 0x10, %o1 bne byte_chunk @@ -325,16 +351,14 @@ short_end: add %o1, %o3, %o1 jmpl %o5 + %lo(short_table_end), %g0 andcc %o2, 1, %g0 -84: - MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3) - MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3) - MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3) - MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3) - MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3) - MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3) - MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3) + MOVE_LAST_SHORTCHUNK(o1, o0, 0x0c, g2, g3) + MOVE_LAST_SHORTCHUNK(o1, o0, 0x0a, g2, g3) + MOVE_LAST_SHORTCHUNK(o1, o0, 0x08, g2, g3) + MOVE_LAST_SHORTCHUNK(o1, o0, 0x06, g2, g3) + MOVE_LAST_SHORTCHUNK(o1, o0, 0x04, g2, g3) + MOVE_LAST_SHORTCHUNK(o1, o0, 0x02, g2, g3) + MOVE_LAST_SHORTCHUNK(o1, o0, 0x00, g2, g3) short_table_end: - EXT(84b, short_table_end, 55f) be 1f nop EX(ldub [%o1], %g2, add %g0, 1) @@ -363,123 +387,8 @@ short_aligned_end: .section .fixup,#alloc,#execinstr .align 4 97: - mov %o2, %g3 -fixupretl: retl - mov %g3, %o0 - -/* exception routine sets %g2 to (broken_insn - first_insn)>>2 */ -50: -/* This magic counts how many bytes are left when crash in MOVE_BIGCHUNK - * happens. This is derived from the amount ldd reads, st stores, etc. - * x = g2 % 12; - * g3 = g1 + g7 - ((g2 / 12) * 32 + (x < 4) ? 0 : (x - 4) * 4); - * o0 += (g2 / 12) * 32; - */ - cmp %g2, 12 - add %o0, %g7, %o0 - bcs 1f - cmp %g2, 24 - bcs 2f - cmp %g2, 36 - bcs 3f - nop - sub %g2, 12, %g2 - sub %g7, 32, %g7 -3: sub %g2, 12, %g2 - sub %g7, 32, %g7 -2: sub %g2, 12, %g2 - sub %g7, 32, %g7 -1: cmp %g2, 4 - bcs,a 60f - clr %g2 - sub %g2, 4, %g2 - sll %g2, 2, %g2 -60: and %g1, 0x7f, %g3 - sub %o0, %g7, %o0 - add %g3, %g7, %g3 - ba fixupretl - sub %g3, %g2, %g3 -51: -/* i = 41 - g2; j = i % 6; - * g3 = (g1 & 15) + (i / 6) * 16 + (j < 4) ? (j + 1) * 4 : 16; - * o0 -= (i / 6) * 16 + 16; - */ - neg %g2 - and %g1, 0xf, %g1 - add %g2, 41, %g2 - add %o0, %g1, %o0 -1: cmp %g2, 6 - bcs,a 2f - cmp %g2, 4 - add %g1, 16, %g1 - b 1b - sub %g2, 6, %g2 -2: bcc,a 2f - mov 16, %g2 - inc %g2 - sll %g2, 2, %g2 -2: add %g1, %g2, %g3 - ba fixupretl - sub %o0, %g3, %o0 -52: -/* g3 = g1 + g7 - (g2 / 8) * 32 + (g2 & 4) ? (g2 & 3) * 8 : 0; - o0 += (g2 / 8) * 32 */ - andn %g2, 7, %g4 - add %o0, %g7, %o0 - andcc %g2, 4, %g0 - and %g2, 3, %g2 - sll %g4, 2, %g4 - sll %g2, 3, %g2 - bne 60b - sub %g7, %g4, %g7 - ba 60b - clr %g2 -53: -/* g3 = o3 + (o2 & 15) - (g2 & 8) - (g2 & 4) ? (g2 & 3) * 2 : 0; - o0 += (g2 & 8) */ - and %g2, 3, %g4 - andcc %g2, 4, %g0 - and %g2, 8, %g2 - sll %g4, 1, %g4 - be 1f - add %o0, %g2, %o0 - add %g2, %g4, %g2 -1: and %o2, 0xf, %g3 - add %g3, %o3, %g3 - ba fixupretl - sub %g3, %g2, %g3 -54: -/* g3 = o3 + (o2 & 15) - (g2 / 4) * 2 - (g2 & 2) ? (g2 & 1) : 0; - o0 += (g2 / 4) * 2 */ - srl %g2, 2, %o4 - and %g2, 1, %o5 - srl %g2, 1, %g2 - add %o4, %o4, %o4 - and %o5, %g2, %o5 - and %o2, 0xf, %o2 - add %o0, %o4, %o0 - sub %o3, %o5, %o3 - sub %o2, %o4, %o2 - ba fixupretl - add %o2, %o3, %g3 -55: -/* i = 27 - g2; - g3 = (o2 & 1) + i / 4 * 2 + !(i & 3); - o0 -= i / 4 * 2 + 1 */ - neg %g2 - and %o2, 1, %o2 - add %g2, 27, %g2 - srl %g2, 2, %o5 - andcc %g2, 3, %g0 - mov 1, %g2 - add %o5, %o5, %o5 - be,a 1f - clr %g2 -1: add %g2, %o5, %g3 - sub %o0, %g3, %o0 - ba fixupretl - add %g3, %o2, %g3 + mov %o2, %o0 .globl __copy_user_end __copy_user_end: diff --git a/arch/sparc/lib/memset.S b/arch/sparc/lib/memset.S index f427f34b8b79..eaff68213fdf 100644 --- a/arch/sparc/lib/memset.S +++ b/arch/sparc/lib/memset.S @@ -19,7 +19,7 @@ 98: x,y; \ .section .fixup,ALLOC,EXECINSTR; \ .align 4; \ -99: ba 30f; \ +99: retl; \ a, b, %o0; \ .section __ex_table,ALLOC; \ .align 4; \ @@ -27,35 +27,44 @@ .text; \ .align 4 -#define EXT(start,end,handler) \ +#define STORE(source, base, offset, n) \ +98: std source, [base + offset + n]; \ + .section .fixup,ALLOC,EXECINSTR; \ + .align 4; \ +99: ba 30f; \ + sub %o3, n - offset, %o3; \ .section __ex_table,ALLOC; \ .align 4; \ - .word start, 0, end, handler; \ + .word 98b, 99b; \ .text; \ - .align 4 + .align 4; + +#define STORE_LAST(source, base, offset, n) \ + EX(std source, [base - offset - n], \ + add %o1, offset + n); /* Please don't change these macros, unless you change the logic * in the .fixup section below as well. * Store 64 bytes at (BASE + OFFSET) using value SOURCE. */ -#define ZERO_BIG_BLOCK(base, offset, source) \ - std source, [base + offset + 0x00]; \ - std source, [base + offset + 0x08]; \ - std source, [base + offset + 0x10]; \ - std source, [base + offset + 0x18]; \ - std source, [base + offset + 0x20]; \ - std source, [base + offset + 0x28]; \ - std source, [base + offset + 0x30]; \ - std source, [base + offset + 0x38]; +#define ZERO_BIG_BLOCK(base, offset, source) \ + STORE(source, base, offset, 0x00); \ + STORE(source, base, offset, 0x08); \ + STORE(source, base, offset, 0x10); \ + STORE(source, base, offset, 0x18); \ + STORE(source, base, offset, 0x20); \ + STORE(source, base, offset, 0x28); \ + STORE(source, base, offset, 0x30); \ + STORE(source, base, offset, 0x38); #define ZERO_LAST_BLOCKS(base, offset, source) \ - std source, [base - offset - 0x38]; \ - std source, [base - offset - 0x30]; \ - std source, [base - offset - 0x28]; \ - std source, [base - offset - 0x20]; \ - std source, [base - offset - 0x18]; \ - std source, [base - offset - 0x10]; \ - std source, [base - offset - 0x08]; \ - std source, [base - offset - 0x00]; + STORE_LAST(source, base, offset, 0x38); \ + STORE_LAST(source, base, offset, 0x30); \ + STORE_LAST(source, base, offset, 0x28); \ + STORE_LAST(source, base, offset, 0x20); \ + STORE_LAST(source, base, offset, 0x18); \ + STORE_LAST(source, base, offset, 0x10); \ + STORE_LAST(source, base, offset, 0x08); \ + STORE_LAST(source, base, offset, 0x00); .text .align 4 @@ -68,8 +77,6 @@ __bzero_begin: .globl memset EXPORT_SYMBOL(__bzero) EXPORT_SYMBOL(memset) - .globl __memset_start, __memset_end -__memset_start: memset: mov %o0, %g1 mov 1, %g4 @@ -122,8 +129,6 @@ __bzero: ZERO_BIG_BLOCK(%o0, 0x00, %g2) subcc %o3, 128, %o3 ZERO_BIG_BLOCK(%o0, 0x40, %g2) -11: - EXT(10b, 11b, 20f) bne 10b add %o0, 128, %o0 @@ -138,11 +143,9 @@ __bzero: jmp %o4 add %o0, %o2, %o0 -12: ZERO_LAST_BLOCKS(%o0, 0x48, %g2) ZERO_LAST_BLOCKS(%o0, 0x08, %g2) 13: - EXT(12b, 13b, 21f) be 8f andcc %o1, 4, %g0 @@ -182,37 +185,13 @@ __bzero: 5: retl clr %o0 -__memset_end: .section .fixup,#alloc,#execinstr .align 4 -20: - cmp %g2, 8 - bleu 1f - and %o1, 0x7f, %o1 - sub %g2, 9, %g2 - add %o3, 64, %o3 -1: - sll %g2, 3, %g2 - add %o3, %o1, %o0 - b 30f - sub %o0, %g2, %o0 -21: - mov 8, %o0 - and %o1, 7, %o1 - sub %o0, %g2, %o0 - sll %o0, 3, %o0 - b 30f - add %o0, %o1, %o0 30: -/* %o4 is faulting address, %o5 is %pc where fault occurred */ - save %sp, -104, %sp - mov %i5, %o0 - mov %i7, %o1 - call lookup_fault - mov %i4, %o2 - ret - restore + and %o1, 0x7f, %o1 + retl + add %o3, %o1, %o0 .globl __bzero_end __bzero_end: diff --git a/arch/sparc/mm/Makefile b/arch/sparc/mm/Makefile index 68db1f859b02..871354aa3c00 100644 --- a/arch/sparc/mm/Makefile +++ b/arch/sparc/mm/Makefile @@ -8,7 +8,7 @@ ccflags-y := -Werror obj-$(CONFIG_SPARC64) += ultra.o tlb.o tsb.o obj-y += fault_$(BITS).o obj-y += init_$(BITS).o -obj-$(CONFIG_SPARC32) += extable.o srmmu.o iommu.o io-unit.o +obj-$(CONFIG_SPARC32) += srmmu.o iommu.o io-unit.o obj-$(CONFIG_SPARC32) += srmmu_access.o obj-$(CONFIG_SPARC32) += hypersparc.o viking.o tsunami.o swift.o obj-$(CONFIG_SPARC32) += leon_mm.o diff --git a/arch/sparc/mm/extable.c b/arch/sparc/mm/extable.c deleted file mode 100644 index 241b40641873..000000000000 --- a/arch/sparc/mm/extable.c +++ /dev/null @@ -1,107 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/arch/sparc/mm/extable.c - */ - -#include <linux/module.h> -#include <linux/extable.h> -#include <linux/uaccess.h> - -void sort_extable(struct exception_table_entry *start, - struct exception_table_entry *finish) -{ -} - -/* Caller knows they are in a range if ret->fixup == 0 */ -const struct exception_table_entry * -search_extable(const struct exception_table_entry *base, - const size_t num, - unsigned long value) -{ - int i; - - /* Single insn entries are encoded as: - * word 1: insn address - * word 2: fixup code address - * - * Range entries are encoded as: - * word 1: first insn address - * word 2: 0 - * word 3: last insn address + 4 bytes - * word 4: fixup code address - * - * Deleted entries are encoded as: - * word 1: unused - * word 2: -1 - * - * See asm/uaccess.h for more details. - */ - - /* 1. Try to find an exact match. */ - for (i = 0; i < num; i++) { - if (base[i].fixup == 0) { - /* A range entry, skip both parts. */ - i++; - continue; - } - - /* A deleted entry; see trim_init_extable */ - if (base[i].fixup == -1) - continue; - - if (base[i].insn == value) - return &base[i]; - } - - /* 2. Try to find a range match. */ - for (i = 0; i < (num - 1); i++) { - if (base[i].fixup) - continue; - - if (base[i].insn <= value && base[i + 1].insn > value) - return &base[i]; - - i++; - } - - return NULL; -} - -#ifdef CONFIG_MODULES -/* We could memmove them around; easier to mark the trimmed ones. */ -void trim_init_extable(struct module *m) -{ - unsigned int i; - bool range; - - for (i = 0; i < m->num_exentries; i += range ? 2 : 1) { - range = m->extable[i].fixup == 0; - - if (within_module_init(m->extable[i].insn, m)) { - m->extable[i].fixup = -1; - if (range) - m->extable[i+1].fixup = -1; - } - if (range) - i++; - } -} -#endif /* CONFIG_MODULES */ - -/* Special extable search, which handles ranges. Returns fixup */ -unsigned long search_extables_range(unsigned long addr, unsigned long *g2) -{ - const struct exception_table_entry *entry; - - entry = search_exception_tables(addr); - if (!entry) - return 0; - - /* Inside range? Fix g2 and return correct fixup */ - if (!entry->fixup) { - *g2 = (addr - entry->insn) / 4; - return (entry + 1)->fixup; - } - - return entry->fixup; -} diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 40ce087dfecf..de2031c2b2d7 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -23,6 +23,7 @@ #include <linux/interrupt.h> #include <linux/kdebug.h> #include <linux/uaccess.h> +#include <linux/extable.h> #include <asm/page.h> #include <asm/openprom.h> @@ -54,54 +55,6 @@ static void __noreturn unhandled_fault(unsigned long address, die_if_kernel("Oops", regs); } -asmlinkage int lookup_fault(unsigned long pc, unsigned long ret_pc, - unsigned long address) -{ - struct pt_regs regs; - unsigned long g2; - unsigned int insn; - int i; - - i = search_extables_range(ret_pc, &g2); - switch (i) { - case 3: - /* load & store will be handled by fixup */ - return 3; - - case 1: - /* store will be handled by fixup, load will bump out */ - /* for _to_ macros */ - insn = *((unsigned int *) pc); - if ((insn >> 21) & 1) - return 1; - break; - - case 2: - /* load will be handled by fixup, store will bump out */ - /* for _from_ macros */ - insn = *((unsigned int *) pc); - if (!((insn >> 21) & 1) || ((insn>>19)&0x3f) == 15) - return 2; - break; - - default: - break; - } - - memset(®s, 0, sizeof(regs)); - regs.pc = pc; - regs.npc = pc + 4; - __asm__ __volatile__( - "rd %%psr, %0\n\t" - "nop\n\t" - "nop\n\t" - "nop\n" : "=r" (regs.psr)); - unhandled_fault(address, current, ®s); - - /* Not reached */ - return 0; -} - static inline void show_signal_msg(struct pt_regs *regs, int sig, int code, unsigned long address, struct task_struct *tsk) @@ -162,8 +115,6 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, struct vm_area_struct *vma; struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; - unsigned int fixup; - unsigned long g2; int from_user = !(regs->psr & PSR_PS); int code; vm_fault_t fault; @@ -281,30 +232,19 @@ bad_area_nosemaphore: /* Is this in ex_table? */ no_context: - g2 = regs->u_regs[UREG_G2]; if (!from_user) { - fixup = search_extables_range(regs->pc, &g2); - /* Values below 10 are reserved for other things */ - if (fixup > 10) { - extern const unsigned int __memset_start[]; - extern const unsigned int __memset_end[]; + const struct exception_table_entry *entry; + entry = search_exception_tables(regs->pc); #ifdef DEBUG_EXCEPTIONS - printk("Exception: PC<%08lx> faddr<%08lx>\n", - regs->pc, address); - printk("EX_TABLE: insn<%08lx> fixup<%08x> g2<%08lx>\n", - regs->pc, fixup, g2); + printk("Exception: PC<%08lx> faddr<%08lx>\n", + regs->pc, address); + printk("EX_TABLE: insn<%08lx> fixup<%08x>\n", + regs->pc, entry->fixup); #endif - if ((regs->pc >= (unsigned long)__memset_start && - regs->pc < (unsigned long)__memset_end)) { - regs->u_regs[UREG_I4] = address; - regs->u_regs[UREG_I5] = regs->pc; - } - regs->u_regs[UREG_G2] = g2; - regs->pc = fixup; - regs->npc = regs->pc + 4; - return; - } + regs->pc = entry->fixup; + regs->npc = regs->pc + 4; + return; } unhandled_fault(address, tsk, regs); diff --git a/arch/sparc/mm/mm_32.h b/arch/sparc/mm/mm_32.h index ce750a99eea9..ee55f1080634 100644 --- a/arch/sparc/mm/mm_32.h +++ b/arch/sparc/mm/mm_32.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* fault_32.c - visible as they are called from assembler */ -asmlinkage int lookup_fault(unsigned long pc, unsigned long ret_pc, - unsigned long address); asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, unsigned long address); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2792879d398e..6e0b08310949 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -571,6 +571,7 @@ config X86_UV depends on X86_EXTENDED_PLATFORM depends on NUMA depends on EFI + depends on KEXEC_CORE depends on X86_X2APIC depends on PCI help @@ -777,6 +778,7 @@ if HYPERVISOR_GUEST config PARAVIRT bool "Enable paravirtualization code" + depends on HAVE_STATIC_CALL help This changes the kernel so it can modify itself when it is run under a hypervisor, potentially improving performance significantly @@ -1406,7 +1408,7 @@ config HIGHMEM4G config HIGHMEM64G bool "64GB" - depends on !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6 + depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6 select X86_PAE help Select this if you have a 32-bit processor and more than 4 @@ -1518,6 +1520,7 @@ config AMD_MEM_ENCRYPT select ARCH_USE_MEMREMAP_PROT select ARCH_HAS_FORCE_DMA_UNENCRYPTED select INSTRUCTION_DECODER + select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS help Say yes to enable support for the encryption of system memory. This requires an AMD processor that supports Secure Memory @@ -1931,6 +1934,7 @@ config X86_SGX depends on CRYPTO_SHA256=y select SRCU select MMU_NOTIFIER + select NUMA_KEEP_MEMINFO if NUMA help Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions that can be used by applications to set aside private regions of code diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 2d6d5a28c3bf..78faf9c7e3ae 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -27,12 +27,13 @@ endif REALMODE_CFLAGS := -m16 -g -Os -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ - -mno-mmx -mno-sse + -mno-mmx -mno-sse $(call cc-option,-fcf-protection=none) REALMODE_CFLAGS += -ffreestanding REALMODE_CFLAGS += -fno-stack-protector REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -Wno-address-of-packed-member) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4)) +REALMODE_CFLAGS += $(CLANG_FLAGS) export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index e0bc3988c3fa..6e5522aebbbd 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -46,6 +46,7 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS # Disable relocation relaxation in case the link is not PIE. KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h +KBUILD_CFLAGS += $(CLANG_FLAGS) # sev-es.c indirectly inludes inat-table.h which is generated during # compilation and stored in $(objtree). Add the directory to the includes so diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S index c4bb0f9363f5..95a223b3e56a 100644 --- a/arch/x86/boot/compressed/efi_thunk_64.S +++ b/arch/x86/boot/compressed/efi_thunk_64.S @@ -5,7 +5,7 @@ * Early support for invoking 32-bit EFI services from a 64-bit kernel. * * Because this thunking occurs before ExitBootServices() we have to - * restore the firmware's 32-bit GDT before we make EFI serivce calls, + * restore the firmware's 32-bit GDT before we make EFI service calls, * since the firmware's 32-bit IDT is still currently installed and it * needs to be able to service interrupts. * diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index e94874f4bbc1..a2347ded77ea 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -34,6 +34,7 @@ #include <asm/asm-offsets.h> #include <asm/bootparam.h> #include <asm/desc_defs.h> +#include <asm/trapnr.h> #include "pgtable.h" /* @@ -107,9 +108,19 @@ SYM_FUNC_START(startup_32) movl %eax, %gs movl %eax, %ss -/* setup a stack and make sure cpu supports long mode. */ + /* Setup a stack and load CS from current GDT */ leal rva(boot_stack_end)(%ebp), %esp + pushl $__KERNEL32_CS + leal rva(1f)(%ebp), %eax + pushl %eax + lretl +1: + + /* Setup Exception handling for SEV-ES */ + call startup32_load_idt + + /* Make sure cpu supports long mode. */ call verify_cpu testl %eax, %eax jnz .Lno_longmode @@ -172,11 +183,21 @@ SYM_FUNC_START(startup_32) */ call get_sev_encryption_bit xorl %edx, %edx +#ifdef CONFIG_AMD_MEM_ENCRYPT testl %eax, %eax jz 1f subl $32, %eax /* Encryption bit is always above bit 31 */ bts %eax, %edx /* Set encryption mask for page tables */ + /* + * Mark SEV as active in sev_status so that startup32_check_sev_cbit() + * will do a check. The sev_status memory will be fully initialized + * with the contents of MSR_AMD_SEV_STATUS later in + * set_sev_encryption_mask(). For now it is sufficient to know that SEV + * is active. + */ + movl $1, rva(sev_status)(%ebp) 1: +#endif /* Initialize Page tables to 0 */ leal rva(pgtable)(%ebx), %edi @@ -231,7 +252,7 @@ SYM_FUNC_START(startup_32) /* * Setup for the jump to 64bit mode * - * When the jump is performend we will be in long mode but + * When the jump is performed we will be in long mode but * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 * (and in turn EFER.LMA = 1). To jump into 64bit mode we use * the new gdt/idt that has __KERNEL_CS with CS.L = 1. @@ -261,6 +282,9 @@ SYM_FUNC_START(startup_32) movl %esi, %edx 1: #endif + /* Check if the C-bit position is correct when SEV is active */ + call startup32_check_sev_cbit + pushl $__KERNEL_CS pushl %eax @@ -694,6 +718,19 @@ SYM_DATA_START(boot_idt) .endr SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end) +#ifdef CONFIG_AMD_MEM_ENCRYPT +SYM_DATA_START(boot32_idt_desc) + .word boot32_idt_end - boot32_idt - 1 + .long 0 +SYM_DATA_END(boot32_idt_desc) + .balign 8 +SYM_DATA_START(boot32_idt) + .rept 32 + .quad 0 + .endr +SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end) +#endif + #ifdef CONFIG_EFI_STUB SYM_DATA(image_offset, .long 0) #endif @@ -786,6 +823,137 @@ SYM_DATA_START_LOCAL(loaded_image_proto) SYM_DATA_END(loaded_image_proto) #endif +#ifdef CONFIG_AMD_MEM_ENCRYPT + __HEAD + .code32 +/* + * Write an IDT entry into boot32_idt + * + * Parameters: + * + * %eax: Handler address + * %edx: Vector number + * + * Physical offset is expected in %ebp + */ +SYM_FUNC_START(startup32_set_idt_entry) + push %ebx + push %ecx + + /* IDT entry address to %ebx */ + leal rva(boot32_idt)(%ebp), %ebx + shl $3, %edx + addl %edx, %ebx + + /* Build IDT entry, lower 4 bytes */ + movl %eax, %edx + andl $0x0000ffff, %edx # Target code segment offset [15:0] + movl $__KERNEL32_CS, %ecx # Target code segment selector + shl $16, %ecx + orl %ecx, %edx + + /* Store lower 4 bytes to IDT */ + movl %edx, (%ebx) + + /* Build IDT entry, upper 4 bytes */ + movl %eax, %edx + andl $0xffff0000, %edx # Target code segment offset [31:16] + orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate + + /* Store upper 4 bytes to IDT */ + movl %edx, 4(%ebx) + + pop %ecx + pop %ebx + ret +SYM_FUNC_END(startup32_set_idt_entry) +#endif + +SYM_FUNC_START(startup32_load_idt) +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* #VC handler */ + leal rva(startup32_vc_handler)(%ebp), %eax + movl $X86_TRAP_VC, %edx + call startup32_set_idt_entry + + /* Load IDT */ + leal rva(boot32_idt)(%ebp), %eax + movl %eax, rva(boot32_idt_desc+2)(%ebp) + lidt rva(boot32_idt_desc)(%ebp) +#endif + ret +SYM_FUNC_END(startup32_load_idt) + +/* + * Check for the correct C-bit position when the startup_32 boot-path is used. + * + * The check makes use of the fact that all memory is encrypted when paging is + * disabled. The function creates 64 bits of random data using the RDRAND + * instruction. RDRAND is mandatory for SEV guests, so always available. If the + * hypervisor violates that the kernel will crash right here. + * + * The 64 bits of random data are stored to a memory location and at the same + * time kept in the %eax and %ebx registers. Since encryption is always active + * when paging is off the random data will be stored encrypted in main memory. + * + * Then paging is enabled. When the C-bit position is correct all memory is + * still mapped encrypted and comparing the register values with memory will + * succeed. An incorrect C-bit position will map all memory unencrypted, so that + * the compare will use the encrypted random data and fail. + */ +SYM_FUNC_START(startup32_check_sev_cbit) +#ifdef CONFIG_AMD_MEM_ENCRYPT + pushl %eax + pushl %ebx + pushl %ecx + pushl %edx + + /* Check for non-zero sev_status */ + movl rva(sev_status)(%ebp), %eax + testl %eax, %eax + jz 4f + + /* + * Get two 32-bit random values - Don't bail out if RDRAND fails + * because it is better to prevent forward progress if no random value + * can be gathered. + */ +1: rdrand %eax + jnc 1b +2: rdrand %ebx + jnc 2b + + /* Store to memory and keep it in the registers */ + movl %eax, rva(sev_check_data)(%ebp) + movl %ebx, rva(sev_check_data+4)(%ebp) + + /* Enable paging to see if encryption is active */ + movl %cr0, %edx /* Backup %cr0 in %edx */ + movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */ + movl %ecx, %cr0 + + cmpl %eax, rva(sev_check_data)(%ebp) + jne 3f + cmpl %ebx, rva(sev_check_data+4)(%ebp) + jne 3f + + movl %edx, %cr0 /* Restore previous %cr0 */ + + jmp 4f + +3: /* Check failed - hlt the machine */ + hlt + jmp 3b + +4: + popl %edx + popl %ecx + popl %ebx + popl %eax +#endif + ret +SYM_FUNC_END(startup32_check_sev_cbit) + /* * Stack and heap for uncompression */ diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c index 804a502ee0d2..9b93567d663a 100644 --- a/arch/x86/boot/compressed/idt_64.c +++ b/arch/x86/boot/compressed/idt_64.c @@ -52,3 +52,17 @@ void load_stage2_idt(void) load_boot_idt(&boot_idt_desc); } + +void cleanup_exception_handling(void) +{ + /* + * Flush GHCB from cache and map it encrypted again when running as + * SEV-ES guest. + */ + sev_es_shutdown_ghcb(); + + /* Set a null-idt, disabling #PF and #VC handling */ + boot_idt_desc.size = 0; + boot_idt_desc.address = 0; + load_boot_idt(&boot_idt_desc); +} diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index b92fffbe761f..e36690778497 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -639,9 +639,9 @@ static bool process_mem_region(struct mem_vector *region, if (slot_area_index == MAX_SLOT_AREA) { debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n"); - return 1; + return true; } - return 0; + return false; } #if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI) diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index aa561795efd1..c1e81a848b2a 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -23,12 +23,6 @@ SYM_FUNC_START(get_sev_encryption_bit) push %ecx push %edx - /* Check if running under a hypervisor */ - movl $1, %eax - cpuid - bt $31, %ecx /* Check the hypervisor bit */ - jnc .Lno_sev - movl $0x80000000, %eax /* CPUID to check the highest leaf */ cpuid cmpl $0x8000001f, %eax /* See if 0x8000001f is available */ @@ -67,10 +61,132 @@ SYM_FUNC_START(get_sev_encryption_bit) ret SYM_FUNC_END(get_sev_encryption_bit) +/** + * sev_es_req_cpuid - Request a CPUID value from the Hypervisor using + * the GHCB MSR protocol + * + * @%eax: Register to request (0=EAX, 1=EBX, 2=ECX, 3=EDX) + * @%edx: CPUID Function + * + * Returns 0 in %eax on success, non-zero on failure + * %edx returns CPUID value on success + */ +SYM_CODE_START_LOCAL(sev_es_req_cpuid) + shll $30, %eax + orl $0x00000004, %eax + movl $MSR_AMD64_SEV_ES_GHCB, %ecx + wrmsr + rep; vmmcall # VMGEXIT + rdmsr + + /* Check response */ + movl %eax, %ecx + andl $0x3ffff000, %ecx # Bits [12-29] MBZ + jnz 2f + + /* Check return code */ + andl $0xfff, %eax + cmpl $5, %eax + jne 2f + + /* All good - return success */ + xorl %eax, %eax +1: + ret +2: + movl $-1, %eax + jmp 1b +SYM_CODE_END(sev_es_req_cpuid) + +SYM_CODE_START(startup32_vc_handler) + pushl %eax + pushl %ebx + pushl %ecx + pushl %edx + + /* Keep CPUID function in %ebx */ + movl %eax, %ebx + + /* Check if error-code == SVM_EXIT_CPUID */ + cmpl $0x72, 16(%esp) + jne .Lfail + + movl $0, %eax # Request CPUID[fn].EAX + movl %ebx, %edx # CPUID fn + call sev_es_req_cpuid # Call helper + testl %eax, %eax # Check return code + jnz .Lfail + movl %edx, 12(%esp) # Store result + + movl $1, %eax # Request CPUID[fn].EBX + movl %ebx, %edx # CPUID fn + call sev_es_req_cpuid # Call helper + testl %eax, %eax # Check return code + jnz .Lfail + movl %edx, 8(%esp) # Store result + + movl $2, %eax # Request CPUID[fn].ECX + movl %ebx, %edx # CPUID fn + call sev_es_req_cpuid # Call helper + testl %eax, %eax # Check return code + jnz .Lfail + movl %edx, 4(%esp) # Store result + + movl $3, %eax # Request CPUID[fn].EDX + movl %ebx, %edx # CPUID fn + call sev_es_req_cpuid # Call helper + testl %eax, %eax # Check return code + jnz .Lfail + movl %edx, 0(%esp) # Store result + + /* + * Sanity check CPUID results from the Hypervisor. See comment in + * do_vc_no_ghcb() for more details on why this is necessary. + */ + + /* Fail if SEV leaf not available in CPUID[0x80000000].EAX */ + cmpl $0x80000000, %ebx + jne .Lcheck_sev + cmpl $0x8000001f, 12(%esp) + jb .Lfail + jmp .Ldone + +.Lcheck_sev: + /* Fail if SEV bit not set in CPUID[0x8000001f].EAX[1] */ + cmpl $0x8000001f, %ebx + jne .Ldone + btl $1, 12(%esp) + jnc .Lfail + +.Ldone: + popl %edx + popl %ecx + popl %ebx + popl %eax + + /* Remove error code */ + addl $4, %esp + + /* Jump over CPUID instruction */ + addl $2, (%esp) + + iret +.Lfail: + /* Send terminate request to Hypervisor */ + movl $0x100, %eax + xorl %edx, %edx + movl $MSR_AMD64_SEV_ES_GHCB, %ecx + wrmsr + rep; vmmcall + + /* If request fails, go to hlt loop */ + hlt + jmp .Lfail +SYM_CODE_END(startup32_vc_handler) + .code64 #include "../../kernel/sev_verify_cbit.S" - SYM_FUNC_START(set_sev_encryption_mask) #ifdef CONFIG_AMD_MEM_ENCRYPT push %rbp diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 267e7f93050e..dde042f64cca 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -430,8 +430,6 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, error("Destination address too large"); #endif #ifndef CONFIG_RELOCATABLE - if ((unsigned long)output != LOAD_PHYSICAL_ADDR) - error("Destination address does not match LOAD_PHYSICAL_ADDR"); if (virt_addr != LOAD_PHYSICAL_ADDR) error("Destination virtual address changed when not relocatable"); #endif @@ -443,11 +441,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, handle_relocations(output, output_len, virt_addr); debug_putstr("done.\nBooting the kernel.\n"); - /* - * Flush GHCB from cache and map it encrypted again when running as - * SEV-ES guest. - */ - sev_es_shutdown_ghcb(); + /* Disable exception handling before booting the kernel */ + cleanup_exception_handling(); return output; } diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 901ea5ebec22..e5612f035498 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -155,6 +155,12 @@ extern pteval_t __default_kernel_pte_mask; extern gate_desc boot_idt[BOOT_IDT_ENTRIES]; extern struct desc_ptr boot_idt_desc; +#ifdef CONFIG_X86_64 +void cleanup_exception_handling(void); +#else +static inline void cleanup_exception_handling(void) { } +#endif + /* IDT Entry Points */ void boot_page_fault(void); void boot_stage1_vc(void); diff --git a/arch/x86/boot/compressed/sev-es.c b/arch/x86/boot/compressed/sev-es.c index 27826c265aab..d904bd56b3e3 100644 --- a/arch/x86/boot/compressed/sev-es.c +++ b/arch/x86/boot/compressed/sev-es.c @@ -200,14 +200,8 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) } finish: - if (result == ES_OK) { + if (result == ES_OK) vc_finish_insn(&ctxt); - } else if (result != ES_RETRY) { - /* - * For now, just halt the machine. That makes debugging easier, - * later we just call sev_es_terminate() here. - */ - while (true) - asm volatile("hlt\n"); - } + else if (result != ES_RETRY) + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); } diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index 7c4c7b2fbf05..98cf3b4e4c9f 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -24,7 +24,7 @@ /* * Copyright 2012 Xyratex Technology Limited * - * Wrappers for kernel crypto shash api to pclmulqdq crc32 imlementation. + * Wrappers for kernel crypto shash api to pclmulqdq crc32 implementation. */ #include <linux/init.h> #include <linux/module.h> diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c index 5af8021b98ce..6706b6cb1d0f 100644 --- a/arch/x86/crypto/curve25519-x86_64.c +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -114,11 +114,11 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2) ); } -/* Computes the field substraction of two field elements */ +/* Computes the field subtraction of two field elements */ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2) { asm volatile( - /* Compute the raw substraction of f1-f2 */ + /* Compute the raw subtraction of f1-f2 */ " movq 0(%1), %%r8;" " subq 0(%2), %%r8;" " movq 8(%1), %%r9;" @@ -135,7 +135,7 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2) " mov $38, %%rcx;" " cmovc %%rcx, %%rax;" - /* Step 2: Substract carry*38 from the original difference */ + /* Step 2: Subtract carry*38 from the original difference */ " sub %%rax, %%r8;" " sbb $0, %%r9;" " sbb $0, %%r10;" diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 646da46e8d10..1dfb8af48a3c 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -16,7 +16,7 @@ #include <asm/simd.h> asmlinkage void poly1305_init_x86_64(void *ctx, - const u8 key[POLY1305_KEY_SIZE]); + const u8 key[POLY1305_BLOCK_SIZE]); asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, const size_t len, const u32 padbit); asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], @@ -81,7 +81,7 @@ static void convert_to_base2_64(void *ctx) state->is_base2_26 = 0; } -static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE]) +static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE]) { poly1305_init_x86_64(ctx, key); } @@ -129,7 +129,7 @@ static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], poly1305_emit_avx(ctx, mac, nonce); } -void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) { poly1305_simd_init(&dctx->h, key); dctx->s[0] = get_unaligned_le32(&key[16]); diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S index fc23552afe37..bca4cea757ce 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S @@ -88,7 +88,7 @@ /* * Combined G1 & G2 function. Reordered with help of rotates to have moves - * at begining. + * at beginning. */ #define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \ /* G1,1 && G2,1 */ \ diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 03725696397c..3507cf2064f1 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -117,7 +117,7 @@ static bool is_blacklisted_cpu(void) * storing blocks in 64bit registers to allow three blocks to * be processed parallel. Parallel operation then allows gaining * more performance than was trade off, on out-of-order CPUs. - * However Atom does not benefit from this parallellism and + * However Atom does not benefit from this parallelism and * should be blacklisted. */ return true; diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index a2433ae8a65e..4efd39aacb9f 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -128,7 +128,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) regs->ax = -EFAULT; instrumentation_end(); - syscall_exit_to_user_mode(regs); + local_irq_disable(); + irqentry_exit_to_user_mode(regs); return false; } diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index df8c017e6161..ff0034740900 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -40,7 +40,7 @@ #include <asm/processor-flags.h> #include <asm/irq_vectors.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/asm.h> #include <asm/smap.h> #include <asm/frame.h> @@ -209,7 +209,7 @@ * * Lets build a 5 entry IRET frame after that, such that struct pt_regs * is complete and in particular regs->sp is correct. This gives us - * the original 6 enties as gap: + * the original 6 entries as gap: * * 14*4(%esp) - <previous context> * 13*4(%esp) - gap / flags @@ -430,7 +430,7 @@ * will soon execute iret and the tracer was already set to * the irqstate after the IRET: */ - DISABLE_INTERRUPTS(CLBR_ANY) + cli lss (%esp), %esp /* switch to espfix segment */ .Lend_\@: #endif /* CONFIG_X86_ESPFIX32 */ @@ -1077,7 +1077,7 @@ restore_all_switch_stack: * when returning from IPI handler and when returning from * scheduler to user-space. */ - INTERRUPT_RETURN + iret .section .fixup, "ax" SYM_CODE_START(asm_iret_error) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 400908dff42e..a16a5294d55f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -305,7 +305,7 @@ SYM_CODE_END(ret_from_fork) .macro DEBUG_ENTRY_ASSERT_IRQS_OFF #ifdef CONFIG_DEBUG_ENTRY pushq %rax - SAVE_FLAGS(CLBR_RAX) + SAVE_FLAGS testl $X86_EFLAGS_IF, %eax jz .Lokay_\@ ud2 @@ -511,7 +511,7 @@ SYM_CODE_START(\asmsym) /* * No need to switch back to the IST stack. The current stack is either * identical to the stack in the IRET frame or the VC fall-back stack, - * so it is definitly mapped even with PTI enabled. + * so it is definitely mapped even with PTI enabled. */ jmp paranoid_exit diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 541fdaf64045..0051cf5c792d 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -210,6 +210,8 @@ SYM_CODE_START(entry_SYSCALL_compat) /* Switch to the kernel stack */ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp +SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) + /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ pushq %r8 /* pt_regs->sp */ diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 2d0f3d8bcc25..edfe9780f6d1 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -218,7 +218,7 @@ int main(int argc, char **argv) /* * Figure out the struct name. If we're writing to a .so file, - * generate raw output insted. + * generate raw output instead. */ name = strdup(argv[3]); namelen = strlen(name); diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index de1fff7188aa..6ddd7a937b3e 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -6,7 +6,7 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> .text .globl __kernel_vsyscall @@ -29,7 +29,7 @@ __kernel_vsyscall: * anyone with an AMD CPU, for example). Nonetheless, we try to keep * it working approximately as well as it ever worked. * - * This link may eludicate some of the history: + * This link may elucidate some of the history: * https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7 * personally, I find it hard to understand what's going on there. * diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 825e829ffff1..235a5794296a 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -358,7 +358,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) mmap_write_lock(mm); /* * Check if we have already mapped vdso blob - fail to prevent - * abusing from userspace install_speciall_mapping, which may + * abusing from userspace install_special_mapping, which may * not do accounting and rlimit right. * We could search vma near context.vdso, but it's a slowpath, * so let's explicitly check all VMAs to be completely sure. diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S index 86a0e94f68df..99dafac992e2 100644 --- a/arch/x86/entry/vdso/vsgx.S +++ b/arch/x86/entry/vdso/vsgx.S @@ -137,7 +137,7 @@ SYM_FUNC_START(__vdso_sgx_enter_enclave) /* * If the return from callback is zero or negative, return immediately, - * else re-execute ENCLU with the postive return value interpreted as + * else re-execute ENCLU with the positive return value interpreted as * the requested ENCLU function. */ cmp $0, %eax diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 2c1791c4a518..9687a8aef01c 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -623,7 +623,7 @@ static void amd_pmu_disable_all(void) /* * Check each counter for overflow and wait for it to be reset by the * NMI if it has overflowed. This relies on the fact that all active - * counters are always enabled when this function is caled and + * counters are always enabled when this function is called and * ARCH_PERFMON_EVENTSEL_INT is always set. */ for (idx = 0; idx < x86_pmu.num_counters; idx++) { diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h index 0e5c036fd7be..e6493a67f1c6 100644 --- a/arch/x86/events/amd/iommu.h +++ b/arch/x86/events/amd/iommu.h @@ -17,7 +17,7 @@ #define IOMMU_PC_DEVID_MATCH_REG 0x20 #define IOMMU_PC_COUNTER_REPORT_REG 0x28 -/* maximun specified bank/counters */ +/* maximum specified bank/counters */ #define PC_MAX_SPEC_BNKS 64 #define PC_MAX_SPEC_CNTRS 16 diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6ddeed3cd2ac..4c31cae4707e 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -81,7 +81,11 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx); DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); -DEFINE_STATIC_CALL_NULL(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); +/* + * This one is magic, it will get called even when PMU init fails (because + * there is no PMU), in which case it should simply return NULL. + */ +DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] @@ -761,7 +765,7 @@ struct perf_sched { }; /* - * Initialize interator that runs through all events and counters. + * Initialize iterator that runs through all events and counters. */ static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, int num, int wmin, int wmax, int gpmax) @@ -1944,13 +1948,6 @@ static void _x86_pmu_read(struct perf_event *event) x86_perf_event_update(event); } -static inline struct perf_guest_switch_msr * -perf_guest_get_msrs_nop(int *nr) -{ - *nr = 0; - return NULL; -} - static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -2025,7 +2022,7 @@ static int __init init_hw_perf_events(void) x86_pmu.read = _x86_pmu_read; if (!x86_pmu.guest_get_msrs) - x86_pmu.guest_get_msrs = perf_guest_get_msrs_nop; + x86_pmu.guest_get_msrs = (void *)&__static_call_return0; x86_pmu_static_call_update(); diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 731dd8d0dbb1..6320d2cfd9d3 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -594,7 +594,7 @@ static __init int bts_init(void) * we cannot use the user mapping since it will not be available * if we're not running the owning process. * - * With PTI we can't use the kernal map either, because its not + * With PTI we can't use the kernel map either, because its not * there when we run userspace. * * For now, disable this driver when using PTI. diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 5bac48d5c18e..3fd69bd5fa6e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -137,7 +137,7 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly = FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */ + INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMPTY */ INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */ INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ @@ -2186,7 +2186,7 @@ static void intel_pmu_enable_all(int added) * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either * in sequence on the same PMC or on different PMCs. * - * In practise it appears some of these events do in fact count, and + * In practice it appears some of these events do in fact count, and * we need to program all 4 events. */ static void intel_pmu_nhm_workaround(void) @@ -2435,7 +2435,7 @@ static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx) /* * The metric is reported as an 8bit integer fraction - * suming up to 0xff. + * summing up to 0xff. * slots-in-metric = (Metric / 0xff) * slots */ val = (metric >> ((idx - INTEL_PMC_IDX_METRIC_BASE) * 8)) & 0xff; @@ -2776,7 +2776,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) * processing loop coming after that the function, otherwise * phony regular samples may be generated in the sampling buffer * not marked with the EXACT tag. Another possibility is to have - * one PEBS event and at least one non-PEBS event whic hoverflows + * one PEBS event and at least one non-PEBS event which overflows * while PEBS has armed. In this case, bit 62 of GLOBAL_STATUS will * not be set, yet the overflow status bit for the PEBS counter will * be on Skylake. @@ -2824,7 +2824,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) } /* - * Intel Perf mertrics + * Intel Perf metrics */ if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) { handled++; @@ -3659,11 +3659,16 @@ static int intel_pmu_hw_config(struct perf_event *event) return ret; if (event->attr.precise_ip) { + if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT) + return -EINVAL; + if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & - ~intel_pmu_large_pebs_flags(event))) + ~intel_pmu_large_pebs_flags(event))) { event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS; + event->attach_state |= PERF_ATTACH_SCHED_CB; + } } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); @@ -3676,6 +3681,7 @@ static int intel_pmu_hw_config(struct perf_event *event) ret = intel_pmu_setup_lbr_filter(event); if (ret) return ret; + event->attach_state |= PERF_ATTACH_SCHED_CB; /* * BTS is set up earlier in this path, so don't account twice @@ -4510,7 +4516,7 @@ static const struct x86_cpu_desc isolation_ucodes[] = { INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 3, 0x07000009), INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 4, 0x0f000009), INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 5, 0x0e000002), - INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 1, 0x0b000014), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 5, 0x00000000), @@ -4588,7 +4594,7 @@ static bool check_msr(unsigned long msr, u64 mask) /* * Disable the check for real HW, so we don't - * mess with potentionaly enabled registers: + * mess with potentially enabled registers: */ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) return true; @@ -4653,7 +4659,7 @@ static __init void intel_arch_events_quirk(void) { int bit; - /* disable event that reported as not presend by cpuid */ + /* disable event that reported as not present by cpuid */ for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; pr_warn("CPUID marked event: \'%s\' unavailable\n", diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 7ebae1826403..5aabb0e2964a 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1805,7 +1805,7 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count) * * [-period, 0] * - * the difference between two consequtive reads is: + * the difference between two consecutive reads is: * * A) value2 - value1; * when no overflows have happened in between, @@ -2010,7 +2010,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d */ if (!pebs_status && cpuc->pebs_enabled && !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1))) - pebs_status = cpuc->pebs_enabled; + pebs_status = p->status = cpuc->pebs_enabled; bit = find_first_bit((unsigned long *)&pebs_status, x86_pmu.max_pebs_events); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 21890dacfcfe..acb04ef3da3f 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1198,7 +1198,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) /* * The LBR logs any address in the IP, even if the IP just * faulted. This means userspace can control the from address. - * Ensure we don't blindy read any address by validating it is + * Ensure we don't blindly read any address by validating it is * a known text address. */ if (kernel_text_address(from)) { diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index a4cc66005ce8..971dffe0b77d 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -24,7 +24,7 @@ struct p4_event_bind { unsigned int escr_msr[2]; /* ESCR MSR for this event */ unsigned int escr_emask; /* valid ESCR EventMask bits */ unsigned int shared; /* event is shared across threads */ - char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ + char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */ }; struct p4_pebs_bind { @@ -45,7 +45,7 @@ struct p4_pebs_bind { * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of * event configuration to find out which values are to be * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT - * resgisters + * registers */ static struct p4_pebs_bind p4_pebs_bind_map[] = { P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001), @@ -1313,7 +1313,7 @@ static __initconst const struct x86_pmu p4_pmu = { .get_event_constraints = x86_get_event_constraints, /* * IF HT disabled we may need to use all - * ARCH_P4_MAX_CCCR counters simulaneously + * ARCH_P4_MAX_CCCR counters simultaneously * though leave it restricted at moment assuming * HT is on */ diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index e94af4a54d0d..915847655c06 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -362,7 +362,7 @@ static bool pt_event_valid(struct perf_event *event) /* * Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config - * clears the assomption that BranchEn must always be enabled, + * clears the assumption that BranchEn must always be enabled, * as was the case with the first implementation of PT. * If this bit is not set, the legacy behavior is preserved * for compatibility with the older userspace. diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index b79951d0707c..4bba0491068c 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -280,17 +280,17 @@ * | [63] | 00h | VALID - When set, indicates the CPU bus * numbers have been initialized. (RO) * |[62:48]| --- | Reserved - * |[47:40]| 00h | BUS_NUM_5 — Return the bus number BIOS assigned + * |[47:40]| 00h | BUS_NUM_5 - Return the bus number BIOS assigned * CPUBUSNO(5). (RO) - * |[39:32]| 00h | BUS_NUM_4 — Return the bus number BIOS assigned + * |[39:32]| 00h | BUS_NUM_4 - Return the bus number BIOS assigned * CPUBUSNO(4). (RO) - * |[31:24]| 00h | BUS_NUM_3 — Return the bus number BIOS assigned + * |[31:24]| 00h | BUS_NUM_3 - Return the bus number BIOS assigned * CPUBUSNO(3). (RO) - * |[23:16]| 00h | BUS_NUM_2 — Return the bus number BIOS assigned + * |[23:16]| 00h | BUS_NUM_2 - Return the bus number BIOS assigned * CPUBUSNO(2). (RO) - * |[15:8] | 00h | BUS_NUM_1 — Return the bus number BIOS assigned + * |[15:8] | 00h | BUS_NUM_1 - Return the bus number BIOS assigned * CPUBUSNO(1). (RO) - * | [7:0] | 00h | BUS_NUM_0 — Return the bus number BIOS assigned + * | [7:0] | 00h | BUS_NUM_0 - Return the bus number BIOS assigned * CPUBUSNO(0). (RO) */ #define SKX_MSR_CPU_BUS_NUMBER 0x300 @@ -1159,7 +1159,6 @@ enum { SNBEP_PCI_QPI_PORT0_FILTER, SNBEP_PCI_QPI_PORT1_FILTER, BDX_PCI_QPI_PORT2_FILTER, - HSWEP_PCI_PCU_3, }; static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) @@ -2857,22 +2856,33 @@ static struct intel_uncore_type *hswep_msr_uncores[] = { NULL, }; -void hswep_uncore_cpu_init(void) +#define HSWEP_PCU_DID 0x2fc0 +#define HSWEP_PCU_CAPID4_OFFET 0x94 +#define hswep_get_chop(_cap) (((_cap) >> 6) & 0x3) + +static bool hswep_has_limit_sbox(unsigned int device) { - int pkg = boot_cpu_data.logical_proc_id; + struct pci_dev *dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, NULL); + u32 capid4; + + if (!dev) + return false; + + pci_read_config_dword(dev, HSWEP_PCU_CAPID4_OFFET, &capid4); + if (!hswep_get_chop(capid4)) + return true; + return false; +} + +void hswep_uncore_cpu_init(void) +{ if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; /* Detect 6-8 core systems with only two SBOXes */ - if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) { - u32 capid4; - - pci_read_config_dword(uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3], - 0x94, &capid4); - if (((capid4 >> 6) & 0x3) == 0) - hswep_uncore_sbox.num_boxes = 2; - } + if (hswep_has_limit_sbox(HSWEP_PCU_DID)) + hswep_uncore_sbox.num_boxes = 2; uncore_msr_uncores = hswep_msr_uncores; } @@ -3135,11 +3145,6 @@ static const struct pci_device_id hswep_uncore_pci_ids[] = { .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, SNBEP_PCI_QPI_PORT1_FILTER), }, - { /* PCU.3 (for Capability registers) */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - HSWEP_PCI_PCU_3), - }, { /* end: all zeroes */ } }; @@ -3231,27 +3236,18 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = { EVENT_CONSTRAINT_END }; +#define BDX_PCU_DID 0x6fc0 + void bdx_uncore_cpu_init(void) { - int pkg = topology_phys_to_logical_pkg(boot_cpu_data.phys_proc_id); - if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; uncore_msr_uncores = bdx_msr_uncores; - /* BDX-DE doesn't have SBOX */ - if (boot_cpu_data.x86_model == 86) { - uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; /* Detect systems with no SBOXes */ - } else if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) { - struct pci_dev *pdev; - u32 capid4; - - pdev = uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]; - pci_read_config_dword(pdev, 0x94, &capid4); - if (((capid4 >> 6) & 0x3) == 0) - bdx_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; - } + if ((boot_cpu_data.x86_model == 86) || hswep_has_limit_sbox(BDX_PCU_DID)) + uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; + hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints; } @@ -3472,11 +3468,6 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = { .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, BDX_PCI_QPI_PORT2_FILTER), }, - { /* PCU.3 (for Capability registers) */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fc0), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - HSWEP_PCI_PCU_3), - }, { /* end: all zeroes */ } }; diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c index e68827e604ad..949d845c922b 100644 --- a/arch/x86/events/zhaoxin/core.c +++ b/arch/x86/events/zhaoxin/core.c @@ -494,7 +494,7 @@ static __init void zhaoxin_arch_events_quirk(void) { int bit; - /* disable event that reported as not presend by cpuid */ + /* disable event that reported as not present by cpuid */ for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) { zx_pmon_event_map[zx_arch_events_map[bit].id] = 0; pr_warn("CPUID marked event: \'%s\' unavailable\n", diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index b81047dec1da..e7b94f636cc1 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -162,7 +162,7 @@ EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation); static inline bool hv_reenlightenment_available(void) { /* - * Check for required features and priviliges to make TSC frequency + * Check for required features and privileges to make TSC frequency * change notifications work. */ return ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && @@ -292,7 +292,7 @@ static int hv_suspend(void) /* * Reset the hypercall page as it is going to be invalidated - * accross hibernation. Setting hv_hypercall_pg to NULL ensures + * across hibernation. Setting hv_hypercall_pg to NULL ensures * that any subsequent hypercall operation fails safely instead of * crashing due to an access of an invalid page. The hypercall page * pointer is restored on resume. diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h index 62da760d6d5a..cd7b14322035 100644 --- a/arch/x86/include/asm/agp.h +++ b/arch/x86/include/asm/agp.h @@ -9,7 +9,7 @@ * Functions to keep the agpgart mappings coherent with the MMU. The * GART gives the CPU a physical alias of pages in memory. The alias * region is mapped uncacheable. Make sure there are no conflicting - * mappings with different cachability attributes for the same + * mappings with different cacheability attributes for the same * page. This avoids data corruption on some CPUs. */ diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h deleted file mode 100644 index 464034db299f..000000000000 --- a/arch/x86/include/asm/alternative-asm.h +++ /dev/null @@ -1,114 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_ALTERNATIVE_ASM_H -#define _ASM_X86_ALTERNATIVE_ASM_H - -#ifdef __ASSEMBLY__ - -#include <asm/asm.h> - -#ifdef CONFIG_SMP - .macro LOCK_PREFIX -672: lock - .pushsection .smp_locks,"a" - .balign 4 - .long 672b - . - .popsection - .endm -#else - .macro LOCK_PREFIX - .endm -#endif - -/* - * objtool annotation to ignore the alternatives and only consider the original - * instruction(s). - */ -.macro ANNOTATE_IGNORE_ALTERNATIVE - .Lannotate_\@: - .pushsection .discard.ignore_alts - .long .Lannotate_\@ - . - .popsection -.endm - -/* - * Issue one struct alt_instr descriptor entry (need to put it into - * the section .altinstructions, see below). This entry contains - * enough information for the alternatives patching code to patch an - * instruction. See apply_alternatives(). - */ -.macro altinstruction_entry orig alt feature orig_len alt_len pad_len - .long \orig - . - .long \alt - . - .word \feature - .byte \orig_len - .byte \alt_len - .byte \pad_len -.endm - -/* - * Define an alternative between two instructions. If @feature is - * present, early code in apply_alternatives() replaces @oldinstr with - * @newinstr. ".skip" directive takes care of proper instruction padding - * in case @newinstr is longer than @oldinstr. - */ -.macro ALTERNATIVE oldinstr, newinstr, feature -140: - \oldinstr -141: - .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 -142: - - .pushsection .altinstructions,"a" - altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b - .popsection - - .pushsection .altinstr_replacement,"ax" -143: - \newinstr -144: - .popsection -.endm - -#define old_len 141b-140b -#define new_len1 144f-143f -#define new_len2 145f-144f - -/* - * gas compatible max based on the idea from: - * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax - * - * The additional "-" is needed because gas uses a "true" value of -1. - */ -#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) - - -/* - * Same as ALTERNATIVE macro above but for two alternatives. If CPU - * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has - * @feature2, it replaces @oldinstr with @feature2. - */ -.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 -140: - \oldinstr -141: - .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \ - (alt_max_short(new_len1, new_len2) - (old_len)),0x90 -142: - - .pushsection .altinstructions,"a" - altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b - altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b - .popsection - - .pushsection .altinstr_replacement,"ax" -143: - \newinstr1 -144: - \newinstr2 -145: - .popsection -.endm - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_X86_ALTERNATIVE_ASM_H */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 13adca37c99a..17b36090d448 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -2,13 +2,17 @@ #ifndef _ASM_X86_ALTERNATIVE_H #define _ASM_X86_ALTERNATIVE_H -#ifndef __ASSEMBLY__ - #include <linux/types.h> -#include <linux/stddef.h> #include <linux/stringify.h> #include <asm/asm.h> +#define ALTINSTR_FLAG_INV (1 << 15) +#define ALT_NOT(feat) ((feat) | ALTINSTR_FLAG_INV) + +#ifndef __ASSEMBLY__ + +#include <linux/stddef.h> + /* * Alternative inline assembly for SMP. * @@ -150,7 +154,7 @@ static inline int alternatives_text_reserved(void *start, void *end) " .byte " alt_rlen(num) "\n" /* replacement len */ \ " .byte " alt_pad_len "\n" /* pad len */ -#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \ +#define ALTINSTR_REPLACEMENT(newinstr, num) /* replacement */ \ "# ALT: replacement " #num "\n" \ b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n" @@ -161,7 +165,7 @@ static inline int alternatives_text_reserved(void *start, void *end) ALTINSTR_ENTRY(feature, 1) \ ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ - ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ + ALTINSTR_REPLACEMENT(newinstr, 1) \ ".popsection\n" #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ @@ -171,10 +175,15 @@ static inline int alternatives_text_reserved(void *start, void *end) ALTINSTR_ENTRY(feature2, 2) \ ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ - ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ - ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ + ALTINSTR_REPLACEMENT(newinstr1, 1) \ + ALTINSTR_REPLACEMENT(newinstr2, 2) \ ".popsection\n" +/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */ +#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \ + ALTERNATIVE_2(oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \ + newinstr_yes, feature) + #define ALTERNATIVE_3(oldinsn, newinsn1, feat1, newinsn2, feat2, newinsn3, feat3) \ OLDINSTR_3(oldinsn, 1, 2, 3) \ ".pushsection .altinstructions,\"a\"\n" \ @@ -183,9 +192,9 @@ static inline int alternatives_text_reserved(void *start, void *end) ALTINSTR_ENTRY(feat3, 3) \ ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ - ALTINSTR_REPLACEMENT(newinsn1, feat1, 1) \ - ALTINSTR_REPLACEMENT(newinsn2, feat2, 2) \ - ALTINSTR_REPLACEMENT(newinsn3, feat3, 3) \ + ALTINSTR_REPLACEMENT(newinsn1, 1) \ + ALTINSTR_REPLACEMENT(newinsn2, 2) \ + ALTINSTR_REPLACEMENT(newinsn3, 3) \ ".popsection\n" /* @@ -206,6 +215,9 @@ static inline int alternatives_text_reserved(void *start, void *end) #define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \ asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory") +#define alternative_ternary(oldinstr, feature, newinstr_yes, newinstr_no) \ + asm_inline volatile(ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) ::: "memory") + /* * Alternative inline assembly with input. * @@ -271,6 +283,116 @@ static inline int alternatives_text_reserved(void *start, void *end) */ #define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr +#else /* __ASSEMBLY__ */ + +#ifdef CONFIG_SMP + .macro LOCK_PREFIX +672: lock + .pushsection .smp_locks,"a" + .balign 4 + .long 672b - . + .popsection + .endm +#else + .macro LOCK_PREFIX + .endm +#endif + +/* + * objtool annotation to ignore the alternatives and only consider the original + * instruction(s). + */ +.macro ANNOTATE_IGNORE_ALTERNATIVE + .Lannotate_\@: + .pushsection .discard.ignore_alts + .long .Lannotate_\@ - . + .popsection +.endm + +/* + * Issue one struct alt_instr descriptor entry (need to put it into + * the section .altinstructions, see below). This entry contains + * enough information for the alternatives patching code to patch an + * instruction. See apply_alternatives(). + */ +.macro altinstruction_entry orig alt feature orig_len alt_len pad_len + .long \orig - . + .long \alt - . + .word \feature + .byte \orig_len + .byte \alt_len + .byte \pad_len +.endm + +/* + * Define an alternative between two instructions. If @feature is + * present, early code in apply_alternatives() replaces @oldinstr with + * @newinstr. ".skip" directive takes care of proper instruction padding + * in case @newinstr is longer than @oldinstr. + */ +.macro ALTERNATIVE oldinstr, newinstr, feature +140: + \oldinstr +141: + .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr +144: + .popsection +.endm + +#define old_len 141b-140b +#define new_len1 144f-143f +#define new_len2 145f-144f + +/* + * gas compatible max based on the idea from: + * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + * + * The additional "-" is needed because gas uses a "true" value of -1. + */ +#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) + + +/* + * Same as ALTERNATIVE macro above but for two alternatives. If CPU + * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has + * @feature2, it replaces @oldinstr with @feature2. + */ +.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 +140: + \oldinstr +141: + .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \ + (alt_max_short(new_len1, new_len2) - (old_len)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b + altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr1 +144: + \newinstr2 +145: + .popsection +.endm + +/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */ +#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \ + ALTERNATIVE_2 oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \ + newinstr_yes, feature + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 4d4ec5cbdc51..94fbe6ae7431 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -22,7 +22,7 @@ extern void __add_wrong_size(void) /* * Constants for operation sizes. On 32-bit, the 64-bit size it set to * -1 because sizeof will never return -1, thereby making those switch - * case statements guaranteeed dead code which the compiler will + * case statements guaranteed dead code which the compiler will * eliminate, and allowing the "missing symbol in the default case" to * indicate a usage error. */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 1728d4ce5730..16a51e7288d5 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -8,6 +8,7 @@ #include <asm/asm.h> #include <linux/bitops.h> +#include <asm/alternative.h> enum cpuid_leafs { @@ -175,39 +176,15 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); */ static __always_inline bool _static_cpu_has(u16 bit) { - asm_volatile_goto("1: jmp 6f\n" - "2:\n" - ".skip -(((5f-4f) - (2b-1b)) > 0) * " - "((5f-4f) - (2b-1b)),0x90\n" - "3:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 4f - .\n" /* repl offset */ - " .word %P[always]\n" /* always replace */ - " .byte 3b - 1b\n" /* src len */ - " .byte 5f - 4f\n" /* repl len */ - " .byte 3b - 2b\n" /* pad len */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "4: jmp %l[t_no]\n" - "5:\n" - ".previous\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 0\n" /* no replacement */ - " .word %P[feature]\n" /* feature bit */ - " .byte 3b - 1b\n" /* src len */ - " .byte 0\n" /* repl len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .altinstr_aux,\"ax\"\n" - "6:\n" - " testb %[bitnum],%[cap_byte]\n" - " jnz %l[t_yes]\n" - " jmp %l[t_no]\n" - ".previous\n" + asm_volatile_goto( + ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]") + ".section .altinstr_aux,\"ax\"\n" + "6:\n" + " testb %[bitnum],%[cap_byte]\n" + " jnz %l[t_yes]\n" + " jmp %l[t_no]\n" + ".previous\n" : : [feature] "i" (bit), - [always] "i" (X86_FEATURE_ALWAYS), [bitnum] "i" (1 << (bit & 7)), [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3]) : : t_yes, t_no); diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index cc96e26d69f7..5e2624a9d890 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -236,6 +236,8 @@ #define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ #define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ +#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */ +#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ @@ -290,6 +292,8 @@ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ #define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ +#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ +#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 9224d40cdefe..7d7500806af8 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -283,12 +283,12 @@ extern u32 elf_hwcap2; * * The decision process for determining the results are: * - * CPU: | lacks NX* | has NX, ia32 | has NX, x86_64 | - * ELF: | | | | + * CPU: | lacks NX* | has NX, ia32 | has NX, x86_64 | + * ELF: | | | | * ---------------------|------------|------------------|----------------| - * missing PT_GNU_STACK | exec-all | exec-all | exec-none | - * PT_GNU_STACK == RWX | exec-stack | exec-stack | exec-stack | - * PT_GNU_STACK == RW | exec-none | exec-none | exec-none | + * missing PT_GNU_STACK | exec-all | exec-all | exec-none | + * PT_GNU_STACK == RWX | exec-stack | exec-stack | exec-stack | + * PT_GNU_STACK == RW | exec-none | exec-none | exec-none | * * exec-all : all PROT_READ user mappings are executable, except when * backed by files on a noexec-filesystem. diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 5eb3bdf36a41..e35e342673c7 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -547,7 +547,7 @@ SYM_CODE_END(spurious_entries_start) /* * Dummy trap number so the low level ASM macro vector number checks do not * match which results in emitting plain IDTENTRY stubs without bells and - * whistels. + * whistles. */ #define X86_TRAP_OTHER 0xFFFF diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index a0f839aa144d..98b4dae5e8bc 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -23,6 +23,8 @@ unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx); int insn_get_code_seg_params(struct pt_regs *regs); int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]); +int insn_fetch_from_user_inatomic(struct pt_regs *regs, + unsigned char buf[MAX_INSN_SIZE]); bool insn_decode(struct insn *insn, struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE], int buf_size); diff --git a/arch/x86/include/asm/intel_pconfig.h b/arch/x86/include/asm/intel_pconfig.h index 3cb002b1d0f9..994638ef171b 100644 --- a/arch/x86/include/asm/intel_pconfig.h +++ b/arch/x86/include/asm/intel_pconfig.h @@ -38,7 +38,7 @@ enum pconfig_leaf { #define MKTME_INVALID_ENC_ALG 4 #define MKTME_DEVICE_BUSY 5 -/* Hardware requires the structure to be 256 byte alinged. Otherwise #GP(0). */ +/* Hardware requires the structure to be 256 byte aligned. Otherwise #GP(0). */ struct mktme_key_program { u16 keyid; u32 keyid_ctrl; diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 423b788f495e..ebe8d2ea44fe 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -3,7 +3,7 @@ #define _ASM_X86_INTEL_PT_H #define PT_CPUID_LEAVES 2 -#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ +#define PT_CPUID_REGS_NUM 4 /* number of registers (eax, ebx, ecx, edx) */ enum pt_capabilities { PT_CAP_max_subleaf = 0, diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index d726459d08e5..841a5d104afa 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -159,7 +159,7 @@ static inline void *phys_to_virt(phys_addr_t address) /* * ISA I/O bus memory addresses are 1:1 with the physical address. * However, we truncate the address to unsigned int to avoid undesirable - * promitions in legacy drivers. + * promotions in legacy drivers. */ static inline unsigned int isa_virt_to_bus(volatile void *address) { diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 9b2a0ff76c73..562854c60808 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -190,7 +190,7 @@ /* * Macro to invoke __do_softirq on the irq stack. This is only called from - * task context when bottom halfs are about to be reenabled and soft + * task context when bottom halves are about to be reenabled and soft * interrupts are pending to be processed. The interrupt stack cannot be in * use here. */ diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 144d70ea4393..c5ce9845c999 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -109,18 +109,13 @@ static __always_inline unsigned long arch_local_irq_save(void) } #else -#define ENABLE_INTERRUPTS(x) sti -#define DISABLE_INTERRUPTS(x) cli - #ifdef CONFIG_X86_64 #ifdef CONFIG_DEBUG_ENTRY -#define SAVE_FLAGS(x) pushfq; popq %rax +#define SAVE_FLAGS pushfq; popq %rax #endif #define INTERRUPT_RETURN jmp native_iret -#else -#define INTERRUPT_RETURN iret #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h index 97bbb4a9083a..05b48b33baf0 100644 --- a/arch/x86/include/asm/kfence.h +++ b/arch/x86/include/asm/kfence.h @@ -56,8 +56,13 @@ static inline bool kfence_protect_page(unsigned long addr, bool protect) else set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); - /* Flush this CPU's TLB. */ + /* + * Flush this CPU's TLB, assuming whoever did the allocation/free is + * likely to continue running on this CPU. + */ + preempt_disable(); flush_tlb_one_kernel(addr); + preempt_enable(); return true; } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 877a4025d8da..10eca9e8f7f6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -884,12 +884,29 @@ struct kvm_hv_syndbg { u64 options; }; +/* Current state of Hyper-V TSC page clocksource */ +enum hv_tsc_page_status { + /* TSC page was not set up or disabled */ + HV_TSC_PAGE_UNSET = 0, + /* TSC page MSR was written by the guest, update pending */ + HV_TSC_PAGE_GUEST_CHANGED, + /* TSC page MSR was written by KVM userspace, update pending */ + HV_TSC_PAGE_HOST_CHANGED, + /* TSC page was properly set up and is currently active */ + HV_TSC_PAGE_SET, + /* TSC page is currently being updated and therefore is inactive */ + HV_TSC_PAGE_UPDATING, + /* TSC page was set up with an inaccessible GPA */ + HV_TSC_PAGE_BROKEN, +}; + /* Hyper-V emulation context */ struct kvm_hv { struct mutex hv_lock; u64 hv_guest_os_id; u64 hv_hypercall; u64 hv_tsc_page; + enum hv_tsc_page_status hv_tsc_page_status; /* Hyper-v based guest crash (NT kernel bugcheck) parameters */ u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; @@ -931,6 +948,12 @@ enum kvm_irqchip_mode { KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; +struct kvm_x86_msr_filter { + u8 count; + bool default_allow:1; + struct msr_bitmap_range ranges[16]; +}; + #define APICV_INHIBIT_REASON_DISABLE 0 #define APICV_INHIBIT_REASON_HYPERV 1 #define APICV_INHIBIT_REASON_NESTED 2 @@ -963,7 +986,7 @@ struct kvm_arch { struct kvm_pit *vpit; atomic_t vapics_in_nmi_mode; struct mutex apic_map_lock; - struct kvm_apic_map *apic_map; + struct kvm_apic_map __rcu *apic_map; atomic_t apic_map_dirty; bool apic_access_page_done; @@ -1025,18 +1048,13 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + bool bus_lock_detection_enabled; + /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ u32 user_space_msr_mask; + struct kvm_x86_msr_filter __rcu *msr_filter; - struct { - u8 count; - bool default_allow:1; - struct msr_bitmap_range ranges[16]; - } msr_filter; - - bool bus_lock_detection_enabled; - - struct kvm_pmu_event_filter *pmu_event_filter; + struct kvm_pmu_event_filter __rcu *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; #ifdef CONFIG_X86_64 @@ -1470,7 +1488,7 @@ extern u64 kvm_mce_cap_supported; /* * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing * userspace I/O) to indicate that the emulation context - * should be resued as is, i.e. skip initialization of + * should be reused as is, i.e. skip initialization of * emulation context, instruction fetch and decode. * * EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware. @@ -1495,7 +1513,7 @@ extern u64 kvm_mce_cap_supported; * * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware * backdoor emulation, which is opt in via module param. - * VMware backoor emulation handles select instructions + * VMware backdoor emulation handles select instructions * and reinjects the #GP for all other cases. * * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index ccf60a809a17..e7be720062a8 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -63,7 +63,7 @@ typedef int (*hyperv_fill_flush_list_func)( static __always_inline void hv_setup_sched_clock(void *sched_clock) { #ifdef CONFIG_PARAVIRT - pv_ops.time.sched_clock = sched_clock; + paravirt_set_sched_clock(sched_clock); #endif } diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 546d6ecf0a35..45029354e0a8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -628,8 +628,6 @@ #define MSR_IA32_APICBASE_ENABLE (1<<11) #define MSR_IA32_APICBASE_BASE (0xfffff<<12) -#define MSR_IA32_TSCDEADLINE 0x000006e0 - #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index cb9ad6b73973..c14fb80b9a07 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -7,7 +7,6 @@ #include <linux/objtool.h> #include <asm/alternative.h> -#include <asm/alternative-asm.h> #include <asm/cpufeatures.h> #include <asm/msr-index.h> #include <asm/unwind_hints.h> @@ -33,7 +32,7 @@ /* * Google experimented with loop-unrolling and this turned out to be - * the optimal version — two calls, each with their own speculation + * the optimal version - two calls, each with their own speculation * trap should their return address end up getting used, in a loop. */ #define __FILL_RETURN_BUFFER(reg, nr, sp) \ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 4abf110e2243..43992e5c52c2 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -15,11 +15,20 @@ #include <linux/bug.h> #include <linux/types.h> #include <linux/cpumask.h> +#include <linux/static_call_types.h> #include <asm/frame.h> -static inline unsigned long long paravirt_sched_clock(void) +u64 dummy_steal_clock(int cpu); +u64 dummy_sched_clock(void); + +DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock); +DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock); + +void paravirt_set_sched_clock(u64 (*func)(void)); + +static inline u64 paravirt_sched_clock(void) { - return PVOP_CALL0(unsigned long long, time.sched_clock); + return static_call(pv_sched_clock)(); } struct static_key; @@ -33,9 +42,13 @@ bool pv_is_native_vcpu_is_preempted(void); static inline u64 paravirt_steal_clock(int cpu) { - return PVOP_CALL1(u64, time.steal_clock, cpu); + return static_call(pv_steal_clock)(cpu); } +#ifdef CONFIG_PARAVIRT_SPINLOCKS +void __init paravirt_set_cap(void); +#endif + /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { @@ -122,7 +135,9 @@ static inline void write_cr0(unsigned long x) static inline unsigned long read_cr2(void) { - return PVOP_CALLEE0(unsigned long, mmu.read_cr2); + return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2, + "mov %%cr2, %%rax;", + ALT_NOT(X86_FEATURE_XENPV)); } static inline void write_cr2(unsigned long x) @@ -132,12 +147,14 @@ static inline void write_cr2(unsigned long x) static inline unsigned long __read_cr3(void) { - return PVOP_CALL0(unsigned long, mmu.read_cr3); + return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3, + "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV)); } static inline void write_cr3(unsigned long x) { - PVOP_VCALL1(mmu.write_cr3, x); + PVOP_ALT_VCALL1(mmu.write_cr3, x, + "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV)); } static inline void __write_cr4(unsigned long x) @@ -157,7 +174,7 @@ static inline void halt(void) static inline void wbinvd(void) { - PVOP_VCALL0(cpu.wbinvd); + PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV)); } static inline u64 paravirt_read_msr(unsigned msr) @@ -371,22 +388,28 @@ static inline void paravirt_release_p4d(unsigned long pfn) static inline pte_t __pte(pteval_t val) { - return (pte_t) { PVOP_CALLEE1(pteval_t, mmu.make_pte, val) }; + return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val, + "mov %%rdi, %%rax", + ALT_NOT(X86_FEATURE_XENPV)) }; } static inline pteval_t pte_val(pte_t pte) { - return PVOP_CALLEE1(pteval_t, mmu.pte_val, pte.pte); + return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte, + "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); } static inline pgd_t __pgd(pgdval_t val) { - return (pgd_t) { PVOP_CALLEE1(pgdval_t, mmu.make_pgd, val) }; + return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val, + "mov %%rdi, %%rax", + ALT_NOT(X86_FEATURE_XENPV)) }; } static inline pgdval_t pgd_val(pgd_t pgd) { - return PVOP_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd); + return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd, + "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION @@ -419,12 +442,15 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) static inline pmd_t __pmd(pmdval_t val) { - return (pmd_t) { PVOP_CALLEE1(pmdval_t, mmu.make_pmd, val) }; + return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val, + "mov %%rdi, %%rax", + ALT_NOT(X86_FEATURE_XENPV)) }; } static inline pmdval_t pmd_val(pmd_t pmd) { - return PVOP_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd); + return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd, + "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); } static inline void set_pud(pud_t *pudp, pud_t pud) @@ -436,14 +462,16 @@ static inline pud_t __pud(pudval_t val) { pudval_t ret; - ret = PVOP_CALLEE1(pudval_t, mmu.make_pud, val); + ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val, + "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); return (pud_t) { ret }; } static inline pudval_t pud_val(pud_t pud) { - return PVOP_CALLEE1(pudval_t, mmu.pud_val, pud.pud); + return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud, + "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); } static inline void pud_clear(pud_t *pudp) @@ -462,14 +490,17 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) static inline p4d_t __p4d(p4dval_t val) { - p4dval_t ret = PVOP_CALLEE1(p4dval_t, mmu.make_p4d, val); + p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val, + "mov %%rdi, %%rax", + ALT_NOT(X86_FEATURE_XENPV)); return (p4d_t) { ret }; } static inline p4dval_t p4d_val(p4d_t p4d) { - return PVOP_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d); + return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d, + "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); } static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) @@ -556,7 +587,9 @@ static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock) { - PVOP_VCALLEE1(lock.queued_spin_unlock, lock); + PVOP_ALT_VCALLEE1(lock.queued_spin_unlock, lock, + "movb $0, (%%" _ASM_ARG1 ");", + ALT_NOT(X86_FEATURE_PVUNLOCK)); } static __always_inline void pv_wait(u8 *ptr, u8 val) @@ -571,7 +604,9 @@ static __always_inline void pv_kick(int cpu) static __always_inline bool pv_vcpu_is_preempted(long cpu) { - return PVOP_CALLEE1(bool, lock.vcpu_is_preempted, cpu); + return PVOP_ALT_CALLEE1(bool, lock.vcpu_is_preempted, cpu, + "xor %%" _ASM_AX ", %%" _ASM_AX ";", + ALT_NOT(X86_FEATURE_VCPUPREEMPT)); } void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock); @@ -645,17 +680,18 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); #ifdef CONFIG_PARAVIRT_XXL static inline notrace unsigned long arch_local_save_flags(void) { - return PVOP_CALLEE0(unsigned long, irq.save_fl); + return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;", + ALT_NOT(X86_FEATURE_XENPV)); } static inline notrace void arch_local_irq_disable(void) { - PVOP_VCALLEE0(irq.irq_disable); + PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV)); } static inline notrace void arch_local_irq_enable(void) { - PVOP_VCALLEE0(irq.irq_enable); + PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV)); } static inline notrace unsigned long arch_local_irq_save(void) @@ -700,84 +736,27 @@ extern void default_banner(void); .popsection -#define COND_PUSH(set, mask, reg) \ - .if ((~(set)) & mask); push %reg; .endif -#define COND_POP(set, mask, reg) \ - .if ((~(set)) & mask); pop %reg; .endif - #ifdef CONFIG_X86_64 - -#define PV_SAVE_REGS(set) \ - COND_PUSH(set, CLBR_RAX, rax); \ - COND_PUSH(set, CLBR_RCX, rcx); \ - COND_PUSH(set, CLBR_RDX, rdx); \ - COND_PUSH(set, CLBR_RSI, rsi); \ - COND_PUSH(set, CLBR_RDI, rdi); \ - COND_PUSH(set, CLBR_R8, r8); \ - COND_PUSH(set, CLBR_R9, r9); \ - COND_PUSH(set, CLBR_R10, r10); \ - COND_PUSH(set, CLBR_R11, r11) -#define PV_RESTORE_REGS(set) \ - COND_POP(set, CLBR_R11, r11); \ - COND_POP(set, CLBR_R10, r10); \ - COND_POP(set, CLBR_R9, r9); \ - COND_POP(set, CLBR_R8, r8); \ - COND_POP(set, CLBR_RDI, rdi); \ - COND_POP(set, CLBR_RSI, rsi); \ - COND_POP(set, CLBR_RDX, rdx); \ - COND_POP(set, CLBR_RCX, rcx); \ - COND_POP(set, CLBR_RAX, rax) +#ifdef CONFIG_PARAVIRT_XXL #define PARA_PATCH(off) ((off) / 8) #define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8) #define PARA_INDIRECT(addr) *addr(%rip) -#else -#define PV_SAVE_REGS(set) \ - COND_PUSH(set, CLBR_EAX, eax); \ - COND_PUSH(set, CLBR_EDI, edi); \ - COND_PUSH(set, CLBR_ECX, ecx); \ - COND_PUSH(set, CLBR_EDX, edx) -#define PV_RESTORE_REGS(set) \ - COND_POP(set, CLBR_EDX, edx); \ - COND_POP(set, CLBR_ECX, ecx); \ - COND_POP(set, CLBR_EDI, edi); \ - COND_POP(set, CLBR_EAX, eax) - -#define PARA_PATCH(off) ((off) / 4) -#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .long, 4) -#define PARA_INDIRECT(addr) *%cs:addr -#endif -#ifdef CONFIG_PARAVIRT_XXL #define INTERRUPT_RETURN \ - PARA_SITE(PARA_PATCH(PV_CPU_iret), \ - ANNOTATE_RETPOLINE_SAFE; \ - jmp PARA_INDIRECT(pv_ops+PV_CPU_iret);) - -#define DISABLE_INTERRUPTS(clobbers) \ - PARA_SITE(PARA_PATCH(PV_IRQ_irq_disable), \ - PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_ops+PV_IRQ_irq_disable); \ - PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) - -#define ENABLE_INTERRUPTS(clobbers) \ - PARA_SITE(PARA_PATCH(PV_IRQ_irq_enable), \ - PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_ops+PV_IRQ_irq_enable); \ - PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) -#endif + ANNOTATE_RETPOLINE_SAFE; \ + ALTERNATIVE_TERNARY("jmp *paravirt_iret(%rip);", \ + X86_FEATURE_XENPV, "jmp xen_iret;", "jmp native_iret;") -#ifdef CONFIG_X86_64 -#ifdef CONFIG_PARAVIRT_XXL #ifdef CONFIG_DEBUG_ENTRY -#define SAVE_FLAGS(clobbers) \ - PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), \ - PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); \ - PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) +.macro PARA_IRQ_save_fl + PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), + ANNOTATE_RETPOLINE_SAFE; + call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);) +.endm + +#define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \ + ALT_NOT(X86_FEATURE_XENPV) #endif #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ @@ -800,5 +779,11 @@ static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { } #endif + +#ifndef CONFIG_PARAVIRT_SPINLOCKS +static inline void paravirt_set_cap(void) +{ +} +#endif #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PARAVIRT_H */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index de87087d3bde..ae692c3194e9 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -3,7 +3,6 @@ #define _ASM_X86_PARAVIRT_TYPES_H /* Bitmask of what can be clobbered: usually at least eax. */ -#define CLBR_NONE 0 #define CLBR_EAX (1 << 0) #define CLBR_ECX (1 << 1) #define CLBR_EDX (1 << 2) @@ -15,7 +14,6 @@ #define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) #define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) -#define CLBR_SCRATCH (0) #else #define CLBR_RAX CLBR_EAX #define CLBR_RCX CLBR_ECX @@ -32,12 +30,9 @@ #define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ CLBR_RCX | CLBR_R8 | CLBR_R9) #define CLBR_RET_REG (CLBR_RAX) -#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) #endif /* X86_64 */ -#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) - #ifndef __ASSEMBLY__ #include <asm/desc_defs.h> @@ -73,19 +68,6 @@ struct pv_info { const char *name; }; -struct pv_init_ops { - /* - * Patch may replace one of the defined code sequences with - * arbitrary code, subject to the same register constraints. - * This generally means the code is not free to clobber any - * registers other than EAX. The patch function should return - * the number of bytes of code generated, as we nop pad the - * rest in generic code. - */ - unsigned (*patch)(u8 type, void *insn_buff, - unsigned long addr, unsigned len); -} __no_randomize_layout; - #ifdef CONFIG_PARAVIRT_XXL struct pv_lazy_ops { /* Set deferred update mode, used for batching operations. */ @@ -95,11 +77,6 @@ struct pv_lazy_ops { } __no_randomize_layout; #endif -struct pv_time_ops { - unsigned long long (*sched_clock)(void); - unsigned long long (*steal_clock)(int cpu); -} __no_randomize_layout; - struct pv_cpu_ops { /* hooks for various privileged instructions */ void (*io_delay)(void); @@ -156,10 +133,6 @@ struct pv_cpu_ops { u64 (*read_pmc)(int counter); - /* Normal iret. Jump to this with the standard iret stack - frame set up. */ - void (*iret)(void); - void (*start_context_switch)(struct task_struct *prev); void (*end_context_switch)(struct task_struct *next); #endif @@ -290,8 +263,6 @@ struct pv_lock_ops { * number for each function using the offset which we use to indicate * what to patch. */ struct paravirt_patch_template { - struct pv_init_ops init; - struct pv_time_ops time; struct pv_cpu_ops cpu; struct pv_irq_ops irq; struct pv_mmu_ops mmu; @@ -300,6 +271,7 @@ struct paravirt_patch_template { extern struct pv_info pv_info; extern struct paravirt_patch_template pv_ops; +extern void (*paravirt_iret)(void); #define PARAVIRT_PATCH(x) \ (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) @@ -331,11 +303,7 @@ extern struct paravirt_patch_template pv_ops; /* Simple instruction patching code. */ #define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t" -unsigned paravirt_patch_ident_64(void *insn_buff, unsigned len); -unsigned paravirt_patch_default(u8 type, void *insn_buff, unsigned long addr, unsigned len); -unsigned paravirt_patch_insns(void *insn_buff, unsigned len, const char *start, const char *end); - -unsigned native_patch(u8 type, void *insn_buff, unsigned long addr, unsigned len); +unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len); int paravirt_disable_iospace(void); @@ -371,7 +339,7 @@ int paravirt_disable_iospace(void); * on the stack. All caller-save registers (eax,edx,ecx) are expected * to be modified (either clobbered or used for return values). * X86_64, on the other hand, already specifies a register-based calling - * conventions, returning at %rax, with parameteres going on %rdi, %rsi, + * conventions, returning at %rax, with parameters going on %rdi, %rsi, * %rdx, and %rcx. Note that for this reason, x86_64 does not need any * special handling for dealing with 4 arguments, unlike i386. * However, x86_64 also have to clobber all caller saved registers, which @@ -414,11 +382,9 @@ int paravirt_disable_iospace(void); * makes sure the incoming and outgoing types are always correct. */ #ifdef CONFIG_X86_32 -#define PVOP_VCALL_ARGS \ +#define PVOP_CALL_ARGS \ unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; -#define PVOP_CALL_ARGS PVOP_VCALL_ARGS - #define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) #define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) #define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x)) @@ -434,12 +400,10 @@ int paravirt_disable_iospace(void); #define VEXTRA_CLOBBERS #else /* CONFIG_X86_64 */ /* [re]ax isn't an arg, but the return val */ -#define PVOP_VCALL_ARGS \ +#define PVOP_CALL_ARGS \ unsigned long __edi = __edi, __esi = __esi, \ __edx = __edx, __ecx = __ecx, __eax = __eax; -#define PVOP_CALL_ARGS PVOP_VCALL_ARGS - #define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) #define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) #define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) @@ -464,152 +428,138 @@ int paravirt_disable_iospace(void); #define PVOP_TEST_NULL(op) ((void)pv_ops.op) #endif -#define PVOP_RETMASK(rettype) \ +#define PVOP_RETVAL(rettype) \ ({ unsigned long __mask = ~0UL; \ + BUILD_BUG_ON(sizeof(rettype) > sizeof(unsigned long)); \ switch (sizeof(rettype)) { \ case 1: __mask = 0xffUL; break; \ case 2: __mask = 0xffffUL; break; \ case 4: __mask = 0xffffffffUL; break; \ default: break; \ } \ - __mask; \ + __mask & __eax; \ }) -#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ - pre, post, ...) \ +#define ____PVOP_CALL(ret, op, clbr, call_clbr, extra_clbr, ...) \ ({ \ - rettype __ret; \ PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ - /* This is 32-bit specific, but is okay in 64-bit */ \ - /* since this condition will never hold */ \ - if (sizeof(rettype) > sizeof(unsigned long)) { \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr, ASM_CALL_CONSTRAINT \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - __ret = (rettype)((((u64)__edx) << 32) | __eax); \ - } else { \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr, ASM_CALL_CONSTRAINT \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - __ret = (rettype)(__eax & PVOP_RETMASK(rettype)); \ - } \ - __ret; \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : call_clbr, ASM_CALL_CONSTRAINT \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + ret; \ }) -#define __PVOP_CALL(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \ - EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__) - -#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ - PVOP_CALLEE_CLOBBERS, , \ - pre, post, ##__VA_ARGS__) - - -#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \ +#define ____PVOP_ALT_CALL(ret, op, alt, cond, clbr, call_clbr, \ + extra_clbr, ...) \ ({ \ - PVOP_VCALL_ARGS; \ + PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ + asm volatile(ALTERNATIVE(paravirt_alt(PARAVIRT_CALL), \ + alt, cond) \ : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ + ret; \ }) -#define __PVOP_VCALL(op, pre, post, ...) \ - ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ - VEXTRA_CLOBBERS, \ - pre, post, ##__VA_ARGS__) +#define __PVOP_CALL(rettype, op, ...) \ + ____PVOP_CALL(PVOP_RETVAL(rettype), op, CLBR_ANY, \ + PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, ##__VA_ARGS__) + +#define __PVOP_ALT_CALL(rettype, op, alt, cond, ...) \ + ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, CLBR_ANY,\ + PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, \ + ##__VA_ARGS__) + +#define __PVOP_CALLEESAVE(rettype, op, ...) \ + ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, CLBR_RET_REG, \ + PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) + +#define __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, ...) \ + ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op.func, alt, cond, \ + CLBR_RET_REG, PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) + + +#define __PVOP_VCALL(op, ...) \ + (void)____PVOP_CALL(, op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ + VEXTRA_CLOBBERS, ##__VA_ARGS__) + +#define __PVOP_ALT_VCALL(op, alt, cond, ...) \ + (void)____PVOP_ALT_CALL(, op, alt, cond, CLBR_ANY, \ + PVOP_VCALL_CLOBBERS, VEXTRA_CLOBBERS, \ + ##__VA_ARGS__) -#define __PVOP_VCALLEESAVE(op, pre, post, ...) \ - ____PVOP_VCALL(op.func, CLBR_RET_REG, \ - PVOP_VCALLEE_CLOBBERS, , \ - pre, post, ##__VA_ARGS__) +#define __PVOP_VCALLEESAVE(op, ...) \ + (void)____PVOP_CALL(, op.func, CLBR_RET_REG, \ + PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__) +#define __PVOP_ALT_VCALLEESAVE(op, alt, cond, ...) \ + (void)____PVOP_ALT_CALL(, op.func, alt, cond, CLBR_RET_REG, \ + PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__) #define PVOP_CALL0(rettype, op) \ - __PVOP_CALL(rettype, op, "", "") + __PVOP_CALL(rettype, op) #define PVOP_VCALL0(op) \ - __PVOP_VCALL(op, "", "") + __PVOP_VCALL(op) +#define PVOP_ALT_CALL0(rettype, op, alt, cond) \ + __PVOP_ALT_CALL(rettype, op, alt, cond) +#define PVOP_ALT_VCALL0(op, alt, cond) \ + __PVOP_ALT_VCALL(op, alt, cond) #define PVOP_CALLEE0(rettype, op) \ - __PVOP_CALLEESAVE(rettype, op, "", "") + __PVOP_CALLEESAVE(rettype, op) #define PVOP_VCALLEE0(op) \ - __PVOP_VCALLEESAVE(op, "", "") + __PVOP_VCALLEESAVE(op) +#define PVOP_ALT_CALLEE0(rettype, op, alt, cond) \ + __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond) +#define PVOP_ALT_VCALLEE0(op, alt, cond) \ + __PVOP_ALT_VCALLEESAVE(op, alt, cond) #define PVOP_CALL1(rettype, op, arg1) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) + __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1)) #define PVOP_VCALL1(op, arg1) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1)) + __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1)) +#define PVOP_ALT_VCALL1(op, arg1, alt, cond) \ + __PVOP_ALT_VCALL(op, alt, cond, PVOP_CALL_ARG1(arg1)) #define PVOP_CALLEE1(rettype, op, arg1) \ - __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) + __PVOP_CALLEESAVE(rettype, op, PVOP_CALL_ARG1(arg1)) #define PVOP_VCALLEE1(op, arg1) \ - __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1)) + __PVOP_VCALLEESAVE(op, PVOP_CALL_ARG1(arg1)) +#define PVOP_ALT_CALLEE1(rettype, op, arg1, alt, cond) \ + __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, PVOP_CALL_ARG1(arg1)) +#define PVOP_ALT_VCALLEE1(op, arg1, alt, cond) \ + __PVOP_ALT_VCALLEESAVE(op, alt, cond, PVOP_CALL_ARG1(arg1)) #define PVOP_CALL2(rettype, op, arg1, arg2) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) + __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2)) #define PVOP_VCALL2(op, arg1, arg2) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) - -#define PVOP_CALLEE2(rettype, op, arg1, arg2) \ - __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) -#define PVOP_VCALLEE2(op, arg1, arg2) \ - __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) - + __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2)) #define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1), \ PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) #define PVOP_VCALL3(op, arg1, arg2, arg3) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ + __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), \ PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) -/* This is the only difference in x86_64. We can make it much simpler */ -#ifdef CONFIG_X86_32 #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ __PVOP_CALL(rettype, op, \ - "push %[_arg4];", "lea 4(%%esp),%%esp;", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4))) -#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ - __PVOP_VCALL(op, \ - "push %[_arg4];", "lea 4(%%esp),%%esp;", \ - "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ - "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) -#else -#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ - __PVOP_CALL(rettype, op, "", "", \ PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ - __PVOP_VCALL(op, "", "", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -#endif /* Lazy mode for batching updates / context switch */ enum paravirt_lazy_mode { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a02c67291cfc..b1099f2d9800 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1244,7 +1244,7 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * - * dst - pointer to pgd range anwhere on a pgd page + * dst - pointer to pgd range anywhere on a pgd page * src - "" * count - the number of pgds to copy. * diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index dc6d149bf851..185142b84ebe 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -314,11 +314,6 @@ struct x86_hw_tss { struct x86_hw_tss { u32 reserved1; u64 sp0; - - /* - * We store cpu_current_top_of_stack in sp1 so it's always accessible. - * Linux does not use ring 1, so sp1 is not otherwise needed. - */ u64 sp1; /* @@ -426,12 +421,7 @@ struct irq_stack { char stack[IRQ_STACK_SIZE]; } __aligned(IRQ_STACK_SIZE); -#ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); -#else -/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */ -#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 -#endif #ifdef CONFIG_X86_64 struct fixed_percpu_data { @@ -527,7 +517,7 @@ struct thread_struct { struct io_bitmap *io_bitmap; /* - * IOPL. Priviledge level dependent I/O permission which is + * IOPL. Privilege level dependent I/O permission which is * emulated via the I/O bitmap to prevent user space from disabling * interrupts. */ @@ -551,15 +541,6 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset, *size = fpu_kernel_xstate_size; } -/* - * Thread-synchronous status. - * - * This is different from the flags in that nobody else - * ever touches our thread-synchronous status, so we don't - * have to worry about atomic accesses. - */ -#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ - static inline void native_load_sp0(unsigned long sp0) { diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 2c35f1c01a2d..8c5d1910a848 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -4,6 +4,8 @@ #include <asm/ldt.h> +struct task_struct; + /* misc architecture specific prototypes */ void syscall_init(void); @@ -25,6 +27,7 @@ void __end_SYSENTER_singlestep_region(void); void entry_SYSENTER_compat(void); void __end_entry_SYSENTER_compat(void); void entry_SYSCALL_compat(void); +void entry_SYSCALL_compat_safe_stack(void); void entry_INT80_compat(void); #ifdef CONFIG_XEN_PV void xen_entry_INT80_compat(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index d8324a236696..409f661481e1 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -94,6 +94,8 @@ struct pt_regs { #include <asm/paravirt_types.h> #endif +#include <asm/proto.h> + struct cpuinfo_x86; struct task_struct; @@ -175,6 +177,19 @@ static inline bool any_64bit_mode(struct pt_regs *regs) #ifdef CONFIG_X86_64 #define current_user_stack_pointer() current_pt_regs()->sp #define compat_user_stack_pointer() current_pt_regs()->sp + +static inline bool ip_within_syscall_gap(struct pt_regs *regs) +{ + bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 && + regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack); + +#ifdef CONFIG_IA32_EMULATION + ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat && + regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack); +#endif + + return ret; +} #endif static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 4352f08bfbb5..43fa081a1adb 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -8,8 +8,8 @@ /* * The set_memory_* API can be used to change various attributes of a virtual * address range. The attributes include: - * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack - * Executability : eXeutable, NoteXecutable + * Cacheability : UnCached, WriteCombining, WriteThrough, WriteBack + * Executability : eXecutable, NoteXecutable * Read/Write : ReadOnly, ReadWrite * Presence : NotPresent * Encryption : Encrypted, Decrypted diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 389d851a02c4..a12458a7a8d4 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -130,11 +130,6 @@ void *extend_brk(size_t size, size_t align); : : "i" (sz)); \ } -/* Helper for reserving space for arrays of things */ -#define RESERVE_BRK_ARRAY(type, name, entries) \ - type *name; \ - RESERVE_BRK(name, sizeof(type) * entries) - extern void probe_roms(void); #ifdef __i386__ diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/include/asm/sgx.h index dd7602c44c72..9c31e0ebc55b 100644 --- a/arch/x86/kernel/cpu/sgx/arch.h +++ b/arch/x86/include/asm/sgx.h @@ -2,15 +2,20 @@ /** * Copyright(c) 2016-20 Intel Corporation. * - * Contains data structures defined by the SGX architecture. Data structures - * defined by the Linux software stack should not be placed here. + * Intel Software Guard Extensions (SGX) support. */ -#ifndef _ASM_X86_SGX_ARCH_H -#define _ASM_X86_SGX_ARCH_H +#ifndef _ASM_X86_SGX_H +#define _ASM_X86_SGX_H #include <linux/bits.h> #include <linux/types.h> +/* + * This file contains both data structures defined by SGX architecture and Linux + * defined software data structures and functions. The two should not be mixed + * together for better readibility. The architectural definitions come first. + */ + /* The SGX specific CPUID function. */ #define SGX_CPUID 0x12 /* EPC enumeration. */ @@ -22,16 +27,36 @@ /* The bitmask for the EPC section type. */ #define SGX_CPUID_EPC_MASK GENMASK(3, 0) +enum sgx_encls_function { + ECREATE = 0x00, + EADD = 0x01, + EINIT = 0x02, + EREMOVE = 0x03, + EDGBRD = 0x04, + EDGBWR = 0x05, + EEXTEND = 0x06, + ELDU = 0x08, + EBLOCK = 0x09, + EPA = 0x0A, + EWB = 0x0B, + ETRACK = 0x0C, + EAUG = 0x0D, + EMODPR = 0x0E, + EMODT = 0x0F, +}; + /** * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not * been completed yet. + * %SGX_CHILD_PRESENT SECS has child pages present in the EPC. * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's * public key does not match IA32_SGXLEPUBKEYHASH. * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received */ enum sgx_return_code { SGX_NOT_TRACKED = 11, + SGX_CHILD_PRESENT = 13, SGX_INVALID_EINITTOKEN = 16, SGX_UNMASKED_EVENT = 128, }; @@ -271,7 +296,7 @@ struct sgx_pcmd { * @header1: constant byte string * @vendor: must be either 0x0000 or 0x8086 * @date: YYYYMMDD in BCD - * @header2: costant byte string + * @header2: constant byte string * @swdefined: software defined value */ struct sgx_sigstruct_header { @@ -335,4 +360,19 @@ struct sgx_sigstruct { #define SGX_LAUNCH_TOKEN_SIZE 304 -#endif /* _ASM_X86_SGX_ARCH_H */ +/* + * Do not put any hardware-defined SGX structure representations below this + * comment! + */ + +#ifdef CONFIG_X86_SGX_KVM +int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, + int *trapnr); +int sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs, u64 *lepubkeyhash, int *trapnr); +#endif + +int sgx_set_attribute(unsigned long *allowed_attributes, + unsigned int attribute_fd); + +#endif /* _ASM_X86_SGX_H */ diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 8b58d6975d5d..d17b39893b79 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -11,6 +11,7 @@ #include <asm/nops.h> #include <asm/cpufeatures.h> +#include <asm/alternative.h> /* "Raw" instruction opcodes */ #define __ASM_CLAC ".byte 0x0f,0x01,0xca" @@ -18,8 +19,6 @@ #ifdef __ASSEMBLY__ -#include <asm/alternative-asm.h> - #ifdef CONFIG_X86_SMAP #define ASM_CLAC \ @@ -37,8 +36,6 @@ #else /* __ASSEMBLY__ */ -#include <asm/alternative.h> - #ifdef CONFIG_X86_SMAP static __always_inline void clac(void) @@ -58,9 +55,8 @@ static __always_inline unsigned long smap_save(void) unsigned long flags; asm volatile ("# smap_save\n\t" - ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) - "pushf; pop %0; " __ASM_CLAC "\n\t" - "1:" + ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC "\n\t", + X86_FEATURE_SMAP) : "=rm" (flags) : : "memory", "cc"); return flags; @@ -69,9 +65,8 @@ static __always_inline unsigned long smap_save(void) static __always_inline void smap_restore(unsigned long flags) { asm volatile ("# smap_restore\n\t" - ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) - "push %0; popf\n\t" - "1:" + ALTERNATIVE("", "push %0; popf\n\t", + X86_FEATURE_SMAP) : : "g" (flags) : "memory", "cc"); } diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index c0538f82c9a2..630ff08532be 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -132,6 +132,7 @@ void native_play_dead(void); void play_dead_common(void); void wbinvd_on_cpu(int cpu); int wbinvd_on_all_cpus(void); +void cond_wakeup_cpu0(void); void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 9f69cc497f4b..b5f0d2ff47e4 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -71,12 +71,7 @@ static inline void update_task_stack(struct task_struct *task) else this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); #else - /* - * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That - * doesn't work on x86-32 because sp1 and - * cpu_current_top_of_stack have different values (because of - * the non-zero stack-padding on 32bit). - */ + /* Xen PV enters the kernel on the thread stack. */ if (static_cpu_has(X86_FEATURE_XENPV)) load_sp0(task_top_of_stack(task)); #endif diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h index a84333adeef2..80c08c7d5e72 100644 --- a/arch/x86/include/asm/syscall_wrapper.h +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -80,6 +80,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); } #define __COND_SYSCALL(abi, name) \ + __weak long __##abi##_##name(const struct pt_regs *__unused); \ __weak long __##abi##_##name(const struct pt_regs *__unused) \ { \ return sys_ni_syscall(); \ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 0d751d5da702..de406d93b515 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -197,18 +197,25 @@ static inline int arch_within_stack_frames(const void * const stack, #endif } -#else /* !__ASSEMBLY__ */ - -#ifdef CONFIG_X86_64 -# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1) -#endif +#endif /* !__ASSEMBLY__ */ -#endif +/* + * Thread-synchronous status. + * + * This is different from the flags in that nobody else + * ever touches our thread-synchronous status, so we don't + * have to worry about atomic accesses. + */ +#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ +#ifndef __ASSEMBLY__ #ifdef CONFIG_COMPAT #define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */ + +#define arch_set_restart_data(restart) \ + do { restart->arch_data = current_thread_info()->status; } while (0) + #endif -#ifndef __ASSEMBLY__ #ifdef CONFIG_X86_32 #define in_ia32_syscall() true diff --git a/arch/x86/include/asm/uv/uv_geo.h b/arch/x86/include/asm/uv/uv_geo.h index f241451035fb..027a9258dbca 100644 --- a/arch/x86/include/asm/uv/uv_geo.h +++ b/arch/x86/include/asm/uv/uv_geo.h @@ -10,7 +10,7 @@ #ifndef _ASM_UV_GEO_H #define _ASM_UV_GEO_H -/* Type declaractions */ +/* Type declarations */ /* Size of a geoid_s structure (must be before decl. of geoid_u) */ #define GEOID_SIZE 8 diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 5002f52be332..d3e3197917be 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -353,7 +353,7 @@ union uvh_apicid { * * Note there are NO leds on a UV system. This register is only * used by the system controller to monitor system-wide operation. - * There are 64 regs per node. With Nahelem cpus (2 cores per node, + * There are 64 regs per node. With Nehalem cpus (2 cores per node, * 8 cpus per core, 2 threads per cpu) there are 32 cpu threads on * a node. * diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 7068e4bb057d..1a162e559753 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -87,18 +87,6 @@ clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, #endif /* - * The maximum amount of extra memory compared to the base size. The - * main scaling factor is the size of struct page. At extreme ratios - * of base:extra, all the base memory can be filled with page - * structures for the extra memory, leaving no space for anything - * else. - * - * 10x seems like a reasonable balance between scaling flexibility and - * leaving a practically usable system. - */ -#define XEN_EXTRA_MEM_RATIO (10) - -/* * Helper functions to write or read unsigned long values to/from * memory, when the access may fault. */ diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 600a141c8805..b25d3f82c2f3 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -234,7 +234,7 @@ struct boot_params { * handling of page tables. * * These enums should only ever be used by x86 code, and the code that uses - * it should be well contained and compartamentalized. + * it should be well contained and compartmentalized. * * KVM and Xen HVM do not have a subarch as these are expected to follow * standard x86 boot entries. If there is a genuine need for "hypervisor" type @@ -252,7 +252,7 @@ struct boot_params { * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, * which start at asm startup_xen() entry point and later jump to the C * xen_start_kernel() entry point. Both domU and dom0 type of guests are - * currently supportd through this PV boot path. + * currently supported through this PV boot path. * @X86_SUBARCH_INTEL_MID: Used for Intel MID (Mobile Internet Device) platform * systems which do not have the PCI legacy interfaces. * @X86_SUBARCH_CE4100: Used for Intel CE media processor (CE4100) SoC diff --git a/arch/x86/include/uapi/asm/msgbuf.h b/arch/x86/include/uapi/asm/msgbuf.h index b3d0664fadc9..ac83e25bbf37 100644 --- a/arch/x86/include/uapi/asm/msgbuf.h +++ b/arch/x86/include/uapi/asm/msgbuf.h @@ -12,7 +12,7 @@ * The msqid64_ds structure for x86 architecture with x32 ABI. * * On x86-32 and x86-64 we can just use the generic definition, but - * x32 uses the same binary layout as x86_64, which is differnet + * x32 uses the same binary layout as x86_64, which is different * from other 32-bit architectures. */ diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h index 9034f3007c4e..9690d6899ad9 100644 --- a/arch/x86/include/uapi/asm/sgx.h +++ b/arch/x86/include/uapi/asm/sgx.h @@ -152,7 +152,7 @@ struct sgx_enclave_run { * Most exceptions reported on ENCLU, including those that occur within the * enclave, are fixed up and reported synchronously instead of being delivered * via a standard signal. Debug Exceptions (#DB) and Breakpoints (#BP) are - * never fixed up and are always delivered via standard signals. On synchrously + * never fixed up and are always delivered via standard signals. On synchronously * reported exceptions, -EFAULT is returned and details about the exception are * recorded in @run.exception, the optional sgx_enclave_exception struct. * diff --git a/arch/x86/include/uapi/asm/shmbuf.h b/arch/x86/include/uapi/asm/shmbuf.h index f0305dc660c9..fce18eaa070c 100644 --- a/arch/x86/include/uapi/asm/shmbuf.h +++ b/arch/x86/include/uapi/asm/shmbuf.h @@ -9,7 +9,7 @@ * The shmid64_ds structure for x86 architecture with x32 ABI. * * On x86-32 and x86-64 we can just use the generic definition, but - * x32 uses the same binary layout as x86_64, which is differnet + * x32 uses the same binary layout as x86_64, which is different * from other 32-bit architectures. */ diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 844d60eb1882..d0d9b331d3a1 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -139,7 +139,7 @@ struct _fpstate_32 { * The 64-bit FPU frame. (FXSAVE format and later) * * Note1: If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then the structure is - * larger: 'struct _xstate'. Note that 'struct _xstate' embedds + * larger: 'struct _xstate'. Note that 'struct _xstate' embeds * 'struct _fpstate' so that you can always assume the _fpstate portion * exists so that you can check the magic value. * diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 2ddf08351f0b..0704c2a94272 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -35,7 +35,6 @@ KASAN_SANITIZE_sev-es.o := n KCSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD_test_nx.o := y -OBJECT_FILES_NON_STANDARD_paravirt_patch.o := y ifdef CONFIG_FRAME_POINTER OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y @@ -121,7 +120,7 @@ obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o -obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch.o +obj-$(CONFIG_PARAVIRT) += paravirt.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7bdc0239a943..e90310cbe73a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -830,7 +830,7 @@ int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) EXPORT_SYMBOL(acpi_unregister_ioapic); /** - * acpi_ioapic_registered - Check whether IOAPIC assoicatied with @gsi_base + * acpi_ioapic_registered - Check whether IOAPIC associated with @gsi_base * has been registered * @handle: ACPI handle of the IOAPIC device * @gsi_base: GSI base associated with the IOAPIC @@ -1554,10 +1554,18 @@ void __init acpi_boot_table_init(void) /* * Initialize the ACPI boot-time table parser. */ - if (acpi_table_init()) { + if (acpi_locate_initial_tables()) disable_acpi(); - return; - } + else + acpi_reserve_initial_tables(); +} + +int __init early_acpi_boot_init(void) +{ + if (acpi_disabled) + return 1; + + acpi_table_init_complete(); acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); @@ -1570,18 +1578,9 @@ void __init acpi_boot_table_init(void) } else { printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); disable_acpi(); - return; + return 1; } } -} - -int __init early_acpi_boot_init(void) -{ - /* - * If acpi_disabled, bail out - */ - if (acpi_disabled) - return 1; /* * Process the Multiple APIC Description Table (MADT), if present @@ -1657,7 +1656,7 @@ static int __init parse_acpi(char *arg) else if (strcmp(arg, "noirq") == 0) { acpi_noirq_set(); } - /* "acpi=copy_dsdt" copys DSDT */ + /* "acpi=copy_dsdt" copies DSDT */ else if (strcmp(arg, "copy_dsdt") == 0) { acpi_gbl_copy_dsdt_locally = 1; } diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index cc1fea76aab0..3f85fcae450c 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -41,7 +41,7 @@ unsigned long acpi_get_wakeup_address(void) * x86_acpi_enter_sleep_state - enter sleep state * @state: Sleep state to enter. * - * Wrapper around acpi_enter_sleep_state() to be called by assmebly. + * Wrapper around acpi_enter_sleep_state() to be called by assembly. */ asmlinkage acpi_status __visible x86_acpi_enter_sleep_state(u8 state) { diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 56b6865afb2a..d5d8a352eafa 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -115,7 +115,7 @@ SYM_FUNC_START(do_suspend_lowlevel) movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 -#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK +#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK) /* * The suspend path may have poisoned some areas deeper in the stack, * which we now need to unpoison. diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 8d778e46725d..f810e6fececd 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -28,6 +28,7 @@ #include <asm/insn.h> #include <asm/io.h> #include <asm/fixmap.h> +#include <asm/paravirt.h> int __read_mostly alternatives_patched; @@ -388,21 +389,31 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, */ for (a = start; a < end; a++) { int insn_buff_sz = 0; + /* Mask away "NOT" flag bit for feature to test. */ + u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV; instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; BUG_ON(a->instrlen > sizeof(insn_buff)); - BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); - if (!boot_cpu_has(a->cpuid)) { + BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32); + + /* + * Patch if either: + * - feature is present + * - feature not present but ALTINSTR_FLAG_INV is set to mean, + * patch if feature is *NOT* present. + */ + if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) { if (a->padlen > 1) optimize_nops(a, instr); continue; } - DPRINTK("feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d", - a->cpuid >> 5, - a->cpuid & 0x1f, + DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d", + (a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "", + feature >> 5, + feature & 0x1f, instr, instr, a->instrlen, replacement, a->replacementlen, a->padlen); @@ -605,7 +616,7 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start, BUG_ON(p->len > MAX_PATCH_LEN); /* prep the buffer with the original instructions */ memcpy(insn_buff, p->instr, p->len); - used = pv_ops.init.patch(p->type, insn_buff, (unsigned long)p->instr, p->len); + used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len); BUG_ON(used > p->len); @@ -723,6 +734,33 @@ void __init alternative_instructions(void) * patching. */ + /* + * Paravirt patching and alternative patching can be combined to + * replace a function call with a short direct code sequence (e.g. + * by setting a constant return value instead of doing that in an + * external function). + * In order to make this work the following sequence is required: + * 1. set (artificial) features depending on used paravirt + * functions which can later influence alternative patching + * 2. apply paravirt patching (generally replacing an indirect + * function call with a direct one) + * 3. apply alternative patching (e.g. replacing a direct function + * call with a custom code sequence) + * Doing paravirt patching after alternative patching would clobber + * the optimization of the custom code with a function call again. + */ + paravirt_set_cap(); + + /* + * First patch paravirt functions, such that we overwrite the indirect + * call with the direct call. + */ + apply_paravirt(__parainstructions, __parainstructions_end); + + /* + * Then patch alternatives, such that those paravirt calls that are in + * alternatives can be overwritten by their immediate fragments. + */ apply_alternatives(__alt_instructions, __alt_instructions_end); #ifdef CONFIG_SMP @@ -741,8 +779,6 @@ void __init alternative_instructions(void) } #endif - apply_paravirt(__parainstructions, __parainstructions_end); - restart_nmi(); alternatives_patched = 1; } diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index b4396952c9a6..09083094eb57 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Shared support code for AMD K8 northbridges and derivates. + * Shared support code for AMD K8 northbridges and derivatives. * Copyright 2006 Andi Kleen, SUSE Labs. */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index bda4f2a36868..4a39fb429f15 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -619,7 +619,7 @@ static void setup_APIC_timer(void) if (this_cpu_has(X86_FEATURE_ARAT)) { lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; - /* Make LAPIC timer preferrable over percpu HPET */ + /* Make LAPIC timer preferable over percpu HPET */ lapic_clockevent.rating = 150; } @@ -666,7 +666,7 @@ void lapic_update_tsc_freq(void) * In this functions we calibrate APIC bus clocks to the external timer. * * We want to do the calibration only once since we want to have local timer - * irqs syncron. CPUs connected by the same APIC bus have the very same bus + * irqs synchronous. CPUs connected by the same APIC bus have the very same bus * frequency. * * This was previously done by reading the PIT/HPET and waiting for a wrap @@ -1532,7 +1532,7 @@ static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) * Most probably by now the CPU has serviced that pending interrupt and it * might not have done the ack_APIC_irq() because it thought, interrupt * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear - * the ISR bit and cpu thinks it has already serivced the interrupt. Hence + * the ISR bit and cpu thinks it has already serviced the interrupt. Hence * a vector might get locked. It was noticed for timer irq (vector * 0x31). Issue an extra EOI to clear ISR. * @@ -1657,7 +1657,7 @@ static void setup_local_APIC(void) */ /* * Actually disabling the focus CPU check just makes the hang less - * frequent as it makes the interrupt distributon model be more + * frequent as it makes the interrupt distribution model be more * like LRU than MRU (the short-term load is more even across CPUs). */ @@ -1875,7 +1875,7 @@ static __init void try_to_enable_x2apic(int remap_mode) /* * Without IR, all CPUs can be addressed by IOAPIC/MSI only - * in physical mode, and CPUs with an APIC ID that cannnot + * in physical mode, and CPUs with an APIC ID that cannot * be addressed must not be brought online. */ x2apic_set_max_apicid(apic_limit); @@ -2342,6 +2342,11 @@ static int cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = -1, }; +bool arch_match_cpu_phys_id(int cpu, u64 phys_id) +{ + return phys_id == cpuid_to_apicid[cpu]; +} + #ifdef CONFIG_SMP /** * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c3b60c37c728..d5c691a3208b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -928,7 +928,7 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) /* * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger - * and polarity attirbutes. So allow the first user to reprogram the + * and polarity attributes. So allow the first user to reprogram the * pin with real trigger and polarity attributes. */ if (irq < nr_legacy_irqs() && data->count == 1) { @@ -994,7 +994,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, /* * Legacy ISA IRQ has already been allocated, just add pin to - * the pin list assoicated with this IRQ and program the IOAPIC + * the pin list associated with this IRQ and program the IOAPIC * entry. The IOAPIC entry */ if (irq_data && irq_data->parent_data) { @@ -1032,6 +1032,16 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; legacy = mp_is_legacy_irq(irq); + /* + * IRQ2 is unusable for historical reasons on systems which + * have a legacy PIC. See the comment vs. IRQ2 further down. + * + * If this gets removed at some point then the related code + * in lapic_assign_system_vectors() needs to be adjusted as + * well. + */ + if (legacy && irq == PIC_CASCADE_IR) + return -EINVAL; } mutex_lock(&ioapic_mutex); @@ -1742,7 +1752,7 @@ static inline void ioapic_finish_move(struct irq_data *data, bool moveit) * with masking the ioapic entry and then polling until * Remote IRR was clear before reprogramming the * ioapic I don't trust the Remote IRR bit to be - * completey accurate. + * completely accurate. * * However there appears to be no other way to plug * this race, so if the Remote IRR bit is not @@ -1820,7 +1830,7 @@ static void ioapic_ack_level(struct irq_data *irq_data) /* * Tail end of clearing remote IRR bit (either by delivering the EOI * message via io-apic EOI register write or simulating it using - * mask+edge followed by unnask+level logic) manually when the + * mask+edge followed by unmask+level logic) manually when the * level triggered interrupt is seen as the edge triggered interrupt * at the cpu. */ diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 3c9c7492252f..6aa27e08b3e2 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -1045,7 +1045,7 @@ void irq_force_complete_move(struct irq_desc *desc) * * But in case of cpu hotplug this should be a non issue * because if the affinity update happens right before all - * cpus rendevouz in stop machine, there is no way that the + * cpus rendezvous in stop machine, there is no way that the * interrupt can be blocked on the target cpu because all cpus * loops first with interrupts enabled in stop machine, so the * old vector is not yet cleaned up when the interrupt fires. @@ -1054,7 +1054,7 @@ void irq_force_complete_move(struct irq_desc *desc) * of the interrupt on the apic/system bus would be delayed * beyond the point where the target cpu disables interrupts * in stop machine. I doubt that it can happen, but at least - * there is a theroretical chance. Virtualization might be + * there is a theoretical chance. Virtualization might be * able to expose this, but AFAICT the IOAPIC emulation is not * as stupid as the real hardware. * diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 52bc217ca8c3..f5a48e66e4f5 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -369,6 +369,15 @@ static int __init early_get_arch_type(void) return ret; } +/* UV system found, check which APIC MODE BIOS already selected */ +static void __init early_set_apic_mode(void) +{ + if (x2apic_enabled()) + uv_system_type = UV_X2APIC; + else + uv_system_type = UV_LEGACY_APIC; +} + static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) { /* Save OEM_ID passed from ACPI MADT */ @@ -404,11 +413,12 @@ static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) else uv_hubless_system |= 0x8; - /* Copy APIC type */ + /* Copy OEM Table ID */ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n", oem_id, oem_table_id, uv_system_type, uv_hubless_system); + return 0; } @@ -453,6 +463,7 @@ static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) early_set_hub_type(); /* Other UV setup functions */ + early_set_apic_mode(); early_get_pnodeid(); early_get_apic_socketid_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; @@ -472,29 +483,14 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) if (uv_set_system_type(_oem_id, _oem_table_id) == 0) return 0; - /* Save and Decode OEM Table ID */ + /* Save for display of the OEM Table ID */ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); - /* This is the most common hardware variant, x2apic mode */ - if (!strcmp(oem_table_id, "UVX")) - uv_system_type = UV_X2APIC; - - /* Only used for very small systems, usually 1 chassis, legacy mode */ - else if (!strcmp(oem_table_id, "UVL")) - uv_system_type = UV_LEGACY_APIC; - - else - goto badbios; - pr_info("UV: OEM IDs %s/%s, System/UVType %d/0x%x, HUB RevID %d\n", oem_id, oem_table_id, uv_system_type, is_uv(UV_ANY), uv_min_hub_revision_id); return 0; - -badbios: - pr_err("UV: UVarchtype:%s not supported\n", uv_archtype); - BUG(); } enum uv_system_type get_uv_system_type(void) @@ -1671,6 +1667,9 @@ static __init int uv_system_init_hubless(void) if (rc < 0) return rc; + /* Set section block size for current node memory */ + set_block_size(); + /* Create user access node */ if (rc >= 0) uv_setup_proc_files(1); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 660270359d39..241dda687eb9 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -94,7 +94,7 @@ * Remove APM dependencies in arch/i386/kernel/process.c * Remove APM dependencies in drivers/char/sysrq.c * Reset time across standby. - * Allow more inititialisation on SMP. + * Allow more initialisation on SMP. * Remove CONFIG_APM_POWER_OFF and make it boot time * configurable (default on). * Make debug only a boot time parameter (remove APM_DEBUG). @@ -766,7 +766,7 @@ static int apm_driver_version(u_short *val) * not cleared until it is acknowledged. * * Additional information is returned in the info pointer, providing - * that APM 1.2 is in use. If no messges are pending the value 0x80 + * that APM 1.2 is in use. If no messages are pending the value 0x80 * is returned (No power management events pending). */ static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) @@ -1025,7 +1025,7 @@ static int apm_enable_power_management(int enable) * status which gives the rough battery status, and current power * source. The bat value returned give an estimate as a percentage * of life and a status value for the battery. The estimated life - * if reported is a lifetime in secodnds/minutes at current powwer + * if reported is a lifetime in seconds/minutes at current power * consumption. */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 60b9f42ce3c1..ecd3fd6993d1 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -61,13 +61,6 @@ static void __used common(void) OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); #endif -#ifdef CONFIG_PARAVIRT_XXL - BLANK(); - OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable); - OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable); - OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret); -#endif - #ifdef CONFIG_XEN BLANK(); OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 3ca9be482a9e..d66af2950e06 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -877,7 +877,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) static int __cache_amd_cpumap_setup(unsigned int cpu, int index, struct _cpuid4_info_regs *base) { - struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cpu_cacheinfo *this_cpu_ci; struct cacheinfo *this_leaf; int i, sibling; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ab640abe26b6..340107800b36 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -482,7 +482,7 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c) if (pk) pk->pkru = init_pkru_value; /* - * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE + * Setting X86_CR4_PKE will cause the X86_FEATURE_OSPKE * cpuid bit to be set. We need to ensure that we * update that bit in this CPU's "cpu_info". */ @@ -1404,7 +1404,7 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c) * where GS is unused by the prev and next threads. * * Since neither vendor documents this anywhere that I can see, - * detect it directly instead of hardcoding the choice by + * detect it directly instead of hard-coding the choice by * vendor. * * I've designated AMD's behavior as the "bug" because it's @@ -1748,6 +1748,8 @@ DEFINE_PER_CPU(bool, hardirq_stack_inuse); DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); +DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; + /* May not be marked __init: used by software suspend */ void syscall_init(void) { diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 42af31b64c2c..defda61f372d 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -72,6 +72,9 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, + { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, + { X86_FEATURE_SGX1, X86_FEATURE_SGX }, + { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, {} }; diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 1d9b8aaea06c..7227c15299d0 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -291,7 +291,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) mark_tsc_unstable("cyrix 5510/5520 detected"); } #endif - c->x86_cache_size = 16; /* Yep 16K integrated cache thats it */ + c->x86_cache_size = 16; /* Yep 16K integrated cache that's it */ /* GXm supports extended cpuid levels 'ala' AMD */ if (c->cpuid_level == 2) { diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 3b1b01f2b248..da696eb4821a 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -93,15 +93,9 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) } #endif /* CONFIG_X86_VMX_FEATURE_NAMES */ -static void clear_sgx_caps(void) -{ - setup_clear_cpu_cap(X86_FEATURE_SGX); - setup_clear_cpu_cap(X86_FEATURE_SGX_LC); -} - static int __init nosgx(char *str) { - clear_sgx_caps(); + setup_clear_cpu_cap(X86_FEATURE_SGX); return 0; } @@ -110,23 +104,30 @@ early_param("nosgx", nosgx); void init_ia32_feat_ctl(struct cpuinfo_x86 *c) { + bool enable_sgx_kvm = false, enable_sgx_driver = false; bool tboot = tboot_enabled(); - bool enable_sgx; + bool enable_vmx; u64 msr; if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { clear_cpu_cap(c, X86_FEATURE_VMX); - clear_sgx_caps(); + clear_cpu_cap(c, X86_FEATURE_SGX); return; } - /* - * Enable SGX if and only if the kernel supports SGX and Launch Control - * is supported, i.e. disable SGX if the LE hash MSRs can't be written. - */ - enable_sgx = cpu_has(c, X86_FEATURE_SGX) && - cpu_has(c, X86_FEATURE_SGX_LC) && - IS_ENABLED(CONFIG_X86_SGX); + enable_vmx = cpu_has(c, X86_FEATURE_VMX) && + IS_ENABLED(CONFIG_KVM_INTEL); + + if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) { + /* + * Separate out SGX driver enabling from KVM. This allows KVM + * guests to use SGX even if the kernel SGX driver refuses to + * use it. This happens if flexible Launch Control is not + * available. + */ + enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC); + enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM); + } if (msr & FEAT_CTL_LOCKED) goto update_caps; @@ -142,15 +143,18 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector * for the kernel, e.g. using VMX to hide malicious code. */ - if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) { + if (enable_vmx) { msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; if (tboot) msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX; } - if (enable_sgx) - msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED; + if (enable_sgx_kvm || enable_sgx_driver) { + msr |= FEAT_CTL_SGX_ENABLED; + if (enable_sgx_driver) + msr |= FEAT_CTL_SGX_LC_ENABLED; + } wrmsrl(MSR_IA32_FEAT_CTL, msr); @@ -173,10 +177,29 @@ update_caps: } update_sgx: - if (!(msr & FEAT_CTL_SGX_ENABLED) || - !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) { - if (enable_sgx) - pr_err_once("SGX disabled by BIOS\n"); - clear_sgx_caps(); + if (!(msr & FEAT_CTL_SGX_ENABLED)) { + if (enable_sgx_kvm || enable_sgx_driver) + pr_err_once("SGX disabled by BIOS.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX); + return; + } + + /* + * VMX feature bit may be cleared due to being disabled in BIOS, + * in which case SGX virtualization cannot be supported either. + */ + if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) { + pr_err_once("SGX virtualization disabled due to lack of VMX.\n"); + enable_sgx_kvm = 0; + } + + if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) { + if (!enable_sgx_kvm) { + pr_err_once("SGX Launch Control is locked. Disable SGX.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX); + } else { + pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX_LC); + } } } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 0e422a544835..63e381a46153 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -301,7 +301,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) * The operating system must reload CR3 to cause the TLB to be flushed" * * As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h - * should be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE + * should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE * to be modified. */ if (c->x86 == 5 && c->x86_model == 9) { diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 7962355436da..bf7fe87a7e88 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -529,7 +529,7 @@ static void mce_irq_work_cb(struct irq_work *entry) * Check if the address reported by the CPU is in a format we can parse. * It would be possible to add code for most other cases, but all would * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses up to page granuality for now. + * parser). So only support physical addresses up to page granularity for now. */ int mce_usable_address(struct mce *m) { diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 7b360731fc2d..4e86d97f9653 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -74,6 +74,7 @@ MCE_INJECT_SET(status); MCE_INJECT_SET(misc); MCE_INJECT_SET(addr); MCE_INJECT_SET(synd); +MCE_INJECT_SET(ipid); #define MCE_INJECT_GET(reg) \ static int inj_##reg##_get(void *data, u64 *val) \ @@ -88,11 +89,13 @@ MCE_INJECT_GET(status); MCE_INJECT_GET(misc); MCE_INJECT_GET(addr); MCE_INJECT_GET(synd); +MCE_INJECT_GET(ipid); DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(ipid_fops, inj_ipid_get, inj_ipid_set, "%llx\n"); static void setup_inj_struct(struct mce *m) { @@ -629,6 +632,8 @@ static const char readme_msg[] = "\t is present in hardware. \n" "\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" "\t APIC interrupt handler to handle the error. \n" +"\n" +"ipid:\t IPID (AMD-specific)\n" "\n"; static ssize_t @@ -652,6 +657,7 @@ static struct dfs_node { { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "ipid", .fops = &ipid_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 83df991314c5..55ffa84d30d6 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -142,7 +142,7 @@ static struct severity { MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR) ), MCESEV( - KEEP, "Non signalled machine check", + KEEP, "Non signaled machine check", SER, BITCLR(MCI_STATUS_S) ), diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index b935e1b5f115..6a6318e9590c 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -629,16 +629,16 @@ static ssize_t reload_store(struct device *dev, if (val != 1) return size; - tmp_ret = microcode_ops->request_microcode_fw(bsp, µcode_pdev->dev, true); - if (tmp_ret != UCODE_NEW) - return size; - get_online_cpus(); ret = check_online_cpus(); if (ret) goto put; + tmp_ret = microcode_ops->request_microcode_fw(bsp, µcode_pdev->dev, true); + if (tmp_ret != UCODE_NEW) + goto put; + mutex_lock(µcode_mutex); ret = microcode_reload_late(); mutex_unlock(µcode_mutex); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index e88bc296afca..415bc05d3dc7 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -197,7 +197,7 @@ static unsigned char hv_get_nmi_reason(void) #ifdef CONFIG_X86_LOCAL_APIC /* * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes - * it dificult to process CHANNELMSG_UNLOAD in case of crash. Handle + * it difficult to process CHANNELMSG_UNLOAD in case of crash. Handle * unknown NMI on the first CPU which gets it. */ static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs) @@ -428,7 +428,7 @@ static void __init ms_hyperv_init_platform(void) /* * Hyper-V doesn't provide irq remapping for IO-APIC. To enable x2apic, - * set x2apic destination mode to physcial mode when x2apic is available + * set x2apic destination mode to physical mode when x2apic is available * and Hyper-V IOMMU driver makes sure cpus assigned with IO-APIC irqs * have 8-bit APIC id. */ diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 9231640782fa..0c3b372318b7 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -434,7 +434,7 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, state->range_sizek = sizek - second_sizek; } -/* Mininum size of mtrr block that can take hole: */ +/* Minimum size of mtrr block that can take hole: */ static u64 mtrr_chunk_size __initdata = (256ULL<<20); static int __init parse_mtrr_chunk_size_opt(char *p) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 28c8a23aa42e..a76694bffe86 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -799,7 +799,7 @@ void mtrr_ap_init(void) * * This routine is called in two cases: * - * 1. very earily time of software resume, when there absolutely + * 1. very early time of software resume, when there absolutely * isn't mtrr entry changes; * * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 698bb26aeb6e..23001ae03e82 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -192,7 +192,7 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz * - * Probe by trying to write the first of the L3 cach mask registers + * Probe by trying to write the first of the L3 cache mask registers * and checking that the bits stick. Max CLOSids is always 4 and max cbm length * is always 20 on hsw server parts. The minimum cache bitmask length * allowed for HSW server is always 2 bits. Hardcode all of them. diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 7ac31210e452..dbeaa8409313 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -387,7 +387,7 @@ void mon_event_count(void *info) * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so * that: * - * current bandwdith(cur_bw) < user specified bandwidth(user_bw) + * current bandwidth(cur_bw) < user specified bandwidth(user_bw) * * This uses the MBM counters to measure the bandwidth and MBA throttle * MSRs to control the bandwidth for a particular rdtgrp. It builds on the @@ -397,7 +397,7 @@ void mon_event_count(void *info) * timer. Having 1s interval makes the calculation of bandwidth simpler. * * Although MBA's goal is to restrict the bandwidth to a maximum, there may - * be a need to increase the bandwidth to avoid uncecessarily restricting + * be a need to increase the bandwidth to avoid unnecessarily restricting * the L2 <-> L3 traffic. * * Since MBA controls the L2 external bandwidth where as MBM measures the @@ -480,7 +480,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) /* * Delta values are updated dynamically package wise for each - * rdtgrp everytime the throttle MSR changes value. + * rdtgrp every time the throttle MSR changes value. * * This is because (1)the increase in bandwidth is not perfectly * linear and only "approximately" linear even when the hardware diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index e916646adc69..935af2ac6b1a 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -1307,7 +1307,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) * If the thread does not get on the CPU for whatever * reason and the process which sets up the region is * interrupted then this will leave the thread in runnable - * state and once it gets on the CPU it will derefence + * state and once it gets on the CPU it will dereference * the cleared, but not freed, plr struct resulting in an * empty pseudo-locking loop. */ @@ -1391,7 +1391,7 @@ out: * group is removed from user space via a "rmdir" from userspace or the * unmount of the resctrl filesystem. On removal the resource group does * not go back to pseudo-locksetup mode before it is removed, instead it is - * removed directly. There is thus assymmetry with the creation where the + * removed directly. There is thus asymmetry with the creation where the * &struct pseudo_lock_region is removed here while it was not created in * rdtgroup_pseudo_lock_create(). * diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index f9190adc52cb..01fd30e7829d 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * User interface for Resource Alloction in Resource Director Technology(RDT) + * User interface for Resource Allocation in Resource Director Technology(RDT) * * Copyright (C) 2016 Intel Corporation * @@ -294,7 +294,7 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, /* * This is safe against resctrl_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call - * from update_closid_rmid() is proteced against __switch_to() because + * from update_closid_rmid() is protected against __switch_to() because * preemption is disabled. */ static void update_cpu_closid_rmid(void *info) @@ -2555,7 +2555,7 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, /* * This creates a directory mon_data which contains the monitored data. * - * mon_data has one directory for each domain whic are named + * mon_data has one directory for each domain which are named * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data * with L3 domain looks as below: * ./mon_data: diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 972ec3bfa9c0..21d1f062895a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -36,6 +36,8 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, + { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, + { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile index 91d3dc784a29..9c1656779b2a 100644 --- a/arch/x86/kernel/cpu/sgx/Makefile +++ b/arch/x86/kernel/cpu/sgx/Makefile @@ -3,3 +3,4 @@ obj-y += \ encl.o \ ioctl.o \ main.o +obj-$(CONFIG_X86_SGX_KVM) += virt.o diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 8ce6d8371cfb..aa9b8b868867 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -136,10 +136,6 @@ static const struct file_operations sgx_encl_fops = { .get_unmapped_area = sgx_get_unmapped_area, }; -const struct file_operations sgx_provision_fops = { - .owner = THIS_MODULE, -}; - static struct miscdevice sgx_dev_enclave = { .minor = MISC_DYNAMIC_MINOR, .name = "sgx_enclave", @@ -147,13 +143,6 @@ static struct miscdevice sgx_dev_enclave = { .fops = &sgx_encl_fops, }; -static struct miscdevice sgx_dev_provision = { - .minor = MISC_DYNAMIC_MINOR, - .name = "sgx_provision", - .nodename = "sgx_provision", - .fops = &sgx_provision_fops, -}; - int __init sgx_drv_init(void) { unsigned int eax, ebx, ecx, edx; @@ -187,11 +176,5 @@ int __init sgx_drv_init(void) if (ret) return ret; - ret = misc_register(&sgx_dev_provision); - if (ret) { - misc_deregister(&sgx_dev_enclave); - return ret; - } - return 0; } diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 7449ef33f081..3be203297988 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -7,7 +7,7 @@ #include <linux/shmem_fs.h> #include <linux/suspend.h> #include <linux/sched/mm.h> -#include "arch.h" +#include <asm/sgx.h> #include "encl.h" #include "encls.h" #include "sgx.h" @@ -78,7 +78,7 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, ret = __sgx_encl_eldu(encl_page, epc_page, secs_page); if (ret) { - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); return ERR_PTR(ret); } @@ -404,7 +404,7 @@ void sgx_encl_release(struct kref *ref) if (sgx_unmark_page_reclaimable(entry->epc_page)) continue; - sgx_free_epc_page(entry->epc_page); + sgx_encl_free_epc_page(entry->epc_page); encl->secs_child_cnt--; entry->epc_page = NULL; } @@ -415,7 +415,7 @@ void sgx_encl_release(struct kref *ref) xa_destroy(&encl->page_array); if (!encl->secs_child_cnt && encl->secs.epc_page) { - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; } @@ -423,7 +423,7 @@ void sgx_encl_release(struct kref *ref) va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, list); list_del(&va_page->list); - sgx_free_epc_page(va_page->epc_page); + sgx_encl_free_epc_page(va_page->epc_page); kfree(va_page); } @@ -686,7 +686,7 @@ struct sgx_epc_page *sgx_alloc_va_page(void) ret = __epa(sgx_get_epc_virt_addr(epc_page)); if (ret) { WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret); - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); return ERR_PTR(-EFAULT); } @@ -735,3 +735,24 @@ bool sgx_va_page_full(struct sgx_va_page *va_page) return slot == SGX_VA_SLOT_COUNT; } + +/** + * sgx_encl_free_epc_page - free an EPC page assigned to an enclave + * @page: EPC page to be freed + * + * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and + * only upon success, it puts the page back to free page list. Otherwise, it + * gives a WARNING to indicate page is leaked. + */ +void sgx_encl_free_epc_page(struct sgx_epc_page *page) +{ + int ret; + + WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); + + ret = __eremove(sgx_get_epc_virt_addr(page)); + if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret)) + return; + + sgx_free_epc_page(page); +} diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index d8d30ccbef4c..6e74f85b6264 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -115,5 +115,6 @@ struct sgx_epc_page *sgx_alloc_va_page(void); unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); bool sgx_va_page_full(struct sgx_va_page *va_page); +void sgx_encl_free_epc_page(struct sgx_epc_page *page); #endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index 443188fe7e70..9b204843b78d 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -11,21 +11,6 @@ #include <asm/traps.h> #include "sgx.h" -enum sgx_encls_function { - ECREATE = 0x00, - EADD = 0x01, - EINIT = 0x02, - EREMOVE = 0x03, - EDGBRD = 0x04, - EDGBWR = 0x05, - EEXTEND = 0x06, - ELDU = 0x08, - EBLOCK = 0x09, - EPA = 0x0A, - EWB = 0x0B, - ETRACK = 0x0C, -}; - /** * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr * @@ -55,6 +40,19 @@ enum sgx_encls_function { } while (0); \ } +/* + * encls_faulted() - Check if an ENCLS leaf faulted given an error code + * @ret: the return value of an ENCLS leaf function call + * + * Return: + * - true: ENCLS leaf faulted. + * - false: Otherwise. + */ +static inline bool encls_faulted(int ret) +{ + return ret & ENCLS_FAULT_FLAG; +} + /** * encls_failed() - Check if an ENCLS function failed * @ret: the return value of an ENCLS function call @@ -65,7 +63,7 @@ enum sgx_encls_function { */ static inline bool encls_failed(int ret) { - if (ret & ENCLS_FAULT_FLAG) + if (encls_faulted(ret)) return ENCLS_TRAPNR(ret) != X86_TRAP_PF; return !!ret; diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 90a5caf76939..83df20e3e633 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -2,6 +2,7 @@ /* Copyright(c) 2016-20 Intel Corporation. */ #include <asm/mman.h> +#include <asm/sgx.h> #include <linux/mman.h> #include <linux/delay.h> #include <linux/file.h> @@ -47,7 +48,7 @@ static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) encl->page_cnt--; if (va_page) { - sgx_free_epc_page(va_page->epc_page); + sgx_encl_free_epc_page(va_page->epc_page); list_del(&va_page->list); kfree(va_page); } @@ -117,7 +118,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) return 0; err_out: - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; err_out_backing: @@ -365,7 +366,7 @@ err_out_unlock: mmap_read_unlock(current->mm); err_out_free: - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); kfree(encl_page); return ret; @@ -495,7 +496,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, void *token) { u64 mrsigner[4]; - int i, j, k; + int i, j; void *addr; int ret; @@ -544,8 +545,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, preempt_disable(); - for (k = 0; k < 4; k++) - wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]); + sgx_update_lepubkeyhash(mrsigner); ret = __einit(sigstruct, token, addr); @@ -568,7 +568,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, } } - if (ret & ENCLS_FAULT_FLAG) { + if (encls_faulted(ret)) { if (encls_failed(ret)) ENCLS_WARN(ret, "EINIT"); @@ -604,7 +604,6 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) { struct sgx_sigstruct *sigstruct; struct sgx_enclave_init init_arg; - struct page *initp_page; void *token; int ret; @@ -615,11 +614,15 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) if (copy_from_user(&init_arg, arg, sizeof(init_arg))) return -EFAULT; - initp_page = alloc_page(GFP_KERNEL); - if (!initp_page) + /* + * 'sigstruct' must be on a page boundary and 'token' on a 512 byte + * boundary. kmalloc() will give this alignment when allocating + * PAGE_SIZE bytes. + */ + sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!sigstruct) return -ENOMEM; - sigstruct = kmap(initp_page); token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2); memset(token, 0, SGX_LAUNCH_TOKEN_SIZE); @@ -645,8 +648,7 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) ret = sgx_encl_init(encl, sigstruct, token); out: - kunmap(initp_page); - __free_page(initp_page); + kfree(sigstruct); return ret; } @@ -665,24 +667,11 @@ out: static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg) { struct sgx_enclave_provision params; - struct file *file; if (copy_from_user(¶ms, arg, sizeof(params))) return -EFAULT; - file = fget(params.fd); - if (!file) - return -EINVAL; - - if (file->f_op != &sgx_provision_fops) { - fput(file); - return -EINVAL; - } - - encl->attributes_mask |= SGX_ATTR_PROVISIONKEY; - - fput(file); - return 0; + return sgx_set_attribute(&encl->attributes_mask, params.fd); } long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 8df81a3ed945..63d3de02bbcc 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -1,14 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-20 Intel Corporation. */ +#include <linux/file.h> #include <linux/freezer.h> #include <linux/highmem.h> #include <linux/kthread.h> +#include <linux/miscdevice.h> #include <linux/pagemap.h> #include <linux/ratelimit.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> +#include <asm/sgx.h> #include "driver.h" #include "encl.h" #include "encls.h" @@ -23,42 +26,58 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); * with sgx_reclaimer_lock acquired. */ static LIST_HEAD(sgx_active_page_list); - static DEFINE_SPINLOCK(sgx_reclaimer_lock); +/* The free page list lock protected variables prepend the lock. */ +static unsigned long sgx_nr_free_pages; + +/* Nodes with one or more EPC sections. */ +static nodemask_t sgx_numa_mask; + +/* + * Array with one list_head for each possible NUMA node. Each + * list contains all the sgx_epc_section's which are on that + * node. + */ +static struct sgx_numa_node *sgx_numa_nodes; + +static LIST_HEAD(sgx_dirty_page_list); + /* - * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS - * pages whose child pages blocked EREMOVE. + * Reset post-kexec EPC pages to the uninitialized state. The pages are removed + * from the input list, and made available for the page allocator. SECS pages + * prepending their children in the input list are left intact. */ -static void sgx_sanitize_section(struct sgx_epc_section *section) +static void __sgx_sanitize_pages(struct list_head *dirty_page_list) { struct sgx_epc_page *page; LIST_HEAD(dirty); int ret; - /* init_laundry_list is thread-local, no need for a lock: */ - while (!list_empty(§ion->init_laundry_list)) { + /* dirty_page_list is thread-local, no need for a lock: */ + while (!list_empty(dirty_page_list)) { if (kthread_should_stop()) return; - /* needed for access to ->page_list: */ - spin_lock(§ion->lock); - - page = list_first_entry(§ion->init_laundry_list, - struct sgx_epc_page, list); + page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); ret = __eremove(sgx_get_epc_virt_addr(page)); - if (!ret) - list_move(&page->list, §ion->page_list); - else + if (!ret) { + /* + * page is now sanitized. Make it available via the SGX + * page allocator: + */ + list_del(&page->list); + sgx_free_epc_page(page); + } else { + /* The page is not yet clean - move to the dirty list. */ list_move_tail(&page->list, &dirty); - - spin_unlock(§ion->lock); + } cond_resched(); } - list_splice(&dirty, §ion->init_laundry_list); + list_splice(&dirty, dirty_page_list); } static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page) @@ -195,10 +214,10 @@ static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) /* * Swap page to the regular memory transformed to the blocked state by using - * EBLOCK, which means that it can no loger be referenced (no new TLB entries). + * EBLOCK, which means that it can no longer be referenced (no new TLB entries). * * The first trial just tries to write the page assuming that some other thread - * has reset the count for threads inside the enlave by using ETRACK, and + * has reset the count for threads inside the enclave by using ETRACK, and * previous thread count has been zeroed out. The second trial calls ETRACK * before EWB. If that fails we kick all the HW threads out, and then do EWB, * which should be guaranteed the succeed. @@ -278,7 +297,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_ewb(encl->secs.epc_page, &secs_backing); - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; sgx_encl_put_backing(&secs_backing, true); @@ -308,6 +327,7 @@ static void sgx_reclaim_pages(void) struct sgx_epc_section *section; struct sgx_encl_page *encl_page; struct sgx_epc_page *epc_page; + struct sgx_numa_node *node; pgoff_t page_index; int cnt = 0; int ret; @@ -379,50 +399,33 @@ skip: epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; section = &sgx_epc_sections[epc_page->section]; - spin_lock(§ion->lock); - list_add_tail(&epc_page->list, §ion->page_list); - section->free_cnt++; - spin_unlock(§ion->lock); - } -} - -static unsigned long sgx_nr_free_pages(void) -{ - unsigned long cnt = 0; - int i; - - for (i = 0; i < sgx_nr_epc_sections; i++) - cnt += sgx_epc_sections[i].free_cnt; + node = section->node; - return cnt; + spin_lock(&node->lock); + list_add_tail(&epc_page->list, &node->free_page_list); + sgx_nr_free_pages++; + spin_unlock(&node->lock); + } } static bool sgx_should_reclaim(unsigned long watermark) { - return sgx_nr_free_pages() < watermark && - !list_empty(&sgx_active_page_list); + return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list); } static int ksgxd(void *p) { - int i; - set_freezable(); /* * Sanitize pages in order to recover from kexec(). The 2nd pass is * required for SECS pages, whose child pages blocked EREMOVE. */ - for (i = 0; i < sgx_nr_epc_sections; i++) - sgx_sanitize_section(&sgx_epc_sections[i]); - - for (i = 0; i < sgx_nr_epc_sections; i++) { - sgx_sanitize_section(&sgx_epc_sections[i]); + __sgx_sanitize_pages(&sgx_dirty_page_list); + __sgx_sanitize_pages(&sgx_dirty_page_list); - /* Should never happen. */ - if (!list_empty(&sgx_epc_sections[i].init_laundry_list)) - WARN(1, "EPC section %d has unsanitized pages.\n", i); - } + /* sanity check: */ + WARN_ON(!list_empty(&sgx_dirty_page_list)); while (!kthread_should_stop()) { if (try_to_freeze()) @@ -454,45 +457,56 @@ static bool __init sgx_page_reclaimer_init(void) return true; } -static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section) +static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) { - struct sgx_epc_page *page; + struct sgx_numa_node *node = &sgx_numa_nodes[nid]; + struct sgx_epc_page *page = NULL; - spin_lock(§ion->lock); + spin_lock(&node->lock); - if (list_empty(§ion->page_list)) { - spin_unlock(§ion->lock); + if (list_empty(&node->free_page_list)) { + spin_unlock(&node->lock); return NULL; } - page = list_first_entry(§ion->page_list, struct sgx_epc_page, list); + page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); list_del_init(&page->list); - section->free_cnt--; + sgx_nr_free_pages--; + + spin_unlock(&node->lock); - spin_unlock(§ion->lock); return page; } /** * __sgx_alloc_epc_page() - Allocate an EPC page * - * Iterate through EPC sections and borrow a free EPC page to the caller. When a - * page is no longer needed it must be released with sgx_free_epc_page(). + * Iterate through NUMA nodes and reserve ia free EPC page to the caller. Start + * from the NUMA node, where the caller is executing. * * Return: - * an EPC page, - * -errno on error + * - an EPC page: A borrowed EPC pages were available. + * - NULL: Out of EPC pages. */ struct sgx_epc_page *__sgx_alloc_epc_page(void) { - struct sgx_epc_section *section; struct sgx_epc_page *page; - int i; + int nid_of_current = numa_node_id(); + int nid = nid_of_current; - for (i = 0; i < sgx_nr_epc_sections; i++) { - section = &sgx_epc_sections[i]; + if (node_isset(nid_of_current, sgx_numa_mask)) { + page = __sgx_alloc_epc_page_from_node(nid_of_current); + if (page) + return page; + } + + /* Fall back to the non-local NUMA nodes: */ + while (true) { + nid = next_node_in(nid, sgx_numa_mask); + if (nid == nid_of_current) + break; - page = __sgx_alloc_epc_page_from_section(section); + page = __sgx_alloc_epc_page_from_node(nid); if (page) return page; } @@ -598,23 +612,22 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim) * sgx_free_epc_page() - Free an EPC page * @page: an EPC page * - * Call EREMOVE for an EPC page and insert it back to the list of free pages. + * Put the EPC page back to the list of free pages. It's the caller's + * responsibility to make sure that the page is in uninitialized state. In other + * words, do EREMOVE, EWB or whatever operation is necessary before calling + * this function. */ void sgx_free_epc_page(struct sgx_epc_page *page) { struct sgx_epc_section *section = &sgx_epc_sections[page->section]; - int ret; + struct sgx_numa_node *node = section->node; - WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); + spin_lock(&node->lock); - ret = __eremove(sgx_get_epc_virt_addr(page)); - if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret)) - return; + list_add_tail(&page->list, &node->free_page_list); + sgx_nr_free_pages++; - spin_lock(§ion->lock); - list_add_tail(&page->list, §ion->page_list); - section->free_cnt++; - spin_unlock(§ion->lock); + spin_unlock(&node->lock); } static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, @@ -635,18 +648,14 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, } section->phys_addr = phys_addr; - spin_lock_init(§ion->lock); - INIT_LIST_HEAD(§ion->page_list); - INIT_LIST_HEAD(§ion->init_laundry_list); for (i = 0; i < nr_pages; i++) { section->pages[i].section = index; section->pages[i].flags = 0; section->pages[i].owner = NULL; - list_add_tail(§ion->pages[i].list, §ion->init_laundry_list); + list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); } - section->free_cnt = nr_pages; return true; } @@ -665,8 +674,13 @@ static bool __init sgx_page_cache_init(void) { u32 eax, ebx, ecx, edx, type; u64 pa, size; + int nid; int i; + sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL); + if (!sgx_numa_nodes) + return false; + for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) { cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx); @@ -689,6 +703,21 @@ static bool __init sgx_page_cache_init(void) break; } + nid = numa_map_to_online_node(phys_to_target_node(pa)); + if (nid == NUMA_NO_NODE) { + /* The physical address is already printed above. */ + pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n"); + nid = 0; + } + + if (!node_isset(nid, sgx_numa_mask)) { + spin_lock_init(&sgx_numa_nodes[nid].lock); + INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list); + node_set(nid, sgx_numa_mask); + } + + sgx_epc_sections[i].node = &sgx_numa_nodes[nid]; + sgx_nr_epc_sections++; } @@ -700,6 +729,67 @@ static bool __init sgx_page_cache_init(void) return true; } +/* + * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller. + * Bare-metal driver requires to update them to hash of enclave's signer + * before EINIT. KVM needs to update them to guest's virtual MSR values + * before doing EINIT from guest. + */ +void sgx_update_lepubkeyhash(u64 *lepubkeyhash) +{ + int i; + + WARN_ON_ONCE(preemptible()); + + for (i = 0; i < 4; i++) + wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); +} + +const struct file_operations sgx_provision_fops = { + .owner = THIS_MODULE, +}; + +static struct miscdevice sgx_dev_provision = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_provision", + .nodename = "sgx_provision", + .fops = &sgx_provision_fops, +}; + +/** + * sgx_set_attribute() - Update allowed attributes given file descriptor + * @allowed_attributes: Pointer to allowed enclave attributes + * @attribute_fd: File descriptor for specific attribute + * + * Append enclave attribute indicated by file descriptor to allowed + * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by + * /dev/sgx_provision is supported. + * + * Return: + * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes + * -EINVAL: Invalid, or not supported file descriptor + */ +int sgx_set_attribute(unsigned long *allowed_attributes, + unsigned int attribute_fd) +{ + struct file *file; + + file = fget(attribute_fd); + if (!file) + return -EINVAL; + + if (file->f_op != &sgx_provision_fops) { + fput(file); + return -EINVAL; + } + + *allowed_attributes |= SGX_ATTR_PROVISIONKEY; + + fput(file); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_set_attribute); + static int __init sgx_init(void) { int ret; @@ -716,12 +806,28 @@ static int __init sgx_init(void) goto err_page_cache; } - ret = sgx_drv_init(); + ret = misc_register(&sgx_dev_provision); if (ret) goto err_kthread; + /* + * Always try to initialize the native *and* KVM drivers. + * The KVM driver is less picky than the native one and + * can function if the native one is not supported on the + * current system or fails to initialize. + * + * Error out only if both fail to initialize. + */ + ret = sgx_drv_init(); + + if (sgx_vepc_init() && ret) + goto err_provision; + return 0; +err_provision: + misc_deregister(&sgx_dev_provision); + err_kthread: kthread_stop(ksgxd_tsk); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 5fa42d143feb..4628acec0009 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -8,11 +8,15 @@ #include <linux/rwsem.h> #include <linux/types.h> #include <asm/asm.h> -#include "arch.h" +#include <asm/sgx.h> #undef pr_fmt #define pr_fmt(fmt) "sgx: " fmt +#define EREMOVE_ERROR_MESSAGE \ + "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \ + "Refer to Documentation/x86/sgx.rst for more information." + #define SGX_MAX_EPC_SECTIONS 8 #define SGX_EEXTEND_BLOCK_SIZE 256 #define SGX_NR_TO_SCAN 16 @@ -30,28 +34,25 @@ struct sgx_epc_page { }; /* + * Contains the tracking data for NUMA nodes having EPC pages. Most importantly, + * the free page list local to the node is stored here. + */ +struct sgx_numa_node { + struct list_head free_page_list; + spinlock_t lock; +}; + +/* * The firmware can define multiple chunks of EPC to the different areas of the * physical memory e.g. for memory areas of the each node. This structure is * used to store EPC pages for one EPC section and virtual memory area where * the pages have been mapped. - * - * 'lock' must be held before accessing 'page_list' or 'free_cnt'. */ struct sgx_epc_section { unsigned long phys_addr; void *virt_addr; struct sgx_epc_page *pages; - - spinlock_t lock; - struct list_head page_list; - unsigned long free_cnt; - - /* - * Pages which need EREMOVE run on them before they can be - * used. Only safe to be accessed in ksgxd and init code. - * Not protected by locks. - */ - struct list_head init_laundry_list; + struct sgx_numa_node *node; }; extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; @@ -83,4 +84,15 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page); int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); +#ifdef CONFIG_X86_SGX_KVM +int __init sgx_vepc_init(void); +#else +static inline int __init sgx_vepc_init(void) +{ + return -ENODEV; +} +#endif + +void sgx_update_lepubkeyhash(u64 *lepubkeyhash); + #endif /* _X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c new file mode 100644 index 000000000000..6ad165a5c0cc --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -0,0 +1,376 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Device driver to expose SGX enclave memory to KVM guests. + * + * Copyright(c) 2021 Intel Corporation. + */ + +#include <linux/miscdevice.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> +#include <linux/slab.h> +#include <linux/xarray.h> +#include <asm/sgx.h> +#include <uapi/asm/sgx.h> + +#include "encls.h" +#include "sgx.h" + +struct sgx_vepc { + struct xarray page_array; + struct mutex lock; +}; + +/* + * Temporary SECS pages that cannot be EREMOVE'd due to having child in other + * virtual EPC instances, and the lock to protect it. + */ +static struct mutex zombie_secs_pages_lock; +static struct list_head zombie_secs_pages; + +static int __sgx_vepc_fault(struct sgx_vepc *vepc, + struct vm_area_struct *vma, unsigned long addr) +{ + struct sgx_epc_page *epc_page; + unsigned long index, pfn; + int ret; + + WARN_ON(!mutex_is_locked(&vepc->lock)); + + /* Calculate index of EPC page in virtual EPC's page_array */ + index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start); + + epc_page = xa_load(&vepc->page_array, index); + if (epc_page) + return 0; + + epc_page = sgx_alloc_epc_page(vepc, false); + if (IS_ERR(epc_page)) + return PTR_ERR(epc_page); + + ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL)); + if (ret) + goto err_free; + + pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page)); + + ret = vmf_insert_pfn(vma, addr, pfn); + if (ret != VM_FAULT_NOPAGE) { + ret = -EFAULT; + goto err_delete; + } + + return 0; + +err_delete: + xa_erase(&vepc->page_array, index); +err_free: + sgx_free_epc_page(epc_page); + return ret; +} + +static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct sgx_vepc *vepc = vma->vm_private_data; + int ret; + + mutex_lock(&vepc->lock); + ret = __sgx_vepc_fault(vepc, vma, vmf->address); + mutex_unlock(&vepc->lock); + + if (!ret) + return VM_FAULT_NOPAGE; + + if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) { + mmap_read_unlock(vma->vm_mm); + return VM_FAULT_RETRY; + } + + return VM_FAULT_SIGBUS; +} + +static const struct vm_operations_struct sgx_vepc_vm_ops = { + .fault = sgx_vepc_fault, +}; + +static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct sgx_vepc *vepc = file->private_data; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma->vm_ops = &sgx_vepc_vm_ops; + /* Don't copy VMA in fork() */ + vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY; + vma->vm_private_data = vepc; + + return 0; +} + +static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) +{ + int ret; + + /* + * Take a previously guest-owned EPC page and return it to the + * general EPC page pool. + * + * Guests can not be trusted to have left this page in a good + * state, so run EREMOVE on the page unconditionally. In the + * case that a guest properly EREMOVE'd this page, a superfluous + * EREMOVE is harmless. + */ + ret = __eremove(sgx_get_epc_virt_addr(epc_page)); + if (ret) { + /* + * Only SGX_CHILD_PRESENT is expected, which is because of + * EREMOVE'ing an SECS still with child, in which case it can + * be handled by EREMOVE'ing the SECS again after all pages in + * virtual EPC have been EREMOVE'd. See comments in below in + * sgx_vepc_release(). + * + * The user of virtual EPC (KVM) needs to guarantee there's no + * logical processor is still running in the enclave in guest, + * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be + * handled here. + */ + WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE, + ret, ret); + return ret; + } + + sgx_free_epc_page(epc_page); + + return 0; +} + +static int sgx_vepc_release(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc = file->private_data; + struct sgx_epc_page *epc_page, *tmp, *entry; + unsigned long index; + + LIST_HEAD(secs_pages); + + xa_for_each(&vepc->page_array, index, entry) { + /* + * Remove all normal, child pages. sgx_vepc_free_page() + * will fail if EREMOVE fails, but this is OK and expected on + * SECS pages. Those can only be EREMOVE'd *after* all their + * child pages. Retries below will clean them up. + */ + if (sgx_vepc_free_page(entry)) + continue; + + xa_erase(&vepc->page_array, index); + } + + /* + * Retry EREMOVE'ing pages. This will clean up any SECS pages that + * only had children in this 'epc' area. + */ + xa_for_each(&vepc->page_array, index, entry) { + epc_page = entry; + /* + * An EREMOVE failure here means that the SECS page still + * has children. But, since all children in this 'sgx_vepc' + * have been removed, the SECS page must have a child on + * another instance. + */ + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + + xa_erase(&vepc->page_array, index); + } + + /* + * SECS pages are "pinned" by child pages, and "unpinned" once all + * children have been EREMOVE'd. A child page in this instance + * may have pinned an SECS page encountered in an earlier release(), + * creating a zombie. Since some children were EREMOVE'd above, + * try to EREMOVE all zombies in the hopes that one was unpinned. + */ + mutex_lock(&zombie_secs_pages_lock); + list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) { + /* + * Speculatively remove the page from the list of zombies, + * if the page is successfully EREMOVE'd it will be added to + * the list of free pages. If EREMOVE fails, throw the page + * on the local list, which will be spliced on at the end. + */ + list_del(&epc_page->list); + + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + } + + if (!list_empty(&secs_pages)) + list_splice_tail(&secs_pages, &zombie_secs_pages); + mutex_unlock(&zombie_secs_pages_lock); + + kfree(vepc); + + return 0; +} + +static int sgx_vepc_open(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc; + + vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL); + if (!vepc) + return -ENOMEM; + mutex_init(&vepc->lock); + xa_init(&vepc->page_array); + + file->private_data = vepc; + + return 0; +} + +static const struct file_operations sgx_vepc_fops = { + .owner = THIS_MODULE, + .open = sgx_vepc_open, + .release = sgx_vepc_release, + .mmap = sgx_vepc_mmap, +}; + +static struct miscdevice sgx_vepc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_vepc", + .nodename = "sgx_vepc", + .fops = &sgx_vepc_fops, +}; + +int __init sgx_vepc_init(void) +{ + /* SGX virtualization requires KVM to work */ + if (!cpu_feature_enabled(X86_FEATURE_VMX)) + return -ENODEV; + + INIT_LIST_HEAD(&zombie_secs_pages); + mutex_init(&zombie_secs_pages_lock); + + return misc_register(&sgx_vepc_dev); +} + +/** + * sgx_virt_ecreate() - Run ECREATE on behalf of guest + * @pageinfo: Pointer to PAGEINFO structure + * @secs: Userspace pointer to SECS page + * @trapnr: trap number injected to guest in case of ECREATE error + * + * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose + * of enforcing policies of guest's enclaves, and return the trap number + * which should be injected to guest in case of any ECREATE error. + * + * Return: + * - 0: ECREATE was successful. + * - <0: on error. + */ +int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, + int *trapnr) +{ + int ret; + + /* + * @secs is an untrusted, userspace-provided address. It comes from + * KVM and is assumed to be a valid pointer which points somewhere in + * userspace. This can fault and call SGX or other fault handlers when + * userspace mapping @secs doesn't exist. + * + * Add a WARN() to make sure @secs is already valid userspace pointer + * from caller (KVM), who should already have handled invalid pointer + * case (for instance, made by malicious guest). All other checks, + * such as alignment of @secs, are deferred to ENCLS itself. + */ + if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __ecreate(pageinfo, (void *)secs); + __uaccess_end(); + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + /* ECREATE doesn't return an error code, it faults or succeeds. */ + WARN_ON_ONCE(ret); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_virt_ecreate); + +static int __sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs) +{ + int ret; + + /* + * Make sure all userspace pointers from caller (KVM) are valid. + * All other checks deferred to ENCLS itself. Also see comment + * for @secs in sgx_virt_ecreate(). + */ +#define SGX_EINITTOKEN_SIZE 304 + if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) || + !access_ok(token, SGX_EINITTOKEN_SIZE) || + !access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __einit((void *)sigstruct, (void *)token, (void *)secs); + __uaccess_end(); + + return ret; +} + +/** + * sgx_virt_einit() - Run EINIT on behalf of guest + * @sigstruct: Userspace pointer to SIGSTRUCT structure + * @token: Userspace pointer to EINITTOKEN structure + * @secs: Userspace pointer to SECS page + * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values + * @trapnr: trap number injected to guest in case of EINIT error + * + * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available + * in host, SGX driver may rewrite the hardware values at wish, therefore KVM + * needs to update hardware values to guest's virtual MSR values in order to + * ensure EINIT is executed with expected hardware values. + * + * Return: + * - 0: EINIT was successful. + * - <0: on error. + */ +int sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs, u64 *lepubkeyhash, int *trapnr) +{ + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) { + ret = __sgx_virt_einit(sigstruct, token, secs); + } else { + preempt_disable(); + + sgx_update_lepubkeyhash(lepubkeyhash); + + ret = __sgx_virt_einit(sigstruct, token, secs); + preempt_enable(); + } + + /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */ + if (ret == -EINVAL) + return ret; + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sgx_virt_einit); diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 8678864ce712..132a2de44d2f 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -30,7 +30,7 @@ EXPORT_SYMBOL(__max_die_per_package); #ifdef CONFIG_SMP /* - * Check if given CPUID extended toplogy "leaf" is implemented + * Check if given CPUID extended topology "leaf" is implemented */ static int check_extended_topology_leaf(int leaf) { @@ -44,7 +44,7 @@ static int check_extended_topology_leaf(int leaf) return 0; } /* - * Return best CPUID Extended Toplogy Leaf supported + * Return best CPUID Extended Topology Leaf supported */ static int detect_extended_topology_leaf(struct cpuinfo_x86 *c) { diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index c6ede3b3d302..c04b933f48d3 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -27,6 +27,7 @@ #include <linux/clocksource.h> #include <linux/cpu.h> #include <linux/reboot.h> +#include <linux/static_call.h> #include <asm/div64.h> #include <asm/x86_init.h> #include <asm/hypervisor.h> @@ -336,11 +337,11 @@ static void __init vmware_paravirt_ops_setup(void) vmware_cyc2ns_setup(); if (vmw_sched_clock) - pv_ops.time.sched_clock = vmware_sched_clock; + paravirt_set_sched_clock(vmware_sched_clock); if (vmware_is_stealclock_available()) { has_steal_clock = true; - pv_ops.time.steal_clock = vmware_steal_clock; + static_call_update(pv_steal_clock, vmware_steal_clock); /* We use reboot notifier only to disable steal clock */ register_reboot_notifier(&vmware_pv_reboot_nb); @@ -378,6 +379,8 @@ static void __init vmware_set_capabilities(void) { setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + if (vmware_tsc_khz) + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL) setup_force_cpu_cap(X86_FEATURE_VMCALL); else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index a8f3af257e26..b1deacbeb266 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -337,7 +337,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) struct crash_memmap_data cmd; struct crash_mem *cmem; - cmem = vzalloc(sizeof(struct crash_mem)); + cmem = vzalloc(struct_size(cmem, ranges, 1)); if (!cmem) return -ENOMEM; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 22aad412f965..f74cb7da9557 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -793,7 +793,7 @@ core_initcall(e820__register_nvs_regions); #endif /* - * Allocate the requested number of bytes with the requsted alignment + * Allocate the requested number of bytes with the requested alignment * and return (the physical address) to the caller. Also register this * range in the 'kexec' E820 table as a reserved range. * diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 683749b80ae2..a85c64000218 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -253,7 +253,7 @@ static bool xfeature_enabled(enum xfeature xfeature) static void __init setup_xstate_features(void) { u32 eax, ebx, ecx, edx, i; - /* start at the beginnning of the "extended state" */ + /* start at the beginning of the "extended state" */ unsigned int last_good_offset = offsetof(struct xregs_state, extended_state_area); /* diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 5e9beb77cafd..18be44163a50 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -104,7 +104,7 @@ static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr) static bool __head check_la57_support(unsigned long physaddr) { /* - * 5-level paging is detected and enabled at kernel decomression + * 5-level paging is detected and enabled at kernel decompression * stage. Only check if it has been enabled there. */ if (!(native_read_cr4() & X86_CR4_LA57)) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index ee1a283f8e96..d552f177eca0 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -245,7 +245,7 @@ static const __initconst struct idt_data ist_idts[] = { * after that. * * Note, that X86_64 cannot install the real #PF handler in - * idt_setup_early_traps() because the memory intialization needs the #PF + * idt_setup_early_traps() because the memory initialization needs the #PF * handler from the early_idt_handler_array to initialize the early page * tables. */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 58aa712973ac..e28f6a5d14f1 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -338,7 +338,7 @@ void fixup_irqs(void) irq_migrate_all_off_this_cpu(); /* - * We can remove mdelay() and then send spuriuous interrupts to + * We can remove mdelay() and then send spurious interrupts to * new cpu targets for all the irqs that were handled previously by * this cpu. While it works, I have seen spurious interrupt messages * (nothing wrong but still...). diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index ff7878df96b4..3a43a2dee658 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -17,7 +17,7 @@ * Updated by: Tom Rini <trini@kernel.crashing.org> * Updated by: Jason Wessel <jason.wessel@windriver.com> * Modified for 386 by Jim Kingdon, Cygnus Support. - * Origianl kgdb, compatibility with 2.1.xx kernel by + * Original kgdb, compatibility with 2.1.xx kernel by * David Grothe <dave@gcom.com> * Integrated into 2.2.5 kernel by Tigran Aivazian <tigran@sco.com> * X86_64 changes from Andi Kleen's patch merged by Jim Houston @@ -642,7 +642,7 @@ void kgdb_arch_late(void) struct perf_event **pevent; /* - * Pre-allocate the hw breakpoint structions in the non-atomic + * Pre-allocate the hw breakpoint instructions in the non-atomic * portion of kgdb because this operation requires mutexs to * complete. */ diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 373e5fa3ce1f..596de2f6d3a5 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -12,7 +12,7 @@ #include "common.h" -/* Ftrace callback handler for kprobes -- called under preepmt disabed */ +/* Ftrace callback handler for kprobes -- called under preempt disabled */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5e78e01ca3b4..172c947240b9 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -650,7 +650,7 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { has_steal_clock = 1; - pv_ops.time.steal_clock = kvm_steal_clock; + static_call_update(pv_steal_clock, kvm_steal_clock); } if (pv_tlb_flush_supported()) { @@ -836,28 +836,25 @@ static void kvm_kick_cpu(int cpu) static void kvm_wait(u8 *ptr, u8 val) { - unsigned long flags; - if (in_nmi()) return; - local_irq_save(flags); - - if (READ_ONCE(*ptr) != val) - goto out; - /* * halt until it's our turn and kicked. Note that we do safe halt * for irq enabled case to avoid hang when lock info is overwritten * in irq spinlock slowpath and no spurious interrupt occur to save us. */ - if (arch_irqs_disabled_flags(flags)) - halt(); - else - safe_halt(); + if (irqs_disabled()) { + if (READ_ONCE(*ptr) == val) + halt(); + } else { + local_irq_disable(); -out: - local_irq_restore(flags); + if (READ_ONCE(*ptr) == val) + safe_halt(); + + local_irq_enable(); + } } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index aa593743acf6..d37ed4e1d033 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -106,7 +106,7 @@ static inline void kvm_sched_clock_init(bool stable) if (!stable) clear_sched_clock_stable(); kvm_sched_clock_offset = kvm_clock_read(); - pv_ops.time.sched_clock = kvm_sched_clock_read; + paravirt_set_sched_clock(kvm_sched_clock_read); pr_info("kvm-clock: using sched offset of %llu cycles", kvm_sched_clock_offset); @@ -268,21 +268,20 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { -#ifdef CONFIG_X86_64 - u8 flags; + kvmclock_init_mem(); - if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) - return 0; +#ifdef CONFIG_X86_64 + if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) { + u8 flags; - flags = pvclock_read_flags(&hv_clock_boot[0].pvti); - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) - return 0; + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 0; - kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + } #endif - kvmclock_init_mem(); - return 0; } early_initcall(kvm_setup_vsyscall_timeinfo); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index a29a44a98e5b..f01cd9a08155 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -260,7 +260,7 @@ static void set_idt(void *newidt, u16 limit) { struct desc_ptr curidt; - /* x86-64 supports unaliged loads & stores */ + /* x86-64 supports unaligned loads & stores */ curidt.size = limit; curidt.address = (unsigned long)newidt; diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 4f75d0cf6305..9e1ea99ad9df 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -32,3 +32,12 @@ bool pv_is_native_vcpu_is_preempted(void) return pv_ops.lock.vcpu_is_preempted.func == __raw_callee_save___native_vcpu_is_preempted; } + +void __init paravirt_set_cap(void) +{ + if (!pv_is_native_spin_unlock()) + setup_force_cpu_cap(X86_FEATURE_PVUNLOCK); + + if (!pv_is_native_vcpu_is_preempted()) + setup_force_cpu_cap(X86_FEATURE_VCPUPREEMPT); +} diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c60222ab8ab9..d0730264786b 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -14,6 +14,7 @@ #include <linux/highmem.h> #include <linux/kprobes.h> #include <linux/pgtable.h> +#include <linux/static_call.h> #include <asm/bug.h> #include <asm/paravirt.h> @@ -52,7 +53,10 @@ void __init default_banner(void) } /* Undefined instruction for dealing with missing ops pointers. */ -static const unsigned char ud2a[] = { 0x0f, 0x0b }; +static void paravirt_BUG(void) +{ + BUG(); +} struct branch { unsigned char opcode; @@ -85,25 +89,6 @@ u64 notrace _paravirt_ident_64(u64 x) { return x; } - -static unsigned paravirt_patch_jmp(void *insn_buff, const void *target, - unsigned long addr, unsigned len) -{ - struct branch *b = insn_buff; - unsigned long delta = (unsigned long)target - (addr+5); - - if (len < 5) { -#ifdef CONFIG_RETPOLINE - WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr); -#endif - return len; /* call too long for patch site */ - } - - b->opcode = 0xe9; /* jmp */ - b->delta = delta; - - return 5; -} #endif DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); @@ -114,8 +99,8 @@ void __init native_pv_lock_init(void) static_branch_disable(&virt_spin_lock_key); } -unsigned paravirt_patch_default(u8 type, void *insn_buff, - unsigned long addr, unsigned len) +unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, + unsigned int len) { /* * Neat trick to map patch type back to the call within the @@ -125,20 +110,10 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff, unsigned ret; if (opfunc == NULL) - /* If there's no function, patch it with a ud2a (BUG) */ - ret = paravirt_patch_insns(insn_buff, len, ud2a, ud2a+sizeof(ud2a)); + /* If there's no function, patch it with paravirt_BUG() */ + ret = paravirt_patch_call(insn_buff, paravirt_BUG, addr, len); else if (opfunc == _paravirt_nop) ret = 0; - -#ifdef CONFIG_PARAVIRT_XXL - /* identity functions just return their single argument */ - else if (opfunc == _paravirt_ident_64) - ret = paravirt_patch_ident_64(insn_buff, len); - - else if (type == PARAVIRT_PATCH(cpu.iret)) - /* If operation requires a jmp, then jmp */ - ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len); -#endif else /* Otherwise call the function. */ ret = paravirt_patch_call(insn_buff, opfunc, addr, len); @@ -146,19 +121,6 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff, return ret; } -unsigned paravirt_patch_insns(void *insn_buff, unsigned len, - const char *start, const char *end) -{ - unsigned insn_len = end - start; - - /* Alternative instruction is too large for the patch site and we cannot continue: */ - BUG_ON(insn_len > len || start == NULL); - - memcpy(insn_buff, start, insn_len); - - return insn_len; -} - struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; @@ -167,6 +129,14 @@ static u64 native_steal_clock(int cpu) return 0; } +DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); +DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock); + +void paravirt_set_sched_clock(u64 (*func)(void)) +{ + static_call_update(pv_sched_clock, func); +} + /* These are in entry.S */ extern void native_iret(void); @@ -269,13 +239,6 @@ struct pv_info pv_info = { #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) struct paravirt_patch_template pv_ops = { - /* Init ops. */ - .init.patch = native_patch, - - /* Time ops. */ - .time.sched_clock = native_sched_clock, - .time.steal_clock = native_steal_clock, - /* Cpu ops. */ .cpu.io_delay = native_io_delay, @@ -308,8 +271,6 @@ struct paravirt_patch_template pv_ops = { .cpu.load_sp0 = native_load_sp0, - .cpu.iret = native_iret, - #ifdef CONFIG_X86_IOPL_IOPERM .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap, .cpu.update_io_bitmap = native_tss_update_io_bitmap, @@ -414,6 +375,8 @@ struct paravirt_patch_template pv_ops = { NOKPROBE_SYMBOL(native_get_debugreg); NOKPROBE_SYMBOL(native_set_debugreg); NOKPROBE_SYMBOL(native_load_idt); + +void (*paravirt_iret)(void) = native_iret; #endif EXPORT_SYMBOL(pv_ops); diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c deleted file mode 100644 index abd27ec67397..000000000000 --- a/arch/x86/kernel/paravirt_patch.c +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/stringify.h> - -#include <asm/paravirt.h> -#include <asm/asm-offsets.h> - -#define PSTART(d, m) \ - patch_data_##d.m - -#define PEND(d, m) \ - (PSTART(d, m) + sizeof(patch_data_##d.m)) - -#define PATCH(d, m, insn_buff, len) \ - paravirt_patch_insns(insn_buff, len, PSTART(d, m), PEND(d, m)) - -#define PATCH_CASE(ops, m, data, insn_buff, len) \ - case PARAVIRT_PATCH(ops.m): \ - return PATCH(data, ops##_##m, insn_buff, len) - -#ifdef CONFIG_PARAVIRT_XXL -struct patch_xxl { - const unsigned char irq_irq_disable[1]; - const unsigned char irq_irq_enable[1]; - const unsigned char irq_save_fl[2]; - const unsigned char mmu_read_cr2[3]; - const unsigned char mmu_read_cr3[3]; - const unsigned char mmu_write_cr3[3]; - const unsigned char cpu_wbinvd[2]; - const unsigned char mov64[3]; -}; - -static const struct patch_xxl patch_data_xxl = { - .irq_irq_disable = { 0xfa }, // cli - .irq_irq_enable = { 0xfb }, // sti - .irq_save_fl = { 0x9c, 0x58 }, // pushf; pop %[re]ax - .mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax - .mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax - .mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3 - .cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd - .mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax -}; - -unsigned int paravirt_patch_ident_64(void *insn_buff, unsigned int len) -{ - return PATCH(xxl, mov64, insn_buff, len); -} -# endif /* CONFIG_PARAVIRT_XXL */ - -#ifdef CONFIG_PARAVIRT_SPINLOCKS -struct patch_lock { - unsigned char queued_spin_unlock[3]; - unsigned char vcpu_is_preempted[2]; -}; - -static const struct patch_lock patch_data_lock = { - .vcpu_is_preempted = { 0x31, 0xc0 }, // xor %eax, %eax - -# ifdef CONFIG_X86_64 - .queued_spin_unlock = { 0xc6, 0x07, 0x00 }, // movb $0, (%rdi) -# else - .queued_spin_unlock = { 0xc6, 0x00, 0x00 }, // movb $0, (%eax) -# endif -}; -#endif /* CONFIG_PARAVIRT_SPINLOCKS */ - -unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr, - unsigned int len) -{ - switch (type) { - -#ifdef CONFIG_PARAVIRT_XXL - PATCH_CASE(irq, save_fl, xxl, insn_buff, len); - PATCH_CASE(irq, irq_enable, xxl, insn_buff, len); - PATCH_CASE(irq, irq_disable, xxl, insn_buff, len); - - PATCH_CASE(mmu, read_cr2, xxl, insn_buff, len); - PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len); - PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len); - - PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len); -#endif - -#ifdef CONFIG_PARAVIRT_SPINLOCKS - case PARAVIRT_PATCH(lock.queued_spin_unlock): - if (pv_is_native_spin_unlock()) - return PATCH(lock, queued_spin_unlock, insn_buff, len); - break; - - case PARAVIRT_PATCH(lock.vcpu_is_preempted): - if (pv_is_native_vcpu_is_preempted()) - return PATCH(lock, vcpu_is_preempted, insn_buff, len); - break; -#endif - default: - break; - } - - return paravirt_patch_default(type, insn_buff, addr, len); -} diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9c214d7085a4..43cbfc84153a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -63,14 +63,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { */ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, - /* - * .sp1 is cpu_current_top_of_stack. The init task never - * runs user code, but cpu_current_top_of_stack should still - * be well defined before the first context switch. - */ +#ifdef CONFIG_X86_32 .sp1 = TOP_OF_INIT_STACK, -#ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, #endif @@ -451,7 +446,7 @@ void speculative_store_bypass_ht_init(void) * First HT sibling to come up on the core. Link shared state of * the first HT sibling to itself. The siblings on the same core * which come up later will see the shared state pointer and link - * themself to the state of this CPU. + * themselves to the state of this CPU. */ st->shared_state = st; } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 11065dc03f5b..eda37df016f0 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -89,7 +89,7 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) /* * Assumption here is that last_value, a global accumulator, always goes * forward. If we are less than that, we should not be much smaller. - * We assume there is an error marging we're inside, and then the correction + * We assume there is an error margin we're inside, and then the correction * does not sacrifice accuracy. * * For reads: global may have changed between test and return, diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 94b33885f8d2..f469153eca8a 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -107,7 +107,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) * - Write protect disabled * - No task switch * - Don't do FP software emulation. - * - Proctected mode enabled + * - Protected mode enabled */ movl %cr0, %eax andl $~(X86_CR0_PG | X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %eax diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index a4d9a261425b..c53271aebb64 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -121,7 +121,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) * - Write protect disabled * - No task switch * - Don't do FP software emulation. - * - Proctected mode enabled + * - Protected mode enabled */ movq %cr0, %rax andq $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d883176ef2ce..69757fac7462 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -65,7 +65,7 @@ RESERVE_BRK(dmi_alloc, 65536); /* * Range of the BSS area. The size of the BSS area is determined - * at link time, with RESERVE_BRK*() facility reserving additional + * at link time, with RESERVE_BRK() facility reserving additional * chunks. */ unsigned long _brk_start = (unsigned long)__brk_base; @@ -633,11 +633,16 @@ static void __init trim_snb_memory(void) printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n"); /* - * Reserve all memory below the 1 MB mark that has not - * already been reserved. + * SandyBridge integrated graphics devices have a bug that prevents + * them from accessing certain memory ranges, namely anything below + * 1M and in the pages listed in bad_pages[] above. + * + * To avoid these pages being ever accessed by SNB gfx devices + * reserve all memory below the 1 MB mark and bad_pages that have + * not already been reserved at boot time. */ memblock_reserve(0, 1<<20); - + for (i = 0; i < ARRAY_SIZE(bad_pages); i++) { if (memblock_reserve(bad_pages[i], PAGE_SIZE)) printk(KERN_WARNING "failed to reserve 0x%08lx\n", @@ -645,18 +650,6 @@ static void __init trim_snb_memory(void) } } -/* - * Here we put platform-specific memory range workarounds, i.e. - * memory known to be corrupt or otherwise in need to be reserved on - * specific platforms. - * - * If this gets used more widely it could use a real dispatch mechanism. - */ -static void __init trim_platform_memory_ranges(void) -{ - trim_snb_memory(); -} - static void __init trim_bios_range(void) { /* @@ -725,11 +718,41 @@ static int __init parse_reservelow(char *p) early_param("reservelow", parse_reservelow); -static void __init trim_low_memory_range(void) +static void __init early_reserve_memory(void) { + /* + * Reserve the memory occupied by the kernel between _text and + * __end_of_kernel_reserve symbols. Any kernel sections after the + * __end_of_kernel_reserve symbol must be explicitly reserved with a + * separate memblock_reserve() or they will be discarded. + */ + memblock_reserve(__pa_symbol(_text), + (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); + + /* + * The first 4Kb of memory is a BIOS owned area, but generally it is + * not listed as such in the E820 table. + * + * Reserve the first memory page and typically some additional + * memory (64KiB by default) since some BIOSes are known to corrupt + * low memory. See the Kconfig help text for X86_RESERVE_LOW. + * + * In addition, make sure page 0 is always reserved because on + * systems with L1TF its contents can be leaked to user processes. + */ memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); + + early_reserve_initrd(); + + if (efi_enabled(EFI_BOOT)) + efi_memblock_x86_reserve_range(); + + memblock_x86_reserve_range_setup_data(); + + reserve_ibft_region(); + reserve_bios_regions(); } - + /* * Dump out kernel offset information on panic. */ @@ -764,29 +787,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) void __init setup_arch(char **cmdline_p) { - /* - * Reserve the memory occupied by the kernel between _text and - * __end_of_kernel_reserve symbols. Any kernel sections after the - * __end_of_kernel_reserve symbol must be explicitly reserved with a - * separate memblock_reserve() or they will be discarded. - */ - memblock_reserve(__pa_symbol(_text), - (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); - - /* - * Make sure page 0 is always reserved because on systems with - * L1TF its contents can be leaked to user processes. - */ - memblock_reserve(0, PAGE_SIZE); - - early_reserve_initrd(); - - /* - * At this point everything still needed from the boot loader - * or BIOS or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. - */ - #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); @@ -910,8 +910,18 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); - if (efi_enabled(EFI_BOOT)) - efi_memblock_x86_reserve_range(); + /* + * Do some memory reservations *before* memory is added to + * memblock, so memblock allocations won't overwrite it. + * Do it after early param, so we could get (unlikely) panic from + * serial. + * + * After this point everything still needed from the boot loader or + * firmware or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + early_reserve_memory(); + #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux @@ -938,9 +948,6 @@ void __init setup_arch(char **cmdline_p) x86_report_nx(); - /* after early param, so could get panic from serial */ - memblock_x86_reserve_range_setup_data(); - if (acpi_mps_check()) { #ifdef CONFIG_X86_LOCAL_APIC disable_apic = 1; @@ -1032,14 +1039,12 @@ void __init setup_arch(char **cmdline_p) */ find_smp_config(); - reserve_ibft_region(); - early_alloc_pgt_buf(); /* * Need to conclude brk, before e820__memblock_setup() - * it could use memblock_find_in_range, could overlap with - * brk area. + * it could use memblock_find_in_range, could overlap with + * brk area. */ reserve_brk(); @@ -1054,8 +1059,6 @@ void __init setup_arch(char **cmdline_p) */ sev_setup_arch(); - reserve_bios_regions(); - efi_fake_memmap(); efi_find_mirror(); efi_esrt_init(); @@ -1081,8 +1084,12 @@ void __init setup_arch(char **cmdline_p) reserve_real_mode(); - trim_platform_memory_ranges(); - trim_low_memory_range(); + /* + * Reserving memory causing GPU hangs on Sandy Bridge integrated + * graphics devices should be done after we allocated memory under + * 1M for the real mode trampoline. + */ + trim_snb_memory(); init_mem_mapping(); @@ -1129,6 +1136,8 @@ void __init setup_arch(char **cmdline_p) reserve_initrd(); acpi_table_upgrade(); + /* Look for ACPI tables and reserve memory occupied by them. */ + acpi_boot_table_init(); vsmp_init(); @@ -1136,11 +1145,6 @@ void __init setup_arch(char **cmdline_p) early_platform_quirks(); - /* - * Parse the ACPI tables for possible boot-time SMP configuration. - */ - acpi_boot_table_init(); - early_acpi_boot_init(); initmem_init(); diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c index cdc04d091242..0aa9f13efd57 100644 --- a/arch/x86/kernel/sev-es-shared.c +++ b/arch/x86/kernel/sev-es-shared.c @@ -24,7 +24,7 @@ static bool __init sev_es_check_cpu_features(void) return true; } -static void sev_es_terminate(unsigned int reason) +static void __noreturn sev_es_terminate(unsigned int reason) { u64 val = GHCB_SEV_TERMINATE; @@ -186,7 +186,6 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) * make it accessible to the hypervisor. * * In particular, check for: - * - Hypervisor CPUID bit * - Availability of CPUID leaf 0x8000001f * - SEV CPUID bit. * @@ -194,10 +193,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) * can't be checked here. */ - if ((fn == 1 && !(regs->cx & BIT(31)))) - /* Hypervisor bit */ - goto fail; - else if (fn == 0x80000000 && (regs->ax < 0x8000001f)) + if (fn == 0x80000000 && (regs->ax < 0x8000001f)) /* SEV leaf check */ goto fail; else if ((fn == 0x8000001f && !(regs->ax & BIT(1)))) @@ -210,12 +206,8 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) return; fail: - sev_es_wr_ghcb_msr(GHCB_SEV_TERMINATE); - VMGEXIT(); - - /* Shouldn't get here - if we do halt the machine */ - while (true) - asm volatile("hlt\n"); + /* Terminate the guest */ + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); } static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c index 84c1821819af..26f5479a97a8 100644 --- a/arch/x86/kernel/sev-es.c +++ b/arch/x86/kernel/sev-es.c @@ -121,35 +121,57 @@ static void __init setup_vc_stacks(int cpu) cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); } -static __always_inline bool on_vc_stack(unsigned long sp) +static __always_inline bool on_vc_stack(struct pt_regs *regs) { + unsigned long sp = regs->sp; + + /* User-mode RSP is not trusted */ + if (user_mode(regs)) + return false; + + /* SYSCALL gap still has user-mode RSP */ + if (ip_within_syscall_gap(regs)) + return false; + return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); } /* - * This function handles the case when an NMI is raised in the #VC exception - * handler entry code. In this case, the IST entry for #VC must be adjusted, so - * that any subsequent #VC exception will not overwrite the stack contents of the - * interrupted #VC handler. + * This function handles the case when an NMI is raised in the #VC + * exception handler entry code, before the #VC handler has switched off + * its IST stack. In this case, the IST entry for #VC must be adjusted, + * so that any nested #VC exception will not overwrite the stack + * contents of the interrupted #VC handler. * * The IST entry is adjusted unconditionally so that it can be also be - * unconditionally adjusted back in sev_es_ist_exit(). Otherwise a nested - * sev_es_ist_exit() call may adjust back the IST entry too early. + * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a + * nested sev_es_ist_exit() call may adjust back the IST entry too + * early. + * + * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run + * on the NMI IST stack, as they are only called from NMI handling code + * right now. */ void noinstr __sev_es_ist_enter(struct pt_regs *regs) { unsigned long old_ist, new_ist; /* Read old IST entry */ - old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); - /* Make room on the IST stack */ - if (on_vc_stack(regs->sp)) - new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist); - else - new_ist = old_ist - sizeof(old_ist); + /* + * If NMI happened while on the #VC IST stack, set the new IST + * value below regs->sp, so that the interrupted stack frame is + * not overwritten by subsequent #VC exceptions. + */ + if (on_vc_stack(regs)) + new_ist = regs->sp; - /* Store old IST entry */ + /* + * Reserve additional 8 bytes and store old IST value so this + * adjustment can be unrolled in __sev_es_ist_exit(). + */ + new_ist -= sizeof(old_ist); *(unsigned long *)new_ist = old_ist; /* Set new IST entry */ @@ -248,7 +270,7 @@ static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) int res; if (user_mode(ctxt->regs)) { - res = insn_fetch_from_user(ctxt->regs, buffer); + res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); if (!res) { ctxt->fi.vector = X86_TRAP_PF; ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; @@ -267,7 +289,7 @@ static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) return ES_EXCEPTION; } - insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE - res, 1); + insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE, 1); insn_get_length(&ctxt->insn); } @@ -1248,13 +1270,12 @@ static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) { struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + irqentry_state_t irq_state; struct ghcb_state state; struct es_em_ctxt ctxt; enum es_result result; struct ghcb *ghcb; - lockdep_assert_irqs_disabled(); - /* * Handle #DB before calling into !noinstr code to avoid recursive #DB. */ @@ -1263,6 +1284,8 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) return; } + irq_state = irqentry_nmi_enter(regs); + lockdep_assert_irqs_disabled(); instrumentation_begin(); /* @@ -1325,6 +1348,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) out: instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); return; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index ea794a083c44..a06cb107c0e8 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -492,7 +492,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, * SS descriptor, but we do need SS to be valid. It's possible * that the old SS is entirely bogus -- this can happen if the * signal we're trying to deliver is #GP or #SS caused by a bad - * SS value. We also have a compatbility issue here: DOSEMU + * SS value. We also have a compatibility issue here: DOSEMU * relies on the contents of the SS register indicating the * SS value at the time of the signal, even though that code in * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU @@ -766,30 +766,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { - /* - * This function is fundamentally broken as currently - * implemented. - * - * The idea is that we want to trigger a call to the - * restart_block() syscall and that we want in_ia32_syscall(), - * in_x32_syscall(), etc. to match whatever they were in the - * syscall being restarted. We assume that the syscall - * instruction at (regs->ip - 2) matches whatever syscall - * instruction we used to enter in the first place. - * - * The problem is that we can get here when ptrace pokes - * syscall-like values into regs even if we're not in a syscall - * at all. - * - * For now, we maintain historical behavior and guess based on - * stored state. We could do better by saving the actual - * syscall arch in restart_block or (with caveats on x32) by - * checking if regs->ip points to 'int $0x80'. The current - * behavior is incorrect if a tracer has a different bitness - * than the tracee. - */ #ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED)) + if (current->restart_block.arch_data & TS_COMPAT) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index eff4ce3b10da..06db901fabe8 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -67,7 +67,7 @@ * 5AP. symmetric IO mode (normal Linux operation) not affected. * 'noapic' mode has vector 0xf filled out properly. * 6AP. 'noapic' mode might be affected - fixed in later steppings - * 7AP. We do not assume writes to the LVT deassering IRQs + * 7AP. We do not assume writes to the LVT deasserting IRQs * 8AP. We do not enable low power mode (deep sleep) during MP bootup * 9AP. We do not use mixed mode * @@ -204,7 +204,7 @@ static void native_stop_other_cpus(int wait) } /* * Don't wait longer than 10 ms if the caller didn't - * reqeust it. If wait is true, the machine hangs here if + * request it. If wait is true, the machine hangs here if * one or more CPUs do not reach shutdown state. */ timeout = USEC_PER_MSEC * 10; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 02813a7f3a7c..1e2050c4f94a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1407,7 +1407,7 @@ void __init calculate_max_logical_packages(void) int ncpus; /* - * Today neither Intel nor AMD support heterogenous systems so + * Today neither Intel nor AMD support heterogeneous systems so * extrapolate the boot cpu's data to all packages. */ ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); @@ -1659,13 +1659,17 @@ void play_dead_common(void) local_irq_disable(); } -static bool wakeup_cpu0(void) +/** + * cond_wakeup_cpu0 - Wake up CPU0 if needed. + * + * If NMI wants to wake up CPU0, start CPU0. + */ +void cond_wakeup_cpu0(void) { if (smp_processor_id() == 0 && enable_start_cpu0) - return true; - - return false; + start_cpu0(); } +EXPORT_SYMBOL_GPL(cond_wakeup_cpu0); /* * We need to flush the caches before going to sleep, lest we have @@ -1734,11 +1738,8 @@ static inline void mwait_play_dead(void) __monitor(mwait_ptr, 0, 0); mb(); __mwait(eax, 0); - /* - * If NMI wants to wake up CPU0, start CPU0. - */ - if (wakeup_cpu0()) - start_cpu0(); + + cond_wakeup_cpu0(); } } @@ -1749,11 +1750,8 @@ void hlt_play_dead(void) while (1) { native_halt(); - /* - * If NMI wants to wake up CPU0, start CPU0. - */ - if (wakeup_cpu0()) - start_cpu0(); + + cond_wakeup_cpu0(); } } diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 8627fda8d993..15b058eefc4e 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -29,12 +29,6 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, } } -/* - * This function returns an error if it detects any unreliable features of the - * stack. Otherwise it guarantees that the stack trace is reliable. - * - * If the task is not 'current', the caller *must* ensure the task is inactive. - */ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task) { diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c index 653b7f617b61..8a56a6d80098 100644 --- a/arch/x86/kernel/sysfb_efi.c +++ b/arch/x86/kernel/sysfb_efi.c @@ -10,7 +10,7 @@ * EFI Quirks * Several EFI systems do not correctly advertise their boot framebuffers. * Hence, we use this static table of known broken machines and fix up the - * information so framebuffer drivers can load corectly. + * information so framebuffer drivers can load correctly. */ #include <linux/dmi.h> diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 4c09ba110204..f9af561c3cd4 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -49,6 +49,30 @@ bool tboot_enabled(void) return tboot != NULL; } +/* noinline to prevent gcc from warning about dereferencing constant fixaddr */ +static noinline __init bool check_tboot_version(void) +{ + if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { + pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr); + return false; + } + + if (tboot->version < 5) { + pr_warn("tboot version is invalid: %u\n", tboot->version); + return false; + } + + pr_info("found shared page at phys addr 0x%llx:\n", + boot_params.tboot_addr); + pr_debug("version: %d\n", tboot->version); + pr_debug("log_addr: 0x%08x\n", tboot->log_addr); + pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); + pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); + pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); + + return true; +} + void __init tboot_probe(void) { /* Look for valid page-aligned address for shared page. */ @@ -66,25 +90,9 @@ void __init tboot_probe(void) /* Map and check for tboot UUID. */ set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); - tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); - if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { - pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr); + tboot = (void *)fix_to_virt(FIX_TBOOT_BASE); + if (!check_tboot_version()) tboot = NULL; - return; - } - if (tboot->version < 5) { - pr_warn("tboot version is invalid: %u\n", tboot->version); - tboot = NULL; - return; - } - - pr_info("found shared page at phys addr 0x%llx:\n", - boot_params.tboot_addr); - pr_debug("version: %d\n", tboot->version); - pr_debug("log_addr: 0x%08x\n", tboot->log_addr); - pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); - pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); - pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); } static pgd_t *tboot_pg_dir; diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index f5477eab5692..bd83748e2bde 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -113,7 +113,7 @@ int arch_register_cpu(int num) * Two known BSP/CPU0 dependencies: Resume from suspend/hibernate * depends on BSP. PIC interrupts depend on BSP. * - * If the BSP depencies are under control, one can tell kernel to + * If the BSP dependencies are under control, one can tell kernel to * enable BSP hotplug. This basically adds a control file and * one can attempt to offline BSP. */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7f5aec758f0e..48881db8dca1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -395,7 +395,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) /* * Adjust our frame so that we return straight to the #GP * vector with the expected RSP value. This is safe because - * we won't enable interupts or schedule before we invoke + * we won't enable interrupts or schedule before we invoke * general_protection, so nothing will clobber the stack * frame we just set up. * @@ -556,7 +556,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) tsk->thread.trap_nr = X86_TRAP_GP; if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0)) - return; + goto exit; show_signal(tsk, SIGSEGV, "", desc, regs, error_code); force_sig(SIGSEGV); @@ -694,8 +694,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r * In the SYSCALL entry path the RSP value comes from user-space - don't * trust it and switch to the current kernel stack */ - if (regs->ip >= (unsigned long)entry_SYSCALL_64 && - regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack) { + if (ip_within_syscall_gap(regs)) { sp = this_cpu_read(cpu_current_top_of_stack); goto sync; } @@ -1058,7 +1057,7 @@ static void math_error(struct pt_regs *regs, int trapnr) goto exit; if (fixup_vdso_exception(regs, trapnr, 0, 0)) - return; + goto exit; force_sig_fault(SIGFPE, si_code, (void __user *)uprobe_get_trap_addr(regs)); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index f70dffc2771f..57ec01192180 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -14,6 +14,7 @@ #include <linux/percpu.h> #include <linux/timex.h> #include <linux/static_key.h> +#include <linux/static_call.h> #include <asm/hpet.h> #include <asm/timer.h> @@ -254,7 +255,7 @@ unsigned long long sched_clock(void) bool using_native_sched_clock(void) { - return pv_ops.time.sched_clock == native_sched_clock; + return static_call_query(pv_sched_clock) == native_sched_clock; } #else unsigned long long @@ -739,7 +740,7 @@ static unsigned long pit_hpet_ptimer_calibrate_cpu(void) * 2) Reference counter. If available we use the HPET or the * PMTIMER as a reference to check the sanity of that value. * We use separate TSC readouts and check inside of the - * reference read for any possible disturbance. We dicard + * reference read for any possible disturbance. We discard * disturbed values here as well. We do that around the PIT * calibration delay loop as we have to wait for a certain * amount of time anyway. @@ -1079,7 +1080,7 @@ static void tsc_resume(struct clocksource *cs) * very small window right after one CPU updated cycle_last under * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which * is smaller than the cycle_last reference value due to a TSC which - * is slighty behind. This delta is nowhere else observable, but in + * is slightly behind. This delta is nowhere else observable, but in * that case it results in a forward time jump in the range of hours * due to the unsigned delta calculation of the time keeping core * code, which is necessary to support wrapping clocksources like pm @@ -1264,7 +1265,7 @@ EXPORT_SYMBOL(convert_art_to_tsc); * corresponding clocksource * @cycles: System counter value * @cs: Clocksource corresponding to system counter value. Used - * by timekeeping code to verify comparibility of two cycle + * by timekeeping code to verify comparability of two cycle * values. */ diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 3d3c761eb74a..50a4515fe0ad 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -472,7 +472,7 @@ retry: /* * Add the result to the previous adjustment value. * - * The adjustement value is slightly off by the overhead of the + * The adjustment value is slightly off by the overhead of the * sync mechanism (observed values are ~200 TSC cycles), but this * really depends on CPU, node distance and frequency. So * compensating for this is hard to get right. Experiments show diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index f6225bf22c02..fac1daae7994 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -272,7 +272,7 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, * by whether the operand is a register or a memory location. * If operand is a register, return as many bytes as the operand * size. If operand is memory, return only the two least - * siginificant bytes. + * significant bytes. */ if (X86_MODRM_MOD(insn->modrm.value) == 3) *data_size = insn->opnd_bytes; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 2a1d47f47eee..a1202536fc57 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -13,7 +13,7 @@ #define orc_warn_current(args...) \ ({ \ - if (state->task == current) \ + if (state->task == current && !state->error) \ orc_warn(args); \ }) @@ -367,8 +367,8 @@ static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) return false; - *ip = regs->ip; - *sp = regs->sp; + *ip = READ_ONCE_NOCHECK(regs->ip); + *sp = READ_ONCE_NOCHECK(regs->sp); return true; } @@ -380,8 +380,8 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) return false; - *ip = regs->ip; - *sp = regs->sp; + *ip = READ_ONCE_NOCHECK(regs->ip); + *sp = READ_ONCE_NOCHECK(regs->sp); return true; } @@ -402,12 +402,12 @@ static bool get_reg(struct unwind_state *state, unsigned int reg_off, return false; if (state->full_regs) { - *val = ((unsigned long *)state->regs)[reg]; + *val = READ_ONCE_NOCHECK(((unsigned long *)state->regs)[reg]); return true; } if (state->prev_regs) { - *val = ((unsigned long *)state->prev_regs)[reg]; + *val = READ_ONCE_NOCHECK(((unsigned long *)state->prev_regs)[reg]); return true; } diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index a788d5120d4d..f6b93a35ce14 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -84,6 +84,18 @@ config KVM_INTEL To compile this as a module, choose M here: the module will be called kvm-intel. +config X86_SGX_KVM + bool "Software Guard eXtensions (SGX) Virtualization" + depends on X86_SGX && KVM_INTEL + help + + Enables KVM guests to create SGX enclaves. + + This includes support to expose "raw" unreclaimable enclave memory to + guests via a device node, e.g. /dev/sgx_vepc. + + If unsure, say N. + config KVM_AMD tristate "KVM for AMD processors support" depends on KVM diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 1b4766fe1de2..eafc4d601f25 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-y += -Iarch/x86/kvm +ccflags-y += -I $(srctree)/arch/x86/kvm ccflags-$(CONFIG_KVM_WERROR) += -Werror ifeq ($(CONFIG_FRAME_POINTER),y) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6bd2f8b830e4..c02466a1410b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1033,7 +1033,7 @@ EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); * - Centaur: 0xc0000000 - 0xcfffffff * * The Hypervisor class is further subdivided into sub-classes that each act as - * their own indepdent class associated with a 0x100 byte range. E.g. if Qemu + * their own independent class associated with a 0x100 byte range. E.g. if Qemu * is advertising support for both HyperV and KVM, the resulting Hypervisor * CPUID sub-classes are: * diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f7970ba6219f..cdd2a2b6550e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3222,7 +3222,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, } /* - * Now load segment descriptors. If fault happenes at this stage + * Now load segment descriptors. If fault happens at this stage * it is handled in a context of new task */ ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 58fa8c029867..f98370a39936 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -520,10 +520,10 @@ static u64 get_time_ref_counter(struct kvm *kvm) u64 tsc; /* - * The guest has not set up the TSC page or the clock isn't - * stable, fall back to get_kvmclock_ns. + * Fall back to get_kvmclock_ns() when TSC page hasn't been set up, + * is broken, disabled or being updated. */ - if (!hv->tsc_ref.tsc_sequence) + if (hv->hv_tsc_page_status != HV_TSC_PAGE_SET) return div_u64(get_kvmclock_ns(kvm), 100); vcpu = kvm_get_vcpu(kvm, 0); @@ -1077,6 +1077,21 @@ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, return true; } +/* + * Don't touch TSC page values if the guest has opted for TSC emulation after + * migration. KVM doesn't fully support reenlightenment notifications and TSC + * access emulation and Hyper-V is known to expect the values in TSC page to + * stay constant before TSC access emulation is disabled from guest side + * (HV_X64_MSR_TSC_EMULATION_STATUS). KVM userspace is expected to preserve TSC + * frequency and guest visible TSC value across migration (and prevent it when + * TSC scaling is unsupported). + */ +static inline bool tsc_page_update_unsafe(struct kvm_hv *hv) +{ + return (hv->hv_tsc_page_status != HV_TSC_PAGE_GUEST_CHANGED) && + hv->hv_tsc_emulation_control; +} + void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock) { @@ -1087,7 +1102,8 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); - if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || + hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET) return; mutex_lock(&hv->hv_lock); @@ -1101,7 +1117,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, */ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn), &tsc_seq, sizeof(tsc_seq)))) + goto out_err; + + if (tsc_seq && tsc_page_update_unsafe(hv)) { + if (kvm_read_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) + goto out_err; + + hv->hv_tsc_page_status = HV_TSC_PAGE_SET; goto out_unlock; + } /* * While we're computing and writing the parameters, force the @@ -1110,15 +1134,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, hv->tsc_ref.tsc_sequence = 0; if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) - goto out_unlock; + goto out_err; if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref)) - goto out_unlock; + goto out_err; /* Ensure sequence is zero before writing the rest of the struct. */ smp_wmb(); if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) - goto out_unlock; + goto out_err; /* * Now switch to the TSC page mechanism by writing the sequence. @@ -1131,8 +1155,45 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, smp_wmb(); hv->tsc_ref.tsc_sequence = tsc_seq; - kvm_write_guest(kvm, gfn_to_gpa(gfn), - &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)); + if (kvm_write_guest(kvm, gfn_to_gpa(gfn), + &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) + goto out_err; + + hv->hv_tsc_page_status = HV_TSC_PAGE_SET; + goto out_unlock; + +out_err: + hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; +out_unlock: + mutex_unlock(&hv->hv_lock); +} + +void kvm_hv_invalidate_tsc_page(struct kvm *kvm) +{ + struct kvm_hv *hv = to_kvm_hv(kvm); + u64 gfn; + + if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || + hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET || + tsc_page_update_unsafe(hv)) + return; + + mutex_lock(&hv->hv_lock); + + if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + goto out_unlock; + + /* Preserve HV_TSC_PAGE_GUEST_CHANGED/HV_TSC_PAGE_HOST_CHANGED states */ + if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET) + hv->hv_tsc_page_status = HV_TSC_PAGE_UPDATING; + + gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; + + hv->tsc_ref.tsc_sequence = 0; + if (kvm_write_guest(kvm, gfn_to_gpa(gfn), + &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) + hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; + out_unlock: mutex_unlock(&hv->hv_lock); } @@ -1193,8 +1254,15 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, } case HV_X64_MSR_REFERENCE_TSC: hv->hv_tsc_page = data; - if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) + if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) { + if (!host) + hv->hv_tsc_page_status = HV_TSC_PAGE_GUEST_CHANGED; + else + hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED; kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + } else { + hv->hv_tsc_page_status = HV_TSC_PAGE_UNSET; + } break; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: return kvm_hv_msr_set_crash_data(kvm, @@ -1229,6 +1297,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, hv->hv_tsc_emulation_control = data; break; case HV_X64_MSR_TSC_EMULATION_STATUS: + if (data && !host) + return 1; + hv->hv_tsc_emulation_status = data; break; case HV_X64_MSR_TIME_REF_COUNT: diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index e951af1fcb2c..60547d5cb6d7 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -133,6 +133,7 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu); void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock); +void kvm_hv_invalidate_tsc_page(struct kvm *kvm); void kvm_hv_init_vm(struct kvm *kvm); void kvm_hv_destroy_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 8a4de3f12820..d5b72a08e566 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -269,7 +269,7 @@ int kvm_set_routing_entry(struct kvm *kvm, const struct kvm_irq_routing_entry *ue) { /* We can't check irqchip_in_kernel() here as some callers are - * currently inititalizing the irqchip. Other callers should therefore + * currently initializing the irqchip. Other callers should therefore * check kvm_arch_can_set_irq_routing() before calling this function. */ switch (ue->type) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 45d40bfacb7c..cc369b9ad8f1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1642,7 +1642,16 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } if (kvm_use_posted_timer_interrupt(apic->vcpu)) { - kvm_wait_lapic_expire(vcpu); + /* + * Ensure the guest's timer has truly expired before posting an + * interrupt. Open code the relevant checks to avoid querying + * lapic_timer_int_injected(), which will be false since the + * interrupt isn't yet injected. Waiting until after injecting + * is not an option since that won't help a posted interrupt. + */ + if (vcpu->arch.apic->lapic_timer.expired_tscdeadline && + vcpu->arch.apic->lapic_timer.timer_advance_ns) + __kvm_wait_lapic_expire(vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } @@ -2595,6 +2604,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); + apic->lapic_timer.expired_tscdeadline = 0; apic_update_lvtt(apic); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d75524bc8423..62b1729277ef 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4961,7 +4961,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, /* * No need to care whether allocation memory is successful - * or not since pte prefetch is skiped if it does not have + * or not since pte prefetch is skipped if it does not have * enough objects in the cache. */ mmu_topup_memory_caches(vcpu, true); @@ -5884,6 +5884,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) struct kvm_mmu_page *sp; unsigned int ratio; LIST_HEAD(invalid_list); + bool flush = false; ulong to_zap; rcu_idx = srcu_read_lock(&kvm->srcu); @@ -5905,19 +5906,19 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) lpage_disallowed_link); WARN_ON_ONCE(!sp->lpage_disallowed); if (is_tdp_mmu_page(sp)) { - kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, - sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level)); + flush |= kvm_tdp_mmu_zap_sp(kvm, sp); } else { kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); WARN_ON_ONCE(sp->lpage_disallowed); } if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - kvm_mmu_commit_zap_page(kvm, &invalid_list); + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); cond_resched_rwlock_write(&kvm->mmu_lock); + flush = false; } } - kvm_mmu_commit_zap_page(kvm, &invalid_list); + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index ec4fc28b325a..360983865398 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -59,7 +59,7 @@ struct kvm_mmu_page { #ifdef CONFIG_X86_64 bool tdp_mmu_page; - /* Used for freeing the page asyncronously if it is a TDP MMU page. */ + /* Used for freeing the page asynchronously if it is a TDP MMU page. */ struct rcu_head rcu_head; #endif }; @@ -78,6 +78,11 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) return to_shadow_page(__pa(sptep)); } +static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) +{ + return sp->role.smm ? 1 : 0; +} + static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) { /* diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index e5f148106e20..b3ed302c1a35 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -21,6 +21,21 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level) } /* + * Return the TDP iterator to the root PT and allow it to continue its + * traversal over the paging structure from there. + */ +void tdp_iter_restart(struct tdp_iter *iter) +{ + iter->yielded_gfn = iter->next_last_level_gfn; + iter->level = iter->root_level; + + iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + iter->valid = true; +} + +/* * Sets a TDP iterator to walk a pre-order traversal of the paging structure * rooted at root_pt, starting with the walk to translate next_last_level_gfn. */ @@ -31,16 +46,12 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); iter->next_last_level_gfn = next_last_level_gfn; - iter->yielded_gfn = iter->next_last_level_gfn; iter->root_level = root_level; iter->min_level = min_level; - iter->level = root_level; - iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt; - - iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); - tdp_iter_refresh_sptep(iter); + iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt; + iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt)); - iter->valid = true; + tdp_iter_restart(iter); } /* @@ -159,8 +170,3 @@ void tdp_iter_next(struct tdp_iter *iter) iter->valid = false; } -tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter) -{ - return iter->pt_path[iter->root_level - 1]; -} - diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index 4cc177d75c4a..b1748b988d3a 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -36,6 +36,8 @@ struct tdp_iter { int min_level; /* The iterator's current level within the paging structure */ int level; + /* The address space ID, i.e. SMM vs. regular. */ + int as_id; /* A snapshot of the value at sptep */ u64 old_spte; /* @@ -62,6 +64,6 @@ tdp_ptep_t spte_to_child_pt(u64 pte, int level); void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); -tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter); +void tdp_iter_restart(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c926c6b899a1..34207b874886 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -86,7 +86,7 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield); + gfn_t start, gfn_t end, bool can_yield, bool flush); void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) { @@ -99,7 +99,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) list_del(&root->link); - zap_gfn_range(kvm, root, 0, max_gfn, false); + zap_gfn_range(kvm, root, 0, max_gfn, false, false); free_page((unsigned long)root->spt); kmem_cache_free(mmu_page_header_cache, root); @@ -203,11 +203,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, u64 old_spte, u64 new_spte, int level, bool shared); -static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) -{ - return sp->role.smm ? 1 : 0; -} - static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) { bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); @@ -301,11 +296,16 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, * * Given a page table that has been removed from the TDP paging structure, * iterates through the page table to clear SPTEs and free child page tables. + * + * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU + * protection. Since this thread removed it from the paging structure, + * this thread will be responsible for ensuring the page is freed. Hence the + * early rcu_dereferences in the function. */ -static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, +static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, bool shared) { - struct kvm_mmu_page *sp = sptep_to_sp(pt); + struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); int level = sp->role.level; gfn_t base_gfn = sp->gfn; u64 old_child_spte; @@ -318,7 +318,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, tdp_mmu_unlink_page(kvm, sp, shared); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - sptep = pt + i; + sptep = rcu_dereference(pt) + i; gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)); if (shared) { @@ -337,7 +337,18 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, cpu_relax(); } } else { + /* + * If the SPTE is not MMU-present, there is no backing + * page associated with the SPTE and so no side effects + * that need to be recorded, and exclusive ownership of + * mmu_lock ensures the SPTE can't be made present. + * Note, zapping MMIO SPTEs is also unnecessary as they + * are guarded by the memslots generation, not by being + * unreachable. + */ old_child_spte = READ_ONCE(*sptep); + if (!is_shadow_present_pte(old_child_spte)) + continue; /* * Marking the SPTE as a removed SPTE is not @@ -393,7 +404,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * If this warning were to trigger it would indicate that there was a * missing MMU notifier or a race with some notifier handler. * A present, leaf SPTE should never be directly replaced with another - * present leaf SPTE pointing to a differnt PFN. A notifier handler + * present leaf SPTE pointing to a different PFN. A notifier handler * should be zapping the SPTE before the main MM's page table is * changed, or the SPTE should be zeroed, and the TLBs flushed by the * thread before replacement. @@ -407,7 +418,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, /* * Crash the host to prevent error propagation and guest data - * courruption. + * corruption. */ BUG(); } @@ -481,10 +492,6 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { - u64 *root_pt = tdp_iter_root_pt(iter); - struct kvm_mmu_page *root = sptep_to_sp(root_pt); - int as_id = kvm_mmu_page_as_id(root); - lockdep_assert_held_read(&kvm->mmu_lock); /* @@ -498,8 +505,8 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, new_spte) != iter->old_spte) return false; - handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, - iter->level, true); + handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, true); return true; } @@ -522,12 +529,12 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, /* * No other thread can overwrite the removed SPTE as they * must either wait on the MMU lock or use - * tdp_mmu_set_spte_atomic which will not overrite the + * tdp_mmu_set_spte_atomic which will not overwrite the * special removed SPTE value. No bookkeeping is needed * here since the SPTE is going from non-present * to non-present. */ - WRITE_ONCE(*iter->sptep, 0); + WRITE_ONCE(*rcu_dereference(iter->sptep), 0); return true; } @@ -553,10 +560,6 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte, bool record_acc_track, bool record_dirty_log) { - tdp_ptep_t root_pt = tdp_iter_root_pt(iter); - struct kvm_mmu_page *root = sptep_to_sp(root_pt); - int as_id = kvm_mmu_page_as_id(root); - lockdep_assert_held_write(&kvm->mmu_lock); /* @@ -570,13 +573,13 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); - __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, - iter->level, false); + __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, false); if (record_acc_track) handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); if (record_dirty_log) - handle_changed_spte_dirty_log(kvm, as_id, iter->gfn, + handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, iter->old_spte, new_spte, iter->level); } @@ -648,9 +651,7 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, WARN_ON(iter->gfn > iter->next_last_level_gfn); - tdp_iter_start(iter, iter->pt_path[iter->root_level - 1], - iter->root_level, iter->min_level, - iter->next_last_level_gfn); + tdp_iter_restart(iter); return true; } @@ -667,20 +668,21 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, * scheduler needs the CPU or there is contention on the MMU lock. If this * function cannot yield, it will not release the MMU lock or reschedule and * the caller must ensure it does not supply too large a GFN range, or the - * operation can cause a soft lockup. + * operation can cause a soft lockup. Note, in some use cases a flush may be + * required by prior actions. Ensure the pending flush is performed prior to + * yielding. */ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield) + gfn_t start, gfn_t end, bool can_yield, bool flush) { struct tdp_iter iter; - bool flush_needed = false; rcu_read_lock(); tdp_root_for_each_pte(iter, root, start, end) { if (can_yield && - tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) { - flush_needed = false; + tdp_mmu_iter_cond_resched(kvm, &iter, flush)) { + flush = false; continue; } @@ -698,11 +700,11 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, continue; tdp_mmu_set_spte(kvm, &iter, 0); - flush_needed = true; + flush = true; } rcu_read_unlock(); - return flush_needed; + return flush; } /* @@ -711,13 +713,14 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. */ -bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end) +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, + bool can_yield) { struct kvm_mmu_page *root; bool flush = false; for_each_tdp_mmu_root_yield_safe(kvm, root) - flush |= zap_gfn_range(kvm, root, start, end, true); + flush = zap_gfn_range(kvm, root, start, end, can_yield, flush); return flush; } @@ -929,7 +932,7 @@ static int zap_gfn_range_hva_wrapper(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, unsigned long unused) { - return zap_gfn_range(kvm, root, start, end, false); + return zap_gfn_range(kvm, root, start, end, false, false); } int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 3b761c111bff..31096ece9b14 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -8,7 +8,29 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); -bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end); +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, + bool can_yield); +static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, + gfn_t end) +{ + return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true); +} +static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level); + + /* + * Don't allow yielding, as the caller may have a flush pending. Note, + * if mmu_lock is held for write, zapping will never yield in this case, + * but explicitly disallow it for safety. The TDP MMU does not yield + * until it has made forward progress (steps sideways), and when zapping + * a single shadow page that it's guaranteed to see (thus the mmu_lock + * requirement), its "step sideways" will always step beyond the bounds + * of the shadow page's gfn range and stop iterating before yielding. + */ + lockdep_assert_held_write(&kvm->mmu_lock); + return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false); +} void kvm_tdp_mmu_zap_all(struct kvm *kvm); int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 7b30bc967af3..67e753edfa22 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -103,7 +103,7 @@ static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu, /* returns general purpose PMC with the specified MSR. Note that it can be * used for both PERFCTRn and EVNTSELn; that is why it accepts base as a - * paramenter to tell them apart. + * parameter to tell them apart. */ static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, u32 base) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 78bdcfac4e40..3e55674098be 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -727,7 +727,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) struct amd_svm_iommu_ir *ir; /** - * In some cases, the existing irte is updaed and re-set, + * In some cases, the existing irte is updated and re-set, * so we need to check here if it's already been * added * to the ir_list. */ @@ -838,7 +838,7 @@ int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * Here, we setup with legacy mode in the following cases: * 1. When cannot target interrupt to a specific vcpu. * 2. Unsetting posted interrupt. - * 3. APIC virtialization is disabled for the vcpu. + * 3. APIC virtualization is disabled for the vcpu. * 4. IRQ has incompatible delivery mode (SMI, INIT, etc) */ if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 35891d9a1099..fb204eaa8bb3 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -246,11 +246,18 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control) return true; } -static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) +static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12) { struct kvm_vcpu *vcpu = &svm->vcpu; bool vmcb12_lma; + /* + * FIXME: these should be done after copying the fields, + * to avoid TOC/TOU races. For these save area checks + * the possible damage is limited since kvm_set_cr0 and + * kvm_set_cr4 handle failure; EFER_SVME is an exception + * so it is force-set later in nested_prepare_vmcb_save. + */ if ((vmcb12->save.efer & EFER_SVME) == 0) return false; @@ -271,7 +278,7 @@ static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) return false; - return nested_vmcb_check_controls(&vmcb12->control); + return true; } static void load_nested_vmcb_control(struct vcpu_svm *svm, @@ -396,7 +403,14 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) svm->vmcb->save.gdtr = vmcb12->save.gdtr; svm->vmcb->save.idtr = vmcb12->save.idtr; kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); - svm_set_efer(&svm->vcpu, vmcb12->save.efer); + + /* + * Force-set EFER_SVME even though it is checked earlier on the + * VMCB12, because the guest can flip the bit between the check + * and now. Clearing EFER_SVME would call svm_free_nested. + */ + svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME); + svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2; @@ -468,7 +482,6 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, svm->nested.vmcb12_gpa = vmcb12_gpa; - load_nested_vmcb_control(svm, &vmcb12->control); nested_prepare_vmcb_control(svm); nested_prepare_vmcb_save(svm, vmcb12); @@ -515,7 +528,10 @@ int nested_svm_vmrun(struct vcpu_svm *svm) if (WARN_ON_ONCE(!svm->nested.initialized)) return -EINVAL; - if (!nested_vmcb_checks(svm, vmcb12)) { + load_nested_vmcb_control(svm, &vmcb12->control); + + if (!nested_vmcb_check_save(svm, vmcb12) || + !nested_vmcb_check_controls(&svm->nested.ctl)) { vmcb12->control.exit_code = SVM_EXIT_ERR; vmcb12->control.exit_code_hi = 0; vmcb12->control.exit_info_1 = 0; @@ -1209,6 +1225,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, */ if (!(save->cr0 & X86_CR0_PG)) goto out_free; + if (!(save->efer & EFER_SVME)) + goto out_free; /* * All checks done, we can enter guest mode. L1 control fields diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 035da07500e8..fdf587f19c5f 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -98,6 +98,8 @@ static enum index msr_to_index(u32 msr) static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, enum pmu_type type) { + struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); + switch (msr) { case MSR_F15H_PERF_CTL0: case MSR_F15H_PERF_CTL1: @@ -105,6 +107,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, case MSR_F15H_PERF_CTL3: case MSR_F15H_PERF_CTL4: case MSR_F15H_PERF_CTL5: + if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + return NULL; + fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: if (type != PMU_TYPE_EVNTSEL) return NULL; @@ -115,6 +120,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, case MSR_F15H_PERF_CTR3: case MSR_F15H_PERF_CTR4: case MSR_F15H_PERF_CTR5: + if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + return NULL; + fallthrough; case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: if (type != PMU_TYPE_COUNTER) return NULL; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 874ea309279f..2b27a9452403 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2082,7 +2082,7 @@ void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu) hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400); hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - /* PKRU is restored on VMEXIT, save the curent host value */ + /* PKRU is restored on VMEXIT, save the current host value */ hostsa->pkru = read_pkru(); /* MSR_IA32_XSS is restored on VMEXIT, save the currnet host value */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index baee91c1e936..6dad89248312 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -115,13 +115,6 @@ static const struct svm_direct_access_msrs { { .index = MSR_INVALID, .always = false }, }; -/* enable NPT for AMD64 and X86 with PAE */ -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) -bool npt_enabled = true; -#else -bool npt_enabled; -#endif - /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * pause_filter_count: On processors that support Pause filtering(indicated @@ -170,9 +163,12 @@ module_param(pause_filter_count_shrink, ushort, 0444); static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX; module_param(pause_filter_count_max, ushort, 0444); -/* allow nested paging (virtualized MMU) for all guests */ -static int npt = true; -module_param(npt, int, S_IRUGO); +/* + * Use nested page tables by default. Note, NPT may get forced off by + * svm_hardware_setup() if it's unsupported by hardware or the host kernel. + */ +bool npt_enabled = true; +module_param_named(npt, npt_enabled, bool, 0444); /* allow nested virtualization in KVM/SVM */ static int nested = true; @@ -988,10 +984,15 @@ static __init int svm_hardware_setup(void) goto err; } - if (!boot_cpu_has(X86_FEATURE_NPT)) + /* + * KVM's MMU doesn't support using 2-level paging for itself, and thus + * NPT isn't supported if the host is using 2-level paging since host + * CR4 is unchanged on VMRUN. + */ + if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) npt_enabled = false; - if (npt_enabled && !npt) + if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); @@ -4399,7 +4400,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int i * * This happens because CPU microcode reading instruction bytes * uses a special opcode which attempts to read data using CPL=0 - * priviledges. The microcode reads CS:RIP and if it hits a SMAP + * privileges. The microcode reads CS:RIP and if it hits a SMAP * fault, it gives up and returns no instruction bytes. * * Detection: diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bcca0b80e0d0..1e069aac7410 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3537,7 +3537,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * snapshot restore (migration). * * In this flow, it is assumed that vmcs12 cache was - * trasferred as part of captured nVMX state and should + * transferred as part of captured nVMX state and should * therefore not be read from guest memory (which may not * exist on destination host yet). */ diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 4831bc44ce66..459748680daf 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -10,7 +10,7 @@ #include "vmx.h" /* - * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we + * We maintain a per-CPU linked-list of vCPU, so in wakeup_handler() we * can find which vCPU should be waken up. */ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 50810d471462..bcbf0d2139e9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1529,7 +1529,7 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) /* * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that - * utilize encodings marked reserved will casue a #GP fault. + * utilize encodings marked reserved will cause a #GP fault. */ value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) && @@ -2761,7 +2761,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); /* - * Update real mode segment cache. It may be not up-to-date if sement + * Update real mode segment cache. It may be not up-to-date if segment * register was written while vcpu was in a guest mode. */ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); @@ -6027,19 +6027,19 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) exit_reason.basic != EXIT_REASON_PML_FULL && exit_reason.basic != EXIT_REASON_APIC_ACCESS && exit_reason.basic != EXIT_REASON_TASK_SWITCH)) { + int ndata = 3; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; - vcpu->run->internal.ndata = 3; vcpu->run->internal.data[0] = vectoring_info; vcpu->run->internal.data[1] = exit_reason.full; vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) { - vcpu->run->internal.ndata++; - vcpu->run->internal.data[3] = + vcpu->run->internal.data[ndata++] = vmcs_read64(GUEST_PHYSICAL_ADDRESS); } - vcpu->run->internal.data[vcpu->run->internal.ndata++] = - vcpu->arch.last_vmentry_cpu; + vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu; + vcpu->run->internal.ndata = ndata; return 0; } @@ -6580,8 +6580,8 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) int i, nr_msrs; struct perf_guest_switch_msr *msrs; + /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ msrs = perf_guest_get_msrs(&nr_msrs); - if (!msrs) return; @@ -7252,7 +7252,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; - /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabircEn can be set */ + /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2a20ce60152e..efc7a82ab140 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -156,9 +156,9 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); /* * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables - * adaptive tuning starting from default advancment of 1000ns. '0' disables + * adaptive tuning starting from default advancement of 1000ns. '0' disables * advancement entirely. Any other value is used as-is and disables adaptive - * tuning, i.e. allows priveleged userspace to set an exact advancement time. + * tuning, i.e. allows privileged userspace to set an exact advancement time. */ static int __read_mostly lapic_timer_advance_ns = -1; module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR); @@ -271,8 +271,7 @@ static struct kmem_cache *x86_emulator_cache; * When called, it means the previous get/set msr reached an invalid msr. * Return true if we want to ignore/silent this failed msr access. */ -static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, - u64 data, bool write) +static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write) { const char *op = write ? "wrmsr" : "rdmsr"; @@ -1288,7 +1287,7 @@ static const u32 emulated_msrs_all[] = { MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, MSR_IA32_TSC_ADJUST, - MSR_IA32_TSCDEADLINE, + MSR_IA32_TSC_DEADLINE, MSR_IA32_ARCH_CAPABILITIES, MSR_IA32_PERF_CAPABILITIES, MSR_IA32_MISC_ENABLE, @@ -1373,7 +1372,7 @@ static u64 kvm_get_arch_capabilities(void) /* * If nx_huge_pages is enabled, KVM's shadow paging will ensure that * the nested hypervisor runs with NX huge pages. If it is not, - * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other + * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other * L1 guests, so it need not worry about its own (L2) guests. */ data |= ARCH_CAP_PSCHANGE_MC_NO; @@ -1445,7 +1444,7 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) if (r == KVM_MSR_RET_INVALID) { /* Unconditionally clear the output for simplicity */ *data = 0; - if (kvm_msr_ignored_check(vcpu, index, 0, false)) + if (kvm_msr_ignored_check(index, 0, false)) r = 0; } @@ -1526,35 +1525,44 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) { + struct kvm_x86_msr_filter *msr_filter; + struct msr_bitmap_range *ranges; struct kvm *kvm = vcpu->kvm; - struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; - u32 count = kvm->arch.msr_filter.count; - u32 i; - bool r = kvm->arch.msr_filter.default_allow; + bool allowed; int idx; + u32 i; - /* MSR filtering not set up or x2APIC enabled, allow everything */ - if (!count || (index >= 0x800 && index <= 0x8ff)) + /* x2APIC MSRs do not support filtering. */ + if (index >= 0x800 && index <= 0x8ff) return true; - /* Prevent collision with set_msr_filter */ idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < count; i++) { + msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu); + if (!msr_filter) { + allowed = true; + goto out; + } + + allowed = msr_filter->default_allow; + ranges = msr_filter->ranges; + + for (i = 0; i < msr_filter->count; i++) { u32 start = ranges[i].base; u32 end = start + ranges[i].nmsrs; u32 flags = ranges[i].flags; unsigned long *bitmap = ranges[i].bitmap; if ((index >= start) && (index < end) && (flags & type)) { - r = !!test_bit(index - start, bitmap); + allowed = !!test_bit(index - start, bitmap); break; } } +out: srcu_read_unlock(&kvm->srcu, idx); - return r; + return allowed; } EXPORT_SYMBOL_GPL(kvm_msr_allowed); @@ -1611,7 +1619,7 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, int ret = __kvm_set_msr(vcpu, index, data, host_initiated); if (ret == KVM_MSR_RET_INVALID) - if (kvm_msr_ignored_check(vcpu, index, data, true)) + if (kvm_msr_ignored_check(index, data, true)) ret = 0; return ret; @@ -1649,7 +1657,7 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, if (ret == KVM_MSR_RET_INVALID) { /* Unconditionally clear *data for simplicity */ *data = 0; - if (kvm_msr_ignored_check(vcpu, index, 0, false)) + if (kvm_msr_ignored_check(index, 0, false)) ret = 0; } @@ -1841,7 +1849,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) ret = EXIT_FASTPATH_EXIT_HANDLED; } break; - case MSR_IA32_TSCDEADLINE: + case MSR_IA32_TSC_DEADLINE: data = kvm_read_edx_eax(vcpu); if (!handle_fastpath_set_tscdeadline(vcpu, data)) { kvm_skip_emulated_instruction(vcpu); @@ -2320,7 +2328,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) kvm_vcpu_write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); - spin_lock(&kvm->arch.pvclock_gtod_sync_lock); + spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags); if (!matched) { kvm->arch.nr_vcpus_matched_tsc = 0; } else if (!already_matched) { @@ -2328,7 +2336,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) } kvm_track_tsc_matching(vcpu); - spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags); } static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, @@ -2550,11 +2558,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) int i; struct kvm_vcpu *vcpu; struct kvm_arch *ka = &kvm->arch; + unsigned long flags; + + kvm_hv_invalidate_tsc_page(kvm); - spin_lock(&ka->pvclock_gtod_sync_lock); kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); pvclock_update_vm_gtod_copy(kvm); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -2562,8 +2575,6 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); - - spin_unlock(&ka->pvclock_gtod_sync_lock); #endif } @@ -2571,17 +2582,18 @@ u64 get_kvmclock_ns(struct kvm *kvm) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; + unsigned long flags; u64 ret; - spin_lock(&ka->pvclock_gtod_sync_lock); + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); if (!ka->use_master_clock) { - spin_unlock(&ka->pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); return get_kvmclock_base_ns() + ka->kvmclock_offset; } hv_clock.tsc_timestamp = ka->master_cycle_now; hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - spin_unlock(&ka->pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* both __this_cpu_read() and rdtsc() should be on the same cpu */ get_cpu(); @@ -2675,13 +2687,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * If the host uses TSC clock, then passthrough TSC as stable * to the guest. */ - spin_lock(&ka->pvclock_gtod_sync_lock); + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); use_master_clock = ka->use_master_clock; if (use_master_clock) { host_tsc = ka->master_cycle_now; kernel_ns = ka->master_kernel_ns; } - spin_unlock(&ka->pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); @@ -3075,7 +3087,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_set_apic_base(vcpu, msr_info); case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_write(vcpu, msr, data); - case MSR_IA32_TSCDEADLINE: + case MSR_IA32_TSC_DEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); break; case MSR_IA32_TSC_ADJUST: @@ -3437,7 +3449,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); - case MSR_IA32_TSCDEADLINE: + case MSR_IA32_TSC_DEADLINE: msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); break; case MSR_IA32_TSC_ADJUST: @@ -4013,7 +4025,6 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { struct kvm_host_map map; struct kvm_steal_time *st; - int idx; if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -4021,15 +4032,9 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (vcpu->arch.st.preempted) return; - /* - * Take the srcu lock as memslots will be accessed to check the gfn - * cache generation against the memslots generation. - */ - idx = srcu_read_lock(&vcpu->kvm->srcu); - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, &vcpu->arch.st.cache, true)) - goto out; + return; st = map.hva + offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); @@ -4037,20 +4042,25 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); - -out: - srcu_read_unlock(&vcpu->kvm->srcu, idx); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + int idx; + if (vcpu->preempted && !vcpu->arch.guest_state_protected) vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu); + /* + * Take the srcu lock as memslots will be accessed to check the gfn + * cache generation against the memslots generation. + */ + idx = srcu_read_lock(&vcpu->kvm->srcu); if (kvm_xen_msr_enabled(vcpu->kvm)) kvm_xen_runstate_set_preempted(vcpu); else kvm_steal_time_set_preempted(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); static_call(kvm_x86_vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); @@ -5352,25 +5362,34 @@ split_irqchip_unlock: return r; } -static void kvm_clear_msr_filter(struct kvm *kvm) +static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow) +{ + struct kvm_x86_msr_filter *msr_filter; + + msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT); + if (!msr_filter) + return NULL; + + msr_filter->default_allow = default_allow; + return msr_filter; +} + +static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter) { u32 i; - u32 count = kvm->arch.msr_filter.count; - struct msr_bitmap_range ranges[16]; - mutex_lock(&kvm->lock); - kvm->arch.msr_filter.count = 0; - memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0])); - mutex_unlock(&kvm->lock); - synchronize_srcu(&kvm->srcu); + if (!msr_filter) + return; + + for (i = 0; i < msr_filter->count; i++) + kfree(msr_filter->ranges[i].bitmap); - for (i = 0; i < count; i++) - kfree(ranges[i].bitmap); + kfree(msr_filter); } -static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range) +static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, + struct kvm_msr_filter_range *user_range) { - struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; struct msr_bitmap_range range; unsigned long *bitmap = NULL; size_t bitmap_size; @@ -5404,11 +5423,9 @@ static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user goto err; } - /* Everything ok, add this range identifier to our global pool */ - ranges[kvm->arch.msr_filter.count] = range; - /* Make sure we filled the array before we tell anyone to walk it */ - smp_wmb(); - kvm->arch.msr_filter.count++; + /* Everything ok, add this range identifier. */ + msr_filter->ranges[msr_filter->count] = range; + msr_filter->count++; return 0; err: @@ -5419,10 +5436,11 @@ err: static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) { struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_x86_msr_filter *new_filter, *old_filter; struct kvm_msr_filter filter; bool default_allow; - int r = 0; bool empty = true; + int r = 0; u32 i; if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) @@ -5435,25 +5453,32 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) if (empty && !default_allow) return -EINVAL; - kvm_clear_msr_filter(kvm); + new_filter = kvm_alloc_msr_filter(default_allow); + if (!new_filter) + return -ENOMEM; - kvm->arch.msr_filter.default_allow = default_allow; - - /* - * Protect from concurrent calls to this function that could trigger - * a TOCTOU violation on kvm->arch.msr_filter.count. - */ - mutex_lock(&kvm->lock); for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { - r = kvm_add_msr_filter(kvm, &filter.ranges[i]); - if (r) - break; + r = kvm_add_msr_filter(new_filter, &filter.ranges[i]); + if (r) { + kvm_free_msr_filter(new_filter); + return r; + } } + mutex_lock(&kvm->lock); + + /* The per-VM filter is protected by kvm->lock... */ + old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1); + + rcu_assign_pointer(kvm->arch.msr_filter, new_filter); + synchronize_srcu(&kvm->srcu); + + kvm_free_msr_filter(old_filter); + kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); mutex_unlock(&kvm->lock); - return r; + return 0; } long kvm_arch_vm_ioctl(struct file *filp, @@ -5700,6 +5725,7 @@ set_pit2_out: } #endif case KVM_SET_CLOCK: { + struct kvm_arch *ka = &kvm->arch; struct kvm_clock_data user_ns; u64 now_ns; @@ -5718,8 +5744,22 @@ set_pit2_out: * pvclock_update_vm_gtod_copy(). */ kvm_gen_update_masterclock(kvm); - now_ns = get_kvmclock_ns(kvm); - kvm->arch.kvmclock_offset += user_ns.clock - now_ns; + + /* + * This pairs with kvm_guest_time_update(): when masterclock is + * in use, we use master_kernel_ns + kvmclock_offset to set + * unsigned 'system_time' so if we use get_kvmclock_ns() (which + * is slightly ahead) here we risk going negative on unsigned + * 'system_time' when 'user_ns.clock' is very small. + */ + spin_lock_irq(&ka->pvclock_gtod_sync_lock); + if (kvm->arch.use_master_clock) + now_ns = ka->master_kernel_ns; + else + now_ns = get_kvmclock_base_ns(); + ka->kvmclock_offset = user_ns.clock - now_ns; + spin_unlock_irq(&ka->pvclock_gtod_sync_lock); + kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); break; } @@ -6603,7 +6643,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) int cpu = get_cpu(); cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, + on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask, wbinvd_ipi, NULL, 1); put_cpu(); cpumask_clear(vcpu->arch.wbinvd_dirty_mask); @@ -7698,6 +7738,7 @@ static void kvm_hyperv_tsc_notifier(void) struct kvm *kvm; struct kvm_vcpu *vcpu; int cpu; + unsigned long flags; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) @@ -7713,17 +7754,15 @@ static void kvm_hyperv_tsc_notifier(void) list_for_each_entry(kvm, &vm_list, vm_list) { struct kvm_arch *ka = &kvm->arch; - spin_lock(&ka->pvclock_gtod_sync_lock); - + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); pvclock_update_vm_gtod_copy(kvm); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); kvm_for_each_vcpu(cpu, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); kvm_for_each_vcpu(cpu, vcpu, kvm) kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); - - spin_unlock(&ka->pvclock_gtod_sync_lock); } mutex_unlock(&kvm_lock); } @@ -10601,7 +10640,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, return (void __user *)hva; } else { if (!slot || !slot->npages) - return 0; + return NULL; old_npages = slot->npages; hva = slot->userspace_addr; @@ -10634,8 +10673,6 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { - u32 i; - if (current->mm == kvm->mm) { /* * Free memory regions allocated on behalf of userspace, @@ -10651,8 +10688,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) mutex_unlock(&kvm->slots_lock); } static_call_cond(kvm_x86_vm_destroy)(kvm); - for (i = 0; i < kvm->arch.msr_filter.count; i++) - kfree(kvm->arch.msr_filter.ranges[i].bitmap); + kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 39eb04887141..9035e34aa156 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -250,7 +250,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); -void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 get_kvmclock_ns(struct kvm *kvm); int kvm_read_guest_virt(struct kvm_vcpu *vcpu, diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index 3b6544111ac9..16bc9130e7a5 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -6,7 +6,7 @@ */ #include <linux/linkage.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> /* if you want SMP support, implement these with real spinlocks */ .macro LOCK reg diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index 1c5c81c16b06..ce6935690766 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -6,7 +6,7 @@ */ #include <linux/linkage.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> .macro read64 reg movl %ebx, %eax diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 2402d4c489d2..db4b4f9197c7 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -3,7 +3,7 @@ #include <linux/linkage.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/export.h> /* diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 77b9b2a3b5c8..57b79c577496 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -11,7 +11,7 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/asm.h> #include <asm/smap.h> #include <asm/export.h> diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 4229950a5d78..2bf07e18e38c 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -232,7 +232,7 @@ static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off) * resolve_seg_reg() - obtain segment register index * @insn: Instruction with operands * @regs: Register values as seen when entering kernel mode - * @regoff: Operand offset, in pt_regs, used to deterimine segment register + * @regoff: Operand offset, in pt_regs, used to determine segment register * * Determine the segment register associated with the operands and, if * applicable, prefixes and the instruction pointed by @insn. @@ -517,7 +517,7 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, * @insn: Instruction containing ModRM byte * @regs: Register values as seen when entering kernel mode * @offs1: Offset of the first operand register - * @offs2: Offset of the second opeand register, if applicable + * @offs2: Offset of the second operand register, if applicable * * Obtain the offset, in pt_regs, of the registers indicated by the ModRM byte * in @insn. This function is to be used with 16-bit address encodings. The @@ -576,7 +576,7 @@ static int get_reg_offset_16(struct insn *insn, struct pt_regs *regs, * If ModRM.mod is 0 and ModRM.rm is 110b, then we use displacement- * only addressing. This means that no registers are involved in * computing the effective address. Thus, ensure that the first - * register offset is invalild. The second register offset is already + * register offset is invalid. The second register offset is already * invalid under the aforementioned conditions. */ if ((X86_MODRM_MOD(insn->modrm.value) == 0) && @@ -1415,6 +1415,25 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) } } +static unsigned long insn_get_effective_ip(struct pt_regs *regs) +{ + unsigned long seg_base = 0; + + /* + * If not in user-space long mode, a custom code segment could be in + * use. This is true in protected mode (if the process defined a local + * descriptor table), or virtual-8086 mode. In most of the cases + * seg_base will be zero as in USER_CS. + */ + if (!user_64bit_mode(regs)) { + seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); + if (seg_base == -1L) + return 0; + } + + return seg_base + regs->ip; +} + /** * insn_fetch_from_user() - Copy instruction bytes from user-space memory * @regs: Structure with register values as seen when entering kernel mode @@ -1431,24 +1450,43 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) */ int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) { - unsigned long seg_base = 0; + unsigned long ip; int not_copied; - /* - * If not in user-space long mode, a custom code segment could be in - * use. This is true in protected mode (if the process defined a local - * descriptor table), or virtual-8086 mode. In most of the cases - * seg_base will be zero as in USER_CS. - */ - if (!user_64bit_mode(regs)) { - seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); - if (seg_base == -1L) - return 0; - } + ip = insn_get_effective_ip(regs); + if (!ip) + return 0; + + not_copied = copy_from_user(buf, (void __user *)ip, MAX_INSN_SIZE); + return MAX_INSN_SIZE - not_copied; +} + +/** + * insn_fetch_from_user_inatomic() - Copy instruction bytes from user-space memory + * while in atomic code + * @regs: Structure with register values as seen when entering kernel mode + * @buf: Array to store the fetched instruction + * + * Gets the linear address of the instruction and copies the instruction bytes + * to the buf. This function must be used in atomic context. + * + * Returns: + * + * Number of instruction bytes copied. + * + * 0 if nothing was copied. + */ +int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) +{ + unsigned long ip; + int not_copied; + + ip = insn_get_effective_ip(regs); + if (!ip) + return 0; - not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), - MAX_INSN_SIZE); + not_copied = __copy_from_user_inatomic(buf, (void __user *)ip, MAX_INSN_SIZE); return MAX_INSN_SIZE - not_copied; } diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 1e299ac73c86..1cc9da6e29c7 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -4,7 +4,7 @@ #include <linux/linkage.h> #include <asm/errno.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/export.h> .pushsection .noinstr.text, "ax" diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 41902fe8b859..64801010d312 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -8,7 +8,7 @@ */ #include <linux/linkage.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/export.h> #undef memmove diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 0bfd26e4ca9e..9827ae267f96 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -3,7 +3,7 @@ #include <linux/linkage.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/export.h> /* diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c index 419365c48b2a..cc5f4ea943d3 100644 --- a/arch/x86/lib/mmx_32.c +++ b/arch/x86/lib/mmx_32.c @@ -14,7 +14,7 @@ * tested so far for any MMX solution figured. * * 22/09/2000 - Arjan van de Ven - * Improved for non-egineering-sample Athlons + * Improved for non-engineering-sample Athlons * */ #include <linux/hardirq.h> diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c index 75a0915b0d01..40bbe56bde32 100644 --- a/arch/x86/lib/msr-smp.c +++ b/arch/x86/lib/msr-smp.c @@ -252,7 +252,7 @@ static void __wrmsr_safe_regs_on_cpu(void *info) rv->err = wrmsr_safe_regs(rv->regs); } -int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { int err; struct msr_regs_info rv; @@ -265,7 +265,7 @@ int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) } EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu); -int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { int err; struct msr_regs_info rv; diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 3bd905e10ee2..b09cd2ad426c 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -36,7 +36,7 @@ EXPORT_SYMBOL(msrs_free); * argument @m. * */ -int msr_read(u32 msr, struct msr *m) +static int msr_read(u32 msr, struct msr *m) { int err; u64 val; @@ -54,7 +54,7 @@ int msr_read(u32 msr, struct msr *m) * @msr: MSR to write * @m: value to write */ -int msr_write(u32 msr, struct msr *m) +static int msr_write(u32 msr, struct msr *m) { return wrmsrl_safe(msr, m->q); } diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index f6fb1d218dcc..6bb74b5c238c 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -4,7 +4,7 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/export.h> #include <asm/nospec-branch.h> #include <asm/unwind_hints.h> diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c index 4a9887851ad8..990d847ae902 100644 --- a/arch/x86/math-emu/fpu_trig.c +++ b/arch/x86/math-emu/fpu_trig.c @@ -547,7 +547,7 @@ static void frndint_(FPU_REG *st0_ptr, u_char st0_tag) single_arg_error(st0_ptr, st0_tag); } -static int fsin(FPU_REG *st0_ptr, u_char tag) +static int f_sin(FPU_REG *st0_ptr, u_char tag) { u_char arg_sign = getsign(st0_ptr); @@ -608,6 +608,11 @@ static int fsin(FPU_REG *st0_ptr, u_char tag) } } +static void fsin(FPU_REG *st0_ptr, u_char tag) +{ + f_sin(st0_ptr, tag); +} + static int f_cos(FPU_REG *st0_ptr, u_char tag) { u_char st0_sign; @@ -724,7 +729,7 @@ static void fsincos(FPU_REG *st0_ptr, u_char st0_tag) } reg_copy(st0_ptr, &arg); - if (!fsin(st0_ptr, st0_tag)) { + if (!f_sin(st0_ptr, st0_tag)) { push(); FPU_copy_to_reg0(&arg, st0_tag); f_cos(&st(0), st0_tag); @@ -1635,7 +1640,7 @@ void FPU_triga(void) } static FUNC_ST0 const trig_table_b[] = { - fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0) fsin, fcos + fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, fsin, fcos }; void FPU_trigb(void) diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c index fe6246ff9887..7ca6417c0c8d 100644 --- a/arch/x86/math-emu/reg_ld_str.c +++ b/arch/x86/math-emu/reg_ld_str.c @@ -964,7 +964,7 @@ int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d) /* The return value (in eax) is zero if the result is exact, if bits are changed due to rounding, truncation, etc, then a non-zero value is returned */ -/* Overflow is signalled by a non-zero return value (in eax). +/* Overflow is signaled by a non-zero return value (in eax). In the case of overflow, the returned significand always has the largest possible value */ int FPU_round_to_int(FPU_REG *r, u_char tag) diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S index 11a1f798451b..4a9fc3cc5a4d 100644 --- a/arch/x86/math-emu/reg_round.S +++ b/arch/x86/math-emu/reg_round.S @@ -575,7 +575,7 @@ Normalise_result: #ifdef PECULIAR_486 /* * This implements a special feature of 80486 behaviour. - * Underflow will be signalled even if the number is + * Underflow will be signaled even if the number is * not a denormal after rounding. * This difference occurs only for masked underflow, and not * in the unmasked case. diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a73347e2cdfc..1c548ad00752 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1497,7 +1497,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) * userspace task is trying to access some valid (from guest's point of * view) memory which is not currently mapped by the host (e.g. the * memory is swapped out). Note, the corresponding "page ready" event - * which is injected when the memory becomes available, is delived via + * which is injected when the memory becomes available, is delivered via * an interrupt mechanism and not a #PF exception * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()). * @@ -1523,7 +1523,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) * * In case the fault hit a RCU idle region the conditional entry * code reenabled RCU to avoid subsequent wreckage which helps - * debugability. + * debuggability. */ state = irqentry_enter(regs); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index dd694fb93916..fbf41dd142ca 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -29,7 +29,7 @@ /* * We need to define the tracepoints somewhere, and tlb.c - * is only compied when SMP=y. + * is only compiled when SMP=y. */ #define CREATE_TRACE_POINTS #include <trace/events/tlb.h> @@ -756,7 +756,7 @@ void __init init_mem_mapping(void) #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { - /* can we preseve max_low_pfn ?*/ + /* can we preserve max_low_pfn ?*/ max_low_pfn = max_pfn; } #else @@ -939,7 +939,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) { /* * end could be not aligned, and We can not align that, - * decompresser could be confused by aligned initrd_end + * decompressor could be confused by aligned initrd_end * We already reserve the end partial page before in * - i386_start_kernel() * - x86_64_start_kernel() diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b5a3fa4033d3..55247451ba85 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -172,7 +172,7 @@ static void sync_global_pgds_l4(unsigned long start, unsigned long end) /* * With folded p4d, pgd_none() is always false, we need to - * handle synchonization on p4d level. + * handle synchronization on p4d level. */ MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref)); p4d_ref = p4d_offset(pgd_ref, addr); @@ -986,7 +986,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) { /* * Do not free direct mapping pages since they were - * freed when offlining, or simplely not in use. + * freed when offlining, or simply not in use. */ if (!direct) free_pagetable(pte_page(*pte), 0); @@ -1004,7 +1004,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, * * If we are not removing the whole page, it means * other page structs in this page are being used and - * we canot remove them. So fill the unused page_structs + * we cannot remove them. So fill the unused page_structs * with 0xFD, and remove the page when it is wholly * filled with 0xFD. */ diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 6e6b39710e5f..557f0fe25dff 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -96,7 +96,7 @@ void __init kernel_randomize_memory(void) memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) + CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; - /* Adapt phyiscal memory region size based on available memory */ + /* Adapt physical memory region size based on available memory */ if (memory_tb < kaslr_regions[0].size_tb) kaslr_regions[0].size_tb = memory_tb; diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index be020a7bc414..d3efbc5b3449 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Support for MMIO probes. - * Benfit many code from kprobes + * Benefit many code from kprobes * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>. * 2007 Alexander Eichner * 2008 Pekka Paalanen <pq@iki.fi> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 4b01f7dbaf30..f633f9e23b8f 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -19,6 +19,7 @@ #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/dma-mapping.h> +#include <linux/virtio_config.h> #include <asm/tlbflush.h> #include <asm/fixmap.h> @@ -262,7 +263,7 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) if (pgprot_val(old_prot) == pgprot_val(new_prot)) return; - pa = pfn << page_level_shift(level); + pa = pfn << PAGE_SHIFT; size = page_level_size(level); /* @@ -484,3 +485,8 @@ void __init mem_encrypt_init(void) print_mem_encrypt_feature_info(); } +int arch_has_restricted_virtio_memory_access(void) +{ + return sev_active(); +} +EXPORT_SYMBOL_GPL(arch_has_restricted_virtio_memory_access); diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index 7a84fc8bc5c3..17d292b7072f 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -27,7 +27,7 @@ SYM_FUNC_START(sme_encrypt_execute) * - stack page (PAGE_SIZE) * - encryption routine page (PAGE_SIZE) * - intermediate copy buffer (PMD_PAGE_SIZE) - * R8 - physcial address of the pagetables to use for encryption + * R8 - physical address of the pagetables to use for encryption */ push %rbp diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 6c5eb6f3f14f..a19374d26101 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -503,14 +503,10 @@ void __init sme_enable(struct boot_params *bp) #define AMD_SME_BIT BIT(0) #define AMD_SEV_BIT BIT(1) - /* - * Set the feature mask (SME or SEV) based on whether we are - * running under a hypervisor. - */ - eax = 1; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT; + + /* Check the SEV MSR whether SEV or SME is enabled */ + sev_status = __rdmsr(MSR_AMD64_SEV); + feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT; /* * Check for the SME/SEV feature: @@ -530,19 +526,26 @@ void __init sme_enable(struct boot_params *bp) /* Check if memory encryption is enabled */ if (feature_mask == AMD_SME_BIT) { + /* + * No SME if Hypervisor bit is set. This check is here to + * prevent a guest from trying to enable SME. For running as a + * KVM guest the MSR_K8_SYSCFG will be sufficient, but there + * might be other hypervisors which emulate that MSR as non-zero + * or even pass it through to the guest. + * A malicious hypervisor can still trick a guest into this + * path, but there is no way to protect against that. + */ + eax = 1; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (ecx & BIT(31)) + return; + /* For SME, check the SYSCFG MSR */ msr = __rdmsr(MSR_K8_SYSCFG); if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) return; } else { - /* For SEV, check the SEV MSR */ - msr = __rdmsr(MSR_AMD64_SEV); - if (!(msr & MSR_AMD64_SEV_ENABLED)) - return; - - /* Save SEV_STATUS to avoid reading MSR again */ - sev_status = msr; - /* SEV state cannot be controlled by a command line option */ sme_me_mask = me_mask; sev_enabled = true; diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index ca311aaa67b8..3112ca7786ed 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -695,7 +695,7 @@ int memtype_free(u64 start, u64 end) /** - * lookup_memtype - Looksup the memory type for a physical address + * lookup_memtype - Looks up the memory type for a physical address * @paddr: physical address of which memory type needs to be looked up * * Only to be called when PAT is enabled @@ -800,6 +800,7 @@ void memtype_free_io(resource_size_t start, resource_size_t end) memtype_free(start, end); } +#ifdef CONFIG_X86_PAT int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size) { enum page_cache_mode type = _PAGE_CACHE_MODE_WC; @@ -813,6 +814,7 @@ void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) memtype_free_io(start, start + size); } EXPORT_SYMBOL(arch_io_free_memtype_wc); +#endif pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 16f878c26667..427980617557 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -680,7 +680,7 @@ pmd_t *lookup_pmd_address(unsigned long address) * end up in this kind of memory, for instance. * * This could be optimized, but it is only intended to be - * used at inititalization time, and keeping it + * used at initialization time, and keeping it * unoptimized should increase the testing coverage for * the more obscure platforms. */ diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 8873ed1438a9..a2332eef66e9 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -128,7 +128,7 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | /* * Called from the FPU code when creating a fresh set of FPU * registers. This is called from a very specific context where - * we know the FPU regstiers are safe for use and we can use PKRU + * we know the FPU registers are safe for use and we can use PKRU * directly. */ void copy_init_pkru_to_fpregs(void) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 1aab92930569..5d5c7bb50ce9 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -361,7 +361,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end, * global, so set it as global in both copies. Note: * the X86_FEATURE_PGE check is not _required_ because * the CPU ignores _PAGE_GLOBAL when PGE is not - * supported. The check keeps consistentency with + * supported. The check keeps consistency with * code that only set this bit when supported. */ if (boot_cpu_has(X86_FEATURE_PGE)) @@ -440,10 +440,9 @@ static void __init pti_clone_user_shared(void) for_each_possible_cpu(cpu) { /* - * The SYSCALL64 entry code needs to be able to find the - * thread stack and needs one word of scratch space in which - * to spill a register. All of this lives in the TSS, in - * the sp1 and sp2 slots. + * The SYSCALL64 entry code needs one word of scratch space + * in which to spill a register. It lives in the sp2 slot + * of the CPU's TSS. * * This is done for all possible CPUs during boot to ensure * that it's propagated to all mms. @@ -512,7 +511,7 @@ static void pti_clone_entry_text(void) static inline bool pti_kernel_image_global_ok(void) { /* - * Systems with PCIDs get litlle benefit from global + * Systems with PCIDs get little benefit from global * kernel text and are not worth the downsides. */ if (cpu_feature_enabled(X86_FEATURE_PCID)) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 569ac1d57f55..98f269560d40 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -106,7 +106,7 @@ static inline u16 kern_pcid(u16 asid) #ifdef CONFIG_PAGE_TABLE_ISOLATION /* - * Make sure that the dynamic ASID space does not confict with the + * Make sure that the dynamic ASID space does not conflict with the * bit we are using to switch between user and kernel ASIDs. */ BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); @@ -736,7 +736,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * 3, we'd be break the invariant: we'd update local_tlb_gen above * 1 without the full flush that's needed for tlb_gen 2. * - * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation. + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization. * Partial TLB flushes are not all that much cheaper than full TLB * flushes, so it seems unlikely that it would be a performance win * to do a partial flush if that won't bring our TLB fully up to @@ -876,7 +876,7 @@ static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, static inline void put_flush_tlb_info(void) { #ifdef CONFIG_DEBUG_VM - /* Complete reentrency prevention checks */ + /* Complete reentrancy prevention checks */ barrier(); this_cpu_dec(flush_tlb_info_idx); #endif diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 79e7a0ec1da5..220e72434f3c 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1349,6 +1349,7 @@ st: if (is_imm8(insn->off)) insn->imm == (BPF_XOR | BPF_FETCH)) { u8 *branch_target; bool is64 = BPF_SIZE(insn->code) == BPF_DW; + u32 real_src_reg = src_reg; /* * Can't be implemented with a single x86 insn. @@ -1357,6 +1358,9 @@ st: if (is_imm8(insn->off)) /* Will need RAX as a CMPXCHG operand so save R0 */ emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0); + if (src_reg == BPF_REG_0) + real_src_reg = BPF_REG_AX; + branch_target = prog; /* Load old value */ emit_ldx(&prog, BPF_SIZE(insn->code), @@ -1366,9 +1370,9 @@ st: if (is_imm8(insn->off)) * put the result in the AUX_REG. */ emit_mov_reg(&prog, is64, AUX_REG, BPF_REG_0); - maybe_emit_mod(&prog, AUX_REG, src_reg, is64); + maybe_emit_mod(&prog, AUX_REG, real_src_reg, is64); EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)], - add_2reg(0xC0, AUX_REG, src_reg)); + add_2reg(0xC0, AUX_REG, real_src_reg)); /* Attempt to swap in new value */ err = emit_atomic(&prog, BPF_CMPXCHG, dst_reg, AUX_REG, insn->off, @@ -1381,7 +1385,7 @@ st: if (is_imm8(insn->off)) */ EMIT2(X86_JNE, -(prog - branch_target) - 2); /* Return the pre-modification value */ - emit_mov_reg(&prog, is64, src_reg, BPF_REG_0); + emit_mov_reg(&prog, is64, real_src_reg, BPF_REG_0); /* Restore R0 after clobbering RAX */ emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX); break; @@ -1552,7 +1556,7 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ if (is_imm8(jmp_offset)) { if (jmp_padding) { /* To keep the jmp_offset valid, the extra bytes are - * padded before the jump insn, so we substract the + * padded before the jump insn, so we subtract the * 2 bytes of jmp_cond insn from INSN_SZ_DIFF. * * If the previous pass already emits an imm8 @@ -1627,7 +1631,7 @@ emit_jmp: if (jmp_padding) { /* To avoid breaking jmp_offset, the extra bytes * are padded before the actual jmp insn, so - * 2 bytes is substracted from INSN_SZ_DIFF. + * 2 bytes is subtracted from INSN_SZ_DIFF. * * If the previous pass already emits an imm8 * jmp, there is nothing to pad (0 byte). @@ -1685,7 +1689,16 @@ emit_jmp: } if (image) { - if (unlikely(proglen + ilen > oldproglen)) { + /* + * When populating the image, assert that: + * + * i) We do not write beyond the allocated space, and + * ii) addrs[i] did not change from the prior run, in order + * to validate assumptions made for computing branch + * displacements. + */ + if (unlikely(proglen + ilen > oldproglen || + proglen + ilen != addrs[i])) { pr_err("bpf_jit: fatal error\n"); return -EFAULT; } @@ -1932,7 +1945,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, * add rsp, 8 // skip eth_type_trans's frame * ret // return to its caller */ -int arch_prepare_bpf_trampoline(void *image, void *image_end, +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_progs *tprogs, void *orig_call) @@ -1971,6 +1984,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, save_regs(m, &prog, nr_args, stack_size); + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_enter, prog)) { + ret = -EINVAL; + goto cleanup; + } + } + if (fentry->nr_progs) if (invoke_bpf(m, &prog, fentry, stack_size)) return -EINVAL; @@ -1989,8 +2011,7 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } if (flags & BPF_TRAMP_F_CALL_ORIG) { - if (fentry->nr_progs || fmod_ret->nr_progs) - restore_regs(m, &prog, nr_args, stack_size); + restore_regs(m, &prog, nr_args, stack_size); /* call original function */ if (emit_call(&prog, orig_call, prog)) { @@ -1999,6 +2020,9 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + im->ip_after_call = prog; + memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); + prog += X86_PATCH_SIZE; } if (fmod_ret->nr_progs) { @@ -2029,9 +2053,17 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, * the return value is only updated on the stack and still needs to be * restored to R0. */ - if (flags & BPF_TRAMP_F_CALL_ORIG) + if (flags & BPF_TRAMP_F_CALL_ORIG) { + im->ip_epilogue = prog; + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_exit, prog)) { + ret = -EINVAL; + goto cleanup; + } /* restore original return value back into RAX */ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); + } EMIT1(0x5B); /* pop rbx */ EMIT1(0xC9); /* leave */ @@ -2221,7 +2253,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) padding = true; goto skip_init_addrs; } - addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); + addrs = kvmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); if (!addrs) { prog = orig_prog; goto out_addrs; @@ -2313,7 +2345,7 @@ out_image: if (image) bpf_prog_fill_jited_linfo(prog, addrs + 1); out_addrs: - kfree(addrs); + kvfree(addrs); kfree(jit_data); prog->aux->jit_data = NULL; } diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index d17b67c69f89..6a99def7d315 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -2276,7 +2276,16 @@ notyet: } if (image) { - if (unlikely(proglen + ilen > oldproglen)) { + /* + * When populating the image, assert that: + * + * i) We do not write beyond the allocated space, and + * ii) addrs[i] did not change from the prior run, in order + * to validate assumptions made for computing branch + * displacements. + */ + if (unlikely(proglen + ilen > oldproglen || + proglen + ilen != addrs[i])) { pr_err("bpf_jit: fatal error\n"); return -EFAULT; } diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 0a0e168be1cb..02dc64625e64 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -375,7 +375,7 @@ static const struct dmi_system_id msi_k8t_dmi_table[] = { * The BIOS only gives options "DISABLED" and "AUTO". This code sets * the corresponding register-value to enable the soundcard. * - * The soundcard is only enabled, if the mainborad is identified + * The soundcard is only enabled, if the mainboard is identified * via DMI-tables and the soundcard is detected to be off. */ static void pci_fixup_msi_k8t_onboard_sound(struct pci_dev *dev) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 1b82d77019b1..df7b5477fc4f 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -195,7 +195,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) } /* - * Certain firmware versions are way too sentimential and still believe + * Certain firmware versions are way too sentimental and still believe * they are exclusive and unquestionable owners of the first physical page, * even though they explicitly mark it as EFI_CONVENTIONAL_MEMORY * (but then write-access it later during SetVirtualAddressMap()). @@ -457,7 +457,7 @@ void __init efi_dump_pagetable(void) * in a kernel thread and user context. Preemption needs to remain disabled * while the EFI-mm is borrowed. mmgrab()/mmdrop() is not used because the mm * can not change under us. - * It should be ensured that there are no concurent calls to this function. + * It should be ensured that there are no concurrent calls to this function. */ void efi_enter_mm(void) { diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 67d93a243c35..7850111008a8 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -441,7 +441,7 @@ void __init efi_free_boot_services(void) * 1.4.4 with SGX enabled booting Linux via Fedora 24's * grub2-efi on a hard disk. (And no, I don't know why * this happened, but Linux should still try to boot rather - * panicing early.) + * panicking early.) */ rm_size = real_mode_size_needed(); if (rm_size && (start + rm_size) < (1<<20) && size >= rm_size) { @@ -726,7 +726,7 @@ void efi_crash_gracefully_on_page_fault(unsigned long phys_addr) * Buggy efi_reset_system() is handled differently from other EFI * Runtime Services as it doesn't use efi_rts_wq. Although, * native_machine_emergency_restart() says that machine_real_restart() - * could fail, it's better not to compilcate this fault handler + * could fail, it's better not to complicate this fault handler * because this case occurs *very* rarely and hence could be improved * on a need by basis. */ diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index 0286fe1b14b5..d3d456925b2a 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/** +/* * imr.c -- Intel Isolated Memory Region driver * * Copyright(c) 2013 Intel Corporation. @@ -551,7 +551,7 @@ static void __init imr_fixup_memmap(struct imr_device *idev) /* * Setup an unlocked IMR around the physical extent of the kernel - * from the beginning of the .text secton to the end of the + * from the beginning of the .text section to the end of the * .rodata section as one physically contiguous block. * * We don't round up @size since it is already PAGE_SIZE aligned. diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index 570e3062faac..761f3689f60a 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/** +/* * imr_selftest.c -- Intel Isolated Memory Region self-test driver * * Copyright(c) 2013 Intel Corporation. diff --git a/arch/x86/platform/intel/iosf_mbi.c b/arch/x86/platform/intel/iosf_mbi.c index 526f70f27c1c..fdd49d70b437 100644 --- a/arch/x86/platform/intel/iosf_mbi.c +++ b/arch/x86/platform/intel/iosf_mbi.c @@ -187,7 +187,7 @@ bool iosf_mbi_available(void) EXPORT_SYMBOL(iosf_mbi_available); /* - **************** P-Unit/kernel shared I2C bus arbritration **************** + **************** P-Unit/kernel shared I2C bus arbitration **************** * * Some Bay Trail and Cherry Trail devices have the P-Unit and us (the kernel) * share a single I2C bus to the PMIC. Below are helpers to arbitrate the @@ -493,7 +493,7 @@ static void iosf_sideband_debug_init(void) /* mcrx */ debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx); - /* mcr - initiates mailbox tranaction */ + /* mcr - initiates mailbox transaction */ debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops); } diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c index 1ac8578258af..b42bfdab01a9 100644 --- a/arch/x86/platform/iris/iris.c +++ b/arch/x86/platform/iris/iris.c @@ -27,7 +27,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>"); MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille"); -MODULE_SUPPORTED_DEVICE("Eurobraille/Iris"); static bool force; diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 85f4638764d6..994a229cb79f 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -27,7 +27,7 @@ static bool lid_wake_on_close; * wake-on-close. This is implemented as standard by the XO-1.5 DSDT. * * We provide here a sysfs attribute that will additionally enable - * wake-on-close behavior. This is useful (e.g.) when we oportunistically + * wake-on-close behavior. This is useful (e.g.) when we opportunistically * suspend with the display running; if the lid is then closed, we want to * wake up to turn the display off. * diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index 26d1f6693789..75e3319e8bee 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -131,7 +131,7 @@ void * __init prom_early_alloc(unsigned long size) const size_t chunk_size = max(PAGE_SIZE, size); /* - * To mimimize the number of allocations, grab at least + * To minimize the number of allocations, grab at least * PAGE_SIZE of memory (that's an arbitrary choice that's * fast enough on the platforms we care about while minimizing * wasted bootmem) and hand off chunks of it to callers. diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index d2ccadc247e6..66b317398b8a 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -30,10 +30,10 @@ * the boot start info structure. * - `cr0`: bit 0 (PE) must be set. All the other writeable bits are cleared. * - `cr4`: all bits are cleared. - * - `cs `: must be a 32-bit read/execute code segment with a base of ‘0’ - * and a limit of ‘0xFFFFFFFF’. The selector value is unspecified. + * - `cs `: must be a 32-bit read/execute code segment with a base of `0` + * and a limit of `0xFFFFFFFF`. The selector value is unspecified. * - `ds`, `es`: must be a 32-bit read/write data segment with a base of - * ‘0’ and a limit of ‘0xFFFFFFFF’. The selector values are all + * `0` and a limit of `0xFFFFFFFF`. The selector values are all * unspecified. * - `tr`: must be a 32-bit TSS (active) with a base of '0' and a limit * of '0x67'. diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index eafc530c8767..1e9ff28bc2e0 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -24,6 +24,7 @@ #include <asm/kdebug.h> #include <asm/local64.h> #include <asm/nmi.h> +#include <asm/reboot.h> #include <asm/traps.h> #include <asm/uv/uv.h> #include <asm/uv/uv_hub.h> @@ -91,6 +92,8 @@ static atomic_t uv_nmi_cpus_in_nmi = ATOMIC_INIT(-1); static atomic_t uv_nmi_slave_continue; static cpumask_var_t uv_nmi_cpu_mask; +static atomic_t uv_nmi_kexec_failed; + /* Values for uv_nmi_slave_continue */ #define SLAVE_CLEAR 0 #define SLAVE_CONTINUE 1 @@ -834,38 +837,35 @@ static void uv_nmi_touch_watchdogs(void) touch_nmi_watchdog(); } -static atomic_t uv_nmi_kexec_failed; - -#if defined(CONFIG_KEXEC_CORE) -static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) +static void uv_nmi_kdump(int cpu, int main, struct pt_regs *regs) { + /* Check if kdump kernel loaded for both main and secondary CPUs */ + if (!kexec_crash_image) { + if (main) + pr_err("UV: NMI error: kdump kernel not loaded\n"); + return; + } + /* Call crash to dump system state */ - if (master) { + if (main) { pr_emerg("UV: NMI executing crash_kexec on CPU%d\n", cpu); crash_kexec(regs); - pr_emerg("UV: crash_kexec unexpectedly returned, "); + pr_emerg("UV: crash_kexec unexpectedly returned\n"); atomic_set(&uv_nmi_kexec_failed, 1); - if (!kexec_crash_image) { - pr_cont("crash kernel not loaded\n"); - return; - } - pr_cont("kexec busy, stalling cpus while waiting\n"); - } - /* If crash exec fails the slaves should return, otherwise stall */ - while (atomic_read(&uv_nmi_kexec_failed) == 0) - mdelay(10); -} + } else { /* secondary */ -#else /* !CONFIG_KEXEC_CORE */ -static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) -{ - if (master) - pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n"); - atomic_set(&uv_nmi_kexec_failed, 1); + /* If kdump kernel fails, secondaries will exit this loop */ + while (atomic_read(&uv_nmi_kexec_failed) == 0) { + + /* Once shootdown cpus starts, they do not return */ + run_crash_ipi_callback(regs); + + mdelay(10); + } + } } -#endif /* !CONFIG_KEXEC_CORE */ #ifdef CONFIG_KGDB #ifdef CONFIG_KGDB_KDB @@ -889,7 +889,7 @@ static inline int uv_nmi_kdb_reason(void) * Call KGDB/KDB from NMI handler * * Note that if both KGDB and KDB are configured, then the action of 'kgdb' or - * 'kdb' has no affect on which is used. See the KGDB documention for further + * 'kdb' has no affect on which is used. See the KGDB documentation for further * information. */ static void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master) diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index db1378c6ff26..c9908bcdb249 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -321,7 +321,7 @@ int hibernate_resume_nonboot_cpu_disable(void) /* * When bsp_check() is called in hibernate and suspend, cpu hotplug - * is disabled already. So it's unnessary to handle race condition between + * is disabled already. So it's unnecessary to handle race condition between * cpumask query and cpu hotplug. */ static int bsp_check(void) diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 22fda7d99159..1be71ef5e4c4 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -103,7 +103,7 @@ static void __init setup_real_mode(void) *ptr += phys_base; } - /* Must be perfomed *after* relocation. */ + /* Must be performed *after* relocation. */ trampoline_header = (struct trampoline_header *) __va(real_mode_header->trampoline_header); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index dc0a337f985b..4f18cd9eacd8 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1070,8 +1070,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .read_pmc = xen_read_pmc, - .iret = xen_iret, - .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, .load_gdt = xen_load_gdt, @@ -1233,8 +1231,8 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - pv_ops.init.patch = paravirt_patch_default; pv_ops.cpu = xen_cpu_ops; + paravirt_iret = xen_iret; xen_init_irq_ops(); /* diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index cf2ade864c30..1e28c880f642 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2410,7 +2410,7 @@ int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr, rmd.prot = prot; /* * We use the err_ptr to indicate if there we are doing a contiguous - * mapping or a discontigious mapping. + * mapping or a discontiguous mapping. */ rmd.contiguous = !err_ptr; rmd.no_translate = no_translate; diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index a3cc33091f46..ac06ca32e9ef 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -98,8 +98,8 @@ EXPORT_SYMBOL_GPL(xen_p2m_size); unsigned long xen_max_p2m_pfn __read_mostly; EXPORT_SYMBOL_GPL(xen_max_p2m_pfn); -#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT -#define P2M_LIMIT CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT +#ifdef CONFIG_XEN_MEMORY_HOTPLUG_LIMIT +#define P2M_LIMIT CONFIG_XEN_MEMORY_HOTPLUG_LIMIT #else #define P2M_LIMIT 0 #endif @@ -416,9 +416,6 @@ void __init xen_vmalloc_p2m_tree(void) xen_p2m_last_pfn = xen_max_p2m_pfn; p2m_limit = (phys_addr_t)P2M_LIMIT * 1024 * 1024 * 1024 / PAGE_SIZE; - if (!p2m_limit && IS_ENABLED(CONFIG_XEN_UNPOPULATED_ALLOC)) - p2m_limit = xen_start_info->nr_pages * XEN_EXTRA_MEM_RATIO; - vm.flags = VM_ALLOC; vm.size = ALIGN(sizeof(unsigned long) * max(xen_max_p2m_pfn, p2m_limit), PMD_SIZE * PMDS_PER_MID_PAGE); @@ -741,7 +738,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, map_ops[i].status = GNTST_general_error; unmap[0].host_addr = map_ops[i].host_addr, unmap[0].handle = map_ops[i].handle; - map_ops[i].handle = ~0; + map_ops[i].handle = INVALID_GRANT_HANDLE; if (map_ops[i].flags & GNTMAP_device_map) unmap[0].dev_bus_addr = map_ops[i].dev_bus_addr; else @@ -751,7 +748,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, kmap_ops[i].status = GNTST_general_error; unmap[1].host_addr = kmap_ops[i].host_addr, unmap[1].handle = kmap_ops[i].handle; - kmap_ops[i].handle = ~0; + kmap_ops[i].handle = INVALID_GRANT_HANDLE; if (kmap_ops[i].flags & GNTMAP_device_map) unmap[1].dev_bus_addr = kmap_ops[i].dev_bus_addr; else @@ -776,7 +773,6 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, out: return ret; } -EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_unmap_grant_ref *kunmap_ops, @@ -802,7 +798,6 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, return ret; } -EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); #ifdef CONFIG_XEN_DEBUG_FS #include <linux/debugfs.h> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1a3b75652fa4..8bfc10330107 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -59,6 +59,18 @@ static struct { } xen_remap_buf __initdata __aligned(PAGE_SIZE); static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; +/* + * The maximum amount of extra memory compared to the base size. The + * main scaling factor is the size of struct page. At extreme ratios + * of base:extra, all the base memory can be filled with page + * structures for the extra memory, leaving no space for anything + * else. + * + * 10x seems like a reasonable balance between scaling flexibility and + * leaving a practically usable system. + */ +#define EXTRA_MEM_RATIO (10) + static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB); static void __init xen_parse_512gb(void) @@ -778,13 +790,13 @@ char * __init xen_memory_setup(void) extra_pages += max_pages - max_pfn; /* - * Clamp the amount of extra memory to a XEN_EXTRA_MEM_RATIO + * Clamp the amount of extra memory to a EXTRA_MEM_RATIO * factor the base size. * * Make sure we have no memory above max_pages, as this area * isn't handled by the p2m management. */ - extra_pages = min3(XEN_EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), + extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), extra_pages, max_pages - max_pfn); i = 0; addr = xen_e820_table.entries[0].addr; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 91f5b330dcc6..d9c945ee1100 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -379,11 +379,6 @@ void xen_timer_resume(void) } } -static const struct pv_time_ops xen_time_ops __initconst = { - .sched_clock = xen_sched_clock, - .steal_clock = xen_steal_clock, -}; - static struct pvclock_vsyscall_time_info *xen_clock __read_mostly; static u64 xen_clock_value_saved; @@ -525,17 +520,24 @@ static void __init xen_time_init(void) pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier); } -void __init xen_init_time_ops(void) +static void __init xen_init_time_common(void) { xen_sched_clock_offset = xen_clocksource_read(); - pv_ops.time = xen_time_ops; + static_call_update(pv_steal_clock, xen_steal_clock); + paravirt_set_sched_clock(xen_sched_clock); + + x86_platform.calibrate_tsc = xen_tsc_khz; + x86_platform.get_wallclock = xen_get_wallclock; +} + +void __init xen_init_time_ops(void) +{ + xen_init_time_common(); x86_init.timers.timer_init = xen_time_init; x86_init.timers.setup_percpu_clockev = x86_init_noop; x86_cpuinit.setup_percpu_clockev = x86_init_noop; - x86_platform.calibrate_tsc = xen_tsc_khz; - x86_platform.get_wallclock = xen_get_wallclock; /* Dom0 uses the native method to set the hardware RTC. */ if (!xen_initial_domain()) x86_platform.set_wallclock = xen_set_wallclock; @@ -569,13 +571,11 @@ void __init xen_hvm_init_time_ops(void) return; } - xen_sched_clock_offset = xen_clocksource_read(); - pv_ops.time = xen_time_ops; + xen_init_time_common(); + x86_init.timers.setup_percpu_clockev = xen_time_init; x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents; - x86_platform.calibrate_tsc = xen_tsc_khz; - x86_platform.get_wallclock = xen_get_wallclock; x86_platform.set_wallclock = xen_set_wallclock; } #endif diff --git a/arch/xtensa/kernel/coprocessor.S b/arch/xtensa/kernel/coprocessor.S index c426b846beef..45cc0ae0af6f 100644 --- a/arch/xtensa/kernel/coprocessor.S +++ b/arch/xtensa/kernel/coprocessor.S @@ -100,37 +100,6 @@ LOAD_CP_REGS_TAB(7) /* - * coprocessor_flush(struct thread_info*, index) - * a2 a3 - * - * Save coprocessor registers for coprocessor 'index'. - * The register values are saved to or loaded from the coprocessor area - * inside the task_info structure. - * - * Note that this function doesn't update the coprocessor_owner information! - * - */ - -ENTRY(coprocessor_flush) - - /* reserve 4 bytes on stack to save a0 */ - abi_entry(4) - - s32i a0, a1, 0 - movi a0, .Lsave_cp_regs_jump_table - addx8 a3, a3, a0 - l32i a4, a3, 4 - l32i a3, a3, 0 - add a2, a2, a4 - beqz a3, 1f - callx0 a3 -1: l32i a0, a1, 0 - - abi_ret(4) - -ENDPROC(coprocessor_flush) - -/* * Entry condition: * * a0: trashed, original value saved on stack (PT_AREG0) @@ -245,6 +214,39 @@ ENTRY(fast_coprocessor) ENDPROC(fast_coprocessor) + .text + +/* + * coprocessor_flush(struct thread_info*, index) + * a2 a3 + * + * Save coprocessor registers for coprocessor 'index'. + * The register values are saved to or loaded from the coprocessor area + * inside the task_info structure. + * + * Note that this function doesn't update the coprocessor_owner information! + * + */ + +ENTRY(coprocessor_flush) + + /* reserve 4 bytes on stack to save a0 */ + abi_entry(4) + + s32i a0, a1, 0 + movi a0, .Lsave_cp_regs_jump_table + addx8 a3, a3, a0 + l32i a4, a3, 4 + l32i a3, a3, 0 + add a2, a2, a4 + beqz a3, 1f + callx0 a3 +1: l32i a0, a1, 0 + + abi_ret(4) + +ENDPROC(coprocessor_flush) + .data ENTRY(coprocessor_owner) diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 7666408ce12a..95a74890c7e9 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -112,8 +112,11 @@ good_area: */ fault = handle_mm_fault(vma, address, flags, regs); - if (fault_signal_pending(fault, regs)) + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + goto bad_page_fault; return; + } if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) |