diff options
Diffstat (limited to 'arch/x86')
253 files changed, 5252 insertions, 3092 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 1538562cc720..eb3abf8ac44e 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -1,6 +1,7 @@ - obj-y += entry/ +obj-$(CONFIG_PERF_EVENTS) += events/ + obj-$(CONFIG_KVM) += kvm/ # Xen paravirtualization support diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index eda18cecdbbd..8b680a5cb25b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -155,6 +155,7 @@ config X86 select VIRT_TO_BUS select X86_DEV_DMA_OPS if X86_64 select X86_FEATURE_NAMES if PROC_FS + select HAVE_STACK_VALIDATION if X86_64 select ARCH_USES_HIGH_VMA_FLAGS if X86_INTEL_MEMORY_PROTECTION_KEYS select ARCH_HAS_PKEYS if X86_INTEL_MEMORY_PROTECTION_KEYS @@ -305,6 +306,9 @@ config ARCH_SUPPORTS_UPROBES config FIX_EARLYCON_MEM def_bool y +config DEBUG_RODATA + def_bool y + config PGTABLE_LEVELS int default 4 if X86_64 @@ -780,8 +784,8 @@ config HPET_TIMER HPET is the next generation timer replacing legacy 8254s. The HPET provides a stable time base on SMP systems, unlike the TSC, but it is more expensive to access, - as it is off-chip. You can find the HPET spec at - <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>. + as it is off-chip. The interface used is documented + in the HPET spec, revision 1. You can safely choose Y here. However, HPET will only be activated if the platform and the BIOS support this feature. @@ -1162,22 +1166,23 @@ config MICROCODE bool "CPU microcode loading support" default y depends on CPU_SUP_AMD || CPU_SUP_INTEL - depends on BLK_DEV_INITRD select FW_LOADER ---help--- - If you say Y here, you will be able to update the microcode on - certain Intel and AMD processors. The Intel support is for the - IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, - Xeon etc. The AMD support is for families 0x10 and later. You will - obviously need the actual microcode binary data itself which is not - shipped with the Linux kernel. + Intel and AMD processors. The Intel support is for the IA32 family, + e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, Xeon etc. The + AMD support is for families 0x10 and later. You will obviously need + the actual microcode binary data itself which is not shipped with + the Linux kernel. - This option selects the general module only, you need to select - at least one vendor specific module as well. + The preferred method to load microcode from a detached initrd is described + in Documentation/x86/early-microcode.txt. For that you need to enable + CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the + initrd for microcode blobs. - To compile this driver as a module, choose M here: the module - will be called microcode. + In addition, you can build-in the microcode into the kernel. For that you + need to enable FIRMWARE_IN_KERNEL and add the vendor-supplied microcode + to the CONFIG_EXTRA_FIRMWARE config option. config MICROCODE_INTEL bool "Intel microcode loading support" @@ -2447,8 +2452,6 @@ config PCI_CNB20LE_QUIRK You should say N unless you know you need this. -source "drivers/pci/pcie/Kconfig" - source "drivers/pci/Kconfig" # x86_64 have no ISA slots, but can have ISA-style DMA. @@ -2604,8 +2607,6 @@ config AMD_NB source "drivers/pcmcia/Kconfig" -source "drivers/pci/hotplug/Kconfig" - config RAPIDIO tristate "RapidIO support" depends on PCI diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 68a2d1f0a683..67eec55093a5 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -74,28 +74,16 @@ config EFI_PGT_DUMP issues with the mapping of the EFI runtime regions into that table. -config DEBUG_RODATA - bool "Write protect kernel read-only data structures" - default y - depends on DEBUG_KERNEL - ---help--- - Mark the kernel read-only data as write-protected in the pagetables, - in order to catch accidental (and incorrect) writes to such const - data. This is recommended so that we can catch kernel bugs sooner. - If in doubt, say "Y". - config DEBUG_RODATA_TEST - bool "Testcase for the DEBUG_RODATA feature" - depends on DEBUG_RODATA + bool "Testcase for the marking rodata read-only" default y ---help--- - This option enables a testcase for the DEBUG_RODATA - feature as well as for the change_page_attr() infrastructure. + This option enables a testcase for the setting rodata read-only + as well as for the change_page_attr() infrastructure. If in doubt, say "N" config DEBUG_WX bool "Warn on W+X mappings at boot" - depends on DEBUG_RODATA select X86_PTDUMP_CORE ---help--- Generate a warning if any W+X mappings are found at boot. diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index bbe1a62efc02..0bf6749522d9 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -9,7 +9,8 @@ # Changed by many, many contributors over the years. # -KASAN_SANITIZE := n +KASAN_SANITIZE := n +OBJECT_FILES_NON_STANDARD := y # If you want to preset the SVGA mode, uncomment the next line and # set SVGA_MODE to whatever number you want. diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index f9ce75d80101..5e1d26e09407 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -16,7 +16,8 @@ # (see scripts/Makefile.lib size_append) # compressed vmlinux.bin.all + u32 size of vmlinux.bin.all -KASAN_SANITIZE := n +KASAN_SANITIZE := n +OBJECT_FILES_NON_STANDARD := y targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index a7661c430cd9..0702d2531bc7 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -49,7 +49,6 @@ typedef unsigned int u32; /* This must be large enough to hold the entire setup */ u8 buf[SETUP_SECT_MAX*512]; -int is_big_kernel; #define PECOFF_RELOC_RESERVE 0x20 diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 028be48c8839..265901a84f3f 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -288,7 +288,7 @@ CONFIG_NLS_ISO8859_1=y CONFIG_NLS_UTF8=y CONFIG_PRINTK_TIME=y # CONFIG_ENABLE_WARN_DEPRECATED is not set -CONFIG_FRAME_WARN=2048 +CONFIG_FRAME_WARN=1024 CONFIG_MAGIC_SYSRQ=y # CONFIG_UNUSED_SYMBOLS is not set CONFIG_DEBUG_KERNEL=y @@ -303,7 +303,6 @@ CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_OPTIMIZE_INLINING=y -CONFIG_KEYS_DEBUG_PROC_KEYS=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index cb5b3ab5beec..4f404a64681b 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -300,7 +300,6 @@ CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_OPTIMIZE_INLINING=y -CONFIG_KEYS_DEBUG_PROC_KEYS=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 6bd2c6c95373..383a6f84a060 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -31,6 +31,7 @@ #include <linux/linkage.h> #include <asm/inst.h> +#include <asm/frame.h> /* * The following macros are used to move an (un)aligned 16 byte value to/from @@ -1800,11 +1801,12 @@ ENDPROC(_key_expansion_256b) * unsigned int key_len) */ ENTRY(aesni_set_key) + FRAME_BEGIN #ifndef __x86_64__ pushl KEYP - movl 8(%esp), KEYP # ctx - movl 12(%esp), UKEYP # in_key - movl 16(%esp), %edx # key_len + movl (FRAME_OFFSET+8)(%esp), KEYP # ctx + movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key + movl (FRAME_OFFSET+16)(%esp), %edx # key_len #endif movups (UKEYP), %xmm0 # user key (first 16 bytes) movaps %xmm0, (KEYP) @@ -1905,6 +1907,7 @@ ENTRY(aesni_set_key) #ifndef __x86_64__ popl KEYP #endif + FRAME_END ret ENDPROC(aesni_set_key) @@ -1912,12 +1915,13 @@ ENDPROC(aesni_set_key) * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) */ ENTRY(aesni_enc) + FRAME_BEGIN #ifndef __x86_64__ pushl KEYP pushl KLEN - movl 12(%esp), KEYP - movl 16(%esp), OUTP - movl 20(%esp), INP + movl (FRAME_OFFSET+12)(%esp), KEYP # ctx + movl (FRAME_OFFSET+16)(%esp), OUTP # dst + movl (FRAME_OFFSET+20)(%esp), INP # src #endif movl 480(KEYP), KLEN # key length movups (INP), STATE # input @@ -1927,6 +1931,7 @@ ENTRY(aesni_enc) popl KLEN popl KEYP #endif + FRAME_END ret ENDPROC(aesni_enc) @@ -2101,12 +2106,13 @@ ENDPROC(_aesni_enc4) * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) */ ENTRY(aesni_dec) + FRAME_BEGIN #ifndef __x86_64__ pushl KEYP pushl KLEN - movl 12(%esp), KEYP - movl 16(%esp), OUTP - movl 20(%esp), INP + movl (FRAME_OFFSET+12)(%esp), KEYP # ctx + movl (FRAME_OFFSET+16)(%esp), OUTP # dst + movl (FRAME_OFFSET+20)(%esp), INP # src #endif mov 480(KEYP), KLEN # key length add $240, KEYP @@ -2117,6 +2123,7 @@ ENTRY(aesni_dec) popl KLEN popl KEYP #endif + FRAME_END ret ENDPROC(aesni_dec) @@ -2292,14 +2299,15 @@ ENDPROC(_aesni_dec4) * size_t len) */ ENTRY(aesni_ecb_enc) + FRAME_BEGIN #ifndef __x86_64__ pushl LEN pushl KEYP pushl KLEN - movl 16(%esp), KEYP - movl 20(%esp), OUTP - movl 24(%esp), INP - movl 28(%esp), LEN + movl (FRAME_OFFSET+16)(%esp), KEYP # ctx + movl (FRAME_OFFSET+20)(%esp), OUTP # dst + movl (FRAME_OFFSET+24)(%esp), INP # src + movl (FRAME_OFFSET+28)(%esp), LEN # len #endif test LEN, LEN # check length jz .Lecb_enc_ret @@ -2342,6 +2350,7 @@ ENTRY(aesni_ecb_enc) popl KEYP popl LEN #endif + FRAME_END ret ENDPROC(aesni_ecb_enc) @@ -2350,14 +2359,15 @@ ENDPROC(aesni_ecb_enc) * size_t len); */ ENTRY(aesni_ecb_dec) + FRAME_BEGIN #ifndef __x86_64__ pushl LEN pushl KEYP pushl KLEN - movl 16(%esp), KEYP - movl 20(%esp), OUTP - movl 24(%esp), INP - movl 28(%esp), LEN + movl (FRAME_OFFSET+16)(%esp), KEYP # ctx + movl (FRAME_OFFSET+20)(%esp), OUTP # dst + movl (FRAME_OFFSET+24)(%esp), INP # src + movl (FRAME_OFFSET+28)(%esp), LEN # len #endif test LEN, LEN jz .Lecb_dec_ret @@ -2401,6 +2411,7 @@ ENTRY(aesni_ecb_dec) popl KEYP popl LEN #endif + FRAME_END ret ENDPROC(aesni_ecb_dec) @@ -2409,16 +2420,17 @@ ENDPROC(aesni_ecb_dec) * size_t len, u8 *iv) */ ENTRY(aesni_cbc_enc) + FRAME_BEGIN #ifndef __x86_64__ pushl IVP pushl LEN pushl KEYP pushl KLEN - movl 20(%esp), KEYP - movl 24(%esp), OUTP - movl 28(%esp), INP - movl 32(%esp), LEN - movl 36(%esp), IVP + movl (FRAME_OFFSET+20)(%esp), KEYP # ctx + movl (FRAME_OFFSET+24)(%esp), OUTP # dst + movl (FRAME_OFFSET+28)(%esp), INP # src + movl (FRAME_OFFSET+32)(%esp), LEN # len + movl (FRAME_OFFSET+36)(%esp), IVP # iv #endif cmp $16, LEN jb .Lcbc_enc_ret @@ -2443,6 +2455,7 @@ ENTRY(aesni_cbc_enc) popl LEN popl IVP #endif + FRAME_END ret ENDPROC(aesni_cbc_enc) @@ -2451,16 +2464,17 @@ ENDPROC(aesni_cbc_enc) * size_t len, u8 *iv) */ ENTRY(aesni_cbc_dec) + FRAME_BEGIN #ifndef __x86_64__ pushl IVP pushl LEN pushl KEYP pushl KLEN - movl 20(%esp), KEYP - movl 24(%esp), OUTP - movl 28(%esp), INP - movl 32(%esp), LEN - movl 36(%esp), IVP + movl (FRAME_OFFSET+20)(%esp), KEYP # ctx + movl (FRAME_OFFSET+24)(%esp), OUTP # dst + movl (FRAME_OFFSET+28)(%esp), INP # src + movl (FRAME_OFFSET+32)(%esp), LEN # len + movl (FRAME_OFFSET+36)(%esp), IVP # iv #endif cmp $16, LEN jb .Lcbc_dec_just_ret @@ -2534,13 +2548,16 @@ ENTRY(aesni_cbc_dec) popl LEN popl IVP #endif + FRAME_END ret ENDPROC(aesni_cbc_dec) #ifdef __x86_64__ +.pushsection .rodata .align 16 .Lbswap_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +.popsection /* * _aesni_inc_init: internal ABI @@ -2598,6 +2615,7 @@ ENDPROC(_aesni_inc) * size_t len, u8 *iv) */ ENTRY(aesni_ctr_enc) + FRAME_BEGIN cmp $16, LEN jb .Lctr_enc_just_ret mov 480(KEYP), KLEN @@ -2651,6 +2669,7 @@ ENTRY(aesni_ctr_enc) .Lctr_enc_ret: movups IV, (IVP) .Lctr_enc_just_ret: + FRAME_END ret ENDPROC(aesni_ctr_enc) @@ -2677,6 +2696,7 @@ ENDPROC(aesni_ctr_enc) * bool enc, u8 *iv) */ ENTRY(aesni_xts_crypt8) + FRAME_BEGIN cmpb $0, %cl movl $0, %ecx movl $240, %r10d @@ -2777,6 +2797,7 @@ ENTRY(aesni_xts_crypt8) pxor INC, STATE4 movdqu STATE4, 0x70(OUTP) + FRAME_END ret ENDPROC(aesni_xts_crypt8) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 3633ad6145c5..064c7e2bd7c8 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -639,16 +639,11 @@ static int xts_aesni_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { struct aesni_xts_ctx *ctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; int err; - /* key consists of keys of equal size concatenated, therefore - * the length must be even - */ - if (keylen % 2) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } + err = xts_check_key(tfm, key, keylen); + if (err) + return err; /* first half of xts-key is for crypt */ err = aes_set_key_common(tfm, ctx->raw_crypt_ctx, key, keylen / 2); diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index ce71f9212409..aa9e8bd163f6 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -16,6 +16,7 @@ */ #include <linux/linkage.h> +#include <asm/frame.h> #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -726,6 +727,7 @@ __camellia_enc_blk16: * %xmm0..%xmm15: 16 encrypted blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + FRAME_BEGIN leaq 8 * 16(%rax), %rcx; @@ -780,6 +782,7 @@ __camellia_enc_blk16: %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax)); + FRAME_END ret; .align 8 @@ -812,6 +815,7 @@ __camellia_dec_blk16: * %xmm0..%xmm15: 16 plaintext blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + FRAME_BEGIN leaq 8 * 16(%rax), %rcx; @@ -865,6 +869,7 @@ __camellia_dec_blk16: %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax)); + FRAME_END ret; .align 8 @@ -890,6 +895,7 @@ ENTRY(camellia_ecb_enc_16way) * %rsi: dst (16 blocks) * %rdx: src (16 blocks) */ + FRAME_BEGIN inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, @@ -904,6 +910,7 @@ ENTRY(camellia_ecb_enc_16way) %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); + FRAME_END ret; ENDPROC(camellia_ecb_enc_16way) @@ -913,6 +920,7 @@ ENTRY(camellia_ecb_dec_16way) * %rsi: dst (16 blocks) * %rdx: src (16 blocks) */ + FRAME_BEGIN cmpl $16, key_length(CTX); movl $32, %r8d; @@ -932,6 +940,7 @@ ENTRY(camellia_ecb_dec_16way) %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); + FRAME_END ret; ENDPROC(camellia_ecb_dec_16way) @@ -941,6 +950,7 @@ ENTRY(camellia_cbc_dec_16way) * %rsi: dst (16 blocks) * %rdx: src (16 blocks) */ + FRAME_BEGIN cmpl $16, key_length(CTX); movl $32, %r8d; @@ -981,6 +991,7 @@ ENTRY(camellia_cbc_dec_16way) %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); + FRAME_END ret; ENDPROC(camellia_cbc_dec_16way) @@ -997,6 +1008,7 @@ ENTRY(camellia_ctr_16way) * %rdx: src (16 blocks) * %rcx: iv (little endian, 128bit) */ + FRAME_BEGIN subq $(16 * 16), %rsp; movq %rsp, %rax; @@ -1092,6 +1104,7 @@ ENTRY(camellia_ctr_16way) %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); + FRAME_END ret; ENDPROC(camellia_ctr_16way) @@ -1112,6 +1125,7 @@ camellia_xts_crypt_16way: * %r8: index for input whitening key * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16 */ + FRAME_BEGIN subq $(16 * 16), %rsp; movq %rsp, %rax; @@ -1234,6 +1248,7 @@ camellia_xts_crypt_16way: %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); + FRAME_END ret; ENDPROC(camellia_xts_crypt_16way) diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 0e0b8863a34b..16186c18656d 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -11,6 +11,7 @@ */ #include <linux/linkage.h> +#include <asm/frame.h> #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -766,6 +767,7 @@ __camellia_enc_blk32: * %ymm0..%ymm15: 32 encrypted blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + FRAME_BEGIN leaq 8 * 32(%rax), %rcx; @@ -820,6 +822,7 @@ __camellia_enc_blk32: %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax)); + FRAME_END ret; .align 8 @@ -852,6 +855,7 @@ __camellia_dec_blk32: * %ymm0..%ymm15: 16 plaintext blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + FRAME_BEGIN leaq 8 * 32(%rax), %rcx; @@ -905,6 +909,7 @@ __camellia_dec_blk32: %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); + FRAME_END ret; .align 8 @@ -930,6 +935,7 @@ ENTRY(camellia_ecb_enc_32way) * %rsi: dst (32 blocks) * %rdx: src (32 blocks) */ + FRAME_BEGIN vzeroupper; @@ -948,6 +954,7 @@ ENTRY(camellia_ecb_enc_32way) vzeroupper; + FRAME_END ret; ENDPROC(camellia_ecb_enc_32way) @@ -957,6 +964,7 @@ ENTRY(camellia_ecb_dec_32way) * %rsi: dst (32 blocks) * %rdx: src (32 blocks) */ + FRAME_BEGIN vzeroupper; @@ -980,6 +988,7 @@ ENTRY(camellia_ecb_dec_32way) vzeroupper; + FRAME_END ret; ENDPROC(camellia_ecb_dec_32way) @@ -989,6 +998,7 @@ ENTRY(camellia_cbc_dec_32way) * %rsi: dst (32 blocks) * %rdx: src (32 blocks) */ + FRAME_BEGIN vzeroupper; @@ -1046,6 +1056,7 @@ ENTRY(camellia_cbc_dec_32way) vzeroupper; + FRAME_END ret; ENDPROC(camellia_cbc_dec_32way) @@ -1070,6 +1081,7 @@ ENTRY(camellia_ctr_32way) * %rdx: src (32 blocks) * %rcx: iv (little endian, 128bit) */ + FRAME_BEGIN vzeroupper; @@ -1184,6 +1196,7 @@ ENTRY(camellia_ctr_32way) vzeroupper; + FRAME_END ret; ENDPROC(camellia_ctr_32way) @@ -1216,6 +1229,7 @@ camellia_xts_crypt_32way: * %r8: index for input whitening key * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32 */ + FRAME_BEGIN vzeroupper; @@ -1349,6 +1363,7 @@ camellia_xts_crypt_32way: vzeroupper; + FRAME_END ret; ENDPROC(camellia_xts_crypt_32way) diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 5c8b6266a394..aa76cad9d262 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -1503,13 +1503,9 @@ int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, u32 *flags = &tfm->crt_flags; int err; - /* key consists of keys of equal size concatenated, therefore - * the length must be even - */ - if (keylen % 2) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } + err = xts_check_key(tfm, key, keylen); + if (err) + return err; /* first half of xts-key is for crypt */ err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index c35fd5d6ecd2..14fa1966bf01 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -24,6 +24,7 @@ */ #include <linux/linkage.h> +#include <asm/frame.h> .file "cast5-avx-x86_64-asm_64.S" @@ -365,6 +366,7 @@ ENTRY(cast5_ecb_enc_16way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN movq %rsi, %r11; @@ -388,6 +390,7 @@ ENTRY(cast5_ecb_enc_16way) vmovdqu RR4, (6*4*4)(%r11); vmovdqu RL4, (7*4*4)(%r11); + FRAME_END ret; ENDPROC(cast5_ecb_enc_16way) @@ -398,6 +401,7 @@ ENTRY(cast5_ecb_dec_16way) * %rdx: src */ + FRAME_BEGIN movq %rsi, %r11; vmovdqu (0*4*4)(%rdx), RL1; @@ -420,6 +424,7 @@ ENTRY(cast5_ecb_dec_16way) vmovdqu RR4, (6*4*4)(%r11); vmovdqu RL4, (7*4*4)(%r11); + FRAME_END ret; ENDPROC(cast5_ecb_dec_16way) @@ -429,6 +434,7 @@ ENTRY(cast5_cbc_dec_16way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN pushq %r12; @@ -469,6 +475,7 @@ ENTRY(cast5_cbc_dec_16way) popq %r12; + FRAME_END ret; ENDPROC(cast5_cbc_dec_16way) @@ -479,6 +486,7 @@ ENTRY(cast5_ctr_16way) * %rdx: src * %rcx: iv (big endian, 64bit) */ + FRAME_BEGIN pushq %r12; @@ -542,5 +550,6 @@ ENTRY(cast5_ctr_16way) popq %r12; + FRAME_END ret; ENDPROC(cast5_ctr_16way) diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index e3531f833951..c419389889cd 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -24,6 +24,7 @@ */ #include <linux/linkage.h> +#include <asm/frame.h> #include "glue_helper-asm-avx.S" .file "cast6-avx-x86_64-asm_64.S" @@ -349,6 +350,7 @@ ENTRY(cast6_ecb_enc_8way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN movq %rsi, %r11; @@ -358,6 +360,7 @@ ENTRY(cast6_ecb_enc_8way) store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + FRAME_END ret; ENDPROC(cast6_ecb_enc_8way) @@ -367,6 +370,7 @@ ENTRY(cast6_ecb_dec_8way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN movq %rsi, %r11; @@ -376,6 +380,7 @@ ENTRY(cast6_ecb_dec_8way) store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + FRAME_END ret; ENDPROC(cast6_ecb_dec_8way) @@ -385,6 +390,7 @@ ENTRY(cast6_cbc_dec_8way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN pushq %r12; @@ -399,6 +405,7 @@ ENTRY(cast6_cbc_dec_8way) popq %r12; + FRAME_END ret; ENDPROC(cast6_cbc_dec_8way) @@ -409,6 +416,7 @@ ENTRY(cast6_ctr_8way) * %rdx: src * %rcx: iv (little endian, 128bit) */ + FRAME_BEGIN pushq %r12; @@ -424,6 +432,7 @@ ENTRY(cast6_ctr_8way) popq %r12; + FRAME_END ret; ENDPROC(cast6_ctr_8way) @@ -434,6 +443,7 @@ ENTRY(cast6_xts_enc_8way) * %rdx: src * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ + FRAME_BEGIN movq %rsi, %r11; @@ -446,6 +456,7 @@ ENTRY(cast6_xts_enc_8way) /* dst <= regs xor IVs(in dst) */ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + FRAME_END ret; ENDPROC(cast6_xts_enc_8way) @@ -456,6 +467,7 @@ ENTRY(cast6_xts_dec_8way) * %rdx: src * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ + FRAME_BEGIN movq %rsi, %r11; @@ -468,5 +480,6 @@ ENTRY(cast6_xts_dec_8way) /* dst <= regs xor IVs(in dst) */ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + FRAME_END ret; ENDPROC(cast6_xts_dec_8way) diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index fca459578c35..50e684768c55 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -329,13 +329,9 @@ static int xts_cast6_setkey(struct crypto_tfm *tfm, const u8 *key, u32 *flags = &tfm->crt_flags; int err; - /* key consists of keys of equal size concatenated, therefore - * the length must be even - */ - if (keylen % 2) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } + err = xts_check_key(tfm, key, keylen); + if (err) + return err; /* first half of xts-key is for crypt */ err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 4fe27e074194..dc05f010ca9b 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -170,8 +170,8 @@ continue_block: ## branch into array lea jump_table(%rip), bufp movzxw (bufp, %rax, 2), len - offset=crc_array-jump_table - lea offset(bufp, len, 1), bufp + lea crc_array(%rip), bufp + lea (bufp, len, 1), bufp jmp *bufp ################################################################ @@ -310,7 +310,9 @@ do_return: popq %rdi popq %rbx ret +ENDPROC(crc_pcl) +.section .rodata, "a", %progbits ################################################################ ## jump table Table is 129 entries x 2 bytes each ################################################################ @@ -324,13 +326,11 @@ JMPTBL_ENTRY %i i=i+1 .endr -ENDPROC(crc_pcl) ################################################################ ## PCLMULQDQ tables ## Table is 128 entries x 2 words (8 bytes) each ################################################################ -.section .rodata, "a", %progbits .align 8 K_table: .long 0x493c7d27, 0x00000001 diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 5d1e0075ac24..eed55c8cca4f 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -18,6 +18,7 @@ #include <linux/linkage.h> #include <asm/inst.h> +#include <asm/frame.h> .data @@ -94,6 +95,7 @@ ENDPROC(__clmul_gf128mul_ble) /* void clmul_ghash_mul(char *dst, const u128 *shash) */ ENTRY(clmul_ghash_mul) + FRAME_BEGIN movups (%rdi), DATA movups (%rsi), SHASH movaps .Lbswap_mask, BSWAP @@ -101,6 +103,7 @@ ENTRY(clmul_ghash_mul) call __clmul_gf128mul_ble PSHUFB_XMM BSWAP DATA movups DATA, (%rdi) + FRAME_END ret ENDPROC(clmul_ghash_mul) @@ -109,6 +112,7 @@ ENDPROC(clmul_ghash_mul) * const u128 *shash); */ ENTRY(clmul_ghash_update) + FRAME_BEGIN cmp $16, %rdx jb .Lupdate_just_ret # check length movaps .Lbswap_mask, BSWAP @@ -128,5 +132,6 @@ ENTRY(clmul_ghash_update) PSHUFB_XMM BSWAP DATA movups DATA, (%rdi) .Lupdate_just_ret: + FRAME_END ret ENDPROC(clmul_ghash_update) diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 2f202f49872b..8be571808342 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -24,6 +24,7 @@ */ #include <linux/linkage.h> +#include <asm/frame.h> #include "glue_helper-asm-avx.S" .file "serpent-avx-x86_64-asm_64.S" @@ -681,6 +682,7 @@ ENTRY(serpent_ecb_enc_8way_avx) * %rsi: dst * %rdx: src */ + FRAME_BEGIN load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); @@ -688,6 +690,7 @@ ENTRY(serpent_ecb_enc_8way_avx) store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + FRAME_END ret; ENDPROC(serpent_ecb_enc_8way_avx) @@ -697,6 +700,7 @@ ENTRY(serpent_ecb_dec_8way_avx) * %rsi: dst * %rdx: src */ + FRAME_BEGIN load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); @@ -704,6 +708,7 @@ ENTRY(serpent_ecb_dec_8way_avx) store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); + FRAME_END ret; ENDPROC(serpent_ecb_dec_8way_avx) @@ -713,6 +718,7 @@ ENTRY(serpent_cbc_dec_8way_avx) * %rsi: dst * %rdx: src */ + FRAME_BEGIN load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); @@ -720,6 +726,7 @@ ENTRY(serpent_cbc_dec_8way_avx) store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); + FRAME_END ret; ENDPROC(serpent_cbc_dec_8way_avx) @@ -730,6 +737,7 @@ ENTRY(serpent_ctr_8way_avx) * %rdx: src * %rcx: iv (little endian, 128bit) */ + FRAME_BEGIN load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, RK0, RK1, RK2); @@ -738,6 +746,7 @@ ENTRY(serpent_ctr_8way_avx) store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + FRAME_END ret; ENDPROC(serpent_ctr_8way_avx) @@ -748,6 +757,7 @@ ENTRY(serpent_xts_enc_8way_avx) * %rdx: src * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ + FRAME_BEGIN /* regs <= src, dst <= IVs, regs <= regs xor IVs */ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, @@ -758,6 +768,7 @@ ENTRY(serpent_xts_enc_8way_avx) /* dst <= regs xor IVs(in dst) */ store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + FRAME_END ret; ENDPROC(serpent_xts_enc_8way_avx) @@ -768,6 +779,7 @@ ENTRY(serpent_xts_dec_8way_avx) * %rdx: src * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ + FRAME_BEGIN /* regs <= src, dst <= IVs, regs <= regs xor IVs */ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, @@ -778,5 +790,6 @@ ENTRY(serpent_xts_dec_8way_avx) /* dst <= regs xor IVs(in dst) */ store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); + FRAME_END ret; ENDPROC(serpent_xts_dec_8way_avx) diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S index b222085cccac..97c48add33ed 100644 --- a/arch/x86/crypto/serpent-avx2-asm_64.S +++ b/arch/x86/crypto/serpent-avx2-asm_64.S @@ -15,6 +15,7 @@ */ #include <linux/linkage.h> +#include <asm/frame.h> #include "glue_helper-asm-avx2.S" .file "serpent-avx2-asm_64.S" @@ -673,6 +674,7 @@ ENTRY(serpent_ecb_enc_16way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN vzeroupper; @@ -684,6 +686,7 @@ ENTRY(serpent_ecb_enc_16way) vzeroupper; + FRAME_END ret; ENDPROC(serpent_ecb_enc_16way) @@ -693,6 +696,7 @@ ENTRY(serpent_ecb_dec_16way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN vzeroupper; @@ -704,6 +708,7 @@ ENTRY(serpent_ecb_dec_16way) vzeroupper; + FRAME_END ret; ENDPROC(serpent_ecb_dec_16way) @@ -713,6 +718,7 @@ ENTRY(serpent_cbc_dec_16way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN vzeroupper; @@ -725,6 +731,7 @@ ENTRY(serpent_cbc_dec_16way) vzeroupper; + FRAME_END ret; ENDPROC(serpent_cbc_dec_16way) @@ -735,6 +742,7 @@ ENTRY(serpent_ctr_16way) * %rdx: src (16 blocks) * %rcx: iv (little endian, 128bit) */ + FRAME_BEGIN vzeroupper; @@ -748,6 +756,7 @@ ENTRY(serpent_ctr_16way) vzeroupper; + FRAME_END ret; ENDPROC(serpent_ctr_16way) @@ -758,6 +767,7 @@ ENTRY(serpent_xts_enc_16way) * %rdx: src (16 blocks) * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ + FRAME_BEGIN vzeroupper; @@ -772,6 +782,7 @@ ENTRY(serpent_xts_enc_16way) vzeroupper; + FRAME_END ret; ENDPROC(serpent_xts_enc_16way) @@ -782,6 +793,7 @@ ENTRY(serpent_xts_dec_16way) * %rdx: src (16 blocks) * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ + FRAME_BEGIN vzeroupper; @@ -796,5 +808,6 @@ ENTRY(serpent_xts_dec_16way) vzeroupper; + FRAME_END ret; ENDPROC(serpent_xts_dec_16way) diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 5dc37026c7ce..6f778d3daa22 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -332,16 +332,11 @@ int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; int err; - /* key consists of keys of equal size concatenated, therefore - * the length must be even - */ - if (keylen % 2) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } + err = xts_check_key(tfm, key, keylen); + if (err) + return err; /* first half of xts-key is for crypt */ err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2); diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 3643dd508f45..8943407e8917 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -309,16 +309,11 @@ static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; int err; - /* key consists of keys of equal size concatenated, therefore - * the length must be even - */ - if (keylen % 2) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } + err = xts_check_key(tfm, key, keylen); + if (err) + return err; /* first half of xts-key is for crypt */ err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2); diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index a841e9765bd6..a8a0224fa0f8 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -762,6 +762,38 @@ static int sha1_mb_async_digest(struct ahash_request *req) return crypto_ahash_digest(mcryptd_req); } +static int sha1_mb_async_export(struct ahash_request *req, void *out) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_export(mcryptd_req, out); +} + +static int sha1_mb_async_import(struct ahash_request *req, const void *in) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + struct crypto_shash *child = mcryptd_ahash_child(mcryptd_tfm); + struct mcryptd_hash_request_ctx *rctx; + struct shash_desc *desc; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + rctx = ahash_request_ctx(mcryptd_req); + desc = &rctx->desc; + desc->tfm = child; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + return crypto_ahash_import(mcryptd_req, in); +} + static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm) { struct mcryptd_ahash *mcryptd_tfm; @@ -796,8 +828,11 @@ static struct ahash_alg sha1_mb_async_alg = { .final = sha1_mb_async_final, .finup = sha1_mb_async_finup, .digest = sha1_mb_async_digest, + .export = sha1_mb_async_export, + .import = sha1_mb_async_import, .halg = { .digestsize = SHA1_DIGEST_SIZE, + .statesize = sizeof(struct sha1_hash_ctx), .base = { .cra_name = "sha1", .cra_driver_name = "sha1_mb", diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S index 85c4e1cf7172..96df6a39d7e2 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S @@ -52,6 +52,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <linux/linkage.h> +#include <asm/frame.h> #include "sha1_mb_mgr_datastruct.S" @@ -86,16 +87,6 @@ #define extra_blocks %arg2 #define p %arg2 - -# STACK_SPACE needs to be an odd multiple of 8 -_XMM_SAVE_SIZE = 10*16 -_GPR_SAVE_SIZE = 8*8 -_ALIGN_SIZE = 8 - -_XMM_SAVE = 0 -_GPR_SAVE = _XMM_SAVE + _XMM_SAVE_SIZE -STACK_SPACE = _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE - .macro LABEL prefix n \prefix\n\(): .endm @@ -113,16 +104,8 @@ offset = \_offset # JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state) # arg 1 : rcx : state ENTRY(sha1_mb_mgr_flush_avx2) - mov %rsp, %r10 - sub $STACK_SPACE, %rsp - and $~31, %rsp - mov %rbx, _GPR_SAVE(%rsp) - mov %r10, _GPR_SAVE+8*1(%rsp) #save rsp - mov %rbp, _GPR_SAVE+8*3(%rsp) - mov %r12, _GPR_SAVE+8*4(%rsp) - mov %r13, _GPR_SAVE+8*5(%rsp) - mov %r14, _GPR_SAVE+8*6(%rsp) - mov %r15, _GPR_SAVE+8*7(%rsp) + FRAME_BEGIN + push %rbx # If bit (32+3) is set, then all lanes are empty mov _unused_lanes(state), unused_lanes @@ -230,16 +213,8 @@ len_is_0: mov tmp2_w, offset(job_rax) return: - - mov _GPR_SAVE(%rsp), %rbx - mov _GPR_SAVE+8*1(%rsp), %r10 #saved rsp - mov _GPR_SAVE+8*3(%rsp), %rbp - mov _GPR_SAVE+8*4(%rsp), %r12 - mov _GPR_SAVE+8*5(%rsp), %r13 - mov _GPR_SAVE+8*6(%rsp), %r14 - mov _GPR_SAVE+8*7(%rsp), %r15 - mov %r10, %rsp - + pop %rbx + FRAME_END ret return_null: diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S index 2ab9560b53c8..63a0d9c8e31f 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S +++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S @@ -53,6 +53,7 @@ */ #include <linux/linkage.h> +#include <asm/frame.h> #include "sha1_mb_mgr_datastruct.S" @@ -86,33 +87,21 @@ job_rax = %rax len = %rax DWORD_len = %eax -lane = %rbp -tmp3 = %rbp +lane = %r12 +tmp3 = %r12 tmp = %r9 DWORD_tmp = %r9d lane_data = %r10 -# STACK_SPACE needs to be an odd multiple of 8 -STACK_SPACE = 8*8 + 16*10 + 8 - # JOB* submit_mb_mgr_submit_avx2(MB_MGR *state, job_sha1 *job) # arg 1 : rcx : state # arg 2 : rdx : job ENTRY(sha1_mb_mgr_submit_avx2) - - mov %rsp, %r10 - sub $STACK_SPACE, %rsp - and $~31, %rsp - - mov %rbx, (%rsp) - mov %r10, 8*2(%rsp) #save old rsp - mov %rbp, 8*3(%rsp) - mov %r12, 8*4(%rsp) - mov %r13, 8*5(%rsp) - mov %r14, 8*6(%rsp) - mov %r15, 8*7(%rsp) + FRAME_BEGIN + push %rbx + push %r12 mov _unused_lanes(state), unused_lanes mov unused_lanes, lane @@ -197,22 +186,15 @@ len_is_0: vpinsrd $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0 vpinsrd $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0 vpinsrd $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0 - movl 4*32(state, idx, 4), DWORD_tmp + movl _args_digest+4*32(state, idx, 4), DWORD_tmp vmovdqu %xmm0, _result_digest(job_rax) movl DWORD_tmp, _result_digest+1*16(job_rax) return: - - mov (%rsp), %rbx - mov 8*2(%rsp), %r10 #save old rsp - mov 8*3(%rsp), %rbp - mov 8*4(%rsp), %r12 - mov 8*5(%rsp), %r13 - mov 8*6(%rsp), %r14 - mov 8*7(%rsp), %r15 - mov %r10, %rsp - + pop %r12 + pop %rbx + FRAME_END ret return_null: diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index 05058134c443..dc66273e610d 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -24,6 +24,7 @@ */ #include <linux/linkage.h> +#include <asm/frame.h> #include "glue_helper-asm-avx.S" .file "twofish-avx-x86_64-asm_64.S" @@ -333,6 +334,7 @@ ENTRY(twofish_ecb_enc_8way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN movq %rsi, %r11; @@ -342,6 +344,7 @@ ENTRY(twofish_ecb_enc_8way) store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); + FRAME_END ret; ENDPROC(twofish_ecb_enc_8way) @@ -351,6 +354,7 @@ ENTRY(twofish_ecb_dec_8way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN movq %rsi, %r11; @@ -360,6 +364,7 @@ ENTRY(twofish_ecb_dec_8way) store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + FRAME_END ret; ENDPROC(twofish_ecb_dec_8way) @@ -369,6 +374,7 @@ ENTRY(twofish_cbc_dec_8way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN pushq %r12; @@ -383,6 +389,7 @@ ENTRY(twofish_cbc_dec_8way) popq %r12; + FRAME_END ret; ENDPROC(twofish_cbc_dec_8way) @@ -393,6 +400,7 @@ ENTRY(twofish_ctr_8way) * %rdx: src * %rcx: iv (little endian, 128bit) */ + FRAME_BEGIN pushq %r12; @@ -408,6 +416,7 @@ ENTRY(twofish_ctr_8way) popq %r12; + FRAME_END ret; ENDPROC(twofish_ctr_8way) @@ -418,6 +427,7 @@ ENTRY(twofish_xts_enc_8way) * %rdx: src * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ + FRAME_BEGIN movq %rsi, %r11; @@ -430,6 +440,7 @@ ENTRY(twofish_xts_enc_8way) /* dst <= regs xor IVs(in dst) */ store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); + FRAME_END ret; ENDPROC(twofish_xts_enc_8way) @@ -440,6 +451,7 @@ ENTRY(twofish_xts_dec_8way) * %rdx: src * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ + FRAME_BEGIN movq %rsi, %r11; @@ -452,5 +464,6 @@ ENTRY(twofish_xts_dec_8way) /* dst <= regs xor IVs(in dst) */ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + FRAME_END ret; ENDPROC(twofish_xts_dec_8way) diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 56d8a08ee479..2ebb5e9789f3 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -277,13 +277,9 @@ int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, u32 *flags = &tfm->crt_flags; int err; - /* key consists of keys of equal size concatenated, therefore - * the length must be even - */ - if (keylen % 2) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } + err = xts_check_key(tfm, key, keylen); + if (err) + return err; /* first half of xts-key is for crypt */ err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index bd55dedd7614..fe91c25092da 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -1,6 +1,10 @@ # # Makefile for the x86 low level entry code # + +OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y + obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o obj-y += common.o diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index c6ab2ebb5f4f..e79d93d44ecd 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -45,6 +45,8 @@ __visible void enter_from_user_mode(void) CT_WARN_ON(ct_state() != CONTEXT_USER); user_exit(); } +#else +static inline void enter_from_user_mode(void) {} #endif static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) @@ -85,17 +87,6 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; -#ifdef CONFIG_CONTEXT_TRACKING - /* - * If TIF_NOHZ is set, we are required to call user_exit() before - * doing anything that could touch RCU. - */ - if (work & _TIF_NOHZ) { - enter_from_user_mode(); - work &= ~_TIF_NOHZ; - } -#endif - #ifdef CONFIG_SECCOMP /* * Do seccomp first -- it should minimize exposure of other @@ -172,16 +163,6 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) BUG_ON(regs != task_pt_regs(current)); - /* - * If we stepped into a sysenter/syscall insn, it trapped in - * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. - * If user-mode had set TF itself, then it's still clear from - * do_debug() and we need to set it again to restore the user - * state. If we entered on the slow path, TF was already set. - */ - if (work & _TIF_SINGLESTEP) - regs->flags |= X86_EFLAGS_TF; - #ifdef CONFIG_SECCOMP /* * Call seccomp_phase2 before running the other hooks so that @@ -269,6 +250,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) /* Called with IRQs disabled. */ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) { + struct thread_info *ti = pt_regs_to_thread_info(regs); u32 cached_flags; if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) @@ -276,12 +258,22 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) lockdep_sys_exit(); - cached_flags = - READ_ONCE(pt_regs_to_thread_info(regs)->flags); + cached_flags = READ_ONCE(ti->flags); if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) exit_to_usermode_loop(regs, cached_flags); +#ifdef CONFIG_COMPAT + /* + * Compat syscalls set TS_COMPAT. Make sure we clear it before + * returning to user mode. We need to clear it *after* signal + * handling, because syscall restart has a fixup for compat + * syscalls. The fixup is exercised by the ptrace_syscall_32 + * selftest. + */ + ti->status &= ~TS_COMPAT; +#endif + user_enter(); } @@ -333,14 +325,6 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) syscall_slow_exit_work(regs, cached_flags); -#ifdef CONFIG_COMPAT - /* - * Compat syscalls set TS_COMPAT. Make sure we clear it before - * returning to user mode. - */ - ti->status &= ~TS_COMPAT; -#endif - local_irq_disable(); prepare_exit_to_usermode(regs); } @@ -351,6 +335,7 @@ __visible void do_syscall_64(struct pt_regs *regs) struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned long nr = regs->orig_ax; + enter_from_user_mode(); local_irq_enable(); if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) @@ -373,19 +358,12 @@ __visible void do_syscall_64(struct pt_regs *regs) #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) /* - * Does a 32-bit syscall. Called with IRQs on and does all entry and - * exit work and returns with IRQs off. This function is extremely hot - * in workloads that use it, and it's usually called from + * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does + * all entry and exit work and returns with IRQs off. This function is + * extremely hot in workloads that use it, and it's usually called from * do_fast_syscall_32, so forcibly inline it to improve performance. */ -#ifdef CONFIG_X86_32 -/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */ -__visible -#else -/* 64-bit kernels use do_syscall_32_irqs_off() instead. */ -static -#endif -__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) +static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) { struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned int nr = (unsigned int)regs->orig_ax; @@ -420,14 +398,13 @@ __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) syscall_return_slowpath(regs); } -#ifdef CONFIG_X86_64 -/* Handles INT80 on 64-bit kernels */ -__visible void do_syscall_32_irqs_off(struct pt_regs *regs) +/* Handles int $0x80 */ +__visible void do_int80_syscall_32(struct pt_regs *regs) { + enter_from_user_mode(); local_irq_enable(); do_syscall_32_irqs_on(regs); } -#endif /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ __visible long do_fast_syscall_32(struct pt_regs *regs) @@ -447,12 +424,11 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) */ regs->ip = landing_pad; - /* - * Fetch EBP from where the vDSO stashed it. - * - * WARNING: We are in CONTEXT_USER and RCU isn't paying attention! - */ + enter_from_user_mode(); + local_irq_enable(); + + /* Fetch EBP from where the vDSO stashed it. */ if ( #ifdef CONFIG_X86_64 /* @@ -470,9 +446,6 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) /* User code screwed up. */ local_irq_disable(); regs->ax = -EFAULT; -#ifdef CONFIG_CONTEXT_TRACKING - enter_from_user_mode(); -#endif prepare_exit_to_usermode(regs); return 0; /* Keep it simple: use IRET. */ } diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 4c5228352744..10868aa734dc 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -287,7 +287,58 @@ need_resched: END(resume_kernel) #endif - # SYSENTER call handler stub +GLOBAL(__begin_SYSENTER_singlestep_region) +/* + * All code from here through __end_SYSENTER_singlestep_region is subject + * to being single-stepped if a user program sets TF and executes SYSENTER. + * There is absolutely nothing that we can do to prevent this from happening + * (thanks Intel!). To keep our handling of this situation as simple as + * possible, we handle TF just like AC and NT, except that our #DB handler + * will ignore all of the single-step traps generated in this range. + */ + +#ifdef CONFIG_XEN +/* + * Xen doesn't set %esp to be precisely what the normal SYSENTER + * entry point expects, so fix it up before using the normal path. + */ +ENTRY(xen_sysenter_target) + addl $5*4, %esp /* remove xen-provided frame */ + jmp sysenter_past_esp +#endif + +/* + * 32-bit SYSENTER entry. + * + * 32-bit system calls through the vDSO's __kernel_vsyscall enter here + * if X86_FEATURE_SEP is available. This is the preferred system call + * entry on 32-bit systems. + * + * The SYSENTER instruction, in principle, should *only* occur in the + * vDSO. In practice, a small number of Android devices were shipped + * with a copy of Bionic that inlined a SYSENTER instruction. This + * never happened in any of Google's Bionic versions -- it only happened + * in a narrow range of Intel-provided versions. + * + * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs. + * IF and VM in RFLAGS are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old EIP (!!!), ESP, or EFLAGS. + * + * To avoid losing track of EFLAGS.VM (and thus potentially corrupting + * user and/or vm86 state), we explicitly disable the SYSENTER + * instruction in vm86 mode by reprogramming the MSRs. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + */ ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp), %esp sysenter_past_esp: @@ -301,6 +352,29 @@ sysenter_past_esp: SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ /* + * SYSENTER doesn't filter flags, so we need to clear NT, AC + * and TF ourselves. To save a few cycles, we can check whether + * either was set instead of doing an unconditional popfq. + * This needs to happen before enabling interrupts so that + * we don't get preempted with NT set. + * + * If TF is set, we will single-step all the way to here -- do_debug + * will ignore all the traps. (Yes, this is slow, but so is + * single-stepping in general. This allows us to avoid having + * a more complicated code to handle the case where a user program + * forces us to single-step through the SYSENTER entry code.) + * + * NB.: .Lsysenter_fix_flags is a label with the code under it moved + * out-of-line as an optimization: NT is unlikely to be set in the + * majority of the cases and instead of polluting the I$ unnecessarily, + * we're keeping that code behind a branch which will predict as + * not-taken and therefore its instructions won't be fetched. + */ + testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp) + jnz .Lsysenter_fix_flags +.Lsysenter_flags_fixed: + + /* * User mode is traced as though IRQs are on, and SYSENTER * turned them off. */ @@ -326,6 +400,15 @@ sysenter_past_esp: popl %eax /* pt_regs->ax */ /* + * Restore all flags except IF. (We restore IF separately because + * STI gives a one-instruction window in which we won't be interrupted, + * whereas POPF does not.) + */ + addl $PT_EFLAGS-PT_DS, %esp /* point esp at pt_regs->flags */ + btr $X86_EFLAGS_IF_BIT, (%esp) + popfl + + /* * Return back to the vDSO, which will pop ecx and edx. * Don't bother with DS and ES (they already contain __USER_DS). */ @@ -338,28 +421,63 @@ sysenter_past_esp: .popsection _ASM_EXTABLE(1b, 2b) PTGS_TO_GS_EX + +.Lsysenter_fix_flags: + pushl $X86_EFLAGS_FIXED + popfl + jmp .Lsysenter_flags_fixed +GLOBAL(__end_SYSENTER_singlestep_region) ENDPROC(entry_SYSENTER_32) - # system call handler stub +/* + * 32-bit legacy system call entry. + * + * 32-bit x86 Linux system calls traditionally used the INT $0x80 + * instruction. INT $0x80 lands here. + * + * This entry point can be used by any 32-bit perform system calls. + * Instances of INT $0x80 can be found inline in various programs and + * libraries. It is also used by the vDSO's __kernel_vsyscall + * fallback for hardware that doesn't support a faster entry method. + * Restarted 32-bit system calls also fall back to INT $0x80 + * regardless of what instruction was originally used to do the system + * call. (64-bit programs can use INT $0x80 as well, but they can + * only run on 64-bit kernels and therefore land in + * entry_INT80_compat.) + * + * This is considered a slow path. It is not used by most libc + * implementations on modern hardware except during process startup. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 + */ ENTRY(entry_INT80_32) ASM_CLAC pushl %eax /* pt_regs->orig_ax */ SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ /* - * User mode is traced as though IRQs are on. Unlike the 64-bit - * case, INT80 is a trap gate on 32-bit kernels, so interrupts - * are already on (unless user code is messing around with iopl). + * User mode is traced as though IRQs are on, and the interrupt gate + * turned them off. */ + TRACE_IRQS_OFF movl %esp, %eax - call do_syscall_32_irqs_on + call do_int80_syscall_32 .Lsyscall_32_done: restore_all: TRACE_IRQS_IRET restore_all_notrace: #ifdef CONFIG_X86_ESPFIX32 + ALTERNATIVE "jmp restore_nocheck", "", X86_BUG_ESPFIX + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS /* * Warning: PT_OLDSS(%esp) contains the wrong/random values if we @@ -386,19 +504,6 @@ ENTRY(iret_exc ) #ifdef CONFIG_X86_ESPFIX32 ldt_ss: -#ifdef CONFIG_PARAVIRT - /* - * The kernel can't run on a non-flat stack if paravirt mode - * is active. Rather than try to fixup the high bits of - * ESP, bypass this code entirely. This may break DOSemu - * and/or Wine support in a paravirt VM, although the option - * is still available to implement the setting of the high - * 16-bits in the INTERRUPT_RETURN paravirt-op. - */ - cmpl $0, pv_info+PARAVIRT_enabled - jne restore_nocheck -#endif - /* * Setup and switch to ESPFIX stack * @@ -631,14 +736,6 @@ ENTRY(spurious_interrupt_bug) END(spurious_interrupt_bug) #ifdef CONFIG_XEN -/* - * Xen doesn't set %esp to be precisely what the normal SYSENTER - * entry point expects, so fix it up before using the normal path. - */ -ENTRY(xen_sysenter_target) - addl $5*4, %esp /* remove xen-provided frame */ - jmp sysenter_past_esp - ENTRY(xen_hypervisor_callback) pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL @@ -938,51 +1035,48 @@ error_code: jmp ret_from_exception END(page_fault) -/* - * Debug traps and NMI can happen at the one SYSENTER instruction - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. - * - * We just load the right stack, and push the three (known) values - * by hand onto the new stack - while updating the return eip past - * the instruction that would have done it for sysenter. - */ -.macro FIX_STACK offset ok label - cmpw $__KERNEL_CS, 4(%esp) - jne \ok -\label: - movl TSS_sysenter_sp0 + \offset(%esp), %esp - pushfl - pushl $__KERNEL_CS - pushl $sysenter_past_esp -.endm - ENTRY(debug) + /* + * #DB can happen at the first instruction of + * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this + * happens, then we will be running on a very small stack. We + * need to detect this condition and switch to the thread + * stack before calling any C code at all. + * + * If you edit this code, keep in mind that NMIs can happen in here. + */ ASM_CLAC - cmpl $entry_SYSENTER_32, (%esp) - jne debug_stack_correct - FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn -debug_stack_correct: pushl $-1 # mark this as an int SAVE_ALL - TRACE_IRQS_OFF xorl %edx, %edx # error code 0 movl %esp, %eax # pt_regs pointer + + /* Are we currently on the SYSENTER stack? */ + PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Ldebug_from_sysenter_stack + + TRACE_IRQS_OFF + call do_debug + jmp ret_from_exception + +.Ldebug_from_sysenter_stack: + /* We're on the SYSENTER stack. Switch off. */ + movl %esp, %ebp + movl PER_CPU_VAR(cpu_current_top_of_stack), %esp + TRACE_IRQS_OFF call do_debug + movl %ebp, %esp jmp ret_from_exception END(debug) /* - * NMI is doubly nasty. It can happen _while_ we're handling - * a debug fault, and the debug fault hasn't yet been able to - * clear up the stack. So we first check whether we got an - * NMI on the sysenter entry path, but after that we need to - * check whether we got an NMI on the debug path where the debug - * fault happened on the sysenter path. + * NMI is doubly nasty. It can happen on the first instruction of + * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning + * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32 + * switched stacks. We handle both conditions by simply checking whether we + * interrupted kernel code running on the SYSENTER stack. */ ENTRY(nmi) ASM_CLAC @@ -993,41 +1087,32 @@ ENTRY(nmi) popl %eax je nmi_espfix_stack #endif - cmpl $entry_SYSENTER_32, (%esp) - je nmi_stack_fixup - pushl %eax - movl %esp, %eax - /* - * Do not access memory above the end of our stack page, - * it might not exist. - */ - andl $(THREAD_SIZE-1), %eax - cmpl $(THREAD_SIZE-20), %eax - popl %eax - jae nmi_stack_correct - cmpl $entry_SYSENTER_32, 12(%esp) - je nmi_debug_stack_check -nmi_stack_correct: - pushl %eax + + pushl %eax # pt_regs->orig_ax SAVE_ALL xorl %edx, %edx # zero error code movl %esp, %eax # pt_regs pointer + + /* Are we currently on the SYSENTER stack? */ + PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Lnmi_from_sysenter_stack + + /* Not on SYSENTER stack. */ call do_nmi jmp restore_all_notrace -nmi_stack_fixup: - FIX_STACK 12, nmi_stack_correct, 1 - jmp nmi_stack_correct - -nmi_debug_stack_check: - cmpw $__KERNEL_CS, 16(%esp) - jne nmi_stack_correct - cmpl $debug, (%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn, (%esp) - ja nmi_stack_correct - FIX_STACK 24, nmi_stack_correct, 1 - jmp nmi_stack_correct +.Lnmi_from_sysenter_stack: + /* + * We're on the SYSENTER stack. Switch off. No one (not even debug) + * is using the thread stack right now, so it's safe for us to use it. + */ + movl %esp, %ebp + movl PER_CPU_VAR(cpu_current_top_of_stack), %esp + call do_nmi + movl %ebp, %esp + jmp restore_all_notrace #ifdef CONFIG_X86_ESPFIX32 nmi_espfix_stack: diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 70eadb0ea5fa..858b555e274b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -103,6 +103,16 @@ ENDPROC(native_usergs_sysret64) /* * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. * + * This is the only entry point used for 64-bit system calls. The + * hardware interface is reasonably well designed and the register to + * argument mapping Linux uses fits well with the registers that are + * available when SYSCALL is used. + * + * SYSCALL instructions can be found inlined in libc implementations as + * well as some other programs and libraries. There are also a handful + * of SYSCALL instructions in the vDSO used, for example, as a + * clock_gettimeofday fallback. + * * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, * then loads new ss, cs, and rip from previously programmed MSRs. * rflags gets masked by a value from another MSR (so CLD and CLAC diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index ff1c6d61f332..847f2f0c31e5 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -19,12 +19,21 @@ .section .entry.text, "ax" /* - * 32-bit SYSENTER instruction entry. + * 32-bit SYSENTER entry. * - * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. - * IF and VM in rflags are cleared (IOW: interrupts are off). + * 32-bit system calls through the vDSO's __kernel_vsyscall enter here + * on 64-bit kernels running on Intel CPUs. + * + * The SYSENTER instruction, in principle, should *only* occur in the + * vDSO. In practice, a small number of Android devices were shipped + * with a copy of Bionic that inlined a SYSENTER instruction. This + * never happened in any of Google's Bionic versions -- it only happened + * in a narrow range of Intel-provided versions. + * + * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs. + * IF and VM in RFLAGS are cleared (IOW: interrupts are off). * SYSENTER does not save anything on the stack, - * and does not save old rip (!!!) and rflags. + * and does not save old RIP (!!!), RSP, or RFLAGS. * * Arguments: * eax system call number @@ -35,10 +44,6 @@ * edi arg5 * ebp user stack * 0(%ebp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. */ ENTRY(entry_SYSENTER_compat) /* Interrupts are off on entry. */ @@ -66,8 +71,6 @@ ENTRY(entry_SYSENTER_compat) */ pushfq /* pt_regs->flags (except IF = 0) */ orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ - ASM_CLAC /* Clear AC after saving FLAGS */ - pushq $__USER32_CS /* pt_regs->cs */ xorq %r8,%r8 pushq %r8 /* pt_regs->ip = 0 (placeholder) */ @@ -90,19 +93,25 @@ ENTRY(entry_SYSENTER_compat) cld /* - * Sysenter doesn't filter flags, so we need to clear NT + * SYSENTER doesn't filter flags, so we need to clear NT and AC * ourselves. To save a few cycles, we can check whether - * NT was set instead of doing an unconditional popfq. + * either was set instead of doing an unconditional popfq. * This needs to happen before enabling interrupts so that * we don't get preempted with NT set. * + * If TF is set, we will single-step all the way to here -- do_debug + * will ignore all the traps. (Yes, this is slow, but so is + * single-stepping in general. This allows us to avoid having + * a more complicated code to handle the case where a user program + * forces us to single-step through the SYSENTER entry code.) + * * NB.: .Lsysenter_fix_flags is a label with the code under it moved * out-of-line as an optimization: NT is unlikely to be set in the * majority of the cases and instead of polluting the I$ unnecessarily, * we're keeping that code behind a branch which will predict as * not-taken and therefore its instructions won't be fetched. */ - testl $X86_EFLAGS_NT, EFLAGS(%rsp) + testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: @@ -123,20 +132,42 @@ ENTRY(entry_SYSENTER_compat) pushq $X86_EFLAGS_FIXED popfq jmp .Lsysenter_flags_fixed +GLOBAL(__end_entry_SYSENTER_compat) ENDPROC(entry_SYSENTER_compat) /* - * 32-bit SYSCALL instruction entry. + * 32-bit SYSCALL entry. + * + * 32-bit system calls through the vDSO's __kernel_vsyscall enter here + * on 64-bit kernels running on AMD CPUs. + * + * The SYSCALL instruction, in principle, should *only* occur in the + * vDSO. In practice, it appears that this really is the case. + * As evidence: + * + * - The calling convention for SYSCALL has changed several times without + * anyone noticing. * - * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, - * then loads new ss, cs, and rip from previously programmed MSRs. - * rflags gets masked by a value from another MSR (so CLD and CLAC - * are not needed). SYSCALL does not save anything on the stack - * and does not change rsp. + * - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, anything + * user task that did SYSCALL without immediately reloading SS + * would randomly crash. * - * Note: rflags saving+masking-with-MSR happens only in Long mode + * - Most programmers do not directly target AMD CPUs, and the 32-bit + * SYSCALL instruction does not exist on Intel CPUs. Even on AMD + * CPUs, Linux disables the SYSCALL instruction on 32-bit kernels + * because the SYSCALL instruction in legacy/native 32-bit mode (as + * opposed to compat mode) is sufficiently poorly designed as to be + * essentially unusable. + * + * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves + * RFLAGS to R11, then loads new SS, CS, and RIP from previously + * programmed MSRs. RFLAGS gets masked by a value from another MSR + * (so CLD and CLAC are not needed). SYSCALL does not save anything on + * the stack and does not change RSP. + * + * Note: RFLAGS saving+masking-with-MSR happens only in Long mode * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). - * Don't get confused: rflags saving+masking depends on Long Mode Active bit + * Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). * @@ -236,7 +267,21 @@ sysret32_from_system_call: END(entry_SYSCALL_compat) /* - * Emulated IA32 system calls via int 0x80. + * 32-bit legacy system call entry. + * + * 32-bit x86 Linux system calls traditionally used the INT $0x80 + * instruction. INT $0x80 lands here. + * + * This entry point can be used by 32-bit and 64-bit programs to perform + * 32-bit system calls. Instances of INT $0x80 can be found inline in + * various programs and libraries. It is also used by the vDSO's + * __kernel_vsyscall fallback for hardware that doesn't support a faster + * entry method. Restarted 32-bit system calls also fall back to INT + * $0x80 regardless of what instruction was originally used to do the + * system call. + * + * This is considered a slow path. It is not used by most libc + * implementations on modern hardware except during process startup. * * Arguments: * eax system call number @@ -245,22 +290,14 @@ END(entry_SYSCALL_compat) * edx arg3 * esi arg4 * edi arg5 - * ebp arg6 (note: not saved in the stack frame, should not be touched) - * - * Notes: - * Uses the same stack frame as the x86-64 version. - * All registers except eax must be saved (but ptrace may violate that). - * Arguments are zero extended. For system calls that want sign extension and - * take long arguments a wrapper is needed. Most calls can just be called - * directly. - * Assumes it is only called from user space and entered with interrupts off. + * ebp arg6 */ - ENTRY(entry_INT80_compat) /* * Interrupts are off on entry. */ PARAVIRT_ADJUST_EXCEPTION_FRAME + ASM_CLAC /* Do this early to minimize exposure */ SWAPGS /* @@ -299,7 +336,7 @@ ENTRY(entry_INT80_compat) TRACE_IRQS_OFF movq %rsp, %rdi - call do_syscall_32_irqs_off + call do_int80_syscall_32 .Lsyscall_32_done: /* Go back to user mode. */ diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index cb713df81180..b30dd8154cc2 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -384,3 +384,5 @@ 375 i386 membarrier sys_membarrier 376 i386 mlock2 sys_mlock2 377 i386 copy_file_range sys_copy_file_range +378 i386 preadv2 sys_preadv2 +379 i386 pwritev2 sys_pwritev2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 2e5b565adacc..cac6d17ce5db 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -333,6 +333,8 @@ 324 common membarrier sys_membarrier 325 common mlock2 sys_mlock2 326 common copy_file_range sys_copy_file_range +327 64 preadv2 sys_preadv2 +328 64 pwritev2 sys_pwritev2 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index efb2b932b748..98df1fa8825c 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -8,11 +8,14 @@ #include <linux/linkage.h> #include "calling.h" #include <asm/asm.h> +#include <asm/frame.h> /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ .macro THUNK name, func, put_ret_addr_in_rdi=0 .globl \name + .type \name, @function \name: + FRAME_BEGIN /* this one pushes 9 elems, the next one would be %rIP */ pushq %rdi @@ -62,6 +65,7 @@ restore: popq %rdx popq %rsi popq %rdi + FRAME_END ret _ASM_NOKPROBE(restore) #endif diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index c854541d93ff..f9fb859c98b9 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -3,8 +3,9 @@ # KBUILD_CFLAGS += $(DISABLE_LTO) -KASAN_SANITIZE := n -UBSAN_SANITIZE := n +KASAN_SANITIZE := n +UBSAN_SANITIZE := n +OBJECT_FILES_NON_STANDARD := y VDSO64-$(CONFIG_X86_64) := y VDSOX32-$(CONFIG_X86_X32_ABI) := y @@ -16,6 +17,7 @@ vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o # files to link into kernel obj-y += vma.o +OBJECT_FILES_NON_STANDARD_vma.o := n # vDSO images to build vdso_img-$(VDSO64-y) += 64 diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index abe961c7c71c..63a03bb91497 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -140,7 +140,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, fprintf(outfile, "#include <asm/vdso.h>\n"); fprintf(outfile, "\n"); fprintf(outfile, - "static unsigned char raw_data[%lu] __page_aligned_data = {", + "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {", mapping_size); for (j = 0; j < stripped_len; j++) { if (j % 10 == 0) diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile new file mode 100644 index 000000000000..fdfea1511cc0 --- /dev/null +++ b/arch/x86/events/Makefile @@ -0,0 +1,13 @@ +obj-y += core.o + +obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o amd/uncore.o +obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o msr.o +ifdef CONFIG_AMD_IOMMU +obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o +endif +obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/knc.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/p4.o intel/p6.o intel/pt.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/rapl.o msr.o +obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o +obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/events/amd/core.c index 58610539b048..049ada8d4e9c 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/events/amd/core.c @@ -5,7 +5,7 @@ #include <linux/slab.h> #include <asm/apicdef.h> -#include "perf_event.h" +#include "../perf_event.h" static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/events/amd/ibs.c index 989d3c215d2b..51087c29b2c2 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -14,7 +14,7 @@ #include <asm/apic.h> -#include "perf_event.h" +#include "../perf_event.h" static u32 ibs_caps; @@ -670,7 +670,7 @@ static __init int perf_event_ibs_init(void) perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); - printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); + pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps); return 0; } @@ -774,14 +774,14 @@ static int setup_ibs_ctl(int ibs_eilvt_off) pci_read_config_dword(cpu_cfg, IBSCTL, &value); if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { pci_dev_put(cpu_cfg); - printk(KERN_DEBUG "Failed to setup IBS LVT offset, " - "IBSCTL = 0x%08x\n", value); + pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n", + value); return -EINVAL; } } while (1); if (!nodes) { - printk(KERN_DEBUG "No CPU node configured for IBS\n"); + pr_debug("No CPU node configured for IBS\n"); return -ENODEV; } @@ -810,7 +810,7 @@ static void force_ibs_eilvt_setup(void) preempt_enable(); if (offset == APIC_EILVT_NR_MAX) { - printk(KERN_DEBUG "No EILVT entry available\n"); + pr_debug("No EILVT entry available\n"); return; } diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/events/amd/iommu.c index 97242a9242bd..635e5eba0caf 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -16,8 +16,8 @@ #include <linux/cpumask.h> #include <linux/slab.h> -#include "perf_event.h" -#include "perf_event_amd_iommu.h" +#include "../perf_event.h" +#include "iommu.h" #define COUNTER_SHIFT 16 diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.h b/arch/x86/events/amd/iommu.h index 845d173278e3..845d173278e3 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_iommu.h +++ b/arch/x86/events/amd/iommu.h diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/events/amd/uncore.c index 49742746a6c9..3db9569e658c 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -323,6 +323,8 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) return 0; fail: + if (amd_uncore_nb) + *per_cpu_ptr(amd_uncore_nb, cpu) = NULL; kfree(uncore_nb); return -ENOMEM; } @@ -536,7 +538,7 @@ static int __init amd_uncore_init(void) if (ret) goto fail_nb; - printk(KERN_INFO "perf: AMD NB counters detected\n"); + pr_info("perf: AMD NB counters detected\n"); ret = 0; } @@ -550,7 +552,7 @@ static int __init amd_uncore_init(void) if (ret) goto fail_l2; - printk(KERN_INFO "perf: AMD L2I counters detected\n"); + pr_info("perf: AMD L2I counters detected\n"); ret = 0; } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/events/core.c index 1b443db2db50..9b6ad08aa51a 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/events/core.c @@ -254,15 +254,16 @@ static bool check_hw_exists(void) * We still allow the PMU driver to operate: */ if (bios_fail) { - printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n"); - printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail); + pr_cont("Broken BIOS detected, complain to your hardware vendor.\n"); + pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", + reg_fail, val_fail); } return true; msr_fail: - printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); - printk("%sFailed to access perfctr msr (MSR %x is %Lx)\n", + pr_cont("Broken PMU hardware detected, using software events only.\n"); + pr_info("%sFailed to access perfctr msr (MSR %x is %Lx)\n", boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR, reg, val_new); @@ -596,6 +597,19 @@ void x86_pmu_disable_all(void) } } +/* + * There may be PMI landing after enabled=0. The PMI hitting could be before or + * after disable_all. + * + * If PMI hits before disable_all, the PMU will be disabled in the NMI handler. + * It will not be re-enabled in the NMI handler again, because enabled=0. After + * handling the NMI, disable_all will be called, which will not change the + * state either. If PMI hits after disable_all, the PMU is already disabled + * before entering NMI handler. The NMI handler will not change the state + * either. + * + * So either situation is harmless. + */ static void x86_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2180,11 +2194,11 @@ static int backtrace_stack(void *data, char *name) return 0; } -static void backtrace_address(void *data, unsigned long addr, int reliable) +static int backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; - perf_callchain_store(entry, addr); + return perf_callchain_store(entry, addr); } static const struct stacktrace_ops backtrace_ops = { diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/events/intel/bts.c index 2cad71d1b14c..b99dc9258c0f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_bts.c +++ b/arch/x86/events/intel/bts.c @@ -26,7 +26,7 @@ #include <asm-generic/sizes.h> #include <asm/perf_event.h> -#include "perf_event.h" +#include "../perf_event.h" struct bts_ctx { struct perf_output_handle handle; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/events/intel/core.c index fed2ab1f1065..68fa55b4d42e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/events/intel/core.c @@ -18,7 +18,7 @@ #include <asm/hardirq.h> #include <asm/apic.h> -#include "perf_event.h" +#include "../perf_event.h" /* * Intel PerfMon, used on Core and later. @@ -1502,7 +1502,15 @@ static __initconst const u64 knl_hw_cache_extra_regs }; /* - * Use from PMIs where the LBRs are already disabled. + * Used from PMIs where the LBRs are already disabled. + * + * This function could be called consecutively. It is required to remain in + * disabled state if called consecutively. + * + * During consecutive calls, the same disable value will be written to related + * registers, so the PMU state remains unchanged. hw.state in + * intel_bts_disable_local will remain PERF_HES_STOPPED too in consecutive + * calls. */ static void __intel_pmu_disable_all(void) { @@ -1884,6 +1892,16 @@ again: if (__test_and_clear_bit(62, (unsigned long *)&status)) { handled++; x86_pmu.drain_pebs(regs); + /* + * There are cases where, even though, the PEBS ovfl bit is set + * in GLOBAL_OVF_STATUS, the PEBS events may also have their + * overflow bits set for their counters. We must clear them + * here because they have been processed as exact samples in + * the drain_pebs() routine. They must not be processed again + * in the for_each_bit_set() loop for regular samples below. + */ + status &= ~cpuc->pebs_enabled; + status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; } /* @@ -1929,7 +1947,10 @@ again: goto again; done: - __intel_pmu_enable_all(0, true); + /* Only restore PMU state when it's active. See x86_pmu_disable(). */ + if (cpuc->enabled) + __intel_pmu_enable_all(0, true); + /* * Only unmask the NMI after the overflow counters * have been reset. This avoids spurious NMIs on @@ -3396,6 +3417,7 @@ __init int intel_pmu_init(void) intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); + intel_pmu_pebs_data_source_nhm(); x86_add_quirk(intel_nehalem_quirk); pr_cont("Nehalem events, "); @@ -3459,6 +3481,7 @@ __init int intel_pmu_init(void) intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); + intel_pmu_pebs_data_source_nhm(); pr_cont("Westmere events, "); break; @@ -3581,7 +3604,7 @@ __init int intel_pmu_init(void) intel_pmu_lbr_init_hsw(); x86_pmu.event_constraints = intel_bdw_event_constraints; - x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; + x86_pmu.pebs_constraints = intel_bdw_pebs_event_constraints; x86_pmu.extra_regs = intel_snbep_extra_regs; x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; x86_pmu.pebs_prec_dist = true; diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/events/intel/cqm.c index a316ca96f1b6..93cb412a5579 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -7,7 +7,7 @@ #include <linux/perf_event.h> #include <linux/slab.h> #include <asm/cpu_device_id.h> -#include "perf_event.h" +#include "../perf_event.h" #define MSR_IA32_PQR_ASSOC 0x0c8f #define MSR_IA32_QM_CTR 0x0c8e @@ -1244,15 +1244,12 @@ static struct pmu intel_cqm_pmu = { static inline void cqm_pick_event_reader(int cpu) { - int phys_id = topology_physical_package_id(cpu); - int i; + int reader; - for_each_cpu(i, &cqm_cpumask) { - if (phys_id == topology_physical_package_id(i)) - return; /* already got reader for this socket */ - } - - cpumask_set_cpu(cpu, &cqm_cpumask); + /* First online cpu in package becomes the reader */ + reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu)); + if (reader >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cqm_cpumask); } static void intel_cqm_cpu_starting(unsigned int cpu) @@ -1270,24 +1267,17 @@ static void intel_cqm_cpu_starting(unsigned int cpu) static void intel_cqm_cpu_exit(unsigned int cpu) { - int phys_id = topology_physical_package_id(cpu); - int i; + int target; - /* - * Is @cpu a designated cqm reader? - */ + /* Is @cpu the current cqm reader for this package ? */ if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) return; - for_each_online_cpu(i) { - if (i == cpu) - continue; + /* Find another online reader in this package */ + target = cpumask_any_but(topology_core_cpumask(cpu), cpu); - if (phys_id == topology_physical_package_id(i)) { - cpumask_set_cpu(i, &cqm_cpumask); - break; - } - } + if (target < nr_cpu_ids) + cpumask_set_cpu(target, &cqm_cpumask); } static int intel_cqm_cpu_notifier(struct notifier_block *nb, diff --git a/arch/x86/kernel/cpu/perf_event_intel_cstate.c b/arch/x86/events/intel/cstate.c index 75a38b5a2e26..7946c4231169 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -89,7 +89,7 @@ #include <linux/slab.h> #include <linux/perf_event.h> #include <asm/cpu_device_id.h> -#include "perf_event.h" +#include "../perf_event.h" #define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ static ssize_t __cstate_##_var##_show(struct kobject *kobj, \ diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/events/intel/ds.c index 10602f0a438f..ce7211a07c0b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/events/intel/ds.c @@ -5,7 +5,7 @@ #include <asm/perf_event.h> #include <asm/insn.h> -#include "perf_event.h" +#include "../perf_event.h" /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 @@ -51,7 +51,8 @@ union intel_x86_pebs_dse { #define OP_LH (P(OP, LOAD) | P(LVL, HIT)) #define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) -static const u64 pebs_data_source[] = { +/* Version for Sandy Bridge and later */ +static u64 pebs_data_source[] = { P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ @@ -70,6 +71,14 @@ static const u64 pebs_data_source[] = { OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ }; +/* Patch up minor differences in the bits */ +void __init intel_pmu_pebs_data_source_nhm(void) +{ + pebs_data_source[0x05] = OP_LH | P(LVL, L3) | P(SNOOP, HIT); + pebs_data_source[0x06] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); + pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); +} + static u64 precise_store_data(u64 status) { union intel_x86_pebs_dse dse; @@ -269,7 +278,7 @@ static int alloc_pebs_buffer(int cpu) if (!x86_pmu.pebs) return 0; - buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node); + buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -286,7 +295,7 @@ static int alloc_pebs_buffer(int cpu) per_cpu(insn_buffer, cpu) = ibuffer; } - max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; + max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size; ds->pebs_buffer_base = (u64)(unsigned long)buffer; ds->pebs_index = ds->pebs_buffer_base; @@ -722,6 +731,30 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_bdw_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */ + /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */ + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), + EVENT_CONSTRAINT_END +}; + + struct event_constraint intel_skl_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ @@ -1319,19 +1352,28 @@ void __init intel_ds_init(void) x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); + x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; if (x86_pmu.pebs) { char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; int format = x86_pmu.intel_cap.pebs_format; switch (format) { case 0: - printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); + pr_cont("PEBS fmt0%c, ", pebs_type); x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); + /* + * Using >PAGE_SIZE buffers makes the WRMSR to + * PERF_GLOBAL_CTRL in intel_pmu_enable_all() + * mysteriously hang on Core2. + * + * As a workaround, we don't do this. + */ + x86_pmu.pebs_buffer_size = PAGE_SIZE; x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; break; case 1: - printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); + pr_cont("PEBS fmt1%c, ", pebs_type); x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; break; @@ -1351,7 +1393,7 @@ void __init intel_ds_init(void) break; default: - printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); + pr_cont("no PEBS fmt%d%c, ", format, pebs_type); x86_pmu.pebs = 0; } } diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/events/intel/knc.c index 5b0c232d1ee6..548d5f774b07 100644 --- a/arch/x86/kernel/cpu/perf_event_knc.c +++ b/arch/x86/events/intel/knc.c @@ -5,7 +5,7 @@ #include <asm/hardirq.h> -#include "perf_event.h" +#include "../perf_event.h" static const u64 knc_perfmon_event_map[] = { @@ -263,7 +263,9 @@ again: goto again; done: - knc_pmu_enable_all(0); + /* Only restore PMU state when it's active. See x86_pmu_disable(). */ + if (cpuc->enabled) + knc_pmu_enable_all(0); return handled; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/events/intel/lbr.c index 653f88d25987..69dd11887dd1 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -5,7 +5,7 @@ #include <asm/msr.h> #include <asm/insn.h> -#include "perf_event.h" +#include "../perf_event.h" enum { LBR_FORMAT_32 = 0x00, diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/events/intel/p4.c index f2e56783af3d..0a5ede187d9c 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/events/intel/p4.c @@ -13,7 +13,7 @@ #include <asm/hardirq.h> #include <asm/apic.h> -#include "perf_event.h" +#include "../perf_event.h" #define P4_CNTR_LIMIT 3 /* diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/events/intel/p6.c index 7c1a0c07b607..1f5c47ab4c65 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/events/intel/p6.c @@ -1,7 +1,7 @@ #include <linux/perf_event.h> #include <linux/types.h> -#include "perf_event.h" +#include "../perf_event.h" /* * Not sure about some of these diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/events/intel/pt.c index c0bbd1033b7c..6af7cf71d6b2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/events/intel/pt.c @@ -29,8 +29,8 @@ #include <asm/io.h> #include <asm/intel_pt.h> -#include "perf_event.h" -#include "intel_pt.h" +#include "../perf_event.h" +#include "pt.h" static DEFINE_PER_CPU(struct pt, pt_ctx); diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/events/intel/pt.h index 336878a5d205..336878a5d205 100644 --- a/arch/x86/kernel/cpu/intel_pt.h +++ b/arch/x86/events/intel/pt.h diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/events/intel/rapl.c index 24a351ad628d..b834a3f55a01 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -44,11 +44,14 @@ * the duration of the measurement. Tools may use a function such as * ldexp(raw_count, -32); */ + +#define pr_fmt(fmt) "RAPL PMU: " fmt + #include <linux/module.h> #include <linux/slab.h> #include <linux/perf_event.h> #include <asm/cpu_device_id.h> -#include "perf_event.h" +#include "../perf_event.h" /* * RAPL energy status counters @@ -107,7 +110,7 @@ static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ static struct kobj_attribute format_attr_##_var = \ __ATTR(_name, 0444, __rapl_##_var##_show, NULL) -#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ +#define RAPL_CNTR_WIDTH 32 #define RAPL_EVENT_ATTR_STR(_name, v, str) \ static struct perf_pmu_events_attr event_attr_##v = { \ @@ -117,23 +120,33 @@ static struct perf_pmu_events_attr event_attr_##v = { \ }; struct rapl_pmu { - spinlock_t lock; - int n_active; /* number of active events */ - struct list_head active_list; - struct pmu *pmu; /* pointer to rapl_pmu_class */ - ktime_t timer_interval; /* in ktime_t unit */ - struct hrtimer hrtimer; + raw_spinlock_t lock; + int n_active; + int cpu; + struct list_head active_list; + struct pmu *pmu; + ktime_t timer_interval; + struct hrtimer hrtimer; }; -static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */ -static struct pmu rapl_pmu_class; +struct rapl_pmus { + struct pmu pmu; + unsigned int maxpkg; + struct rapl_pmu *pmus[]; +}; + + /* 1/2^hw_unit Joule */ +static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; +static struct rapl_pmus *rapl_pmus; static cpumask_t rapl_cpu_mask; -static int rapl_cntr_mask; +static unsigned int rapl_cntr_mask; +static u64 rapl_timer_ms; -static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); -static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); +static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) +{ + return rapl_pmus->pmus[topology_logical_package_id(cpu)]; +} -static struct x86_pmu_quirk *rapl_quirks; static inline u64 rapl_read_counter(struct perf_event *event) { u64 raw; @@ -141,19 +154,10 @@ static inline u64 rapl_read_counter(struct perf_event *event) return raw; } -#define rapl_add_quirk(func_) \ -do { \ - static struct x86_pmu_quirk __quirk __initdata = { \ - .func = func_, \ - }; \ - __quirk.next = rapl_quirks; \ - rapl_quirks = &__quirk; \ -} while (0) - static inline u64 rapl_scale(u64 v, int cfg) { if (cfg > NR_RAPL_DOMAINS) { - pr_warn("invalid domain %d, failed to scale data\n", cfg); + pr_warn("Invalid domain %d, failed to scale data\n", cfg); return v; } /* @@ -206,27 +210,21 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu) HRTIMER_MODE_REL_PINNED); } -static void rapl_stop_hrtimer(struct rapl_pmu *pmu) -{ - hrtimer_cancel(&pmu->hrtimer); -} - static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) { - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); struct perf_event *event; unsigned long flags; if (!pmu->n_active) return HRTIMER_NORESTART; - spin_lock_irqsave(&pmu->lock, flags); + raw_spin_lock_irqsave(&pmu->lock, flags); - list_for_each_entry(event, &pmu->active_list, active_entry) { + list_for_each_entry(event, &pmu->active_list, active_entry) rapl_event_update(event); - } - spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_unlock_irqrestore(&pmu->lock, flags); hrtimer_forward_now(hrtimer, pmu->timer_interval); @@ -260,28 +258,28 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu, static void rapl_pmu_event_start(struct perf_event *event, int mode) { - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + struct rapl_pmu *pmu = event->pmu_private; unsigned long flags; - spin_lock_irqsave(&pmu->lock, flags); + raw_spin_lock_irqsave(&pmu->lock, flags); __rapl_pmu_event_start(pmu, event); - spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_unlock_irqrestore(&pmu->lock, flags); } static void rapl_pmu_event_stop(struct perf_event *event, int mode) { - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + struct rapl_pmu *pmu = event->pmu_private; struct hw_perf_event *hwc = &event->hw; unsigned long flags; - spin_lock_irqsave(&pmu->lock, flags); + raw_spin_lock_irqsave(&pmu->lock, flags); /* mark event as deactivated and stopped */ if (!(hwc->state & PERF_HES_STOPPED)) { WARN_ON_ONCE(pmu->n_active <= 0); pmu->n_active--; if (pmu->n_active == 0) - rapl_stop_hrtimer(pmu); + hrtimer_cancel(&pmu->hrtimer); list_del(&event->active_entry); @@ -299,23 +297,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) hwc->state |= PERF_HES_UPTODATE; } - spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_unlock_irqrestore(&pmu->lock, flags); } static int rapl_pmu_event_add(struct perf_event *event, int mode) { - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + struct rapl_pmu *pmu = event->pmu_private; struct hw_perf_event *hwc = &event->hw; unsigned long flags; - spin_lock_irqsave(&pmu->lock, flags); + raw_spin_lock_irqsave(&pmu->lock, flags); hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; if (mode & PERF_EF_START) __rapl_pmu_event_start(pmu, event); - spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_unlock_irqrestore(&pmu->lock, flags); return 0; } @@ -329,15 +327,19 @@ static int rapl_pmu_event_init(struct perf_event *event) { u64 cfg = event->attr.config & RAPL_EVENT_MASK; int bit, msr, ret = 0; + struct rapl_pmu *pmu; /* only look at RAPL events */ - if (event->attr.type != rapl_pmu_class.type) + if (event->attr.type != rapl_pmus->pmu.type) return -ENOENT; /* check only supported bits are set */ if (event->attr.config & ~RAPL_EVENT_MASK) return -EINVAL; + if (event->cpu < 0) + return -EINVAL; + /* * check event is known (determines counter) */ @@ -376,6 +378,9 @@ static int rapl_pmu_event_init(struct perf_event *event) return -EINVAL; /* must be done before validate_group */ + pmu = cpu_to_rapl_pmu(event->cpu); + event->cpu = pmu->cpu; + event->pmu_private = pmu; event->hw.event_base = msr; event->hw.config = cfg; event->hw.idx = bit; @@ -506,139 +511,62 @@ const struct attribute_group *rapl_attr_groups[] = { NULL, }; -static struct pmu rapl_pmu_class = { - .attr_groups = rapl_attr_groups, - .task_ctx_nr = perf_invalid_context, /* system-wide only */ - .event_init = rapl_pmu_event_init, - .add = rapl_pmu_event_add, /* must have */ - .del = rapl_pmu_event_del, /* must have */ - .start = rapl_pmu_event_start, - .stop = rapl_pmu_event_stop, - .read = rapl_pmu_event_read, -}; - static void rapl_cpu_exit(int cpu) { - struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); - int i, phys_id = topology_physical_package_id(cpu); - int target = -1; + struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); + int target; - /* find a new cpu on same package */ - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (phys_id == topology_physical_package_id(i)) { - target = i; - break; - } - } - /* - * clear cpu from cpumask - * if was set in cpumask and still some cpu on package, - * then move to new cpu - */ - if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0) - cpumask_set_cpu(target, &rapl_cpu_mask); + /* Check if exiting cpu is used for collecting rapl events */ + if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask)) + return; - WARN_ON(cpumask_empty(&rapl_cpu_mask)); - /* - * migrate events and context to new cpu - */ - if (target >= 0) - perf_pmu_migrate_context(pmu->pmu, cpu, target); + pmu->cpu = -1; + /* Find a new cpu to collect rapl events */ + target = cpumask_any_but(topology_core_cpumask(cpu), cpu); - /* cancel overflow polling timer for CPU */ - rapl_stop_hrtimer(pmu); + /* Migrate rapl events to the new target */ + if (target < nr_cpu_ids) { + cpumask_set_cpu(target, &rapl_cpu_mask); + pmu->cpu = target; + perf_pmu_migrate_context(pmu->pmu, cpu, target); + } } static void rapl_cpu_init(int cpu) { - int i, phys_id = topology_physical_package_id(cpu); + struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); + int target; - /* check if phys_is is already covered */ - for_each_cpu(i, &rapl_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) - return; - } - /* was not found, so add it */ - cpumask_set_cpu(cpu, &rapl_cpu_mask); -} - -static __init void rapl_hsw_server_quirk(void) -{ /* - * DRAM domain on HSW server has fixed energy unit which can be - * different than the unit from power unit MSR. - * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 - * of 2. Datasheet, September 2014, Reference Number: 330784-001 " + * Check if there is an online cpu in the package which collects rapl + * events already. */ - rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; + target = cpumask_any_and(&rapl_cpu_mask, topology_core_cpumask(cpu)); + if (target < nr_cpu_ids) + return; + + cpumask_set_cpu(cpu, &rapl_cpu_mask); + pmu->cpu = cpu; } static int rapl_cpu_prepare(int cpu) { - struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); - int phys_id = topology_physical_package_id(cpu); - u64 ms; + struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); if (pmu) return 0; - if (phys_id < 0) - return -1; - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); if (!pmu) - return -1; - spin_lock_init(&pmu->lock); + return -ENOMEM; + raw_spin_lock_init(&pmu->lock); INIT_LIST_HEAD(&pmu->active_list); - - pmu->pmu = &rapl_pmu_class; - - /* - * use reference of 200W for scaling the timeout - * to avoid missing counter overflows. - * 200W = 200 Joules/sec - * divide interval by 2 to avoid lockstep (2 * 100) - * if hw unit is 32, then we use 2 ms 1/200/2 - */ - if (rapl_hw_unit[0] < 32) - ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1)); - else - ms = 2; - - pmu->timer_interval = ms_to_ktime(ms); - + pmu->pmu = &rapl_pmus->pmu; + pmu->timer_interval = ms_to_ktime(rapl_timer_ms); + pmu->cpu = -1; rapl_hrtimer_init(pmu); - - /* set RAPL pmu for this cpu for now */ - per_cpu(rapl_pmu, cpu) = pmu; - per_cpu(rapl_pmu_to_free, cpu) = NULL; - - return 0; -} - -static void rapl_cpu_kfree(int cpu) -{ - struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu); - - kfree(pmu); - - per_cpu(rapl_pmu_to_free, cpu) = NULL; -} - -static int rapl_cpu_dying(int cpu) -{ - struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); - - if (!pmu) - return 0; - - per_cpu(rapl_pmu, cpu) = NULL; - - per_cpu(rapl_pmu_to_free, cpu) = pmu; - + rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; return 0; } @@ -651,28 +579,20 @@ static int rapl_cpu_notifier(struct notifier_block *self, case CPU_UP_PREPARE: rapl_cpu_prepare(cpu); break; - case CPU_STARTING: - rapl_cpu_init(cpu); - break; - case CPU_UP_CANCELED: - case CPU_DYING: - rapl_cpu_dying(cpu); - break; + + case CPU_DOWN_FAILED: case CPU_ONLINE: - case CPU_DEAD: - rapl_cpu_kfree(cpu); + rapl_cpu_init(cpu); break; + case CPU_DOWN_PREPARE: rapl_cpu_exit(cpu); break; - default: - break; } - return NOTIFY_OK; } -static int rapl_check_hw_unit(void) +static int rapl_check_hw_unit(bool apply_quirk) { u64 msr_rapl_power_unit_bits; int i; @@ -683,28 +603,107 @@ static int rapl_check_hw_unit(void) for (i = 0; i < NR_RAPL_DOMAINS; i++) rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + /* + * DRAM domain on HSW server and KNL has fixed energy unit which can be + * different than the unit from power unit MSR. See + * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 + * of 2. Datasheet, September 2014, Reference Number: 330784-001 " + */ + if (apply_quirk) + rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; + + /* + * Calculate the timer rate: + * Use reference of 200W for scaling the timeout to avoid counter + * overflows. 200W = 200 Joules/sec + * Divide interval by 2 to avoid lockstep (2 * 100) + * if hw unit is 32, then we use 2 ms 1/200/2 + */ + rapl_timer_ms = 2; + if (rapl_hw_unit[0] < 32) { + rapl_timer_ms = (1000 / (2 * 100)); + rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1)); + } + return 0; +} + +static void __init rapl_advertise(void) +{ + int i; + + pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", + hweight32(rapl_cntr_mask), rapl_timer_ms); + + for (i = 0; i < NR_RAPL_DOMAINS; i++) { + if (rapl_cntr_mask & (1 << i)) { + pr_info("hw unit of domain %s 2^-%d Joules\n", + rapl_domain_names[i], rapl_hw_unit[i]); + } + } +} + +static int __init rapl_prepare_cpus(void) +{ + unsigned int cpu, pkg; + int ret; + + for_each_online_cpu(cpu) { + pkg = topology_logical_package_id(cpu); + if (rapl_pmus->pmus[pkg]) + continue; + + ret = rapl_cpu_prepare(cpu); + if (ret) + return ret; + rapl_cpu_init(cpu); + } + return 0; +} + +static void __init cleanup_rapl_pmus(void) +{ + int i; + + for (i = 0; i < rapl_pmus->maxpkg; i++) + kfree(rapl_pmus->pmus + i); + kfree(rapl_pmus); +} + +static int __init init_rapl_pmus(void) +{ + int maxpkg = topology_max_packages(); + size_t size; + + size = sizeof(*rapl_pmus) + maxpkg * sizeof(struct rapl_pmu *); + rapl_pmus = kzalloc(size, GFP_KERNEL); + if (!rapl_pmus) + return -ENOMEM; + + rapl_pmus->maxpkg = maxpkg; + rapl_pmus->pmu.attr_groups = rapl_attr_groups; + rapl_pmus->pmu.task_ctx_nr = perf_invalid_context; + rapl_pmus->pmu.event_init = rapl_pmu_event_init; + rapl_pmus->pmu.add = rapl_pmu_event_add; + rapl_pmus->pmu.del = rapl_pmu_event_del; + rapl_pmus->pmu.start = rapl_pmu_event_start; + rapl_pmus->pmu.stop = rapl_pmu_event_stop; + rapl_pmus->pmu.read = rapl_pmu_event_read; return 0; } -static const struct x86_cpu_id rapl_cpu_match[] = { +static const struct x86_cpu_id rapl_cpu_match[] __initconst = { [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, [1] = {}, }; static int __init rapl_pmu_init(void) { - struct rapl_pmu *pmu; - int cpu, ret; - struct x86_pmu_quirk *quirk; - int i; + bool apply_quirk = false; + int ret; - /* - * check for Intel processor family 6 - */ if (!x86_match_cpu(rapl_cpu_match)) - return 0; + return -ENODEV; - /* check supported CPU */ switch (boot_cpu_data.x86_model) { case 42: /* Sandy Bridge */ case 58: /* Ivy Bridge */ @@ -712,7 +711,7 @@ static int __init rapl_pmu_init(void) rapl_pmu_events_group.attrs = rapl_events_cln_attr; break; case 63: /* Haswell-Server */ - rapl_add_quirk(rapl_hsw_server_quirk); + apply_quirk = true; rapl_cntr_mask = RAPL_IDX_SRV; rapl_pmu_events_group.attrs = rapl_events_srv_attr; break; @@ -728,56 +727,41 @@ static int __init rapl_pmu_init(void) rapl_pmu_events_group.attrs = rapl_events_srv_attr; break; case 87: /* Knights Landing */ - rapl_add_quirk(rapl_hsw_server_quirk); + apply_quirk = true; rapl_cntr_mask = RAPL_IDX_KNL; rapl_pmu_events_group.attrs = rapl_events_knl_attr; - + break; default: - /* unsupported */ - return 0; + return -ENODEV; } - ret = rapl_check_hw_unit(); + + ret = rapl_check_hw_unit(apply_quirk); if (ret) return ret; - /* run cpu model quirks */ - for (quirk = rapl_quirks; quirk; quirk = quirk->next) - quirk->func(); - cpu_notifier_register_begin(); + ret = init_rapl_pmus(); + if (ret) + return ret; - for_each_online_cpu(cpu) { - ret = rapl_cpu_prepare(cpu); - if (ret) - goto out; - rapl_cpu_init(cpu); - } + cpu_notifier_register_begin(); - __perf_cpu_notifier(rapl_cpu_notifier); + ret = rapl_prepare_cpus(); + if (ret) + goto out; - ret = perf_pmu_register(&rapl_pmu_class, "power", -1); - if (WARN_ON(ret)) { - pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); - cpu_notifier_register_done(); - return -1; - } + ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); + if (ret) + goto out; - pmu = __this_cpu_read(rapl_pmu); + __perf_cpu_notifier(rapl_cpu_notifier); + cpu_notifier_register_done(); + rapl_advertise(); + return 0; - pr_info("RAPL PMU detected," - " API unit is 2^-32 Joules," - " %d fixed counters" - " %llu ms ovfl timer\n", - hweight32(rapl_cntr_mask), - ktime_to_ms(pmu->timer_interval)); - for (i = 0; i < NR_RAPL_DOMAINS; i++) { - if (rapl_cntr_mask & (1 << i)) { - pr_info("hw unit of domain %s 2^-%d Joules\n", - rapl_domain_names[i], rapl_hw_unit[i]); - } - } out: + pr_warn("Initialization failed (%d), disabled\n", ret); + cleanup_rapl_pmus(); cpu_notifier_register_done(); - - return 0; + return ret; } device_initcall(rapl_pmu_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/events/intel/uncore.c index 3bf41d413775..7012d18bb293 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1,4 +1,4 @@ -#include "perf_event_intel_uncore.h" +#include "uncore.h" static struct intel_uncore_type *empty_uncore[] = { NULL, }; struct intel_uncore_type **uncore_msr_uncores = empty_uncore; @@ -9,9 +9,9 @@ struct pci_driver *uncore_pci_driver; /* pci bus to socket mapping */ DEFINE_RAW_SPINLOCK(pci2phy_map_lock); struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); -struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; +struct pci_extra_dev *uncore_extra_pci_dev; +static int max_packages; -static DEFINE_RAW_SPINLOCK(uncore_box_lock); /* mask of cpus that collect uncore events */ static cpumask_t uncore_cpu_mask; @@ -21,7 +21,7 @@ static struct event_constraint uncore_constraint_fixed = struct event_constraint uncore_constraint_empty = EVENT_CONSTRAINT(0, 0, 0); -int uncore_pcibus_to_physid(struct pci_bus *bus) +static int uncore_pcibus_to_physid(struct pci_bus *bus) { struct pci2phy_map *map; int phys_id = -1; @@ -38,6 +38,16 @@ int uncore_pcibus_to_physid(struct pci_bus *bus) return phys_id; } +static void uncore_free_pcibus_map(void) +{ + struct pci2phy_map *map, *tmp; + + list_for_each_entry_safe(map, tmp, &pci2phy_map_head, list) { + list_del(&map->list); + kfree(map); + } +} + struct pci2phy_map *__find_pci2phy_map(int segment) { struct pci2phy_map *map, *alloc = NULL; @@ -82,43 +92,9 @@ ssize_t uncore_event_show(struct kobject *kobj, return sprintf(buf, "%s", event->config); } -struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) -{ - return container_of(event->pmu, struct intel_uncore_pmu, pmu); -} - struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) { - struct intel_uncore_box *box; - - box = *per_cpu_ptr(pmu->box, cpu); - if (box) - return box; - - raw_spin_lock(&uncore_box_lock); - /* Recheck in lock to handle races. */ - if (*per_cpu_ptr(pmu->box, cpu)) - goto out; - list_for_each_entry(box, &pmu->box_list, list) { - if (box->phys_id == topology_physical_package_id(cpu)) { - atomic_inc(&box->refcnt); - *per_cpu_ptr(pmu->box, cpu) = box; - break; - } - } -out: - raw_spin_unlock(&uncore_box_lock); - - return *per_cpu_ptr(pmu->box, cpu); -} - -struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) -{ - /* - * perf core schedules event on the basis of cpu, uncore events are - * collected by one of the cpus inside a physical package. - */ - return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); + return pmu->boxes[topology_logical_package_id(cpu)]; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -207,7 +183,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx) return config; } -static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx) +static void uncore_assign_hw_event(struct intel_uncore_box *box, + struct perf_event *event, int idx) { struct hw_perf_event *hwc = &event->hw; @@ -302,24 +279,25 @@ static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) box->hrtimer.function = uncore_pmu_hrtimer; } -static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node) +static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, + int node) { + int i, size, numshared = type->num_shared_regs ; struct intel_uncore_box *box; - int i, size; - size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); + size = sizeof(*box) + numshared * sizeof(struct intel_uncore_extra_reg); box = kzalloc_node(size, GFP_KERNEL, node); if (!box) return NULL; - for (i = 0; i < type->num_shared_regs; i++) + for (i = 0; i < numshared; i++) raw_spin_lock_init(&box->shared_regs[i].lock); uncore_pmu_init_hrtimer(box); - atomic_set(&box->refcnt, 1); box->cpu = -1; - box->phys_id = -1; + box->pci_phys_id = -1; + box->pkgid = -1; /* set default hrtimer timeout */ box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL; @@ -341,7 +319,8 @@ static bool is_uncore_event(struct perf_event *event) } static int -uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp) +uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, + bool dogrp) { struct perf_event *event; int n, max_count; @@ -402,7 +381,8 @@ uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *eve return &type->unconstrainted; } -static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event) +static void uncore_put_event_constraint(struct intel_uncore_box *box, + struct perf_event *event) { if (box->pmu->type->ops->put_constraint) box->pmu->type->ops->put_constraint(box, event); @@ -582,7 +562,7 @@ static void uncore_pmu_event_del(struct perf_event *event, int flags) if (event == box->event_list[i]) { uncore_put_event_constraint(box, event); - while (++i < box->n_events) + for (++i; i < box->n_events; i++) box->event_list[i - 1] = box->event_list[i]; --box->n_events; @@ -676,6 +656,7 @@ static int uncore_pmu_event_init(struct perf_event *event) if (!box || box->cpu < 0) return -EINVAL; event->cpu = box->cpu; + event->pmu_private = box; event->hw.idx = -1; event->hw.last_tag = ~0ULL; @@ -760,64 +741,110 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) } ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); + if (!ret) + pmu->registered = true; return ret; } +static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu) +{ + if (!pmu->registered) + return; + perf_pmu_unregister(&pmu->pmu); + pmu->registered = false; +} + +static void __init __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) +{ + struct intel_uncore_pmu *pmu = type->pmus; + struct intel_uncore_box *box; + int i, pkg; + + if (pmu) { + pkg = topology_physical_package_id(cpu); + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (box) + uncore_box_exit(box); + } + } +} + +static void __init uncore_exit_boxes(void *dummy) +{ + struct intel_uncore_type **types; + + for (types = uncore_msr_uncores; *types; types++) + __uncore_exit_boxes(*types++, smp_processor_id()); +} + +static void uncore_free_boxes(struct intel_uncore_pmu *pmu) +{ + int pkg; + + for (pkg = 0; pkg < max_packages; pkg++) + kfree(pmu->boxes[pkg]); + kfree(pmu->boxes); +} + static void __init uncore_type_exit(struct intel_uncore_type *type) { + struct intel_uncore_pmu *pmu = type->pmus; int i; - for (i = 0; i < type->num_boxes; i++) - free_percpu(type->pmus[i].box); - kfree(type->pmus); - type->pmus = NULL; + if (pmu) { + for (i = 0; i < type->num_boxes; i++, pmu++) { + uncore_pmu_unregister(pmu); + uncore_free_boxes(pmu); + } + kfree(type->pmus); + type->pmus = NULL; + } kfree(type->events_group); type->events_group = NULL; } static void __init uncore_types_exit(struct intel_uncore_type **types) { - int i; - for (i = 0; types[i]; i++) - uncore_type_exit(types[i]); + for (; *types; types++) + uncore_type_exit(*types); } -static int __init uncore_type_init(struct intel_uncore_type *type) +static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) { struct intel_uncore_pmu *pmus; struct attribute_group *attr_group; struct attribute **attrs; + size_t size; int i, j; pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL); if (!pmus) return -ENOMEM; - type->pmus = pmus; + size = max_packages * sizeof(struct intel_uncore_box *); + for (i = 0; i < type->num_boxes; i++) { + pmus[i].func_id = setid ? i : -1; + pmus[i].pmu_idx = i; + pmus[i].type = type; + pmus[i].boxes = kzalloc(size, GFP_KERNEL); + if (!pmus[i].boxes) + return -ENOMEM; + } + + type->pmus = pmus; type->unconstrainted = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, 0, type->num_counters, 0, 0); - for (i = 0; i < type->num_boxes; i++) { - pmus[i].func_id = -1; - pmus[i].pmu_idx = i; - pmus[i].type = type; - INIT_LIST_HEAD(&pmus[i].box_list); - pmus[i].box = alloc_percpu(struct intel_uncore_box *); - if (!pmus[i].box) - goto fail; - } - if (type->event_descs) { - i = 0; - while (type->event_descs[i].attr.attr.name) - i++; + for (i = 0; type->event_descs[i].attr.attr.name; i++); attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) + sizeof(*attr_group), GFP_KERNEL); if (!attr_group) - goto fail; + return -ENOMEM; attrs = (struct attribute **)(attr_group + 1); attr_group->name = "events"; @@ -831,25 +858,19 @@ static int __init uncore_type_init(struct intel_uncore_type *type) type->pmu_group = &uncore_pmu_attr_group; return 0; -fail: - uncore_type_exit(type); - return -ENOMEM; } -static int __init uncore_types_init(struct intel_uncore_type **types) +static int __init +uncore_types_init(struct intel_uncore_type **types, bool setid) { - int i, ret; + int ret; - for (i = 0; types[i]; i++) { - ret = uncore_type_init(types[i]); + for (; *types; types++) { + ret = uncore_type_init(*types, setid); if (ret) - goto fail; + return ret; } return 0; -fail: - while (--i >= 0) - uncore_type_exit(types[i]); - return ret; } /* @@ -857,28 +878,28 @@ fail: */ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { + struct intel_uncore_type *type; struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; - struct intel_uncore_type *type; - int phys_id; - bool first_box = false; + int phys_id, pkg, ret; phys_id = uncore_pcibus_to_physid(pdev->bus); if (phys_id < 0) return -ENODEV; + pkg = topology_phys_to_logical_pkg(phys_id); + if (WARN_ON_ONCE(pkg < 0)) + return -EINVAL; + if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { int idx = UNCORE_PCI_DEV_IDX(id->driver_data); - uncore_extra_pci_dev[phys_id][idx] = pdev; + + uncore_extra_pci_dev[pkg].dev[idx] = pdev; pci_set_drvdata(pdev, NULL); return 0; } type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; - box = uncore_alloc_box(type, NUMA_NO_NODE); - if (!box) - return -ENOMEM; - /* * for performance monitoring unit with multiple boxes, * each box has a different function id. @@ -890,44 +911,60 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id * some device types. Hence PCI device idx would be 0 for all devices. * So increment pmu pointer to point to an unused array element. */ - if (boot_cpu_data.x86_model == 87) + if (boot_cpu_data.x86_model == 87) { while (pmu->func_id >= 0) pmu++; + } + + if (WARN_ON_ONCE(pmu->boxes[pkg] != NULL)) + return -EINVAL; + + box = uncore_alloc_box(type, NUMA_NO_NODE); + if (!box) + return -ENOMEM; + if (pmu->func_id < 0) pmu->func_id = pdev->devfn; else WARN_ON_ONCE(pmu->func_id != pdev->devfn); - box->phys_id = phys_id; + atomic_inc(&box->refcnt); + box->pci_phys_id = phys_id; + box->pkgid = pkg; box->pci_dev = pdev; box->pmu = pmu; uncore_box_init(box); pci_set_drvdata(pdev, box); - raw_spin_lock(&uncore_box_lock); - if (list_empty(&pmu->box_list)) - first_box = true; - list_add_tail(&box->list, &pmu->box_list); - raw_spin_unlock(&uncore_box_lock); + pmu->boxes[pkg] = box; + if (atomic_inc_return(&pmu->activeboxes) > 1) + return 0; - if (first_box) - uncore_pmu_register(pmu); - return 0; + /* First active box registers the pmu */ + ret = uncore_pmu_register(pmu); + if (ret) { + pci_set_drvdata(pdev, NULL); + pmu->boxes[pkg] = NULL; + uncore_box_exit(box); + kfree(box); + } + return ret; } static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box = pci_get_drvdata(pdev); struct intel_uncore_pmu *pmu; - int i, cpu, phys_id; - bool last_box = false; + int i, phys_id, pkg; phys_id = uncore_pcibus_to_physid(pdev->bus); + pkg = topology_phys_to_logical_pkg(phys_id); + box = pci_get_drvdata(pdev); if (!box) { for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { - if (uncore_extra_pci_dev[phys_id][i] == pdev) { - uncore_extra_pci_dev[phys_id][i] = NULL; + if (uncore_extra_pci_dev[pkg].dev[i] == pdev) { + uncore_extra_pci_dev[pkg].dev[i] = NULL; break; } } @@ -936,33 +973,20 @@ static void uncore_pci_remove(struct pci_dev *pdev) } pmu = box->pmu; - if (WARN_ON_ONCE(phys_id != box->phys_id)) + if (WARN_ON_ONCE(phys_id != box->pci_phys_id)) return; pci_set_drvdata(pdev, NULL); - - raw_spin_lock(&uncore_box_lock); - list_del(&box->list); - if (list_empty(&pmu->box_list)) - last_box = true; - raw_spin_unlock(&uncore_box_lock); - - for_each_possible_cpu(cpu) { - if (*per_cpu_ptr(pmu->box, cpu) == box) { - *per_cpu_ptr(pmu->box, cpu) = NULL; - atomic_dec(&box->refcnt); - } - } - - WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); + pmu->boxes[pkg] = NULL; + if (atomic_dec_return(&pmu->activeboxes) == 0) + uncore_pmu_unregister(pmu); + uncore_box_exit(box); kfree(box); - - if (last_box) - perf_pmu_unregister(&pmu->pmu); } static int __init uncore_pci_init(void) { + size_t size; int ret; switch (boot_cpu_data.x86_model) { @@ -999,25 +1023,40 @@ static int __init uncore_pci_init(void) ret = skl_uncore_pci_init(); break; default: - return 0; + return -ENODEV; } if (ret) return ret; - ret = uncore_types_init(uncore_pci_uncores); + size = max_packages * sizeof(struct pci_extra_dev); + uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); + if (!uncore_extra_pci_dev) { + ret = -ENOMEM; + goto err; + } + + ret = uncore_types_init(uncore_pci_uncores, false); if (ret) - return ret; + goto errtype; uncore_pci_driver->probe = uncore_pci_probe; uncore_pci_driver->remove = uncore_pci_remove; ret = pci_register_driver(uncore_pci_driver); - if (ret == 0) - pcidrv_registered = true; - else - uncore_types_exit(uncore_pci_uncores); + if (ret) + goto errtype; + + pcidrv_registered = true; + return 0; +errtype: + uncore_types_exit(uncore_pci_uncores); + kfree(uncore_extra_pci_dev); + uncore_extra_pci_dev = NULL; + uncore_free_pcibus_map(); +err: + uncore_pci_uncores = empty_uncore; return ret; } @@ -1027,173 +1066,139 @@ static void __init uncore_pci_exit(void) pcidrv_registered = false; pci_unregister_driver(uncore_pci_driver); uncore_types_exit(uncore_pci_uncores); - } -} - -/* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */ -static LIST_HEAD(boxes_to_free); - -static void uncore_kfree_boxes(void) -{ - struct intel_uncore_box *box; - - while (!list_empty(&boxes_to_free)) { - box = list_entry(boxes_to_free.next, - struct intel_uncore_box, list); - list_del(&box->list); - kfree(box); + kfree(uncore_extra_pci_dev); + uncore_free_pcibus_map(); } } static void uncore_cpu_dying(int cpu) { - struct intel_uncore_type *type; + struct intel_uncore_type *type, **types = uncore_msr_uncores; struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; - int i, j; - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - box = *per_cpu_ptr(pmu->box, cpu); - *per_cpu_ptr(pmu->box, cpu) = NULL; - if (box && atomic_dec_and_test(&box->refcnt)) - list_add(&box->list, &boxes_to_free); + int i, pkg; + + pkg = topology_logical_package_id(cpu); + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (box && atomic_dec_return(&box->refcnt) == 0) + uncore_box_exit(box); } } } -static int uncore_cpu_starting(int cpu) +static void uncore_cpu_starting(int cpu, bool init) { - struct intel_uncore_type *type; + struct intel_uncore_type *type, **types = uncore_msr_uncores; struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box, *exist; - int i, j, k, phys_id; - - phys_id = topology_physical_package_id(cpu); - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - box = *per_cpu_ptr(pmu->box, cpu); - /* called by uncore_cpu_init? */ - if (box && box->phys_id >= 0) { - uncore_box_init(box); - continue; - } + struct intel_uncore_box *box; + int i, pkg, ncpus = 1; - for_each_online_cpu(k) { - exist = *per_cpu_ptr(pmu->box, k); - if (exist && exist->phys_id == phys_id) { - atomic_inc(&exist->refcnt); - *per_cpu_ptr(pmu->box, cpu) = exist; - if (box) { - list_add(&box->list, - &boxes_to_free); - box = NULL; - } - break; - } - } + if (init) { + /* + * On init we get the number of online cpus in the package + * and set refcount for all of them. + */ + ncpus = cpumask_weight(topology_core_cpumask(cpu)); + } - if (box) { - box->phys_id = phys_id; + pkg = topology_logical_package_id(cpu); + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (!box) + continue; + /* The first cpu on a package activates the box */ + if (atomic_add_return(ncpus, &box->refcnt) == ncpus) uncore_box_init(box); - } } } - return 0; } -static int uncore_cpu_prepare(int cpu, int phys_id) +static int uncore_cpu_prepare(int cpu) { - struct intel_uncore_type *type; + struct intel_uncore_type *type, **types = uncore_msr_uncores; struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; - int i, j; - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - if (pmu->func_id < 0) - pmu->func_id = j; - + int i, pkg; + + pkg = topology_logical_package_id(cpu); + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + if (pmu->boxes[pkg]) + continue; + /* First cpu of a package allocates the box */ box = uncore_alloc_box(type, cpu_to_node(cpu)); if (!box) return -ENOMEM; - box->pmu = pmu; - box->phys_id = phys_id; - *per_cpu_ptr(pmu->box, cpu) = box; + box->pkgid = pkg; + pmu->boxes[pkg] = box; } } return 0; } -static void -uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu) +static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu, + int new_cpu) { - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu; + struct intel_uncore_pmu *pmu = type->pmus; struct intel_uncore_box *box; - int i, j; + int i, pkg; - for (i = 0; uncores[i]; i++) { - type = uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - if (old_cpu < 0) - box = uncore_pmu_to_box(pmu, new_cpu); - else - box = uncore_pmu_to_box(pmu, old_cpu); - if (!box) - continue; - - if (old_cpu < 0) { - WARN_ON_ONCE(box->cpu != -1); - box->cpu = new_cpu; - continue; - } + pkg = topology_logical_package_id(old_cpu < 0 ? new_cpu : old_cpu); + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (!box) + continue; - WARN_ON_ONCE(box->cpu != old_cpu); - if (new_cpu >= 0) { - uncore_pmu_cancel_hrtimer(box); - perf_pmu_migrate_context(&pmu->pmu, - old_cpu, new_cpu); - box->cpu = new_cpu; - } else { - box->cpu = -1; - } + if (old_cpu < 0) { + WARN_ON_ONCE(box->cpu != -1); + box->cpu = new_cpu; + continue; } + + WARN_ON_ONCE(box->cpu != old_cpu); + box->cpu = -1; + if (new_cpu < 0) + continue; + + uncore_pmu_cancel_hrtimer(box); + perf_pmu_migrate_context(&pmu->pmu, old_cpu, new_cpu); + box->cpu = new_cpu; } } +static void uncore_change_context(struct intel_uncore_type **uncores, + int old_cpu, int new_cpu) +{ + for (; *uncores; uncores++) + uncore_change_type_ctx(*uncores, old_cpu, new_cpu); +} + static void uncore_event_exit_cpu(int cpu) { - int i, phys_id, target; + int target; - /* if exiting cpu is used for collecting uncore events */ + /* Check if exiting cpu is used for collecting uncore events */ if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) return; - /* find a new cpu to collect uncore events */ - phys_id = topology_physical_package_id(cpu); - target = -1; - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (phys_id == topology_physical_package_id(i)) { - target = i; - break; - } - } + /* Find a new cpu to collect uncore events */ + target = cpumask_any_but(topology_core_cpumask(cpu), cpu); - /* migrate uncore events to the new cpu */ - if (target >= 0) + /* Migrate uncore events to the new target */ + if (target < nr_cpu_ids) cpumask_set_cpu(target, &uncore_cpu_mask); + else + target = -1; uncore_change_context(uncore_msr_uncores, cpu, target); uncore_change_context(uncore_pci_uncores, cpu, target); @@ -1201,13 +1206,15 @@ static void uncore_event_exit_cpu(int cpu) static void uncore_event_init_cpu(int cpu) { - int i, phys_id; + int target; - phys_id = topology_physical_package_id(cpu); - for_each_cpu(i, &uncore_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) - return; - } + /* + * Check if there is an online cpu in the package + * which collects uncore events already. + */ + target = cpumask_any_and(&uncore_cpu_mask, topology_core_cpumask(cpu)); + if (target < nr_cpu_ids) + return; cpumask_set_cpu(cpu, &uncore_cpu_mask); @@ -1220,39 +1227,25 @@ static int uncore_cpu_notifier(struct notifier_block *self, { unsigned int cpu = (long)hcpu; - /* allocate/free data structure for uncore box */ switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - uncore_cpu_prepare(cpu, -1); - break; + return notifier_from_errno(uncore_cpu_prepare(cpu)); + case CPU_STARTING: - uncore_cpu_starting(cpu); + uncore_cpu_starting(cpu, false); + case CPU_DOWN_FAILED: + uncore_event_init_cpu(cpu); break; + case CPU_UP_CANCELED: case CPU_DYING: uncore_cpu_dying(cpu); break; - case CPU_ONLINE: - case CPU_DEAD: - uncore_kfree_boxes(); - break; - default: - break; - } - /* select the cpu that collects uncore events */ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_FAILED: - case CPU_STARTING: - uncore_event_init_cpu(cpu); - break; case CPU_DOWN_PREPARE: uncore_event_exit_cpu(cpu); break; - default: - break; } - return NOTIFY_OK; } @@ -1265,9 +1258,29 @@ static struct notifier_block uncore_cpu_nb = { .priority = CPU_PRI_PERF + 1, }; -static void __init uncore_cpu_setup(void *dummy) +static int __init type_pmu_register(struct intel_uncore_type *type) { - uncore_cpu_starting(smp_processor_id()); + int i, ret; + + for (i = 0; i < type->num_boxes; i++) { + ret = uncore_pmu_register(&type->pmus[i]); + if (ret) + return ret; + } + return 0; +} + +static int __init uncore_msr_pmus_register(void) +{ + struct intel_uncore_type **types = uncore_msr_uncores; + int ret; + + for (; *types; types++) { + ret = type_pmu_register(*types); + if (ret) + return ret; + } + return 0; } static int __init uncore_cpu_init(void) @@ -1311,71 +1324,61 @@ static int __init uncore_cpu_init(void) knl_uncore_cpu_init(); break; default: - return 0; + return -ENODEV; } - ret = uncore_types_init(uncore_msr_uncores); + ret = uncore_types_init(uncore_msr_uncores, true); if (ret) - return ret; + goto err; + ret = uncore_msr_pmus_register(); + if (ret) + goto err; return 0; +err: + uncore_types_exit(uncore_msr_uncores); + uncore_msr_uncores = empty_uncore; + return ret; } -static int __init uncore_pmus_register(void) +static void __init uncore_cpu_setup(void *dummy) { - struct intel_uncore_pmu *pmu; - struct intel_uncore_type *type; - int i, j; - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - uncore_pmu_register(pmu); - } - } - - return 0; + uncore_cpu_starting(smp_processor_id(), true); } -static void __init uncore_cpumask_init(void) -{ - int cpu; - - /* - * ony invoke once from msr or pci init code - */ - if (!cpumask_empty(&uncore_cpu_mask)) - return; +/* Lazy to avoid allocation of a few bytes for the normal case */ +static __initdata DECLARE_BITMAP(packages, MAX_LOCAL_APIC); - cpu_notifier_register_begin(); +static int __init uncore_cpumask_init(bool msr) +{ + unsigned int cpu; for_each_online_cpu(cpu) { - int i, phys_id = topology_physical_package_id(cpu); + unsigned int pkg = topology_logical_package_id(cpu); + int ret; - for_each_cpu(i, &uncore_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) { - phys_id = -1; - break; - } - } - if (phys_id < 0) + if (test_and_set_bit(pkg, packages)) continue; - - uncore_cpu_prepare(cpu, phys_id); + /* + * The first online cpu of each package allocates and takes + * the refcounts for all other online cpus in that package. + * If msrs are not enabled no allocation is required. + */ + if (msr) { + ret = uncore_cpu_prepare(cpu); + if (ret) + return ret; + } uncore_event_init_cpu(cpu); + smp_call_function_single(cpu, uncore_cpu_setup, NULL, 1); } - on_each_cpu(uncore_cpu_setup, NULL, 1); - __register_cpu_notifier(&uncore_cpu_nb); - - cpu_notifier_register_done(); + return 0; } - static int __init intel_uncore_init(void) { - int ret; + int pret, cret, ret; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return -ENODEV; @@ -1383,19 +1386,27 @@ static int __init intel_uncore_init(void) if (cpu_has_hypervisor) return -ENODEV; - ret = uncore_pci_init(); - if (ret) - goto fail; - ret = uncore_cpu_init(); - if (ret) { - uncore_pci_exit(); - goto fail; - } - uncore_cpumask_init(); + max_packages = topology_max_packages(); + + pret = uncore_pci_init(); + cret = uncore_cpu_init(); - uncore_pmus_register(); + if (cret && pret) + return -ENODEV; + + cpu_notifier_register_begin(); + ret = uncore_cpumask_init(!cret); + if (ret) + goto err; + cpu_notifier_register_done(); return 0; -fail: + +err: + /* Undo box->init_box() */ + on_each_cpu_mask(&uncore_cpu_mask, uncore_exit_boxes, NULL, 1); + uncore_types_exit(uncore_msr_uncores); + uncore_pci_exit(); + cpu_notifier_register_done(); return ret; } device_initcall(intel_uncore_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/events/intel/uncore.h index a7086b862156..79766b9a3580 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -1,8 +1,10 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/pci.h> +#include <asm/apicdef.h> + #include <linux/perf_event.h> -#include "perf_event.h" +#include "../perf_event.h" #define UNCORE_PMU_NAME_LEN 32 #define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC) @@ -19,11 +21,12 @@ #define UNCORE_EXTRA_PCI_DEV 0xff #define UNCORE_EXTRA_PCI_DEV_MAX 3 -/* support up to 8 sockets */ -#define UNCORE_SOCKET_MAX 8 - #define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) +struct pci_extra_dev { + struct pci_dev *dev[UNCORE_EXTRA_PCI_DEV_MAX]; +}; + struct intel_uncore_ops; struct intel_uncore_pmu; struct intel_uncore_box; @@ -61,6 +64,7 @@ struct intel_uncore_type { struct intel_uncore_ops { void (*init_box)(struct intel_uncore_box *); + void (*exit_box)(struct intel_uncore_box *); void (*disable_box)(struct intel_uncore_box *); void (*enable_box)(struct intel_uncore_box *); void (*disable_event)(struct intel_uncore_box *, struct perf_event *); @@ -73,13 +77,14 @@ struct intel_uncore_ops { }; struct intel_uncore_pmu { - struct pmu pmu; - char name[UNCORE_PMU_NAME_LEN]; - int pmu_idx; - int func_id; - struct intel_uncore_type *type; - struct intel_uncore_box ** __percpu box; - struct list_head box_list; + struct pmu pmu; + char name[UNCORE_PMU_NAME_LEN]; + int pmu_idx; + int func_id; + bool registered; + atomic_t activeboxes; + struct intel_uncore_type *type; + struct intel_uncore_box **boxes; }; struct intel_uncore_extra_reg { @@ -89,7 +94,8 @@ struct intel_uncore_extra_reg { }; struct intel_uncore_box { - int phys_id; + int pci_phys_id; + int pkgid; int n_active; /* number of active events */ int n_events; int cpu; /* cpu to collect events */ @@ -123,7 +129,6 @@ struct pci2phy_map { int pbus_to_physid[256]; }; -int uncore_pcibus_to_physid(struct pci_bus *bus); struct pci2phy_map *__find_pci2phy_map(int segment); ssize_t uncore_event_show(struct kobject *kobj, @@ -305,14 +310,30 @@ static inline void uncore_box_init(struct intel_uncore_box *box) } } +static inline void uncore_box_exit(struct intel_uncore_box *box) +{ + if (test_and_clear_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { + if (box->pmu->type->ops->exit_box) + box->pmu->type->ops->exit_box(box); + } +} + static inline bool uncore_box_is_fake(struct intel_uncore_box *box) { - return (box->phys_id < 0); + return (box->pkgid < 0); +} + +static inline struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct intel_uncore_pmu, pmu); +} + +static inline struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +{ + return event->pmu_private; } -struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event); struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu); -struct intel_uncore_box *uncore_event_to_box(struct perf_event *event); u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event); void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); @@ -328,7 +349,7 @@ extern struct intel_uncore_type **uncore_pci_uncores; extern struct pci_driver *uncore_pci_driver; extern raw_spinlock_t pci2phy_map_lock; extern struct list_head pci2phy_map_head; -extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; +extern struct pci_extra_dev *uncore_extra_pci_dev; extern struct event_constraint uncore_constraint_empty; /* perf_event_intel_uncore_snb.c */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c index 2749965afed0..cda569332005 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c +++ b/arch/x86/events/intel/uncore_nhmex.c @@ -1,5 +1,5 @@ /* Nehalem-EX/Westmere-EX uncore support */ -#include "perf_event_intel_uncore.h" +#include "uncore.h" /* NHM-EX event control */ #define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff @@ -201,6 +201,11 @@ static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box) wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL); } +static void nhmex_uncore_msr_exit_box(struct intel_uncore_box *box) +{ + wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, 0); +} + static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box) { unsigned msr = uncore_msr_box_ctl(box); @@ -250,6 +255,7 @@ static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct p #define NHMEX_UNCORE_OPS_COMMON_INIT() \ .init_box = nhmex_uncore_msr_init_box, \ + .exit_box = nhmex_uncore_msr_exit_box, \ .disable_box = nhmex_uncore_msr_disable_box, \ .enable_box = nhmex_uncore_msr_enable_box, \ .disable_event = nhmex_uncore_msr_disable_event, \ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 2bd030ddd0db..96531d2b843f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -1,5 +1,5 @@ /* Nehalem/SandBridge/Haswell uncore support */ -#include "perf_event_intel_uncore.h" +#include "uncore.h" /* Uncore IMC PCI IDs */ #define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 @@ -95,6 +95,12 @@ static void snb_uncore_msr_init_box(struct intel_uncore_box *box) } } +static void snb_uncore_msr_exit_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, 0); +} + static struct uncore_event_desc snb_uncore_events[] = { INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), { /* end: all zeroes */ }, @@ -116,6 +122,7 @@ static struct attribute_group snb_uncore_format_group = { static struct intel_uncore_ops snb_uncore_msr_ops = { .init_box = snb_uncore_msr_init_box, + .exit_box = snb_uncore_msr_exit_box, .disable_event = snb_uncore_msr_disable_event, .enable_event = snb_uncore_msr_enable_event, .read_counter = uncore_msr_read_counter, @@ -231,6 +238,11 @@ static void snb_uncore_imc_init_box(struct intel_uncore_box *box) box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; } +static void snb_uncore_imc_exit_box(struct intel_uncore_box *box) +{ + iounmap(box->io_addr); +} + static void snb_uncore_imc_enable_box(struct intel_uncore_box *box) {} @@ -301,6 +313,7 @@ static int snb_uncore_imc_event_init(struct perf_event *event) return -EINVAL; event->cpu = box->cpu; + event->pmu_private = box; event->hw.idx = -1; event->hw.last_tag = ~0ULL; @@ -458,6 +471,7 @@ static struct pmu snb_uncore_imc_pmu = { static struct intel_uncore_ops snb_uncore_imc_ops = { .init_box = snb_uncore_imc_init_box, + .exit_box = snb_uncore_imc_exit_box, .enable_box = snb_uncore_imc_enable_box, .disable_box = snb_uncore_imc_disable_box, .disable_event = snb_uncore_imc_disable_event, diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 33acb884ccf1..93f6bd9bf761 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1,6 +1,5 @@ /* SandyBridge-EP/IvyTown uncore support */ -#include "perf_event_intel_uncore.h" - +#include "uncore.h" /* SNB-EP Box level control */ #define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) @@ -987,7 +986,9 @@ static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_eve if (reg1->idx != EXTRA_REG_NONE) { int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER; - struct pci_dev *filter_pdev = uncore_extra_pci_dev[box->phys_id][idx]; + int pkg = topology_phys_to_logical_pkg(box->pci_phys_id); + struct pci_dev *filter_pdev = uncore_extra_pci_dev[pkg].dev[idx]; + if (filter_pdev) { pci_write_config_dword(filter_pdev, reg1->reg, (u32)reg1->config); @@ -2521,14 +2522,16 @@ static struct intel_uncore_type *hswep_msr_uncores[] = { void hswep_uncore_cpu_init(void) { + int pkg = topology_phys_to_logical_pkg(0); + if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; /* Detect 6-8 core systems with only two SBOXes */ - if (uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3]) { + if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) { u32 capid4; - pci_read_config_dword(uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3], + pci_read_config_dword(uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3], 0x94, &capid4); if (((capid4 >> 6) & 0x3) == 0) hswep_uncore_sbox.num_boxes = 2; @@ -2875,11 +2878,13 @@ static struct intel_uncore_type bdx_uncore_sbox = { .format_group = &hswep_uncore_sbox_format_group, }; +#define BDX_MSR_UNCORE_SBOX 3 + static struct intel_uncore_type *bdx_msr_uncores[] = { &bdx_uncore_ubox, &bdx_uncore_cbox, - &bdx_uncore_sbox, &hswep_uncore_pcu, + &bdx_uncore_sbox, NULL, }; @@ -2888,6 +2893,10 @@ void bdx_uncore_cpu_init(void) if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; uncore_msr_uncores = bdx_msr_uncores; + + /* BDX-DE doesn't have SBOX */ + if (boot_cpu_data.x86_model == 86) + uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; } static struct intel_uncore_type bdx_uncore_ha = { diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/events/msr.c index ec863b9a9f78..ec863b9a9f78 100644 --- a/arch/x86/kernel/cpu/perf_event_msr.c +++ b/arch/x86/events/msr.c diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/events/perf_event.h index 7bb61e32fb29..68155cafa8a1 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -586,6 +586,7 @@ struct x86_pmu { pebs_broken :1, pebs_prec_dist :1; int pebs_record_size; + int pebs_buffer_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); @@ -860,6 +861,8 @@ extern struct event_constraint intel_ivb_pebs_event_constraints[]; extern struct event_constraint intel_hsw_pebs_event_constraints[]; +extern struct event_constraint intel_bdw_pebs_event_constraints[]; + extern struct event_constraint intel_skl_pebs_event_constraints[]; struct event_constraint *intel_pebs_constraints(struct perf_event *event); @@ -904,6 +907,8 @@ void intel_pmu_lbr_init_skl(void); void intel_pmu_lbr_init_knl(void); +void intel_pmu_pebs_data_source_nhm(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 3c56ef1ae068..5e828da2e18f 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -27,15 +27,23 @@ struct amd_l3_cache { }; struct threshold_block { - unsigned int block; - unsigned int bank; - unsigned int cpu; - u32 address; - u16 interrupt_enable; - bool interrupt_capable; - u16 threshold_limit; - struct kobject kobj; - struct list_head miscj; + unsigned int block; /* Number within bank */ + unsigned int bank; /* MCA bank the block belongs to */ + unsigned int cpu; /* CPU which controls MCA bank */ + u32 address; /* MSR address for the block */ + u16 interrupt_enable; /* Enable/Disable APIC interrupt */ + bool interrupt_capable; /* Bank can generate an interrupt. */ + + u16 threshold_limit; /* + * Value upon which threshold + * interrupt is generated. + */ + + struct kobject kobj; /* sysfs object */ + struct list_head miscj; /* + * List of threshold blocks + * within a bank. + */ }; struct threshold_bank { diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 189679aba703..f5063b6659eb 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -44,19 +44,22 @@ /* Exception table entry */ #ifdef __ASSEMBLY__ -# define _ASM_EXTABLE(from,to) \ +# define _ASM_EXTABLE_HANDLE(from, to, handler) \ .pushsection "__ex_table","a" ; \ - .balign 8 ; \ + .balign 4 ; \ .long (from) - . ; \ .long (to) - . ; \ + .long (handler) - . ; \ .popsection -# define _ASM_EXTABLE_EX(from,to) \ - .pushsection "__ex_table","a" ; \ - .balign 8 ; \ - .long (from) - . ; \ - .long (to) - . + 0x7ffffff0 ; \ - .popsection +# define _ASM_EXTABLE(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) + +# define _ASM_EXTABLE_FAULT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) + +# define _ASM_EXTABLE_EX(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ @@ -89,19 +92,24 @@ .endm #else -# define _ASM_EXTABLE(from,to) \ +# define _EXPAND_EXTABLE_HANDLE(x) #x +# define _ASM_EXTABLE_HANDLE(from, to, handler) \ " .pushsection \"__ex_table\",\"a\"\n" \ - " .balign 8\n" \ + " .balign 4\n" \ " .long (" #from ") - .\n" \ " .long (" #to ") - .\n" \ + " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \ " .popsection\n" -# define _ASM_EXTABLE_EX(from,to) \ - " .pushsection \"__ex_table\",\"a\"\n" \ - " .balign 8\n" \ - " .long (" #from ") - .\n" \ - " .long (" #to ") - . + 0x7ffffff0\n" \ - " .popsection\n" +# define _ASM_EXTABLE(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) + +# define _ASM_EXTABLE_FAULT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) + +# define _ASM_EXTABLE_EX(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) + /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index a584e1c50918..bfb28caf97b1 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -6,18 +6,17 @@ /* * Force strict CPU ordering. - * And yes, this is required on UP too when we're talking + * And yes, this might be required on UP too when we're talking * to devices. */ #ifdef CONFIG_X86_32 -/* - * Some non-Intel clones support out of order store. wmb() ceases to be a - * nop for these. - */ -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#define mb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "mfence", \ + X86_FEATURE_XMM2) ::: "memory", "cc") +#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "lfence", \ + X86_FEATURE_XMM2) ::: "memory", "cc") +#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "sfence", \ + X86_FEATURE_XMM2) ::: "memory", "cc") #else #define mb() asm volatile("mfence":::"memory") #define rmb() asm volatile("lfence":::"memory") diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index e63aa38e85fb..61518cf79437 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -91,16 +91,10 @@ void clflush_cache_range(void *addr, unsigned int size); #define mmio_flush_range(addr, size) clflush_cache_range(addr, size) -#ifdef CONFIG_DEBUG_RODATA -void mark_rodata_ro(void); extern const int rodata_test_data; extern int kernel_set_to_readonly; void set_kernel_text_rw(void); void set_kernel_text_ro(void); -#else -static inline void set_kernel_text_rw(void) { } -static inline void set_kernel_text_ro(void) { } -#endif #ifdef CONFIG_DEBUG_RODATA_TEST int rodata_test(void); diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index f50de6951738..532f85e6651f 100644 --- a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -112,8 +112,7 @@ static inline __sum16 csum_fold(__wsum sum) } static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, - unsigned short len, - unsigned short proto, + __u32 len, __u8 proto, __wsum sum) { asm("addl %1, %0 ;\n" @@ -131,8 +130,7 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, * returns a 16-bit checksum, already complemented */ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, - unsigned short len, - unsigned short proto, + __u32 len, __u8 proto, __wsum sum) { return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); @@ -151,8 +149,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len) #define _HAVE_ARCH_IPV6_CSUM static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, - __u32 len, unsigned short proto, - __wsum sum) + __u32 len, __u8 proto, __wsum sum) { asm("addl 0(%1), %0 ;\n" "adcl 4(%1), %0 ;\n" diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index cd00e1774491..c020ee75dce7 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -84,8 +84,8 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) * 32bit unfolded. */ static inline __wsum -csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len, - unsigned short proto, __wsum sum) +csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, + __u8 proto, __wsum sum) { asm(" addl %1, %0\n" " adcl %2, %0\n" @@ -110,8 +110,8 @@ csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len, * complemented and ready to be filled in. */ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, - unsigned short len, - unsigned short proto, __wsum sum) + __u32 len, __u8 proto, + __wsum sum) { return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); } @@ -177,7 +177,7 @@ struct in6_addr; #define _HAVE_ARCH_IPV6_CSUM 1 extern __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, - __u32 len, unsigned short proto, __wsum sum); + __u32 len, __u8 proto, __wsum sum); static inline unsigned add32_with_carry(unsigned a, unsigned b) { diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 89949a299cd5..3d1a84383162 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -85,7 +85,7 @@ #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ #define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ -/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */ +#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */ #define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ #define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ #define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ @@ -106,6 +106,7 @@ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ #define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ +#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ @@ -219,6 +220,7 @@ #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ @@ -229,6 +231,8 @@ #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ #define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ +#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ +#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ /* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ #define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ @@ -289,4 +293,12 @@ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#ifdef CONFIG_X86_32 +/* + * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional + * to avoid confusion. + */ +#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ +#endif + #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 278441f39856..eb5deb42484d 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -98,4 +98,27 @@ struct desc_ptr { #endif /* !__ASSEMBLY__ */ +/* Access rights as returned by LAR */ +#define AR_TYPE_RODATA (0 * (1 << 9)) +#define AR_TYPE_RWDATA (1 * (1 << 9)) +#define AR_TYPE_RODATA_EXPDOWN (2 * (1 << 9)) +#define AR_TYPE_RWDATA_EXPDOWN (3 * (1 << 9)) +#define AR_TYPE_XOCODE (4 * (1 << 9)) +#define AR_TYPE_XRCODE (5 * (1 << 9)) +#define AR_TYPE_XOCODE_CONF (6 * (1 << 9)) +#define AR_TYPE_XRCODE_CONF (7 * (1 << 9)) +#define AR_TYPE_MASK (7 * (1 << 9)) + +#define AR_DPL0 (0 * (1 << 13)) +#define AR_DPL3 (3 * (1 << 13)) +#define AR_DPL_MASK (3 * (1 << 13)) + +#define AR_A (1 << 8) /* "Accessed" */ +#define AR_S (1 << 12) /* If clear, "System" segment */ +#define AR_P (1 << 15) /* "Present" */ +#define AR_AVL (1 << 20) /* "AVaiLable" (no HW effect) */ +#define AR_L (1 << 21) /* "Long mode" for code segments */ +#define AR_DB (1 << 22) /* D/B, effect depends on type */ +#define AR_G (1 << 23) /* "Granularity" (limit in pages) */ + #endif /* _ASM_X86_DESC_DEFS_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 0010c78c4998..53748c45e488 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -3,6 +3,7 @@ #include <asm/fpu/api.h> #include <asm/pgtable.h> +#include <asm/tlb.h> /* * We map the EFI regions needed for runtime services non-contiguously, @@ -25,6 +26,8 @@ #define EFI32_LOADER_SIGNATURE "EL32" #define EFI64_LOADER_SIGNATURE "EL64" +#define MAX_CMDLINE_ADDRESS UINT_MAX + #ifdef CONFIG_X86_32 @@ -64,6 +67,17 @@ extern u64 asmlinkage efi_call(void *fp, ...); #define efi_call_phys(f, args...) efi_call((f), args) +/* + * Scratch space used for switching the pagetable in the EFI stub + */ +struct efi_scratch { + u64 r15; + u64 prev_cr3; + pgd_t *efi_pgt; + bool use_pgd; + u64 phys_stack; +} __packed; + #define efi_call_virt(f, ...) \ ({ \ efi_status_t __s; \ @@ -71,7 +85,20 @@ extern u64 asmlinkage efi_call(void *fp, ...); efi_sync_low_kernel_mappings(); \ preempt_disable(); \ __kernel_fpu_begin(); \ + \ + if (efi_scratch.use_pgd) { \ + efi_scratch.prev_cr3 = read_cr3(); \ + write_cr3((unsigned long)efi_scratch.efi_pgt); \ + __flush_tlb_all(); \ + } \ + \ __s = efi_call((void *)efi.systab->runtime->f, __VA_ARGS__); \ + \ + if (efi_scratch.use_pgd) { \ + write_cr3(efi_scratch.prev_cr3); \ + __flush_tlb_all(); \ + } \ + \ __kernel_fpu_end(); \ preempt_enable(); \ __s; \ @@ -111,11 +138,12 @@ extern void __init efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); extern void __init efi_map_region_fixed(efi_memory_desc_t *md); extern void efi_sync_low_kernel_mappings(void); +extern int __init efi_alloc_page_tables(void); extern int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages); extern void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages); extern void __init old_map_region(efi_memory_desc_t *md); extern void __init runtime_code_page_mkexec(void); -extern void __init efi_runtime_mkexec(void); +extern void __init efi_runtime_update_mappings(void); extern void __init efi_dump_pagetable(void); extern void __init efi_apply_memmap_quirks(void); extern int __init efi_reuse_config(u64 tables, int nr_tables); diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 1514753fd435..15340e36ddcb 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -256,7 +256,7 @@ extern int force_personality32; instruction set this CPU supports. This could be done in user space, but it's not easy, and we've already done it here. */ -#define ELF_HWCAP (boot_cpu_data.x86_capability[0]) +#define ELF_HWCAP (boot_cpu_data.x86_capability[CPUID_1_EDX]) /* This yields a string that ld.so will use to load implementation specific libraries for optimization. This is more specific in diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 9994d4229002..38951b0fcc5a 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -20,17 +20,16 @@ /* Supported features which support lazy state saving */ #define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \ - XFEATURE_MASK_SSE) - -/* Supported features which require eager state saving */ -#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | \ - XFEATURE_MASK_BNDCSR | \ + XFEATURE_MASK_SSE | \ XFEATURE_MASK_YMM | \ XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ XFEATURE_MASK_Hi16_ZMM | \ XFEATURE_MASK_PKRU) +/* Supported features which require eager state saving */ +#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR) + /* All currently supported features */ #define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER) diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h deleted file mode 100644 index b3799d88ffcf..000000000000 --- a/arch/x86/include/asm/gpio.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef __LINUX_GPIO_H -#warning Include linux/gpio.h instead of asm/gpio.h -#include <linux/gpio.h> -#endif diff --git a/arch/x86/include/asm/imr.h b/arch/x86/include/asm/imr.h index cd2ce4068441..ebea2c9d2cdc 100644 --- a/arch/x86/include/asm/imr.h +++ b/arch/x86/include/asm/imr.h @@ -53,7 +53,7 @@ #define IMR_MASK (IMR_ALIGN - 1) int imr_add_range(phys_addr_t base, size_t size, - unsigned int rmask, unsigned int wmask, bool lock); + unsigned int rmask, unsigned int wmask); int imr_remove_range(phys_addr_t base, size_t size); diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index cfc9a0d2d07c..a4fe16e42b7b 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -57,67 +57,13 @@ static inline void __xapic_wait_icr_idle(void) cpu_relax(); } -static inline void -__default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) -{ - /* - * Subtle. In the case of the 'never do double writes' workaround - * we have to lock out interrupts to be safe. As we don't care - * of the value read we use an atomic rmw access to avoid costly - * cli/sti. Otherwise we use an even cheaper single atomic write - * to the APIC. - */ - unsigned int cfg; - - /* - * Wait for idle. - */ - __xapic_wait_icr_idle(); - - /* - * No need to touch the target chip field - */ - cfg = __prepare_ICR(shortcut, vector, dest); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - native_apic_mem_write(APIC_ICR, cfg); -} +void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest); /* * This is used to send an IPI with no shorthand notation (the destination is * specified in bits 56 to 63 of the ICR). */ -static inline void - __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) -{ - unsigned long cfg; - - /* - * Wait for idle. - */ - if (unlikely(vector == NMI_VECTOR)) - safe_apic_wait_icr_idle(); - else - __xapic_wait_icr_idle(); - - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(mask); - native_apic_mem_write(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector, dest); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - native_apic_mem_write(APIC_ICR, cfg); -} +void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest); extern void default_send_IPI_single(int cpu, int vector); extern void default_send_IPI_single_phys(int cpu, int vector); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 44adbb819041..01c8b501cb6d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -32,6 +32,7 @@ #include <asm/mtrr.h> #include <asm/msr-index.h> #include <asm/asm.h> +#include <asm/kvm_page_track.h> #define KVM_MAX_VCPUS 255 #define KVM_SOFT_MAX_VCPUS 160 @@ -214,6 +215,14 @@ struct kvm_mmu_memory_cache { void *objects[KVM_NR_MEM_OBJS]; }; +/* + * the pages used as guest page table on soft mmu are tracked by + * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used + * by indirect shadow page can not be more than 15 bits. + * + * Currently, we used 14 bits that are @level, @cr4_pae, @quadrant, @access, + * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp. + */ union kvm_mmu_page_role { unsigned word; struct { @@ -276,7 +285,7 @@ struct kvm_mmu_page { #endif /* Number of writes since the last time traversal visited this page. */ - int write_flooding_count; + atomic_t write_flooding_count; }; struct kvm_pio_request { @@ -338,12 +347,8 @@ struct kvm_mmu { struct rsvd_bits_validate guest_rsvd_check; - /* - * Bitmap: bit set = last pte in walk - * index[0:1]: level (zero-based) - * index[2]: pte.ps - */ - u8 last_pte_bitmap; + /* Can have large pages at levels 2..last_nonleaf_level-1. */ + u8 last_nonleaf_level; bool nx; @@ -498,7 +503,6 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_page_header_cache; struct fpu guest_fpu; - bool eager_fpu; u64 xcr0; u64 guest_supported_xcr0; u32 guest_xstate_size; @@ -644,12 +648,13 @@ struct kvm_vcpu_arch { }; struct kvm_lpage_info { - int write_count; + int disallow_lpage; }; struct kvm_arch_memory_slot { struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES]; struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; + unsigned short *gfn_track[KVM_PAGE_TRACK_MAX]; }; /* @@ -694,6 +699,8 @@ struct kvm_arch { */ struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; + struct kvm_page_track_notifier_node mmu_sp_tracker; + struct kvm_page_track_notifier_head track_notifier_head; struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; @@ -754,6 +761,8 @@ struct kvm_arch { bool irqchip_split; u8 nr_reserved_ioapic_pins; + + bool disabled_lapic_found; }; struct kvm_vm_stat { @@ -988,6 +997,8 @@ void kvm_mmu_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); void kvm_mmu_setup(struct kvm_vcpu *vcpu); +void kvm_mmu_init_vm(struct kvm *kvm); +void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask); @@ -1127,8 +1138,6 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); -void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h new file mode 100644 index 000000000000..c2b8d24a235c --- /dev/null +++ b/arch/x86/include/asm/kvm_page_track.h @@ -0,0 +1,61 @@ +#ifndef _ASM_X86_KVM_PAGE_TRACK_H +#define _ASM_X86_KVM_PAGE_TRACK_H + +enum kvm_page_track_mode { + KVM_PAGE_TRACK_WRITE, + KVM_PAGE_TRACK_MAX, +}; + +/* + * The notifier represented by @kvm_page_track_notifier_node is linked into + * the head which will be notified when guest is triggering the track event. + * + * Write access on the head is protected by kvm->mmu_lock, read access + * is protected by track_srcu. + */ +struct kvm_page_track_notifier_head { + struct srcu_struct track_srcu; + struct hlist_head track_notifier_list; +}; + +struct kvm_page_track_notifier_node { + struct hlist_node node; + + /* + * It is called when guest is writing the write-tracked page + * and write emulation is finished at that time. + * + * @vcpu: the vcpu where the write access happened. + * @gpa: the physical address written by guest. + * @new: the data was written to the address. + * @bytes: the written length. + */ + void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, + int bytes); +}; + +void kvm_page_track_init(struct kvm *kvm); + +void kvm_page_track_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont); +int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, + unsigned long npages); + +void kvm_slot_page_track_add_page(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn, + enum kvm_page_track_mode mode); +void kvm_slot_page_track_remove_page(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn, + enum kvm_page_track_mode mode); +bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode); + +void +kvm_page_track_register_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n); +void +kvm_page_track_unregister_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n); +void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, + int bytes); +#endif diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c1adf33fdd0d..bc62e7cbf1b1 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -17,15 +17,8 @@ static inline bool kvm_check_and_clear_guest_paused(void) } #endif /* CONFIG_KVM_GUEST */ -#ifdef CONFIG_DEBUG_RODATA #define KVM_HYPERCALL \ ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL) -#else -/* On AMD processors, vmcall will generate a trap that we will - * then rewrite to the appropriate instruction. - */ -#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" -#endif /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall * instruction. The hypervisor may replace it with something else but only the diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index 19c099afa861..7e68f9558552 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -25,7 +25,6 @@ #include <linux/module.h> #include <linux/ftrace.h> -#ifdef CONFIG_LIVEPATCH static inline int klp_check_compiler_support(void) { #ifndef CC_USING_FENTRY @@ -40,8 +39,5 @@ static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) { regs->ip = ip; } -#else -#error Live patching support is disabled; check CONFIG_LIVEPATCH -#endif #endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 2ea4527e462f..92b6f651fa4f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -40,8 +40,20 @@ #define MCI_STATUS_AR (1ULL<<55) /* Action required */ /* AMD-specific bits */ -#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */ +#define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */ #define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ +#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ + +/* + * McaX field if set indicates a given bank supports MCA extensions: + * - Deferred error interrupt type is specifiable by bank. + * - MCx_MISC0[BlkPtr] field indicates presence of extended MISC registers, + * But should not be used to determine MSR numbers. + * - TCC bit is present in MCx_STATUS. + */ +#define MCI_CONFIG_MCAX 0x1 +#define MCI_IPID_MCATYPE 0xFFFF0000 +#define MCI_IPID_HWID 0xFFF /* * Note that the full MCACOD field of IA32_MCi_STATUS MSR is @@ -91,6 +103,16 @@ #define MCE_LOG_LEN 32 #define MCE_LOG_SIGNATURE "MACHINECHECK" +/* AMD Scalable MCA */ +#define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003 +#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 +#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005 +#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a +#define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x))) + /* * This structure contains all data related to the MCE log. Also * carries a signature to make it easier to find from external @@ -113,6 +135,7 @@ struct mca_config { bool ignore_ce; bool disabled; bool ser; + bool recovery; bool bios_cmci_threshold; u8 banks; s8 bootlog; @@ -287,4 +310,49 @@ struct cper_sec_mem_err; extern void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err); +/* + * Enumerate new IP types and HWID values in AMD processors which support + * Scalable MCA. + */ +#ifdef CONFIG_X86_MCE_AMD +enum amd_ip_types { + SMCA_F17H_CORE = 0, /* Core errors */ + SMCA_DF, /* Data Fabric */ + SMCA_UMC, /* Unified Memory Controller */ + SMCA_PB, /* Parameter Block */ + SMCA_PSP, /* Platform Security Processor */ + SMCA_SMU, /* System Management Unit */ + N_AMD_IP_TYPES +}; + +struct amd_hwid { + const char *name; + unsigned int hwid; +}; + +extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES]; + +enum amd_core_mca_blocks { + SMCA_LS = 0, /* Load Store */ + SMCA_IF, /* Instruction Fetch */ + SMCA_L2_CACHE, /* L2 cache */ + SMCA_DE, /* Decoder unit */ + RES, /* Reserved */ + SMCA_EX, /* Execution unit */ + SMCA_FP, /* Floating Point */ + SMCA_L3_CACHE, /* L3 cache */ + N_CORE_MCA_BLOCKS +}; + +extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS]; + +enum amd_df_mca_blocks { + SMCA_CS = 0, /* Coherent Slave */ + SMCA_PIE, /* Power management, Interrupts, etc */ + N_DF_BLOCKS +}; + +extern const char * const amd_df_mcablock_names[N_DF_BLOCKS]; +#endif + #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 1e1b07a5a738..9d3a96c4da78 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -3,6 +3,7 @@ #include <asm/cpu.h> #include <linux/earlycpio.h> +#include <linux/initrd.h> #define native_rdmsr(msr, val1, val2) \ do { \ @@ -143,4 +144,29 @@ static inline void reload_early_microcode(void) { } static inline bool get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; } #endif + +static inline unsigned long get_initrd_start(void) +{ +#ifdef CONFIG_BLK_DEV_INITRD + return initrd_start; +#else + return 0; +#endif +} + +static inline unsigned long get_initrd_start_addr(void) +{ +#ifdef CONFIG_BLK_DEV_INITRD +#ifdef CONFIG_X86_32 + unsigned long *initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); + + return (unsigned long)__pa_nodebug(*initrd_start_p); +#else + return get_initrd_start(); +#endif +#else /* CONFIG_BLK_DEV_INITRD */ + return 0; +#endif +} + #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 8559b0102ea1..603417f8dd6c 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -40,7 +40,6 @@ struct extended_sigtable { #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) #define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) #define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) -#define DWSIZE (sizeof(u32)) #define get_totalsize(mc) \ (((struct microcode_intel *)mc)->hdr.datasize ? \ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b05402ef3b84..2da46ac16e37 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1,7 +1,12 @@ #ifndef _ASM_X86_MSR_INDEX_H #define _ASM_X86_MSR_INDEX_H -/* CPU model specific register (MSR) numbers */ +/* + * CPU model specific register (MSR) numbers. + * + * Do not add new entries to this file unless the definitions are shared + * between multiple compilation units. + */ /* x86-64 specific MSRs */ #define MSR_EFER 0xc0000080 /* extended feature register */ @@ -230,10 +235,10 @@ #define HWP_PACKAGE_LEVEL_REQUEST_BIT (1<<11) /* IA32_HWP_CAPABILITIES */ -#define HWP_HIGHEST_PERF(x) (x & 0xff) -#define HWP_GUARANTEED_PERF(x) ((x & (0xff << 8)) >>8) -#define HWP_MOSTEFFICIENT_PERF(x) ((x & (0xff << 16)) >>16) -#define HWP_LOWEST_PERF(x) ((x & (0xff << 24)) >>24) +#define HWP_HIGHEST_PERF(x) (((x) >> 0) & 0xff) +#define HWP_GUARANTEED_PERF(x) (((x) >> 8) & 0xff) +#define HWP_MOSTEFFICIENT_PERF(x) (((x) >> 16) & 0xff) +#define HWP_LOWEST_PERF(x) (((x) >> 24) & 0xff) /* IA32_HWP_REQUEST */ #define HWP_MIN_PERF(x) (x & 0xff) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index f6192502149e..601f1b8f9961 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -13,6 +13,7 @@ #include <linux/bug.h> #include <linux/types.h> #include <linux/cpumask.h> +#include <asm/frame.h> static inline int paravirt_enabled(void) { @@ -756,15 +757,19 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, * call. The return value in rax/eax will not be saved, even for void * functions. */ +#define PV_THUNK_NAME(func) "__raw_callee_save_" #func #define PV_CALLEE_SAVE_REGS_THUNK(func) \ extern typeof(func) __raw_callee_save_##func; \ \ asm(".pushsection .text;" \ - ".globl __raw_callee_save_" #func " ; " \ - "__raw_callee_save_" #func ": " \ + ".globl " PV_THUNK_NAME(func) ";" \ + ".type " PV_THUNK_NAME(func) ", @function;" \ + PV_THUNK_NAME(func) ":" \ + FRAME_BEGIN \ PV_SAVE_ALL_CALLER_REGS \ "call " #func ";" \ PV_RESTORE_ALL_CALLER_REGS \ + FRAME_END \ "ret;" \ ".popsection") diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 77db5616a473..e8c2326478c8 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -466,8 +466,9 @@ int paravirt_disable_iospace(void); * makes sure the incoming and outgoing types are always correct. */ #ifdef CONFIG_X86_32 -#define PVOP_VCALL_ARGS \ - unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx +#define PVOP_VCALL_ARGS \ + unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; \ + register void *__sp asm("esp") #define PVOP_CALL_ARGS PVOP_VCALL_ARGS #define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) @@ -485,9 +486,10 @@ int paravirt_disable_iospace(void); #define VEXTRA_CLOBBERS #else /* CONFIG_X86_64 */ /* [re]ax isn't an arg, but the return val */ -#define PVOP_VCALL_ARGS \ - unsigned long __edi = __edi, __esi = __esi, \ - __edx = __edx, __ecx = __ecx, __eax = __eax +#define PVOP_VCALL_ARGS \ + unsigned long __edi = __edi, __esi = __esi, \ + __edx = __edx, __ecx = __ecx, __eax = __eax; \ + register void *__sp asm("rsp") #define PVOP_CALL_ARGS PVOP_VCALL_ARGS #define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) @@ -526,7 +528,7 @@ int paravirt_disable_iospace(void); asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr \ + : call_clbr, "+r" (__sp) \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ @@ -536,7 +538,7 @@ int paravirt_disable_iospace(void); asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr \ + : call_clbr, "+r" (__sp) \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ @@ -563,7 +565,7 @@ int paravirt_disable_iospace(void); asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr \ + : call_clbr, "+r" (__sp) \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 462594320d39..9ab7507ca1c2 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -20,6 +20,9 @@ struct pci_sysdata { #ifdef CONFIG_X86_64 void *iommu; /* IOMMU private data */ #endif +#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN + void *fwnode; /* IRQ domain for MSI assignment */ +#endif }; extern int pci_routeirq; @@ -32,6 +35,7 @@ extern int noioapicreroute; static inline int pci_domain_nr(struct pci_bus *bus) { struct pci_sysdata *sd = bus->sysdata; + return sd->domain; } @@ -41,6 +45,17 @@ static inline int pci_proc_domain(struct pci_bus *bus) } #endif +#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN +static inline void *_pci_root_bus_fwnode(struct pci_bus *bus) +{ + struct pci_sysdata *sd = bus->sysdata; + + return sd->fwnode; +} + +#define pci_root_bus_fwnode _pci_root_bus_fwnode +#endif + /* Can be used to override the logic in pci_scan_bus for skipping already-configured bus numbers - to be used for buggy BIOSes or architectures with incomplete PCI setup by the loader */ @@ -105,9 +120,6 @@ void native_restore_msi_irqs(struct pci_dev *dev); #include <asm/pci_64.h> #endif -/* implement the pci_ DMA API in terms of the generic device dma_ one */ -#include <asm-generic/pci-dma-compat.h> - /* generic pci stuff */ #include <asm-generic/pci.h> diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 46873fbd44e1..d08eacd298c2 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -93,6 +93,8 @@ extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); +extern bool mp_should_keep_irq(struct device *dev); + struct pci_raw_ops { int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 7bcb861a04e5..5a2ed3ed2f26 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -165,6 +165,7 @@ struct x86_pmu_capability { #define GLOBAL_STATUS_ASIF BIT_ULL(60) #define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59) #define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58) +#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(55) /* * IBS cpuid feature detection diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index c57fd1ea9689..bf8b35d2035a 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -137,6 +137,11 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size) arch_wb_cache_pmem(addr, size); } +static inline void arch_invalidate_pmem(void __pmem *addr, size_t size) +{ + clflush_cache_range((void __force *) addr, size); +} + static inline bool __arch_has_wmb_pmem(void) { /* diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 01bcde84d3e4..d397deb58146 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -94,10 +94,19 @@ static __always_inline bool should_resched(int preempt_offset) #ifdef CONFIG_PREEMPT extern asmlinkage void ___preempt_schedule(void); -# define __preempt_schedule() asm ("call ___preempt_schedule") +# define __preempt_schedule() \ +({ \ + register void *__sp asm(_ASM_SP); \ + asm volatile ("call ___preempt_schedule" : "+r"(__sp)); \ +}) + extern asmlinkage void preempt_schedule(void); extern asmlinkage void ___preempt_schedule_notrace(void); -# define __preempt_schedule_notrace() asm ("call ___preempt_schedule_notrace") +# define __preempt_schedule_notrace() \ +({ \ + register void *__sp asm(_ASM_SP); \ + asm volatile ("call ___preempt_schedule_notrace" : "+r"(__sp)); \ +}) extern asmlinkage void preempt_schedule_notrace(void); #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ecb410310e70..983738ac014c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -128,6 +128,8 @@ struct cpuinfo_x86 { u16 booted_cores; /* Physical processor id: */ u16 phys_proc_id; + /* Logical processor id: */ + u16 logical_proc_id; /* Core id: */ u16 cpu_core_id; /* Compute unit id */ @@ -297,10 +299,13 @@ struct tss_struct { */ unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; +#ifdef CONFIG_X86_32 /* - * Space for the temporary SYSENTER stack: + * Space for the temporary SYSENTER stack. */ + unsigned long SYSENTER_stack_canary; unsigned long SYSENTER_stack[64]; +#endif } ____cacheline_aligned; diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index a4a77286cb1d..9b9b30b19441 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -7,12 +7,23 @@ void syscall_init(void); +#ifdef CONFIG_X86_64 void entry_SYSCALL_64(void); -void entry_SYSCALL_compat(void); +#endif + +#ifdef CONFIG_X86_32 void entry_INT80_32(void); -void entry_INT80_compat(void); void entry_SYSENTER_32(void); +void __begin_SYSENTER_singlestep_region(void); +void __end_SYSENTER_singlestep_region(void); +#endif + +#ifdef CONFIG_IA32_EMULATION void entry_SYSENTER_compat(void); +void __end_entry_SYSENTER_compat(void); +void entry_SYSCALL_compat(void); +void entry_INT80_compat(void); +#endif void x86_configure_nx(void); void x86_report_nx(void); diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 9f92c180ed2f..9d55f9b6e167 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -36,8 +36,10 @@ PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath); */ asm (".pushsection .text;" ".globl " PV_UNLOCK ";" + ".type " PV_UNLOCK ", @function;" ".align 4,0x90;" PV_UNLOCK ": " + FRAME_BEGIN "push %rdx;" "mov $0x1,%eax;" "xor %edx,%edx;" @@ -45,6 +47,7 @@ asm (".pushsection .text;" "cmp $0x1,%al;" "jne .slowpath;" "pop %rdx;" + FRAME_END "ret;" ".slowpath: " "push %rsi;" @@ -52,6 +55,7 @@ asm (".pushsection .text;" "call " PV_UNLOCK_SLOWPATH ";" "pop %rsi;" "pop %rdx;" + FRAME_END "ret;" ".size " PV_UNLOCK ", .-" PV_UNLOCK ";" ".popsection"); diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 0a5242428659..13b6cdd0af57 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -7,7 +7,7 @@ extern char __brk_base[], __brk_limit[]; extern struct exception_table_entry __stop___ex_table[]; -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) extern char __end_rodata_hpage_align[]; #endif diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 89db46752a8f..452c88b8ad06 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -13,7 +13,6 @@ X86_EFLAGS_CF | X86_EFLAGS_RF) void signal_fault(struct pt_regs *regs, void __user *frame, char *where); -int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc); int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask); diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 70bbe39043a9..7c247e7404be 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -37,7 +37,7 @@ print_context_stack_bp(struct thread_info *tinfo, /* Generic stack tracer with callbacks */ struct stacktrace_ops { - void (*address)(void *data, unsigned long address, int reliable); + int (*address)(void *data, unsigned long address, int reliable); /* On negative return stop dumping */ int (*stack)(void *data, char *name); walk_stack_t walk_stack; diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index ff8b9a17dc4b..ca6ba3607705 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -78,6 +78,19 @@ int strcmp(const char *cs, const char *ct); #define memset(s, c, n) __memset(s, c, n) #endif +/** + * memcpy_mcsafe - copy memory with indication if a machine check happened + * + * @dst: destination address + * @src: source address + * @cnt: number of bytes to copy + * + * Low level memory copy function that catches machine checks + * + * Return true for success, false for fail + */ +bool memcpy_mcsafe(void *dst, const void *src, size_t cnt); + #endif /* __KERNEL__ */ #endif /* _ASM_X86_STRING_64_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index c0778fcab06d..82866697fcf1 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -134,10 +134,13 @@ struct thread_info { #define _TIF_ADDR32 (1 << TIF_ADDR32) #define _TIF_X32 (1 << TIF_X32) -/* work to do in syscall_trace_enter() */ +/* + * work to do in syscall_trace_enter(). Also includes TIF_NOHZ for + * enter_from_user_mode() + */ #define _TIF_WORK_SYSCALL_ENTRY \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ - _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \ _TIF_NOHZ) /* work to do on any return to user space */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 0fb46482dfde..7f991bd5031b 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -119,12 +119,23 @@ static inline void setup_node_to_cpumask_map(void) { } extern const struct cpumask *cpu_coregroup_mask(int cpu); +#define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id) #define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) #ifdef ENABLE_TOPO_DEFINES #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) + +extern unsigned int __max_logical_packages; +#define topology_max_packages() (__max_logical_packages) +int topology_update_package_map(unsigned int apicid, unsigned int cpu); +extern int topology_phys_to_logical_pkg(unsigned int pkg); +#else +#define topology_max_packages() (1) +static inline int +topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; } +static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 6d7c5479bcea..174c4212780a 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -29,6 +29,8 @@ static inline cycles_t get_cycles(void) return rdtsc(); } +extern struct system_counterval_t convert_art_to_tsc(cycle_t art); + extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a4a30e4b2d34..88bff6dd23ad 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -90,12 +90,11 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un likely(!__range_not_ok(addr, size, user_addr_max())) /* - * The exception table consists of pairs of addresses relative to the - * exception table enty itself: the first is the address of an - * instruction that is allowed to fault, and the second is the address - * at which the program should continue. No registers are modified, - * so it is entirely up to the continuation code to figure out what to - * do. + * The exception table consists of triples of addresses relative to the + * exception table entry itself. The first address is of an instruction + * that is allowed to fault, the second is the target at which the program + * should continue. The third is a handler function to deal with the fault + * caused by the instruction in the first field. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. This means when everything is well, @@ -104,13 +103,14 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un */ struct exception_table_entry { - int insn, fixup; + int insn, fixup, handler; }; /* This is not the generic standard exception_table_entry format */ #define ARCH_HAS_SORT_EXTABLE #define ARCH_HAS_SEARCH_EXTABLE -extern int fixup_exception(struct pt_regs *regs); +extern int fixup_exception(struct pt_regs *regs, int trapnr); +extern bool ex_has_fault_handler(unsigned long ip); extern int early_fixup_exception(unsigned long *ip); /* @@ -179,10 +179,11 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) ({ \ int __ret_gu; \ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ + register void *__sp asm(_ASM_SP); \ __chk_user_ptr(ptr); \ might_fault(); \ - asm volatile("call __get_user_%P3" \ - : "=a" (__ret_gu), "=r" (__val_gu) \ + asm volatile("call __get_user_%P4" \ + : "=a" (__ret_gu), "=r" (__val_gu), "+r" (__sp) \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ __builtin_expect(__ret_gu, 0); \ diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index f5dcb5204dcd..3fe0eac59462 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -48,20 +48,28 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) switch (n) { case 1: + __uaccess_begin(); __put_user_size(*(u8 *)from, (u8 __user *)to, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __put_user_size(*(u16 *)from, (u16 __user *)to, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __put_user_size(*(u32 *)from, (u32 __user *)to, 4, ret, 4); + __uaccess_end(); return ret; case 8: + __uaccess_begin(); __put_user_size(*(u64 *)from, (u64 __user *)to, 8, ret, 8); + __uaccess_end(); return ret; } } @@ -103,13 +111,19 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) switch (n) { case 1: + __uaccess_begin(); __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); return ret; } } @@ -148,13 +162,19 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) switch (n) { case 1: + __uaccess_begin(); __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); return ret; } } @@ -170,13 +190,19 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to, switch (n) { case 1: + __uaccess_begin(); __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); return ret; } } diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 3bcdcc84259d..a12a047184ee 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -110,9 +110,10 @@ extern struct { char _entry[32]; } hypercall_page[]; register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \ register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \ register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \ - register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; + register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; \ + register void *__sp asm(_ASM_SP); -#define __HYPERCALL_0PARAM "=r" (__res) +#define __HYPERCALL_0PARAM "=r" (__res), "+r" (__sp) #define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1) #define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2) #define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3) diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h index 968d57dd54c9..f320ee32d5a1 100644 --- a/arch/x86/include/asm/xen/pci.h +++ b/arch/x86/include/asm/xen/pci.h @@ -57,7 +57,7 @@ static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev, { if (xen_pci_frontend && xen_pci_frontend->enable_msi) return xen_pci_frontend->enable_msi(dev, vectors); - return -ENODEV; + return -ENOSYS; } static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev) { @@ -69,7 +69,7 @@ static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev, { if (xen_pci_frontend && xen_pci_frontend->enable_msix) return xen_pci_frontend->enable_msix(dev, vectors, nvec); - return -ENODEV; + return -ENOSYS; } static inline void xen_pci_frontend_disable_msix(struct pci_dev *dev) { diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 7956412d09bd..9b1a91834ac8 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -226,7 +226,9 @@ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) /* Declare the various hypercall operations. */ -#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008 +#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 +#define HVCALL_POST_MESSAGE 0x005c +#define HVCALL_SIGNAL_EVENT 0x005d #define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001 #define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12 diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index d485232f1e9f..62d4111c1c54 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -256,7 +256,7 @@ struct sigcontext_64 { __u16 cs; __u16 gs; __u16 fs; - __u16 __pad0; + __u16 ss; __u64 err; __u64 trapno; __u64 oldmask; @@ -341,9 +341,37 @@ struct sigcontext { __u64 rip; __u64 eflags; /* RFLAGS */ __u16 cs; + + /* + * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"), + * Linux saved and restored fs and gs in these slots. This + * was counterproductive, as fsbase and gsbase were never + * saved, so arch_prctl was presumably unreliable. + * + * These slots should never be reused without extreme caution: + * + * - Some DOSEMU versions stash fs and gs in these slots manually, + * thus overwriting anything the kernel expects to be preserved + * in these slots. + * + * - If these slots are ever needed for any other purpose, + * there is some risk that very old 64-bit binaries could get + * confused. I doubt that many such binaries still work, + * though, since the same patch in 2.5.64 also removed the + * 64-bit set_thread_area syscall, so it appears that there + * is no TLS API beyond modify_ldt that works in both pre- + * and post-2.5.64 kernels. + * + * If the kernel ever adds explicit fs, gs, fsbase, and gsbase + * save/restore, it will most likely need to be opt-in and use + * different context slots. + */ __u16 gs; __u16 fs; - __u16 __pad0; + union { + __u16 ss; /* If UC_SIGCONTEXT_SS */ + __u16 __pad0; /* Alias name for old (!UC_SIGCONTEXT_SS) user-space */ + }; __u64 err; __u64 trapno; __u64 oldmask; diff --git a/arch/x86/include/uapi/asm/ucontext.h b/arch/x86/include/uapi/asm/ucontext.h index b7c29c8017f2..e3d1ec90616e 100644 --- a/arch/x86/include/uapi/asm/ucontext.h +++ b/arch/x86/include/uapi/asm/ucontext.h @@ -1,11 +1,54 @@ #ifndef _ASM_X86_UCONTEXT_H #define _ASM_X86_UCONTEXT_H -#define UC_FP_XSTATE 0x1 /* indicates the presence of extended state - * information in the memory layout pointed - * by the fpstate pointer in the ucontext's - * sigcontext struct (uc_mcontext). - */ +/* + * Indicates the presence of extended state information in the memory + * layout pointed by the fpstate pointer in the ucontext's sigcontext + * struct (uc_mcontext). + */ +#define UC_FP_XSTATE 0x1 + +#ifdef __x86_64__ +/* + * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on + * kernels that save SS in the sigcontext. All kernels that set + * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp + * regardless of SS (i.e. they implement espfix). + * + * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS + * when delivering a signal that came from 64-bit code. + * + * Sigreturn restores SS as follows: + * + * if (saved SS is valid || UC_STRICT_RESTORE_SS is set || + * saved CS is not 64-bit) + * new SS = saved SS (will fail IRET and signal if invalid) + * else + * new SS = a flat 32-bit data segment + * + * This behavior serves three purposes: + * + * - Legacy programs that construct a 64-bit sigcontext from scratch + * with zero or garbage in the SS slot (e.g. old CRIU) and call + * sigreturn will still work. + * + * - Old DOSEMU versions sometimes catch a signal from a segmented + * context, delete the old SS segment (with modify_ldt), and change + * the saved CS to a 64-bit segment. These DOSEMU versions expect + * sigreturn to send them back to 64-bit mode without killing them, + * despite the fact that the SS selector when the signal was raised is + * no longer valid. UC_STRICT_RESTORE_SS will be clear, so the kernel + * will fix up SS for these DOSEMU versions. + * + * - Old and new programs that catch a signal and return without + * modifying the saved context will end up in exactly the state they + * started in, even if they were running in a segmented context when + * the signal was raised.. Old kernels would lose track of the + * previous SS value. + */ +#define UC_SIGCONTEXT_SS 0x2 +#define UC_STRICT_RESTORE_SS 0x4 +#endif #include <asm-generic/ucontext.h> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b1b78ffe01d0..d5fb0871aba3 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -16,9 +16,14 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg endif -KASAN_SANITIZE_head$(BITS).o := n -KASAN_SANITIZE_dumpstack.o := n -KASAN_SANITIZE_dumpstack_$(BITS).o := n +KASAN_SANITIZE_head$(BITS).o := n +KASAN_SANITIZE_dumpstack.o := n +KASAN_SANITIZE_dumpstack_$(BITS).o := n + +OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_test_nx.o := y CFLAGS_irq.o := -I$(src)/../include/asm/trace diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index d1daead5fcdd..adb3eaf8fe2a 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -16,6 +16,7 @@ #include <asm/cacheflush.h> #include <asm/realmode.h> +#include <linux/ftrace.h> #include "../../realmode/rm/wakeup.h" #include "sleep.h" @@ -107,7 +108,13 @@ int x86_acpi_suspend_lowlevel(void) saved_magic = 0x123456789abcdef0L; #endif /* CONFIG_64BIT */ + /* + * Pause/unpause graph tracing around do_suspend_lowlevel as it has + * inconsistent call/return info after it jumps to the wakeup vector. + */ + pause_graph_tracing(); do_suspend_lowlevel(); + unpause_graph_tracing(); return 0; } diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 8c35df468104..169963f471bb 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -5,6 +5,7 @@ #include <asm/page_types.h> #include <asm/msr.h> #include <asm/asm-offsets.h> +#include <asm/frame.h> # Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2 @@ -39,6 +40,7 @@ bogus_64_magic: jmp bogus_64_magic ENTRY(do_suspend_lowlevel) + FRAME_BEGIN subq $8, %rsp xorl %eax, %eax call save_processor_state @@ -109,6 +111,7 @@ ENTRY(do_suspend_lowlevel) xorl %eax, %eax addq $8, %rsp + FRAME_END jmp restore_processor_state ENDPROC(do_suspend_lowlevel) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 6e85f713641d..0a2bb1f62e72 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -227,19 +227,11 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp) return 0; } -static int gart_fix_e820 __initdata = 1; +static bool gart_fix_e820 __initdata = true; static int __init parse_gart_mem(char *p) { - if (!p) - return -EINVAL; - - if (!strncmp(p, "off", 3)) - gart_fix_e820 = 0; - else if (!strncmp(p, "on", 2)) - gart_fix_e820 = 1; - - return 0; + return kstrtobool(p, &gart_fix_e820); } early_param("gart_fix_e820", parse_gart_mem); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8a5cddac7d44..531b9611c51d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2078,6 +2078,20 @@ int generic_processor_info(int apicid, int version) cpu = cpumask_next_zero(-1, cpu_present_mask); /* + * This can happen on physical hotplug. The sanity check at boot time + * is done from native_smp_prepare_cpus() after num_possible_cpus() is + * established. + */ + if (topology_update_package_map(apicid, cpu) < 0) { + int thiscpu = max + disabled_cpus; + + pr_warning("ACPI: Package limit reached. Processor %d/0x%x ignored.\n", + thiscpu, apicid); + disabled_cpus++; + return -ENOSPC; + } + + /* * Validate version */ if (version == 0x0) { diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 9968f30cca3e..76f89e2b245a 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -53,7 +53,7 @@ void flat_init_apic_ldr(void) apic_write(APIC_LDR, val); } -static inline void _flat_send_IPI_mask(unsigned long mask, int vector) +static void _flat_send_IPI_mask(unsigned long mask, int vector) { unsigned long flags; diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index eb45fc9b6124..28bde88b0085 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -18,6 +18,66 @@ #include <asm/proto.h> #include <asm/ipi.h> +void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) +{ + /* + * Subtle. In the case of the 'never do double writes' workaround + * we have to lock out interrupts to be safe. As we don't care + * of the value read we use an atomic rmw access to avoid costly + * cli/sti. Otherwise we use an even cheaper single atomic write + * to the APIC. + */ + unsigned int cfg; + + /* + * Wait for idle. + */ + __xapic_wait_icr_idle(); + + /* + * No need to touch the target chip field + */ + cfg = __prepare_ICR(shortcut, vector, dest); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + native_apic_mem_write(APIC_ICR, cfg); +} + +/* + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). + */ +void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) +{ + unsigned long cfg; + + /* + * Wait for idle. + */ + if (unlikely(vector == NMI_VECTOR)) + safe_apic_wait_icr_idle(); + else + __xapic_wait_icr_idle(); + + /* + * prepare target chip field + */ + cfg = __prepare_ICR2(mask); + native_apic_mem_write(APIC_ICR2, cfg); + + /* + * program the ICR + */ + cfg = __prepare_ICR(0, vector, dest); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + native_apic_mem_write(APIC_ICR, cfg); +} + void default_send_IPI_single_phys(int cpu, int vector) { unsigned long flags; diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 84a7524b202c..5c042466f274 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -59,7 +59,6 @@ void common(void) { #ifdef CONFIG_PARAVIRT BLANK(); - OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index fdeb0ce07c16..ecdc1d217dc0 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -52,6 +52,11 @@ void foo(void) DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - offsetofend(struct tss_struct, SYSENTER_stack)); + /* Offset from cpu_tss to SYSENTER_stack */ + OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); + /* Size of SYSENTER_stack */ + DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); + #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index faa7b5204129..0d373d7affc8 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -30,33 +30,11 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_PERF_EVENTS) += perf_event.o - -ifdef CONFIG_PERF_EVENTS -obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o perf_event_amd_uncore.o -ifdef CONFIG_AMD_IOMMU -obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o -endif -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_cstate.o - -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ - perf_event_intel_uncore_snb.o \ - perf_event_intel_uncore_snbep.o \ - perf_event_intel_uncore_nhmex.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o -obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o -endif - - obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_MICROCODE) += microcode/ -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a07956a08936..5026a13356c4 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -75,7 +75,10 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) */ extern __visible void vide(void); -__asm__(".globl vide\n\t.align 4\nvide: ret"); +__asm__(".globl vide\n" + ".type vide, @function\n" + ".align 4\n" + "vide: ret\n"); static void init_amd_k5(struct cpuinfo_x86 *c) { @@ -117,7 +120,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) void (*f_vide)(void); u64 d, d2; - printk(KERN_INFO "AMD K6 stepping B detected - "); + pr_info("AMD K6 stepping B detected - "); /* * It looks like AMD fixed the 2.6.2 bug and improved indirect @@ -133,10 +136,9 @@ static void init_amd_k6(struct cpuinfo_x86 *c) d = d2-d; if (d > 20*K6_BUG_LOOP) - printk(KERN_CONT - "system stability may be impaired when more than 32 MB are used.\n"); + pr_cont("system stability may be impaired when more than 32 MB are used.\n"); else - printk(KERN_CONT "probably OK (after B9730xxxx).\n"); + pr_cont("probably OK (after B9730xxxx).\n"); } /* K6 with old style WHCR */ @@ -154,7 +156,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) wbinvd(); wrmsr(MSR_K6_WHCR, l, h); local_irq_restore(flags); - printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n", + pr_info("Enabling old style K6 write allocation for %d Mb\n", mbytes); } return; @@ -175,7 +177,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) wbinvd(); wrmsr(MSR_K6_WHCR, l, h); local_irq_restore(flags); - printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n", + pr_info("Enabling new style K6 write allocation for %d Mb\n", mbytes); } @@ -202,7 +204,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c) */ if (c->x86_model >= 6 && c->x86_model <= 10) { if (!cpu_has(c, X86_FEATURE_XMM)) { - printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); + pr_info("Enabling disabled K7/SSE Support.\n"); msr_clear_bit(MSR_K7_HWCR, 15); set_cpu_cap(c, X86_FEATURE_XMM); } @@ -216,9 +218,8 @@ static void init_amd_k7(struct cpuinfo_x86 *c) if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { rdmsr(MSR_K7_CLK_CTL, l, h); if ((l & 0xfff00000) != 0x20000000) { - printk(KERN_INFO - "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", - l, ((l & 0x000fffff)|0x20000000)); + pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", + l, ((l & 0x000fffff)|0x20000000)); wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); } } @@ -485,7 +486,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { unsigned long pfn = tseg >> PAGE_SHIFT; - printk(KERN_DEBUG "tseg: %010llx\n", tseg); + pr_debug("tseg: %010llx\n", tseg); if (pfn_range_is_mapped(pfn, pfn + 1)) set_memory_4k((unsigned long)__va(tseg), 1); } @@ -500,8 +501,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) rdmsrl(MSR_K7_HWCR, val); if (!(val & BIT(24))) - printk(KERN_WARNING FW_BUG "TSC doesn't count " - "with P0 frequency!\n"); + pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); } } diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c index 04f0fe5af83e..a972ac4c7e7d 100644 --- a/arch/x86/kernel/cpu/bugs_64.c +++ b/arch/x86/kernel/cpu/bugs_64.c @@ -15,7 +15,7 @@ void __init check_bugs(void) { identify_boot_cpu(); #if !defined(CONFIG_SMP) - printk(KERN_INFO "CPU: "); + pr_info("CPU: "); print_cpu_info(&boot_cpu_data); #endif alternative_instructions(); diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 6608c03c2126..1661d8ec9280 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -29,7 +29,7 @@ static void init_c3(struct cpuinfo_x86 *c) rdmsr(MSR_VIA_FCR, lo, hi); lo |= ACE_FCR; /* enable ACE unit */ wrmsr(MSR_VIA_FCR, lo, hi); - printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n"); + pr_info("CPU: Enabled ACE h/w crypto\n"); } /* enable RNG unit, if present and disabled */ @@ -37,7 +37,7 @@ static void init_c3(struct cpuinfo_x86 *c) rdmsr(MSR_VIA_RNG, lo, hi); lo |= RNG_ENABLE; /* enable RNG unit */ wrmsr(MSR_VIA_RNG, lo, hi); - printk(KERN_INFO "CPU: Enabled h/w RNG\n"); + pr_info("CPU: Enabled h/w RNG\n"); } /* store Centaur Extended Feature Flags as @@ -130,7 +130,7 @@ static void init_centaur(struct cpuinfo_x86 *c) name = "C6"; fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK; fcr_clr = DPDC; - printk(KERN_NOTICE "Disabling bugged TSC.\n"); + pr_notice("Disabling bugged TSC.\n"); clear_cpu_cap(c, X86_FEATURE_TSC); break; case 8: @@ -163,11 +163,11 @@ static void init_centaur(struct cpuinfo_x86 *c) newlo = (lo|fcr_set) & (~fcr_clr); if (newlo != lo) { - printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n", + pr_info("Centaur FCR was 0x%X now 0x%X\n", lo, newlo); wrmsr(MSR_IDT_FCR1, newlo, hi); } else { - printk(KERN_INFO "Centaur FCR is 0x%X\n", lo); + pr_info("Centaur FCR is 0x%X\n", lo); } /* Emulate MTRRs using Centaur's MCR. */ set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4fac2634ba19..06ad72383b4e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -244,7 +244,7 @@ static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) lo |= 0x200000; wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - printk(KERN_NOTICE "CPU serial number disabled.\n"); + pr_notice("CPU serial number disabled.\n"); clear_cpu_cap(c, X86_FEATURE_PN); /* Disabling the serial number may affect the cpuid level */ @@ -387,9 +387,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) if (!warn) continue; - printk(KERN_WARNING - "CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n", - x86_cap_flag(df->feature), df->level); + pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n", + x86_cap_flag(df->feature), df->level); } } @@ -568,7 +567,7 @@ void detect_ht(struct cpuinfo_x86 *c) smp_num_siblings = (ebx & 0xff0000) >> 16; if (smp_num_siblings == 1) { - printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n"); + pr_info_once("CPU0: Hyper-Threading is disabled\n"); goto out; } @@ -589,10 +588,10 @@ void detect_ht(struct cpuinfo_x86 *c) out: if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); + pr_info("CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + pr_info("CPU: Processor Core ID: %d\n", + c->cpu_core_id); printed = 1; } #endif @@ -617,9 +616,8 @@ static void get_cpu_vendor(struct cpuinfo_x86 *c) } } - printk_once(KERN_ERR - "CPU: vendor_id '%s' unknown, using generic init.\n" \ - "CPU: Your system may be unstable.\n", v); + pr_err_once("CPU: vendor_id '%s' unknown, using generic init.\n" \ + "CPU: Your system may be unstable.\n", v); c->x86_vendor = X86_VENDOR_UNKNOWN; this_cpu = &default_cpu; @@ -819,7 +817,7 @@ void __init early_cpu_init(void) int count = 0; #ifdef CONFIG_PROCESSOR_SELECT - printk(KERN_INFO "KERNEL supported cpus:\n"); + pr_info("KERNEL supported cpus:\n"); #endif for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { @@ -837,7 +835,7 @@ void __init early_cpu_init(void) for (j = 0; j < 2; j++) { if (!cpudev->c_ident[j]) continue; - printk(KERN_INFO " %s %s\n", cpudev->c_vendor, + pr_info(" %s %s\n", cpudev->c_vendor, cpudev->c_ident[j]); } } @@ -862,6 +860,31 @@ static void detect_nopl(struct cpuinfo_x86 *c) #else set_cpu_cap(c, X86_FEATURE_NOPL); #endif + + /* + * ESPFIX is a strange bug. All real CPUs have it. Paravirt + * systems that run Linux at CPL > 0 may or may not have the + * issue, but, even if they have the issue, there's absolutely + * nothing we can do about it because we can't use the real IRET + * instruction. + * + * NB: For the time being, only 32-bit kernels support + * X86_BUG_ESPFIX as such. 64-bit kernels directly choose + * whether to apply espfix using paravirt hooks. If any + * non-paravirt system ever shows up that does *not* have the + * ESPFIX issue, we can change this. + */ +#ifdef CONFIG_X86_32 +#ifdef CONFIG_PARAVIRT + do { + extern void native_iret(void); + if (pv_cpu_ops.iret == native_iret) + set_cpu_bug(c, X86_BUG_ESPFIX); + } while (0); +#else + set_cpu_bug(c, X86_BUG_ESPFIX); +#endif +#endif } static void generic_identify(struct cpuinfo_x86 *c) @@ -1037,6 +1060,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif + /* The boot/hotplug time assigment got cleared, restore it */ + c->logical_proc_id = topology_phys_to_logical_pkg(c->phys_proc_id); } /* @@ -1121,7 +1146,7 @@ static void __print_cpu_msr(void) for (index = index_min; index < index_max; index++) { if (rdmsrl_safe(index, &val)) continue; - printk(KERN_INFO " MSR%08x: %016llx\n", index, val); + pr_info(" MSR%08x: %016llx\n", index, val); } } } @@ -1160,19 +1185,19 @@ void print_cpu_info(struct cpuinfo_x86 *c) } if (vendor && !strstr(c->x86_model_id, vendor)) - printk(KERN_CONT "%s ", vendor); + pr_cont("%s ", vendor); if (c->x86_model_id[0]) - printk(KERN_CONT "%s", c->x86_model_id); + pr_cont("%s", c->x86_model_id); else - printk(KERN_CONT "%d86", c->x86); + pr_cont("%d86", c->x86); - printk(KERN_CONT " (family: 0x%x, model: 0x%x", c->x86, c->x86_model); + pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model); if (c->x86_mask || c->cpuid_level >= 0) - printk(KERN_CONT ", stepping: 0x%x)\n", c->x86_mask); + pr_cont(", stepping: 0x%x)\n", c->x86_mask); else - printk(KERN_CONT ")\n"); + pr_cont(")\n"); print_cpu_msr(c); } @@ -1498,7 +1523,7 @@ void cpu_init(void) show_ucode_info_early(); - printk(KERN_INFO "Initializing CPU#%d\n", cpu); + pr_info("Initializing CPU#%d\n", cpu); if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 15e47c1cd412..6adef9cac23e 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -104,7 +104,7 @@ static void check_cx686_slop(struct cpuinfo_x86 *c) local_irq_restore(flags); if (ccr5 & 2) { /* possible wrong calibration done */ - printk(KERN_INFO "Recalibrating delay loop with SLOP bit reset\n"); + pr_info("Recalibrating delay loop with SLOP bit reset\n"); calibrate_delay(); c->loops_per_jiffy = loops_per_jiffy; } @@ -116,7 +116,7 @@ static void set_cx86_reorder(void) { u8 ccr3; - printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n"); + pr_info("Enable Memory access reorder on Cyrix/NSC processor.\n"); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ @@ -129,7 +129,7 @@ static void set_cx86_reorder(void) static void set_cx86_memwb(void) { - printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); + pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); /* CCR2 bit 2: unlock NW bit */ setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04); @@ -269,7 +269,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) * VSA1 we work around however. */ - printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n"); + pr_info("Working around Cyrix MediaGX virtual DMA bugs.\n"); isa_dma_bridge_buggy = 2; /* We do this before the PCI layer is running. However we @@ -427,7 +427,7 @@ static void cyrix_identify(struct cpuinfo_x86 *c) if (dir0 == 5 || dir0 == 3) { unsigned char ccr3; unsigned long flags; - printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); + pr_info("Enabling CPUID on Cyrix processor.\n"); local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); /* enable MAPEN */ diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index d820d8eae96b..73d391ae452f 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -56,7 +56,7 @@ detect_hypervisor_vendor(void) } if (max_pri) - printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name); + pr_info("Hypervisor detected: %s\n", x86_hyper->name); } void init_hypervisor(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 9299e3bdfad6..1f7fdb91a818 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -61,7 +61,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) */ if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 && c->microcode < 0x20e) { - printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); + pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n"); clear_cpu_cap(c, X86_FEATURE_PSE); } @@ -140,7 +140,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) { - printk(KERN_INFO "Disabled fast string operations\n"); + pr_info("Disabled fast string operations\n"); setup_clear_cpu_cap(X86_FEATURE_REP_GOOD); setup_clear_cpu_cap(X86_FEATURE_ERMS); } @@ -160,6 +160,19 @@ static void early_init_intel(struct cpuinfo_x86 *c) pr_info("Disabling PGE capability bit\n"); setup_clear_cpu_cap(X86_FEATURE_PGE); } + + if (c->cpuid_level >= 0x00000001) { + u32 eax, ebx, ecx, edx; + + cpuid(0x00000001, &eax, &ebx, &ecx, &edx); + /* + * If HTT (EDX[28]) is set EBX[16:23] contain the number of + * apicids which are reserved per package. Store the resulting + * shift value for the package management code. + */ + if (edx & (1U << 28)) + c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); + } } #ifdef CONFIG_X86_32 @@ -176,7 +189,7 @@ int ppro_with_ram_bug(void) boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_mask < 8) { - printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n"); + pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n"); return 1; } return 0; @@ -225,7 +238,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_F00F); if (!f00f_workaround_enabled) { - printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); + pr_notice("Intel Pentium with F0 0F bug - workaround enabled.\n"); f00f_workaround_enabled = 1; } } @@ -244,7 +257,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * Forcefully enable PAE if kernel parameter "forcepae" is present. */ if (forcepae) { - printk(KERN_WARNING "PAE forced!\n"); + pr_warn("PAE forced!\n"); set_cpu_cap(c, X86_FEATURE_PAE); add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); } diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 341449c49f34..de6626c18e42 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -444,7 +444,7 @@ static ssize_t store_cache_disable(struct cacheinfo *this_leaf, err = amd_set_l3_disable_slot(nb, cpu, slot, val); if (err) { if (err == -EEXIST) - pr_warning("L3 slot %d in use/index already disabled!\n", + pr_warn("L3 slot %d in use/index already disabled!\n", slot); return err; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 4cfba4371a71..517619ea6498 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -115,7 +115,7 @@ static int raise_local(void) int cpu = m->extcpu; if (m->inject_flags & MCJ_EXCEPTION) { - printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); + pr_info("Triggering MCE exception on CPU %d\n", cpu); switch (context) { case MCJ_CTX_IRQ: /* @@ -128,15 +128,15 @@ static int raise_local(void) raise_exception(m, NULL); break; default: - printk(KERN_INFO "Invalid MCE context\n"); + pr_info("Invalid MCE context\n"); ret = -EINVAL; } - printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); + pr_info("MCE exception done on CPU %d\n", cpu); } else if (m->status) { - printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); + pr_info("Starting machine check poll CPU %d\n", cpu); raise_poll(m); mce_notify_irq(); - printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); + pr_info("Machine check poll done on CPU %d\n", cpu); } else m->finished = 0; @@ -183,8 +183,7 @@ static void raise_mce(struct mce *m) start = jiffies; while (!cpumask_empty(mce_inject_cpumask)) { if (!time_before(jiffies, start + 2*HZ)) { - printk(KERN_ERR - "Timeout waiting for mce inject %lx\n", + pr_err("Timeout waiting for mce inject %lx\n", *cpumask_bits(mce_inject_cpumask)); break; } @@ -241,7 +240,7 @@ static int inject_init(void) { if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; - printk(KERN_INFO "Machine check injector initialized\n"); + pr_info("Machine check injector initialized\n"); register_mce_write_callback(mce_write); register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 9c682c222071..5119766d9889 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/debugfs.h> #include <asm/mce.h> +#include <asm/uaccess.h> #include "mce-internal.h" @@ -29,7 +30,7 @@ * panic situations) */ -enum context { IN_KERNEL = 1, IN_USER = 2 }; +enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 }; enum ser { SER_REQUIRED = 1, NO_SER = 2 }; enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; @@ -48,6 +49,7 @@ static struct severity { #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } #define KERNEL .context = IN_KERNEL #define USER .context = IN_USER +#define KERNEL_RECOV .context = IN_KERNEL_RECOV #define SER .ser = SER_REQUIRED #define NOSER .ser = NO_SER #define EXCP .excp = EXCP_CONTEXT @@ -87,6 +89,10 @@ static struct severity { EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) ), MCESEV( + PANIC, "In kernel and no restart IP", + EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( DEFERRED, "Deferred error", NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) ), @@ -123,6 +129,11 @@ static struct severity { MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV) ), MCESEV( + AR, "Action required: data load in error recoverable area of kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + KERNEL_RECOV + ), + MCESEV( AR, "Action required: data load error in a user process", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), USER @@ -170,6 +181,9 @@ static struct severity { ) /* always matches. keep at end */ }; +#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \ + (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) + /* * If mcgstatus indicated that ip/cs on the stack were * no good, then "m->cs" will be zero and we will have @@ -183,7 +197,11 @@ static struct severity { */ static int error_context(struct mce *m) { - return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; + if ((m->cs & 3) == 3) + return IN_USER; + if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) + return IN_KERNEL_RECOV; + return IN_KERNEL; } /* diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a006f4cd792b..f0c921b03e42 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -961,6 +961,20 @@ static void mce_clear_state(unsigned long *toclear) } } +static int do_memory_failure(struct mce *m) +{ + int flags = MF_ACTION_REQUIRED; + int ret; + + pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); + if (!(m->mcgstatus & MCG_STATUS_RIPV)) + flags |= MF_MUST_KILL; + ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags); + if (ret) + pr_err("Memory error not recovered"); + return ret; +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -998,8 +1012,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(toclear, MAX_NR_BANKS); DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; - u64 recover_paddr = ~0ull; - int flags = MF_ACTION_REQUIRED; int lmce = 0; /* If this CPU is offline, just bail out. */ @@ -1136,22 +1148,13 @@ void do_machine_check(struct pt_regs *regs, long error_code) } /* - * At insane "tolerant" levels we take no action. Otherwise - * we only die if we have no other choice. For less serious - * issues we try to recover, or limit damage to the current - * process. + * If tolerant is at an insane level we drop requests to kill + * processes and continue even when there is no way out. */ - if (cfg->tolerant < 3) { - if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); - if (worst == MCE_AR_SEVERITY) { - recover_paddr = m.addr; - if (!(m.mcgstatus & MCG_STATUS_RIPV)) - flags |= MF_MUST_KILL; - } else if (kill_it) { - force_sig(SIGBUS, current); - } - } + if (cfg->tolerant == 3) + kill_it = 0; + else if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); if (worst > 0) mce_report_event(regs); @@ -1159,25 +1162,24 @@ void do_machine_check(struct pt_regs *regs, long error_code) out: sync_core(); - if (recover_paddr == ~0ull) - goto done; + if (worst != MCE_AR_SEVERITY && !kill_it) + goto out_ist; - pr_err("Uncorrected hardware memory error in user-access at %llx", - recover_paddr); - /* - * We must call memory_failure() here even if the current process is - * doomed. We still need to mark the page as poisoned and alert any - * other users of the page. - */ - ist_begin_non_atomic(regs); - local_irq_enable(); - if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) { - pr_err("Memory error not recovered"); - force_sig(SIGBUS, current); + /* Fault was in user mode and we need to take some action */ + if ((m.cs & 3) == 3) { + ist_begin_non_atomic(regs); + local_irq_enable(); + + if (kill_it || do_memory_failure(&m)) + force_sig(SIGBUS, current); + local_irq_disable(); + ist_end_non_atomic(); + } else { + if (!fixup_exception(regs, X86_TRAP_MC)) + mce_panic("Failed kernel mode recovery", &m, NULL); } - local_irq_disable(); - ist_end_non_atomic(); -done: + +out_ist: ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1576,6 +1578,17 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model == 45) quirk_no_way_out = quirk_sandybridge_ifu; + /* + * MCG_CAP.MCG_SER_P is necessary but not sufficient to know + * whether this processor will actually generate recoverable + * machine checks. Check to see if this is an E7 model Xeon. + * We can't do a model number check because E5 and E7 use the + * same model number. E5 doesn't support recovery, E7 does. + */ + if (mca_cfg.recovery || (mca_cfg.ser && + !strncmp(c->x86_model_id, + "Intel(R) Xeon(R) CPU E7-", 24))) + set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY); } if (cfg->monarch_timeout < 0) cfg->monarch_timeout = 0; @@ -1617,10 +1630,10 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) case X86_VENDOR_AMD: { u32 ebx = cpuid_ebx(0x80000007); - mce_amd_feature_init(c); mce_flags.overflow_recov = !!(ebx & BIT(0)); mce_flags.succor = !!(ebx & BIT(1)); mce_flags.smca = !!(ebx & BIT(3)); + mce_amd_feature_init(c); break; } @@ -2028,6 +2041,8 @@ static int __init mcheck_enable(char *str) cfg->bootlog = (str[0] == 'b'); else if (!strcmp(str, "bios_cmci_threshold")) cfg->bios_cmci_threshold = true; + else if (!strcmp(str, "recovery")) + cfg->recovery = true; else if (isdigit(str[0])) { if (get_option(&str, &cfg->tolerant) == 2) get_option(&str, &(cfg->monarch_timeout)); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index e99b15077e94..9d656fd436ef 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,5 +1,5 @@ /* - * (c) 2005-2015 Advanced Micro Devices, Inc. + * (c) 2005-2016 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html @@ -28,7 +28,7 @@ #include <asm/msr.h> #include <asm/trace/irq_vectors.h> -#define NR_BLOCKS 9 +#define NR_BLOCKS 5 #define THRESHOLD_MAX 0xFFF #define INT_TYPE_APIC 0x00020000 #define MASK_VALID_HI 0x80000000 @@ -49,6 +49,19 @@ #define DEF_LVT_OFF 0x2 #define DEF_INT_TYPE_APIC 0x2 +/* Scalable MCA: */ + +/* Threshold LVT offset is at MSR0xC0000410[15:12] */ +#define SMCA_THR_LVT_OFF 0xF000 + +/* + * OS is required to set the MCAX bit to acknowledge that it is now using the + * new MSR ranges and new registers under each bank. It also means that the OS + * will configure deferred errors in the new MCx_CONFIG register. If the bit is + * not set, uncorrectable errors will cause a system panic. + */ +#define SMCA_MCAX_EN_OFF 0x1 + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -58,6 +71,35 @@ static const char * const th_names[] = { "execution_unit", }; +/* Define HWID to IP type mappings for Scalable MCA */ +struct amd_hwid amd_hwids[] = { + [SMCA_F17H_CORE] = { "f17h_core", 0xB0 }, + [SMCA_DF] = { "data_fabric", 0x2E }, + [SMCA_UMC] = { "umc", 0x96 }, + [SMCA_PB] = { "param_block", 0x5 }, + [SMCA_PSP] = { "psp", 0xFF }, + [SMCA_SMU] = { "smu", 0x1 }, +}; +EXPORT_SYMBOL_GPL(amd_hwids); + +const char * const amd_core_mcablock_names[] = { + [SMCA_LS] = "load_store", + [SMCA_IF] = "insn_fetch", + [SMCA_L2_CACHE] = "l2_cache", + [SMCA_DE] = "decode_unit", + [RES] = "", + [SMCA_EX] = "execution_unit", + [SMCA_FP] = "floating_point", + [SMCA_L3_CACHE] = "l3_cache", +}; +EXPORT_SYMBOL_GPL(amd_core_mcablock_names); + +const char * const amd_df_mcablock_names[] = { + [SMCA_CS] = "coherent_slave", + [SMCA_PIE] = "pie", +}; +EXPORT_SYMBOL_GPL(amd_df_mcablock_names); + static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ @@ -84,6 +126,13 @@ struct thresh_restart { static inline bool is_shared_bank(int bank) { + /* + * Scalable MCA provides for only one core to have access to the MSRs of + * a shared bank. + */ + if (mce_flags.smca) + return false; + /* Bank 4 is for northbridge reporting and is thus shared */ return (bank == 4); } @@ -135,6 +184,14 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) } if (apic != msr) { + /* + * On SMCA CPUs, LVT offset is programmed at a different MSR, and + * the BIOS provides the value. The original field where LVT offset + * was set is reserved. Return early here: + */ + if (mce_flags.smca) + return 0; + pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, apic, b->bank, b->block, b->address, hi, lo); @@ -144,10 +201,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) return 1; }; -/* - * Called via smp_call_function_single(), must be called with correct - * cpu affinity. - */ +/* Reprogram MCx_MISC MSR behind this threshold bank. */ static void threshold_restart_bank(void *_tr) { struct thresh_restart *tr = _tr; @@ -247,27 +301,116 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } +static u32 get_block_address(u32 current_addr, u32 low, u32 high, + unsigned int bank, unsigned int block) +{ + u32 addr = 0, offset = 0; + + if (mce_flags.smca) { + if (!block) { + addr = MSR_AMD64_SMCA_MCx_MISC(bank); + } else { + /* + * For SMCA enabled processors, BLKPTR field of the + * first MISC register (MCx_MISC0) indicates presence of + * additional MISC register set (MISC1-4). + */ + u32 low, high; + + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + return addr; + + if (!(low & MCI_CONFIG_MCAX)) + return addr; + + if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && + (low & MASK_BLKPTR_LO)) + addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + } + return addr; + } + + /* Fall back to method we used for older processors: */ + switch (block) { + case 0: + addr = MSR_IA32_MCx_MISC(bank); + break; + case 1: + offset = ((low & MASK_BLKPTR_LO) >> 21); + if (offset) + addr = MCG_XBLK_ADDR + offset; + break; + default: + addr = ++current_addr; + } + return addr; +} + +static int +prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, + int offset, u32 misc_high) +{ + unsigned int cpu = smp_processor_id(); + struct threshold_block b; + int new; + + if (!block) + per_cpu(bank_map, cpu) |= (1 << bank); + + memset(&b, 0, sizeof(b)); + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = addr; + b.interrupt_capable = lvt_interrupt_supported(bank, misc_high); + + if (!b.interrupt_capable) + goto done; + + b.interrupt_enable = 1; + + if (mce_flags.smca) { + u32 smca_low, smca_high; + u32 smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank); + + if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) { + smca_high |= SMCA_MCAX_EN_OFF; + wrmsr(smca_addr, smca_low, smca_high); + } + + /* Gather LVT offset for thresholding: */ + if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) + goto out; + + new = (smca_low & SMCA_THR_LVT_OFF) >> 12; + } else { + new = (misc_high & MASK_LVTOFF_HI) >> 20; + } + + offset = setup_APIC_mce_threshold(offset, new); + + if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt)) + mce_threshold_vector = amd_threshold_interrupt; + +done: + mce_threshold_block_init(&b, offset); + +out: + return offset; +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { - struct threshold_block b; - unsigned int cpu = smp_processor_id(); u32 low = 0, high = 0, address = 0; unsigned int bank, block; - int offset = -1, new; + int offset = -1; for (bank = 0; bank < mca_cfg.banks; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) - address = MSR_IA32_MCx_MISC(bank); - else if (block == 1) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - break; - - address += MCG_XBLK_ADDR; - } else - ++address; + address = get_block_address(address, low, high, bank, block); + if (!address) + break; if (rdmsr_safe(address, &low, &high)) break; @@ -279,29 +422,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) (high & MASK_LOCKED_HI)) continue; - if (!block) - per_cpu(bank_map, cpu) |= (1 << bank); - - memset(&b, 0, sizeof(b)); - b.cpu = cpu; - b.bank = bank; - b.block = block; - b.address = address; - b.interrupt_capable = lvt_interrupt_supported(bank, high); - - if (!b.interrupt_capable) - goto init; - - b.interrupt_enable = 1; - new = (high & MASK_LVTOFF_HI) >> 20; - offset = setup_APIC_mce_threshold(offset, new); - - if ((offset == new) && - (mce_threshold_vector != amd_threshold_interrupt)) - mce_threshold_vector = amd_threshold_interrupt; - -init: - mce_threshold_block_init(&b, offset); + offset = prepare_threshold_block(bank, block, address, offset, high); } } @@ -394,16 +515,9 @@ static void amd_threshold_interrupt(void) if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) { - address = MSR_IA32_MCx_MISC(bank); - } else if (block == 1) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - break; - address += MCG_XBLK_ADDR; - } else { - ++address; - } + address = get_block_address(address, low, high, bank, block); + if (!address) + break; if (rdmsr_safe(address, &low, &high)) break; @@ -623,16 +737,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, if (err) goto out_free; recurse: - if (!block) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - return 0; - address += MCG_XBLK_ADDR; - } else { - ++address; - } + address = get_block_address(address, low, high, bank, ++block); + if (!address) + return 0; - err = allocate_threshold_blocks(cpu, bank, ++block, address); + err = allocate_threshold_blocks(cpu, bank, block, address); if (err) goto out_free; diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 12402e10aeff..2a0717bf8033 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -26,14 +26,12 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); - printk(KERN_EMERG - "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", - smp_processor_id(), loaddr, lotype); + pr_emerg("CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", + smp_processor_id(), loaddr, lotype); if (lotype & (1<<5)) { - printk(KERN_EMERG - "CPU#%d: Possible thermal failure (CPU on fire ?).\n", - smp_processor_id()); + pr_emerg("CPU#%d: Possible thermal failure (CPU on fire ?).\n", + smp_processor_id()); } add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); @@ -61,12 +59,10 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) /* Read registers before enabling: */ rdmsr(MSR_IA32_P5_MC_ADDR, l, h); rdmsr(MSR_IA32_P5_MC_TYPE, l, h); - printk(KERN_INFO - "Intel old style machine check architecture supported.\n"); + pr_info("Intel old style machine check architecture supported.\n"); /* Enable MCE: */ cr4_set_bits(X86_CR4_MCE); - printk(KERN_INFO - "Intel old style machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); + pr_info("Intel old style machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 2c5aaf8c2e2f..0b445c2ff735 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -190,7 +190,7 @@ static int therm_throt_process(bool new_event, int event, int level) /* if we just entered the thermal event */ if (new_event) { if (event == THERMAL_THROTTLING_EVENT) - printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", + pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, level == CORE_LEVEL ? "Core" : "Package", state->count); @@ -198,8 +198,7 @@ static int therm_throt_process(bool new_event, int event, int level) } if (old_event) { if (event == THERMAL_THROTTLING_EVENT) - printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", - this_cpu, + pr_info("CPU%d: %s temperature/speed normal\n", this_cpu, level == CORE_LEVEL ? "Core" : "Package"); return 1; } @@ -417,8 +416,8 @@ static void intel_thermal_interrupt(void) static void unexpected_thermal_interrupt(void) { - printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", - smp_processor_id()); + pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", + smp_processor_id()); } static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; @@ -499,7 +498,7 @@ void intel_init_thermal(struct cpuinfo_x86 *c) if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { if (system_state == SYSTEM_BOOTING) - printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); + pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu); return; } @@ -557,8 +556,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c) l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n", - tm2 ? "TM2" : "TM1"); + pr_info_once("CPU0: Thermal monitoring enabled (%s)\n", + tm2 ? "TM2" : "TM1"); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 7245980186ee..fcf9ae9384f4 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -12,8 +12,8 @@ static void default_threshold_interrupt(void) { - printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n", - THRESHOLD_APIC_VECTOR); + pr_err("Unexpected threshold interrupt at vector %x\n", + THRESHOLD_APIC_VECTOR); } void (*mce_threshold_vector)(void) = default_threshold_interrupt; diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 01dd8702880b..c6a722e1d011 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -17,7 +17,7 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code) { ist_enter(regs); - printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); + pr_emerg("CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); ist_exit(regs); @@ -39,6 +39,5 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c) cr4_set_bits(X86_CR4_MCE); - printk(KERN_INFO - "Winchip machine check reporting enabled on CPU#0.\n"); + pr_info("Winchip machine check reporting enabled on CPU#0.\n"); } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 2233f8a76615..8581963894c7 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -431,10 +431,6 @@ int __init save_microcode_in_initrd_amd(void) else container = cont_va; - if (ucode_new_rev) - pr_info("microcode: updated early to new patch_level=0x%08x\n", - ucode_new_rev); - eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); @@ -469,8 +465,7 @@ void reload_ucode_amd(void) if (mc && rev < mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { ucode_new_rev = mc->hdr.patch_id; - pr_info("microcode: reload patch_level=0x%08x\n", - ucode_new_rev); + pr_info("reload patch_level=0x%08x\n", ucode_new_rev); } } } @@ -793,15 +788,13 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover) return -EINVAL; } - patch->data = kzalloc(patch_size, GFP_KERNEL); + patch->data = kmemdup(fw + SECTION_HDR_SIZE, patch_size, GFP_KERNEL); if (!patch->data) { pr_err("Patch data allocation failure.\n"); kfree(patch); return -EINVAL; } - /* All looks ok, copy patch... */ - memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size); INIT_LIST_HEAD(&patch->plist); patch->patch_id = mc_hdr->patch_id; patch->equiv_cpu = proc_id; @@ -953,10 +946,14 @@ struct microcode_ops * __init init_amd_microcode(void) struct cpuinfo_x86 *c = &boot_cpu_data; if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - pr_warning("AMD CPU family 0x%x not supported\n", c->x86); + pr_warn("AMD CPU family 0x%x not supported\n", c->x86); return NULL; } + if (ucode_new_rev) + pr_info_once("microcode updated early to new patch_level=0x%08x\n", + ucode_new_rev); + return µcode_amd_ops; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index faec7120c508..ac360bfbbdb6 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -43,16 +43,8 @@ #define MICROCODE_VERSION "2.01" static struct microcode_ops *microcode_ops; - static bool dis_ucode_ldr; -static int __init disable_loader(char *str) -{ - dis_ucode_ldr = true; - return 1; -} -__setup("dis_ucode_ldr", disable_loader); - /* * Synchronization. * @@ -81,15 +73,16 @@ struct cpu_info_ctx { static bool __init check_loader_disabled_bsp(void) { + static const char *__dis_opt_str = "dis_ucode_ldr"; + #ifdef CONFIG_X86_32 const char *cmdline = (const char *)__pa_nodebug(boot_command_line); - const char *opt = "dis_ucode_ldr"; - const char *option = (const char *)__pa_nodebug(opt); + const char *option = (const char *)__pa_nodebug(__dis_opt_str); bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); #else /* CONFIG_X86_64 */ const char *cmdline = boot_command_line; - const char *option = "dis_ucode_ldr"; + const char *option = __dis_opt_str; bool *res = &dis_ucode_ldr; #endif @@ -479,7 +472,7 @@ static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) enum ucode_state ustate; struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - if (uci && uci->valid) + if (uci->valid) return UCODE_OK; if (collect_cpu_info(cpu)) @@ -630,7 +623,7 @@ int __init microcode_init(void) struct cpuinfo_x86 *c = &boot_cpu_data; int error; - if (paravirt_enabled() || dis_ucode_ldr) + if (dis_ucode_ldr) return -EINVAL; if (c->x86_vendor == X86_VENDOR_INTEL) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index ee81c544ee0d..cbb3cf09b065 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -39,9 +39,15 @@ #include <asm/setup.h> #include <asm/msr.h> -static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; +/* + * Temporary microcode blobs pointers storage. We note here the pointers to + * microcode blobs we've got from whatever storage (detached initrd, builtin). + * Later on, we put those into final storage mc_saved_data.mc_saved. + */ +static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT]; + static struct mc_saved_data { - unsigned int mc_saved_count; + unsigned int num_saved; struct microcode_intel **mc_saved; } mc_saved_data; @@ -78,53 +84,50 @@ load_microcode_early(struct microcode_intel **saved, } static inline void -copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, - unsigned long off, int num_saved) +copy_ptrs(struct microcode_intel **mc_saved, unsigned long *mc_ptrs, + unsigned long off, int num_saved) { int i; for (i = 0; i < num_saved; i++) - mc_saved[i] = (struct microcode_intel *)(initrd[i] + off); + mc_saved[i] = (struct microcode_intel *)(mc_ptrs[i] + off); } #ifdef CONFIG_X86_32 static void -microcode_phys(struct microcode_intel **mc_saved_tmp, - struct mc_saved_data *mc_saved_data) +microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs) { int i; struct microcode_intel ***mc_saved; - mc_saved = (struct microcode_intel ***) - __pa_nodebug(&mc_saved_data->mc_saved); - for (i = 0; i < mc_saved_data->mc_saved_count; i++) { + mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved); + + for (i = 0; i < mcs->num_saved; i++) { struct microcode_intel *p; - p = *(struct microcode_intel **) - __pa_nodebug(mc_saved_data->mc_saved + i); + p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i); mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); } } #endif static enum ucode_state -load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, - unsigned long initrd_start, struct ucode_cpu_info *uci) +load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, + unsigned long offset, struct ucode_cpu_info *uci) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int count = mc_saved_data->mc_saved_count; + unsigned int count = mcs->num_saved; - if (!mc_saved_data->mc_saved) { - copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); + if (!mcs->mc_saved) { + copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count); return load_microcode_early(mc_saved_tmp, count, uci); } else { #ifdef CONFIG_X86_32 - microcode_phys(mc_saved_tmp, mc_saved_data); + microcode_phys(mc_saved_tmp, mcs); return load_microcode_early(mc_saved_tmp, count, uci); #else - return load_microcode_early(mc_saved_data->mc_saved, - count, uci); + return load_microcode_early(mcs->mc_saved, count, uci); #endif } } @@ -175,25 +178,25 @@ matching_model_microcode(struct microcode_header_intel *mc_header, } static int -save_microcode(struct mc_saved_data *mc_saved_data, +save_microcode(struct mc_saved_data *mcs, struct microcode_intel **mc_saved_src, - unsigned int mc_saved_count) + unsigned int num_saved) { int i, j; struct microcode_intel **saved_ptr; int ret; - if (!mc_saved_count) + if (!num_saved) return -EINVAL; /* * Copy new microcode data. */ - saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL); + saved_ptr = kcalloc(num_saved, sizeof(struct microcode_intel *), GFP_KERNEL); if (!saved_ptr) return -ENOMEM; - for (i = 0; i < mc_saved_count; i++) { + for (i = 0; i < num_saved; i++) { struct microcode_header_intel *mc_hdr; struct microcode_intel *mc; unsigned long size; @@ -207,20 +210,18 @@ save_microcode(struct mc_saved_data *mc_saved_data, mc_hdr = &mc->hdr; size = get_totalsize(mc_hdr); - saved_ptr[i] = kmalloc(size, GFP_KERNEL); + saved_ptr[i] = kmemdup(mc, size, GFP_KERNEL); if (!saved_ptr[i]) { ret = -ENOMEM; goto err; } - - memcpy(saved_ptr[i], mc, size); } /* * Point to newly saved microcode. */ - mc_saved_data->mc_saved = saved_ptr; - mc_saved_data->mc_saved_count = mc_saved_count; + mcs->mc_saved = saved_ptr; + mcs->num_saved = num_saved; return 0; @@ -284,22 +285,20 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved, * BSP can stay in the platform. */ static enum ucode_state __init -get_matching_model_microcode(int cpu, unsigned long start, - void *data, size_t size, - struct mc_saved_data *mc_saved_data, - unsigned long *mc_saved_in_initrd, +get_matching_model_microcode(unsigned long start, void *data, size_t size, + struct mc_saved_data *mcs, unsigned long *mc_ptrs, struct ucode_cpu_info *uci) { - u8 *ucode_ptr = data; - unsigned int leftover = size; + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + struct microcode_header_intel *mc_header; + unsigned int num_saved = mcs->num_saved; enum ucode_state state = UCODE_OK; + unsigned int leftover = size; + u8 *ucode_ptr = data; unsigned int mc_size; - struct microcode_header_intel *mc_header; - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count = mc_saved_data->mc_saved_count; int i; - while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { + while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) { if (leftover < sizeof(mc_header)) break; @@ -318,32 +317,31 @@ get_matching_model_microcode(int cpu, unsigned long start, * the platform, we need to find and save microcode patches * with the same family and model as the BSP. */ - if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != - UCODE_OK) { + if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != UCODE_OK) { ucode_ptr += mc_size; continue; } - mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count); + num_saved = _save_mc(mc_saved_tmp, ucode_ptr, num_saved); ucode_ptr += mc_size; } if (leftover) { state = UCODE_ERROR; - goto out; + return state; } - if (mc_saved_count == 0) { + if (!num_saved) { state = UCODE_NFOUND; - goto out; + return state; } - for (i = 0; i < mc_saved_count; i++) - mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; + for (i = 0; i < num_saved; i++) + mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start; + + mcs->num_saved = num_saved; - mc_saved_data->mc_saved_count = mc_saved_count; -out: return state; } @@ -373,7 +371,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); csig.pf = 1 << ((val[1] >> 18) & 7); } - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + native_wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); @@ -396,11 +394,11 @@ static void show_saved_mc(void) unsigned int sig, pf, rev, total_size, data_size, date; struct ucode_cpu_info uci; - if (mc_saved_data.mc_saved_count == 0) { + if (!mc_saved_data.num_saved) { pr_debug("no microcode data saved.\n"); return; } - pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); + pr_debug("Total microcode saved: %d\n", mc_saved_data.num_saved); collect_cpu_info_early(&uci); @@ -409,7 +407,7 @@ static void show_saved_mc(void) rev = uci.cpu_sig.rev; pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); - for (i = 0; i < mc_saved_data.mc_saved_count; i++) { + for (i = 0; i < mc_saved_data.num_saved; i++) { struct microcode_header_intel *mc_saved_header; struct extended_sigtable *ext_header; int ext_sigcount; @@ -465,7 +463,7 @@ int save_mc_for_early(u8 *mc) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; unsigned int mc_saved_count_init; - unsigned int mc_saved_count; + unsigned int num_saved; struct microcode_intel **mc_saved; int ret = 0; int i; @@ -476,23 +474,23 @@ int save_mc_for_early(u8 *mc) */ mutex_lock(&x86_cpu_microcode_mutex); - mc_saved_count_init = mc_saved_data.mc_saved_count; - mc_saved_count = mc_saved_data.mc_saved_count; + mc_saved_count_init = mc_saved_data.num_saved; + num_saved = mc_saved_data.num_saved; mc_saved = mc_saved_data.mc_saved; - if (mc_saved && mc_saved_count) + if (mc_saved && num_saved) memcpy(mc_saved_tmp, mc_saved, - mc_saved_count * sizeof(struct microcode_intel *)); + num_saved * sizeof(struct microcode_intel *)); /* * Save the microcode patch mc in mc_save_tmp structure if it's a newer * version. */ - mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count); + num_saved = _save_mc(mc_saved_tmp, mc, num_saved); /* * Save the mc_save_tmp in global mc_saved_data. */ - ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); + ret = save_microcode(&mc_saved_data, mc_saved_tmp, num_saved); if (ret) { pr_err("Cannot save microcode patch.\n"); goto out; @@ -536,7 +534,7 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp) static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; static __init enum ucode_state -scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, +scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, unsigned long start, unsigned long size, struct ucode_cpu_info *uci) { @@ -551,14 +549,18 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, cd.data = NULL; cd.size = 0; - cd = find_cpio_data(p, (void *)start, size, &offset); - if (!cd.data) { + /* try built-in microcode if no initrd */ + if (!size) { if (!load_builtin_intel_microcode(&cd)) return UCODE_ERROR; + } else { + cd = find_cpio_data(p, (void *)start, size, &offset); + if (!cd.data) + return UCODE_ERROR; } - return get_matching_model_microcode(0, start, cd.data, cd.size, - mc_saved_data, initrd, uci); + return get_matching_model_microcode(start, cd.data, cd.size, + mcs, mc_ptrs, uci); } /* @@ -567,14 +569,11 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, static void print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) { - int cpu = smp_processor_id(); - - pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", - cpu, - uci->cpu_sig.rev, - date & 0xffff, - date >> 24, - (date >> 16) & 0xff); + pr_info_once("microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", + uci->cpu_sig.rev, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); } #ifdef CONFIG_X86_32 @@ -603,19 +602,19 @@ void show_ucode_info_early(void) */ static void print_ucode(struct ucode_cpu_info *uci) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; int *delay_ucode_info_p; int *current_mc_date_p; - mc_intel = uci->mc; - if (mc_intel == NULL) + mc = uci->mc; + if (!mc) return; delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); *delay_ucode_info_p = 1; - *current_mc_date_p = mc_intel->hdr.date; + *current_mc_date_p = mc->hdr.date; } #else @@ -630,37 +629,35 @@ static inline void flush_tlb_early(void) static inline void print_ucode(struct ucode_cpu_info *uci) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; - mc_intel = uci->mc; - if (mc_intel == NULL) + mc = uci->mc; + if (!mc) return; - print_ucode_info(uci, mc_intel->hdr.date); + print_ucode_info(uci, mc->hdr.date); } #endif static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; unsigned int val[2]; - mc_intel = uci->mc; - if (mc_intel == NULL) + mc = uci->mc; + if (!mc) return 0; /* write microcode via MSR 0x79 */ - native_wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc_intel->bits, - (unsigned long) mc_intel->bits >> 16 >> 16); - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + native_wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); /* get the current revision from MSR 0x8B */ native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc_intel->hdr.rev) + if (val[1] != mc->hdr.rev) return -1; #ifdef CONFIG_X86_64 @@ -672,25 +669,26 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) if (early) print_ucode(uci); else - print_ucode_info(uci, mc_intel->hdr.date); + print_ucode_info(uci, mc->hdr.date); return 0; } /* * This function converts microcode patch offsets previously stored in - * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. + * mc_tmp_ptrs to pointers and stores the pointers in mc_saved_data. */ int __init save_microcode_in_initrd_intel(void) { - unsigned int count = mc_saved_data.mc_saved_count; + unsigned int count = mc_saved_data.num_saved; struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; int ret = 0; - if (count == 0) + if (!count) return ret; - copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count); + copy_ptrs(mc_saved, mc_tmp_ptrs, get_initrd_start(), count); + ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) pr_err("Cannot save microcode patches from initrd.\n"); @@ -701,8 +699,7 @@ int __init save_microcode_in_initrd_intel(void) } static void __init -_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, - unsigned long *initrd, +_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs, unsigned long start, unsigned long size) { struct ucode_cpu_info uci; @@ -710,11 +707,11 @@ _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, collect_cpu_info_early(&uci); - ret = scan_microcode(mc_saved_data, initrd, start, size, &uci); + ret = scan_microcode(mcs, mc_ptrs, start, size, &uci); if (ret != UCODE_OK) return; - ret = load_microcode(mc_saved_data, initrd, start, &uci); + ret = load_microcode(mcs, mc_ptrs, start, &uci); if (ret != UCODE_OK) return; @@ -728,53 +725,49 @@ void __init load_ucode_intel_bsp(void) struct boot_params *p; p = (struct boot_params *)__pa_nodebug(&boot_params); - start = p->hdr.ramdisk_image; size = p->hdr.ramdisk_size; - _load_ucode_intel_bsp( - (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), - start, size); + /* + * Set start only if we have an initrd image. We cannot use initrd_start + * because it is not set that early yet. + */ + start = (size ? p->hdr.ramdisk_image : 0); + + _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data), + (unsigned long *)__pa_nodebug(&mc_tmp_ptrs), + start, size); #else - start = boot_params.hdr.ramdisk_image + PAGE_OFFSET; size = boot_params.hdr.ramdisk_size; + start = (size ? boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0); - _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); + _load_ucode_intel_bsp(&mc_saved_data, mc_tmp_ptrs, start, size); #endif } void load_ucode_intel_ap(void) { - struct mc_saved_data *mc_saved_data_p; + unsigned long *mcs_tmp_p; + struct mc_saved_data *mcs_p; struct ucode_cpu_info uci; - unsigned long *mc_saved_in_initrd_p; - unsigned long initrd_start_addr; enum ucode_state ret; #ifdef CONFIG_X86_32 - unsigned long *initrd_start_p; - mc_saved_in_initrd_p = - (unsigned long *)__pa_nodebug(mc_saved_in_initrd); - mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); - initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); + mcs_tmp_p = (unsigned long *)__pa_nodebug(mc_tmp_ptrs); + mcs_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); #else - mc_saved_data_p = &mc_saved_data; - mc_saved_in_initrd_p = mc_saved_in_initrd; - initrd_start_addr = initrd_start; + mcs_tmp_p = mc_tmp_ptrs; + mcs_p = &mc_saved_data; #endif /* * If there is no valid ucode previously saved in memory, no need to * update ucode on this AP. */ - if (mc_saved_data_p->mc_saved_count == 0) + if (!mcs_p->num_saved) return; collect_cpu_info_early(&uci); - ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, - initrd_start_addr, &uci); - + ret = load_microcode(mcs_p, mcs_tmp_p, get_initrd_start_addr(), &uci); if (ret != UCODE_OK) return; @@ -786,13 +779,13 @@ void reload_ucode_intel(void) struct ucode_cpu_info uci; enum ucode_state ret; - if (!mc_saved_data.mc_saved_count) + if (!mc_saved_data.num_saved) return; collect_cpu_info_early(&uci); ret = load_microcode_early(mc_saved_data.mc_saved, - mc_saved_data.mc_saved_count, &uci); + mc_saved_data.num_saved, &uci); if (ret != UCODE_OK) return; @@ -825,7 +818,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) * return 0 - no update found * return 1 - found update */ -static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) +static int get_matching_mc(struct microcode_intel *mc, int cpu) { struct cpu_signature cpu_sig; unsigned int csig, cpf, crev; @@ -836,39 +829,36 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) cpf = cpu_sig.pf; crev = cpu_sig.rev; - return has_newer_microcode(mc_intel, csig, cpf, crev); + return has_newer_microcode(mc, csig, cpf, crev); } static int apply_microcode_intel(int cpu) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; struct ucode_cpu_info *uci; + struct cpuinfo_x86 *c; unsigned int val[2]; - int cpu_num = raw_smp_processor_id(); - struct cpuinfo_x86 *c = &cpu_data(cpu_num); - - uci = ucode_cpu_info + cpu; - mc_intel = uci->mc; /* We should bind the task to the CPU */ - BUG_ON(cpu_num != cpu); + if (WARN_ON(raw_smp_processor_id() != cpu)) + return -1; - if (mc_intel == NULL) + uci = ucode_cpu_info + cpu; + mc = uci->mc; + if (!mc) return 0; /* * Microcode on this CPU could be updated earlier. Only apply the - * microcode patch in mc_intel when it is newer than the one on this + * microcode patch in mc when it is newer than the one on this * CPU. */ - if (get_matching_mc(mc_intel, cpu) == 0) + if (!get_matching_mc(mc, cpu)) return 0; /* write microcode via MSR 0x79 */ - wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc_intel->bits, - (unsigned long) mc_intel->bits >> 16 >> 16); - wrmsr(MSR_IA32_UCODE_REV, 0, 0); + wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); @@ -876,16 +866,19 @@ static int apply_microcode_intel(int cpu) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc_intel->hdr.rev) { + if (val[1] != mc->hdr.rev) { pr_err("CPU%d update to revision 0x%x failed\n", - cpu_num, mc_intel->hdr.rev); + cpu, mc->hdr.rev); return -1; } + pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", - cpu_num, val[1], - mc_intel->hdr.date & 0xffff, - mc_intel->hdr.date >> 24, - (mc_intel->hdr.date >> 16) & 0xff); + cpu, val[1], + mc->hdr.date & 0xffff, + mc->hdr.date >> 24, + (mc->hdr.date >> 16) & 0xff); + + c = &cpu_data(cpu); uci->cpu_sig.rev = val[1]; c->microcode = val[1]; diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index b96896bcbdaf..2ce1a7dc45b7 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -49,7 +49,7 @@ int microcode_sanity_check(void *mc, int print_err) unsigned long total_size, data_size, ext_table_size; struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header = NULL; - int sum, orig_sum, ext_sigcount = 0, i; + u32 sum, orig_sum, ext_sigcount = 0, i; struct extended_signature *ext_sig; total_size = get_totalsize(mc_header); @@ -57,69 +57,85 @@ int microcode_sanity_check(void *mc, int print_err) if (data_size + MC_HEADER_SIZE > total_size) { if (print_err) - pr_err("error! Bad data size in microcode data file\n"); + pr_err("Error: bad microcode data file size.\n"); return -EINVAL; } if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { if (print_err) - pr_err("error! Unknown microcode update format\n"); + pr_err("Error: invalid/unknown microcode update format.\n"); return -EINVAL; } + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); if (ext_table_size) { + u32 ext_table_sum = 0; + u32 *ext_tablep; + if ((ext_table_size < EXT_HEADER_SIZE) || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { if (print_err) - pr_err("error! Small exttable size in microcode data file\n"); + pr_err("Error: truncated extended signature table.\n"); return -EINVAL; } + ext_header = mc + MC_HEADER_SIZE + data_size; if (ext_table_size != exttable_size(ext_header)) { if (print_err) - pr_err("error! Bad exttable size in microcode data file\n"); + pr_err("Error: extended signature table size mismatch.\n"); return -EFAULT; } + ext_sigcount = ext_header->count; - } - /* check extended table checksum */ - if (ext_table_size) { - int ext_table_sum = 0; - int *ext_tablep = (int *)ext_header; + /* + * Check extended table checksum: the sum of all dwords that + * comprise a valid table must be 0. + */ + ext_tablep = (u32 *)ext_header; - i = ext_table_size / DWSIZE; + i = ext_table_size / sizeof(u32); while (i--) ext_table_sum += ext_tablep[i]; + if (ext_table_sum) { if (print_err) - pr_warn("aborting, bad extended signature table checksum\n"); + pr_warn("Bad extended signature table checksum, aborting.\n"); return -EINVAL; } } - /* calculate the checksum */ + /* + * Calculate the checksum of update data and header. The checksum of + * valid update data and header including the extended signature table + * must be 0. + */ orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / DWSIZE; + i = (MC_HEADER_SIZE + data_size) / sizeof(u32); while (i--) - orig_sum += ((int *)mc)[i]; + orig_sum += ((u32 *)mc)[i]; + if (orig_sum) { if (print_err) - pr_err("aborting, bad checksum\n"); + pr_err("Bad microcode data checksum, aborting.\n"); return -EINVAL; } + if (!ext_table_size) return 0; - /* check extended signature checksum */ + + /* + * Check extended signature checksum: 0 => valid. + */ for (i = 0; i < ext_sigcount; i++) { ext_sig = (void *)ext_header + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i; - sum = orig_sum - - (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + + sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); if (sum) { if (print_err) - pr_err("aborting, bad checksum\n"); + pr_err("Bad extended signature checksum, aborting.\n"); return -EINVAL; } } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 20e242ea1bc4..4e7c6933691c 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -161,8 +161,8 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); - printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", - ms_hyperv.features, ms_hyperv.hints); + pr_info("HyperV: features 0x%x, hints 0x%x\n", + ms_hyperv.features, ms_hyperv.hints); #ifdef CONFIG_X86_LOCAL_APIC if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) { @@ -174,8 +174,8 @@ static void __init ms_hyperv_init_platform(void) rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); lapic_timer_frequency = hv_lapic_frequency; - printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n", - lapic_timer_frequency); + pr_info("HyperV: LAPIC Timer Frequency: %#x\n", + lapic_timer_frequency); } #endif diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index 316fe3e60a97..3d689937fc1b 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -103,7 +103,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t */ if (type != MTRR_TYPE_WRCOMB && (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) { - pr_warning("mtrr: only write-combining%s supported\n", + pr_warn("mtrr: only write-combining%s supported\n", centaur_mcr_type ? " and uncacheable are" : " is"); return -EINVAL; } diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 0d98503c2245..31e951ce6dff 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -57,9 +57,9 @@ static int __initdata nr_range; static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; static int __initdata debug_print; -#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) +#define Dprintk(x...) do { if (debug_print) pr_debug(x); } while (0) -#define BIOS_BUG_MSG KERN_WARNING \ +#define BIOS_BUG_MSG \ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" static int __init @@ -81,9 +81,9 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, base, base + size); } if (debug_print) { - printk(KERN_DEBUG "After WB checking\n"); + pr_debug("After WB checking\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + pr_debug("MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end); } @@ -101,7 +101,7 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { /* Var MTRR contains UC entry below 1M? Skip it: */ - printk(BIOS_BUG_MSG, i); + pr_warn(BIOS_BUG_MSG, i); if (base + size <= (1<<(20-PAGE_SHIFT))) continue; size -= (1<<(20-PAGE_SHIFT)) - base; @@ -114,11 +114,11 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, extra_remove_base + extra_remove_size); if (debug_print) { - printk(KERN_DEBUG "After UC checking\n"); + pr_debug("After UC checking\n"); for (i = 0; i < RANGE_NUM; i++) { if (!range[i].end) continue; - printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + pr_debug("MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end); } } @@ -126,9 +126,9 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, /* sort the ranges */ nr_range = clean_sort_range(range, RANGE_NUM); if (debug_print) { - printk(KERN_DEBUG "After sorting\n"); + pr_debug("After sorting\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + pr_debug("MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end); } @@ -544,7 +544,7 @@ static void __init print_out_mtrr_range_state(void) start_base = to_size_factor(start_base, &start_factor), type = range_state[i].type; - printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", + pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n", i, start_base, start_factor, size_base, size_factor, (type == MTRR_TYPE_UNCACHABLE) ? "UC" : @@ -713,7 +713,7 @@ int __init mtrr_cleanup(unsigned address_bits) return 0; /* Print original var MTRRs at first, for debugging: */ - printk(KERN_DEBUG "original variable MTRRs\n"); + pr_debug("original variable MTRRs\n"); print_out_mtrr_range_state(); memset(range, 0, sizeof(range)); @@ -733,7 +733,7 @@ int __init mtrr_cleanup(unsigned address_bits) x_remove_base, x_remove_size); range_sums = sum_ranges(range, nr_range); - printk(KERN_INFO "total RAM covered: %ldM\n", + pr_info("total RAM covered: %ldM\n", range_sums >> (20 - PAGE_SHIFT)); if (mtrr_chunk_size && mtrr_gran_size) { @@ -745,12 +745,11 @@ int __init mtrr_cleanup(unsigned address_bits) if (!result[i].bad) { set_var_mtrr_all(address_bits); - printk(KERN_DEBUG "New variable MTRRs\n"); + pr_debug("New variable MTRRs\n"); print_out_mtrr_range_state(); return 1; } - printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " - "will find optimal one\n"); + pr_info("invalid mtrr_gran_size or mtrr_chunk_size, will find optimal one\n"); } i = 0; @@ -768,7 +767,7 @@ int __init mtrr_cleanup(unsigned address_bits) x_remove_base, x_remove_size, i); if (debug_print) { mtrr_print_out_one_result(i); - printk(KERN_INFO "\n"); + pr_info("\n"); } i++; @@ -779,7 +778,7 @@ int __init mtrr_cleanup(unsigned address_bits) index_good = mtrr_search_optimal_index(); if (index_good != -1) { - printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); + pr_info("Found optimal setting for mtrr clean up\n"); i = index_good; mtrr_print_out_one_result(i); @@ -790,7 +789,7 @@ int __init mtrr_cleanup(unsigned address_bits) gran_size <<= 10; x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); set_var_mtrr_all(address_bits); - printk(KERN_DEBUG "New variable MTRRs\n"); + pr_debug("New variable MTRRs\n"); print_out_mtrr_range_state(); return 1; } else { @@ -799,8 +798,8 @@ int __init mtrr_cleanup(unsigned address_bits) mtrr_print_out_one_result(i); } - printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); - printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n"); + pr_info("mtrr_cleanup: can not find optimal value\n"); + pr_info("please specify mtrr_gran_size/mtrr_chunk_size\n"); return 0; } @@ -918,7 +917,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* kvm/qemu doesn't have mtrr set right, don't trim them all: */ if (!highest_pfn) { - printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); + pr_info("CPU MTRRs all blank - virtualized system.\n"); return 0; } @@ -973,7 +972,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) end_pfn); if (total_trim_size) { - pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20); + pr_warn("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", + total_trim_size >> 20); if (!changed_by_mtrr_cleanup) WARN_ON(1); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index c870af161008..fcbcb2f678ca 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -55,7 +55,7 @@ static inline void k8_check_syscfg_dram_mod_en(void) rdmsr(MSR_K8_SYSCFG, lo, hi); if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { - printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" + pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" " not cleared by BIOS, clearing this bit\n", smp_processor_id()); lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY; @@ -501,14 +501,14 @@ void __init mtrr_state_warn(void) if (!mask) return; if (mask & MTRR_CHANGE_MASK_FIXED) - pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); + pr_warn("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_VARIABLE) - pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n"); + pr_warn("mtrr: your CPUs had inconsistent variable MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_DEFTYPE) - pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + pr_warn("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); - printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n"); - printk(KERN_INFO "mtrr: corrected configuration.\n"); + pr_info("mtrr: probably your BIOS does not setup all CPUs.\n"); + pr_info("mtrr: corrected configuration.\n"); } /* @@ -519,8 +519,7 @@ void __init mtrr_state_warn(void) void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) { if (wrmsr_safe(msr, a, b) < 0) { - printk(KERN_ERR - "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", + pr_err("MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", smp_processor_id(), msr, a, b); } } @@ -607,7 +606,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, tmp |= ~((1ULL<<(hi - 1)) - 1); if (tmp != mask) { - printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); + pr_warn("mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); mask = tmp; } @@ -858,13 +857,13 @@ int generic_validate_add_page(unsigned long base, unsigned long size, boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_mask <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { - pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); + pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; } if (!(base + size < 0x70000 || base > 0x7003F) && (type == MTRR_TYPE_WRCOMB || type == MTRR_TYPE_WRBACK)) { - pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); + pr_warn("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); return -EINVAL; } } @@ -878,7 +877,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, lbase = lbase >> 1, last = last >> 1) ; if (lbase != last) { - pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); + pr_warn("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); return -EINVAL; } return 0; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 74f1d90f9c29..10f8d4796240 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -300,24 +300,24 @@ int mtrr_add_page(unsigned long base, unsigned long size, return error; if (type >= MTRR_NUM_TYPES) { - pr_warning("mtrr: type: %u invalid\n", type); + pr_warn("mtrr: type: %u invalid\n", type); return -EINVAL; } /* If the type is WC, check that this processor supports it */ if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { - pr_warning("mtrr: your processor doesn't support write-combining\n"); + pr_warn("mtrr: your processor doesn't support write-combining\n"); return -ENOSYS; } if (!size) { - pr_warning("mtrr: zero sized request\n"); + pr_warn("mtrr: zero sized request\n"); return -EINVAL; } if ((base | (base + size - 1)) >> (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) { - pr_warning("mtrr: base or size exceeds the MTRR width\n"); + pr_warn("mtrr: base or size exceeds the MTRR width\n"); return -EINVAL; } @@ -348,7 +348,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, } else if (types_compatible(type, ltype)) continue; } - pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing" + pr_warn("mtrr: 0x%lx000,0x%lx000 overlaps existing" " 0x%lx000,0x%lx000\n", base, size, lbase, lsize); goto out; @@ -357,7 +357,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, if (ltype != type) { if (types_compatible(type, ltype)) continue; - pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", + pr_warn("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", base, size, mtrr_attrib_to_str(ltype), mtrr_attrib_to_str(type)); goto out; @@ -395,7 +395,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, static int mtrr_check(unsigned long base, unsigned long size) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { - pr_warning("mtrr: size and base must be multiples of 4 kiB\n"); + pr_warn("mtrr: size and base must be multiples of 4 kiB\n"); pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base); dump_stack(); return -1; @@ -493,16 +493,16 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) } } if (reg >= max) { - pr_warning("mtrr: register: %d too big\n", reg); + pr_warn("mtrr: register: %d too big\n", reg); goto out; } mtrr_if->get(reg, &lbase, &lsize, <ype); if (lsize < 1) { - pr_warning("mtrr: MTRR %d not used\n", reg); + pr_warn("mtrr: MTRR %d not used\n", reg); goto out; } if (mtrr_usage_table[reg] < 1) { - pr_warning("mtrr: reg: %d has count=0\n", reg); + pr_warn("mtrr: reg: %d has count=0\n", reg); goto out; } if (--mtrr_usage_table[reg] < 1) diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 819d94982e07..f6f50c4ceaec 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -51,7 +51,7 @@ void x86_init_rdrand(struct cpuinfo_x86 *c) for (i = 0; i < SANITY_CHECK_LOOPS; i++) { if (!rdrand_long(&tmp)) { clear_cpu_cap(c, X86_FEATURE_RDRAND); - printk_once(KERN_WARNING "rdrand: disabled\n"); + pr_warn_once("rdrand: disabled\n"); return; } } diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 4c60eaf0571c..cd531355e838 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -87,10 +87,10 @@ void detect_extended_topology(struct cpuinfo_x86 *c) c->x86_max_cores = (core_level_siblings / smp_num_siblings); if (!printed) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + pr_info("CPU: Physical Processor ID: %d\n", c->phys_proc_id); if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", + pr_info("CPU: Processor Core ID: %d\n", c->cpu_core_id); printed = 1; } diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index a19a663282b5..34178564be2a 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -33,7 +33,7 @@ static void init_transmeta(struct cpuinfo_x86 *c) if (max >= 0x80860001) { cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags); if (cpu_rev != 0x02000000) { - printk(KERN_INFO "CPU: Processor revision %u.%u.%u.%u, %u MHz\n", + pr_info("CPU: Processor revision %u.%u.%u.%u, %u MHz\n", (cpu_rev >> 24) & 0xff, (cpu_rev >> 16) & 0xff, (cpu_rev >> 8) & 0xff, @@ -44,10 +44,10 @@ static void init_transmeta(struct cpuinfo_x86 *c) if (max >= 0x80860002) { cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy); if (cpu_rev == 0x02000000) { - printk(KERN_INFO "CPU: Processor revision %08X, %u MHz\n", + pr_info("CPU: Processor revision %08X, %u MHz\n", new_cpu_rev, cpu_freq); } - printk(KERN_INFO "CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n", + pr_info("CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n", (cms_rev1 >> 24) & 0xff, (cms_rev1 >> 16) & 0xff, (cms_rev1 >> 8) & 0xff, @@ -76,7 +76,7 @@ static void init_transmeta(struct cpuinfo_x86 *c) (void *)&cpu_info[56], (void *)&cpu_info[60]); cpu_info[64] = '\0'; - printk(KERN_INFO "CPU: %s\n", cpu_info); + pr_info("CPU: %s\n", cpu_info); } /* Unhide possibly hidden capability flags */ diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 628a059a9a06..364e58346897 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -62,7 +62,7 @@ static unsigned long vmware_get_tsc_khz(void) tsc_hz = eax | (((uint64_t)ebx) << 32); do_div(tsc_hz, 1000); BUG_ON(tsc_hz >> 32); - printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", + pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n", (unsigned long) tsc_hz / 1000, (unsigned long) tsc_hz % 1000); @@ -84,8 +84,7 @@ static void __init vmware_platform_setup(void) if (ebx != UINT_MAX) x86_platform.calibrate_tsc = vmware_get_tsc_khz; else - printk(KERN_WARNING - "Failed to get TSC freq from the hypervisor\n"); + pr_warn("Failed to get TSC freq from the hypervisor\n"); } /* diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 58f34319b29a..9ef978d69c22 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -57,10 +57,9 @@ struct crash_elf_data { struct kimage *image; /* * Total number of ram ranges we have after various adjustments for - * GART, crash reserved region etc. + * crash reserved region, etc. */ unsigned int max_nr_ranges; - unsigned long gart_start, gart_end; /* Pointer to elf header */ void *ehdr; @@ -201,17 +200,6 @@ static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg) return 0; } -static int get_gart_ranges_callback(u64 start, u64 end, void *arg) -{ - struct crash_elf_data *ced = arg; - - ced->gart_start = start; - ced->gart_end = end; - - /* Not expecting more than 1 gart aperture */ - return 1; -} - /* Gather all the required information to prepare elf headers for ram regions */ static void fill_up_crash_elf_data(struct crash_elf_data *ced, @@ -226,22 +214,6 @@ static void fill_up_crash_elf_data(struct crash_elf_data *ced, ced->max_nr_ranges = nr_ranges; - /* - * We don't create ELF headers for GART aperture as an attempt - * to dump this memory in second kernel leads to hang/crash. - * If gart aperture is present, one needs to exclude that region - * and that could lead to need of extra phdr. - */ - walk_iomem_res("GART", IORESOURCE_MEM, 0, -1, - ced, get_gart_ranges_callback); - - /* - * If we have gart region, excluding that could potentially split - * a memory range, resulting in extra header. Account for that. - */ - if (ced->gart_end) - ced->max_nr_ranges++; - /* Exclusion of crash region could split memory ranges */ ced->max_nr_ranges++; @@ -350,13 +322,6 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced, return ret; } - /* Exclude GART region */ - if (ced->gart_end) { - ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end); - if (ret) - return ret; - } - return ret; } @@ -599,12 +564,12 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) /* Add ACPI tables */ cmd.type = E820_ACPI; flags = IORESOURCE_MEM | IORESOURCE_BUSY; - walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd, + walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd, memmap_entry_callback); /* Add ACPI Non-volatile Storage */ cmd.type = E820_NVS; - walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd, + walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd, memmap_entry_callback); /* Add crashk_low_res region */ diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 9c30acfadae2..8efa57a5f29e 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -135,7 +135,8 @@ print_context_stack_bp(struct thread_info *tinfo, if (!__kernel_text_address(addr)) break; - ops->address(data, addr, 1); + if (ops->address(data, addr, 1)) + break; frame = frame->next_frame; ret_addr = &frame->return_address; print_ftrace_graph_addr(addr, data, ops, tinfo, graph); @@ -154,10 +155,11 @@ static int print_trace_stack(void *data, char *name) /* * Print one address/symbol entries per line. */ -static void print_trace_address(void *data, unsigned long addr, int reliable) +static int print_trace_address(void *data, unsigned long addr, int reliable) { touch_nmi_watchdog(); printk_stack_address(addr, reliable, data); + return 0; } static const struct stacktrace_ops print_trace_ops = { @@ -265,9 +267,8 @@ int __die(const char *str, struct pt_regs *regs, long err) #ifdef CONFIG_SMP printk("SMP "); #endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC "); -#endif + if (debug_pagealloc_enabled()) + printk("DEBUG_PAGEALLOC "); #ifdef CONFIG_KASAN printk("KASAN"); #endif diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index b3c2a697820a..621b501f8935 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -926,6 +926,41 @@ static const char *e820_type_to_string(int e820_type) } } +static unsigned long e820_type_to_iomem_type(int e820_type) +{ + switch (e820_type) { + case E820_RESERVED_KERN: + case E820_RAM: + return IORESOURCE_SYSTEM_RAM; + case E820_ACPI: + case E820_NVS: + case E820_UNUSABLE: + case E820_PRAM: + case E820_PMEM: + default: + return IORESOURCE_MEM; + } +} + +static unsigned long e820_type_to_iores_desc(int e820_type) +{ + switch (e820_type) { + case E820_ACPI: + return IORES_DESC_ACPI_TABLES; + case E820_NVS: + return IORES_DESC_ACPI_NV_STORAGE; + case E820_PMEM: + return IORES_DESC_PERSISTENT_MEMORY; + case E820_PRAM: + return IORES_DESC_PERSISTENT_MEMORY_LEGACY; + case E820_RESERVED_KERN: + case E820_RAM: + case E820_UNUSABLE: + default: + return IORES_DESC_NONE; + } +} + static bool do_mark_busy(u32 type, struct resource *res) { /* this is the legacy bios/dos rom-shadow + mmio region */ @@ -968,7 +1003,8 @@ void __init e820_reserve_resources(void) res->start = e820.map[i].addr; res->end = end; - res->flags = IORESOURCE_MEM; + res->flags = e820_type_to_iomem_type(e820.map[i].type); + res->desc = e820_type_to_iores_desc(e820.map[i].type); /* * don't register the region that could be conflicted with diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index dea8e76d60c6..8e37cc8a539a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -474,8 +474,10 @@ static inline void copy_init_fpstate_to_fpregs(void) { if (use_xsave()) copy_kernel_to_xregs(&init_fpstate.xsave, -1); - else + else if (static_cpu_has(X86_FEATURE_FXSR)) copy_kernel_to_fxregs(&init_fpstate.fxsave); + else + copy_kernel_to_fregs(&init_fpstate.fsave); } /* diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 471fe277ff40..54c86fffbf9f 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -78,13 +78,15 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) cr0 &= ~(X86_CR0_TS | X86_CR0_EM); write_cr0(cr0); - asm volatile("fninit ; fnstsw %0 ; fnstcw %1" - : "+m" (fsw), "+m" (fcw)); + if (!test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { + asm volatile("fninit ; fnstsw %0 ; fnstcw %1" + : "+m" (fsw), "+m" (fcw)); - if (fsw == 0 && (fcw & 0x103f) == 0x003f) - set_cpu_cap(c, X86_FEATURE_FPU); - else - clear_cpu_cap(c, X86_FEATURE_FPU); + if (fsw == 0 && (fcw & 0x103f) == 0x003f) + set_cpu_cap(c, X86_FEATURE_FPU); + else + clear_cpu_cap(c, X86_FEATURE_FPU); + } #ifndef CONFIG_MATH_EMULATION if (!cpu_has_fpu) { @@ -132,7 +134,7 @@ static void __init fpu__init_system_generic(void) * Set up the legacy init FPU context. (xstate init might overwrite this * with a more modern format, if the CPU supports it.) */ - fpstate_init_fxstate(&init_fpstate.fxsave); + fpstate_init(&init_fpstate); fpu__init_system_mxcsr(); } @@ -303,12 +305,6 @@ u64 __init fpu__get_supported_xfeatures_mask(void) static void __init fpu__clear_eager_fpu_features(void) { setup_clear_cpu_cap(X86_FEATURE_MPX); - setup_clear_cpu_cap(X86_FEATURE_AVX); - setup_clear_cpu_cap(X86_FEATURE_AVX2); - setup_clear_cpu_cap(X86_FEATURE_AVX512F); - setup_clear_cpu_cap(X86_FEATURE_AVX512PF); - setup_clear_cpu_cap(X86_FEATURE_AVX512ER); - setup_clear_cpu_cap(X86_FEATURE_AVX512CD); } /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1b1981812bb6..b48ef35b28d4 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -59,6 +59,9 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_AVX512PF); setup_clear_cpu_cap(X86_FEATURE_AVX512ER); setup_clear_cpu_cap(X86_FEATURE_AVX512CD); + setup_clear_cpu_cap(X86_FEATURE_AVX512DQ); + setup_clear_cpu_cap(X86_FEATURE_AVX512BW); + setup_clear_cpu_cap(X86_FEATURE_AVX512VL); setup_clear_cpu_cap(X86_FEATURE_MPX); setup_clear_cpu_cap(X86_FEATURE_XGETBV1); setup_clear_cpu_cap(X86_FEATURE_PKU); diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 29408d6d6626..702547ce33c9 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -81,9 +81,9 @@ within(unsigned long addr, unsigned long start, unsigned long end) static unsigned long text_ip_addr(unsigned long ip) { /* - * On x86_64, kernel text mappings are mapped read-only with - * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead - * of the kernel text mapping to modify the kernel text. + * On x86_64, kernel text mappings are mapped read-only, so we use + * the kernel identity mapping instead of the kernel text mapping + * to modify the kernel text. * * For 32bit kernels, these mappings are same and we can use * kernel identity mapping to modify code. @@ -697,9 +697,8 @@ static inline void tramp_free(void *tramp) { } #endif /* Defined as markers to the end of the ftrace default trampolines */ -extern void ftrace_caller_end(void); extern void ftrace_regs_caller_end(void); -extern void ftrace_return(void); +extern void ftrace_epilogue(void); extern void ftrace_caller_op_ptr(void); extern void ftrace_regs_caller_op_ptr(void); @@ -746,7 +745,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) op_offset = (unsigned long)ftrace_regs_caller_op_ptr; } else { start_offset = (unsigned long)ftrace_caller; - end_offset = (unsigned long)ftrace_caller_end; + end_offset = (unsigned long)ftrace_epilogue; op_offset = (unsigned long)ftrace_caller_op_ptr; } @@ -754,7 +753,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* * Allocate enough size to store the ftrace_caller code, - * the jmp to ftrace_return, as well as the address of + * the jmp to ftrace_epilogue, as well as the address of * the ftrace_ops this trampoline is used for. */ trampoline = alloc_tramp(size + MCOUNT_INSN_SIZE + sizeof(void *)); @@ -772,8 +771,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = (unsigned long)trampoline + size; - /* The trampoline ends with a jmp to ftrace_return */ - jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_return); + /* The trampoline ends with a jmp to ftrace_epilogue */ + jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_epilogue); memcpy(trampoline + size, jmp, MCOUNT_INSN_SIZE); /* diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 2c0f3407bd1f..1f4422d5c8d0 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -40,13 +40,8 @@ pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) { - unsigned long i; - - for (i = 0; i < PTRS_PER_PGD-1; i++) - early_level4_pgt[i].pgd = 0; - + memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); next_early_pgt = 0; - write_cr3(__pa_nodebug(early_level4_pgt)); } @@ -54,7 +49,6 @@ static void __init reset_early_page_tables(void) int __init early_make_pgtable(unsigned long address) { unsigned long physaddr = address - __PAGE_OFFSET; - unsigned long i; pgdval_t pgd, *pgd_p; pudval_t pud, *pud_p; pmdval_t pmd, *pmd_p; @@ -81,8 +75,7 @@ again: } pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; - for (i = 0; i < PTRS_PER_PUD; i++) - pud_p[i] = 0; + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pud_p += pud_index(address); @@ -97,8 +90,7 @@ again: } pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; - for (i = 0; i < PTRS_PER_PMD; i++) - pmd_p[i] = 0; + memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pmd = (physaddr & PMD_MASK) + early_pmd_flags; diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 2e974680f5ad..22fbf9df61bb 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -75,9 +75,7 @@ startup_64: subq $_text - __START_KERNEL_map, %rbp /* Is the address not 2M aligned? */ - movq %rbp, %rax - andl $~PMD_PAGE_MASK, %eax - testl %eax, %eax + testl $~PMD_PAGE_MASK, %ebp jnz bad_address /* diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 44256a62702b..ed15cd486d06 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -750,9 +750,7 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { int err; -#ifdef CONFIG_DEBUG_RODATA char opc[BREAK_INSTR_SIZE]; -#endif /* CONFIG_DEBUG_RODATA */ bpt->type = BP_BREAKPOINT; err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, @@ -761,7 +759,6 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) return err; err = probe_kernel_write((char *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); -#ifdef CONFIG_DEBUG_RODATA if (!err) return err; /* @@ -778,13 +775,12 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE)) return -EINVAL; bpt->type = BP_POKE_BREAKPOINT; -#endif /* CONFIG_DEBUG_RODATA */ + return err; } int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { -#ifdef CONFIG_DEBUG_RODATA int err; char opc[BREAK_INSTR_SIZE]; @@ -801,8 +797,8 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE)) goto knl_write; return err; + knl_write: -#endif /* CONFIG_DEBUG_RODATA */ return probe_kernel_write((char *)bpt->bpt_addr, (char *)bpt->saved_instr, BREAK_INSTR_SIZE); } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 1deffe6cc873..ae703acb85c1 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -49,6 +49,7 @@ #include <linux/kdebug.h> #include <linux/kallsyms.h> #include <linux/ftrace.h> +#include <linux/frame.h> #include <asm/cacheflush.h> #include <asm/desc.h> @@ -671,39 +672,39 @@ NOKPROBE_SYMBOL(kprobe_int3_handler); * When a retprobed function returns, this code saves registers and * calls trampoline_handler() runs, which calls the kretprobe's handler. */ -static void __used kretprobe_trampoline_holder(void) -{ - asm volatile ( - ".global kretprobe_trampoline\n" - "kretprobe_trampoline: \n" +asm( + ".global kretprobe_trampoline\n" + ".type kretprobe_trampoline, @function\n" + "kretprobe_trampoline:\n" #ifdef CONFIG_X86_64 - /* We don't bother saving the ss register */ - " pushq %rsp\n" - " pushfq\n" - SAVE_REGS_STRING - " movq %rsp, %rdi\n" - " call trampoline_handler\n" - /* Replace saved sp with true return address. */ - " movq %rax, 152(%rsp)\n" - RESTORE_REGS_STRING - " popfq\n" + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rdi\n" + " call trampoline_handler\n" + /* Replace saved sp with true return address. */ + " movq %rax, 152(%rsp)\n" + RESTORE_REGS_STRING + " popfq\n" #else - " pushf\n" - SAVE_REGS_STRING - " movl %esp, %eax\n" - " call trampoline_handler\n" - /* Move flags to cs */ - " movl 56(%esp), %edx\n" - " movl %edx, 52(%esp)\n" - /* Replace saved flags with true return address. */ - " movl %eax, 56(%esp)\n" - RESTORE_REGS_STRING - " popf\n" + " pushf\n" + SAVE_REGS_STRING + " movl %esp, %eax\n" + " call trampoline_handler\n" + /* Move flags to cs */ + " movl 56(%esp), %edx\n" + " movl %edx, 52(%esp)\n" + /* Replace saved flags with true return address. */ + " movl %eax, 56(%esp)\n" + RESTORE_REGS_STRING + " popf\n" #endif - " ret\n"); -} -NOKPROBE_SYMBOL(kretprobe_trampoline_holder); + " ret\n" + ".size kretprobe_trampoline, .-kretprobe_trampoline\n" +); NOKPROBE_SYMBOL(kretprobe_trampoline); +STACK_FRAME_NON_STANDARD(kretprobe_trampoline); /* * Called from kretprobe_trampoline @@ -988,7 +989,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) * In case the user-specified fault handler returned * zero, try to fix up. */ - if (fixup_exception(regs)) + if (fixup_exception(regs, trapnr)) return 1; /* diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index 87e1762e2bca..ed48a9f465f8 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -168,12 +168,14 @@ GLOBAL(ftrace_call) restore_mcount_regs /* - * The copied trampoline must call ftrace_return as it + * The copied trampoline must call ftrace_epilogue as it * still may need to call the function graph tracer. + * + * The code up to this label is copied into trampolines so + * think twice before adding any new code or changing the + * layout here. */ -GLOBAL(ftrace_caller_end) - -GLOBAL(ftrace_return) +GLOBAL(ftrace_epilogue) #ifdef CONFIG_FUNCTION_GRAPH_TRACER GLOBAL(ftrace_graph_call) @@ -244,14 +246,14 @@ GLOBAL(ftrace_regs_call) popfq /* - * As this jmp to ftrace_return can be a short jump + * As this jmp to ftrace_epilogue can be a short jump * it must not be copied into the trampoline. * The trampoline will add the code to jump * to the return. */ GLOBAL(ftrace_regs_caller_end) - jmp ftrace_return + jmp ftrace_epilogue END(ftrace_regs_caller) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 30ca7607cbbb..97340f2c437c 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -408,7 +408,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) processor.cpuflag = CPU_ENABLED; processor.cpufeature = (boot_cpu_data.x86 << 8) | (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; - processor.featureflag = boot_cpu_data.x86_capability[0]; + processor.featureflag = boot_cpu_data.x86_capability[CPUID_1_EDX]; processor.reserved[0] = 0; processor.reserved[1] = 0; for (i = 0; i < 2; i++) { diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 8a2cdd736fa4..04b132a767f1 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -30,6 +30,7 @@ #include <asm/nmi.h> #include <asm/x86_init.h> #include <asm/reboot.h> +#include <asm/cache.h> #define CREATE_TRACE_POINTS #include <trace/events/nmi.h> @@ -69,7 +70,7 @@ struct nmi_stats { static DEFINE_PER_CPU(struct nmi_stats, nmi_stats); -static int ignore_nmis; +static int ignore_nmis __read_mostly; int unknown_nmi_panic; /* diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 14415aff1813..92f70147a9a6 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -13,11 +13,11 @@ static int found(u64 start, u64 end, void *data) static __init int register_e820_pmem(void) { - char *pmem = "Persistent Memory (legacy)"; struct platform_device *pdev; int rc; - rc = walk_iomem_res(pmem, IORESOURCE_MEM, 0, -1, NULL, found); + rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY, + IORESOURCE_MEM, 0, -1, NULL, found); if (rc <= 0) return 0; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9f7c21c22477..2915d54e9dd5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -57,6 +57,9 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { */ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif +#ifdef CONFIG_X86_32 + .SYSENTER_stack_canary = STACK_END_MAGIC, +#endif }; EXPORT_PER_CPU_SYMBOL(cpu_tss); @@ -418,9 +421,9 @@ static void mwait_idle(void) if (!current_set_polling_and_test()) { trace_cpu_idle_rcuidle(1, smp_processor_id()); if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { - smp_mb(); /* quirk */ + mb(); /* quirk */ clflush((void *)¤t_thread_info()->flags); - smp_mb(); /* quirk */ + mb(); /* quirk */ } __monitor((void *)¤t_thread_info()->flags, 0, 0); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 7260f992cd11..2367ae07eb76 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -153,21 +153,21 @@ static struct resource data_resource = { .name = "Kernel data", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; static struct resource code_resource = { .name = "Kernel code", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; static struct resource bss_resource = { .name = "Kernel bss", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index c07ff5ddbd47..548ddf7d6fd2 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -61,7 +61,38 @@ regs->seg = GET_SEG(seg) | 3; \ } while (0) -int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) +#ifdef CONFIG_X86_64 +/* + * If regs->ss will cause an IRET fault, change it. Otherwise leave it + * alone. Using this generally makes no sense unless + * user_64bit_mode(regs) would return true. + */ +static void force_valid_ss(struct pt_regs *regs) +{ + u32 ar; + asm volatile ("lar %[old_ss], %[ar]\n\t" + "jz 1f\n\t" /* If invalid: */ + "xorl %[ar], %[ar]\n\t" /* set ar = 0 */ + "1:" + : [ar] "=r" (ar) + : [old_ss] "rm" ((u16)regs->ss)); + + /* + * For a valid 64-bit user context, we need DPL 3, type + * read-write data or read-write exp-down data, and S and P + * set. We can't use VERW because VERW doesn't check the + * P bit. + */ + ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK; + if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) && + ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) + regs->ss = __USER_DS; +} +#endif + +static int restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *sc, + unsigned long uc_flags) { unsigned long buf_val; void __user *buf; @@ -94,15 +125,18 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) COPY(r15); #endif /* CONFIG_X86_64 */ -#ifdef CONFIG_X86_32 COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); -#else /* !CONFIG_X86_32 */ - /* Kernel saves and restores only the CS segment register on signals, - * which is the bare minimum needed to allow mixed 32/64-bit code. - * App's signal handler can save/restore other segments if needed. */ - COPY_SEG_CPL3(cs); -#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_X86_64 + /* + * Fix up SS if needed for the benefit of old DOSEMU and + * CRIU. + */ + if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && + user_64bit_mode(regs))) + force_valid_ss(regs); +#endif get_user_ex(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); @@ -165,6 +199,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, put_user_ex(regs->cs, &sc->cs); put_user_ex(0, &sc->gs); put_user_ex(0, &sc->fs); + put_user_ex(regs->ss, &sc->ss); #endif /* CONFIG_X86_32 */ put_user_ex(fpstate, &sc->fpstate); @@ -403,6 +438,21 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, return 0; } #else /* !CONFIG_X86_32 */ +static unsigned long frame_uc_flags(struct pt_regs *regs) +{ + unsigned long flags; + + if (cpu_has_xsave) + flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; + else + flags = UC_SIGCONTEXT_SS; + + if (likely(user_64bit_mode(regs))) + flags |= UC_STRICT_RESTORE_SS; + + return flags; +} + static int __setup_rt_frame(int sig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { @@ -422,10 +472,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, put_user_try { /* Create the ucontext. */ - if (cpu_has_xsave) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); save_altstack_ex(&frame->uc.uc_stack, regs->sp); @@ -459,10 +506,28 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, regs->sp = (unsigned long)frame; - /* Set up the CS register to run signal handlers in 64-bit mode, - even if the handler happens to be interrupting 32-bit code. */ + /* + * Set up the CS and SS registers to run signal handlers in + * 64-bit mode, even if the handler happens to be interrupting + * 32-bit or 16-bit code. + * + * SS is subtle. In 64-bit mode, we don't need any particular + * SS descriptor, but we do need SS to be valid. It's possible + * that the old SS is entirely bogus -- this can happen if the + * signal we're trying to deliver is #GP or #SS caused by a bad + * SS value. We also have a compatbility issue here: DOSEMU + * relies on the contents of the SS register indicating the + * SS value at the time of the signal, even though that code in + * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU + * avoids relying on sigreturn to restore SS; instead it uses + * a trampoline.) So we do our best: if the old SS was valid, + * we keep it. Otherwise we replace it. + */ regs->cs = __USER_CS; + if (unlikely(regs->ss != __USER_DS)) + force_valid_ss(regs); + return 0; } #endif /* CONFIG_X86_32 */ @@ -489,10 +554,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, put_user_try { /* Create the ucontext. */ - if (cpu_has_xsave) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); @@ -554,7 +616,11 @@ asmlinkage unsigned long sys_sigreturn(void) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->sc)) + /* + * x86_32 has no uc_flags bits relevant to restore_sigcontext. + * Save a few cycles by skipping the __get_user. + */ + if (restore_sigcontext(regs, &frame->sc, 0)) goto badframe; return regs->ax; @@ -570,16 +636,19 @@ asmlinkage long sys_rt_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; sigset_t set; + unsigned long uc_flags; frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) @@ -766,6 +835,7 @@ asmlinkage long sys32_x32_rt_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_x32 __user *frame; sigset_t set; + unsigned long uc_flags; frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); @@ -773,10 +843,12 @@ asmlinkage long sys32_x32_rt_sigreturn(void) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 24d57f77b3c1..643dbdccf4bc 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -97,6 +97,14 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); +/* Logical package management. We might want to allocate that dynamically */ +static int *physical_to_logical_pkg __read_mostly; +static unsigned long *physical_package_map __read_mostly;; +static unsigned long *logical_package_map __read_mostly; +static unsigned int max_physical_pkg_id __read_mostly; +unsigned int __max_logical_packages __read_mostly; +EXPORT_SYMBOL(__max_logical_packages); + static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { unsigned long flags; @@ -248,7 +256,98 @@ static void notrace start_secondary(void *unused) x86_cpuinit.setup_percpu_clockev(); wmb(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); +} + +int topology_update_package_map(unsigned int apicid, unsigned int cpu) +{ + unsigned int new, pkg = apicid >> boot_cpu_data.x86_coreid_bits; + + /* Called from early boot ? */ + if (!physical_package_map) + return 0; + + if (pkg >= max_physical_pkg_id) + return -EINVAL; + + /* Set the logical package id */ + if (test_and_set_bit(pkg, physical_package_map)) + goto found; + + if (pkg < __max_logical_packages) { + set_bit(pkg, logical_package_map); + physical_to_logical_pkg[pkg] = pkg; + goto found; + } + new = find_first_zero_bit(logical_package_map, __max_logical_packages); + if (new >= __max_logical_packages) { + physical_to_logical_pkg[pkg] = -1; + pr_warn("APIC(%x) Package %u exceeds logical package map\n", + apicid, pkg); + return -ENOSPC; + } + set_bit(new, logical_package_map); + pr_info("APIC(%x) Converting physical %u to logical package %u\n", + apicid, pkg, new); + physical_to_logical_pkg[pkg] = new; + +found: + cpu_data(cpu).logical_proc_id = physical_to_logical_pkg[pkg]; + return 0; +} + +/** + * topology_phys_to_logical_pkg - Map a physical package id to a logical + * + * Returns logical package id or -1 if not found + */ +int topology_phys_to_logical_pkg(unsigned int phys_pkg) +{ + if (phys_pkg >= max_physical_pkg_id) + return -1; + return physical_to_logical_pkg[phys_pkg]; +} +EXPORT_SYMBOL(topology_phys_to_logical_pkg); + +static void __init smp_init_package_map(void) +{ + unsigned int ncpus, cpu; + size_t size; + + /* + * Today neither Intel nor AMD support heterogenous systems. That + * might change in the future.... + */ + ncpus = boot_cpu_data.x86_max_cores * smp_num_siblings; + __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus); + + /* + * Possibly larger than what we need as the number of apic ids per + * package can be smaller than the actual used apic ids. + */ + max_physical_pkg_id = DIV_ROUND_UP(MAX_LOCAL_APIC, ncpus); + size = max_physical_pkg_id * sizeof(unsigned int); + physical_to_logical_pkg = kmalloc(size, GFP_KERNEL); + memset(physical_to_logical_pkg, 0xff, size); + size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long); + physical_package_map = kzalloc(size, GFP_KERNEL); + size = BITS_TO_LONGS(__max_logical_packages) * sizeof(unsigned long); + logical_package_map = kzalloc(size, GFP_KERNEL); + + pr_info("Max logical packages: %u\n", __max_logical_packages); + + for_each_present_cpu(cpu) { + unsigned int apicid = apic->cpu_present_to_apicid(cpu); + + if (apicid == BAD_APICID || !apic->apic_id_valid(apicid)) + continue; + if (!topology_update_package_map(apicid, cpu)) + continue; + pr_warn("CPU %u APICId %x disabled\n", cpu, apicid); + per_cpu(x86_bios_cpu_apicid, cpu) = BAD_APICID; + set_cpu_possible(cpu, false); + set_cpu_present(cpu, false); + } } void __init smp_store_boot_cpu_info(void) @@ -258,6 +357,7 @@ void __init smp_store_boot_cpu_info(void) *c = boot_cpu_data; c->cpu_index = id; + smp_init_package_map(); } /* diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index fdd0c6430e5a..9ee98eefc44d 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -14,30 +14,34 @@ static int save_stack_stack(void *data, char *name) return 0; } -static void +static int __save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) { struct stack_trace *trace = data; #ifdef CONFIG_FRAME_POINTER if (!reliable) - return; + return 0; #endif if (nosched && in_sched_functions(addr)) - return; + return 0; if (trace->skip > 0) { trace->skip--; - return; + return 0; } - if (trace->nr_entries < trace->max_entries) + if (trace->nr_entries < trace->max_entries) { trace->entries[trace->nr_entries++] = addr; + return 0; + } else { + return -1; /* no more room, stop walking the stack */ + } } -static void save_stack_address(void *data, unsigned long addr, int reliable) +static int save_stack_address(void *data, unsigned long addr, int reliable) { return __save_stack_address(data, addr, reliable, false); } -static void +static int save_stack_address_nosched(void *data, unsigned long addr, int reliable) { return __save_stack_address(data, addr, reliable, true); diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 91a4496db434..e72a07f20b05 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -135,7 +135,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pmd = pmd_alloc(&tboot_mm, pud, vaddr); if (!pmd) return -1; - pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr); + pte = pte_alloc_map(&tboot_mm, pmd, vaddr); if (!pte) return -1; set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index 3f92ce07e525..27538f183c3b 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -142,7 +142,6 @@ static int test_NX(void) * by the error message */ -#ifdef CONFIG_DEBUG_RODATA /* Test 3: Check if the .rodata section is executable */ if (rodata_test_data != 0xC3) { printk(KERN_ERR "test_nx: .rodata marker has invalid value\n"); @@ -151,7 +150,6 @@ static int test_NX(void) printk(KERN_ERR "test_nx: .rodata section is executable\n"); ret = -ENODEV; } -#endif #if 0 /* Test 4: Check if the .data section of a module is executable */ diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index 5ecbfe5099da..cb4a01b41e27 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c @@ -76,5 +76,5 @@ int rodata_test(void) } MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure"); +MODULE_DESCRIPTION("Testcase for marking rodata as read-only"); MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1e630d1b7ad9..06cbe25861f1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -185,7 +185,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, } if (!user_mode(regs)) { - if (!fixup_exception(regs)) { + if (!fixup_exception(regs, trapnr)) { tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; die(str, regs, error_code); @@ -248,7 +248,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; -#ifdef CONFIG_X86_64 if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", @@ -257,7 +256,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, print_vma_addr(" in ", regs->ip); pr_cont("\n"); } -#endif force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk); } @@ -439,7 +437,7 @@ do_general_protection(struct pt_regs *regs, long error_code) tsk = current; if (!user_mode(regs)) { - if (fixup_exception(regs)) + if (fixup_exception(regs, X86_TRAP_GP)) return; tsk->thread.error_code = error_code; @@ -559,6 +557,29 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) NOKPROBE_SYMBOL(fixup_bad_iret); #endif +static bool is_sysenter_singlestep(struct pt_regs *regs) +{ + /* + * We don't try for precision here. If we're anywhere in the region of + * code that can be single-stepped in the SYSENTER entry path, then + * assume that this is a useless single-step trap due to SYSENTER + * being invoked with TF set. (We don't know in advance exactly + * which instructions will be hit because BTF could plausibly + * be set.) + */ +#ifdef CONFIG_X86_32 + return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) < + (unsigned long)__end_SYSENTER_singlestep_region - + (unsigned long)__begin_SYSENTER_singlestep_region; +#elif defined(CONFIG_IA32_EMULATION) + return (regs->ip - (unsigned long)entry_SYSENTER_compat) < + (unsigned long)__end_entry_SYSENTER_compat - + (unsigned long)entry_SYSENTER_compat; +#else + return false; +#endif +} + /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. Therefore @@ -593,11 +614,42 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) ist_enter(regs); get_debugreg(dr6, 6); + /* + * The Intel SDM says: + * + * Certain debug exceptions may clear bits 0-3. The remaining + * contents of the DR6 register are never cleared by the + * processor. To avoid confusion in identifying debug + * exceptions, debug handlers should clear the register before + * returning to the interrupted task. + * + * Keep it simple: clear DR6 immediately. + */ + set_debugreg(0, 6); /* Filter out all the reserved bits which are preset to 1 */ dr6 &= ~DR6_RESERVED; /* + * The SDM says "The processor clears the BTF flag when it + * generates a debug exception." Clear TIF_BLOCKSTEP to keep + * TIF_BLOCKSTEP in sync with the hardware BTF flag. + */ + clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); + + if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) && + is_sysenter_singlestep(regs))) { + dr6 &= ~DR_STEP; + if (!dr6) + goto exit; + /* + * else we might have gotten a single-step trap and hit a + * watchpoint at the same time, in which case we should fall + * through and handle the watchpoint. + */ + } + + /* * If dr6 has no reason to give us about the origin of this trap, * then it's very likely the result of an icebp/int01 trap. * User wants a sigtrap for that. @@ -605,18 +657,10 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) if (!dr6 && user_mode(regs)) user_icebp = 1; - /* Catch kmemcheck conditions first of all! */ + /* Catch kmemcheck conditions! */ if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) goto exit; - /* DR6 may or may not be cleared by the CPU */ - set_debugreg(0, 6); - - /* - * The processor cleared BTF, so don't mark that we need it set. - */ - clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); - /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; @@ -648,14 +692,13 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) goto exit; } - /* - * Single-stepping through system calls: ignore any exceptions in - * kernel space, but re-enable TF when returning to user mode. - * - * We already checked v86 mode above, so we can check for kernel mode - * by just checking the CPL of CS. - */ - if ((dr6 & DR_STEP) && !user_mode(regs)) { + if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) { + /* + * Historical junk that used to handle SYSENTER single-stepping. + * This should be unreachable now. If we survive for a while + * without anyone hitting this warning, we'll turn this into + * an oops. + */ tsk->thread.debugreg6 &= ~DR_STEP; set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; @@ -668,6 +711,14 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: +#if defined(CONFIG_X86_32) + /* + * This is the most likely code path that involves non-trivial use + * of the SYSENTER stack. Check that we haven't overrun it. + */ + WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, + "Overran or corrupted SYSENTER stack\n"); +#endif ist_exit(regs); } NOKPROBE_SYMBOL(do_debug); @@ -690,7 +741,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) cond_local_irq_enable(regs); if (!user_mode(regs)) { - if (!fixup_exception(regs)) { + if (!fixup_exception(regs, trapnr)) { task->thread.error_code = error_code; task->thread.trap_nr = trapnr; die(str, regs, error_code); @@ -858,7 +909,7 @@ void __init trap_init(void) #endif #ifdef CONFIG_X86_32 - set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); + set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3d743da828d3..56380440d862 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -43,6 +43,11 @@ static DEFINE_STATIC_KEY_FALSE(__use_tsc); int tsc_clocksource_reliable; +static u32 art_to_tsc_numerator; +static u32 art_to_tsc_denominator; +static u64 art_to_tsc_offset; +struct clocksource *art_related_clocksource; + /* * Use a ring-buffer like data structure, where a writer advances the head by * writing a new data entry and a reader advances the tail when it observes a @@ -964,6 +969,37 @@ core_initcall(cpufreq_tsc); #endif /* CONFIG_CPU_FREQ */ +#define ART_CPUID_LEAF (0x15) +#define ART_MIN_DENOMINATOR (1) + + +/* + * If ART is present detect the numerator:denominator to convert to TSC + */ +static void detect_art(void) +{ + unsigned int unused[2]; + + if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) + return; + + cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, + &art_to_tsc_numerator, unused, unused+1); + + /* Don't enable ART in a VM, non-stop TSC required */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || + !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || + art_to_tsc_denominator < ART_MIN_DENOMINATOR) + return; + + if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset)) + return; + + /* Make this sticky over multiple CPU init calls */ + setup_force_cpu_cap(X86_FEATURE_ART); +} + + /* clocksource code */ static struct clocksource clocksource_tsc; @@ -1071,6 +1107,25 @@ int unsynchronized_tsc(void) return 0; } +/* + * Convert ART to TSC given numerator/denominator found in detect_art() + */ +struct system_counterval_t convert_art_to_tsc(cycle_t art) +{ + u64 tmp, res, rem; + + rem = do_div(art, art_to_tsc_denominator); + + res = art * art_to_tsc_numerator; + tmp = rem * art_to_tsc_numerator; + + do_div(tmp, art_to_tsc_denominator); + res += tmp + art_to_tsc_offset; + + return (struct system_counterval_t) {.cs = art_related_clocksource, + .cycles = res}; +} +EXPORT_SYMBOL(convert_art_to_tsc); static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); @@ -1142,6 +1197,8 @@ static void tsc_refine_calibration_work(struct work_struct *work) (unsigned long)tsc_khz % 1000); out: + if (boot_cpu_has(X86_FEATURE_ART)) + art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); } @@ -1235,6 +1292,8 @@ void __init tsc_init(void) mark_tsc_unstable("TSCs unsynchronized"); check_system_tsc_reliable(); + + detect_art(); } #ifdef CONFIG_SMP @@ -1246,14 +1305,14 @@ void __init tsc_init(void) */ unsigned long calibrate_delay_is_known(void) { - int i, cpu = smp_processor_id(); + int sibling, cpu = smp_processor_id(); if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) return 0; - for_each_online_cpu(i) - if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id) - return cpu_data(i).loops_per_jiffy; + sibling = cpumask_any_but(topology_core_cpumask(cpu), cpu); + if (sibling < nr_cpu_ids) + return cpu_data(sibling).loops_per_jiffy; return 0; } #endif diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 92dc211c11db..d239639e0c1d 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -41,29 +41,28 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) /* - * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA - * we retain large page mappings for boundaries spanning kernel text, rodata - * and data sections. + * On 64-bit, align RODATA to 2MB so we retain large page mappings for + * boundaries spanning kernel text, rodata and data sections. * * However, kernel identity mappings will have different RWX permissions * to the pages mapping to text and to the pages padding (which are freed) the * text section. Hence kernel identity mappings will be broken to smaller * pages. For 64-bit, kernel text and kernel identity mappings are different, - * so we can enable protection checks that come with CONFIG_DEBUG_RODATA, - * as well as retain 2MB large page mappings for kernel text. + * so we can enable protection checks as well as retain 2MB large page + * mappings for kernel text. */ -#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); +#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); -#define X64_ALIGN_DEBUG_RODATA_END \ +#define X64_ALIGN_RODATA_END \ . = ALIGN(HPAGE_SIZE); \ __end_rodata_hpage_align = .; #else -#define X64_ALIGN_DEBUG_RODATA_BEGIN -#define X64_ALIGN_DEBUG_RODATA_END +#define X64_ALIGN_RODATA_BEGIN +#define X64_ALIGN_RODATA_END #endif @@ -82,11 +81,11 @@ PHDRS { SECTIONS { #ifdef CONFIG_X86_32 - . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; - phys_startup_32 = startup_32 - LOAD_OFFSET; + . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; + phys_startup_32 = ABSOLUTE(startup_32 - LOAD_OFFSET); #else - . = __START_KERNEL; - phys_startup_64 = startup_64 - LOAD_OFFSET; + . = __START_KERNEL; + phys_startup_64 = ABSOLUTE(startup_64 - LOAD_OFFSET); #endif /* Text and read-only data */ @@ -112,13 +111,11 @@ SECTIONS EXCEPTION_TABLE(16) :text = 0x9090 -#if defined(CONFIG_DEBUG_RODATA) /* .text should occupy whole number of pages */ . = ALIGN(PAGE_SIZE); -#endif - X64_ALIGN_DEBUG_RODATA_BEGIN + X64_ALIGN_RODATA_BEGIN RO_DATA(PAGE_SIZE) - X64_ALIGN_DEBUG_RODATA_END + X64_ALIGN_RODATA_END /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { @@ -336,6 +333,7 @@ SECTIONS __brk_limit = .; } + . = ALIGN(PAGE_SIZE); _end = .; STABS_DEBUG @@ -343,7 +341,10 @@ SECTIONS /* Sections to be discarded */ DISCARDS - /DISCARD/ : { *(.eh_frame) } + /DISCARD/ : { + *(.eh_frame) + *(__func_stack_frame_non_standard) + } } diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a0695be19864..cd05942bc918 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -37,6 +37,8 @@ EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); +EXPORT_SYMBOL_GPL(memcpy_mcsafe); + EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index a1ff508bb423..464fa477afbf 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -13,9 +13,10 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o + hyperv.o page_track.o kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o + kvm-intel-y += vmx.o pmu_intel.o kvm-amd-y += svm.o pmu_amd.o diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c index 9dc091acd5fb..308b8597c691 100644 --- a/arch/x86/kvm/assigned-dev.c +++ b/arch/x86/kvm/assigned-dev.c @@ -51,11 +51,9 @@ struct kvm_assigned_dev_kernel { static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, int assigned_dev_id) { - struct list_head *ptr; struct kvm_assigned_dev_kernel *match; - list_for_each(ptr, head) { - match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); + list_for_each_entry(match, head, list) { if (match->assigned_dev_id == assigned_dev_id) return match; } @@ -373,14 +371,10 @@ static void kvm_free_assigned_device(struct kvm *kvm, void kvm_free_all_assigned_devices(struct kvm *kvm) { - struct list_head *ptr, *ptr2; - struct kvm_assigned_dev_kernel *assigned_dev; - - list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { - assigned_dev = list_entry(ptr, - struct kvm_assigned_dev_kernel, - list); + struct kvm_assigned_dev_kernel *assigned_dev, *tmp; + list_for_each_entry_safe(assigned_dev, tmp, + &kvm->arch.assigned_dev_head, list) { kvm_free_assigned_device(kvm, assigned_dev); } } diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6525e926f566..0029644bf09c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -46,11 +46,18 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) return ret; } +bool kvm_mpx_supported(void) +{ + return ((host_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) + && kvm_x86_ops->mpx_supported()); +} +EXPORT_SYMBOL_GPL(kvm_mpx_supported); + u64 kvm_supported_xcr0(void) { u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0; - if (!kvm_x86_ops->mpx_supported()) + if (!kvm_mpx_supported()) xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); return xcr0; @@ -97,8 +104,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - vcpu->arch.eager_fpu = use_eager_fpu() || guest_cpuid_has_mpx(vcpu); - if (vcpu->arch.eager_fpu) + if (use_eager_fpu()) kvm_x86_ops->fpu_activate(vcpu); /* @@ -295,7 +301,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; - unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0; + unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; /* cpuid 1.edx */ diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index c8eda1498121..66a6581724ad 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -5,6 +5,7 @@ #include <asm/cpu.h> int kvm_update_cpuid(struct kvm_vcpu *vcpu); +bool kvm_mpx_supported(void); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, @@ -135,14 +136,6 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_RTM)); } -static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 7, 0); - return best && (best->ebx & bit(X86_FEATURE_MPX)); -} - static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1505587d06e9..0f6294376fbd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -309,23 +309,29 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); -#define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t" +#define FOP_FUNC(name) \ + ".align " __stringify(FASTOP_SIZE) " \n\t" \ + ".type " name ", @function \n\t" \ + name ":\n\t" + #define FOP_RET "ret \n\t" #define FOP_START(op) \ extern void em_##op(struct fastop *fake); \ asm(".pushsection .text, \"ax\" \n\t" \ ".global em_" #op " \n\t" \ - FOP_ALIGN \ - "em_" #op ": \n\t" + FOP_FUNC("em_" #op) #define FOP_END \ ".popsection") -#define FOPNOP() FOP_ALIGN FOP_RET +#define FOPNOP() \ + FOP_FUNC(__stringify(__UNIQUE_ID(nop))) \ + FOP_RET #define FOP1E(op, dst) \ - FOP_ALIGN "10: " #op " %" #dst " \n\t" FOP_RET + FOP_FUNC(#op "_" #dst) \ + "10: " #op " %" #dst " \n\t" FOP_RET #define FOP1EEX(op, dst) \ FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception) @@ -357,7 +363,8 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); FOP_END #define FOP2E(op, dst, src) \ - FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET + FOP_FUNC(#op "_" #dst "_" #src) \ + #op " %" #src ", %" #dst " \n\t" FOP_RET #define FASTOP2(op) \ FOP_START(op) \ @@ -395,7 +402,8 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); FOP_END #define FOP3E(op, dst, src, src2) \ - FOP_ALIGN #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET + FOP_FUNC(#op "_" #dst "_" #src "_" #src2) \ + #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET /* 3-operand, word-only, src2=cl */ #define FASTOP3WCL(op) \ @@ -407,7 +415,12 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); FOP_END /* Special case for SETcc - 1 instruction per cc */ -#define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t" +#define FOP_SETCC(op) \ + ".align 4 \n\t" \ + ".type " #op ", @function \n\t" \ + #op ": \n\t" \ + #op " %al \n\t" \ + FOP_RET asm(".global kvm_fastop_exception \n" "kvm_fastop_exception: xor %esi, %esi; ret"); @@ -650,10 +663,10 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, u16 sel; la = seg_base(ctxt, addr.seg) + addr.ea; - *linear = la; *max_size = 0; switch (mode) { case X86EMUL_MODE_PROT64: + *linear = la; if (is_noncanonical_address(la)) goto bad; @@ -662,6 +675,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, goto bad; break; default: + *linear = la = (u32)la; usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL, addr.seg); if (!usable) @@ -689,7 +703,6 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, if (size > *max_size) goto bad; } - la &= (u32)-1; break; } if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0)) @@ -956,7 +969,7 @@ static int em_bsr_c(struct x86_emulate_ctxt *ctxt) return fastop(ctxt, em_bsr); } -static u8 test_cc(unsigned int condition, unsigned long flags) +static __always_inline u8 test_cc(unsigned int condition, unsigned long flags) { u8 rc; void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index c58ba67175ac..5ff3485acb60 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1043,6 +1043,27 @@ bool kvm_hv_hypercall_enabled(struct kvm *kvm) return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; } +static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) +{ + bool longmode; + + longmode = is_64_bit_mode(vcpu); + if (longmode) + kvm_register_write(vcpu, VCPU_REGS_RAX, result); + else { + kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff); + } +} + +static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result); + return 1; +} + int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { u64 param, ingpa, outgpa, ret; @@ -1055,7 +1076,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) */ if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); - return 0; + return 1; } longmode = is_64_bit_mode(vcpu); @@ -1083,22 +1104,33 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + /* Hypercall continuation is not supported yet */ + if (rep_cnt || rep_idx) { + res = HV_STATUS_INVALID_HYPERCALL_CODE; + goto set_result; + } + switch (code) { - case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: + case HVCALL_NOTIFY_LONG_SPIN_WAIT: kvm_vcpu_on_spin(vcpu); break; + case HVCALL_POST_MESSAGE: + case HVCALL_SIGNAL_EVENT: + vcpu->run->exit_reason = KVM_EXIT_HYPERV; + vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; + vcpu->run->hyperv.u.hcall.input = param; + vcpu->run->hyperv.u.hcall.params[0] = ingpa; + vcpu->run->hyperv.u.hcall.params[1] = outgpa; + vcpu->arch.complete_userspace_io = + kvm_hv_hypercall_complete_userspace; + return 0; default: res = HV_STATUS_INVALID_HYPERCALL_CODE; break; } +set_result: ret = res | (((u64)rep_done & 0xfff) << 32); - if (longmode) { - kvm_register_write(vcpu, VCPU_REGS_RAX, ret); - } else { - kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); - kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); - } - + kvm_hv_hypercall_set_result(vcpu, ret); return 1; } diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index b0ea42b78ccd..a4bf5b45d65a 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -51,32 +51,9 @@ #define RW_STATE_WORD0 3 #define RW_STATE_WORD1 4 -/* Compute with 96 bit intermediate result: (a*b)/c */ -static u64 muldiv64(u64 a, u32 b, u32 c) +static void pit_set_gate(struct kvm_pit *pit, int channel, u32 val) { - union { - u64 ll; - struct { - u32 low, high; - } l; - } u, res; - u64 rl, rh; - - u.ll = a; - rl = (u64)u.l.low * (u64)b; - rh = (u64)u.l.high * (u64)b; - rh += (rl >> 32); - res.l.high = div64_u64(rh, c); - res.l.low = div64_u64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c); - return res.ll; -} - -static void pit_set_gate(struct kvm *kvm, int channel, u32 val) -{ - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; - - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; switch (c->mode) { default: @@ -97,18 +74,16 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val) c->gate = val; } -static int pit_get_gate(struct kvm *kvm, int channel) +static int pit_get_gate(struct kvm_pit *pit, int channel) { - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - - return kvm->arch.vpit->pit_state.channels[channel].gate; + return pit->pit_state.channels[channel].gate; } -static s64 __kpit_elapsed(struct kvm *kvm) +static s64 __kpit_elapsed(struct kvm_pit *pit) { s64 elapsed; ktime_t remaining; - struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; + struct kvm_kpit_state *ps = &pit->pit_state; if (!ps->period) return 0; @@ -128,26 +103,23 @@ static s64 __kpit_elapsed(struct kvm *kvm) return elapsed; } -static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c, +static s64 kpit_elapsed(struct kvm_pit *pit, struct kvm_kpit_channel_state *c, int channel) { if (channel == 0) - return __kpit_elapsed(kvm); + return __kpit_elapsed(pit); return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); } -static int pit_get_count(struct kvm *kvm, int channel) +static int pit_get_count(struct kvm_pit *pit, int channel) { - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; + struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; s64 d, t; int counter; - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - - t = kpit_elapsed(kvm, c, channel); - d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); + t = kpit_elapsed(pit, c, channel); + d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC); switch (c->mode) { case 0: @@ -167,17 +139,14 @@ static int pit_get_count(struct kvm *kvm, int channel) return counter; } -static int pit_get_out(struct kvm *kvm, int channel) +static int pit_get_out(struct kvm_pit *pit, int channel) { - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; + struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; s64 d, t; int out; - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - - t = kpit_elapsed(kvm, c, channel); - d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); + t = kpit_elapsed(pit, c, channel); + d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC); switch (c->mode) { default: @@ -202,29 +171,23 @@ static int pit_get_out(struct kvm *kvm, int channel) return out; } -static void pit_latch_count(struct kvm *kvm, int channel) +static void pit_latch_count(struct kvm_pit *pit, int channel) { - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; - - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; if (!c->count_latched) { - c->latched_count = pit_get_count(kvm, channel); + c->latched_count = pit_get_count(pit, channel); c->count_latched = c->rw_mode; } } -static void pit_latch_status(struct kvm *kvm, int channel) +static void pit_latch_status(struct kvm_pit *pit, int channel) { - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; - - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; if (!c->status_latched) { /* TODO: Return NULL COUNT (bit 6). */ - c->status = ((pit_get_out(kvm, channel) << 7) | + c->status = ((pit_get_out(pit, channel) << 7) | (c->rw_mode << 4) | (c->mode << 1) | c->bcd); @@ -232,26 +195,24 @@ static void pit_latch_status(struct kvm *kvm, int channel) } } +static inline struct kvm_pit *pit_state_to_pit(struct kvm_kpit_state *ps) +{ + return container_of(ps, struct kvm_pit, pit_state); +} + static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); - int value; - - spin_lock(&ps->inject_lock); - value = atomic_dec_return(&ps->pending); - if (value < 0) - /* spurious acks can be generated if, for example, the - * PIC is being reset. Handle it gracefully here - */ - atomic_inc(&ps->pending); - else if (value > 0) - /* in this case, we had multiple outstanding pit interrupts - * that we needed to inject. Reinject - */ - queue_kthread_work(&ps->pit->worker, &ps->pit->expired); - ps->irq_ack = 1; - spin_unlock(&ps->inject_lock); + struct kvm_pit *pit = pit_state_to_pit(ps); + + atomic_set(&ps->irq_ack, 1); + /* irq_ack should be set before pending is read. Order accesses with + * inc(pending) in pit_timer_fn and xchg(irq_ack, 0) in pit_do_work. + */ + smp_mb(); + if (atomic_dec_if_positive(&ps->pending) > 0) + queue_kthread_work(&pit->worker, &pit->expired); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -282,45 +243,36 @@ static void pit_do_work(struct kthread_work *work) struct kvm_vcpu *vcpu; int i; struct kvm_kpit_state *ps = &pit->pit_state; - int inject = 0; - /* Try to inject pending interrupts when - * last one has been acked. + if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0)) + return; + + kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false); + kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false); + + /* + * Provides NMI watchdog support via Virtual Wire mode. + * The route is: PIT -> LVT0 in NMI mode. + * + * Note: Our Virtual Wire implementation does not follow + * the MP specification. We propagate a PIT interrupt to all + * VCPUs and only when LVT0 is in NMI mode. The interrupt can + * also be simultaneously delivered through PIC and IOAPIC. */ - spin_lock(&ps->inject_lock); - if (ps->irq_ack) { - ps->irq_ack = 0; - inject = 1; - } - spin_unlock(&ps->inject_lock); - if (inject) { - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false); - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false); - - /* - * Provides NMI watchdog support via Virtual Wire mode. - * The route is: PIT -> PIC -> LVT0 in NMI mode. - * - * Note: Our Virtual Wire implementation is simplified, only - * propagating PIT interrupts to all VCPUs when they have set - * LVT0 to NMI delivery. Other PIC interrupts are just sent to - * VCPU0, and only if its LVT0 is in EXTINT mode. - */ - if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0) - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_apic_nmi_wd_deliver(vcpu); - } + if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0) + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_apic_nmi_wd_deliver(vcpu); } static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) { struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer); - struct kvm_pit *pt = ps->kvm->arch.vpit; + struct kvm_pit *pt = pit_state_to_pit(ps); - if (ps->reinject || !atomic_read(&ps->pending)) { + if (atomic_read(&ps->reinject)) atomic_inc(&ps->pending); - queue_kthread_work(&pt->worker, &pt->expired); - } + + queue_kthread_work(&pt->worker, &pt->expired); if (ps->is_periodic) { hrtimer_add_expires_ns(&ps->timer, ps->period); @@ -329,30 +281,54 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } -static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) +static inline void kvm_pit_reset_reinject(struct kvm_pit *pit) { - struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; + atomic_set(&pit->pit_state.pending, 0); + atomic_set(&pit->pit_state.irq_ack, 1); +} + +void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) +{ + struct kvm_kpit_state *ps = &pit->pit_state; + struct kvm *kvm = pit->kvm; + + if (atomic_read(&ps->reinject) == reinject) + return; + + if (reinject) { + /* The initial state is preserved while ps->reinject == 0. */ + kvm_pit_reset_reinject(pit); + kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier); + kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + } else { + kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier); + kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + } + + atomic_set(&ps->reinject, reinject); +} + +static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period) +{ + struct kvm_kpit_state *ps = &pit->pit_state; + struct kvm *kvm = pit->kvm; s64 interval; if (!ioapic_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) return; - interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); + interval = mul_u64_u32_div(val, NSEC_PER_SEC, KVM_PIT_FREQ); pr_debug("create pit timer, interval is %llu nsec\n", interval); /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&ps->timer); - flush_kthread_work(&ps->pit->expired); + flush_kthread_work(&pit->expired); ps->period = interval; ps->is_periodic = is_period; - ps->timer.function = pit_timer_fn; - ps->kvm = ps->pit->kvm; - - atomic_set(&ps->pending, 0); - ps->irq_ack = 1; + kvm_pit_reset_reinject(pit); /* * Do not allow the guest to program periodic timers with small @@ -375,11 +351,9 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) HRTIMER_MODE_ABS); } -static void pit_load_count(struct kvm *kvm, int channel, u32 val) +static void pit_load_count(struct kvm_pit *pit, int channel, u32 val) { - struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; - - WARN_ON(!mutex_is_locked(&ps->lock)); + struct kvm_kpit_state *ps = &pit->pit_state; pr_debug("load_count val is %d, channel is %d\n", val, channel); @@ -404,29 +378,33 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) case 1: /* FIXME: enhance mode 4 precision */ case 4: - create_pit_timer(kvm, val, 0); + create_pit_timer(pit, val, 0); break; case 2: case 3: - create_pit_timer(kvm, val, 1); + create_pit_timer(pit, val, 1); break; default: - destroy_pit_timer(kvm->arch.vpit); + destroy_pit_timer(pit); } } -void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start) +void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, + int hpet_legacy_start) { u8 saved_mode; + + WARN_ON_ONCE(!mutex_is_locked(&pit->pit_state.lock)); + if (hpet_legacy_start) { /* save existing mode for later reenablement */ WARN_ON(channel != 0); - saved_mode = kvm->arch.vpit->pit_state.channels[0].mode; - kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */ - pit_load_count(kvm, channel, val); - kvm->arch.vpit->pit_state.channels[0].mode = saved_mode; + saved_mode = pit->pit_state.channels[0].mode; + pit->pit_state.channels[0].mode = 0xff; /* disable timer */ + pit_load_count(pit, channel, val); + pit->pit_state.channels[0].mode = saved_mode; } else { - pit_load_count(kvm, channel, val); + pit_load_count(pit, channel, val); } } @@ -452,7 +430,6 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu, { struct kvm_pit *pit = dev_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; - struct kvm *kvm = pit->kvm; int channel, access; struct kvm_kpit_channel_state *s; u32 val = *(u32 *) data; @@ -476,9 +453,9 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu, s = &pit_state->channels[channel]; if (val & (2 << channel)) { if (!(val & 0x20)) - pit_latch_count(kvm, channel); + pit_latch_count(pit, channel); if (!(val & 0x10)) - pit_latch_status(kvm, channel); + pit_latch_status(pit, channel); } } } else { @@ -486,7 +463,7 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu, s = &pit_state->channels[channel]; access = (val >> 4) & KVM_PIT_CHANNEL_MASK; if (access == 0) { - pit_latch_count(kvm, channel); + pit_latch_count(pit, channel); } else { s->rw_mode = access; s->read_state = access; @@ -503,17 +480,17 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu, switch (s->write_state) { default: case RW_STATE_LSB: - pit_load_count(kvm, addr, val); + pit_load_count(pit, addr, val); break; case RW_STATE_MSB: - pit_load_count(kvm, addr, val << 8); + pit_load_count(pit, addr, val << 8); break; case RW_STATE_WORD0: s->write_latch = val; s->write_state = RW_STATE_WORD1; break; case RW_STATE_WORD1: - pit_load_count(kvm, addr, s->write_latch | (val << 8)); + pit_load_count(pit, addr, s->write_latch | (val << 8)); s->write_state = RW_STATE_WORD0; break; } @@ -529,7 +506,6 @@ static int pit_ioport_read(struct kvm_vcpu *vcpu, { struct kvm_pit *pit = dev_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; - struct kvm *kvm = pit->kvm; int ret, count; struct kvm_kpit_channel_state *s; if (!pit_in_range(addr)) @@ -566,20 +542,20 @@ static int pit_ioport_read(struct kvm_vcpu *vcpu, switch (s->read_state) { default: case RW_STATE_LSB: - count = pit_get_count(kvm, addr); + count = pit_get_count(pit, addr); ret = count & 0xff; break; case RW_STATE_MSB: - count = pit_get_count(kvm, addr); + count = pit_get_count(pit, addr); ret = (count >> 8) & 0xff; break; case RW_STATE_WORD0: - count = pit_get_count(kvm, addr); + count = pit_get_count(pit, addr); ret = count & 0xff; s->read_state = RW_STATE_WORD1; break; case RW_STATE_WORD1: - count = pit_get_count(kvm, addr); + count = pit_get_count(pit, addr); ret = (count >> 8) & 0xff; s->read_state = RW_STATE_WORD0; break; @@ -600,14 +576,13 @@ static int speaker_ioport_write(struct kvm_vcpu *vcpu, { struct kvm_pit *pit = speaker_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; - struct kvm *kvm = pit->kvm; u32 val = *(u32 *) data; if (addr != KVM_SPEAKER_BASE_ADDRESS) return -EOPNOTSUPP; mutex_lock(&pit_state->lock); pit_state->speaker_data_on = (val >> 1) & 1; - pit_set_gate(kvm, 2, val & 1); + pit_set_gate(pit, 2, val & 1); mutex_unlock(&pit_state->lock); return 0; } @@ -618,7 +593,6 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu, { struct kvm_pit *pit = speaker_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; - struct kvm *kvm = pit->kvm; unsigned int refresh_clock; int ret; if (addr != KVM_SPEAKER_BASE_ADDRESS) @@ -628,8 +602,8 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu, refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; mutex_lock(&pit_state->lock); - ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) | - (pit_get_out(kvm, 2) << 5) | (refresh_clock << 4)); + ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) | + (pit_get_out(pit, 2) << 5) | (refresh_clock << 4)); if (len > sizeof(ret)) len = sizeof(ret); memcpy(data, (char *)&ret, len); @@ -637,33 +611,28 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu, return 0; } -void kvm_pit_reset(struct kvm_pit *pit) +static void kvm_pit_reset(struct kvm_pit *pit) { int i; struct kvm_kpit_channel_state *c; - mutex_lock(&pit->pit_state.lock); pit->pit_state.flags = 0; for (i = 0; i < 3; i++) { c = &pit->pit_state.channels[i]; c->mode = 0xff; c->gate = (i != 2); - pit_load_count(pit->kvm, i, 0); + pit_load_count(pit, i, 0); } - mutex_unlock(&pit->pit_state.lock); - atomic_set(&pit->pit_state.pending, 0); - pit->pit_state.irq_ack = 1; + kvm_pit_reset_reinject(pit); } static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) { struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); - if (!mask) { - atomic_set(&pit->pit_state.pending, 0); - pit->pit_state.irq_ack = 1; - } + if (!mask) + kvm_pit_reset_reinject(pit); } static const struct kvm_io_device_ops pit_dev_ops = { @@ -690,14 +659,10 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) return NULL; pit->irq_source_id = kvm_request_irq_source_id(kvm); - if (pit->irq_source_id < 0) { - kfree(pit); - return NULL; - } + if (pit->irq_source_id < 0) + goto fail_request; mutex_init(&pit->pit_state.lock); - mutex_lock(&pit->pit_state.lock); - spin_lock_init(&pit->pit_state.inject_lock); pid = get_pid(task_tgid(current)); pid_nr = pid_vnr(pid); @@ -706,36 +671,30 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) init_kthread_worker(&pit->worker); pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker, "kvm-pit/%d", pid_nr); - if (IS_ERR(pit->worker_task)) { - mutex_unlock(&pit->pit_state.lock); - kvm_free_irq_source_id(kvm, pit->irq_source_id); - kfree(pit); - return NULL; - } + if (IS_ERR(pit->worker_task)) + goto fail_kthread; + init_kthread_work(&pit->expired, pit_do_work); - kvm->arch.vpit = pit; pit->kvm = kvm; pit_state = &pit->pit_state; - pit_state->pit = pit; hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + pit_state->timer.function = pit_timer_fn; + pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; - kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); - pit_state->reinject = true; - mutex_unlock(&pit->pit_state.lock); + pit->mask_notifier.func = pit_mask_notifer; kvm_pit_reset(pit); - pit->mask_notifier.func = pit_mask_notifer; - kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + kvm_pit_set_reinject(pit, true); kvm_iodevice_init(&pit->dev, &pit_dev_ops); ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS, KVM_PIT_MEM_LENGTH, &pit->dev); if (ret < 0) - goto fail; + goto fail_register_pit; if (flags & KVM_PIT_SPEAKER_DUMMY) { kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); @@ -743,42 +702,35 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) KVM_SPEAKER_BASE_ADDRESS, 4, &pit->speaker_dev); if (ret < 0) - goto fail_unregister; + goto fail_register_speaker; } return pit; -fail_unregister: +fail_register_speaker: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); - -fail: - kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); - kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); - kvm_free_irq_source_id(kvm, pit->irq_source_id); +fail_register_pit: + kvm_pit_set_reinject(pit, false); kthread_stop(pit->worker_task); +fail_kthread: + kvm_free_irq_source_id(kvm, pit->irq_source_id); +fail_request: kfree(pit); return NULL; } void kvm_free_pit(struct kvm *kvm) { - struct hrtimer *timer; - - if (kvm->arch.vpit) { - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &kvm->arch.vpit->speaker_dev); - kvm_unregister_irq_mask_notifier(kvm, 0, - &kvm->arch.vpit->mask_notifier); - kvm_unregister_irq_ack_notifier(kvm, - &kvm->arch.vpit->pit_state.irq_ack_notifier); - mutex_lock(&kvm->arch.vpit->pit_state.lock); - timer = &kvm->arch.vpit->pit_state.timer; - hrtimer_cancel(timer); - flush_kthread_work(&kvm->arch.vpit->expired); - kthread_stop(kvm->arch.vpit->worker_task); - kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); - kfree(kvm->arch.vpit); + struct kvm_pit *pit = kvm->arch.vpit; + + if (pit) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); + kvm_pit_set_reinject(pit, false); + hrtimer_cancel(&pit->pit_state.timer); + flush_kthread_work(&pit->expired); + kthread_stop(pit->worker_task); + kvm_free_irq_source_id(kvm, pit->irq_source_id); + kfree(pit); } } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index c84990b42b5b..2f5af0798326 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -22,19 +22,18 @@ struct kvm_kpit_channel_state { }; struct kvm_kpit_state { + /* All members before "struct mutex lock" are protected by the lock. */ struct kvm_kpit_channel_state channels[3]; u32 flags; bool is_periodic; s64 period; /* unit: ns */ struct hrtimer timer; - atomic_t pending; /* accumulated triggered timers */ - bool reinject; - struct kvm *kvm; u32 speaker_data_on; + struct mutex lock; - struct kvm_pit *pit; - spinlock_t inject_lock; - unsigned long irq_ack; + atomic_t reinject; + atomic_t pending; /* accumulated triggered timers */ + atomic_t irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; }; @@ -57,9 +56,11 @@ struct kvm_pit { #define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 #define KVM_PIT_CHANNEL_MASK 0x3 -void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); void kvm_free_pit(struct kvm *kvm); -void kvm_pit_reset(struct kvm_pit *pit); + +void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, + int hpet_legacy_start); +void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject); #endif diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 1facfd60b04a..9db47090ead0 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -94,7 +94,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) { ioapic->rtc_status.pending_eoi = 0; - bitmap_zero(ioapic->rtc_status.dest_map, KVM_MAX_VCPUS); + bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPUS); } static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); @@ -117,16 +117,16 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) return; new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector); - old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); + old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); if (new_val == old_val) return; if (new_val) { - __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); + __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); ioapic->rtc_status.pending_eoi++; } else { - __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); + __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); ioapic->rtc_status.pending_eoi--; rtc_status_pending_eoi_check_valid(ioapic); } @@ -156,7 +156,8 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu) { - if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map)) { + if (test_and_clear_bit(vcpu->vcpu_id, + ioapic->rtc_status.dest_map.map)) { --ioapic->rtc_status.pending_eoi; rtc_status_pending_eoi_check_valid(ioapic); } @@ -236,10 +237,17 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr) void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; + struct dest_map *dest_map = &ioapic->rtc_status.dest_map; union kvm_ioapic_redirect_entry *e; int index; spin_lock(&ioapic->lock); + + /* Make sure we see any missing RTC EOI */ + if (test_bit(vcpu->vcpu_id, dest_map->map)) + __set_bit(dest_map->vectors[vcpu->vcpu_id], + ioapic_handled_vectors); + for (index = 0; index < IOAPIC_NUM_PINS; index++) { e = &ioapic->redirtbl[index]; if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG || @@ -346,7 +354,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) */ BUG_ON(ioapic->rtc_status.pending_eoi != 0); ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, - ioapic->rtc_status.dest_map); + &ioapic->rtc_status.dest_map); ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret); } else ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL); @@ -407,8 +415,14 @@ static void kvm_ioapic_eoi_inject_work(struct work_struct *work) static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, struct kvm_ioapic *ioapic, int vector, int trigger_mode) { - int i; + struct dest_map *dest_map = &ioapic->rtc_status.dest_map; struct kvm_lapic *apic = vcpu->arch.apic; + int i; + + /* RTC special handling */ + if (test_bit(vcpu->vcpu_id, dest_map->map) && + vector == dest_map->vectors[vcpu->vcpu_id]) + rtc_irq_eoi(ioapic, vcpu); for (i = 0; i < IOAPIC_NUM_PINS; i++) { union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; @@ -416,8 +430,6 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, if (ent->fields.vector != vector) continue; - if (i == RTC_GSI) - rtc_irq_eoi(ioapic, vcpu); /* * We are dropping lock while calling ack notifiers because ack * notifier callbacks for assigned devices call into IOAPIC diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 2d16dc251d81..7d2692a49657 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -40,9 +40,21 @@ struct kvm_vcpu; #define RTC_GSI -1U #endif +struct dest_map { + /* vcpu bitmap where IRQ has been sent */ + DECLARE_BITMAP(map, KVM_MAX_VCPUS); + + /* + * Vector sent to a given vcpu, only valid when + * the vcpu's bit in map is set + */ + u8 vectors[KVM_MAX_VCPUS]; +}; + + struct rtc_status { int pending_eoi; - DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS); + struct dest_map dest_map; }; union kvm_ioapic_redirect_entry { @@ -118,7 +130,8 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, int level, bool line_status); void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, unsigned long *dest_map); + struct kvm_lapic_irq *irq, + struct dest_map *dest_map); int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 3982b479bb5f..95fcc7b13866 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -33,7 +33,10 @@ */ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - return apic_has_pending_timer(vcpu); + if (lapic_in_kernel(vcpu)) + return apic_has_pending_timer(vcpu); + + return 0; } EXPORT_SYMBOL(kvm_cpu_has_pending_timer); @@ -137,8 +140,8 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { - kvm_inject_apic_timer_irqs(vcpu); - /* TODO: PIT, RTC etc. */ + if (lapic_in_kernel(vcpu)) + kvm_inject_apic_timer_irqs(vcpu); } EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index ae5c78f2337d..61ebdc13a29a 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -109,14 +109,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm) return ret; } -static inline int lapic_in_kernel(struct kvm_vcpu *vcpu) -{ - /* Same as irqchip_in_kernel(vcpu->kvm), but with less - * pointer chasing and no unnecessary memory barriers. - */ - return vcpu->arch.apic != NULL; -} - void kvm_pic_reset(struct kvm_kpic_state *s); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 8fc89efb5250..54ead79e444b 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -34,6 +34,7 @@ #include "lapic.h" #include "hyperv.h" +#include "x86.h" static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, @@ -53,10 +54,12 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, } int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, unsigned long *dest_map) + struct kvm_lapic_irq *irq, struct dest_map *dest_map) { int i, r = -1; struct kvm_vcpu *vcpu, *lowest = NULL; + unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + unsigned int dest_vcpus = 0; if (irq->dest_mode == 0 && irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { @@ -67,6 +70,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) return r; + memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); + kvm_for_each_vcpu(i, vcpu, kvm) { if (!kvm_apic_present(vcpu)) continue; @@ -80,13 +85,25 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, r = 0; r += kvm_apic_set_irq(vcpu, irq, dest_map); } else if (kvm_lapic_enabled(vcpu)) { - if (!lowest) - lowest = vcpu; - else if (kvm_apic_compare_prio(vcpu, lowest) < 0) - lowest = vcpu; + if (!kvm_vector_hashing_enabled()) { + if (!lowest) + lowest = vcpu; + else if (kvm_apic_compare_prio(vcpu, lowest) < 0) + lowest = vcpu; + } else { + __set_bit(i, dest_vcpu_bitmap); + dest_vcpus++; + } } } + if (dest_vcpus != 0) { + int idx = kvm_vector_to_index(irq->vector, dest_vcpus, + dest_vcpu_bitmap, KVM_MAX_VCPUS); + + lowest = kvm_get_vcpu(kvm, idx); + } + if (lowest) r = kvm_apic_set_irq(lowest, irq, dest_map); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 36591faed13b..443d2a57ad3d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -281,7 +281,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *feat; u32 v = APIC_VERSION; - if (!kvm_vcpu_has_lapic(vcpu)) + if (!lapic_in_kernel(vcpu)) return; feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); @@ -475,26 +475,20 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { - int highest_irr; - /* This may race with setting of irr in __apic_accept_irq() and * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq * will cause vmexit immediately and the value will be recalculated * on the next vmentry. */ - if (!kvm_vcpu_has_lapic(vcpu)) - return 0; - highest_irr = apic_find_highest_irr(vcpu->arch.apic); - - return highest_irr; + return apic_find_highest_irr(vcpu->arch.apic); } static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode, - unsigned long *dest_map); + struct dest_map *dest_map); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, - unsigned long *dest_map) + struct dest_map *dest_map) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -675,8 +669,33 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, } } +int kvm_vector_to_index(u32 vector, u32 dest_vcpus, + const unsigned long *bitmap, u32 bitmap_size) +{ + u32 mod; + int i, idx = -1; + + mod = vector % dest_vcpus; + + for (i = 0; i <= mod; i++) { + idx = find_next_bit(bitmap, bitmap_size, idx + 1); + BUG_ON(idx == bitmap_size); + } + + return idx; +} + +static void kvm_apic_disabled_lapic_found(struct kvm *kvm) +{ + if (!kvm->arch.disabled_lapic_found) { + kvm->arch.disabled_lapic_found = true; + printk(KERN_INFO + "Disabled LAPIC found during irq injection\n"); + } +} + bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map) + struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map) { struct kvm_apic_map *map; unsigned long bitmap = 1; @@ -727,21 +746,42 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, dst = map->logical_map[cid]; - if (kvm_lowest_prio_delivery(irq)) { + if (!kvm_lowest_prio_delivery(irq)) + goto set_irq; + + if (!kvm_vector_hashing_enabled()) { int l = -1; for_each_set_bit(i, &bitmap, 16) { if (!dst[i]) continue; if (l < 0) l = i; - else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0) + else if (kvm_apic_compare_prio(dst[i]->vcpu, + dst[l]->vcpu) < 0) l = i; } - bitmap = (l >= 0) ? 1 << l : 0; + } else { + int idx; + unsigned int dest_vcpus; + + dest_vcpus = hweight16(bitmap); + if (dest_vcpus == 0) + goto out; + + idx = kvm_vector_to_index(irq->vector, + dest_vcpus, &bitmap, 16); + + if (!dst[idx]) { + kvm_apic_disabled_lapic_found(kvm); + goto out; + } + + bitmap = (idx >= 0) ? 1 << idx : 0; } } +set_irq: for_each_set_bit(i, &bitmap, 16) { if (!dst[i]) continue; @@ -754,6 +794,20 @@ out: return ret; } +/* + * This routine tries to handler interrupts in posted mode, here is how + * it deals with different cases: + * - For single-destination interrupts, handle it in posted mode + * - Else if vector hashing is enabled and it is a lowest-priority + * interrupt, handle it in posted mode and use the following mechanism + * to find the destinaiton vCPU. + * 1. For lowest-priority interrupts, store all the possible + * destination vCPUs in an array. + * 2. Use "guest vector % max number of destination vCPUs" to find + * the right destination vCPU in the array for the lowest-priority + * interrupt. + * - Otherwise, use remapped mode to inject the interrupt. + */ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu) { @@ -795,16 +849,37 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, if (cid >= ARRAY_SIZE(map->logical_map)) goto out; - for_each_set_bit(i, &bitmap, 16) { - dst = map->logical_map[cid][i]; - if (++r == 2) + if (kvm_vector_hashing_enabled() && + kvm_lowest_prio_delivery(irq)) { + int idx; + unsigned int dest_vcpus; + + dest_vcpus = hweight16(bitmap); + if (dest_vcpus == 0) goto out; - } - if (dst && kvm_apic_present(dst->vcpu)) + idx = kvm_vector_to_index(irq->vector, dest_vcpus, + &bitmap, 16); + + dst = map->logical_map[cid][idx]; + if (!dst) { + kvm_apic_disabled_lapic_found(kvm); + goto out; + } + *dest_vcpu = dst->vcpu; - else - goto out; + } else { + for_each_set_bit(i, &bitmap, 16) { + dst = map->logical_map[cid][i]; + if (++r == 2) + goto out; + } + + if (dst && kvm_apic_present(dst->vcpu)) + *dest_vcpu = dst->vcpu; + else + goto out; + } } ret = true; @@ -819,7 +894,7 @@ out: */ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode, - unsigned long *dest_map) + struct dest_map *dest_map) { int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; @@ -839,8 +914,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, result = 1; - if (dest_map) - __set_bit(vcpu->vcpu_id, dest_map); + if (dest_map) { + __set_bit(vcpu->vcpu_id, dest_map->map); + dest_map->vectors[vcpu->vcpu_id] = vector; + } if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { if (trig_mode) @@ -1195,7 +1272,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic) static void apic_timer_expired(struct kvm_lapic *apic) { struct kvm_vcpu *vcpu = apic->vcpu; - wait_queue_head_t *q = &vcpu->wq; + struct swait_queue_head *q = &vcpu->wq; struct kvm_timer *ktimer = &apic->lapic_timer; if (atomic_read(&apic->lapic_timer.pending)) @@ -1204,8 +1281,8 @@ static void apic_timer_expired(struct kvm_lapic *apic) atomic_inc(&apic->lapic_timer.pending); kvm_set_pending_timer(vcpu); - if (waitqueue_active(q)) - wake_up_interruptible(q); + if (swait_active(q)) + swake_up(q); if (apic_lvtt_tscdeadline(apic)) ktimer->expired_tscdeadline = ktimer->tscdeadline; @@ -1239,7 +1316,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; u64 guest_tsc, tsc_deadline; - if (!kvm_vcpu_has_lapic(vcpu)) + if (!lapic_in_kernel(vcpu)) return; if (apic->lapic_timer.expired_tscdeadline == 0) @@ -1515,8 +1592,7 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) { - if (kvm_vcpu_has_lapic(vcpu)) - apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); + apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); } EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); @@ -1566,7 +1642,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || + if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) return 0; @@ -1577,7 +1653,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || + if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) return; @@ -1590,9 +1666,6 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_vcpu_has_lapic(vcpu)) - return; - apic_set_tpr(apic, ((cr8 & 0x0f) << 4) | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4)); } @@ -1601,9 +1674,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { u64 tpr; - if (!kvm_vcpu_has_lapic(vcpu)) - return 0; - tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI); return (tpr & 0xf0) >> 4; @@ -1728,8 +1798,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) && - apic_lvt_enabled(apic, APIC_LVTT)) + if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT)) return atomic_read(&apic->lapic_timer.pending); return 0; @@ -1826,7 +1895,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; int highest_irr; - if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic)) + if (!apic_enabled(apic)) return -1; apic_update_ppr(apic); @@ -1854,9 +1923,6 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_vcpu_has_lapic(vcpu)) - return; - if (atomic_read(&apic->lapic_timer.pending) > 0) { kvm_apic_local_deliver(apic, APIC_LVTT); if (apic_lvtt_tscdeadline(apic)) @@ -1932,7 +1998,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) { struct hrtimer *timer; - if (!kvm_vcpu_has_lapic(vcpu)) + if (!lapic_in_kernel(vcpu)) return; timer = &vcpu->arch.apic->lapic_timer.timer; @@ -2105,7 +2171,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_vcpu_has_lapic(vcpu)) + if (!lapic_in_kernel(vcpu)) return 1; /* if this is ICR write vector before command */ @@ -2119,7 +2185,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) struct kvm_lapic *apic = vcpu->arch.apic; u32 low, high = 0; - if (!kvm_vcpu_has_lapic(vcpu)) + if (!lapic_in_kernel(vcpu)) return 1; if (apic_reg_read(apic, reg, 4, &low)) @@ -2151,7 +2217,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) u8 sipi_vector; unsigned long pe; - if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events) + if (!lapic_in_kernel(vcpu) || !apic->pending_events) return; /* diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 41bdb35b4b67..f71183e502ee 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -42,6 +42,9 @@ struct kvm_lapic { unsigned long pending_events; unsigned int sipi_vector; }; + +struct dest_map; + int kvm_create_lapic(struct kvm_vcpu *vcpu); void kvm_free_lapic(struct kvm_vcpu *vcpu); @@ -60,11 +63,11 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu); void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, - unsigned long *dest_map); + struct dest_map *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map); + struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info); @@ -103,7 +106,7 @@ static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off) extern struct static_key kvm_no_apic_vcpu; -static inline bool kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu) +static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu) { if (static_key_false(&kvm_no_apic_vcpu)) return vcpu->arch.apic; @@ -130,7 +133,7 @@ static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic) static inline bool kvm_apic_present(struct kvm_vcpu *vcpu) { - return kvm_vcpu_has_lapic(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic); + return lapic_in_kernel(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic); } static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu) @@ -150,7 +153,7 @@ static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu) static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) { - return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events; + return lapic_in_kernel(vcpu) && vcpu->arch.apic->pending_events; } static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) @@ -161,7 +164,7 @@ static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) { - return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } static inline int kvm_apic_id(struct kvm_lapic *apic) @@ -175,4 +178,6 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu); bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); +int kvm_vector_to_index(u32 vector, u32 dest_vcpus, + const unsigned long *bitmap, u32 bitmap_size); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 95a955de5964..c512f095cdac 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -41,6 +41,7 @@ #include <asm/cmpxchg.h> #include <asm/io.h> #include <asm/vmx.h> +#include <asm/kvm_page_track.h> /* * When setting this variable to true it enables Two-Dimensional-Paging @@ -776,62 +777,85 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } +static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, + gfn_t gfn, int count) +{ + struct kvm_lpage_info *linfo; + int i; + + for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { + linfo = lpage_info_slot(gfn, slot, i); + linfo->disallow_lpage += count; + WARN_ON(linfo->disallow_lpage < 0); + } +} + +void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +{ + update_gfn_disallow_lpage_count(slot, gfn, 1); +} + +void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +{ + update_gfn_disallow_lpage_count(slot, gfn, -1); +} + static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; gfn_t gfn; - int i; + kvm->arch.indirect_shadow_pages++; gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); - for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - linfo = lpage_info_slot(gfn, slot, i); - linfo->write_count += 1; - } - kvm->arch.indirect_shadow_pages++; + + /* the non-leaf shadow pages are keeping readonly. */ + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + return kvm_slot_page_track_add_page(kvm, slot, gfn, + KVM_PAGE_TRACK_WRITE); + + kvm_mmu_gfn_disallow_lpage(slot, gfn); } static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; gfn_t gfn; - int i; + kvm->arch.indirect_shadow_pages--; gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); - for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - linfo = lpage_info_slot(gfn, slot, i); - linfo->write_count -= 1; - WARN_ON(linfo->write_count < 0); - } - kvm->arch.indirect_shadow_pages--; + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + return kvm_slot_page_track_remove_page(kvm, slot, gfn, + KVM_PAGE_TRACK_WRITE); + + kvm_mmu_gfn_allow_lpage(slot, gfn); } -static int __has_wrprotected_page(gfn_t gfn, int level, - struct kvm_memory_slot *slot) +static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, + struct kvm_memory_slot *slot) { struct kvm_lpage_info *linfo; if (slot) { linfo = lpage_info_slot(gfn, slot, level); - return linfo->write_count; + return !!linfo->disallow_lpage; } - return 1; + return true; } -static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level) +static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn, + int level) { struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - return __has_wrprotected_page(gfn, level, slot); + return __mmu_gfn_lpage_is_disallowed(gfn, level, slot); } static int host_mapping_level(struct kvm *kvm, gfn_t gfn) @@ -897,7 +921,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, max_level = min(kvm_x86_ops->get_lpage_level(), host_level); for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (__has_wrprotected_page(large_gfn, level, slot)) + if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot)) break; return level - 1; @@ -1323,23 +1347,29 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) +bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, + struct kvm_memory_slot *slot, u64 gfn) { - struct kvm_memory_slot *slot; struct kvm_rmap_head *rmap_head; int i; bool write_protected = false; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { rmap_head = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(vcpu->kvm, rmap_head, true); + write_protected |= __rmap_write_protect(kvm, rmap_head, true); } return write_protected; } +static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) +{ + struct kvm_memory_slot *slot; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn); +} + static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { u64 *sptep; @@ -1754,7 +1784,7 @@ static void mark_unsync(u64 *spte) static int nonpaging_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - return 1; + return 0; } static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) @@ -1840,13 +1870,16 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, return nr_unsync_leaf; } +#define INVALID_INDEX (-1) + static int mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec) { + pvec->nr = 0; if (!sp->unsync_children) return 0; - mmu_pages_add(pvec, sp, 0); + mmu_pages_add(pvec, sp, INVALID_INDEX); return __mmu_unsync_walk(sp, pvec); } @@ -1883,37 +1916,35 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, if ((_sp)->role.direct || (_sp)->role.invalid) {} else /* @sp->gfn should be write-protected at the call site */ -static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct list_head *invalid_list, bool clear_unsync) +static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct list_head *invalid_list) { if (sp->role.cr4_pae != !!is_pae(vcpu)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); - return 1; + return false; } - if (clear_unsync) - kvm_unlink_unsync_page(vcpu->kvm, sp); - - if (vcpu->arch.mmu.sync_page(vcpu, sp)) { + if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); - return 1; + return false; } - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - return 0; + return true; } -static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) +static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, + struct list_head *invalid_list, + bool remote_flush, bool local_flush) { - LIST_HEAD(invalid_list); - int ret; - - ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); - if (ret) - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + if (!list_empty(invalid_list)) { + kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list); + return; + } - return ret; + if (remote_flush) + kvm_flush_remote_tlbs(vcpu->kvm); + else if (local_flush) + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } #ifdef CONFIG_KVM_MMU_AUDIT @@ -1923,46 +1954,38 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } static void mmu_audit_disable(void) { } #endif -static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, +static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { - return __kvm_sync_page(vcpu, sp, invalid_list, true); + kvm_unlink_unsync_page(vcpu->kvm, sp); + return __kvm_sync_page(vcpu, sp, invalid_list); } /* @gfn should be write-protected at the call site */ -static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) +static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, + struct list_head *invalid_list) { struct kvm_mmu_page *s; - LIST_HEAD(invalid_list); - bool flush = false; + bool ret = false; for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { if (!s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); - kvm_unlink_unsync_page(vcpu->kvm, s); - if ((s->role.cr4_pae != !!is_pae(vcpu)) || - (vcpu->arch.mmu.sync_page(vcpu, s))) { - kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); - continue; - } - flush = true; + ret |= kvm_sync_page(vcpu, s, invalid_list); } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - if (flush) - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + return ret; } struct mmu_page_path { - struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; - unsigned int idx[PT64_ROOT_LEVEL-1]; + struct kvm_mmu_page *parent[PT64_ROOT_LEVEL]; + unsigned int idx[PT64_ROOT_LEVEL]; }; #define for_each_sp(pvec, sp, parents, i) \ - for (i = mmu_pages_next(&pvec, &parents, -1), \ - sp = pvec.page[i].sp; \ + for (i = mmu_pages_first(&pvec, &parents); \ i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ i = mmu_pages_next(&pvec, &parents, i)) @@ -1974,19 +1997,43 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec, for (n = i+1; n < pvec->nr; n++) { struct kvm_mmu_page *sp = pvec->page[n].sp; + unsigned idx = pvec->page[n].idx; + int level = sp->role.level; - if (sp->role.level == PT_PAGE_TABLE_LEVEL) { - parents->idx[0] = pvec->page[n].idx; - return n; - } + parents->idx[level-1] = idx; + if (level == PT_PAGE_TABLE_LEVEL) + break; - parents->parent[sp->role.level-2] = sp; - parents->idx[sp->role.level-1] = pvec->page[n].idx; + parents->parent[level-2] = sp; } return n; } +static int mmu_pages_first(struct kvm_mmu_pages *pvec, + struct mmu_page_path *parents) +{ + struct kvm_mmu_page *sp; + int level; + + if (pvec->nr == 0) + return 0; + + WARN_ON(pvec->page[0].idx != INVALID_INDEX); + + sp = pvec->page[0].sp; + level = sp->role.level; + WARN_ON(level == PT_PAGE_TABLE_LEVEL); + + parents->parent[level-2] = sp; + + /* Also set up a sentinel. Further entries in pvec are all + * children of sp, so this element is never overwritten. + */ + parents->parent[level-1] = NULL; + return mmu_pages_next(pvec, parents, 0); +} + static void mmu_pages_clear_parents(struct mmu_page_path *parents) { struct kvm_mmu_page *sp; @@ -1994,22 +2041,14 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents) do { unsigned int idx = parents->idx[level]; - sp = parents->parent[level]; if (!sp) return; + WARN_ON(idx == INVALID_INDEX); clear_unsync_child_bit(sp, idx); level++; - } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children); -} - -static void kvm_mmu_pages_init(struct kvm_mmu_page *parent, - struct mmu_page_path *parents, - struct kvm_mmu_pages *pvec) -{ - parents->parent[parent->role.level-1] = NULL; - pvec->nr = 0; + } while (!sp->unsync_children); } static void mmu_sync_children(struct kvm_vcpu *vcpu, @@ -2020,30 +2059,36 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, struct mmu_page_path parents; struct kvm_mmu_pages pages; LIST_HEAD(invalid_list); + bool flush = false; - kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { bool protected = false; for_each_sp(pages, sp, parents, i) protected |= rmap_write_protect(vcpu, sp->gfn); - if (protected) + if (protected) { kvm_flush_remote_tlbs(vcpu->kvm); + flush = false; + } for_each_sp(pages, sp, parents, i) { - kvm_sync_page(vcpu, sp, &invalid_list); + flush |= kvm_sync_page(vcpu, sp, &invalid_list); mmu_pages_clear_parents(&parents); } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - cond_resched_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_pages_init(parent, &parents, &pages); + if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) { + kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + cond_resched_lock(&vcpu->kvm->mmu_lock); + flush = false; + } } + + kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); } static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) { - sp->write_flooding_count = 0; + atomic_set(&sp->write_flooding_count, 0); } static void clear_sp_write_flooding_count(u64 *spte) @@ -2069,6 +2114,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned quadrant; struct kvm_mmu_page *sp; bool need_sync = false; + bool flush = false; + LIST_HEAD(invalid_list); role = vcpu->arch.mmu.base_role; role.level = level; @@ -2092,8 +2139,16 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (sp->role.word != role.word) continue; - if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) - break; + if (sp->unsync) { + /* The page is good, but __kvm_sync_page might still end + * up zapping it. If so, break in order to rebuild it. + */ + if (!__kvm_sync_page(vcpu, sp, &invalid_list)) + break; + + WARN_ON(!list_empty(&invalid_list)); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } if (sp->unsync_children) kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); @@ -2112,16 +2167,24 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, hlist_add_head(&sp->hash_link, &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); if (!direct) { - if (rmap_write_protect(vcpu, gfn)) + /* + * we should do write protection before syncing pages + * otherwise the content of the synced shadow page may + * be inconsistent with guest page table. + */ + account_shadowed(vcpu->kvm, sp); + if (level == PT_PAGE_TABLE_LEVEL && + rmap_write_protect(vcpu, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); - if (level > PT_PAGE_TABLE_LEVEL && need_sync) - kvm_sync_pages(vcpu, gfn); - account_shadowed(vcpu->kvm, sp); + if (level > PT_PAGE_TABLE_LEVEL && need_sync) + flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; clear_page(sp->spt); trace_kvm_mmu_get_page(sp, true); + + kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); return sp; } @@ -2269,7 +2332,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm, if (parent->role.level == PT_PAGE_TABLE_LEVEL) return 0; - kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { struct kvm_mmu_page *sp; @@ -2278,7 +2340,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm, mmu_pages_clear_parents(&parents); zapped++; } - kvm_mmu_pages_init(parent, &parents, &pages); } return zapped; @@ -2354,8 +2415,8 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, if (list_empty(&kvm->arch.active_mmu_pages)) return false; - sp = list_entry(kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); + sp = list_last_entry(&kvm->arch.active_mmu_pages, + struct kvm_mmu_page, link); kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); return true; @@ -2408,7 +2469,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); -static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { trace_kvm_mmu_unsync_page(sp); ++vcpu->kvm->stat.mmu_unsync; @@ -2417,37 +2478,26 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) kvm_mmu_mark_parents_unsync(sp); } -static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) +static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync) { - struct kvm_mmu_page *s; - - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { - if (s->unsync) - continue; - WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); - __kvm_unsync_page(vcpu, s); - } -} + struct kvm_mmu_page *sp; -static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, - bool can_unsync) -{ - struct kvm_mmu_page *s; - bool need_unsync = false; + if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + return true; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (!can_unsync) - return 1; + return true; - if (s->role.level != PT_PAGE_TABLE_LEVEL) - return 1; + if (sp->unsync) + continue; - if (!s->unsync) - need_unsync = true; + WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL); + kvm_unsync_page(vcpu, sp); } - if (need_unsync) - kvm_unsync_pages(vcpu, gfn); - return 0; + + return false; } static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) @@ -2503,7 +2553,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * be fixed if guest refault. */ if (level > PT_PAGE_TABLE_LEVEL && - has_wrprotected_page(vcpu, gfn, level)) + mmu_gfn_lpage_is_disallowed(vcpu, gfn, level)) goto done; spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; @@ -2768,7 +2818,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompound(pfn_to_page(pfn)) && - !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) { + !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; /* * mmu_notifier_retry was successful and we hold the @@ -2796,20 +2846,16 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, kvm_pfn_t pfn, unsigned access, int *ret_val) { - bool ret = true; - /* The pfn is invalid, report the error! */ if (unlikely(is_error_pfn(pfn))) { *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); - goto exit; + return true; } if (unlikely(is_noslot_pfn(pfn))) vcpu_cache_mmio_info(vcpu, gva, gfn, access); - ret = false; -exit: - return ret; + return false; } static bool page_fault_can_be_fast(u32 error_code) @@ -3273,7 +3319,7 @@ static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level) return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level); } -static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) +static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) { if (direct) return vcpu_match_mmio_gpa(vcpu, addr); @@ -3332,7 +3378,7 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) u64 spte; bool reserved; - if (quickly_check_mmio_pf(vcpu, addr, direct)) + if (mmio_info_in_cache(vcpu, addr, direct)) return RET_MMIO_PF_EMULATE; reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); @@ -3362,20 +3408,53 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) } EXPORT_SYMBOL_GPL(handle_mmio_page_fault); +static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, + u32 error_code, gfn_t gfn) +{ + if (unlikely(error_code & PFERR_RSVD_MASK)) + return false; + + if (!(error_code & PFERR_PRESENT_MASK) || + !(error_code & PFERR_WRITE_MASK)) + return false; + + /* + * guest is writing the page which is write tracked which can + * not be fixed by page fault handler. + */ + if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + return true; + + return false; +} + +static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) +{ + struct kvm_shadow_walk_iterator iterator; + u64 spte; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + + walk_shadow_page_lockless_begin(vcpu); + for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { + clear_sp_write_flooding_count(iterator.sptep); + if (!is_shadow_present_pte(spte)) + break; + } + walk_shadow_page_lockless_end(vcpu); +} + static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, bool prefault) { - gfn_t gfn; + gfn_t gfn = gva >> PAGE_SHIFT; int r; pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); - if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, gva, true); - - if (likely(r != RET_MMIO_PF_INVALID)) - return r; - } + if (page_fault_handle_page_track(vcpu, error_code, gfn)) + return 1; r = mmu_topup_memory_caches(vcpu); if (r) @@ -3383,7 +3462,6 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - gfn = gva >> PAGE_SHIFT; return nonpaging_map(vcpu, gva & PAGE_MASK, error_code, gfn, prefault); @@ -3460,12 +3538,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, gpa, true); - - if (likely(r != RET_MMIO_PF_INVALID)) - return r; - } + if (page_fault_handle_page_track(vcpu, error_code, gfn)) + return 1; r = mmu_topup_memory_caches(vcpu); if (r) @@ -3558,13 +3632,24 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, return false; } -static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) +static inline bool is_last_gpte(struct kvm_mmu *mmu, + unsigned level, unsigned gpte) { - unsigned index; + /* + * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set + * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means + * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. + */ + gpte |= level - PT_PAGE_TABLE_LEVEL - 1; - index = level - 1; - index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2); - return mmu->last_pte_bitmap & (1 << index); + /* + * The RHS has bit 7 set iff level < mmu->last_nonleaf_level. + * If it is clear, there are no large pages at this level, so clear + * PT_PAGE_SIZE_MASK in gpte if that is the case. + */ + gpte &= level - mmu->last_nonleaf_level; + + return gpte & PT_PAGE_SIZE_MASK; } #define PTTYPE_EPT 18 /* arbitrary */ @@ -3721,13 +3806,15 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { + bool uses_nx = context->nx || context->base_role.smep_andnot_wp; + /* * Passing "true" to the last argument is okay; it adds a check * on bit 8 of the SPTEs which KVM doesn't use anyway. */ __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, boot_cpu_data.x86_phys_bits, - context->shadow_root_level, context->nx, + context->shadow_root_level, uses_nx, guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), true); } @@ -3836,22 +3923,13 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, } } -static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { - u8 map; - unsigned level, root_level = mmu->root_level; - const unsigned ps_set_index = 1 << 2; /* bit 2 of index: ps */ - - if (root_level == PT32E_ROOT_LEVEL) - --root_level; - /* PT_PAGE_TABLE_LEVEL always terminates */ - map = 1 | (1 << ps_set_index); - for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) { - if (level <= PT_PDPE_LEVEL - && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu))) - map |= 1 << (ps_set_index | (level - 1)); - } - mmu->last_pte_bitmap = map; + unsigned root_level = mmu->root_level; + + mmu->last_nonleaf_level = root_level; + if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu)) + mmu->last_nonleaf_level++; } static void paging64_init_context_common(struct kvm_vcpu *vcpu, @@ -3863,7 +3941,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); - update_last_pte_bitmap(vcpu, context); + update_last_nonleaf_level(vcpu, context); MMU_WARN_ON(!is_pae(vcpu)); context->page_fault = paging64_page_fault; @@ -3890,7 +3968,7 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); - update_last_pte_bitmap(vcpu, context); + update_last_nonleaf_level(vcpu, context); context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; @@ -3948,7 +4026,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, context, false); - update_last_pte_bitmap(vcpu, context); + update_last_nonleaf_level(vcpu, context); reset_tdp_shadow_zero_bits_mask(vcpu, context); } @@ -4054,7 +4132,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, g_context, false); - update_last_pte_bitmap(vcpu, g_context); + update_last_nonleaf_level(vcpu, g_context); } static void init_kvm_mmu(struct kvm_vcpu *vcpu) @@ -4125,18 +4203,6 @@ static bool need_remote_flush(u64 old, u64 new) return (old & ~new & PT64_PERM_MASK) != 0; } -static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, - bool remote_flush, bool local_flush) -{ - if (zap_page) - return; - - if (remote_flush) - kvm_flush_remote_tlbs(vcpu->kvm); - else if (local_flush) - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); -} - static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, const u8 *new, int *bytes) { @@ -4186,7 +4252,8 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp) if (sp->role.level == PT_PAGE_TABLE_LEVEL) return false; - return ++sp->write_flooding_count >= 3; + atomic_inc(&sp->write_flooding_count); + return atomic_read(&sp->write_flooding_count) >= 3; } /* @@ -4248,15 +4315,15 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) return spte; } -void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) +static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; - bool remote_flush, local_flush, zap_page; + bool remote_flush, local_flush; union kvm_mmu_page_role mask = { }; mask.cr0_wp = 1; @@ -4273,7 +4340,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) return; - zap_page = remote_flush = local_flush = false; + remote_flush = local_flush = false; pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); @@ -4293,8 +4360,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || detect_write_flooding(sp)) { - zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, - &invalid_list); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_flooded; continue; } @@ -4316,8 +4382,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ++spte; } } - mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush); kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); spin_unlock(&vcpu->kvm->mmu_lock); } @@ -4354,32 +4419,34 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } -static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr) -{ - if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu)) - return vcpu_match_mmio_gpa(vcpu, addr); - - return vcpu_match_mmio_gva(vcpu, addr); -} - int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, void *insn, int insn_len) { int r, emulation_type = EMULTYPE_RETRY; enum emulation_result er; + bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu); + + if (unlikely(error_code & PFERR_RSVD_MASK)) { + r = handle_mmio_page_fault(vcpu, cr2, direct); + if (r == RET_MMIO_PF_EMULATE) { + emulation_type = 0; + goto emulate; + } + if (r == RET_MMIO_PF_RETRY) + return 1; + if (r < 0) + return r; + } r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); if (r < 0) - goto out; - - if (!r) { - r = 1; - goto out; - } + return r; + if (!r) + return 1; - if (is_mmio_page_fault(vcpu, cr2)) + if (mmio_info_in_cache(vcpu, cr2, direct)) emulation_type = 0; - +emulate: er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); switch (er) { @@ -4393,8 +4460,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, default: BUG(); } -out: - return r; } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); @@ -4463,6 +4528,21 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) init_kvm_mmu(vcpu); } +void kvm_mmu_init_vm(struct kvm *kvm) +{ + struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + + node->track_write = kvm_mmu_pte_write; + kvm_page_track_register_notifier(kvm, node); +} + +void kvm_mmu_uninit_vm(struct kvm *kvm) +{ + struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + + kvm_page_track_unregister_notifier(kvm, node); +} + /* The return value indicates if tlb flush on all vcpus is needed. */ typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 55ffb7b0f95e..58fe98a0a526 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -174,4 +174,9 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); + +void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, + struct kvm_memory_slot *slot, u64 gfn); #endif diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c new file mode 100644 index 000000000000..11f76436f74f --- /dev/null +++ b/arch/x86/kvm/page_track.c @@ -0,0 +1,222 @@ +/* + * Support KVM gust page tracking + * + * This feature allows us to track page access in guest. Currently, only + * write access is tracked. + * + * Copyright(C) 2015 Intel Corporation. + * + * Author: + * Xiao Guangrong <guangrong.xiao@linux.intel.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/kvm_host.h> +#include <asm/kvm_host.h> +#include <asm/kvm_page_track.h> + +#include "mmu.h" + +void kvm_page_track_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + int i; + + for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) + if (!dont || free->arch.gfn_track[i] != + dont->arch.gfn_track[i]) { + kvfree(free->arch.gfn_track[i]); + free->arch.gfn_track[i] = NULL; + } +} + +int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, + unsigned long npages) +{ + int i; + + for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { + slot->arch.gfn_track[i] = kvm_kvzalloc(npages * + sizeof(*slot->arch.gfn_track[i])); + if (!slot->arch.gfn_track[i]) + goto track_free; + } + + return 0; + +track_free: + kvm_page_track_free_memslot(slot, NULL); + return -ENOMEM; +} + +static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode) +{ + if (mode < 0 || mode >= KVM_PAGE_TRACK_MAX) + return false; + + return true; +} + +static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn, + enum kvm_page_track_mode mode, short count) +{ + int index, val; + + index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); + + val = slot->arch.gfn_track[mode][index]; + + if (WARN_ON(val + count < 0 || val + count > USHRT_MAX)) + return; + + slot->arch.gfn_track[mode][index] += count; +} + +/* + * add guest page to the tracking pool so that corresponding access on that + * page will be intercepted. + * + * It should be called under the protection both of mmu-lock and kvm->srcu + * or kvm->slots_lock. + * + * @kvm: the guest instance we are interested in. + * @slot: the @gfn belongs to. + * @gfn: the guest page. + * @mode: tracking mode, currently only write track is supported. + */ +void kvm_slot_page_track_add_page(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn, + enum kvm_page_track_mode mode) +{ + + if (WARN_ON(!page_track_mode_is_valid(mode))) + return; + + update_gfn_track(slot, gfn, mode, 1); + + /* + * new track stops large page mapping for the + * tracked page. + */ + kvm_mmu_gfn_disallow_lpage(slot, gfn); + + if (mode == KVM_PAGE_TRACK_WRITE) + if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn)) + kvm_flush_remote_tlbs(kvm); +} + +/* + * remove the guest page from the tracking pool which stops the interception + * of corresponding access on that page. It is the opposed operation of + * kvm_slot_page_track_add_page(). + * + * It should be called under the protection both of mmu-lock and kvm->srcu + * or kvm->slots_lock. + * + * @kvm: the guest instance we are interested in. + * @slot: the @gfn belongs to. + * @gfn: the guest page. + * @mode: tracking mode, currently only write track is supported. + */ +void kvm_slot_page_track_remove_page(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn, + enum kvm_page_track_mode mode) +{ + if (WARN_ON(!page_track_mode_is_valid(mode))) + return; + + update_gfn_track(slot, gfn, mode, -1); + + /* + * allow large page mapping for the tracked page + * after the tracker is gone. + */ + kvm_mmu_gfn_allow_lpage(slot, gfn); +} + +/* + * check if the corresponding access on the specified guest page is tracked. + */ +bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode) +{ + struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + int index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); + + if (WARN_ON(!page_track_mode_is_valid(mode))) + return false; + + return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); +} + +void kvm_page_track_init(struct kvm *kvm) +{ + struct kvm_page_track_notifier_head *head; + + head = &kvm->arch.track_notifier_head; + init_srcu_struct(&head->track_srcu); + INIT_HLIST_HEAD(&head->track_notifier_list); +} + +/* + * register the notifier so that event interception for the tracked guest + * pages can be received. + */ +void +kvm_page_track_register_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n) +{ + struct kvm_page_track_notifier_head *head; + + head = &kvm->arch.track_notifier_head; + + spin_lock(&kvm->mmu_lock); + hlist_add_head_rcu(&n->node, &head->track_notifier_list); + spin_unlock(&kvm->mmu_lock); +} + +/* + * stop receiving the event interception. It is the opposed operation of + * kvm_page_track_register_notifier(). + */ +void +kvm_page_track_unregister_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n) +{ + struct kvm_page_track_notifier_head *head; + + head = &kvm->arch.track_notifier_head; + + spin_lock(&kvm->mmu_lock); + hlist_del_rcu(&n->node); + spin_unlock(&kvm->mmu_lock); + synchronize_srcu(&head->track_srcu); +} + +/* + * Notify the node that write access is intercepted and write emulation is + * finished at this time. + * + * The node should figure out if the written page is the one that node is + * interested in by itself. + */ +void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, + int bytes) +{ + struct kvm_page_track_notifier_head *head; + struct kvm_page_track_notifier_node *n; + int idx; + + head = &vcpu->kvm->arch.track_notifier_head; + + if (hlist_empty(&head->track_notifier_list)) + return; + + idx = srcu_read_lock(&head->track_srcu); + hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) + if (n->track_write) + n->track_write(vcpu, gpa, new, bytes); + srcu_read_unlock(&head->track_srcu, idx); +} diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6c9fed957cce..e159a8185ad9 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -189,8 +189,11 @@ static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | ACC_USER_MASK; #else - access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; - access &= ~(gpte >> PT64_NX_SHIFT); + BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK); + BUILD_BUG_ON(ACC_EXEC_MASK != 1); + access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK); + /* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */ + access ^= (gpte >> PT64_NX_SHIFT); #endif return access; @@ -249,7 +252,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, return ret; kvm_vcpu_mark_page_dirty(vcpu, table_gfn); - walker->ptes[level] = pte; + walker->ptes[level - 1] = pte; } return 0; } @@ -702,24 +705,17 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); - if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, addr, mmu_is_nested(vcpu)); - if (likely(r != RET_MMIO_PF_INVALID)) - return r; - - /* - * page fault with PFEC.RSVD = 1 is caused by shadow - * page fault, should not be used to walk guest page - * table. - */ - error_code &= ~PFERR_RSVD_MASK; - }; - r = mmu_topup_memory_caches(vcpu); if (r) return r; /* + * If PFEC.RSVD is set, this is a shadow page fault. + * The bit needs to be cleared before walking guest page tables. + */ + error_code &= ~PFERR_RSVD_MASK; + + /* * Look up the guest pte for the faulting address. */ r = FNAME(walk_addr)(&walker, vcpu, addr, error_code); @@ -735,6 +731,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } + if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) { + shadow_page_table_clear_flood(vcpu, addr); + return 1; + } + vcpu->arch.write_fault_to_shadow_pgtable = false; is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, @@ -945,7 +946,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, sizeof(pt_element_t))) - return -EINVAL; + return 0; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { vcpu->kvm->tlbs_dirty++; @@ -977,7 +978,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) host_writable); } - return !nr_present; + return nr_present; } #undef pt_element_t diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 31aa2c85dc97..06ce377dcbc9 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -257,7 +257,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { - if (vcpu->arch.apic) + if (lapic_in_kernel(vcpu)) kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c13a64b7d789..95070386d599 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1858,8 +1858,7 @@ static int halt_interception(struct vcpu_svm *svm) static int vmmcall_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - kvm_emulate_hypercall(&svm->vcpu); - return 1; + return kvm_emulate_hypercall(&svm->vcpu); } static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index ad9f6a23f139..2f1ea2f61e1f 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -996,11 +996,13 @@ TRACE_EVENT(kvm_enter_smm, * Tracepoint for VT-d posted-interrupts. */ TRACE_EVENT(kvm_pi_irte_update, - TP_PROTO(unsigned int vcpu_id, unsigned int gsi, - unsigned int gvec, u64 pi_desc_addr, bool set), - TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set), + TP_PROTO(unsigned int host_irq, unsigned int vcpu_id, + unsigned int gsi, unsigned int gvec, + u64 pi_desc_addr, bool set), + TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set), TP_STRUCT__entry( + __field( unsigned int, host_irq ) __field( unsigned int, vcpu_id ) __field( unsigned int, gsi ) __field( unsigned int, gvec ) @@ -1009,6 +1011,7 @@ TRACE_EVENT(kvm_pi_irte_update, ), TP_fast_assign( + __entry->host_irq = host_irq; __entry->vcpu_id = vcpu_id; __entry->gsi = gsi; __entry->gvec = gvec; @@ -1016,9 +1019,10 @@ TRACE_EVENT(kvm_pi_irte_update, __entry->set = set; ), - TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, " + TP_printk("VT-d PI is %s for irq %u, vcpu %u, gsi: 0x%x, " "gvec: 0x%x, pi_desc_addr: 0x%llx", __entry->set ? "enabled and being updated" : "disabled", + __entry->host_irq, __entry->vcpu_id, __entry->gsi, __entry->gvec, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e2951b6edbbc..75173efccac5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -596,6 +596,8 @@ struct vcpu_vmx { /* Support for PML */ #define PML_ENTITY_NUM 512 struct page *pml_pg; + + u64 current_tsc_ratio; }; enum segment_cache_field { @@ -861,7 +863,6 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); -static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, @@ -961,25 +962,36 @@ static const u32 vmx_msr_index[] = { MSR_EFER, MSR_TSC_AUX, MSR_STAR, }; -static inline bool is_page_fault(u32 intr_info) +static inline bool is_exception_n(u32 intr_info, u8 vector) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); + (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK); +} + +static inline bool is_debug(u32 intr_info) +{ + return is_exception_n(intr_info, DB_VECTOR); +} + +static inline bool is_breakpoint(u32 intr_info) +{ + return is_exception_n(intr_info, BP_VECTOR); +} + +static inline bool is_page_fault(u32 intr_info) +{ + return is_exception_n(intr_info, PF_VECTOR); } static inline bool is_no_device(u32 intr_info) { - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); + return is_exception_n(intr_info, NM_VECTOR); } static inline bool is_invalid_opcode(u32 intr_info) { - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); + return is_exception_n(intr_info, UD_VECTOR); } static inline bool is_external_interrupt(u32 intr_info) @@ -1811,6 +1823,13 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, return; } break; + case MSR_IA32_PEBS_ENABLE: + /* PEBS needs a quiescent period after being disabled (to write + * a record). Disabling PEBS through VMX MSR swapping doesn't + * provide that period, so a CPU could write host's record into + * guest's memory. + */ + wrmsrl(MSR_IA32_PEBS_ENABLE, 0); } for (i = 0; i < m->nr; ++i) @@ -1848,26 +1867,31 @@ static void reload_tss(void) static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) { - u64 guest_efer; - u64 ignore_bits; + u64 guest_efer = vmx->vcpu.arch.efer; + u64 ignore_bits = 0; - guest_efer = vmx->vcpu.arch.efer; + if (!enable_ept) { + /* + * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing + * host CPUID is more efficient than testing guest CPUID + * or CR4. Host SMEP is anyway a requirement for guest SMEP. + */ + if (boot_cpu_has(X86_FEATURE_SMEP)) + guest_efer |= EFER_NX; + else if (!(guest_efer & EFER_NX)) + ignore_bits |= EFER_NX; + } /* - * NX is emulated; LMA and LME handled by hardware; SCE meaningless - * outside long mode + * LMA and LME handled by hardware; SCE meaningless outside long mode. */ - ignore_bits = EFER_NX | EFER_SCE; + ignore_bits |= EFER_SCE; #ifdef CONFIG_X86_64 ignore_bits |= EFER_LMA | EFER_LME; /* SCE is meaningful only in long mode on Intel */ if (guest_efer & EFER_LMA) ignore_bits &= ~(u64)EFER_SCE; #endif - guest_efer &= ~ignore_bits; - guest_efer |= host_efer & ignore_bits; - vmx->guest_msrs[efer_offset].data = guest_efer; - vmx->guest_msrs[efer_offset].mask = ~ignore_bits; clear_atomic_switch_msr(vmx, MSR_EFER); @@ -1878,16 +1902,21 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) */ if (cpu_has_load_ia32_efer || (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { - guest_efer = vmx->vcpu.arch.efer; if (!(guest_efer & EFER_LMA)) guest_efer &= ~EFER_LME; if (guest_efer != host_efer) add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer); return false; - } + } else { + guest_efer &= ~ignore_bits; + guest_efer |= host_efer & ignore_bits; - return true; + vmx->guest_msrs[efer_offset].data = guest_efer; + vmx->guest_msrs[efer_offset].mask = ~ignore_bits; + + return true; + } } static unsigned long segment_base(u16 selector) @@ -2127,14 +2156,16 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ - /* Setup TSC multiplier */ - if (cpu_has_vmx_tsc_scaling()) - vmcs_write64(TSC_MULTIPLIER, - vcpu->arch.tsc_scaling_ratio); - vmx->loaded_vmcs->cpu = cpu; } + /* Setup TSC multiplier */ + if (kvm_has_tsc_control && + vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) { + vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio; + vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); + } + vmx_vcpu_pi_load(vcpu, cpu); } @@ -2584,7 +2615,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; - if (vmx_mpx_supported()) + if (kvm_mpx_supported()) vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* We support free control of debug control saving. */ @@ -2605,7 +2636,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) VM_ENTRY_LOAD_IA32_PAT; vmx->nested.nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); - if (vmx_mpx_supported()) + if (kvm_mpx_supported()) vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* We support free control of debug control loading. */ @@ -2849,7 +2880,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: - if (!vmx_mpx_supported()) + if (!kvm_mpx_supported()) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; @@ -2926,7 +2957,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_BNDCFGS: - if (!vmx_mpx_supported()) + if (!kvm_mpx_supported()) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; @@ -3399,7 +3430,7 @@ static void init_vmcs_shadow_fields(void) for (i = j = 0; i < max_shadow_read_write_fields; i++) { switch (shadow_read_write_fields[i]) { case GUEST_BNDCFGS: - if (!vmx_mpx_supported()) + if (!kvm_mpx_supported()) continue; break; default: @@ -5608,11 +5639,8 @@ static int handle_dr(struct kvm_vcpu *vcpu) } if (vcpu->guest_debug == 0) { - u32 cpu_based_vm_exec_control; - - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_MOV_DR_EXITING); /* * No more DR vmexits; force a reload of the debug registers @@ -5649,8 +5677,6 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { - u32 cpu_based_vm_exec_control; - get_debugreg(vcpu->arch.db[0], 0); get_debugreg(vcpu->arch.db[1], 1); get_debugreg(vcpu->arch.db[2], 2); @@ -5659,10 +5685,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; - - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING); } static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) @@ -5747,8 +5770,7 @@ static int handle_halt(struct kvm_vcpu *vcpu) static int handle_vmcall(struct kvm_vcpu *vcpu) { - kvm_emulate_hypercall(vcpu); - return 1; + return kvm_emulate_hypercall(vcpu); } static int handle_invd(struct kvm_vcpu *vcpu) @@ -6435,8 +6457,8 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { /* Recycle the least recently used VMCS. */ - item = list_entry(vmx->nested.vmcs02_pool.prev, - struct vmcs02_list, list); + item = list_last_entry(&vmx->nested.vmcs02_pool, + struct vmcs02_list, list); item->vmptr = vmx->nested.current_vmptr; list_move(&item->list, &vmx->nested.vmcs02_pool); return &item->vmcs02; @@ -7752,6 +7774,13 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) else if (is_no_device(intr_info) && !(vmcs12->guest_cr0 & X86_CR0_TS)) return false; + else if (is_debug(intr_info) && + vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + return false; + else if (is_breakpoint(intr_info) && + vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + return false; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); case EXIT_REASON_EXTERNAL_INTERRUPT: @@ -8356,6 +8385,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) { u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + register void *__sp asm(_ASM_SP); /* * If external interrupt exists, IF bit is set in rflags/eflags on the @@ -8388,8 +8418,9 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) "call *%[entry]\n\t" : #ifdef CONFIG_X86_64 - [sp]"=&r"(tmp) + [sp]"=&r"(tmp), #endif + "+r"(__sp) : [entry]"r"(entry), [ss]"i"(__KERNEL_DS), @@ -10256,7 +10287,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); - if (vmx_mpx_supported()) + if (kvm_mpx_supported()) vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); if (nested_cpu_has_xsaves(vmcs12)) vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); @@ -10764,13 +10795,26 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, */ kvm_set_msi_irq(e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { + /* + * Make sure the IRTE is in remapped mode if + * we don't handle it in posted mode. + */ + ret = irq_set_vcpu_affinity(host_irq, NULL); + if (ret < 0) { + printk(KERN_INFO + "failed to back to remapped mode, irq: %u\n", + host_irq); + goto out; + } + continue; + } vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); vcpu_info.vector = irq.vector; - trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi, + trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi, vcpu_info.vector, vcpu_info.pi_desc_addr, set); if (set) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4244c2baf57d..7236bd3a4c3d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -123,6 +123,9 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); unsigned int __read_mostly lapic_timer_advance_ns = 0; module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); +static bool __read_mostly vector_hashing = true; +module_param(vector_hashing, bool, S_IRUGO); + static bool __read_mostly backwards_tsc_observed = false; #define KVM_NR_SHARED_MSRS 16 @@ -1196,17 +1199,11 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) static uint32_t div_frac(uint32_t dividend, uint32_t divisor) { - uint32_t quotient, remainder; - - /* Don't try to replace with do_div(), this one calculates - * "(dividend << 32) / divisor" */ - __asm__ ( "divl %4" - : "=a" (quotient), "=d" (remainder) - : "0" (0), "1" (dividend), "r" (divisor) ); - return quotient; + do_shl32_div32(dividend, divisor); + return dividend; } -static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, +static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz, s8 *pshift, u32 *pmultiplier) { uint64_t scaled64; @@ -1214,8 +1211,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, uint64_t tps64; uint32_t tps32; - tps64 = base_khz * 1000LL; - scaled64 = scaled_khz * 1000LL; + tps64 = base_hz; + scaled64 = scaled_hz; while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { tps64 >>= 1; shift--; @@ -1233,8 +1230,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, *pshift = shift; *pmultiplier = div_frac(scaled64, tps32); - pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n", - __func__, base_khz, scaled_khz, shift, *pmultiplier); + pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n", + __func__, base_hz, scaled_hz, shift, *pmultiplier); } #ifdef CONFIG_X86_64 @@ -1293,23 +1290,23 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) return 0; } -static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) +static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) { u32 thresh_lo, thresh_hi; int use_scaling = 0; /* tsc_khz can be zero if TSC calibration fails */ - if (this_tsc_khz == 0) { + if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; return -1; } /* Compute a scale to convert nanoseconds in TSC cycles */ - kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, + kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC, &vcpu->arch.virtual_tsc_shift, &vcpu->arch.virtual_tsc_mult); - vcpu->arch.virtual_tsc_khz = this_tsc_khz; + vcpu->arch.virtual_tsc_khz = user_tsc_khz; /* * Compute the variation in TSC rate which is acceptable @@ -1319,11 +1316,11 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) */ thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); - if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) { - pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); + if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) { + pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi); use_scaling = 1; } - return set_tsc_khz(vcpu, this_tsc_khz, use_scaling); + return set_tsc_khz(vcpu, user_tsc_khz, use_scaling); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) @@ -1716,7 +1713,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) static int kvm_guest_time_update(struct kvm_vcpu *v) { - unsigned long flags, this_tsc_khz, tgt_tsc_khz; + unsigned long flags, tgt_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; @@ -1742,8 +1739,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - this_tsc_khz = __this_cpu_read(cpu_tsc_khz); - if (unlikely(this_tsc_khz == 0)) { + tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz); + if (unlikely(tgt_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); return 1; @@ -1778,13 +1775,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (!vcpu->pv_time_enabled) return 0; - if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { - tgt_tsc_khz = kvm_has_tsc_control ? - vcpu->virtual_tsc_khz : this_tsc_khz; - kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz, + if (kvm_has_tsc_control) + tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); + + if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { + kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, &vcpu->hv_clock.tsc_shift, &vcpu->hv_clock.tsc_to_system_mul); - vcpu->hw_tsc_khz = this_tsc_khz; + vcpu->hw_tsc_khz = tgt_tsc_khz; } /* With all the info we got, fill in the values */ @@ -2987,7 +2985,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && - kvm_vcpu_has_lapic(vcpu)) + lapic_in_kernel(vcpu)) vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { @@ -3000,7 +2998,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; - if (kvm_vcpu_has_lapic(vcpu)) { + if (lapic_in_kernel(vcpu)) { if (events->smi.latched_init) set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); else @@ -3240,7 +3238,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, switch (ioctl) { case KVM_GET_LAPIC: { r = -EINVAL; - if (!vcpu->arch.apic) + if (!lapic_in_kernel(vcpu)) goto out; u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); @@ -3258,7 +3256,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_SET_LAPIC: { r = -EINVAL; - if (!vcpu->arch.apic) + if (!lapic_in_kernel(vcpu)) goto out; u.lapic = memdup_user(argp, sizeof(*u.lapic)); if (IS_ERR(u.lapic)) @@ -3605,20 +3603,26 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) { - mutex_lock(&kvm->arch.vpit->pit_state.lock); - memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); + struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state; + + BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels)); + + mutex_lock(&kps->lock); + memcpy(ps, &kps->channels, sizeof(*ps)); + mutex_unlock(&kps->lock); return 0; } static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) { int i; - mutex_lock(&kvm->arch.vpit->pit_state.lock); - memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); + struct kvm_pit *pit = kvm->arch.vpit; + + mutex_lock(&pit->pit_state.lock); + memcpy(&pit->pit_state.channels, ps, sizeof(*ps)); for (i = 0; i < 3; i++) - kvm_pit_load_count(kvm, i, ps->channels[i].count, 0); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); + kvm_pit_load_count(pit, i, ps->channels[i].count, 0); + mutex_unlock(&pit->pit_state.lock); return 0; } @@ -3638,29 +3642,39 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) int start = 0; int i; u32 prev_legacy, cur_legacy; - mutex_lock(&kvm->arch.vpit->pit_state.lock); - prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; + struct kvm_pit *pit = kvm->arch.vpit; + + mutex_lock(&pit->pit_state.lock); + prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; if (!prev_legacy && cur_legacy) start = 1; - memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, - sizeof(kvm->arch.vpit->pit_state.channels)); - kvm->arch.vpit->pit_state.flags = ps->flags; + memcpy(&pit->pit_state.channels, &ps->channels, + sizeof(pit->pit_state.channels)); + pit->pit_state.flags = ps->flags; for (i = 0; i < 3; i++) - kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count, + kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count, start && i == 0); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); + mutex_unlock(&pit->pit_state.lock); return 0; } static int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control) { - if (!kvm->arch.vpit) + struct kvm_pit *pit = kvm->arch.vpit; + + if (!pit) return -ENXIO; - mutex_lock(&kvm->arch.vpit->pit_state.lock); - kvm->arch.vpit->pit_state.reinject = control->pit_reinject; - mutex_unlock(&kvm->arch.vpit->pit_state.lock); + + /* pit->pit_state.lock was overloaded to prevent userspace from getting + * an inconsistent state after running multiple KVM_REINJECT_CONTROL + * ioctls in parallel. Use a separate lock if that ioctl isn't rare. + */ + mutex_lock(&pit->pit_state.lock); + kvm_pit_set_reinject(pit, control->pit_reinject); + mutex_unlock(&pit->pit_state.lock); + return 0; } @@ -4093,7 +4107,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, do { n = min(len, 8); - if (!(vcpu->arch.apic && + if (!(lapic_in_kernel(vcpu) && !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v)) && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v)) break; @@ -4113,7 +4127,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) do { n = min(len, 8); - if (!(vcpu->arch.apic && + if (!(lapic_in_kernel(vcpu) && !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev, addr, n, v)) && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v)) @@ -4346,7 +4360,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes); if (ret < 0) return 0; - kvm_mmu_pte_write(vcpu, gpa, val, bytes); + kvm_page_track_write(vcpu, gpa, val, bytes); return 1; } @@ -4604,7 +4618,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, return X86EMUL_CMPXCHG_FAILED; kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); - kvm_mmu_pte_write(vcpu, gpa, new, bytes); + kvm_page_track_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; @@ -6010,7 +6024,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) if (!kvm_x86_ops->update_cr8_intercept) return; - if (!vcpu->arch.apic) + if (!lapic_in_kernel(vcpu)) return; if (vcpu->arch.apicv_active) @@ -6618,12 +6632,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * KVM_DEBUGREG_WONT_EXIT again. */ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { - int i; - WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); kvm_x86_ops->sync_dirty_debug_regs(vcpu); - for (i = 0; i < KVM_NR_DB_REGS; i++) - vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + kvm_update_dr0123(vcpu); + kvm_update_dr6(vcpu); + kvm_update_dr7(vcpu); + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } /* @@ -7038,7 +7052,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - if (!kvm_vcpu_has_lapic(vcpu) && + if (!lapic_in_kernel(vcpu) && mp_state->mp_state != KVM_MP_STATE_RUNNABLE) return -EINVAL; @@ -7314,7 +7328,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) * Every 255 times fpu_counter rolls over to 0; a guest that uses * the FPU in bursts will revert to loading it on demand. */ - if (!vcpu->arch.eager_fpu) { + if (!use_eager_fpu()) { if (++vcpu->fpu_counter < 5) kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); } @@ -7593,6 +7607,7 @@ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) } struct static_key kvm_no_apic_vcpu __read_mostly; +EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { @@ -7724,6 +7739,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); + kvm_page_track_init(kvm); + kvm_mmu_init_vm(kvm); + return 0; } @@ -7850,6 +7868,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); + kvm_mmu_uninit_vm(kvm); } void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, @@ -7871,6 +7890,8 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, free->arch.lpage_info[i - 1] = NULL; } } + + kvm_page_track_free_memslot(free, dont); } int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, @@ -7879,6 +7900,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + struct kvm_lpage_info *linfo; unsigned long ugfn; int lpages; int level = i + 1; @@ -7893,15 +7915,16 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, if (i == 0) continue; - slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages * - sizeof(*slot->arch.lpage_info[i - 1])); - if (!slot->arch.lpage_info[i - 1]) + linfo = kvm_kvzalloc(lpages * sizeof(*linfo)); + if (!linfo) goto out_free; + slot->arch.lpage_info[i - 1] = linfo; + if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->arch.lpage_info[i - 1][0].write_count = 1; + linfo[0].disallow_lpage = 1; if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1; + linfo[lpages - 1].disallow_lpage = 1; ugfn = slot->userspace_addr >> PAGE_SHIFT; /* * If the gfn and userspace address are not aligned wrt each @@ -7913,10 +7936,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long j; for (j = 0; j < lpages; ++j) - slot->arch.lpage_info[i - 1][j].write_count = 1; + linfo[j].disallow_lpage = 1; } } + if (kvm_page_track_create_memslot(slot, npages)) + goto out_free; + return 0; out_free: @@ -8370,6 +8396,12 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); } +bool kvm_vector_hashing_enabled(void) +{ + return vector_hashing; +} +EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f2afa5fe48a6..007940faa5c6 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -179,6 +179,7 @@ int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); +bool kvm_vector_hashing_enabled(void); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ @@ -192,4 +193,19 @@ extern unsigned int min_timer_period_us; extern unsigned int lapic_timer_advance_ns; extern struct static_key kvm_no_apic_vcpu; + +/* Same "calling convention" as do_div: + * - divide (n << 32) by base + * - put result in n + * - return remainder + */ +#define do_shl32_div32(n, base) \ + ({ \ + u32 __quot, __rem; \ + asm("divl %2" : "=a" (__quot), "=d" (__rem) \ + : "rm" (base), "0" (0), "1" ((u32) n)); \ + n = __quot; \ + __rem; \ + }) + #endif diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 4ba229ac3f4f..fd57d3ae7e16 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1520,12 +1520,6 @@ __init void lguest_init(void) */ reserve_top_address(lguest_data.reserve_mem); - /* - * If we don't initialize the lock dependency checker now, it crashes - * atomic_notifier_chain_register, then paravirt_disable_iospace. - */ - lockdep_init(); - /* Hook in our special panic hypercall code. */ atomic_notifier_chain_register(&panic_notifier_list, &paniced); @@ -1535,7 +1529,7 @@ __init void lguest_init(void) */ cpu_detect(&new_cpu_data); /* head.S usually sets up the first capability word, so do it here. */ - new_cpu_data.x86_capability[0] = cpuid_edx(1); + new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); /* Math is always hard! */ set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 422db000d727..5cc78bf57232 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -21,12 +21,16 @@ static inline int myisspace(u8 c) * @option: option string to look for * * Returns the position of that @option (starts counting with 1) - * or 0 on not found. + * or 0 on not found. @option will only be found if it is found + * as an entire word in @cmdline. For instance, if @option="car" + * then a cmdline which contains "cart" will not match. */ -int cmdline_find_option_bool(const char *cmdline, const char *option) +static int +__cmdline_find_option_bool(const char *cmdline, int max_cmdline_size, + const char *option) { char c; - int len, pos = 0, wstart = 0; + int pos = 0, wstart = 0; const char *opptr = NULL; enum { st_wordstart = 0, /* Start of word/after whitespace */ @@ -37,11 +41,11 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) if (!cmdline) return -1; /* No command line */ - len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE); - if (!len) - return 0; - - while (len--) { + /* + * This 'pos' check ensures we do not overrun + * a non-NULL-terminated 'cmdline' + */ + while (pos < max_cmdline_size) { c = *(char *)cmdline++; pos++; @@ -58,18 +62,35 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) /* fall through */ case st_wordcmp: - if (!*opptr) + if (!*opptr) { + /* + * We matched all the way to the end of the + * option we were looking for. If the + * command-line has a space _or_ ends, then + * we matched! + */ if (!c || myisspace(c)) return wstart; - else - state = st_wordskip; - else if (!c) + /* + * We hit the end of the option, but _not_ + * the end of a word on the cmdline. Not + * a match. + */ + } else if (!c) { + /* + * Hit the NULL terminator on the end of + * cmdline. + */ return 0; - else if (c != *opptr++) - state = st_wordskip; - else if (!len) /* last word and is matching */ - return wstart; - break; + } else if (c == *opptr++) { + /* + * We are currently matching, so continue + * to the next character on the cmdline. + */ + break; + } + state = st_wordskip; + /* fall through */ case st_wordskip: if (!c) @@ -82,3 +103,8 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) return 0; /* Buffer overrun */ } + +int cmdline_find_option_bool(const char *cmdline, const char *option) +{ + return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); +} diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index fba343062055..2b0ef26da0bd 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -232,17 +232,31 @@ ENDPROC(copy_user_enhanced_fast_string) /* * copy_user_nocache - Uncached memory copy with exception handling - * This will force destination/source out of cache for more performance. + * This will force destination out of cache for more performance. + * + * Note: Cached memory copy is used when destination or size is not + * naturally aligned. That is: + * - Require 8-byte alignment when size is 8 bytes or larger. + * - Require 4-byte alignment when size is 4 bytes. */ ENTRY(__copy_user_nocache) ASM_STAC + + /* If size is less than 8 bytes, go to 4-byte copy */ cmpl $8,%edx - jb 20f /* less then 8 bytes, go to byte copy loop */ + jb .L_4b_nocache_copy_entry + + /* If destination is not 8-byte aligned, "cache" copy to align it */ ALIGN_DESTINATION + + /* Set 4x8-byte copy count and remainder */ movl %edx,%ecx andl $63,%edx shrl $6,%ecx - jz 17f + jz .L_8b_nocache_copy_entry /* jump if count is 0 */ + + /* Perform 4x8-byte nocache loop-copy */ +.L_4x8b_nocache_copy_loop: 1: movq (%rsi),%r8 2: movq 1*8(%rsi),%r9 3: movq 2*8(%rsi),%r10 @@ -262,60 +276,106 @@ ENTRY(__copy_user_nocache) leaq 64(%rsi),%rsi leaq 64(%rdi),%rdi decl %ecx - jnz 1b -17: movl %edx,%ecx + jnz .L_4x8b_nocache_copy_loop + + /* Set 8-byte copy count and remainder */ +.L_8b_nocache_copy_entry: + movl %edx,%ecx andl $7,%edx shrl $3,%ecx - jz 20f -18: movq (%rsi),%r8 -19: movnti %r8,(%rdi) + jz .L_4b_nocache_copy_entry /* jump if count is 0 */ + + /* Perform 8-byte nocache loop-copy */ +.L_8b_nocache_copy_loop: +20: movq (%rsi),%r8 +21: movnti %r8,(%rdi) leaq 8(%rsi),%rsi leaq 8(%rdi),%rdi decl %ecx - jnz 18b -20: andl %edx,%edx - jz 23f + jnz .L_8b_nocache_copy_loop + + /* If no byte left, we're done */ +.L_4b_nocache_copy_entry: + andl %edx,%edx + jz .L_finish_copy + + /* If destination is not 4-byte aligned, go to byte copy: */ + movl %edi,%ecx + andl $3,%ecx + jnz .L_1b_cache_copy_entry + + /* Set 4-byte copy count (1 or 0) and remainder */ movl %edx,%ecx -21: movb (%rsi),%al -22: movb %al,(%rdi) + andl $3,%edx + shrl $2,%ecx + jz .L_1b_cache_copy_entry /* jump if count is 0 */ + + /* Perform 4-byte nocache copy: */ +30: movl (%rsi),%r8d +31: movnti %r8d,(%rdi) + leaq 4(%rsi),%rsi + leaq 4(%rdi),%rdi + + /* If no bytes left, we're done: */ + andl %edx,%edx + jz .L_finish_copy + + /* Perform byte "cache" loop-copy for the remainder */ +.L_1b_cache_copy_entry: + movl %edx,%ecx +.L_1b_cache_copy_loop: +40: movb (%rsi),%al +41: movb %al,(%rdi) incq %rsi incq %rdi decl %ecx - jnz 21b -23: xorl %eax,%eax + jnz .L_1b_cache_copy_loop + + /* Finished copying; fence the prior stores */ +.L_finish_copy: + xorl %eax,%eax ASM_CLAC sfence ret .section .fixup,"ax" -30: shll $6,%ecx +.L_fixup_4x8b_copy: + shll $6,%ecx addl %ecx,%edx - jmp 60f -40: lea (%rdx,%rcx,8),%rdx - jmp 60f -50: movl %ecx,%edx -60: sfence + jmp .L_fixup_handle_tail +.L_fixup_8b_copy: + lea (%rdx,%rcx,8),%rdx + jmp .L_fixup_handle_tail +.L_fixup_4b_copy: + lea (%rdx,%rcx,4),%rdx + jmp .L_fixup_handle_tail +.L_fixup_1b_copy: + movl %ecx,%edx +.L_fixup_handle_tail: + sfence jmp copy_user_handle_tail .previous - _ASM_EXTABLE(1b,30b) - _ASM_EXTABLE(2b,30b) - _ASM_EXTABLE(3b,30b) - _ASM_EXTABLE(4b,30b) - _ASM_EXTABLE(5b,30b) - _ASM_EXTABLE(6b,30b) - _ASM_EXTABLE(7b,30b) - _ASM_EXTABLE(8b,30b) - _ASM_EXTABLE(9b,30b) - _ASM_EXTABLE(10b,30b) - _ASM_EXTABLE(11b,30b) - _ASM_EXTABLE(12b,30b) - _ASM_EXTABLE(13b,30b) - _ASM_EXTABLE(14b,30b) - _ASM_EXTABLE(15b,30b) - _ASM_EXTABLE(16b,30b) - _ASM_EXTABLE(18b,40b) - _ASM_EXTABLE(19b,40b) - _ASM_EXTABLE(21b,50b) - _ASM_EXTABLE(22b,50b) + _ASM_EXTABLE(1b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(2b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(3b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(4b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(5b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(6b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(7b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(8b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(9b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(10b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(11b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(12b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(13b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(14b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(15b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(16b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(20b,.L_fixup_8b_copy) + _ASM_EXTABLE(21b,.L_fixup_8b_copy) + _ASM_EXTABLE(30b,.L_fixup_4b_copy) + _ASM_EXTABLE(31b,.L_fixup_4b_copy) + _ASM_EXTABLE(40b,.L_fixup_1b_copy) + _ASM_EXTABLE(41b,.L_fixup_1b_copy) ENDPROC(__copy_user_nocache) diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 1318f75d56e4..28a6654f0d08 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -135,7 +135,7 @@ EXPORT_SYMBOL(csum_partial_copy_nocheck); __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, - __u32 len, unsigned short proto, __wsum sum) + __u32 len, __u8 proto, __wsum sum) { __u64 rest, sum64; diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index e912b2f6d36e..2f07c291dcc8 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -102,7 +102,7 @@ static void delay_mwaitx(unsigned long __loops) * Use cpu_tss as a cacheline-aligned, seldomly * accessed per-cpu variable as the monitor target. */ - __monitorx(this_cpu_ptr(&cpu_tss), 0, 0); + __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0); /* * AMD, like Intel, supports the EAX hint and EAX=0xf diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 8f72b334aea0..1a416935bac9 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -374,7 +374,7 @@ void insn_get_displacement(struct insn *insn) if (mod == 3) goto out; if (mod == 1) { - insn->displacement.value = get_next(char, insn); + insn->displacement.value = get_next(signed char, insn); insn->displacement.nbytes = 1; } else if (insn->addr_bytes == 2) { if ((mod == 0 && rm == 6) || mod == 2) { @@ -532,7 +532,7 @@ void insn_get_immediate(struct insn *insn) switch (inat_immediate_size(insn->attr)) { case INAT_IMM_BYTE: - insn->immediate.value = get_next(char, insn); + insn->immediate.value = get_next(signed char, insn); insn->immediate.nbytes = 1; break; case INAT_IMM_WORD: @@ -566,7 +566,7 @@ void insn_get_immediate(struct insn *insn) goto err_out; } if (inat_has_second_immediate(insn->attr)) { - insn->immediate2.value = get_next(char, insn); + insn->immediate2.value = get_next(signed char, insn); insn->immediate2.nbytes = 1; } done: diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index a0de849435ad..cbb8ee5830ff 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -177,3 +177,120 @@ ENTRY(memcpy_orig) .Lend: retq ENDPROC(memcpy_orig) + +#ifndef CONFIG_UML +/* + * memcpy_mcsafe - memory copy with machine check exception handling + * Note that we only catch machine checks when reading the source addresses. + * Writes to target are posted and don't generate machine checks. + */ +ENTRY(memcpy_mcsafe) + cmpl $8, %edx + /* Less than 8 bytes? Go to byte copy loop */ + jb .L_no_whole_words + + /* Check for bad alignment of source */ + testl $7, %esi + /* Already aligned */ + jz .L_8byte_aligned + + /* Copy one byte at a time until source is 8-byte aligned */ + movl %esi, %ecx + andl $7, %ecx + subl $8, %ecx + negl %ecx + subl %ecx, %edx +.L_copy_leading_bytes: + movb (%rsi), %al + movb %al, (%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .L_copy_leading_bytes + +.L_8byte_aligned: + /* Figure out how many whole cache lines (64-bytes) to copy */ + movl %edx, %ecx + andl $63, %edx + shrl $6, %ecx + jz .L_no_whole_cache_lines + + /* Loop copying whole cache lines */ +.L_cache_w0: movq (%rsi), %r8 +.L_cache_w1: movq 1*8(%rsi), %r9 +.L_cache_w2: movq 2*8(%rsi), %r10 +.L_cache_w3: movq 3*8(%rsi), %r11 + movq %r8, (%rdi) + movq %r9, 1*8(%rdi) + movq %r10, 2*8(%rdi) + movq %r11, 3*8(%rdi) +.L_cache_w4: movq 4*8(%rsi), %r8 +.L_cache_w5: movq 5*8(%rsi), %r9 +.L_cache_w6: movq 6*8(%rsi), %r10 +.L_cache_w7: movq 7*8(%rsi), %r11 + movq %r8, 4*8(%rdi) + movq %r9, 5*8(%rdi) + movq %r10, 6*8(%rdi) + movq %r11, 7*8(%rdi) + leaq 64(%rsi), %rsi + leaq 64(%rdi), %rdi + decl %ecx + jnz .L_cache_w0 + + /* Are there any trailing 8-byte words? */ +.L_no_whole_cache_lines: + movl %edx, %ecx + andl $7, %edx + shrl $3, %ecx + jz .L_no_whole_words + + /* Copy trailing words */ +.L_copy_trailing_words: + movq (%rsi), %r8 + mov %r8, (%rdi) + leaq 8(%rsi), %rsi + leaq 8(%rdi), %rdi + decl %ecx + jnz .L_copy_trailing_words + + /* Any trailing bytes? */ +.L_no_whole_words: + andl %edx, %edx + jz .L_done_memcpy_trap + + /* Copy trailing bytes */ + movl %edx, %ecx +.L_copy_trailing_bytes: + movb (%rsi), %al + movb %al, (%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .L_copy_trailing_bytes + + /* Copy successful. Return true */ +.L_done_memcpy_trap: + xorq %rax, %rax + ret +ENDPROC(memcpy_mcsafe) + + .section .fixup, "ax" + /* Return false for any failure */ +.L_memcpy_mcsafe_fail: + mov $1, %rax + ret + + .previous + + _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail) +#endif diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index 40027db99140..be110efa0096 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S @@ -15,6 +15,7 @@ #include <linux/linkage.h> #include <asm/alternative-asm.h> +#include <asm/frame.h> #define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg) #define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l) @@ -84,24 +85,29 @@ /* Fix up special calling conventions */ ENTRY(call_rwsem_down_read_failed) + FRAME_BEGIN save_common_regs __ASM_SIZE(push,) %__ASM_REG(dx) movq %rax,%rdi call rwsem_down_read_failed __ASM_SIZE(pop,) %__ASM_REG(dx) restore_common_regs + FRAME_END ret ENDPROC(call_rwsem_down_read_failed) ENTRY(call_rwsem_down_write_failed) + FRAME_BEGIN save_common_regs movq %rax,%rdi call rwsem_down_write_failed restore_common_regs + FRAME_END ret ENDPROC(call_rwsem_down_write_failed) ENTRY(call_rwsem_wake) + FRAME_BEGIN /* do nothing if still outstanding active readers */ __ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx) jnz 1f @@ -109,15 +115,18 @@ ENTRY(call_rwsem_wake) movq %rax,%rdi call rwsem_wake restore_common_regs -1: ret +1: FRAME_END + ret ENDPROC(call_rwsem_wake) ENTRY(call_rwsem_downgrade_wake) + FRAME_BEGIN save_common_regs __ASM_SIZE(push,) %__ASM_REG(dx) movq %rax,%rdi call rwsem_downgrade_wake __ASM_SIZE(pop,) %__ASM_REG(dx) restore_common_regs + FRAME_END ret ENDPROC(call_rwsem_downgrade_wake) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 4a6f1d9b5106..99bfb192803f 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -358,20 +358,19 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, #define pgd_none(a) pud_none(__pud(pgd_val(a))) #endif -#ifdef CONFIG_X86_64 static inline bool is_hypervisor_range(int idx) { +#ifdef CONFIG_X86_64 /* * ffff800000000000 - ffff87ffffffffff is reserved for * the hypervisor. */ - return paravirt_enabled() && - (idx >= pgd_index(__PAGE_OFFSET) - 16) && - (idx < pgd_index(__PAGE_OFFSET)); -} + return (idx >= pgd_index(__PAGE_OFFSET) - 16) && + (idx < pgd_index(__PAGE_OFFSET)); #else -static inline bool is_hypervisor_range(int idx) { return false; } + return false; #endif +} static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, bool checkwx) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 903ec1e9c326..9dd7e4b7fcde 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -3,6 +3,9 @@ #include <linux/sort.h> #include <asm/uaccess.h> +typedef bool (*ex_handler_t)(const struct exception_table_entry *, + struct pt_regs *, int); + static inline unsigned long ex_insn_addr(const struct exception_table_entry *x) { @@ -13,11 +16,56 @@ ex_fixup_addr(const struct exception_table_entry *x) { return (unsigned long)&x->fixup + x->fixup; } +static inline ex_handler_t +ex_fixup_handler(const struct exception_table_entry *x) +{ + return (ex_handler_t)((unsigned long)&x->handler + x->handler); +} -int fixup_exception(struct pt_regs *regs) +bool ex_handler_default(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { - const struct exception_table_entry *fixup; - unsigned long new_ip; + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_default); + +bool ex_handler_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + regs->ip = ex_fixup_addr(fixup); + regs->ax = trapnr; + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_fault); + +bool ex_handler_ext(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + /* Special hack for uaccess_err */ + current_thread_info()->uaccess_err = 1; + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_ext); + +bool ex_has_fault_handler(unsigned long ip) +{ + const struct exception_table_entry *e; + ex_handler_t handler; + + e = search_exception_tables(ip); + if (!e) + return false; + handler = ex_fixup_handler(e); + + return handler == ex_handler_fault; +} + +int fixup_exception(struct pt_regs *regs, int trapnr) +{ + const struct exception_table_entry *e; + ex_handler_t handler; #ifdef CONFIG_PNPBIOS if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { @@ -33,42 +81,34 @@ int fixup_exception(struct pt_regs *regs) } #endif - fixup = search_exception_tables(regs->ip); - if (fixup) { - new_ip = ex_fixup_addr(fixup); - - if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) { - /* Special hack for uaccess_err */ - current_thread_info()->uaccess_err = 1; - new_ip -= 0x7ffffff0; - } - regs->ip = new_ip; - return 1; - } + e = search_exception_tables(regs->ip); + if (!e) + return 0; - return 0; + handler = ex_fixup_handler(e); + return handler(e, regs, trapnr); } /* Restricted version used during very early boot */ int __init early_fixup_exception(unsigned long *ip) { - const struct exception_table_entry *fixup; + const struct exception_table_entry *e; unsigned long new_ip; + ex_handler_t handler; - fixup = search_exception_tables(*ip); - if (fixup) { - new_ip = ex_fixup_addr(fixup); + e = search_exception_tables(*ip); + if (!e) + return 0; - if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) { - /* uaccess handling not supported during early boot */ - return 0; - } + new_ip = ex_fixup_addr(e); + handler = ex_fixup_handler(e); - *ip = new_ip; - return 1; - } + /* special handling not supported during early boot */ + if (handler != ex_handler_default) + return 0; - return 0; + *ip = new_ip; + return 1; } /* @@ -133,6 +173,8 @@ void sort_extable(struct exception_table_entry *start, i += 4; p->fixup += i; i += 4; + p->handler += i; + i += 4; } sort(start, finish - start, sizeof(struct exception_table_entry), @@ -145,6 +187,8 @@ void sort_extable(struct exception_table_entry *start, i += 4; p->fixup -= i; i += 4; + p->handler -= i; + i += 4; } } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6138db471b17..5ce1ed02f7e8 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -344,6 +344,9 @@ static noinline int vmalloc_fault(unsigned long address) if (!pmd_k) return -1; + if (pmd_huge(*pmd_k)) + return 0; + pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) return -1; @@ -417,8 +420,6 @@ void vmalloc_sync_all(void) * 64-bit: * * Handle a fault on the vmalloc area - * - * This assumes no large pages in there. */ static noinline int vmalloc_fault(unsigned long address) { @@ -460,17 +461,23 @@ static noinline int vmalloc_fault(unsigned long address) if (pud_none(*pud_ref)) return -1; - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) + if (pud_none(*pud) || pud_pfn(*pud) != pud_pfn(*pud_ref)) BUG(); + if (pud_huge(*pud)) + return 0; + pmd = pmd_offset(pud, address); pmd_ref = pmd_offset(pud_ref, address); if (pmd_none(*pmd_ref)) return -1; - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + if (pmd_none(*pmd) || pmd_pfn(*pmd) != pmd_pfn(*pmd_ref)) BUG(); + if (pmd_huge(*pmd)) + return 0; + pte_ref = pte_offset_kernel(pmd_ref, address); if (!pte_present(*pte_ref)) return -1; @@ -715,7 +722,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, struct vm_area_struct *vma = NULL; /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) { + if (fixup_exception(regs, X86_TRAP_PF)) { /* * Any interrupt that takes a fault gets the fixup. This makes * the below recursive fault logic only apply to a faults from diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index bab259e75984..b8b6a60b32cf 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -120,7 +120,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, return 0; } - page = pte_page(pte); if (pte_devmap(pte)) { pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); if (unlikely(!pgmap)) { @@ -134,6 +133,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, return 0; } VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + page = pte_page(pte); get_page(page); put_dev_pagemap(pgmap); SetPageReferenced(page); @@ -150,7 +150,7 @@ static inline void get_head_page_multiple(struct page *page, int nr) { VM_BUG_ON_PAGE(page != compound_head(page), page); VM_BUG_ON_PAGE(page_count(page) == 0, page); - atomic_add(nr, &page->_count); + page_ref_add(page, nr); SetPageReferenced(page); } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 493f54172b4a..9d56f271d519 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -150,13 +150,14 @@ static int page_size_mask; static void __init probe_page_size_mask(void) { -#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) +#if !defined(CONFIG_KMEMCHECK) /* - * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will + * use small pages. * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. */ - if (cpu_has_pse) + if (cpu_has_pse && !debug_pagealloc_enabled()) page_size_mask |= 1 << PG_LEVEL_2M; #endif @@ -666,21 +667,22 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) * mark them not present - any buggy init-section access will * create a kernel page fault: */ -#ifdef CONFIG_DEBUG_PAGEALLOC - printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n", - begin, end - 1); - set_memory_np(begin, (end - begin) >> PAGE_SHIFT); -#else - /* - * We just marked the kernel text read only above, now that - * we are going to free part of that, we need to make that - * writeable and non-executable first. - */ - set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); - set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + if (debug_pagealloc_enabled()) { + pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n", + begin, end - 1); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); + } else { + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable and non-executable first. + */ + set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); - free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what); -#endif + free_reserved_area((void *)begin, (void *)end, + POISON_FREE_INITMEM, what); + } } void free_initmem(void) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index a4bb1c7ab65e..bd7a9b9e2e14 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -868,7 +868,6 @@ static noinline int do_test_wp_bit(void) return flag; } -#ifdef CONFIG_DEBUG_RODATA const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); @@ -957,5 +956,3 @@ void mark_rodata_ro(void) if (__supported_pte_mask & _PAGE_NX) debug_checkwx(); } -#endif - diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9686535edfb5..214afda97911 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1075,7 +1075,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -#ifdef CONFIG_DEBUG_RODATA const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); @@ -1167,8 +1166,6 @@ void mark_rodata_ro(void) debug_checkwx(); } -#endif - int kern_addr_valid(unsigned long addr) { unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 637ab34ed632..ddb2244b06a1 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -33,7 +33,7 @@ struct kmmio_fault_page { struct list_head list; struct kmmio_fault_page *release_next; - unsigned long page; /* location of the fault page */ + unsigned long addr; /* the requested address */ pteval_t old_presence; /* page presence prior to arming */ bool armed; @@ -70,9 +70,16 @@ unsigned int kmmio_count; static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; static LIST_HEAD(kmmio_probes); -static struct list_head *kmmio_page_list(unsigned long page) +static struct list_head *kmmio_page_list(unsigned long addr) { - return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + + if (!pte) + return NULL; + addr &= page_level_mask(l); + + return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)]; } /* Accessed per-cpu */ @@ -98,15 +105,19 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr) } /* You must be holding RCU read lock. */ -static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) +static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr) { struct list_head *head; struct kmmio_fault_page *f; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); - page &= PAGE_MASK; - head = kmmio_page_list(page); + if (!pte) + return NULL; + addr &= page_level_mask(l); + head = kmmio_page_list(addr); list_for_each_entry_rcu(f, head, list) { - if (f->page == page) + if (f->addr == addr) return f; } return NULL; @@ -137,10 +148,10 @@ static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) static int clear_page_presence(struct kmmio_fault_page *f, bool clear) { unsigned int level; - pte_t *pte = lookup_address(f->page, &level); + pte_t *pte = lookup_address(f->addr, &level); if (!pte) { - pr_err("no pte for page 0x%08lx\n", f->page); + pr_err("no pte for addr 0x%08lx\n", f->addr); return -1; } @@ -156,7 +167,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) return -1; } - __flush_tlb_one(f->page); + __flush_tlb_one(f->addr); return 0; } @@ -176,12 +187,12 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) int ret; WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n")); if (f->armed) { - pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n", - f->page, f->count, !!f->old_presence); + pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n", + f->addr, f->count, !!f->old_presence); } ret = clear_page_presence(f, true); - WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"), - f->page); + WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"), + f->addr); f->armed = true; return ret; } @@ -191,7 +202,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) { int ret = clear_page_presence(f, false); WARN_ONCE(ret < 0, - KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); + KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr); f->armed = false; } @@ -215,6 +226,12 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) struct kmmio_context *ctx; struct kmmio_fault_page *faultpage; int ret = 0; /* default to fault not handled */ + unsigned long page_base = addr; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + if (!pte) + return -EINVAL; + page_base &= page_level_mask(l); /* * Preemption is now disabled to prevent process switch during @@ -227,7 +244,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) preempt_disable(); rcu_read_lock(); - faultpage = get_kmmio_fault_page(addr); + faultpage = get_kmmio_fault_page(page_base); if (!faultpage) { /* * Either this page fault is not caused by kmmio, or @@ -239,7 +256,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { - if (addr == ctx->addr) { + if (page_base == ctx->addr) { /* * A second fault on the same page means some other * condition needs handling by do_page_fault(), the @@ -267,9 +284,9 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx->active++; ctx->fpage = faultpage; - ctx->probe = get_kmmio_probe(addr); + ctx->probe = get_kmmio_probe(page_base); ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); - ctx->addr = addr; + ctx->addr = page_base; if (ctx->probe && ctx->probe->pre_handler) ctx->probe->pre_handler(ctx->probe, regs, addr); @@ -354,12 +371,11 @@ out: } /* You must be holding kmmio_lock. */ -static int add_kmmio_fault_page(unsigned long page) +static int add_kmmio_fault_page(unsigned long addr) { struct kmmio_fault_page *f; - page &= PAGE_MASK; - f = get_kmmio_fault_page(page); + f = get_kmmio_fault_page(addr); if (f) { if (!f->count) arm_kmmio_fault_page(f); @@ -372,26 +388,25 @@ static int add_kmmio_fault_page(unsigned long page) return -1; f->count = 1; - f->page = page; + f->addr = addr; if (arm_kmmio_fault_page(f)) { kfree(f); return -1; } - list_add_rcu(&f->list, kmmio_page_list(f->page)); + list_add_rcu(&f->list, kmmio_page_list(f->addr)); return 0; } /* You must be holding kmmio_lock. */ -static void release_kmmio_fault_page(unsigned long page, +static void release_kmmio_fault_page(unsigned long addr, struct kmmio_fault_page **release_list) { struct kmmio_fault_page *f; - page &= PAGE_MASK; - f = get_kmmio_fault_page(page); + f = get_kmmio_fault_page(addr); if (!f) return; @@ -420,18 +435,27 @@ int register_kmmio_probe(struct kmmio_probe *p) int ret = 0; unsigned long size = 0; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); + unsigned int l; + pte_t *pte; spin_lock_irqsave(&kmmio_lock, flags); if (get_kmmio_probe(p->addr)) { ret = -EEXIST; goto out; } + + pte = lookup_address(p->addr, &l); + if (!pte) { + ret = -EINVAL; + goto out; + } + kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); while (size < size_lim) { if (add_kmmio_fault_page(p->addr + size)) pr_err("Unable to set page fault.\n"); - size += PAGE_SIZE; + size += page_level_size(l); } out: spin_unlock_irqrestore(&kmmio_lock, flags); @@ -506,11 +530,17 @@ void unregister_kmmio_probe(struct kmmio_probe *p) const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; + unsigned int l; + pte_t *pte; + + pte = lookup_address(p->addr, &l); + if (!pte) + return; spin_lock_irqsave(&kmmio_lock, flags); while (size < size_lim) { release_kmmio_fault_page(p->addr + size, &release_list); - size += PAGE_SIZE; + size += page_level_size(l); } list_del_rcu(&p->list); kmmio_count--; diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 96bd1e2bffaf..d2dc0438d654 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -71,12 +71,12 @@ unsigned long arch_mmap_rnd(void) if (mmap_is_ia32()) #ifdef CONFIG_COMPAT - rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1); + rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); #else - rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1); + rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); #endif else - rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1); + rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); return rnd << PAGE_SHIFT; } @@ -94,18 +94,6 @@ static unsigned long mmap_base(unsigned long rnd) } /* - * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 - * does, but not when emulating X86_32 - */ -static unsigned long mmap_legacy_base(unsigned long rnd) -{ - if (mmap_is_ia32()) - return TASK_UNMAPPED_BASE; - else - return TASK_UNMAPPED_BASE + rnd; -} - -/* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ @@ -116,7 +104,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); - mm->mmap_legacy_base = mmap_legacy_base(random_factor); + mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor; if (mmap_is_legacy()) { mm->mmap_base = mm->mmap_legacy_base; diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 84fa4a482c78..a0a0b9861902 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -123,7 +123,7 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, break; } - if (regno > nr_registers) { + if (regno >= nr_registers) { WARN_ONCE(1, "decoded an instruction with an invalid register"); return -EINVAL; } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2440814b0069..01be9ec3bf79 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -106,12 +106,6 @@ static inline unsigned long highmap_end_pfn(void) #endif -#ifdef CONFIG_DEBUG_PAGEALLOC -# define debug_pagealloc 1 -#else -# define debug_pagealloc 0 -#endif - static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -283,7 +277,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, __pa_symbol(__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) /* * Once the kernel maps the text as RO (kernel_set_to_readonly is set), * kernel text mappings for the large page aligned text, rodata sections @@ -419,24 +413,30 @@ pmd_t *lookup_pmd_address(unsigned long address) phys_addr_t slow_virt_to_phys(void *__virt_addr) { unsigned long virt_addr = (unsigned long)__virt_addr; - unsigned long phys_addr, offset; + phys_addr_t phys_addr; + unsigned long offset; enum pg_level level; pte_t *pte; pte = lookup_address(virt_addr, &level); BUG_ON(!pte); + /* + * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t + * before being left-shifted PAGE_SHIFT bits -- this trick is to + * make 32-PAE kernel work correctly. + */ switch (level) { case PG_LEVEL_1G: - phys_addr = pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; + phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; offset = virt_addr & ~PUD_PAGE_MASK; break; case PG_LEVEL_2M: - phys_addr = pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; + phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; offset = virt_addr & ~PMD_PAGE_MASK; break; default: - phys_addr = pte_pfn(*pte) << PAGE_SHIFT; + phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; offset = virt_addr & ~PAGE_MASK; } @@ -708,10 +708,10 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, { struct page *base; - if (!debug_pagealloc) + if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); - if (!debug_pagealloc) + if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); if (!base) return -ENOMEM; @@ -909,16 +909,25 @@ static void populate_pte(struct cpa_data *cpa, pte = pte_offset_kernel(pmd, start); - while (num_pages-- && start < end) { + /* + * Set the GLOBAL flags only if the PRESENT flag is + * set otherwise pte_present will return true even on + * a non present pte. The canon_pgprot will clear + * _PAGE_GLOBAL for the ancient hardware that doesn't + * support it. + */ + if (pgprot_val(pgprot) & _PAGE_PRESENT) + pgprot_val(pgprot) |= _PAGE_GLOBAL; + else + pgprot_val(pgprot) &= ~_PAGE_GLOBAL; - /* deal with the NX bit */ - if (!(pgprot_val(pgprot) & _PAGE_NX)) - cpa->pfn &= ~_PAGE_NX; + pgprot = canon_pgprot(pgprot); - set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot)); + while (num_pages-- && start < end) { + set_pte(pte, pfn_pte(cpa->pfn, pgprot)); start += PAGE_SIZE; - cpa->pfn += PAGE_SIZE; + cpa->pfn++; pte++; } } @@ -974,11 +983,11 @@ static int populate_pmd(struct cpa_data *cpa, pmd = pmd_offset(pud, start); - set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | + set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | massage_pgprot(pmd_pgprot))); start += PMD_SIZE; - cpa->pfn += PMD_SIZE; + cpa->pfn += PMD_SIZE >> PAGE_SHIFT; cur_pages += PMD_SIZE >> PAGE_SHIFT; } @@ -1046,12 +1055,12 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, /* * Map everything starting from the Gb boundary, possibly with 1G pages */ - while (end - start >= PUD_SIZE) { - set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | + while (cpu_has_gbpages && end - start >= PUD_SIZE) { + set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | massage_pgprot(pud_pgprot))); start += PUD_SIZE; - cpa->pfn += PUD_SIZE; + cpa->pfn += PUD_SIZE >> PAGE_SHIFT; cur_pages += PUD_SIZE >> PAGE_SHIFT; pud++; } @@ -1122,8 +1131,10 @@ static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, /* * Ignore all non primary paths. */ - if (!primary) + if (!primary) { + cpa->numpages = 1; return 0; + } /* * Ignore the NULL PTE for kernel identity mapping, as it is expected @@ -1331,10 +1342,10 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) cpa->numpages = 1; - if (!debug_pagealloc) + if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); ret = __change_page_attr(cpa, checkalias); - if (!debug_pagealloc) + if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); if (ret) return ret; @@ -1962,6 +1973,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, if (!(page_flags & _PAGE_NX)) cpa.mask_clr = __pgprot(_PAGE_NX); + if (!(page_flags & _PAGE_RW)) + cpa.mask_clr = __pgprot(_PAGE_RW); + cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); retval = __change_page_attr_set_clr(&cpa, 0); diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S index 4093216b3791..f2a7faf4706e 100644 --- a/arch/x86/net/bpf_jit.S +++ b/arch/x86/net/bpf_jit.S @@ -8,6 +8,7 @@ * of the License. */ #include <linux/linkage.h> +#include <asm/frame.h> /* * Calling convention : @@ -22,15 +23,16 @@ 32 /* space for rbx,r13,r14,r15 */ + \ 8 /* space for skb_copy_bits */) -sk_load_word: - .globl sk_load_word +#define FUNC(name) \ + .globl name; \ + .type name, @function; \ + name: +FUNC(sk_load_word) test %esi,%esi js bpf_slow_path_word_neg -sk_load_word_positive_offset: - .globl sk_load_word_positive_offset - +FUNC(sk_load_word_positive_offset) mov %r9d,%eax # hlen sub %esi,%eax # hlen - offset cmp $3,%eax @@ -39,15 +41,11 @@ sk_load_word_positive_offset: bswap %eax /* ntohl() */ ret -sk_load_half: - .globl sk_load_half - +FUNC(sk_load_half) test %esi,%esi js bpf_slow_path_half_neg -sk_load_half_positive_offset: - .globl sk_load_half_positive_offset - +FUNC(sk_load_half_positive_offset) mov %r9d,%eax sub %esi,%eax # hlen - offset cmp $1,%eax @@ -56,15 +54,11 @@ sk_load_half_positive_offset: rol $8,%ax # ntohs() ret -sk_load_byte: - .globl sk_load_byte - +FUNC(sk_load_byte) test %esi,%esi js bpf_slow_path_byte_neg -sk_load_byte_positive_offset: - .globl sk_load_byte_positive_offset - +FUNC(sk_load_byte_positive_offset) cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte */ jle bpf_slow_path_byte movzbl (SKBDATA,%rsi),%eax @@ -72,16 +66,18 @@ sk_load_byte_positive_offset: /* rsi contains offset and can be scratched */ #define bpf_slow_path_common(LEN) \ + lea -MAX_BPF_STACK + 32(%rbp), %rdx;\ + FRAME_BEGIN; \ mov %rbx, %rdi; /* arg1 == skb */ \ push %r9; \ push SKBDATA; \ /* rsi already has offset */ \ mov $LEN,%ecx; /* len */ \ - lea - MAX_BPF_STACK + 32(%rbp),%rdx; \ call skb_copy_bits; \ test %eax,%eax; \ pop SKBDATA; \ - pop %r9; + pop %r9; \ + FRAME_END bpf_slow_path_word: @@ -106,6 +102,7 @@ bpf_slow_path_byte: ret #define sk_negative_common(SIZE) \ + FRAME_BEGIN; \ mov %rbx, %rdi; /* arg1 == skb */ \ push %r9; \ push SKBDATA; \ @@ -115,13 +112,14 @@ bpf_slow_path_byte: test %rax,%rax; \ pop SKBDATA; \ pop %r9; \ + FRAME_END; \ jz bpf_error bpf_slow_path_word_neg: cmp SKF_MAX_NEG_OFF, %esi /* test range */ jl bpf_error /* offset lower -> error */ -sk_load_word_negative_offset: - .globl sk_load_word_negative_offset + +FUNC(sk_load_word_negative_offset) sk_negative_common(4) mov (%rax), %eax bswap %eax @@ -130,8 +128,8 @@ sk_load_word_negative_offset: bpf_slow_path_half_neg: cmp SKF_MAX_NEG_OFF, %esi jl bpf_error -sk_load_half_negative_offset: - .globl sk_load_half_negative_offset + +FUNC(sk_load_half_negative_offset) sk_negative_common(2) mov (%rax),%ax rol $8,%ax @@ -141,8 +139,8 @@ sk_load_half_negative_offset: bpf_slow_path_byte_neg: cmp SKF_MAX_NEG_OFF, %esi jl bpf_error -sk_load_byte_negative_offset: - .globl sk_load_byte_negative_offset + +FUNC(sk_load_byte_negative_offset) sk_negative_common(1) movzbl (%rax), %eax ret diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 4e664bdb535a..cb31a4440e58 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -23,12 +23,13 @@ static int backtrace_stack(void *data, char *name) return 0; } -static void backtrace_address(void *data, unsigned long addr, int reliable) +static int backtrace_address(void *data, unsigned long addr, int reliable) { unsigned int *depth = data; if ((*depth)--) oprofile_add_trace(addr); + return 0; } static struct stacktrace_ops backtrace_ops = { diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 2879efc73a96..381a43c40bf7 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -12,7 +12,6 @@ #include <linux/dmi.h> #include <linux/slab.h> -#include <asm-generic/pci-bridge.h> #include <asm/acpi.h> #include <asm/segment.h> #include <asm/io.h> @@ -711,28 +710,22 @@ int pcibios_add_device(struct pci_dev *dev) return 0; } -int pcibios_alloc_irq(struct pci_dev *dev) +int pcibios_enable_device(struct pci_dev *dev, int mask) { - /* - * If the PCI device was already claimed by core code and has - * MSI enabled, probing of the pcibios IRQ will overwrite - * dev->irq. So bail out if MSI is already enabled. - */ - if (pci_dev_msi_enabled(dev)) - return -EBUSY; + int err; - return pcibios_enable_irq(dev); -} + if ((err = pci_enable_resources(dev, mask)) < 0) + return err; -void pcibios_free_irq(struct pci_dev *dev) -{ - if (pcibios_disable_irq) - pcibios_disable_irq(dev); + if (!pci_dev_msi_enabled(dev)) + return pcibios_enable_irq(dev); + return 0; } -int pcibios_enable_device(struct pci_dev *dev, int mask) +void pcibios_disable_device (struct pci_dev *dev) { - return pci_enable_resources(dev, mask); + if (!pci_dev_msi_enabled(dev) && pcibios_disable_irq) + pcibios_disable_irq(dev); } int pci_ext_cfg_avail(void) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index e58565556703..b7de1929714b 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -297,14 +297,14 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_r * * The standard boot ROM sequence for an x86 machine uses the BIOS * to select an initial video card for boot display. This boot video - * card will have it's BIOS copied to C0000 in system RAM. + * card will have its BIOS copied to 0xC0000 in system RAM. * IORESOURCE_ROM_SHADOW is used to associate the boot video * card with this copy. On laptops this copy has to be used since * the main ROM may be compressed or combined with another image. * See pci_map_rom() for use of this flag. Before marking the device * with IORESOURCE_ROM_SHADOW check if a vga_default_device is already set - * by either arch cde or vga-arbitration, if so only apply the fixup to this - * already determined primary video card. + * by either arch code or vga-arbitration; if so only apply the fixup to this + * already-determined primary video card. */ static void pci_fixup_video(struct pci_dev *pdev) @@ -312,6 +312,7 @@ static void pci_fixup_video(struct pci_dev *pdev) struct pci_dev *bridge; struct pci_bus *bus; u16 config; + struct resource *res; /* Is VGA routed to us? */ bus = pdev->bus; @@ -336,8 +337,18 @@ static void pci_fixup_video(struct pci_dev *pdev) if (!vga_default_device() || pdev == vga_default_device()) { pci_read_config_word(pdev, PCI_COMMAND, &config); if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) { - pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW; - dev_printk(KERN_DEBUG, &pdev->dev, "Video device with shadowed ROM\n"); + res = &pdev->resource[PCI_ROM_RESOURCE]; + + pci_disable_rom(pdev); + if (res->parent) + release_resource(res); + + res->start = 0xC0000; + res->end = res->start + 0x20000 - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_ROM_SHADOW | + IORESOURCE_PCI_FIXED; + dev_info(&pdev->dev, "Video device with shadowed ROM at %pR\n", + res); } } } @@ -540,3 +551,10 @@ static void twinhead_reserve_killing_zone(struct pci_dev *dev) } } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone); + +static void pci_bdwep_bar(struct pci_dev *dev) +{ + dev->non_compliant_bars = 1; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_bdwep_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_bdwep_bar); diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 0d24e7c10145..8b93e634af84 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -215,7 +215,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) int polarity; int ret; - if (pci_has_managed_irq(dev)) + if (dev->irq_managed && dev->irq > 0) return 0; switch (intel_mid_identify_cpu()) { @@ -256,13 +256,10 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) static void intel_mid_pci_irq_disable(struct pci_dev *dev) { - if (pci_has_managed_irq(dev)) { + if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed && + dev->irq > 0) { mp_unmap_irq(dev->irq); dev->irq_managed = 0; - /* - * Don't reset dev->irq here, otherwise - * intel_mid_pci_irq_enable() will fail on next call. - */ } } diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 32e70343e6fd..9bd115484745 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1202,7 +1202,7 @@ static int pirq_enable_irq(struct pci_dev *dev) struct pci_dev *temp_dev; int irq; - if (pci_has_managed_irq(dev)) + if (dev->irq_managed && dev->irq > 0) return 0; irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, @@ -1230,7 +1230,8 @@ static int pirq_enable_irq(struct pci_dev *dev) } dev = temp_dev; if (irq >= 0) { - pci_set_managed_irq(dev, irq); + dev->irq_managed = 1; + dev->irq = irq; dev_info(&dev->dev, "PCI->APIC IRQ transform: " "INT %c -> IRQ %d\n", 'A' + pin - 1, irq); return 0; @@ -1256,10 +1257,24 @@ static int pirq_enable_irq(struct pci_dev *dev) return 0; } +bool mp_should_keep_irq(struct device *dev) +{ + if (dev->power.is_prepared) + return true; +#ifdef CONFIG_PM + if (dev->power.runtime_status == RPM_SUSPENDING) + return true; +#endif + + return false; +} + static void pirq_disable_irq(struct pci_dev *dev) { - if (io_apic_assign_pci_irqs && pci_has_managed_irq(dev)) { + if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) && + dev->irq_managed && dev->irq) { mp_unmap_irq(dev->irq); - pci_reset_managed_irq(dev); + dev->irq = 0; + dev->irq_managed = 0; } } diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c index d57e48016f15..7792aba266df 100644 --- a/arch/x86/pci/vmd.c +++ b/arch/x86/pci/vmd.c @@ -503,6 +503,18 @@ static struct pci_ops vmd_ops = { .write = vmd_pci_write, }; +static void vmd_attach_resources(struct vmd_dev *vmd) +{ + vmd->dev->resource[VMD_MEMBAR1].child = &vmd->resources[1]; + vmd->dev->resource[VMD_MEMBAR2].child = &vmd->resources[2]; +} + +static void vmd_detach_resources(struct vmd_dev *vmd) +{ + vmd->dev->resource[VMD_MEMBAR1].child = NULL; + vmd->dev->resource[VMD_MEMBAR2].child = NULL; +} + /* * VMD domains start at 0x1000 to not clash with ACPI _SEG domains. */ @@ -527,11 +539,28 @@ static int vmd_enable_domain(struct vmd_dev *vmd) res = &vmd->dev->resource[VMD_CFGBAR]; vmd->resources[0] = (struct resource) { .name = "VMD CFGBAR", - .start = res->start, + .start = 0, .end = (resource_size(res) >> 20) - 1, .flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED, }; + /* + * If the window is below 4GB, clear IORESOURCE_MEM_64 so we can + * put 32-bit resources in the window. + * + * There's no hardware reason why a 64-bit window *couldn't* + * contain a 32-bit resource, but pbus_size_mem() computes the + * bridge window size assuming a 64-bit window will contain no + * 32-bit resources. __pci_assign_resource() enforces that + * artificial restriction to make sure everything will fit. + * + * The only way we could use a 64-bit non-prefechable MEMBAR is + * if its address is <4GB so that we can convert it to a 32-bit + * resource. To be visible to the host OS, all VMD endpoints must + * be initially configured by platform BIOS, which includes setting + * up these resources. We can assume the device is configured + * according to the platform needs. + */ res = &vmd->dev->resource[VMD_MEMBAR1]; upper_bits = upper_32_bits(res->end); flags = res->flags & ~IORESOURCE_SIZEALIGN; @@ -542,6 +571,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd) .start = res->start, .end = res->end, .flags = flags, + .parent = res, }; res = &vmd->dev->resource[VMD_MEMBAR2]; @@ -554,6 +584,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd) .start = res->start + 0x2000, .end = res->end, .flags = flags, + .parent = res, }; sd->domain = vmd_find_free_domain(); @@ -578,6 +609,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd) return -ENODEV; } + vmd_attach_resources(vmd); vmd_setup_dma_ops(vmd); dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain); pci_rescan_bus(vmd->bus); @@ -674,6 +706,7 @@ static void vmd_remove(struct pci_dev *dev) { struct vmd_dev *vmd = pci_get_drvdata(dev); + vmd_detach_resources(vmd); pci_set_drvdata(dev, NULL); sysfs_remove_link(&vmd->dev->dev.kobj, "domain"); pci_stop_root_bus(vmd->bus); diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index ff31ab464213..beac4dfdade6 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -196,7 +196,10 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 0; error: - dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n"); + if (ret == -ENOSYS) + dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n"); + else if (ret) + dev_err(&dev->dev, "Xen PCI frontend error: %d!\n", ret); free: kfree(v); return ret; diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile index 2846aaab5103..066619b0700c 100644 --- a/arch/x86/platform/efi/Makefile +++ b/arch/x86/platform/efi/Makefile @@ -1,3 +1,5 @@ +OBJECT_FILES_NON_STANDARD_efi_thunk_$(BITS).o := y + obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index ea48449b2e63..a2433817c987 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -10,6 +10,9 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/init.h> #include <linux/acpi.h> @@ -28,8 +31,7 @@ struct bmp_header { void __init efi_bgrt_init(void) { acpi_status status; - void __iomem *image; - bool ioremapped = false; + void *image; struct bmp_header bmp_header; if (acpi_disabled) @@ -55,11 +57,6 @@ void __init efi_bgrt_init(void) bgrt_tab->status); return; } - if (bgrt_tab->status != 1) { - pr_debug("Ignoring BGRT: invalid status %u (expected 1)\n", - bgrt_tab->status); - return; - } if (bgrt_tab->image_type != 0) { pr_err("Ignoring BGRT: invalid image type %u (expected 0)\n", bgrt_tab->image_type); @@ -70,20 +67,19 @@ void __init efi_bgrt_init(void) return; } - image = efi_lookup_mapped_addr(bgrt_tab->image_address); + image = memremap(bgrt_tab->image_address, sizeof(bmp_header), MEMREMAP_WB); if (!image) { - image = early_ioremap(bgrt_tab->image_address, - sizeof(bmp_header)); - ioremapped = true; - if (!image) { - pr_err("Ignoring BGRT: failed to map image header memory\n"); - return; - } + pr_err("Ignoring BGRT: failed to map image header memory\n"); + return; } - memcpy_fromio(&bmp_header, image, sizeof(bmp_header)); - if (ioremapped) - early_iounmap(image, sizeof(bmp_header)); + memcpy(&bmp_header, image, sizeof(bmp_header)); + memunmap(image); + if (bmp_header.id != 0x4d42) { + pr_err("Ignoring BGRT: Incorrect BMP magic number 0x%x (expected 0x4d42)\n", + bmp_header.id); + return; + } bgrt_image_size = bmp_header.size; bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN); @@ -93,18 +89,14 @@ void __init efi_bgrt_init(void) return; } - if (ioremapped) { - image = early_ioremap(bgrt_tab->image_address, - bmp_header.size); - if (!image) { - pr_err("Ignoring BGRT: failed to map image memory\n"); - kfree(bgrt_image); - bgrt_image = NULL; - return; - } + image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB); + if (!image) { + pr_err("Ignoring BGRT: failed to map image memory\n"); + kfree(bgrt_image); + bgrt_image = NULL; + return; } - memcpy_fromio(bgrt_image, image, bgrt_image_size); - if (ioremapped) - early_iounmap(image, bmp_header.size); + memcpy(bgrt_image, image, bgrt_image_size); + memunmap(image); } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index ad285404ea7f..994a7df84a7b 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -235,10 +235,10 @@ void __init efi_print_memmap(void) char buf[64]; md = p; - pr_info("mem%02u: %s range=[0x%016llx-0x%016llx) (%lluMB)\n", + pr_info("mem%02u: %s range=[0x%016llx-0x%016llx] (%lluMB)\n", i, efi_md_typeattr_format(buf, sizeof(buf), md), md->phys_addr, - md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), + md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1, (md->num_pages >> (20 - EFI_PAGE_SHIFT))); } #endif /* EFI_DEBUG */ @@ -815,6 +815,7 @@ static void __init kexec_enter_virtual_mode(void) { #ifdef CONFIG_KEXEC_CORE efi_memory_desc_t *md; + unsigned int num_pages; void *p; efi.systab = NULL; @@ -829,6 +830,12 @@ static void __init kexec_enter_virtual_mode(void) return; } + if (efi_alloc_page_tables()) { + pr_err("Failed to allocate EFI page tables\n"); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + return; + } + /* * Map efi regions which were passed via setup_data. The virt_addr is a * fixed addr which was used in first kernel of a kexec boot. @@ -843,6 +850,14 @@ static void __init kexec_enter_virtual_mode(void) BUG_ON(!efi.systab); + num_pages = ALIGN(memmap.nr_map * memmap.desc_size, PAGE_SIZE); + num_pages >>= PAGE_SHIFT; + + if (efi_setup_page_tables(memmap.phys_map, num_pages)) { + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + return; + } + efi_sync_low_kernel_mappings(); /* @@ -869,7 +884,7 @@ static void __init kexec_enter_virtual_mode(void) * This function will switch the EFI runtime services to virtual mode. * Essentially, we look through the EFI memmap and map every region that * has the runtime attribute bit set in its memory descriptor into the - * ->trampoline_pgd page table using a top-down VA allocation scheme. + * efi_pgd page table. * * The old method which used to update that memory descriptor with the * virtual address obtained from ioremap() is still supported when the @@ -879,8 +894,8 @@ static void __init kexec_enter_virtual_mode(void) * * The new method does a pagetable switch in a preemption-safe manner * so that we're in a different address space when calling a runtime - * function. For function arguments passing we do copy the PGDs of the - * kernel page table into ->trampoline_pgd prior to each call. + * function. For function arguments passing we do copy the PUDs of the + * kernel page table into efi_pgd prior to each call. * * Specially for kexec boot, efi runtime maps in previous kernel should * be passed in via setup_data. In that case runtime ranges will be mapped @@ -895,6 +910,12 @@ static void __init __efi_enter_virtual_mode(void) efi.systab = NULL; + if (efi_alloc_page_tables()) { + pr_err("Failed to allocate EFI page tables\n"); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + return; + } + efi_merge_regions(); new_memmap = efi_map_regions(&count, &pg_shift); if (!new_memmap) { @@ -913,7 +934,6 @@ static void __init __efi_enter_virtual_mode(void) } efi_sync_low_kernel_mappings(); - efi_dump_pagetable(); if (efi_is_native()) { status = phys_efi_set_virtual_address_map( @@ -951,31 +971,20 @@ static void __init __efi_enter_virtual_mode(void) efi.set_virtual_address_map = NULL; - efi_runtime_mkexec(); + /* + * Apply more restrictive page table mapping attributes now that + * SVAM() has been called and the firmware has performed all + * necessary relocation fixups for the new virtual addresses. + */ + efi_runtime_update_mappings(); + efi_dump_pagetable(); /* - * We mapped the descriptor array into the EFI pagetable above but we're - * not unmapping it here. Here's why: - * - * We're copying select PGDs from the kernel page table to the EFI page - * table and when we do so and make changes to those PGDs like unmapping - * stuff from them, those changes appear in the kernel page table and we - * go boom. - * - * From setup_real_mode(): - * - * ... - * trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd; - * - * In this particular case, our allocation is in PGD 0 of the EFI page - * table but we've copied that PGD from PGD[272] of the EFI page table: - * - * pgd_index(__PAGE_OFFSET = 0xffff880000000000) = 272 - * - * where the direct memory mapping in kernel space is. - * - * new_memmap's VA comes from that direct mapping and thus clearing it, - * it would get cleared in the kernel page table too. + * We mapped the descriptor array into the EFI pagetable above + * but we're not unmapping it here because if we're running in + * EFI mixed mode we need all of memory to be accessible when + * we pass parameters to the EFI runtime services in the + * thunking code. * * efi_cleanup_page_tables(__pa(new_memmap), 1 << pg_shift); */ diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index ed5b67338294..338402b91d2e 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -38,6 +38,11 @@ * say 0 - 3G. */ +int __init efi_alloc_page_tables(void) +{ + return 0; +} + void efi_sync_low_kernel_mappings(void) {} void __init efi_dump_pagetable(void) {} int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) @@ -85,7 +90,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) __flush_tlb_all(); } -void __init efi_runtime_mkexec(void) +void __init efi_runtime_update_mappings(void) { if (__supported_pte_mask & _PAGE_NX) runtime_code_page_mkexec(); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index a0ac0f9c307f..49e4dd4a1f58 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -15,6 +15,8 @@ * */ +#define pr_fmt(fmt) "efi: " fmt + #include <linux/kernel.h> #include <linux/init.h> #include <linux/mm.h> @@ -40,6 +42,7 @@ #include <asm/fixmap.h> #include <asm/realmode.h> #include <asm/time.h> +#include <asm/pgalloc.h> /* * We allocate runtime services regions bottom-up, starting from -4G, i.e. @@ -47,16 +50,7 @@ */ static u64 efi_va = EFI_VA_START; -/* - * Scratch space used for switching the pagetable in the EFI stub - */ -struct efi_scratch { - u64 r15; - u64 prev_cr3; - pgd_t *efi_pgt; - bool use_pgd; - u64 phys_stack; -} __packed; +struct efi_scratch efi_scratch; static void __init early_code_mapping_set_exec(int executable) { @@ -83,8 +77,11 @@ pgd_t * __init efi_call_phys_prolog(void) int pgd; int n_pgds; - if (!efi_enabled(EFI_OLD_MEMMAP)) - return NULL; + if (!efi_enabled(EFI_OLD_MEMMAP)) { + save_pgd = (pgd_t *)read_cr3(); + write_cr3((unsigned long)efi_scratch.efi_pgt); + goto out; + } early_code_mapping_set_exec(1); @@ -96,6 +93,7 @@ pgd_t * __init efi_call_phys_prolog(void) vaddress = (unsigned long)__va(pgd * PGDIR_SIZE); set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress)); } +out: __flush_tlb_all(); return save_pgd; @@ -109,8 +107,11 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) int pgd_idx; int nr_pgds; - if (!save_pgd) + if (!efi_enabled(EFI_OLD_MEMMAP)) { + write_cr3((unsigned long)save_pgd); + __flush_tlb_all(); return; + } nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE); @@ -123,27 +124,98 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) early_code_mapping_set_exec(0); } +static pgd_t *efi_pgd; + +/* + * We need our own copy of the higher levels of the page tables + * because we want to avoid inserting EFI region mappings (EFI_VA_END + * to EFI_VA_START) into the standard kernel page tables. Everything + * else can be shared, see efi_sync_low_kernel_mappings(). + */ +int __init efi_alloc_page_tables(void) +{ + pgd_t *pgd; + pud_t *pud; + gfp_t gfp_mask; + + if (efi_enabled(EFI_OLD_MEMMAP)) + return 0; + + gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO; + efi_pgd = (pgd_t *)__get_free_page(gfp_mask); + if (!efi_pgd) + return -ENOMEM; + + pgd = efi_pgd + pgd_index(EFI_VA_END); + + pud = pud_alloc_one(NULL, 0); + if (!pud) { + free_page((unsigned long)efi_pgd); + return -ENOMEM; + } + + pgd_populate(NULL, pgd, pud); + + return 0; +} + /* * Add low kernel mappings for passing arguments to EFI functions. */ void efi_sync_low_kernel_mappings(void) { - unsigned num_pgds; - pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd); + unsigned num_entries; + pgd_t *pgd_k, *pgd_efi; + pud_t *pud_k, *pud_efi; if (efi_enabled(EFI_OLD_MEMMAP)) return; - num_pgds = pgd_index(MODULES_END - 1) - pgd_index(PAGE_OFFSET); + /* + * We can share all PGD entries apart from the one entry that + * covers the EFI runtime mapping space. + * + * Make sure the EFI runtime region mappings are guaranteed to + * only span a single PGD entry and that the entry also maps + * other important kernel regions. + */ + BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END)); + BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) != + (EFI_VA_END & PGDIR_MASK)); + + pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET); + pgd_k = pgd_offset_k(PAGE_OFFSET); - memcpy(pgd + pgd_index(PAGE_OFFSET), - init_mm.pgd + pgd_index(PAGE_OFFSET), - sizeof(pgd_t) * num_pgds); + num_entries = pgd_index(EFI_VA_END) - pgd_index(PAGE_OFFSET); + memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries); + + /* + * We share all the PUD entries apart from those that map the + * EFI regions. Copy around them. + */ + BUILD_BUG_ON((EFI_VA_START & ~PUD_MASK) != 0); + BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0); + + pgd_efi = efi_pgd + pgd_index(EFI_VA_END); + pud_efi = pud_offset(pgd_efi, 0); + + pgd_k = pgd_offset_k(EFI_VA_END); + pud_k = pud_offset(pgd_k, 0); + + num_entries = pud_index(EFI_VA_END); + memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); + + pud_efi = pud_offset(pgd_efi, EFI_VA_START); + pud_k = pud_offset(pgd_k, EFI_VA_START); + + num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START); + memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); } int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { - unsigned long text; + unsigned long pfn, text; + efi_memory_desc_t *md; struct page *page; unsigned npages; pgd_t *pgd; @@ -151,8 +223,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) if (efi_enabled(EFI_OLD_MEMMAP)) return 0; - efi_scratch.efi_pgt = (pgd_t *)(unsigned long)real_mode_header->trampoline_pgd; - pgd = __va(efi_scratch.efi_pgt); + efi_scratch.efi_pgt = (pgd_t *)__pa(efi_pgd); + pgd = efi_pgd; /* * It can happen that the physical address of new_memmap lands in memory @@ -160,7 +232,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * and ident-map those pages containing the map before calling * phys_efi_set_virtual_address_map(). */ - if (kernel_map_pages_in_pgd(pgd, pa_memmap, pa_memmap, num_pages, _PAGE_NX)) { + pfn = pa_memmap >> PAGE_SHIFT; + if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, _PAGE_NX | _PAGE_RW)) { pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap); return 1; } @@ -176,6 +249,25 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) if (!IS_ENABLED(CONFIG_EFI_MIXED)) return 0; + /* + * Map all of RAM so that we can access arguments in the 1:1 + * mapping when making EFI runtime calls. + */ + for_each_efi_memory_desc(&memmap, md) { + if (md->type != EFI_CONVENTIONAL_MEMORY && + md->type != EFI_LOADER_DATA && + md->type != EFI_LOADER_CODE) + continue; + + pfn = md->phys_addr >> PAGE_SHIFT; + npages = md->num_pages; + + if (kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, npages, _PAGE_RW)) { + pr_err("Failed to map 1:1 memory\n"); + return 1; + } + } + page = alloc_page(GFP_KERNEL|__GFP_DMA32); if (!page) panic("Unable to allocate EFI runtime stack < 4GB\n"); @@ -183,10 +275,11 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) efi_scratch.phys_stack = virt_to_phys(page_address(page)); efi_scratch.phys_stack += PAGE_SIZE; /* stack grows down */ - npages = (_end - _text) >> PAGE_SHIFT; + npages = (_etext - _text) >> PAGE_SHIFT; text = __pa(_text); + pfn = text >> PAGE_SHIFT; - if (kernel_map_pages_in_pgd(pgd, text >> PAGE_SHIFT, text, npages, 0)) { + if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, _PAGE_RW)) { pr_err("Failed to map kernel text 1:1\n"); return 1; } @@ -196,20 +289,20 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) { - pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd); - - kernel_unmap_pages_in_pgd(pgd, pa_memmap, num_pages); + kernel_unmap_pages_in_pgd(efi_pgd, pa_memmap, num_pages); } static void __init __map_region(efi_memory_desc_t *md, u64 va) { - pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd); - unsigned long pf = 0; + unsigned long flags = _PAGE_RW; + unsigned long pfn; + pgd_t *pgd = efi_pgd; if (!(md->attribute & EFI_MEMORY_WB)) - pf |= _PAGE_PCD; + flags |= _PAGE_PCD; - if (kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf)) + pfn = md->phys_addr >> PAGE_SHIFT; + if (kernel_map_pages_in_pgd(pgd, pfn, va, md->num_pages, flags)) pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", md->phys_addr, va); } @@ -300,21 +393,56 @@ void __init parse_efi_setup(u64 phys_addr, u32 data_len) efi_setup = phys_addr + sizeof(struct setup_data); } -void __init efi_runtime_mkexec(void) +void __init efi_runtime_update_mappings(void) { - if (!efi_enabled(EFI_OLD_MEMMAP)) + unsigned long pfn; + pgd_t *pgd = efi_pgd; + efi_memory_desc_t *md; + void *p; + + if (efi_enabled(EFI_OLD_MEMMAP)) { + if (__supported_pte_mask & _PAGE_NX) + runtime_code_page_mkexec(); return; + } + + if (!efi_enabled(EFI_NX_PE_DATA)) + return; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + unsigned long pf = 0; + md = p; + + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; - if (__supported_pte_mask & _PAGE_NX) - runtime_code_page_mkexec(); + if (!(md->attribute & EFI_MEMORY_WB)) + pf |= _PAGE_PCD; + + if ((md->attribute & EFI_MEMORY_XP) || + (md->type == EFI_RUNTIME_SERVICES_DATA)) + pf |= _PAGE_NX; + + if (!(md->attribute & EFI_MEMORY_RO) && + (md->type != EFI_RUNTIME_SERVICES_CODE)) + pf |= _PAGE_RW; + + /* Update the 1:1 mapping */ + pfn = md->phys_addr >> PAGE_SHIFT; + if (kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, md->num_pages, pf)) + pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", + md->phys_addr, md->virt_addr); + + if (kernel_map_pages_in_pgd(pgd, pfn, md->virt_addr, md->num_pages, pf)) + pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", + md->phys_addr, md->virt_addr); + } } void __init efi_dump_pagetable(void) { #ifdef CONFIG_EFI_PGT_DUMP - pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd); - - ptdump_walk_pgd_level(NULL, pgd); + ptdump_walk_pgd_level(NULL, efi_pgd); #endif } diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 86d0f9e08dd9..92723aeae0f9 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -11,6 +11,7 @@ #include <asm/msr.h> #include <asm/processor-flags.h> #include <asm/page_types.h> +#include <asm/frame.h> #define SAVE_XMM \ mov %rsp, %rax; \ @@ -38,42 +39,8 @@ mov %rsi, %cr0; \ mov (%rsp), %rsp - /* stolen from gcc */ - .macro FLUSH_TLB_ALL - movq %r15, efi_scratch(%rip) - movq %r14, efi_scratch+8(%rip) - movq %cr4, %r15 - movq %r15, %r14 - andb $0x7f, %r14b - movq %r14, %cr4 - movq %r15, %cr4 - movq efi_scratch+8(%rip), %r14 - movq efi_scratch(%rip), %r15 - .endm - - .macro SWITCH_PGT - cmpb $0, efi_scratch+24(%rip) - je 1f - movq %r15, efi_scratch(%rip) # r15 - # save previous CR3 - movq %cr3, %r15 - movq %r15, efi_scratch+8(%rip) # prev_cr3 - movq efi_scratch+16(%rip), %r15 # EFI pgt - movq %r15, %cr3 - 1: - .endm - - .macro RESTORE_PGT - cmpb $0, efi_scratch+24(%rip) - je 2f - movq efi_scratch+8(%rip), %r15 - movq %r15, %cr3 - movq efi_scratch(%rip), %r15 - FLUSH_TLB_ALL - 2: - .endm - ENTRY(efi_call) + FRAME_BEGIN SAVE_XMM mov (%rsp), %rax mov 8(%rax), %rax @@ -83,16 +50,9 @@ ENTRY(efi_call) mov %r8, %r9 mov %rcx, %r8 mov %rsi, %rcx - SWITCH_PGT call *%rdi - RESTORE_PGT addq $48, %rsp RESTORE_XMM + FRAME_END ret ENDPROC(efi_call) - - .data -ENTRY(efi_scratch) - .fill 3,8,0 - .byte 0 - .quad 0 diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 2d66db8f80f9..ab50ada1d56e 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) "efi: " fmt + #include <linux/init.h> #include <linux/kernel.h> #include <linux/string.h> @@ -55,13 +57,41 @@ void efi_delete_dummy_variable(void) } /* + * In the nonblocking case we do not attempt to perform garbage + * collection if we do not have enough free space. Rather, we do the + * bare minimum check and give up immediately if the available space + * is below EFI_MIN_RESERVE. + * + * This function is intended to be small and simple because it is + * invoked from crash handler paths. + */ +static efi_status_t +query_variable_store_nonblocking(u32 attributes, unsigned long size) +{ + efi_status_t status; + u64 storage_size, remaining_size, max_size; + + status = efi.query_variable_info_nonblocking(attributes, &storage_size, + &remaining_size, + &max_size); + if (status != EFI_SUCCESS) + return status; + + if (remaining_size - size < EFI_MIN_RESERVE) + return EFI_OUT_OF_RESOURCES; + + return EFI_SUCCESS; +} + +/* * Some firmware implementations refuse to boot if there's insufficient space * in the variable store. Ensure that we never use more than a safe limit. * * Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable * store. */ -efi_status_t efi_query_variable_store(u32 attributes, unsigned long size) +efi_status_t efi_query_variable_store(u32 attributes, unsigned long size, + bool nonblocking) { efi_status_t status; u64 storage_size, remaining_size, max_size; @@ -69,6 +99,9 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size) if (!(attributes & EFI_VARIABLE_NON_VOLATILE)) return 0; + if (nonblocking) + return query_variable_store_nonblocking(attributes, size); + status = efi.query_variable_info(attributes, &storage_size, &remaining_size, &max_size); if (status != EFI_SUCCESS) @@ -131,6 +164,27 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size) EXPORT_SYMBOL_GPL(efi_query_variable_store); /* + * Helper function for efi_reserve_boot_services() to figure out if we + * can free regions in efi_free_boot_services(). + * + * Use this function to ensure we do not free regions owned by somebody + * else. We must only reserve (and then free) regions: + * + * - Not within any part of the kernel + * - Not the BIOS reserved area (E820_RESERVED, E820_NVS, etc) + */ +static bool can_free_region(u64 start, u64 size) +{ + if (start + size > __pa_symbol(_text) && start <= __pa_symbol(_end)) + return false; + + if (!e820_all_mapped(start, start+size, E820_RAM)) + return false; + + return true; +} + +/* * The UEFI specification makes it clear that the operating system is free to do * whatever it wants with boot services code after ExitBootServices() has been * called. Ignoring this recommendation a significant bunch of EFI implementations @@ -147,26 +201,50 @@ void __init efi_reserve_boot_services(void) efi_memory_desc_t *md = p; u64 start = md->phys_addr; u64 size = md->num_pages << EFI_PAGE_SHIFT; + bool already_reserved; if (md->type != EFI_BOOT_SERVICES_CODE && md->type != EFI_BOOT_SERVICES_DATA) continue; - /* Only reserve where possible: - * - Not within any already allocated areas - * - Not over any memory area (really needed, if above?) - * - Not within any part of the kernel - * - Not the bios reserved area - */ - if ((start + size > __pa_symbol(_text) - && start <= __pa_symbol(_end)) || - !e820_all_mapped(start, start+size, E820_RAM) || - memblock_is_region_reserved(start, size)) { - /* Could not reserve, skip it */ - md->num_pages = 0; - memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n", - start, start+size-1); - } else + + already_reserved = memblock_is_region_reserved(start, size); + + /* + * Because the following memblock_reserve() is paired + * with free_bootmem_late() for this region in + * efi_free_boot_services(), we must be extremely + * careful not to reserve, and subsequently free, + * critical regions of memory (like the kernel image) or + * those regions that somebody else has already + * reserved. + * + * A good example of a critical region that must not be + * freed is page zero (first 4Kb of memory), which may + * contain boot services code/data but is marked + * E820_RESERVED by trim_bios_range(). + */ + if (!already_reserved) { memblock_reserve(start, size); + + /* + * If we are the first to reserve the region, no + * one else cares about it. We own it and can + * free it later. + */ + if (can_free_region(start, size)) + continue; + } + + /* + * We don't own the region. We must not free it. + * + * Setting this bit for a boot services region really + * doesn't make sense as far as the firmware is + * concerned, but it does provide us with a way to tag + * those regions that must not be paired with + * free_bootmem_late(). + */ + md->attribute |= EFI_MEMORY_RUNTIME; } } @@ -183,8 +261,8 @@ void __init efi_free_boot_services(void) md->type != EFI_BOOT_SERVICES_DATA) continue; - /* Could not reserve boot area */ - if (!size) + /* Do not free, someone else owns it: */ + if (md->attribute & EFI_MEMORY_RUNTIME) continue; free_bootmem_late(start, size); @@ -267,7 +345,7 @@ void __init efi_apply_memmap_quirks(void) * services. */ if (!efi_runtime_supported()) { - pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); + pr_info("Setup done, disabling due to 32/64-bit mismatch\n"); efi_unmap_memmap(); } diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c index 76b6632d3143..1865c196f136 100644 --- a/arch/x86/platform/geode/alix.c +++ b/arch/x86/platform/geode/alix.c @@ -21,7 +21,7 @@ #include <linux/init.h> #include <linux/io.h> #include <linux/string.h> -#include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/leds.h> #include <linux/platform_device.h> #include <linux/gpio.h> @@ -35,6 +35,11 @@ #define BIOS_SIGNATURE_COREBOOT 0x500 #define BIOS_REGION_SIZE 0x10000 +/* + * This driver is not modular, but to keep back compatibility + * with existing use cases, continuing with module_param is + * the easiest way forward. + */ static bool force = 0; module_param(force, bool, 0444); /* FIXME: Award bios is not automatically detected as Alix platform */ @@ -192,9 +197,4 @@ static int __init alix_init(void) return 0; } - -module_init(alix_init); - -MODULE_AUTHOR("Ed Wildgoose <kernel@wildgooses.com>"); -MODULE_DESCRIPTION("PCEngines ALIX System Setup"); -MODULE_LICENSE("GPL"); +device_initcall(alix_init); diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c index aa733fba2471..4fcdb91318a0 100644 --- a/arch/x86/platform/geode/geos.c +++ b/arch/x86/platform/geode/geos.c @@ -19,7 +19,6 @@ #include <linux/init.h> #include <linux/io.h> #include <linux/string.h> -#include <linux/module.h> #include <linux/leds.h> #include <linux/platform_device.h> #include <linux/gpio.h> @@ -120,9 +119,4 @@ static int __init geos_init(void) return 0; } - -module_init(geos_init); - -MODULE_AUTHOR("Philip Prindeville <philipp@redfish-solutions.com>"); -MODULE_DESCRIPTION("Traverse Technologies Geos System Setup"); -MODULE_LICENSE("GPL"); +device_initcall(geos_init); diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c index 927e38c0089f..a2f6b982a729 100644 --- a/arch/x86/platform/geode/net5501.c +++ b/arch/x86/platform/geode/net5501.c @@ -20,7 +20,6 @@ #include <linux/init.h> #include <linux/io.h> #include <linux/string.h> -#include <linux/module.h> #include <linux/leds.h> #include <linux/platform_device.h> #include <linux/gpio.h> @@ -146,9 +145,4 @@ static int __init net5501_init(void) return 0; } - -module_init(net5501_init); - -MODULE_AUTHOR("Philip Prindeville <philipp@redfish-solutions.com>"); -MODULE_DESCRIPTION("Soekris net5501 System Setup"); -MODULE_LICENSE("GPL"); +device_initcall(net5501_init); diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c index 23381d2174ae..1eb47b6298c2 100644 --- a/arch/x86/platform/intel-mid/mfld.c +++ b/arch/x86/platform/intel-mid/mfld.c @@ -52,10 +52,7 @@ static unsigned long __init mfld_calibrate_tsc(void) /* mark tsc clocksource as reliable */ set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); - if (fast_calibrate) - return fast_calibrate; - - return 0; + return fast_calibrate; } static void __init penwell_arch_setup(void) diff --git a/arch/x86/platform/intel-mid/mrfl.c b/arch/x86/platform/intel-mid/mrfl.c index aaca91753d32..bd1adc621781 100644 --- a/arch/x86/platform/intel-mid/mrfl.c +++ b/arch/x86/platform/intel-mid/mrfl.c @@ -81,10 +81,7 @@ static unsigned long __init tangier_calibrate_tsc(void) /* mark tsc clocksource as reliable */ set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); - if (fast_calibrate) - return fast_calibrate; - - return 0; + return fast_calibrate; } static void __init tangier_arch_setup(void) diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index c61b6c332e97..17d6d2296e4d 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -1,5 +1,5 @@ /** - * imr.c + * imr.c -- Intel Isolated Memory Region driver * * Copyright(c) 2013 Intel Corporation. * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie> @@ -31,7 +31,6 @@ #include <linux/debugfs.h> #include <linux/init.h> #include <linux/mm.h> -#include <linux/module.h> #include <linux/types.h> struct imr_device { @@ -135,11 +134,9 @@ static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr) * @idev: pointer to imr_device structure. * @imr_id: IMR entry to write. * @imr: IMR structure representing address and access masks. - * @lock: indicates if the IMR lock bit should be applied. * @return: 0 on success or error code passed from mbi_iosf on failure. */ -static int imr_write(struct imr_device *idev, u32 imr_id, - struct imr_regs *imr, bool lock) +static int imr_write(struct imr_device *idev, u32 imr_id, struct imr_regs *imr) { unsigned long flags; u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base; @@ -163,15 +160,6 @@ static int imr_write(struct imr_device *idev, u32 imr_id, if (ret) goto failed; - /* Lock bit must be set separately to addr_lo address bits. */ - if (lock) { - imr->addr_lo |= IMR_LOCK; - ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, - reg - IMR_NUM_REGS, imr->addr_lo); - if (ret) - goto failed; - } - local_irq_restore(flags); return 0; failed: @@ -270,17 +258,6 @@ static int imr_debugfs_register(struct imr_device *idev) } /** - * imr_debugfs_unregister - unregister debugfs hooks. - * - * @idev: pointer to imr_device structure. - * @return: - */ -static void imr_debugfs_unregister(struct imr_device *idev) -{ - debugfs_remove(idev->file); -} - -/** * imr_check_params - check passed address range IMR alignment and non-zero size * * @base: base address of intended IMR. @@ -334,11 +311,10 @@ static inline int imr_address_overlap(phys_addr_t addr, struct imr_regs *imr) * @size: physical size of region in bytes must be aligned to 1KiB. * @read_mask: read access mask. * @write_mask: write access mask. - * @lock: indicates whether or not to permanently lock this region. * @return: zero on success or negative value indicating error. */ int imr_add_range(phys_addr_t base, size_t size, - unsigned int rmask, unsigned int wmask, bool lock) + unsigned int rmask, unsigned int wmask) { phys_addr_t end; unsigned int i; @@ -411,7 +387,7 @@ int imr_add_range(phys_addr_t base, size_t size, imr.rmask = rmask; imr.wmask = wmask; - ret = imr_write(idev, reg, &imr, lock); + ret = imr_write(idev, reg, &imr); if (ret < 0) { /* * In the highly unlikely event iosf_mbi_write failed @@ -422,7 +398,7 @@ int imr_add_range(phys_addr_t base, size_t size, imr.addr_hi = 0; imr.rmask = IMR_READ_ACCESS_ALL; imr.wmask = IMR_WRITE_ACCESS_ALL; - imr_write(idev, reg, &imr, false); + imr_write(idev, reg, &imr); } failed: mutex_unlock(&idev->lock); @@ -518,7 +494,7 @@ static int __imr_remove_range(int reg, phys_addr_t base, size_t size) imr.rmask = IMR_READ_ACCESS_ALL; imr.wmask = IMR_WRITE_ACCESS_ALL; - ret = imr_write(idev, reg, &imr, false); + ret = imr_write(idev, reg, &imr); failed: mutex_unlock(&idev->lock); @@ -592,14 +568,14 @@ static void __init imr_fixup_memmap(struct imr_device *idev) end = (unsigned long)__end_rodata - 1; /* - * Setup a locked IMR around the physical extent of the kernel + * Setup an unlocked IMR around the physical extent of the kernel * from the beginning of the .text secton to the end of the * .rodata section as one physically contiguous block. * * We don't round up @size since it is already PAGE_SIZE aligned. * See vmlinux.lds.S for details. */ - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, true); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); if (ret < 0) { pr_err("unable to setup IMR for kernel: %zu KiB (%lx - %lx)\n", size / 1024, start, end); @@ -614,7 +590,6 @@ static const struct x86_cpu_id imr_ids[] __initconst = { { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */ {} }; -MODULE_DEVICE_TABLE(x86cpu, imr_ids); /** * imr_init - entry point for IMR driver. @@ -640,22 +615,4 @@ static int __init imr_init(void) imr_fixup_memmap(idev); return 0; } - -/** - * imr_exit - exit point for IMR code. - * - * Deregisters debugfs, leave IMR state as-is. - * - * return: - */ -static void __exit imr_exit(void) -{ - imr_debugfs_unregister(&imr_dev); -} - -module_init(imr_init); -module_exit(imr_exit); - -MODULE_AUTHOR("Bryan O'Donoghue <pure.logic@nexus-software.ie>"); -MODULE_DESCRIPTION("Intel Isolated Memory Region driver"); -MODULE_LICENSE("Dual BSD/GPL"); +device_initcall(imr_init); diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index 278e4da4222f..f5bad40936ac 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -1,5 +1,5 @@ /** - * imr_selftest.c + * imr_selftest.c -- Intel Isolated Memory Region self-test driver * * Copyright(c) 2013 Intel Corporation. * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie> @@ -15,7 +15,6 @@ #include <asm/imr.h> #include <linux/init.h> #include <linux/mm.h> -#include <linux/module.h> #include <linux/types.h> #define SELFTEST KBUILD_MODNAME ": " @@ -61,30 +60,30 @@ static void __init imr_self_test(void) int ret; /* Test zero zero. */ - ret = imr_add_range(0, 0, 0, 0, false); + ret = imr_add_range(0, 0, 0, 0); imr_self_test_result(ret < 0, "zero sized IMR\n"); /* Test exact overlap. */ - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); /* Test overlap with base inside of existing. */ base += size - IMR_ALIGN; - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); /* Test overlap with end inside of existing. */ base -= size + IMR_ALIGN * 2; - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); /* Test that a 1 KiB IMR @ zero with read/write all will bomb out. */ ret = imr_add_range(0, IMR_ALIGN, IMR_READ_ACCESS_ALL, - IMR_WRITE_ACCESS_ALL, false); + IMR_WRITE_ACCESS_ALL); imr_self_test_result(ret < 0, "1KiB IMR @ 0x00000000 - access-all\n"); /* Test that a 1 KiB IMR @ zero with CPU only will work. */ - ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU); imr_self_test_result(ret >= 0, "1KiB IMR @ 0x00000000 - cpu-access\n"); if (ret >= 0) { ret = imr_remove_range(0, IMR_ALIGN); @@ -93,8 +92,7 @@ static void __init imr_self_test(void) /* Test 2 KiB works. */ size = IMR_ALIGN * 2; - ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL, - IMR_WRITE_ACCESS_ALL, false); + ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL, IMR_WRITE_ACCESS_ALL); imr_self_test_result(ret >= 0, "2KiB IMR @ 0x00000000\n"); if (ret >= 0) { ret = imr_remove_range(0, size); @@ -106,7 +104,6 @@ static const struct x86_cpu_id imr_ids[] __initconst = { { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */ {} }; -MODULE_DEVICE_TABLE(x86cpu, imr_ids); /** * imr_self_test_init - entry point for IMR driver. @@ -125,13 +122,4 @@ static int __init imr_self_test_init(void) * * return: */ -static void __exit imr_self_test_exit(void) -{ -} - -module_init(imr_self_test_init); -module_exit(imr_self_test_exit); - -MODULE_AUTHOR("Bryan O'Donoghue <pure.logic@nexus-software.ie>"); -MODULE_DESCRIPTION("Intel Isolated Memory Region self-test driver"); -MODULE_LICENSE("Dual BSD/GPL"); +device_initcall(imr_self_test_init); diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index e2386cb4e0c3..4400a43b9e28 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -21,8 +21,10 @@ #include <asm/page_types.h> #include <asm/asm-offsets.h> #include <asm/processor-flags.h> +#include <asm/frame.h> ENTRY(swsusp_arch_suspend) + FRAME_BEGIN movq $saved_context, %rax movq %rsp, pt_regs_sp(%rax) movq %rbp, pt_regs_bp(%rax) @@ -50,7 +52,9 @@ ENTRY(swsusp_arch_suspend) movq %rax, restore_cr3(%rip) call swsusp_save + FRAME_END ret +ENDPROC(swsusp_arch_suspend) ENTRY(restore_image) /* switch to temporary page tables */ @@ -107,6 +111,7 @@ ENTRY(core_restore_code) */ ENTRY(restore_registers) + FRAME_BEGIN /* go back to the original page tables */ movq %rbx, %cr3 @@ -147,4 +152,6 @@ ENTRY(restore_registers) /* tell the hibernation core that we've just restored the memory */ movq %rax, in_suspend(%rip) + FRAME_END ret +ENDPROC(restore_registers) diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 2c835e356349..92e3e1d84c1d 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -1,3 +1,5 @@ +OBJECT_FILES_NON_STANDARD := y + purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string.o targets += $(purgatory-y) diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile index e02c2c6c56a5..682c895753d9 100644 --- a/arch/x86/realmode/Makefile +++ b/arch/x86/realmode/Makefile @@ -6,7 +6,9 @@ # for more details. # # -KASAN_SANITIZE := n +KASAN_SANITIZE := n +OBJECT_FILES_NON_STANDARD := y + subdir- := rm obj-y += init.o diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 3e75fcf6b836..053abe7b0ef7 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -6,7 +6,8 @@ # for more details. # # -KASAN_SANITIZE := n +KASAN_SANITIZE := n +OBJECT_FILES_NON_STANDARD := y always := realmode.bin realmode.relocs diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h index ee940185e89f..54d96f1e3594 100644 --- a/arch/x86/um/asm/checksum.h +++ b/arch/x86/um/asm/checksum.h @@ -87,8 +87,8 @@ static inline __sum16 csum_fold(__wsum sum) * 32bit unfolded. */ static inline __wsum -csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len, - unsigned short proto, __wsum sum) +csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, + __u8 proto, __wsum sum) { asm(" addl %1, %0\n" " adcl %2, %0\n" @@ -104,9 +104,8 @@ csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len, * returns a 16-bit checksum, already complemented */ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, - unsigned short len, - unsigned short proto, - __wsum sum) + __u32 len, __u8 proto, + __wsum sum) { return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum)); } diff --git a/arch/x86/um/asm/checksum_32.h b/arch/x86/um/asm/checksum_32.h index ab77b6f9a4bf..83a75f8a1233 100644 --- a/arch/x86/um/asm/checksum_32.h +++ b/arch/x86/um/asm/checksum_32.h @@ -13,7 +13,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len) #define _HAVE_ARCH_IPV6_CSUM static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, - __u32 len, unsigned short proto, + __u32 len, __u8 proto, __wsum sum) { __asm__( diff --git a/arch/x86/um/os-Linux/task_size.c b/arch/x86/um/os-Linux/task_size.c index 8502ad30e61b..5adb6a2fd117 100644 --- a/arch/x86/um/os-Linux/task_size.c +++ b/arch/x86/um/os-Linux/task_size.c @@ -109,7 +109,7 @@ unsigned long os_get_top_address(void) exit(1); } - printf("0x%x\n", bottom << UM_KERN_PAGE_SHIFT); + printf("0x%lx\n", bottom << UM_KERN_PAGE_SHIFT); printf("Locating the top of the address space ... "); fflush(stdout); @@ -134,7 +134,7 @@ out: exit(1); } top <<= UM_KERN_PAGE_SHIFT; - printf("0x%x\n", top); + printf("0x%lx\n", top); return top; } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d09e4c9d7cc5..2379a5a88504 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -32,6 +32,7 @@ #include <linux/gfp.h> #include <linux/memblock.h> #include <linux/edd.h> +#include <linux/frame.h> #ifdef CONFIG_KEXEC_CORE #include <linux/kexec.h> @@ -351,8 +352,8 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, *cx &= maskecx; *cx |= setecx; *dx &= maskedx; - } +STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */ static bool __init xen_check_mwait(void) { @@ -1654,7 +1655,7 @@ asmlinkage __visible void __init xen_start_kernel(void) cpu_detect(&new_cpu_data); set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); new_cpu_data.wp_works_ok = 1; - new_cpu_data.x86_capability[0] = cpuid_edx(1); + new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); #endif if (xen_start_info->mod_start) { diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 724a08740a04..9466354d3e49 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -11,7 +11,7 @@ #include "pmu.h" /* x86_pmu.handle_irq definition */ -#include "../kernel/cpu/perf_event.h" +#include "../events/perf_event.h" #define XENPMU_IRQ_PROCESSING 1 struct xenpmu { diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 3f4ebf0261f2..3c6d17fd423a 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -112,7 +112,7 @@ asmlinkage __visible void cpu_bringup_and_idle(int cpu) xen_pvh_secondary_vcpu_init(cpu); #endif cpu_bringup(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } static void xen_smp_intr_free(unsigned int cpu) diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 3e45aa000718..eff224df813f 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -14,6 +14,7 @@ #include <asm/asm-offsets.h> #include <asm/percpu.h> #include <asm/processor-flags.h> +#include <asm/frame.h> #include "xen-asm.h" @@ -23,6 +24,7 @@ * then enter the hypervisor to get them handled. */ ENTRY(xen_irq_enable_direct) + FRAME_BEGIN /* Unmask events */ movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask @@ -39,6 +41,7 @@ ENTRY(xen_irq_enable_direct) 2: call check_events 1: ENDPATCH(xen_irq_enable_direct) + FRAME_END ret ENDPROC(xen_irq_enable_direct) RELOC(xen_irq_enable_direct, 2b+1) @@ -82,6 +85,7 @@ ENDPATCH(xen_save_fl_direct) * enters the hypervisor to get them delivered if so. */ ENTRY(xen_restore_fl_direct) + FRAME_BEGIN #ifdef CONFIG_X86_64 testw $X86_EFLAGS_IF, %di #else @@ -100,6 +104,7 @@ ENTRY(xen_restore_fl_direct) 2: call check_events 1: ENDPATCH(xen_restore_fl_direct) + FRAME_END ret ENDPROC(xen_restore_fl_direct) RELOC(xen_restore_fl_direct, 2b+1) @@ -109,7 +114,8 @@ ENDPATCH(xen_restore_fl_direct) * Force an event check by making a hypercall, but preserve regs * before making the call. */ -check_events: +ENTRY(check_events) + FRAME_BEGIN #ifdef CONFIG_X86_32 push %eax push %ecx @@ -139,4 +145,6 @@ check_events: pop %rcx pop %rax #endif + FRAME_END ret +ENDPROC(check_events) diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index cc8acc410ddb..c3df43141e70 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -26,6 +26,7 @@ ENTRY(xen_adjust_exception_frame) mov 8+0(%rsp), %rcx mov 8+8(%rsp), %r11 ret $16 +ENDPROC(xen_adjust_exception_frame) hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 /* |