diff options
author | Ingo Molnar <mingo@kernel.org> | 2014-05-07 15:15:46 +0400 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2014-05-07 15:15:46 +0400 |
commit | 2fe5de9ce7d57498abc14b375cad2fcf8c3ee6cc (patch) | |
tree | 9478e8cf470c1d5bdb2d89b57a7e35919ab95e72 /arch/x86 | |
parent | 08f8aeb55d7727d644dbbbbfb798fe937d47751d (diff) | |
parent | 2b4cfe64dee0d84506b951d81bf55d9891744d25 (diff) | |
download | linux-2fe5de9ce7d57498abc14b375cad2fcf8c3ee6cc.tar.xz |
Merge branch 'sched/urgent' into sched/core, to avoid conflicts
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86')
75 files changed, 2096 insertions, 821 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f73071742975..25d2c6f7325e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -43,6 +43,7 @@ config X86 select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS if !SWIOTLB select HAVE_KRETPROBES + select GENERIC_EARLY_IOREMAP select HAVE_OPTPROBES select HAVE_KPROBES_ON_FTRACE select HAVE_FTRACE_MCOUNT_RECORD @@ -128,6 +129,7 @@ config X86 select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 select HAVE_CC_STACKPROTECTOR select GENERIC_CPU_AUTOPROBE + select HAVE_ARCH_AUDITSYSCALL config INSTRUCTION_DECODER def_bool y diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 3b9348a0c1a4..d1b7c377a234 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -108,7 +108,7 @@ else # this works around some issues with generating unwind tables in older gccs # newer gccs do it by default - KBUILD_CFLAGS += -maccumulate-outgoing-args + KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args) endif # Make sure compiler does not have buggy stack-protector support. @@ -250,8 +250,8 @@ archclean: PHONY += kvmconfig kvmconfig: $(if $(wildcard $(objtree)/.config),, $(error You need an existing .config for this target)) - $(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh -m -O $(objtree) $(objtree)/.config arch/x86/configs/kvm_guest.config - $(Q)yes "" | $(MAKE) oldconfig + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh -m -O $(objtree) $(objtree)/.config $(srctree)/arch/x86/configs/kvm_guest.config + $(Q)yes "" | $(MAKE) -f $(srctree)/Makefile oldconfig define archhelp echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 1e6146137f8e..4703a6c4b8e3 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -112,7 +112,7 @@ __file_size64(void *__fh, efi_char16_t *filename_16, efi_file_info_t *info; efi_status_t status; efi_guid_t info_guid = EFI_FILE_INFO_ID; - u32 info_sz; + u64 info_sz; status = efi_early->call((unsigned long)fh->open, fh, &h, filename_16, EFI_FILE_MODE_READ, (u64)0); @@ -167,31 +167,31 @@ efi_file_size(efi_system_table_t *sys_table, void *__fh, } static inline efi_status_t -efi_file_read(void *__fh, void *handle, unsigned long *size, void *addr) +efi_file_read(void *handle, unsigned long *size, void *addr) { unsigned long func; if (efi_early->is64) { - efi_file_handle_64_t *fh = __fh; + efi_file_handle_64_t *fh = handle; func = (unsigned long)fh->read; return efi_early->call(func, handle, size, addr); } else { - efi_file_handle_32_t *fh = __fh; + efi_file_handle_32_t *fh = handle; func = (unsigned long)fh->read; return efi_early->call(func, handle, size, addr); } } -static inline efi_status_t efi_file_close(void *__fh, void *handle) +static inline efi_status_t efi_file_close(void *handle) { if (efi_early->is64) { - efi_file_handle_64_t *fh = __fh; + efi_file_handle_64_t *fh = handle; return efi_early->call((unsigned long)fh->close, handle); } else { - efi_file_handle_32_t *fh = __fh; + efi_file_handle_32_t *fh = handle; return efi_early->call((unsigned long)fh->close, handle); } @@ -1016,6 +1016,9 @@ void setup_graphics(struct boot_params *boot_params) * Because the x86 boot code expects to be passed a boot_params we * need to create one ourselves (usually the bootloader would create * one for us). + * + * The caller is responsible for filling out ->code32_start in the + * returned boot_params. */ struct boot_params *make_boot_params(struct efi_config *c) { @@ -1081,8 +1084,6 @@ struct boot_params *make_boot_params(struct efi_config *c) hdr->vid_mode = 0xffff; hdr->boot_flag = 0xAA55; - hdr->code32_start = (__u64)(unsigned long)image->image_base; - hdr->type_of_loader = 0x21; /* Convert unicode cmdline to ascii */ diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index de9d4200d305..cbed1407a5cd 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -59,6 +59,7 @@ ENTRY(efi_pe_entry) call make_boot_params cmpl $0, %eax je fail + movl %esi, BP_code32_start(%eax) popl %ecx pushl %eax pushl %ecx @@ -90,12 +91,7 @@ fail: hlt jmp fail 2: - call 3f -3: - popl %eax - subl $3b, %eax - subl BP_pref_address(%esi), %eax - add BP_code32_start(%esi), %eax + movl BP_code32_start(%esi), %eax leal preferred_addr(%eax), %eax jmp *%eax diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 57e58a5fa210..0d558ee899ae 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -261,6 +261,8 @@ ENTRY(efi_pe_entry) cmpq $0,%rax je fail mov %rax, %rsi + leaq startup_32(%rip), %rax + movl %eax, BP_code32_start(%rsi) jmp 2f /* Skip the relocation */ handover_entry: @@ -284,12 +286,7 @@ fail: hlt jmp fail 2: - call 3f -3: - popq %rax - subq $3b, %rax - subq BP_pref_address(%rsi), %rax - add BP_code32_start(%esi), %eax + movl BP_code32_start(%esi), %eax leaq preferred_addr(%rax), %rax jmp *%rax diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 6ba54d640383..61d6e281898b 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -79,6 +79,9 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o +ifeq ($(avx2_supported),yes) +sha1-ssse3-y += sha1_avx2_x86_64_asm.o +endif crc32c-intel-y := crc32c-intel_glue.o crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 50ec333b70e6..8af519ed73d1 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -223,9 +223,6 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, src -= 1; dst -= 1; } while (nbytes >= bsize * 4); - - if (nbytes < bsize) - goto done; } /* Handle leftovers */ diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index e6a3700489b9..e57e20ab5e0b 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -203,9 +203,6 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, src -= 1; dst -= 1; } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; } /* Handle leftovers */ diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 586f41aac361..185fad49d86f 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -24,10 +24,6 @@ .align 16 .Lbswap_mask: .octa 0x000102030405060708090a0b0c0d0e0f -.Lpoly: - .octa 0xc2000000000000000000000000000001 -.Ltwo_one: - .octa 0x00000001000000000000000000000001 #define DATA %xmm0 #define SHASH %xmm1 @@ -134,28 +130,3 @@ ENTRY(clmul_ghash_update) .Lupdate_just_ret: ret ENDPROC(clmul_ghash_update) - -/* - * void clmul_ghash_setkey(be128 *shash, const u8 *key); - * - * Calculate hash_key << 1 mod poly - */ -ENTRY(clmul_ghash_setkey) - movaps .Lbswap_mask, BSWAP - movups (%rsi), %xmm0 - PSHUFB_XMM BSWAP %xmm0 - movaps %xmm0, %xmm1 - psllq $1, %xmm0 - psrlq $63, %xmm1 - movaps %xmm1, %xmm2 - pslldq $8, %xmm1 - psrldq $8, %xmm2 - por %xmm1, %xmm0 - # reduction - pshufd $0b00100100, %xmm2, %xmm1 - pcmpeqd .Ltwo_one, %xmm1 - pand .Lpoly, %xmm1 - pxor %xmm1, %xmm0 - movups %xmm0, (%rdi) - ret -ENDPROC(clmul_ghash_setkey) diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 6759dd1135be..d785cf2c529c 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -30,8 +30,6 @@ void clmul_ghash_mul(char *dst, const be128 *shash); void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, const be128 *shash); -void clmul_ghash_setkey(be128 *shash, const u8 *key); - struct ghash_async_ctx { struct cryptd_ahash *cryptd_tfm; }; @@ -58,13 +56,23 @@ static int ghash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct ghash_ctx *ctx = crypto_shash_ctx(tfm); + be128 *x = (be128 *)key; + u64 a, b; if (keylen != GHASH_BLOCK_SIZE) { crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } - clmul_ghash_setkey(&ctx->shash, key); + /* perform multiplication by 'x' in GF(2^128) */ + a = be64_to_cpu(x->a); + b = be64_to_cpu(x->b); + + ctx->shash.a = (__be64)((b << 1) | (a >> 63)); + ctx->shash.b = (__be64)((a << 1) | (b >> 63)); + + if (a >> 63) + ctx->shash.b ^= cpu_to_be64(0xc2); return 0; } diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S new file mode 100644 index 000000000000..1cd792db15ef --- /dev/null +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -0,0 +1,708 @@ +/* + * Implement fast SHA-1 with AVX2 instructions. (x86_64) + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Ilya Albrekht <ilya.albrekht@intel.com> + * Maxim Locktyukhin <maxim.locktyukhin@intel.com> + * Ronen Zohar <ronen.zohar@intel.com> + * Chandramouli Narayanan <mouli@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * SHA-1 implementation with Intel(R) AVX2 instruction set extensions. + * + *This implementation is based on the previous SSSE3 release: + *Visit http://software.intel.com/en-us/articles/ + *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/ + * + *Updates 20-byte SHA-1 record in 'hash' for even number of + *'num_blocks' consecutive 64-byte blocks + * + *extern "C" void sha1_transform_avx2( + * int *hash, const char* input, size_t num_blocks ); + */ + +#include <linux/linkage.h> + +#define CTX %rdi /* arg1 */ +#define BUF %rsi /* arg2 */ +#define CNT %rdx /* arg3 */ + +#define REG_A %ecx +#define REG_B %esi +#define REG_C %edi +#define REG_D %eax +#define REG_E %edx +#define REG_TB %ebx +#define REG_TA %r12d +#define REG_RA %rcx +#define REG_RB %rsi +#define REG_RC %rdi +#define REG_RD %rax +#define REG_RE %rdx +#define REG_RTA %r12 +#define REG_RTB %rbx +#define REG_T1 %ebp +#define xmm_mov vmovups +#define avx2_zeroupper vzeroupper +#define RND_F1 1 +#define RND_F2 2 +#define RND_F3 3 + +.macro REGALLOC + .set A, REG_A + .set B, REG_B + .set C, REG_C + .set D, REG_D + .set E, REG_E + .set TB, REG_TB + .set TA, REG_TA + + .set RA, REG_RA + .set RB, REG_RB + .set RC, REG_RC + .set RD, REG_RD + .set RE, REG_RE + + .set RTA, REG_RTA + .set RTB, REG_RTB + + .set T1, REG_T1 +.endm + +#define K_BASE %r8 +#define HASH_PTR %r9 +#define BUFFER_PTR %r10 +#define BUFFER_PTR2 %r13 +#define BUFFER_END %r11 + +#define PRECALC_BUF %r14 +#define WK_BUF %r15 + +#define W_TMP %xmm0 +#define WY_TMP %ymm0 +#define WY_TMP2 %ymm9 + +# AVX2 variables +#define WY0 %ymm3 +#define WY4 %ymm5 +#define WY08 %ymm7 +#define WY12 %ymm8 +#define WY16 %ymm12 +#define WY20 %ymm13 +#define WY24 %ymm14 +#define WY28 %ymm15 + +#define YMM_SHUFB_BSWAP %ymm10 + +/* + * Keep 2 iterations precalculated at a time: + * - 80 DWORDs per iteration * 2 + */ +#define W_SIZE (80*2*2 +16) + +#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF) +#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF) + + +.macro UPDATE_HASH hash, val + add \hash, \val + mov \val, \hash +.endm + +.macro PRECALC_RESET_WY + .set WY_00, WY0 + .set WY_04, WY4 + .set WY_08, WY08 + .set WY_12, WY12 + .set WY_16, WY16 + .set WY_20, WY20 + .set WY_24, WY24 + .set WY_28, WY28 + .set WY_32, WY_00 +.endm + +.macro PRECALC_ROTATE_WY + /* Rotate macros */ + .set WY_32, WY_28 + .set WY_28, WY_24 + .set WY_24, WY_20 + .set WY_20, WY_16 + .set WY_16, WY_12 + .set WY_12, WY_08 + .set WY_08, WY_04 + .set WY_04, WY_00 + .set WY_00, WY_32 + + /* Define register aliases */ + .set WY, WY_00 + .set WY_minus_04, WY_04 + .set WY_minus_08, WY_08 + .set WY_minus_12, WY_12 + .set WY_minus_16, WY_16 + .set WY_minus_20, WY_20 + .set WY_minus_24, WY_24 + .set WY_minus_28, WY_28 + .set WY_minus_32, WY +.endm + +.macro PRECALC_00_15 + .if (i == 0) # Initialize and rotate registers + PRECALC_RESET_WY + PRECALC_ROTATE_WY + .endif + + /* message scheduling pre-compute for rounds 0-15 */ + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP + .elseif ((i & 7) == 1) + vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\ + WY_TMP, WY_TMP + .elseif ((i & 7) == 2) + vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY + .elseif ((i & 7) == 4) + vpaddd K_XMM(K_BASE), WY, WY_TMP + .elseif ((i & 7) == 7) + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC_16_31 + /* + * message scheduling pre-compute for rounds 16-31 + * calculating last 32 w[i] values in 8 XMM registers + * pre-calculate K+w[i] values and store to mem + * for later load by ALU add instruction + * + * "brute force" vectorization for rounds 16-31 only + * due to w[i]->w[i-3] dependency + */ + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + /* w[i-14] */ + vpalignr $8, WY_minus_16, WY_minus_12, WY + vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */ + .elseif ((i & 7) == 1) + vpxor WY_minus_08, WY, WY + vpxor WY_minus_16, WY_TMP, WY_TMP + .elseif ((i & 7) == 2) + vpxor WY_TMP, WY, WY + vpslldq $12, WY, WY_TMP2 + .elseif ((i & 7) == 3) + vpslld $1, WY, WY_TMP + vpsrld $31, WY, WY + .elseif ((i & 7) == 4) + vpor WY, WY_TMP, WY_TMP + vpslld $2, WY_TMP2, WY + .elseif ((i & 7) == 5) + vpsrld $30, WY_TMP2, WY_TMP2 + vpxor WY, WY_TMP, WY_TMP + .elseif ((i & 7) == 7) + vpxor WY_TMP2, WY_TMP, WY + vpaddd K_XMM(K_BASE), WY, WY_TMP + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC_32_79 + /* + * in SHA-1 specification: + * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 + * instead we do equal: + * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 + * allows more efficient vectorization + * since w[i]=>w[i-3] dependency is broken + */ + + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP + .elseif ((i & 7) == 1) + /* W is W_minus_32 before xor */ + vpxor WY_minus_28, WY, WY + .elseif ((i & 7) == 2) + vpxor WY_minus_16, WY_TMP, WY_TMP + .elseif ((i & 7) == 3) + vpxor WY_TMP, WY, WY + .elseif ((i & 7) == 4) + vpslld $2, WY, WY_TMP + .elseif ((i & 7) == 5) + vpsrld $30, WY, WY + vpor WY, WY_TMP, WY + .elseif ((i & 7) == 7) + vpaddd K_XMM(K_BASE), WY, WY_TMP + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC r, s + .set i, \r + + .if (i < 40) + .set K_XMM, 32*0 + .elseif (i < 80) + .set K_XMM, 32*1 + .elseif (i < 120) + .set K_XMM, 32*2 + .else + .set K_XMM, 32*3 + .endif + + .if (i<32) + PRECALC_00_15 \s + .elseif (i<64) + PRECALC_16_31 \s + .elseif (i < 160) + PRECALC_32_79 \s + .endif +.endm + +.macro ROTATE_STATE + .set T_REG, E + .set E, D + .set D, C + .set C, B + .set B, TB + .set TB, A + .set A, T_REG + + .set T_REG, RE + .set RE, RD + .set RD, RC + .set RC, RB + .set RB, RTB + .set RTB, RA + .set RA, T_REG +.endm + +/* Macro relies on saved ROUND_Fx */ + +.macro RND_FUN f, r + .if (\f == RND_F1) + ROUND_F1 \r + .elseif (\f == RND_F2) + ROUND_F2 \r + .elseif (\f == RND_F3) + ROUND_F3 \r + .endif +.endm + +.macro RR r + .set round_id, (\r % 80) + + .if (round_id == 0) /* Precalculate F for first round */ + .set ROUND_FUNC, RND_F1 + mov B, TB + + rorx $(32-30), B, B /* b>>>2 */ + andn D, TB, T1 + and C, TB + xor T1, TB + .endif + + RND_FUN ROUND_FUNC, \r + ROTATE_STATE + + .if (round_id == 18) + .set ROUND_FUNC, RND_F2 + .elseif (round_id == 38) + .set ROUND_FUNC, RND_F3 + .elseif (round_id == 58) + .set ROUND_FUNC, RND_F2 + .endif + + .set round_id, ( (\r+1) % 80) + + RND_FUN ROUND_FUNC, (\r+1) + ROTATE_STATE +.endm + +.macro ROUND_F1 r + add WK(\r), E + + andn C, A, T1 /* ~b&d */ + lea (RE,RTB), E /* Add F from the previous round */ + + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + rorx $(32-30),A, TB /* b>>>2 for next round */ + + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + /* + * Calculate F for the next round + * (b & c) ^ andn[b, d] + */ + and B, A /* b&c */ + xor T1, A /* F1 = (b&c) ^ (~b&d) */ + + lea (RE,RTA), E /* E += A >>> 5 */ +.endm + +.macro ROUND_F2 r + add WK(\r), E + lea (RE,RTB), E /* Add F from the previous round */ + + /* Calculate F for the next round */ + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + .if ((round_id) < 79) + rorx $(32-30), A, TB /* b>>>2 for next round */ + .endif + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + .if ((round_id) < 79) + xor B, A + .endif + + add TA, E /* E += A >>> 5 */ + + .if ((round_id) < 79) + xor C, A + .endif +.endm + +.macro ROUND_F3 r + add WK(\r), E + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + lea (RE,RTB), E /* Add F from the previous round */ + + mov B, T1 + or A, T1 + + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + rorx $(32-30), A, TB /* b>>>2 for next round */ + + /* Calculate F for the next round + * (b and c) or (d and (b or c)) + */ + and C, T1 + and B, A + or T1, A + + add TA, E /* E += A >>> 5 */ + +.endm + +/* + * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining + */ +.macro SHA1_PIPELINED_MAIN_BODY + + REGALLOC + + mov (HASH_PTR), A + mov 4(HASH_PTR), B + mov 8(HASH_PTR), C + mov 12(HASH_PTR), D + mov 16(HASH_PTR), E + + mov %rsp, PRECALC_BUF + lea (2*4*80+32)(%rsp), WK_BUF + + # Precalc WK for first 2 blocks + PRECALC_OFFSET = 0 + .set i, 0 + .rept 160 + PRECALC i + .set i, i + 1 + .endr + PRECALC_OFFSET = 128 + xchg WK_BUF, PRECALC_BUF + + .align 32 +_loop: + /* + * code loops through more than one block + * we use K_BASE value as a signal of a last block, + * it is set below by: cmovae BUFFER_PTR, K_BASE + */ + cmp K_BASE, BUFFER_PTR + jne _begin + .align 32 + jmp _end + .align 32 +_begin: + + /* + * Do first block + * rounds: 0,2,4,6,8 + */ + .set j, 0 + .rept 5 + RR j + .set j, j+2 + .endr + + jmp _loop0 +_loop0: + + /* + * rounds: + * 10,12,14,16,18 + * 20,22,24,26,28 + * 30,32,34,36,38 + * 40,42,44,46,48 + * 50,52,54,56,58 + */ + .rept 25 + RR j + .set j, j+2 + .endr + + add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */ + cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */ + cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ + + /* + * rounds + * 60,62,64,66,68 + * 70,72,74,76,78 + */ + .rept 10 + RR j + .set j, j+2 + .endr + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), TB + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + cmp K_BASE, BUFFER_PTR /* is current block the last one? */ + je _loop + + mov TB, B + + /* Process second block */ + /* + * rounds + * 0+80, 2+80, 4+80, 6+80, 8+80 + * 10+80,12+80,14+80,16+80,18+80 + */ + + .set j, 0 + .rept 10 + RR j+80 + .set j, j+2 + .endr + + jmp _loop1 +_loop1: + /* + * rounds + * 20+80,22+80,24+80,26+80,28+80 + * 30+80,32+80,34+80,36+80,38+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + jmp _loop2 +_loop2: + + /* + * rounds + * 40+80,42+80,44+80,46+80,48+80 + * 50+80,52+80,54+80,56+80,58+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */ + + cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one */ + cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ + + jmp _loop3 +_loop3: + + /* + * rounds + * 60+80,62+80,64+80,66+80,68+80 + * 70+80,72+80,74+80,76+80,78+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), TB + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + /* Reset state for AVX2 reg permutation */ + mov A, TA + mov TB, A + mov C, TB + mov E, C + mov D, B + mov TA, D + + REGALLOC + + xchg WK_BUF, PRECALC_BUF + + jmp _loop + + .align 32 + _end: + +.endm +/* + * macro implements SHA-1 function's body for several 64-byte blocks + * param: function's name + */ +.macro SHA1_VECTOR_ASM name + ENTRY(\name) + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + RESERVE_STACK = (W_SIZE*4 + 8+24) + + /* Align stack */ + mov %rsp, %rbx + and $~(0x20-1), %rsp + push %rbx + sub $RESERVE_STACK, %rsp + + avx2_zeroupper + + lea K_XMM_AR(%rip), K_BASE + + mov CTX, HASH_PTR + mov BUF, BUFFER_PTR + lea 64(BUF), BUFFER_PTR2 + + shl $6, CNT /* mul by 64 */ + add BUF, CNT + add $64, CNT + mov CNT, BUFFER_END + + cmp BUFFER_END, BUFFER_PTR2 + cmovae K_BASE, BUFFER_PTR2 + + xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP + + SHA1_PIPELINED_MAIN_BODY + + avx2_zeroupper + + add $RESERVE_STACK, %rsp + pop %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + + ENDPROC(\name) +.endm + +.section .rodata + +#define K1 0x5a827999 +#define K2 0x6ed9eba1 +#define K3 0x8f1bbcdc +#define K4 0xca62c1d6 + +.align 128 +K_XMM_AR: + .long K1, K1, K1, K1 + .long K1, K1, K1, K1 + .long K2, K2, K2, K2 + .long K2, K2, K2, K2 + .long K3, K3, K3, K3 + .long K3, K3, K3, K3 + .long K4, K4, K4, K4 + .long K4, K4, K4, K4 + +BSWAP_SHUFB_CTL: + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f +.text + +SHA1_VECTOR_ASM sha1_transform_avx2 diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 4a11a9d72451..74d16ef707c7 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -10,6 +10,7 @@ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> * Copyright (c) Mathias Krause <minipli@googlemail.com> + * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -39,6 +40,12 @@ asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, asmlinkage void sha1_transform_avx(u32 *digest, const char *data, unsigned int rounds); #endif +#ifdef CONFIG_AS_AVX2 +#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ + +asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, + unsigned int rounds); +#endif static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int); @@ -165,6 +172,18 @@ static int sha1_ssse3_import(struct shash_desc *desc, const void *in) return 0; } +#ifdef CONFIG_AS_AVX2 +static void sha1_apply_transform_avx2(u32 *digest, const char *data, + unsigned int rounds) +{ + /* Select the optimal transform based on data block size */ + if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE) + sha1_transform_avx2(digest, data, rounds); + else + sha1_transform_avx(digest, data, rounds); +} +#endif + static struct shash_alg alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_ssse3_init, @@ -201,27 +220,49 @@ static bool __init avx_usable(void) return true; } + +#ifdef CONFIG_AS_AVX2 +static bool __init avx2_usable(void) +{ + if (avx_usable() && cpu_has_avx2 && boot_cpu_has(X86_FEATURE_BMI1) && + boot_cpu_has(X86_FEATURE_BMI2)) + return true; + + return false; +} +#endif #endif static int __init sha1_ssse3_mod_init(void) { + char *algo_name; + /* test for SSSE3 first */ - if (cpu_has_ssse3) + if (cpu_has_ssse3) { sha1_transform_asm = sha1_transform_ssse3; + algo_name = "SSSE3"; + } #ifdef CONFIG_AS_AVX /* allow AVX to override SSSE3, it's a little faster */ - if (avx_usable()) + if (avx_usable()) { sha1_transform_asm = sha1_transform_avx; + algo_name = "AVX"; +#ifdef CONFIG_AS_AVX2 + /* allow AVX2 to override AVX, it's a little faster */ + if (avx2_usable()) { + sha1_transform_asm = sha1_apply_transform_avx2; + algo_name = "AVX2"; + } +#endif + } #endif if (sha1_transform_asm) { - pr_info("Using %s optimized SHA-1 implementation\n", - sha1_transform_asm == sha1_transform_ssse3 ? "SSSE3" - : "AVX"); + pr_info("Using %s optimized SHA-1 implementation\n", algo_name); return crypto_register_shash(&alg); } - pr_info("Neither AVX nor SSSE3 is available/usable.\n"); + pr_info("Neither AVX nor AVX2 nor SSSE3 is available/usable.\n"); return -ENODEV; } diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 4acddc43ee0c..3ca9762e1649 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -5,5 +5,6 @@ genhdr-y += unistd_64.h genhdr-y += unistd_x32.h generic-y += clkdev.h +generic-y += early_ioremap.h generic-y += cputime.h generic-y += mcs_spinlock.h diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index e6a92455740e..69f1366f1aa3 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -1,7 +1,7 @@ /* * This file is part of the Linux kernel. * - * Copyright (c) 2011, Intel Corporation + * Copyright (c) 2011-2014, Intel Corporation * Authors: Fenghua Yu <fenghua.yu@intel.com>, * H. Peter Anvin <hpa@linux.intel.com> * @@ -31,10 +31,13 @@ #define RDRAND_RETRY_LOOPS 10 #define RDRAND_INT ".byte 0x0f,0xc7,0xf0" +#define RDSEED_INT ".byte 0x0f,0xc7,0xf8" #ifdef CONFIG_X86_64 # define RDRAND_LONG ".byte 0x48,0x0f,0xc7,0xf0" +# define RDSEED_LONG ".byte 0x48,0x0f,0xc7,0xf8" #else # define RDRAND_LONG RDRAND_INT +# define RDSEED_LONG RDSEED_INT #endif #ifdef CONFIG_ARCH_RANDOM @@ -53,6 +56,16 @@ static inline int rdrand_long(unsigned long *v) return ok; } +/* A single attempt at RDSEED */ +static inline bool rdseed_long(unsigned long *v) +{ + unsigned char ok; + asm volatile(RDSEED_LONG "\n\t" + "setc %0" + : "=qm" (ok), "=a" (*v)); + return ok; +} + #define GET_RANDOM(name, type, rdrand, nop) \ static inline int name(type *v) \ { \ @@ -70,18 +83,40 @@ static inline int name(type *v) \ return ok; \ } +#define GET_SEED(name, type, rdseed, nop) \ +static inline int name(type *v) \ +{ \ + unsigned char ok; \ + alternative_io("movb $0, %0\n\t" \ + nop, \ + rdseed "\n\t" \ + "setc %0", \ + X86_FEATURE_RDSEED, \ + ASM_OUTPUT2("=q" (ok), "=a" (*v))); \ + return ok; \ +} + #ifdef CONFIG_X86_64 GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP5); GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP4); +GET_SEED(arch_get_random_seed_long, unsigned long, RDSEED_LONG, ASM_NOP5); +GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4); + #else GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP3); GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3); +GET_SEED(arch_get_random_seed_long, unsigned long, RDSEED_LONG, ASM_NOP4); +GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4); + #endif /* CONFIG_X86_64 */ +#define arch_has_random() static_cpu_has(X86_FEATURE_RDRAND) +#define arch_has_random_seed() static_cpu_has(X86_FEATURE_RDSEED) + #else static inline int rdrand_long(unsigned long *v) @@ -89,6 +124,11 @@ static inline int rdrand_long(unsigned long *v) return 0; } +static inline bool rdseed_long(unsigned long *v) +{ + return 0; +} + #endif /* CONFIG_ARCH_RANDOM */ extern void x86_init_rdrand(struct cpuinfo_x86 *c); diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 2f03ff018d36..ba38ebbaced3 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_BUG_H #define _ASM_X86_BUG_H -#ifdef CONFIG_BUG #define HAVE_ARCH_BUG #ifdef CONFIG_DEBUG_BUGVERBOSE @@ -33,8 +32,6 @@ do { \ } while (0) #endif -#endif /* !CONFIG_BUG */ - #include <asm-generic/bug.h> #endif /* _ASM_X86_BUG_H */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 8dcd35c4c787..43f482a0db37 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -163,5 +163,11 @@ static inline void __set_fixmap(enum fixed_addresses idx, #include <asm-generic/fixmap.h> +#define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags) +#define __late_clear_fixmap(idx) __set_fixmap(idx, 0, __pgprot(0)) + +void __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_FIXMAP_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 91d9c69a629e..b8237d8a1e0c 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -39,6 +39,7 @@ #include <linux/string.h> #include <linux/compiler.h> #include <asm/page.h> +#include <asm/early_ioremap.h> #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -316,19 +317,6 @@ extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, unsigned long prot_val); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); -/* - * early_ioremap() and early_iounmap() are for temporary early boot-time - * mappings, before the real ioremap() is functional. - * A boot-time mapping is currently limited to at most 16 pages. - */ -extern void early_ioremap_init(void); -extern void early_ioremap_reset(void); -extern void __iomem *early_ioremap(resource_size_t phys_addr, - unsigned long size); -extern void __iomem *early_memremap(resource_size_t phys_addr, - unsigned long size); -extern void early_iounmap(void __iomem *addr, unsigned long size); -extern void fixup_early_ioremap(void); extern bool is_early_ioremap_ptep(pte_t *ptep); #ifdef CONFIG_XEN diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fdf83afbb7d9..7de069afb382 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -60,7 +60,7 @@ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ - | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) @@ -337,6 +337,11 @@ struct kvm_pmu { u64 reprogram_pmi; }; +enum { + KVM_DEBUGREG_BP_ENABLED = 1, + KVM_DEBUGREG_WONT_EXIT = 2, +}; + struct kvm_vcpu_arch { /* * rip and regs accesses must go through @@ -444,7 +449,6 @@ struct kvm_vcpu_arch { } st; u64 last_guest_tsc; - u64 last_kernel_ns; u64 last_host_tsc; u64 tsc_offset_adjustment; u64 this_tsc_nsec; @@ -464,7 +468,7 @@ struct kvm_vcpu_arch { struct mtrr_state_type mtrr_state; u32 pat; - int switch_db_regs; + unsigned switch_db_regs; unsigned long db[KVM_NR_DB_REGS]; unsigned long dr6; unsigned long dr7; @@ -599,6 +603,8 @@ struct kvm_arch { bool use_master_clock; u64 master_kernel_ns; cycle_t master_cycle_now; + struct delayed_work kvmclock_update_work; + struct delayed_work kvmclock_sync_work; struct kvm_xen_hvm_config xen_hvm_config; @@ -702,6 +708,7 @@ struct kvm_x86_ops { void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); u64 (*get_dr6)(struct kvm_vcpu *vcpu); void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); + void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); @@ -728,8 +735,8 @@ struct kvm_x86_ops { int (*nmi_allowed)(struct kvm_vcpu *vcpu); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); - int (*enable_nmi_window)(struct kvm_vcpu *vcpu); - int (*enable_irq_window)(struct kvm_vcpu *vcpu); + void (*enable_nmi_window)(struct kvm_vcpu *vcpu); + void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); int (*vm_has_apicv)(struct kvm *kvm); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); @@ -765,6 +772,9 @@ struct kvm_x86_ops { struct x86_instruction_info *info, enum x86_intercept_stage stage); void (*handle_external_intr)(struct kvm_vcpu *vcpu); + bool (*mpx_supported)(void); + + int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); }; struct kvm_arch_async_pf { diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 94220d14d5cc..851bcdc5db04 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -52,7 +52,7 @@ * Compared to the generic __my_cpu_offset version, the following * saves one instruction and avoids clobbering a temp register. */ -#define __this_cpu_ptr(ptr) \ +#define raw_cpu_ptr(ptr) \ ({ \ unsigned long tcp_ptr__; \ __verify_pcpu_ptr(ptr); \ @@ -362,25 +362,25 @@ do { \ */ #define this_cpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var))) -#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define __this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) - -#define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_add_1(pcp, val) percpu_add_op((pcp), val) -#define __this_cpu_add_2(pcp, val) percpu_add_op((pcp), val) -#define __this_cpu_add_4(pcp, val) percpu_add_op((pcp), val) -#define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) -#define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) -#define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) -#define __this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) -#define __this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) -#define __this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) -#define __this_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val) -#define __this_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val) -#define __this_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val) +#define raw_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define raw_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define raw_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) + +#define raw_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) +#define raw_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) +#define raw_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) +#define raw_cpu_add_1(pcp, val) percpu_add_op((pcp), val) +#define raw_cpu_add_2(pcp, val) percpu_add_op((pcp), val) +#define raw_cpu_add_4(pcp, val) percpu_add_op((pcp), val) +#define raw_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) +#define raw_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) +#define raw_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) +#define raw_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) +#define raw_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) +#define raw_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) +#define raw_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val) +#define raw_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val) +#define raw_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val) #define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) @@ -401,16 +401,16 @@ do { \ #define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) #define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) -#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) -#define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) -#define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) -#define __this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define __this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define __this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) +#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) +#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) +#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) #define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) @@ -427,7 +427,7 @@ do { \ __ret; \ }) -#define __this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double +#define raw_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double #define this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double #endif /* CONFIG_X86_CMPXCHG64 */ @@ -436,22 +436,22 @@ do { \ * 32 bit must fall back to generic operations. */ #ifdef CONFIG_X86_64 -#define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) -#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) -#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) -#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) -#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) -#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) - -#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) -#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) -#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) -#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) +#define raw_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define raw_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) +#define raw_cpu_add_8(pcp, val) percpu_add_op((pcp), val) +#define raw_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) +#define raw_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) +#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) +#define raw_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) +#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) + +#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) +#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) +#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) +#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) +#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) #define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) /* @@ -474,7 +474,7 @@ do { \ __ret; \ }) -#define __this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double +#define raw_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double #define this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double #endif @@ -495,9 +495,9 @@ static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr, unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG; #ifdef CONFIG_X86_64 - return ((1UL << (nr % BITS_PER_LONG)) & __this_cpu_read_8(*a)) != 0; + return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0; #else - return ((1UL << (nr % BITS_PER_LONG)) & __this_cpu_read_4(*a)) != 0; + return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_4(*a)) != 0; #endif } diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index c8b051933b1b..7024c12f7bfe 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -19,12 +19,12 @@ DECLARE_PER_CPU(int, __preempt_count); */ static __always_inline int preempt_count(void) { - return __this_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; + return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; } static __always_inline void preempt_count_set(int pc) { - __this_cpu_write_4(__preempt_count, pc); + raw_cpu_write_4(__preempt_count, pc); } /* @@ -53,17 +53,17 @@ static __always_inline void preempt_count_set(int pc) static __always_inline void set_preempt_need_resched(void) { - __this_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); + raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); } static __always_inline void clear_preempt_need_resched(void) { - __this_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); + raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); } static __always_inline bool test_preempt_need_resched(void) { - return !(__this_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); + return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); } /* @@ -72,12 +72,12 @@ static __always_inline bool test_preempt_need_resched(void) static __always_inline void __preempt_count_add(int val) { - __this_cpu_add_4(__preempt_count, val); + raw_cpu_add_4(__preempt_count, val); } static __always_inline void __preempt_count_sub(int val) { - __this_cpu_add_4(__preempt_count, -val); + raw_cpu_add_4(__preempt_count, -val); } /* @@ -95,7 +95,7 @@ static __always_inline bool __preempt_count_dec_and_test(void) */ static __always_inline bool should_resched(void) { - return unlikely(!__this_cpu_read_4(__preempt_count)); + return unlikely(!raw_cpu_read_4(__preempt_count)); } #ifdef CONFIG_PREEMPT diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index aea284b41312..d6a756ae04c8 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -13,7 +13,7 @@ #ifndef _ASM_X86_SYSCALL_H #define _ASM_X86_SYSCALL_H -#include <linux/audit.h> +#include <uapi/linux/audit.h> #include <linux/sched.h> #include <linux/err.h> #include <asm/asm-offsets.h> /* For NR_syscalls */ @@ -91,8 +91,7 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(®s->bx + i, args, n * sizeof(args[0])); } -static inline int syscall_get_arch(struct task_struct *task, - struct pt_regs *regs) +static inline int syscall_get_arch(void) { return AUDIT_ARCH_I386; } @@ -221,8 +220,7 @@ static inline void syscall_set_arguments(struct task_struct *task, } } -static inline int syscall_get_arch(struct task_struct *task, - struct pt_regs *regs) +static inline int syscall_get_arch(void) { #ifdef CONFIG_IA32_EMULATION /* @@ -234,7 +232,7 @@ static inline int syscall_get_arch(struct task_struct *task, * * x32 tasks should be considered AUDIT_ARCH_X86_64. */ - if (task_thread_info(task)->status & TS_COMPAT) + if (task_thread_info(current)->status & TS_COMPAT) return AUDIT_ARCH_I386; #endif /* Both x32 and x86_64 are considered "64-bit". */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2067264fb7f5..7004d21e6219 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -85,6 +85,7 @@ #define VM_EXIT_SAVE_IA32_EFER 0x00100000 #define VM_EXIT_LOAD_IA32_EFER 0x00200000 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 +#define VM_EXIT_CLEAR_BNDCFGS 0x00800000 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff @@ -95,6 +96,7 @@ #define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000 #define VM_ENTRY_LOAD_IA32_PAT 0x00004000 #define VM_ENTRY_LOAD_IA32_EFER 0x00008000 +#define VM_ENTRY_LOAD_BNDCFGS 0x00010000 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff @@ -174,6 +176,8 @@ enum vmcs_field { GUEST_PDPTR2_HIGH = 0x0000280f, GUEST_PDPTR3 = 0x00002810, GUEST_PDPTR3_HIGH = 0x00002811, + GUEST_BNDCFGS = 0x00002812, + GUEST_BNDCFGS_HIGH = 0x00002813, HOST_IA32_PAT = 0x00002c00, HOST_IA32_PAT_HIGH = 0x00002c01, HOST_IA32_EFER = 0x00002c02, diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 3e276eb23d1b..c949923a5668 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -49,10 +49,17 @@ extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern unsigned long set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e); +extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count); extern int m2p_add_override(unsigned long mfn, struct page *page, struct gnttab_map_grant_ref *kmap_op); +extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count); extern int m2p_remove_override(struct page *page, - struct gnttab_map_grant_ref *kmap_op); + struct gnttab_map_grant_ref *kmap_op, + unsigned long mfn); extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); @@ -121,7 +128,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) pfn = m2p_find_override_pfn(mfn, ~0); } - /* + /* * pfn is ~0 if there are no entries in the m2p for mfn or if the * entry doesn't map back to the mfn and m2p_override doesn't have a * valid entry for it. diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 6c1d7411eb00..d949ef28c48b 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -16,6 +16,8 @@ #define XSTATE_Hi16_ZMM 0x80 #define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) +/* Bit 63 of XCR0 is reserved for future expansion */ +#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) #define FXSAVE_SIZE 512 diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 4924f4be2b99..c827ace3121b 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -295,6 +295,7 @@ #define MSR_SMI_COUNT 0x00000034 #define MSR_IA32_FEATURE_CONTROL 0x0000003a #define MSR_IA32_TSC_ADJUST 0x0000003b +#define MSR_IA32_BNDCFGS 0x00000d90 #define FEATURE_CONTROL_LOCKED (1<<0) #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index e69182fd01cf..4b28159e0421 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -87,7 +87,9 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK; retval = 0; - if (num_cstate_subtype < (cx->address & MWAIT_SUBSTATE_MASK)) { + /* If the HW does not support any sub-states in this C-state */ + if (num_cstate_subtype == 0) { + pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", cx->address, edx_part); retval = -1; goto out; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 481ae38f6a44..ad28db7e6bde 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1996,7 +1996,8 @@ static inline void __smp_error_interrupt(struct pt_regs *regs) }; /* First tickle the hardware, only then report what went on. -- REW */ - apic_write(APIC_ESR, 0); + if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); v = apic_read(APIC_ESR); ack_APIC_irq(); atomic_inc(&irq_err_count); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 0641113e2965..a952e9c85b6f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -1225,21 +1225,24 @@ static struct notifier_block cacheinfo_cpu_notifier = { static int __init cache_sysfs_init(void) { - int i; + int i, err = 0; if (num_cache_leaves == 0) return 0; + cpu_notifier_register_begin(); for_each_online_cpu(i) { - int err; struct device *dev = get_cpu_device(i); err = cache_add_dev(dev); if (err) - return err; + goto out; } - register_hotcpu_notifier(&cacheinfo_cpu_notifier); - return 0; + __register_hotcpu_notifier(&cacheinfo_cpu_notifier); + +out: + cpu_notifier_register_done(); + return err; } device_initcall(cache_sysfs_init); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 4d5419b249da..68317c80de7f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -89,6 +89,9 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; +/* CMCI storm detection filter */ +static DEFINE_PER_CPU(unsigned long, mce_polled_error); + /* * MCA banks polled by the period polling timer for corrected events. * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). @@ -614,6 +617,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(m.status & MCI_STATUS_VAL)) continue; + this_cpu_write(mce_polled_error, 1); /* * Uncorrected or signalled events are handled by the exception * handler when it is enabled, so don't process those here. @@ -1278,10 +1282,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval) static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; +static int cmc_error_seen(void) +{ + unsigned long *v = &__get_cpu_var(mce_polled_error); + + return test_and_clear_bit(0, v); +} + static void mce_timer_fn(unsigned long data) { struct timer_list *t = &__get_cpu_var(mce_timer); unsigned long iv; + int notify; WARN_ON(smp_processor_id() != data); @@ -1296,7 +1308,9 @@ static void mce_timer_fn(unsigned long data) * polling interval, otherwise increase the polling interval. */ iv = __this_cpu_read(mce_next_interval); - if (mce_notify_irq()) { + notify = mce_notify_irq(); + notify |= cmc_error_seen(); + if (notify) { iv = max(iv / 2, (unsigned long) HZ/100); } else { iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); @@ -2434,14 +2448,18 @@ static __init int mcheck_init_device(void) if (err) return err; + cpu_notifier_register_begin(); for_each_online_cpu(i) { err = mce_device_create(i); - if (err) + if (err) { + cpu_notifier_register_done(); return err; + } } register_syscore_ops(&mce_syscore_ops); - register_hotcpu_notifier(&mce_cpu_notifier); + __register_hotcpu_notifier(&mce_cpu_notifier); + cpu_notifier_register_done(); /* register character device /dev/mcelog */ misc_register(&mce_chrdev_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index fb6156fee6f7..9a316b21df8b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -9,6 +9,7 @@ #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/sched.h> +#include <linux/cpumask.h> #include <asm/apic.h> #include <asm/processor.h> #include <asm/msr.h> @@ -41,7 +42,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); * cmci_discover_lock protects against parallel discovery attempts * which could race against each other. */ -static DEFINE_RAW_SPINLOCK(cmci_discover_lock); +static DEFINE_SPINLOCK(cmci_discover_lock); #define CMCI_THRESHOLD 1 #define CMCI_POLL_INTERVAL (30 * HZ) @@ -137,6 +138,22 @@ unsigned long mce_intel_adjust_timer(unsigned long interval) } } +static void cmci_storm_disable_banks(void) +{ + unsigned long flags, *owned; + int bank; + u64 val; + + spin_lock_irqsave(&cmci_discover_lock, flags); + owned = __get_cpu_var(mce_banks_owned); + for_each_set_bit(bank, owned, MAX_NR_BANKS) { + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + } + spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + static bool cmci_storm_detect(void) { unsigned int cnt = __this_cpu_read(cmci_storm_cnt); @@ -158,7 +175,7 @@ static bool cmci_storm_detect(void) if (cnt <= CMCI_STORM_THRESHOLD) return false; - cmci_clear(); + cmci_storm_disable_banks(); __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); r = atomic_add_return(1, &cmci_storm_on_cpus); mce_timer_kick(CMCI_POLL_INTERVAL); @@ -194,7 +211,7 @@ static void cmci_discover(int banks) int i; int bios_wrong_thresh = 0; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; int bios_zero_thresh = 0; @@ -249,7 +266,7 @@ static void cmci_discover(int banks) WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); } } - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { pr_info_once( "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); @@ -299,10 +316,10 @@ void cmci_clear(void) if (!cmci_supported(&banks)) return; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) __cmci_disable_bank(i); - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void cmci_rediscover_work_func(void *arg) @@ -343,9 +360,9 @@ void cmci_disable_bank(int bank) if (!cmci_supported(&banks)) return; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); __cmci_disable_bank(bank); - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void intel_init_cmci(void) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 3eec7de76efb..d921b7ee6595 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -271,9 +271,6 @@ static void thermal_throttle_remove_dev(struct device *dev) sysfs_remove_group(&dev->kobj, &thermal_attr_group); } -/* Mutex protecting device creation against CPU hotplug: */ -static DEFINE_MUTEX(therm_cpu_lock); - /* Get notified when a cpu comes on/off. Be hotplug friendly. */ static int thermal_throttle_cpu_callback(struct notifier_block *nfb, @@ -289,18 +286,14 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - mutex_lock(&therm_cpu_lock); err = thermal_throttle_add_dev(dev, cpu); - mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - mutex_lock(&therm_cpu_lock); thermal_throttle_remove_dev(dev); - mutex_unlock(&therm_cpu_lock); break; } return notifier_from_errno(err); @@ -319,19 +312,16 @@ static __init int thermal_throttle_init_device(void) if (!atomic_read(&therm_throt_en)) return 0; - register_hotcpu_notifier(&thermal_throttle_cpu_notifier); + cpu_notifier_register_begin(); -#ifdef CONFIG_HOTPLUG_CPU - mutex_lock(&therm_cpu_lock); -#endif /* connect live CPUs to sysfs */ for_each_online_cpu(cpu) { err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu); WARN_ON(err); } -#ifdef CONFIG_HOTPLUG_CPU - mutex_unlock(&therm_cpu_lock); -#endif + + __register_hotcpu_notifier(&thermal_throttle_cpu_notifier); + cpu_notifier_register_done(); return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 4b8e4d3cd6ea..4c36bbe3173a 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -926,13 +926,13 @@ static __init int amd_ibs_init(void) goto out; perf_ibs_pm_init(); - get_online_cpus(); + cpu_notifier_register_begin(); ibs_caps = caps; /* make ibs_caps visible to other cpus: */ smp_mb(); - perf_cpu_notifier(perf_ibs_cpu_notifier); smp_call_function(setup_APIC_ibs, NULL, 1); - put_online_cpus(); + __perf_cpu_notifier(perf_ibs_cpu_notifier); + cpu_notifier_register_done(); ret = perf_event_ibs_init(); out: diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index 754291adec33..3bbdf4cd38b9 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -531,15 +531,16 @@ static int __init amd_uncore_init(void) if (ret) return -ENODEV; - get_online_cpus(); + cpu_notifier_register_begin(); + /* init cpus already online before registering for hotplug notifier */ for_each_online_cpu(cpu) { amd_uncore_cpu_up_prepare(cpu); smp_call_function_single(cpu, init_cpu_already_online, NULL, 1); } - register_cpu_notifier(&amd_uncore_cpu_notifier_block); - put_online_cpus(); + __register_cpu_notifier(&amd_uncore_cpu_notifier_block); + cpu_notifier_register_done(); return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 5ad35ad94d0f..7c87424d4140 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -59,7 +59,7 @@ #define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ #define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ #define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ -#define RAPL_IDX_PP1_NRG_STAT 3 /* DRAM */ +#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ #define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ /* Clients have PP0, PKG */ @@ -72,6 +72,12 @@ 1<<RAPL_IDX_PKG_NRG_STAT|\ 1<<RAPL_IDX_RAM_NRG_STAT) +/* Servers have PP0, PKG, RAM, PP1 */ +#define RAPL_IDX_HSW (1<<RAPL_IDX_PP0_NRG_STAT|\ + 1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_RAM_NRG_STAT|\ + 1<<RAPL_IDX_PP1_NRG_STAT) + /* * event code: LSB 8 bits, passed in attr->config * any other bit is reserved @@ -425,6 +431,24 @@ static struct attribute *rapl_events_cln_attr[] = { NULL, }; +static struct attribute *rapl_events_hsw_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_gpu), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_gpu_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_gpu_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + static struct attribute_group rapl_pmu_events_group = { .name = "events", .attrs = NULL, /* patched at runtime */ @@ -511,6 +535,7 @@ static int rapl_cpu_prepare(int cpu) struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); int phys_id = topology_physical_package_id(cpu); u64 ms; + u64 msr_rapl_power_unit_bits; if (pmu) return 0; @@ -518,6 +543,9 @@ static int rapl_cpu_prepare(int cpu) if (phys_id < 0) return -1; + if (!rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + return -1; + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); if (!pmu) return -1; @@ -531,8 +559,7 @@ static int rapl_cpu_prepare(int cpu) * * we cache in local PMU instance */ - rdmsrl(MSR_RAPL_POWER_UNIT, pmu->hw_unit); - pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL; + pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; pmu->pmu = &rapl_pmu_class; /* @@ -631,11 +658,14 @@ static int __init rapl_pmu_init(void) switch (boot_cpu_data.x86_model) { case 42: /* Sandy Bridge */ case 58: /* Ivy Bridge */ - case 60: /* Haswell */ - case 69: /* Haswell-Celeron */ rapl_cntr_mask = RAPL_IDX_CLN; rapl_pmu_events_group.attrs = rapl_events_cln_attr; break; + case 60: /* Haswell */ + case 69: /* Haswell-Celeron */ + rapl_cntr_mask = RAPL_IDX_HSW; + rapl_pmu_events_group.attrs = rapl_events_hsw_attr; + break; case 45: /* Sandy Bridge-EP */ case 62: /* IvyTown */ rapl_cntr_mask = RAPL_IDX_SRV; @@ -646,19 +676,22 @@ static int __init rapl_pmu_init(void) /* unsupported */ return 0; } - get_online_cpus(); + + cpu_notifier_register_begin(); for_each_online_cpu(cpu) { - rapl_cpu_prepare(cpu); + ret = rapl_cpu_prepare(cpu); + if (ret) + goto out; rapl_cpu_init(cpu); } - perf_cpu_notifier(rapl_cpu_notifier); + __perf_cpu_notifier(rapl_cpu_notifier); ret = perf_pmu_register(&rapl_pmu_class, "power", -1); if (WARN_ON(ret)) { pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); - put_online_cpus(); + cpu_notifier_register_done(); return -1; } @@ -672,7 +705,8 @@ static int __init rapl_pmu_init(void) hweight32(rapl_cntr_mask), ktime_to_ms(pmu->timer_interval)); - put_online_cpus(); +out: + cpu_notifier_register_done(); return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index bd2253d40cff..65bbbea38b9c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -4244,7 +4244,7 @@ static void __init uncore_cpumask_init(void) if (!cpumask_empty(&uncore_cpu_mask)) return; - get_online_cpus(); + cpu_notifier_register_begin(); for_each_online_cpu(cpu) { int i, phys_id = topology_physical_package_id(cpu); @@ -4263,9 +4263,9 @@ static void __init uncore_cpumask_init(void) } on_each_cpu(uncore_cpu_setup, NULL, 1); - register_cpu_notifier(&uncore_cpu_nb); + __register_cpu_notifier(&uncore_cpu_nb); - put_online_cpus(); + cpu_notifier_register_done(); } diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 7d9481c743f8..3225ae6c5180 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -198,14 +198,15 @@ static int __init cpuid_init(void) goto out_chrdev; } cpuid_class->devnode = cpuid_devnode; - get_online_cpus(); + + cpu_notifier_register_begin(); for_each_online_cpu(i) { err = cpuid_device_create(i); if (err != 0) goto out_class; } - register_hotcpu_notifier(&cpuid_class_cpu_notifier); - put_online_cpus(); + __register_hotcpu_notifier(&cpuid_class_cpu_notifier); + cpu_notifier_register_done(); err = 0; goto out; @@ -215,7 +216,7 @@ out_class: for_each_online_cpu(i) { cpuid_device_destroy(i); } - put_online_cpus(); + cpu_notifier_register_done(); class_destroy(cpuid_class); out_chrdev: __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); @@ -227,13 +228,13 @@ static void __exit cpuid_exit(void) { int cpu = 0; - get_online_cpus(); + cpu_notifier_register_begin(); for_each_online_cpu(cpu) cpuid_device_destroy(cpu); class_destroy(cpuid_class); __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); - unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); - put_online_cpus(); + __unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); + cpu_notifier_register_done(); } module_init(cpuid_init); diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 6d7d5a1260a6..6e2537c32190 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -225,7 +225,7 @@ static void __init intel_remapping_check(int num, int slot, int func) * * And yes, so far on current devices the base addr is always under 4G. */ -static u32 __init intel_stolen_base(int num, int slot, int func) +static u32 __init intel_stolen_base(int num, int slot, int func, size_t stolen_size) { u32 base; @@ -240,10 +240,118 @@ static u32 __init intel_stolen_base(int num, int slot, int func) return base; } -#define KB(x) ((x) * 1024) +#define KB(x) ((x) * 1024UL) #define MB(x) (KB (KB (x))) #define GB(x) (MB (KB (x))) +static size_t __init i830_tseg_size(void) +{ + u8 tmp = read_pci_config_byte(0, 0, 0, I830_ESMRAMC); + + if (!(tmp & TSEG_ENABLE)) + return 0; + + if (tmp & I830_TSEG_SIZE_1M) + return MB(1); + else + return KB(512); +} + +static size_t __init i845_tseg_size(void) +{ + u8 tmp = read_pci_config_byte(0, 0, 0, I845_ESMRAMC); + + if (!(tmp & TSEG_ENABLE)) + return 0; + + switch (tmp & I845_TSEG_SIZE_MASK) { + case I845_TSEG_SIZE_512K: + return KB(512); + case I845_TSEG_SIZE_1M: + return MB(1); + default: + WARN_ON(1); + return 0; + } +} + +static size_t __init i85x_tseg_size(void) +{ + u8 tmp = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC); + + if (!(tmp & TSEG_ENABLE)) + return 0; + + return MB(1); +} + +static size_t __init i830_mem_size(void) +{ + return read_pci_config_byte(0, 0, 0, I830_DRB3) * MB(32); +} + +static size_t __init i85x_mem_size(void) +{ + return read_pci_config_byte(0, 0, 1, I85X_DRB3) * MB(32); +} + +/* + * On 830/845/85x the stolen memory base isn't available in any + * register. We need to calculate it as TOM-TSEG_SIZE-stolen_size. + */ +static u32 __init i830_stolen_base(int num, int slot, int func, size_t stolen_size) +{ + return i830_mem_size() - i830_tseg_size() - stolen_size; +} + +static u32 __init i845_stolen_base(int num, int slot, int func, size_t stolen_size) +{ + return i830_mem_size() - i845_tseg_size() - stolen_size; +} + +static u32 __init i85x_stolen_base(int num, int slot, int func, size_t stolen_size) +{ + return i85x_mem_size() - i85x_tseg_size() - stolen_size; +} + +static u32 __init i865_stolen_base(int num, int slot, int func, size_t stolen_size) +{ + /* + * FIXME is the graphics stolen memory region + * always at TOUD? Ie. is it always the last + * one to be allocated by the BIOS? + */ + return read_pci_config_16(0, 0, 0, I865_TOUD) << 16; +} + +static size_t __init i830_stolen_size(int num, int slot, int func) +{ + size_t stolen_size; + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); + + switch (gmch_ctrl & I830_GMCH_GMS_MASK) { + case I830_GMCH_GMS_STOLEN_512: + stolen_size = KB(512); + break; + case I830_GMCH_GMS_STOLEN_1024: + stolen_size = MB(1); + break; + case I830_GMCH_GMS_STOLEN_8192: + stolen_size = MB(8); + break; + case I830_GMCH_GMS_LOCAL: + /* local memory isn't part of the normal address space */ + stolen_size = 0; + break; + default: + return 0; + } + + return stolen_size; +} + static size_t __init gen3_stolen_size(int num, int slot, int func) { size_t stolen_size; @@ -310,7 +418,7 @@ static size_t __init gen6_stolen_size(int num, int slot, int func) return gmch_ctrl << 25; /* 32 MB units */ } -static inline size_t gen8_stolen_size(int num, int slot, int func) +static size_t gen8_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; @@ -320,31 +428,74 @@ static inline size_t gen8_stolen_size(int num, int slot, int func) return gmch_ctrl << 25; /* 32 MB units */ } -typedef size_t (*stolen_size_fn)(int num, int slot, int func); + +struct intel_stolen_funcs { + size_t (*size)(int num, int slot, int func); + u32 (*base)(int num, int slot, int func, size_t size); +}; + +static const struct intel_stolen_funcs i830_stolen_funcs = { + .base = i830_stolen_base, + .size = i830_stolen_size, +}; + +static const struct intel_stolen_funcs i845_stolen_funcs = { + .base = i845_stolen_base, + .size = i830_stolen_size, +}; + +static const struct intel_stolen_funcs i85x_stolen_funcs = { + .base = i85x_stolen_base, + .size = gen3_stolen_size, +}; + +static const struct intel_stolen_funcs i865_stolen_funcs = { + .base = i865_stolen_base, + .size = gen3_stolen_size, +}; + +static const struct intel_stolen_funcs gen3_stolen_funcs = { + .base = intel_stolen_base, + .size = gen3_stolen_size, +}; + +static const struct intel_stolen_funcs gen6_stolen_funcs = { + .base = intel_stolen_base, + .size = gen6_stolen_size, +}; + +static const struct intel_stolen_funcs gen8_stolen_funcs = { + .base = intel_stolen_base, + .size = gen8_stolen_size, +}; static struct pci_device_id intel_stolen_ids[] __initdata = { - INTEL_I915G_IDS(gen3_stolen_size), - INTEL_I915GM_IDS(gen3_stolen_size), - INTEL_I945G_IDS(gen3_stolen_size), - INTEL_I945GM_IDS(gen3_stolen_size), - INTEL_VLV_M_IDS(gen6_stolen_size), - INTEL_VLV_D_IDS(gen6_stolen_size), - INTEL_PINEVIEW_IDS(gen3_stolen_size), - INTEL_I965G_IDS(gen3_stolen_size), - INTEL_G33_IDS(gen3_stolen_size), - INTEL_I965GM_IDS(gen3_stolen_size), - INTEL_GM45_IDS(gen3_stolen_size), - INTEL_G45_IDS(gen3_stolen_size), - INTEL_IRONLAKE_D_IDS(gen3_stolen_size), - INTEL_IRONLAKE_M_IDS(gen3_stolen_size), - INTEL_SNB_D_IDS(gen6_stolen_size), - INTEL_SNB_M_IDS(gen6_stolen_size), - INTEL_IVB_M_IDS(gen6_stolen_size), - INTEL_IVB_D_IDS(gen6_stolen_size), - INTEL_HSW_D_IDS(gen6_stolen_size), - INTEL_HSW_M_IDS(gen6_stolen_size), - INTEL_BDW_M_IDS(gen8_stolen_size), - INTEL_BDW_D_IDS(gen8_stolen_size) + INTEL_I830_IDS(&i830_stolen_funcs), + INTEL_I845G_IDS(&i845_stolen_funcs), + INTEL_I85X_IDS(&i85x_stolen_funcs), + INTEL_I865G_IDS(&i865_stolen_funcs), + INTEL_I915G_IDS(&gen3_stolen_funcs), + INTEL_I915GM_IDS(&gen3_stolen_funcs), + INTEL_I945G_IDS(&gen3_stolen_funcs), + INTEL_I945GM_IDS(&gen3_stolen_funcs), + INTEL_VLV_M_IDS(&gen6_stolen_funcs), + INTEL_VLV_D_IDS(&gen6_stolen_funcs), + INTEL_PINEVIEW_IDS(&gen3_stolen_funcs), + INTEL_I965G_IDS(&gen3_stolen_funcs), + INTEL_G33_IDS(&gen3_stolen_funcs), + INTEL_I965GM_IDS(&gen3_stolen_funcs), + INTEL_GM45_IDS(&gen3_stolen_funcs), + INTEL_G45_IDS(&gen3_stolen_funcs), + INTEL_IRONLAKE_D_IDS(&gen3_stolen_funcs), + INTEL_IRONLAKE_M_IDS(&gen3_stolen_funcs), + INTEL_SNB_D_IDS(&gen6_stolen_funcs), + INTEL_SNB_M_IDS(&gen6_stolen_funcs), + INTEL_IVB_M_IDS(&gen6_stolen_funcs), + INTEL_IVB_D_IDS(&gen6_stolen_funcs), + INTEL_HSW_D_IDS(&gen6_stolen_funcs), + INTEL_HSW_M_IDS(&gen6_stolen_funcs), + INTEL_BDW_M_IDS(&gen8_stolen_funcs), + INTEL_BDW_D_IDS(&gen8_stolen_funcs) }; static void __init intel_graphics_stolen(int num, int slot, int func) @@ -361,11 +512,13 @@ static void __init intel_graphics_stolen(int num, int slot, int func) for (i = 0; i < ARRAY_SIZE(intel_stolen_ids); i++) { if (intel_stolen_ids[i].device == device) { - stolen_size_fn stolen_size = - (stolen_size_fn)intel_stolen_ids[i].driver_data; - size = stolen_size(num, slot, func); - start = intel_stolen_base(num, slot, func); + const struct intel_stolen_funcs *stolen_funcs = + (const struct intel_stolen_funcs *)intel_stolen_ids[i].driver_data; + size = stolen_funcs->size(num, slot, func); + start = stolen_funcs->base(num, slot, func, size); if (size && start) { + printk(KERN_INFO "Reserving Intel graphics stolen memory at 0x%x-0x%x\n", + start, start + (u32)size - 1); /* Mark this space as reserved */ e820_add_region(start, size, E820_RESERVED); sanitize_e820_map(e820.map, diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index e6253195a301..52819e816f87 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -308,7 +308,10 @@ static int ftrace_write(unsigned long ip, const char *val, int size) if (within(ip, (unsigned long)_text, (unsigned long)_etext)) ip = (unsigned long)__va(__pa_symbol(ip)); - return probe_kernel_write((void *)ip, val, size); + if (probe_kernel_write((void *)ip, val, size)) + return -EPERM; + + return 0; } static int add_break(unsigned long ip, const char *old) @@ -323,10 +326,7 @@ static int add_break(unsigned long ip, const char *old) if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0) return -EINVAL; - if (ftrace_write(ip, &brk, 1)) - return -EPERM; - - return 0; + return ftrace_write(ip, &brk, 1); } static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr) @@ -425,7 +425,7 @@ static int remove_breakpoint(struct dyn_ftrace *rec) /* If this does not have a breakpoint, we are done */ if (ins[0] != brk) - return -1; + return 0; nop = ftrace_nop_replace(); @@ -455,7 +455,7 @@ static int remove_breakpoint(struct dyn_ftrace *rec) } update: - return probe_kernel_write((void *)ip, &nop[0], 1); + return ftrace_write(ip, nop, 1); } static int add_update_code(unsigned long ip, unsigned const char *new) @@ -463,9 +463,7 @@ static int add_update_code(unsigned long ip, unsigned const char *new) /* skip breakpoint */ ip++; new++; - if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1)) - return -EPERM; - return 0; + return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1); } static int add_update_call(struct dyn_ftrace *rec, unsigned long addr) @@ -520,10 +518,7 @@ static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr) new = ftrace_call_replace(ip, addr); - if (ftrace_write(ip, new, 1)) - return -EPERM; - - return 0; + return ftrace_write(ip, new, 1); } static int finish_update_nop(struct dyn_ftrace *rec) @@ -533,9 +528,7 @@ static int finish_update_nop(struct dyn_ftrace *rec) new = ftrace_nop_replace(); - if (ftrace_write(ip, new, 1)) - return -EPERM; - return 0; + return ftrace_write(ip, new, 1); } static int finish_update(struct dyn_ftrace *rec, int enable) @@ -632,8 +625,14 @@ void ftrace_replace_code(int enable) printk(KERN_WARNING "Failed on %s (%d):\n", report, count); for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); - remove_breakpoint(rec); + /* + * Breakpoints are handled only when this function is in + * progress. The system could not work with them. + */ + if (remove_breakpoint(rec)) + BUG(); } + run_sync(); } static int @@ -655,16 +654,19 @@ ftrace_modify_code(unsigned long ip, unsigned const char *old_code, run_sync(); ret = ftrace_write(ip, new_code, 1); - if (ret) { - ret = -EPERM; - goto out; - } - run_sync(); + /* + * The breakpoint is handled only when this function is in progress. + * The system could not work if we could not remove it. + */ + BUG_ON(ret); out: + run_sync(); return ret; fail_update: - probe_kernel_write((void *)ip, &old_code[0], 1); + /* Also here the system could not work with the breakpoint */ + if (ftrace_write(ip, old_code, 1)) + BUG(); goto out; } @@ -678,11 +680,8 @@ void arch_ftrace_update_code(int command) atomic_dec(&modifying_ftrace_code); } -int __init ftrace_dyn_arch_init(void *data) +int __init ftrace_dyn_arch_init(void) { - /* The return code is retured via data */ - *(unsigned long *)data = 0; - return 0; } #endif diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 93eed15a8fd4..8d80ae011603 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -941,12 +941,14 @@ static __init int hpet_late_init(void) if (boot_cpu_has(X86_FEATURE_ARAT)) return 0; + cpu_notifier_register_begin(); for_each_online_cpu(cpu) { hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); } /* This notifier should be called after workqueue is ready */ - hotcpu_notifier(hpet_cpuhp_notify, -20); + __hotcpu_notifier(hpet_cpuhp_notify, -20); + cpu_notifier_register_done(); return 0; } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 42805fac0092..283a76a9cc40 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -125,7 +125,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); seq_printf(p, " Machine check polls\n"); #endif -#if defined(CONFIG_HYPERV) || defined(CONFIG_XEN) +#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 79a3f9682871..61b17dc2c277 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -897,9 +897,10 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) struct kprobe *cur = kprobe_running(); struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - switch (kcb->kprobe_status) { - case KPROBE_HIT_SS: - case KPROBE_REENTER: + if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) { + /* This must happen on single-stepping */ + WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS && + kcb->kprobe_status != KPROBE_REENTER); /* * We are here because the instruction being single * stepped caused a page fault. We reset the current @@ -914,9 +915,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) else reset_current_kprobe(); preempt_enable_no_resched(); - break; - case KPROBE_HIT_ACTIVE: - case KPROBE_HIT_SSDONE: + } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE || + kcb->kprobe_status == KPROBE_HIT_SSDONE) { /* * We increment the nmissed count for accounting, * we can also use npre/npostfault count for accounting @@ -945,10 +945,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) * fixup routine could not handle it, * Let do_page_fault() fix it. */ - break; - default: - break; } + return 0; } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 713f1b3bad52..0331cb389d68 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -417,7 +417,6 @@ void kvm_disable_steal_time(void) #ifdef CONFIG_SMP static void __init kvm_smp_prepare_boot_cpu(void) { - WARN_ON(kvm_register_clock("primary cpu clock")); kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); kvm_spinlock_init(); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index e6041094ff26..d9156ceecdff 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -242,7 +242,7 @@ void __init kvmclock_init(void) hv_clock = __va(mem); memset(hv_clock, 0, size); - if (kvm_register_clock("boot clock")) { + if (kvm_register_clock("primary cpu clock")) { hv_clock = NULL; memblock_free(mem, size); return; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ebc987398923..af1d14a9ebda 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -229,6 +229,17 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) } } + /* + * On x86-64 we do not support 16-bit segments due to + * IRET leaking the high bits of the kernel stack address. + */ +#ifdef CONFIG_X86_64 + if (!ldt_info.seg_32bit) { + error = -EINVAL; + goto out_unlock; + } +#endif + fill_ldt(&ldt, &ldt_info); if (oldmode) ldt.avl = 0; diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 05266b5aae22..c9603ac80de5 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -259,14 +259,15 @@ static int __init msr_init(void) goto out_chrdev; } msr_class->devnode = msr_devnode; - get_online_cpus(); + + cpu_notifier_register_begin(); for_each_online_cpu(i) { err = msr_device_create(i); if (err != 0) goto out_class; } - register_hotcpu_notifier(&msr_class_cpu_notifier); - put_online_cpus(); + __register_hotcpu_notifier(&msr_class_cpu_notifier); + cpu_notifier_register_done(); err = 0; goto out; @@ -275,7 +276,7 @@ out_class: i = 0; for_each_online_cpu(i) msr_device_destroy(i); - put_online_cpus(); + cpu_notifier_register_done(); class_destroy(msr_class); out_chrdev: __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); @@ -286,13 +287,14 @@ out: static void __exit msr_exit(void) { int cpu = 0; - get_online_cpus(); + + cpu_notifier_register_begin(); for_each_online_cpu(cpu) msr_device_destroy(cpu); class_destroy(msr_class); __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); - unregister_hotcpu_notifier(&msr_class_cpu_notifier); - put_online_cpus(); + __unregister_hotcpu_notifier(&msr_class_cpu_notifier); + cpu_notifier_register_done(); } module_init(msr_init); diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 299d49302e7d..0497f719977d 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1207,23 +1207,31 @@ error: return ret; } -static inline int __init determine_tce_table_size(u64 ram) +static inline int __init determine_tce_table_size(void) { int ret; if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED) return specified_table_size; - /* - * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to - * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each - * larger table size has twice as many entries, so shift the - * max ram address by 13 to divide by 8K and then look at the - * order of the result to choose between 0-7. - */ - ret = get_order(ram >> 13); - if (ret > TCE_TABLE_SIZE_8M) + if (is_kdump_kernel() && saved_max_pfn) { + /* + * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to + * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each + * larger table size has twice as many entries, so shift the + * max ram address by 13 to divide by 8K and then look at the + * order of the result to choose between 0-7. + */ + ret = get_order((saved_max_pfn * PAGE_SIZE) >> 13); + if (ret > TCE_TABLE_SIZE_8M) + ret = TCE_TABLE_SIZE_8M; + } else { + /* + * Use 8M by default (suggested by Muli) if it's not + * kdump kernel and saved_max_pfn isn't set. + */ ret = TCE_TABLE_SIZE_8M; + } return ret; } @@ -1418,8 +1426,7 @@ int __init detect_calgary(void) return -ENOMEM; } - specified_table_size = determine_tce_table_size((is_kdump_kernel() ? - saved_max_pfn : max_pfn) * PAGE_SIZE); + specified_table_size = determine_tce_table_size(); for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { struct calgary_bus_info *info = &bus_info[bus]; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 654b46574b91..3399d3a99730 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -114,8 +114,8 @@ EXPORT_SYMBOL(machine_real_restart); */ static int __init set_pci_reboot(const struct dmi_system_id *d) { - if (reboot_type != BOOT_CF9) { - reboot_type = BOOT_CF9; + if (reboot_type != BOOT_CF9_FORCE) { + reboot_type = BOOT_CF9_FORCE; pr_info("%s series board detected. Selecting %s-method for reboots.\n", d->ident, "PCI"); } @@ -458,20 +458,23 @@ void __attribute__((weak)) mach_reboot_fixups(void) } /* - * Windows compatible x86 hardware expects the following on reboot: + * To the best of our knowledge Windows compatible x86 hardware expects + * the following on reboot: * * 1) If the FADT has the ACPI reboot register flag set, try it * 2) If still alive, write to the keyboard controller * 3) If still alive, write to the ACPI reboot register again * 4) If still alive, write to the keyboard controller again * 5) If still alive, call the EFI runtime service to reboot - * 6) If still alive, write to the PCI IO port 0xCF9 to reboot - * 7) If still alive, inform BIOS to do a proper reboot + * 6) If no EFI runtime service, call the BIOS to do a reboot * - * If the machine is still alive at this stage, it gives up. We default to - * following the same pattern, except that if we're still alive after (7) we'll - * try to force a triple fault and then cycle between hitting the keyboard - * controller and doing that + * We default to following the same pattern. We also have + * two other reboot methods: 'triple fault' and 'PCI', which + * can be triggered via the reboot= kernel boot option or + * via quirks. + * + * This means that this function can never return, it can misbehave + * by not rebooting properly and hanging. */ static void native_machine_emergency_restart(void) { @@ -492,6 +495,11 @@ static void native_machine_emergency_restart(void) for (;;) { /* Could also try the reset bit in the Hammer NB */ switch (reboot_type) { + case BOOT_ACPI: + acpi_reboot(); + reboot_type = BOOT_KBD; + break; + case BOOT_KBD: mach_reboot_fixups(); /* For board specific fixups */ @@ -509,43 +517,29 @@ static void native_machine_emergency_restart(void) } break; - case BOOT_TRIPLE: - load_idt(&no_idt); - __asm__ __volatile__("int3"); - - /* We're probably dead after this, but... */ - reboot_type = BOOT_KBD; - break; - - case BOOT_BIOS: - machine_real_restart(MRR_BIOS); - - /* We're probably dead after this, but... */ - reboot_type = BOOT_TRIPLE; - break; - - case BOOT_ACPI: - acpi_reboot(); - reboot_type = BOOT_KBD; - break; - case BOOT_EFI: if (efi_enabled(EFI_RUNTIME_SERVICES)) efi.reset_system(reboot_mode == REBOOT_WARM ? EFI_RESET_WARM : EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); - reboot_type = BOOT_CF9_COND; + reboot_type = BOOT_BIOS; + break; + + case BOOT_BIOS: + machine_real_restart(MRR_BIOS); + + /* We're probably dead after this, but... */ + reboot_type = BOOT_CF9_SAFE; break; - case BOOT_CF9: + case BOOT_CF9_FORCE: port_cf9_safe = true; /* Fall through */ - case BOOT_CF9_COND: + case BOOT_CF9_SAFE: if (port_cf9_safe) { - u8 reboot_code = reboot_mode == REBOOT_WARM ? - 0x06 : 0x0E; + u8 reboot_code = reboot_mode == REBOOT_WARM ? 0x06 : 0x0E; u8 cf9 = inb(0xcf9) & ~reboot_code; outb(cf9|2, 0xcf9); /* Request hard reset */ udelay(50); @@ -553,7 +547,15 @@ static void native_machine_emergency_restart(void) outb(cf9|reboot_code, 0xcf9); udelay(50); } - reboot_type = BOOT_BIOS; + reboot_type = BOOT_TRIPLE; + break; + + case BOOT_TRIPLE: + load_idt(&no_idt); + __asm__ __volatile__("int3"); + + /* We're probably dead after this, but... */ + reboot_type = BOOT_KBD; break; } } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 9ea287666c65..8b3b3eb3cead 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -348,9 +348,13 @@ static int __init vsyscall_init(void) { BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)); + cpu_notifier_register_begin(); + on_each_cpu(cpu_vsyscall_init, NULL, 1); /* notifier priority > KVM */ - hotcpu_notifier(cpu_vsyscall_notifier, 30); + __hotcpu_notifier(cpu_vsyscall_notifier, 30); + + cpu_notifier_register_done(); return 0; } diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e5503d8aec1d..f47a104a749c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -28,7 +28,7 @@ static u32 xstate_required_size(u64 xstate_bv) int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; - xstate_bv &= ~XSTATE_FPSSE; + xstate_bv &= XSTATE_EXTEND_MASK; while (xstate_bv) { if (xstate_bv & 0x1) { u32 eax, ebx, ecx, edx; @@ -43,6 +43,16 @@ static u32 xstate_required_size(u64 xstate_bv) return ret; } +u64 kvm_supported_xcr0(void) +{ + u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0; + + if (!kvm_x86_ops->mpx_supported()) + xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR); + + return xcr0; +} + void kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -73,9 +83,9 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu) } else { vcpu->arch.guest_supported_xcr0 = (best->eax | ((u64)best->edx << 32)) & - host_xcr0 & KVM_SUPPORTED_XCR0; - vcpu->arch.guest_xstate_size = - xstate_required_size(vcpu->arch.guest_supported_xcr0); + kvm_supported_xcr0(); + vcpu->arch.guest_xstate_size = best->ebx = + xstate_required_size(vcpu->arch.xcr0); } kvm_pmu_cpuid_update(vcpu); @@ -210,13 +220,6 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags = 0; } -static bool supported_xcr0_bit(unsigned bit) -{ - u64 mask = ((u64)1 << bit); - - return mask & KVM_SUPPORTED_XCR0 & host_xcr0; -} - #define F(x) bit(X86_FEATURE_##x) static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, @@ -256,6 +259,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; + unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -303,7 +307,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | f_invpcid | F(RTM); + F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | + F(ADX) | F(SMAP); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -436,16 +441,18 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } case 0xd: { int idx, i; + u64 supported = kvm_supported_xcr0(); - entry->eax &= host_xcr0 & KVM_SUPPORTED_XCR0; - entry->edx &= (host_xcr0 & KVM_SUPPORTED_XCR0) >> 32; + entry->eax &= supported; + entry->edx &= supported >> 32; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; for (idx = 1, i = 1; idx < 64; ++idx) { + u64 mask = ((u64)1 << idx); if (*nent >= maxnent) goto out; do_cpuid_1_ent(&entry[i], function, idx); - if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) + if (entry[i].eax == 0 || !(supported & mask)) continue; entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a2a1bb7ed8c1..eeecbed26ac7 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -48,6 +48,14 @@ static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_SMEP)); } +static inline bool guest_cpuid_has_smap(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_SMAP)); +} + static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 07ffca0a89e9..205b17eed93c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3668,6 +3668,10 @@ static const struct gprefix pfx_vmovntpx = { I(0, em_mov), N, N, N, }; +static const struct gprefix pfx_0f_28_0f_29 = { + I(Aligned, em_mov), I(Aligned, em_mov), N, N, +}; + static const struct escape escape_d9 = { { N, N, N, N, N, N, N, I(DstMem, em_fnstcw), }, { @@ -3870,7 +3874,9 @@ static const struct opcode twobyte_table[256] = { IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), N, N, N, N, - N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx), + GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29), + GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29), + N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx), N, N, N, N, /* 0x30 - 0x3F */ II(ImplicitOps | Priv, em_wrmsr, wrmsr), diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9b531351a587..813d31038b93 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3329,7 +3329,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) arch.direct_map = vcpu->arch.mmu.direct_map; arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); - return kvm_setup_async_pf(vcpu, gva, gfn, &arch); + return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch); } static bool can_do_async_pf(struct kvm_vcpu *vcpu) @@ -3601,20 +3601,27 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, } } -static void update_permission_bitmask(struct kvm_vcpu *vcpu, +void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, bool ept) { unsigned bit, byte, pfec; u8 map; - bool fault, x, w, u, wf, uf, ff, smep; + bool fault, x, w, u, wf, uf, ff, smapf, cr4_smap, cr4_smep, smap = 0; - smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { pfec = byte << 1; map = 0; wf = pfec & PFERR_WRITE_MASK; uf = pfec & PFERR_USER_MASK; ff = pfec & PFERR_FETCH_MASK; + /* + * PFERR_RSVD_MASK bit is set in PFEC if the access is not + * subject to SMAP restrictions, and cleared otherwise. The + * bit is only meaningful if the SMAP bit is set in CR4. + */ + smapf = !(pfec & PFERR_RSVD_MASK); for (bit = 0; bit < 8; ++bit) { x = bit & ACC_EXEC_MASK; w = bit & ACC_WRITE_MASK; @@ -3626,12 +3633,33 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, /* Allow supervisor writes if !cr0.wp */ w |= !is_write_protection(vcpu) && !uf; /* Disallow supervisor fetches of user code if cr4.smep */ - x &= !(smep && u && !uf); + x &= !(cr4_smep && u && !uf); + + /* + * SMAP:kernel-mode data accesses from user-mode + * mappings should fault. A fault is considered + * as a SMAP violation if all of the following + * conditions are ture: + * - X86_CR4_SMAP is set in CR4 + * - An user page is accessed + * - Page fault in kernel mode + * - if CPL = 3 or X86_EFLAGS_AC is clear + * + * Here, we cover the first three conditions. + * The fourth is computed dynamically in + * permission_fault() and is in smapf. + * + * Also, SMAP does not affect instruction + * fetches, add the !ff check here to make it + * clearer. + */ + smap = cr4_smap && u && !uf && !ff; } else /* Not really needed: no U/S accesses on ept */ u = 1; - fault = (ff && !x) || (uf && !u) || (wf && !w); + fault = (ff && !x) || (uf && !u) || (wf && !w) || + (smapf && smap); map |= fault << bit; } mmu->permissions[byte] = map; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 292615274358..3842e70bdb7c 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -44,11 +44,17 @@ #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_RSVD_MASK (1U << 3) -#define PFERR_FETCH_MASK (1U << 4) +#define PFERR_PRESENT_BIT 0 +#define PFERR_WRITE_BIT 1 +#define PFERR_USER_BIT 2 +#define PFERR_RSVD_BIT 3 +#define PFERR_FETCH_BIT 4 + +#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) +#define PFERR_USER_MASK (1U << PFERR_USER_BIT) +#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); @@ -73,6 +79,8 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly); +void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + bool ept); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { @@ -110,10 +118,30 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) * Will a fault with a given page-fault error code (pfec) cause a permission * fault with the given access (in ACC_* format)? */ -static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access, - unsigned pfec) +static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + unsigned pte_access, unsigned pfec) { - return (mmu->permissions[pfec >> 1] >> pte_access) & 1; + int cpl = kvm_x86_ops->get_cpl(vcpu); + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + + /* + * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1. + * + * If CPL = 3, SMAP applies to all supervisor-mode data accesses + * (these are implicit supervisor accesses) regardless of the value + * of EFLAGS.AC. + * + * This computes (cpl < 3) && (rflags & X86_EFLAGS_AC), leaving + * the result in X86_EFLAGS_AC. We then insert it in place of + * the PFERR_RSVD_MASK bit; this bit will always be zero in pfec, + * but it will be one in index if SMAP checks are being overridden. + * It is important to keep this branchless. + */ + unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); + int index = (pfec >> 1) + + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); + + return (mmu->permissions[index] >> pte_access) & 1; } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index cba218a2f08d..123efd3ec29f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -353,7 +353,7 @@ retry_walk: walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); - if (unlikely(permission_fault(mmu, pte_access, access))) { + if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) { errcode |= PFERR_PRESENT_MASK; goto error; } @@ -913,7 +913,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't * used by guest then tlbs are not flushed, so guest is allowed to access the * freed pages. - * And we increase kvm->tlbs_dirty to delay tlbs flush in this case. + * We set tlbs_dirty to let the notifier know this change and delay the flush + * until such a case actually happens. */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -942,7 +943,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return -EINVAL; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - vcpu->kvm->tlbs_dirty++; + vcpu->kvm->tlbs_dirty = true; continue; } @@ -957,7 +958,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); - vcpu->kvm->tlbs_dirty++; + vcpu->kvm->tlbs_dirty = true; continue; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2de1bc09a8d4..7f4f9c2badae 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -34,6 +34,7 @@ #include <asm/perf_event.h> #include <asm/tlbflush.h> #include <asm/desc.h> +#include <asm/debugreg.h> #include <asm/kvm_para.h> #include <asm/virtext.h> @@ -303,20 +304,35 @@ static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) return vmcb->control.intercept_cr & (1U << bit); } -static inline void set_dr_intercept(struct vcpu_svm *svm, int bit) +static inline void set_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr |= (1U << bit); + vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ) + | (1 << INTERCEPT_DR1_READ) + | (1 << INTERCEPT_DR2_READ) + | (1 << INTERCEPT_DR3_READ) + | (1 << INTERCEPT_DR4_READ) + | (1 << INTERCEPT_DR5_READ) + | (1 << INTERCEPT_DR6_READ) + | (1 << INTERCEPT_DR7_READ) + | (1 << INTERCEPT_DR0_WRITE) + | (1 << INTERCEPT_DR1_WRITE) + | (1 << INTERCEPT_DR2_WRITE) + | (1 << INTERCEPT_DR3_WRITE) + | (1 << INTERCEPT_DR4_WRITE) + | (1 << INTERCEPT_DR5_WRITE) + | (1 << INTERCEPT_DR6_WRITE) + | (1 << INTERCEPT_DR7_WRITE); recalc_intercepts(svm); } -static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit) +static inline void clr_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr &= ~(1U << bit); + vmcb->control.intercept_dr = 0; recalc_intercepts(svm); } @@ -1080,23 +1096,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_cr_intercept(svm, INTERCEPT_CR4_WRITE); set_cr_intercept(svm, INTERCEPT_CR8_WRITE); - set_dr_intercept(svm, INTERCEPT_DR0_READ); - set_dr_intercept(svm, INTERCEPT_DR1_READ); - set_dr_intercept(svm, INTERCEPT_DR2_READ); - set_dr_intercept(svm, INTERCEPT_DR3_READ); - set_dr_intercept(svm, INTERCEPT_DR4_READ); - set_dr_intercept(svm, INTERCEPT_DR5_READ); - set_dr_intercept(svm, INTERCEPT_DR6_READ); - set_dr_intercept(svm, INTERCEPT_DR7_READ); - - set_dr_intercept(svm, INTERCEPT_DR0_WRITE); - set_dr_intercept(svm, INTERCEPT_DR1_WRITE); - set_dr_intercept(svm, INTERCEPT_DR2_WRITE); - set_dr_intercept(svm, INTERCEPT_DR3_WRITE); - set_dr_intercept(svm, INTERCEPT_DR4_WRITE); - set_dr_intercept(svm, INTERCEPT_DR5_WRITE); - set_dr_intercept(svm, INTERCEPT_DR6_WRITE); - set_dr_intercept(svm, INTERCEPT_DR7_WRITE); + set_dr_intercepts(svm); set_exception_intercept(svm, PF_VECTOR); set_exception_intercept(svm, UD_VECTOR); @@ -1684,6 +1684,21 @@ static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) mark_dirty(svm->vmcb, VMCB_DR); } +static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + get_debugreg(vcpu->arch.db[0], 0); + get_debugreg(vcpu->arch.db[1], 1); + get_debugreg(vcpu->arch.db[2], 2); + get_debugreg(vcpu->arch.db[3], 3); + vcpu->arch.dr6 = svm_get_dr6(vcpu); + vcpu->arch.dr7 = svm->vmcb->save.dr7; + + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + set_dr_intercepts(svm); +} + static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2842,6 +2857,7 @@ static int iret_interception(struct vcpu_svm *svm) clr_intercept(svm, INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); return 1; } @@ -2974,6 +2990,17 @@ static int dr_interception(struct vcpu_svm *svm) unsigned long val; int err; + if (svm->vcpu.guest_debug == 0) { + /* + * No more DR vmexits; force a reload of the debug registers + * and reenter on this instruction. The next vmexit will + * retrieve the full state of the debug registers. + */ + clr_dr_intercepts(svm); + svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + return 1; + } + if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) return emulate_on_interception(svm); @@ -3649,7 +3676,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) return ret; } -static int enable_irq_window(struct kvm_vcpu *vcpu) +static void enable_irq_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3663,16 +3690,15 @@ static int enable_irq_window(struct kvm_vcpu *vcpu) svm_set_vintr(svm); svm_inject_irq(svm, 0x0); } - return 0; } -static int enable_nmi_window(struct kvm_vcpu *vcpu) +static void enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK) - return 0; /* IRET will cause a vm exit */ + return; /* IRET will cause a vm exit */ /* * Something prevents NMI from been injected. Single step over possible @@ -3681,7 +3707,6 @@ static int enable_nmi_window(struct kvm_vcpu *vcpu) svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); update_db_bp_intercept(vcpu); - return 0; } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -4064,6 +4089,11 @@ static bool svm_invpcid_supported(void) return false; } +static bool svm_mpx_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -4302,6 +4332,7 @@ static struct kvm_x86_ops svm_x86_ops = { .get_dr6 = svm_get_dr6, .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, + .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, @@ -4345,6 +4376,7 @@ static struct kvm_x86_ops svm_x86_ops = { .rdtscp_supported = svm_rdtscp_supported, .invpcid_supported = svm_invpcid_supported, + .mpx_supported = svm_mpx_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 392752834751..1f68c5831924 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -31,6 +31,7 @@ #include <linux/ftrace_event.h> #include <linux/slab.h> #include <linux/tboot.h> +#include <linux/hrtimer.h> #include "kvm_cache_regs.h" #include "x86.h" @@ -42,6 +43,7 @@ #include <asm/i387.h> #include <asm/xcr.h> #include <asm/perf_event.h> +#include <asm/debugreg.h> #include <asm/kexec.h> #include "trace.h" @@ -110,6 +112,8 @@ module_param(nested, bool, S_IRUGO); #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) +#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -202,6 +206,7 @@ struct __packed vmcs12 { u64 guest_pdptr1; u64 guest_pdptr2; u64 guest_pdptr3; + u64 guest_bndcfgs; u64 host_ia32_pat; u64 host_ia32_efer; u64 host_ia32_perf_global_ctrl; @@ -374,6 +379,9 @@ struct nested_vmx { */ struct page *apic_access_page; u64 msr_ia32_feature_control; + + struct hrtimer preemption_timer; + bool preemption_timer_expired; }; #define POSTED_INTR_ON 0 @@ -441,6 +449,7 @@ struct vcpu_vmx { #endif int gs_ldt_reload_needed; int fs_reload_needed; + u64 msr_host_bndcfgs; } host_state; struct { int vm86_active; @@ -533,6 +542,7 @@ static const unsigned long shadow_read_write_fields[] = { GUEST_CS_LIMIT, GUEST_CS_BASE, GUEST_ES_BASE, + GUEST_BNDCFGS, CR0_GUEST_HOST_MASK, CR0_READ_SHADOW, CR4_READ_SHADOW, @@ -588,6 +598,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(GUEST_PDPTR1, guest_pdptr1), FIELD64(GUEST_PDPTR2, guest_pdptr2), FIELD64(GUEST_PDPTR3, guest_pdptr3), + FIELD64(GUEST_BNDCFGS, guest_bndcfgs), FIELD64(HOST_IA32_PAT, host_ia32_pat), FIELD64(HOST_IA32_EFER, host_ia32_efer), FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), @@ -718,6 +729,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); +static bool vmx_mpx_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -728,6 +740,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); +static bool vmx_mpx_supported(void); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -1047,6 +1060,12 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; } +static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); @@ -1710,6 +1729,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) if (is_long_mode(&vmx->vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif + if (boot_cpu_has(X86_FEATURE_MPX)) + rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, @@ -1747,6 +1768,8 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif + if (vmx->host_state.msr_host_bndcfgs) + wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); /* * If the FPU is not active (through the host task or * the guest vcpu), then restore the cr0.TS bit. @@ -2248,9 +2271,9 @@ static __init void nested_vmx_setup_ctls_msrs(void) */ nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | - PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS | + PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; + nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; /* * Exit controls @@ -2265,15 +2288,12 @@ static __init void nested_vmx_setup_ctls_msrs(void) #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif - VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; - if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) || - !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) { - nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; - nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER; - } - nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | - VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER); + if (vmx_mpx_supported()) + nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2287,6 +2307,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) VM_ENTRY_LOAD_IA32_PAT; nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); + if (vmx_mpx_supported()) + nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, @@ -2342,9 +2364,9 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); - nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | - VMX_MISC_SAVE_EFER_LMA; - nested_vmx_misc_low |= VMX_MISC_ACTIVITY_HLT; + nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; + nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | + VMX_MISC_ACTIVITY_HLT; nested_vmx_misc_high = 0; } @@ -2479,6 +2501,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_IA32_SYSENTER_ESP: data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_IA32_BNDCFGS: + if (!vmx_mpx_supported()) + return 1; + data = vmcs_read64(GUEST_BNDCFGS); + break; case MSR_IA32_FEATURE_CONTROL: if (!nested_vmx_allowed(vcpu)) return 1; @@ -2547,6 +2574,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: vmcs_writel(GUEST_SYSENTER_ESP, data); break; + case MSR_IA32_BNDCFGS: + if (!vmx_mpx_supported()) + return 1; + vmcs_write64(GUEST_BNDCFGS, data); + break; case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; @@ -2832,12 +2864,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmx_capability.ept, vmx_capability.vpid); } - min = 0; + min = VM_EXIT_SAVE_DEBUG_CONTROLS; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_ACK_INTR_ON_EXIT; + VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; @@ -2853,8 +2885,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; - min = 0; - opt = VM_ENTRY_LOAD_IA32_PAT; + min = VM_ENTRY_LOAD_DEBUG_CONTROLS; + opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) return -EIO; @@ -3452,13 +3484,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; /* - * SMEP is disabled if CPU is in non-paging mode in - * hardware. However KVM always uses paging mode to + * SMEP/SMAP is disabled if CPU is in non-paging mode + * in hardware. However KVM always uses paging mode to * emulate guest non-paging mode with TDP. - * To emulate this behavior, SMEP needs to be manually - * disabled when guest switches to non-paging mode. + * To emulate this behavior, SMEP/SMAP needs to be + * manually disabled when guest switches to non-paging + * mode. */ - hw_cr4 &= ~X86_CR4_SMEP; + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } @@ -4223,6 +4256,10 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; + + if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) + exec_control &= ~CPU_BASED_MOV_DR_EXITING; + if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { exec_control &= ~CPU_BASED_TPR_SHADOW; #ifdef CONFIG_X86_64 @@ -4496,39 +4533,28 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) PIN_BASED_NMI_EXITING; } -static int enable_irq_window(struct kvm_vcpu *vcpu) +static void enable_irq_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) - /* - * We get here if vmx_interrupt_allowed() said we can't - * inject to L1 now because L2 must run. The caller will have - * to make L2 exit right after entry, so we can inject to L1 - * more promptly. - */ - return -EBUSY; - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - return 0; } -static int enable_nmi_window(struct kvm_vcpu *vcpu) +static void enable_nmi_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (!cpu_has_virtual_nmis()) - return enable_irq_window(vcpu); - - if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) - return enable_irq_window(vcpu); + if (!cpu_has_virtual_nmis() || + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + enable_irq_window(vcpu); + return; + } cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - return 0; } static void vmx_inject_irq(struct kvm_vcpu *vcpu) @@ -4620,22 +4646,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) { - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - if (nested_exit_on_nmi(vcpu)) { - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, - NMI_VECTOR | INTR_TYPE_NMI_INTR | - INTR_INFO_VALID_MASK, 0); - /* - * The NMI-triggered VM exit counts as injection: - * clear this one and block further NMIs. - */ - vcpu->arch.nmi_pending = 0; - vmx_set_nmi_mask(vcpu, true); - return 0; - } - } + if (to_vmx(vcpu)->nested.nested_run_pending) + return 0; if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) return 0; @@ -4647,19 +4659,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) { - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - if (nested_exit_on_intr(vcpu)) { - nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, - 0, 0); - /* - * fall through to normal code, but now in L1, not L2 - */ - } - } - - return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + return (!to_vmx(vcpu)->nested.nested_run_pending && + vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } @@ -5102,6 +5103,22 @@ static int handle_dr(struct kvm_vcpu *vcpu) } } + if (vcpu->guest_debug == 0) { + u32 cpu_based_vm_exec_control; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + + /* + * No more DR vmexits; force a reload of the debug registers + * and reenter on this instruction. The next vmexit will + * retrieve the full state of the debug registers. + */ + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + return 1; + } + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); dr = exit_qualification & DEBUG_REG_ACCESS_NUM; reg = DEBUG_REG_ACCESS_REG(exit_qualification); @@ -5128,6 +5145,24 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) { } +static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + get_debugreg(vcpu->arch.db[0], 0); + get_debugreg(vcpu->arch.db[1], 1); + get_debugreg(vcpu->arch.db[2], 2); + get_debugreg(vcpu->arch.db[3], 3); + get_debugreg(vcpu->arch.dr6, 6); + vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); + + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -5727,6 +5762,18 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, */ } +static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) +{ + struct vcpu_vmx *vmx = + container_of(timer, struct vcpu_vmx, nested.preemption_timer); + + vmx->nested.preemption_timer_expired = true; + kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); + kvm_vcpu_kick(&vmx->vcpu); + + return HRTIMER_NORESTART; +} + /* * Emulate the VMXON instruction. * Currently, we just remember that VMX is active, and do not save or even @@ -5791,6 +5838,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu) INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); vmx->nested.vmcs02_num = 0; + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + vmx->nested.vmxon = true; skip_emulated_instruction(vcpu); @@ -6767,9 +6818,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * table is L0's fault. */ return 0; - case EXIT_REASON_PREEMPTION_TIMER: - return vmcs12->pin_based_vm_exec_control & - PIN_BASED_VMX_PREEMPTION_TIMER; case EXIT_REASON_WBINVD: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: @@ -6785,27 +6833,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } -static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu) -{ - u64 delta_tsc_l1; - u32 preempt_val_l1, preempt_val_l2, preempt_scale; - - if (!(get_vmcs12(vcpu)->pin_based_vm_exec_control & - PIN_BASED_VMX_PREEMPTION_TIMER)) - return; - preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) & - MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE; - preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE); - delta_tsc_l1 = vmx_read_l1_tsc(vcpu, native_read_tsc()) - - vcpu->arch.last_guest_tsc; - preempt_val_l1 = delta_tsc_l1 >> preempt_scale; - if (preempt_val_l2 <= preempt_val_l1) - preempt_val_l2 = 0; - else - preempt_val_l2 -= preempt_val_l1; - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2); -} - /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -7052,6 +7079,12 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) local_irq_enable(); } +static bool vmx_mpx_supported(void) +{ + return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && + (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -7218,8 +7251,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); - if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) - nested_adjust_preemption_timer(vcpu); vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ @@ -7616,6 +7647,28 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, kvm_inject_page_fault(vcpu, fault); } +static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) +{ + u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vcpu->arch.virtual_tsc_khz == 0) + return; + + /* Make sure short timeouts reliably trigger an immediate vmexit. + * hrtimer_start does not guarantee this. */ + if (preemption_timeout <= 1) { + vmx_preemption_timer_fn(&vmx->nested.preemption_timer); + return; + } + + preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; + preemption_timeout *= 1000000; + do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); + hrtimer_start(&vmx->nested.preemption_timer, + ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -7629,7 +7682,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; - u32 exit_control; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@ -7687,13 +7739,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VMCS_LINK_POINTER, -1ull); - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, - (vmcs_config.pin_based_exec_ctrl | - vmcs12->pin_based_vm_exec_control)); + exec_control = vmcs12->pin_based_vm_exec_control; + exec_control |= vmcs_config.pin_based_exec_ctrl; + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); - if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, - vmcs12->vmx_preemption_timer_value); + vmx->nested.preemption_timer_expired = false; + if (nested_cpu_has_preemption_timer(vmcs12)) + vmx_start_preemption_timer(vcpu); /* * Whether page-faults are trapped is determined by a combination of @@ -7721,7 +7774,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) enable_ept ? vmcs12->page_fault_error_code_match : 0); if (cpu_has_secondary_exec_ctrls()) { - u32 exec_control = vmx_secondary_exec_control(vmx); + exec_control = vmx_secondary_exec_control(vmx); if (!vmx->rdtscp_enabled) exec_control &= ~SECONDARY_EXEC_RDTSCP; /* Take the following fields only from vmcs12 */ @@ -7808,10 +7861,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits are further modified by vmx_set_efer() below. */ - exit_control = vmcs_config.vmexit_ctrl; - if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) - exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; - vm_exit_controls_init(vmx, exit_control); + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are * emulated by vmx_set_efer(), below. @@ -7830,6 +7880,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) set_cr4_guest_host_mask(vmx); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); @@ -8155,6 +8208,58 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, } } +static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && + vmx->nested.preemption_timer_expired) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); + return 0; + } + + if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { + if (vmx->nested.nested_run_pending || + vcpu->arch.interrupt.pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + NMI_VECTOR | INTR_TYPE_NMI_INTR | + INTR_INFO_VALID_MASK, 0); + /* + * The NMI-triggered VM exit counts as injection: + * clear this one and block further NMIs. + */ + vcpu->arch.nmi_pending = 0; + vmx_set_nmi_mask(vcpu, true); + return 0; + } + + if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && + nested_exit_on_intr(vcpu)) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + } + + return 0; +} + +static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) +{ + ktime_t remaining = + hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); + u64 value; + + if (ktime_to_ns(remaining) <= 0) + return 0; + + value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; + do_div(value, 1000000); + return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; +} + /* * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), @@ -8225,10 +8330,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, else vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; - if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) && - (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) - vmcs12->vmx_preemption_timer_value = - vmcs_read32(VMX_PREEMPTION_TIMER_VALUE); + if (nested_cpu_has_preemption_timer(vmcs12)) { + if (vmcs12->vm_exit_controls & + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) + vmcs12->vmx_preemption_timer_value = + vmx_get_preemption_timer_value(vcpu); + hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); + } /* * In some cases (usually, nested EPT), L2 is allowed to change its @@ -8260,6 +8368,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); + if (vmx_mpx_supported()) + vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); /* update exit information fields: */ @@ -8369,6 +8479,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); + /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ + if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, 0); + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; @@ -8495,6 +8609,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, nested_vmx_succeed(vcpu); if (enable_shadow_vmcs) vmx->nested.sync_shadow_vmcs = true; + + /* in case we halted in L2 */ + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } /* @@ -8573,6 +8690,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_dr6 = vmx_get_dr6, .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, @@ -8634,6 +8752,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_intercept = vmx_check_intercept, .handle_external_intr = vmx_handle_external_intr, + .mpx_supported = vmx_mpx_supported, + + .check_nested_events = vmx_check_nested_events, }; static int __init vmx_init(void) @@ -8721,6 +8842,8 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); + vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); + memcpy(vmx_msr_bitmap_legacy_x2apic, vmx_msr_bitmap_legacy, PAGE_SIZE); memcpy(vmx_msr_bitmap_longmode_x2apic, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2b8578432d5b..8b8fc0b792ba 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -595,13 +595,13 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - u64 xcr0; + u64 xcr0 = xcr; + u64 old_xcr0 = vcpu->arch.xcr0; u64 valid_bits; /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ if (index != XCR_XFEATURE_ENABLED_MASK) return 1; - xcr0 = xcr; if (!(xcr0 & XSTATE_FP)) return 1; if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) @@ -616,8 +616,14 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if (xcr0 & ~valid_bits) return 1; + if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR))) + return 1; + kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; + + if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK) + kvm_update_cpuid(vcpu); return 0; } @@ -646,6 +652,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) return 1; + if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP)) + return 1; + if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE)) return 1; @@ -674,6 +683,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); + if ((cr4 ^ old_cr4) & X86_CR4_SMAP) + update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false); + if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) kvm_update_cpuid(vcpu); @@ -753,7 +765,9 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu) else dr7 = vcpu->arch.dr7; kvm_x86_ops->set_dr7(vcpu, dr7); - vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK); + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; + if (dr7 & DR7_BP_EN_MASK) + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; } static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) @@ -879,7 +893,7 @@ static u32 msrs_to_save[] = { MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, - MSR_IA32_FEATURE_CONTROL + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS }; static unsigned num_msrs_to_save; @@ -1109,7 +1123,6 @@ static inline u64 get_kernel_ns(void) { struct timespec ts; - WARN_ON(preemptible()); ktime_get_ts(&ts); monotonic_to_bootbased(&ts); return timespec_to_ns(&ts); @@ -1581,7 +1594,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; - vcpu->last_kernel_ns = kernel_ns; vcpu->last_guest_tsc = tsc_timestamp; /* @@ -1623,14 +1635,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * the others. * * So in those cases, request a kvmclock update for all vcpus. - * The worst case for a remote vcpu to update its kvmclock - * is then bounded by maximum nohz sleep latency. + * We need to rate-limit these requests though, as they can + * considerably slow guests that have a large number of vcpus. + * The time for a remote vcpu to update its kvmclock is bound + * by the delay we use to rate-limit the updates. */ -static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) + +static void kvmclock_update_fn(struct work_struct *work) { int i; - struct kvm *kvm = v->kvm; + struct delayed_work *dwork = to_delayed_work(work); + struct kvm_arch *ka = container_of(dwork, struct kvm_arch, + kvmclock_update_work); + struct kvm *kvm = container_of(ka, struct kvm, arch); struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { @@ -1639,6 +1658,29 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) } } +static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +{ + struct kvm *kvm = v->kvm; + + set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests); + schedule_delayed_work(&kvm->arch.kvmclock_update_work, + KVMCLOCK_UPDATE_DELAY); +} + +#define KVMCLOCK_SYNC_PERIOD (300 * HZ) + +static void kvmclock_sync_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct kvm_arch *ka = container_of(dwork, struct kvm_arch, + kvmclock_sync_work); + struct kvm *kvm = container_of(ka, struct kvm, arch); + + schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -2323,9 +2365,12 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_VP_INDEX: { int r; struct kvm_vcpu *v; - kvm_for_each_vcpu(r, v, vcpu->kvm) - if (v == vcpu) + kvm_for_each_vcpu(r, v, vcpu->kvm) { + if (v == vcpu) { data = r; + break; + } + } break; } case HV_X64_MSR_EOI: @@ -2617,6 +2662,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: case KVM_CAP_HYPERV_TIME: + case KVM_CAP_IOAPIC_POLARITY_IGNORED: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -3043,9 +3089,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility * with old userspace. */ - if (xstate_bv & ~KVM_SUPPORTED_XCR0) - return -EINVAL; - if (xstate_bv & ~host_xcr0) + if (xstate_bv & ~kvm_supported_xcr0()) return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state->xsave, guest_xsave->region, vcpu->arch.guest_xstate_size); @@ -3898,6 +3942,23 @@ static void kvm_init_msr_list(void) for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) continue; + + /* + * Even MSRs that are valid in the host may not be exposed + * to the guests in some cases. We could work around this + * in VMX with the generic MSR save/load machinery, but it + * is not really worthwhile since it will really only + * happen with nested virtualization. + */ + switch (msrs_to_save[i]) { + case MSR_IA32_BNDCFGS: + if (!kvm_x86_ops->mpx_supported()) + continue; + break; + default: + break; + } + if (j < i) msrs_to_save[j] = msrs_to_save[i]; j++; @@ -4108,7 +4169,8 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, | (write ? PFERR_WRITE_MASK : 0); if (vcpu_match_mmio_gva(vcpu, gva) - && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) { + && !permission_fault(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.access, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -4394,6 +4456,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (!exchanged) return X86EMUL_CMPXCHG_FAILED; + mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); kvm_mmu_pte_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; @@ -5365,7 +5428,8 @@ static void kvm_timer_init(void) int cpu; max_tsc_khz = tsc_khz; - register_hotcpu_notifier(&kvmclock_cpu_notifier_block); + + cpu_notifier_register_begin(); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { #ifdef CONFIG_CPU_FREQ struct cpufreq_policy policy; @@ -5382,6 +5446,10 @@ static void kvm_timer_init(void) pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); for_each_online_cpu(cpu) smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); + + __register_hotcpu_notifier(&kvmclock_cpu_notifier_block); + cpu_notifier_register_done(); + } static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); @@ -5537,9 +5605,10 @@ int kvm_arch_init(void *opaque) goto out_free_percpu; kvm_set_mmio_spte_mask(); - kvm_init_msr_list(); kvm_x86_ops = ops; + kvm_init_msr_list(); + kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); @@ -5782,8 +5851,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); } -static void inject_pending_event(struct kvm_vcpu *vcpu) +static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) { + int r; + /* try to reinject previous events if any */ if (vcpu->arch.exception.pending) { trace_kvm_inj_exception(vcpu->arch.exception.nr, @@ -5793,17 +5864,23 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code, vcpu->arch.exception.reinject); - return; + return 0; } if (vcpu->arch.nmi_injected) { kvm_x86_ops->set_nmi(vcpu); - return; + return 0; } if (vcpu->arch.interrupt.pending) { kvm_x86_ops->set_irq(vcpu); - return; + return 0; + } + + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { + r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); + if (r != 0) + return r; } /* try to inject new event if pending */ @@ -5820,6 +5897,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) kvm_x86_ops->set_irq(vcpu); } } + return 0; } static void process_nmi(struct kvm_vcpu *vcpu) @@ -5924,15 +6002,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } - inject_pending_event(vcpu); - + if (inject_pending_event(vcpu, req_int_win) != 0) + req_immediate_exit = true; /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) - req_immediate_exit = - kvm_x86_ops->enable_nmi_window(vcpu) != 0; + else if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) - req_immediate_exit = - kvm_x86_ops->enable_irq_window(vcpu) != 0; + kvm_x86_ops->enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { /* @@ -5992,12 +6068,28 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); + set_debugreg(vcpu->arch.dr6, 6); } trace_kvm_entry(vcpu->vcpu_id); kvm_x86_ops->run(vcpu); /* + * Do this here before restoring debug registers on the host. And + * since we do this before handling the vmexit, a DR access vmexit + * can (a) read the correct value of the debug registers, (b) set + * KVM_DEBUGREG_WONT_EXIT again. + */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { + int i; + + WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); + kvm_x86_ops->sync_dirty_debug_regs(vcpu); + for (i = 0; i < KVM_NR_DB_REGS; i++) + vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + } + + /* * If the guest has used debug registers, at least dr7 * will be disabled while returning to the host. * If we don't have active breakpoints in the host, we don't @@ -6711,6 +6803,7 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { int r; struct msr_data msr; + struct kvm *kvm = vcpu->kvm; r = vcpu_load(vcpu); if (r) @@ -6721,6 +6814,9 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) kvm_write_tsc(vcpu, &msr); vcpu_put(vcpu); + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); + return r; } @@ -7013,6 +7109,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) pvclock_update_vm_gtod_copy(kvm); + INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); + INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); + return 0; } @@ -7050,6 +7149,8 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_sync_events(struct kvm *kvm) { + cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); + cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); } @@ -7248,6 +7349,9 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) + kvm_x86_ops->check_nested_events(vcpu, false); + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted) || !list_empty_careful(&vcpu->async_pf.done) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 8da5823bcde6..8c97bac9a895 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -122,9 +122,12 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); -#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) +#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ + | XSTATE_BNDREGS | XSTATE_BNDCSR) extern u64 host_xcr0; +extern u64 kvm_supported_xcr0(void); + extern unsigned int min_timer_period_us; extern struct static_key kvm_no_apic_vcpu; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 799580cabc78..597ac155c91c 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -328,17 +328,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr) return; } -static int __initdata early_ioremap_debug; - -static int __init early_ioremap_debug_setup(char *str) -{ - early_ioremap_debug = 1; - - return 0; -} -early_param("early_ioremap_debug", early_ioremap_debug_setup); - -static __initdata int after_paging_init; static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) @@ -362,18 +351,11 @@ bool __init is_early_ioremap_ptep(pte_t *ptep) return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)]; } -static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; - void __init early_ioremap_init(void) { pmd_t *pmd; - int i; - if (early_ioremap_debug) - printk(KERN_INFO "early_ioremap_init()\n"); - - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) - slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); + early_ioremap_setup(); pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); memset(bm_pte, 0, sizeof(bm_pte)); @@ -402,13 +384,8 @@ void __init early_ioremap_init(void) } } -void __init early_ioremap_reset(void) -{ - after_paging_init = 1; -} - -static void __init __early_set_fixmap(enum fixed_addresses idx, - phys_addr_t phys, pgprot_t flags) +void __init __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) { unsigned long addr = __fix_to_virt(idx); pte_t *pte; @@ -425,198 +402,3 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, pte_clear(&init_mm, addr, pte); __flush_tlb_one(addr); } - -static inline void __init early_set_fixmap(enum fixed_addresses idx, - phys_addr_t phys, pgprot_t prot) -{ - if (after_paging_init) - __set_fixmap(idx, phys, prot); - else - __early_set_fixmap(idx, phys, prot); -} - -static inline void __init early_clear_fixmap(enum fixed_addresses idx) -{ - if (after_paging_init) - clear_fixmap(idx); - else - __early_set_fixmap(idx, 0, __pgprot(0)); -} - -static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; -static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; - -void __init fixup_early_ioremap(void) -{ - int i; - - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { - if (prev_map[i]) { - WARN_ON(1); - break; - } - } - - early_ioremap_init(); -} - -static int __init check_early_ioremap_leak(void) -{ - int count = 0; - int i; - - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) - if (prev_map[i]) - count++; - - if (!count) - return 0; - WARN(1, KERN_WARNING - "Debug warning: early ioremap leak of %d areas detected.\n", - count); - printk(KERN_WARNING - "please boot with early_ioremap_debug and report the dmesg.\n"); - - return 1; -} -late_initcall(check_early_ioremap_leak); - -static void __init __iomem * -__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) -{ - unsigned long offset; - resource_size_t last_addr; - unsigned int nrpages; - enum fixed_addresses idx; - int i, slot; - - WARN_ON(system_state != SYSTEM_BOOTING); - - slot = -1; - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { - if (!prev_map[i]) { - slot = i; - break; - } - } - - if (slot < 0) { - printk(KERN_INFO "%s(%08llx, %08lx) not found slot\n", - __func__, (u64)phys_addr, size); - WARN_ON(1); - return NULL; - } - - if (early_ioremap_debug) { - printk(KERN_INFO "%s(%08llx, %08lx) [%d] => ", - __func__, (u64)phys_addr, size, slot); - dump_stack(); - } - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) { - WARN_ON(1); - return NULL; - } - - prev_size[slot] = size; - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr + 1) - phys_addr; - - /* - * Mappings have to fit in the FIX_BTMAP area. - */ - nrpages = size >> PAGE_SHIFT; - if (nrpages > NR_FIX_BTMAPS) { - WARN_ON(1); - return NULL; - } - - /* - * Ok, go for it.. - */ - idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; - while (nrpages > 0) { - early_set_fixmap(idx, phys_addr, prot); - phys_addr += PAGE_SIZE; - --idx; - --nrpages; - } - if (early_ioremap_debug) - printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]); - - prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); - return prev_map[slot]; -} - -/* Remap an IO device */ -void __init __iomem * -early_ioremap(resource_size_t phys_addr, unsigned long size) -{ - return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO); -} - -/* Remap memory */ -void __init __iomem * -early_memremap(resource_size_t phys_addr, unsigned long size) -{ - return __early_ioremap(phys_addr, size, PAGE_KERNEL); -} - -void __init early_iounmap(void __iomem *addr, unsigned long size) -{ - unsigned long virt_addr; - unsigned long offset; - unsigned int nrpages; - enum fixed_addresses idx; - int i, slot; - - slot = -1; - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { - if (prev_map[i] == addr) { - slot = i; - break; - } - } - - if (slot < 0) { - printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n", - addr, size); - WARN_ON(1); - return; - } - - if (prev_size[slot] != size) { - printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", - addr, size, slot, prev_size[slot]); - WARN_ON(1); - return; - } - - if (early_ioremap_debug) { - printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, - size, slot); - dump_stack(); - } - - virt_addr = (unsigned long)addr; - if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) { - WARN_ON(1); - return; - } - offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; - - idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; - while (nrpages > 0) { - early_clear_fixmap(idx); - --idx; - --nrpages; - } - prev_map[slot] = NULL; -} diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index d87dd6d042d6..dd89a13f1051 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -78,10 +78,16 @@ early_initcall(kmemcheck_init); */ static int __init param_kmemcheck(char *str) { + int val; + int ret; + if (!str) return -EINVAL; - sscanf(str, "%d", &kmemcheck_enabled); + ret = kstrtoint(str, 0, &val); + if (ret) + return ret; + kmemcheck_enabled = val; return 0; } diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index a69bcb8c7621..4dd8cf652579 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -127,7 +127,7 @@ static int __init parse_reservetop(char *arg) address = memparse(arg, &arg); reserve_top_address(address); - fixup_early_ioremap(); + early_ioremap_init(); return 0; } early_param("reservetop", parse_reservetop); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 4ed75dd81d05..dc017735bb91 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -553,13 +553,13 @@ void bpf_jit_compile(struct sk_filter *fp) } break; case BPF_S_ANC_RXHASH: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, rxhash) != 4); - if (is_imm8(offsetof(struct sk_buff, rxhash))) { + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); + if (is_imm8(offsetof(struct sk_buff, hash))) { /* mov off8(%rdi),%eax */ - EMIT3(0x8b, 0x47, offsetof(struct sk_buff, rxhash)); + EMIT3(0x8b, 0x47, offsetof(struct sk_buff, hash)); } else { EMIT2(0x8b, 0x87); - EMIT(offsetof(struct sk_buff, rxhash), 4); + EMIT(offsetof(struct sk_buff, hash), 4); } break; case BPF_S_ANC_QUEUE: @@ -772,6 +772,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i]; bpf_flush_icache(header, image + proglen); set_memory_ro((unsigned long)header, header->pages); fp->bpf_func = (void *)image; + fp->jited = 1; } out: kfree(addrs); @@ -791,7 +792,7 @@ static void bpf_jit_free_deferred(struct work_struct *work) void bpf_jit_free(struct sk_filter *fp) { - if (fp->bpf_func != sk_run_filter) { + if (fp->jited) { INIT_WORK(&fp->work, bpf_jit_free_deferred); schedule_work(&fp->work); } else { diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 6890d8498e0b..379e8bd0deea 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -494,14 +494,19 @@ static int nmi_setup(void) if (err) goto fail; + cpu_notifier_register_begin(); + + /* Use get/put_online_cpus() to protect 'nmi_enabled' */ get_online_cpus(); - register_cpu_notifier(&oprofile_cpu_nb); nmi_enabled = 1; /* make nmi_enabled visible to the nmi handler: */ smp_mb(); on_each_cpu(nmi_cpu_setup, NULL, 1); + __register_cpu_notifier(&oprofile_cpu_nb); put_online_cpus(); + cpu_notifier_register_done(); + return 0; fail: free_msrs(); @@ -512,12 +517,18 @@ static void nmi_shutdown(void) { struct op_msrs *msrs; + cpu_notifier_register_begin(); + + /* Use get/put_online_cpus() to protect 'nmi_enabled' & 'ctr_running' */ get_online_cpus(); - unregister_cpu_notifier(&oprofile_cpu_nb); on_each_cpu(nmi_cpu_shutdown, NULL, 1); nmi_enabled = 0; ctr_running = 0; + __unregister_cpu_notifier(&oprofile_cpu_nb); put_online_cpus(); + + cpu_notifier_register_done(); + /* make variables visible to the nmi handler: */ smp_mb(); unregister_nmi_handler(NMI_LOCAL, "oprofile"); diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index a313a7fb6b86..e88f4c53d7f6 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -370,10 +370,13 @@ static int __init pci_io_ecs_init(void) if (early_pci_allowed()) pci_enable_pci_io_ecs(); - register_cpu_notifier(&amd_cpu_notifier); + cpu_notifier_register_begin(); for_each_online_cpu(cpu) amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE, (void *)(long)cpu); + __register_cpu_notifier(&amd_cpu_notifier); + cpu_notifier_register_done(); + pci_probe |= PCI_HAS_IO_ECS; return 0; diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 103e702ec5a7..905956f16465 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -178,6 +178,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) i = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], + (type == PCI_CAP_ID_MSI) ? nvec : 1, (type == PCI_CAP_ID_MSIX) ? "pcifront-msi-x" : "pcifront-msi", @@ -245,6 +246,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) "xen: msi already bound to pirq=%d\n", pirq); } irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, + (type == PCI_CAP_ID_MSI) ? nvec : 1, (type == PCI_CAP_ID_MSIX) ? "msi-x" : "msi", DOMID_SELF); @@ -269,9 +271,6 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int ret = 0; struct msi_desc *msidesc; - if (type == PCI_CAP_ID_MSI && nvec > 1) - return 1; - list_for_each_entry(msidesc, &dev->msi_list, list) { struct physdev_map_pirq map_irq; domid_t domid; @@ -291,7 +290,10 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) (pci_domain_nr(dev->bus) << 16); map_irq.devfn = dev->devfn; - if (type == PCI_CAP_ID_MSIX) { + if (type == PCI_CAP_ID_MSI && nvec > 1) { + map_irq.type = MAP_PIRQ_TYPE_MULTI_MSI; + map_irq.entry_nr = nvec; + } else if (type == PCI_CAP_ID_MSIX) { int pos; u32 table_offset, bir; @@ -308,6 +310,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (pci_seg_supported) ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (type == PCI_CAP_ID_MSI && nvec > 1 && ret) { + /* + * If MAP_PIRQ_TYPE_MULTI_MSI is not available + * there's nothing else we can do in this case. + * Just set ret > 0 so driver can retry with + * single MSI. + */ + ret = 1; + goto out; + } if (ret == -EINVAL && !pci_domain_nr(dev->bus)) { map_irq.type = MAP_PIRQ_TYPE_MSI; map_irq.index = -1; @@ -324,11 +336,10 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) goto out; } - ret = xen_bind_pirq_msi_to_irq(dev, msidesc, - map_irq.pirq, - (type == PCI_CAP_ID_MSIX) ? - "msi-x" : "msi", - domid); + ret = xen_bind_pirq_msi_to_irq(dev, msidesc, map_irq.pirq, + (type == PCI_CAP_ID_MSI) ? nvec : 1, + (type == PCI_CAP_ID_MSIX) ? "msi-x" : "msi", + domid); if (ret < 0) goto out; } diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile index f325af26107c..3323c2745248 100644 --- a/arch/x86/syscalls/Makefile +++ b/arch/x86/syscalls/Makefile @@ -54,5 +54,7 @@ syshdr-$(CONFIG_X86_64) += syscalls_64.h targets += $(uapisyshdr-y) $(syshdr-y) +PHONY += all all: $(addprefix $(uapi)/,$(uapisyshdr-y)) all: $(addprefix $(out)/,$(syshdr-y)) + @: diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 96bc506ac6de..d6b867921612 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -359,3 +359,4 @@ 350 i386 finit_module sys_finit_module 351 i386 sched_setattr sys_sched_setattr 352 i386 sched_getattr sys_sched_getattr +353 i386 renameat2 sys_renameat2 diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index a12bddc7ccea..04376ac3d9ef 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -322,6 +322,7 @@ 313 common finit_module sys_finit_module 314 common sched_setattr sys_sched_setattr 315 common sched_getattr sys_sched_getattr +316 common renameat2 sys_renameat2 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index e8120346903b..604a37efd4d5 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -40,4 +40,6 @@ $(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/ina HOST_EXTRACFLAGS += -I$(srctree)/tools/include hostprogs-y += relocs relocs-objs := relocs_32.o relocs_64.o relocs_common.o +PHONY += relocs relocs: $(obj)/relocs + @: diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S index 2e263f367b13..9df017ab2285 100644 --- a/arch/x86/vdso/vdso-layout.lds.S +++ b/arch/x86/vdso/vdso-layout.lds.S @@ -9,12 +9,9 @@ SECTIONS #ifdef BUILD_VDSO32 #include <asm/vdso32.h> - .hpet_sect : { - hpet_page = . - VDSO_OFFSET(VDSO_HPET_PAGE); - } :text :hpet_sect + hpet_page = . - VDSO_OFFSET(VDSO_HPET_PAGE); - .vvar_sect : { - vvar = . - VDSO_OFFSET(VDSO_VVAR_PAGE); + vvar = . - VDSO_OFFSET(VDSO_VVAR_PAGE); /* Place all vvars at the offsets in asm/vvar.h. */ #define EMIT_VVAR(name, offset) vvar_ ## name = vvar + offset; @@ -22,7 +19,6 @@ SECTIONS #include <asm/vvar.h> #undef __VVAR_KERNEL_LDS #undef EMIT_VVAR - } :text :vvar_sect #endif . = SIZEOF_HEADERS; @@ -61,7 +57,12 @@ SECTIONS */ . = ALIGN(0x100); - .text : { *(.text*) } :text =0x90909090 + .text : { *(.text*) } :text =0x90909090, + + /* + * The comma above works around a bug in gold: + * https://sourceware.org/bugzilla/show_bug.cgi?id=16804 + */ /DISCARD/ : { *(.discard) @@ -84,8 +85,4 @@ PHDRS dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ note PT_NOTE FLAGS(4); /* PF_R */ eh_frame_hdr PT_GNU_EH_FRAME; -#ifdef BUILD_VDSO32 - vvar_sect PT_NULL FLAGS(4); /* PF_R */ - hpet_sect PT_NULL FLAGS(4); /* PF_R */ -#endif } diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 9c50cc2e403b..e88fda867a33 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -19,11 +19,6 @@ config XEN_DOM0 depends on XEN && PCI_XEN && SWIOTLB_XEN depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI -# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST -# name in tools. -config XEN_PRIVILEGED_GUEST - def_bool XEN_DOM0 - config XEN_PVHVM def_bool y depends on XEN && PCI && X86_LOCAL_APIC diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 696c694986d0..85e5d78c9874 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -881,6 +881,65 @@ static unsigned long mfn_hash(unsigned long mfn) return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); } +int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count) +{ + int i, ret = 0; + bool lazy = false; + pte_t *pte; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (kmap_ops && + !in_interrupt() && + paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { + arch_enter_lazy_mmu_mode(); + lazy = true; + } + + for (i = 0; i < count; i++) { + unsigned long mfn, pfn; + + /* Do not add to override if the map failed. */ + if (map_ops[i].status) + continue; + + if (map_ops[i].flags & GNTMAP_contains_pte) { + pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + + (map_ops[i].host_addr & ~PAGE_MASK)); + mfn = pte_mfn(*pte); + } else { + mfn = PFN_DOWN(map_ops[i].dev_bus_addr); + } + pfn = page_to_pfn(pages[i]); + + WARN_ON(PagePrivate(pages[i])); + SetPagePrivate(pages[i]); + set_page_private(pages[i], mfn); + pages[i]->index = pfn_to_mfn(pfn); + + if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) { + ret = -ENOMEM; + goto out; + } + + if (kmap_ops) { + ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]); + if (ret) + goto out; + } + } + +out: + if (lazy) + arch_leave_lazy_mmu_mode(); + + return ret; +} +EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); + /* Add an MFN override for a particular page */ int m2p_add_override(unsigned long mfn, struct page *page, struct gnttab_map_grant_ref *kmap_op) @@ -899,13 +958,6 @@ int m2p_add_override(unsigned long mfn, struct page *page, "m2p_add_override: pfn %lx not mapped", pfn)) return -EINVAL; } - WARN_ON(PagePrivate(page)); - SetPagePrivate(page); - set_page_private(page, mfn); - page->index = pfn_to_mfn(pfn); - - if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) - return -ENOMEM; if (kmap_op != NULL) { if (!PageHighMem(page)) { @@ -943,20 +995,62 @@ int m2p_add_override(unsigned long mfn, struct page *page, return 0; } EXPORT_SYMBOL_GPL(m2p_add_override); + +int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count) +{ + int i, ret = 0; + bool lazy = false; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (kmap_ops && + !in_interrupt() && + paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { + arch_enter_lazy_mmu_mode(); + lazy = true; + } + + for (i = 0; i < count; i++) { + unsigned long mfn = get_phys_to_machine(page_to_pfn(pages[i])); + unsigned long pfn = page_to_pfn(pages[i]); + + if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) { + ret = -EINVAL; + goto out; + } + + set_page_private(pages[i], INVALID_P2M_ENTRY); + WARN_ON(!PagePrivate(pages[i])); + ClearPagePrivate(pages[i]); + set_phys_to_machine(pfn, pages[i]->index); + + if (kmap_ops) + ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn); + if (ret) + goto out; + } + +out: + if (lazy) + arch_leave_lazy_mmu_mode(); + return ret; +} +EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); + int m2p_remove_override(struct page *page, - struct gnttab_map_grant_ref *kmap_op) + struct gnttab_map_grant_ref *kmap_op, + unsigned long mfn) { unsigned long flags; - unsigned long mfn; unsigned long pfn; unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; pfn = page_to_pfn(page); - mfn = get_phys_to_machine(pfn); - if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) - return -EINVAL; if (!PageHighMem(page)) { address = (unsigned long)__va(pfn << PAGE_SHIFT); @@ -970,10 +1064,7 @@ int m2p_remove_override(struct page *page, spin_lock_irqsave(&m2p_override_lock, flags); list_del(&page->lru); spin_unlock_irqrestore(&m2p_override_lock, flags); - WARN_ON(!PagePrivate(page)); - ClearPagePrivate(page); - set_phys_to_machine(pfn, page->index); if (kmap_op != NULL) { if (!PageHighMem(page)) { struct multicall_space mcs; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index a18eadd8bb40..7005974c3ff3 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -441,10 +441,11 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) irq_ctx_init(cpu); #else clear_tsk_thread_flag(idle, TIF_FORK); +#endif per_cpu(kernel_stack, cpu) = (unsigned long)task_stack_page(idle) - KERNEL_STACK_OFFSET + THREAD_SIZE; -#endif + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_init_lock_cpu(cpu); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 4d3acc34a998..0ba5f3b967f0 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -274,7 +274,7 @@ void __init xen_init_spinlocks(void) printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); return; } - + printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); pv_lock_ops.unlock_kick = xen_unlock_kick; } @@ -290,6 +290,9 @@ static __init int xen_init_spinlocks_jump(void) if (!xen_pvspin) return 0; + if (!xen_domain()) + return 0; + static_key_slow_inc(¶virt_ticketlocks_enabled); return 0; } diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 33ca6e42a4ca..fd92a64d748e 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -75,6 +75,17 @@ ENDPROC(xen_sysexit) * stack state in whatever form its in, we keep things simple by only * using a single register which is pushed/popped on the stack. */ + +.macro POP_FS +1: + popw %fs +.pushsection .fixup, "ax" +2: movw $0, (%esp) + jmp 1b +.popsection + _ASM_EXTABLE(1b,2b) +.endm + ENTRY(xen_iret) /* test eflags for special cases */ testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) @@ -83,15 +94,13 @@ ENTRY(xen_iret) push %eax ESP_OFFSET=4 # bytes pushed onto stack - /* - * Store vcpu_info pointer for easy access. Do it this way to - * avoid having to reload %fs - */ + /* Store vcpu_info pointer for easy access */ #ifdef CONFIG_SMP - GET_THREAD_INFO(%eax) - movl %ss:TI_cpu(%eax), %eax - movl %ss:__per_cpu_offset(,%eax,4), %eax - mov %ss:xen_vcpu(%eax), %eax + pushw %fs + movl $(__KERNEL_PERCPU), %eax + movl %eax, %fs + movl %fs:xen_vcpu, %eax + POP_FS #else movl %ss:xen_vcpu, %eax #endif |