From 5f40f7d93898a473eb222aa8064144c1d6835470 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Mon, 31 Mar 2014 09:37:00 -0500 Subject: x86/UV: Set n_lshift based on GAM_GR_CONFIG MMR for UV3 The value of n_lshift for UV is currently set based on the socket m_val. For UV3, set the n_lshift value based on the GAM_GR_CONFIG MMR. This will allow bios to control the n_lshift value independent of the socket m_val. Then n_lshift can be assigned a fixed value across a multi-partition system, allowing for a fixed common global physical address format that is independent of socket m_val. Cleanup unneeded macros. Signed-off-by: Dimitri Sivanich Link: http://lkml.kernel.org/r/20140331143700.GB29916@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 12 +---------- arch/x86/include/asm/uv/uv_mmrs.h | 42 +++++++++++++++++++++++++++++++++++++- arch/x86/kernel/apic/x2apic_uv_x.c | 26 ++++++++++++++++++----- 3 files changed, 63 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index a30836c8ac4d..c63e925fd6b7 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -5,7 +5,7 @@ * * SGI UV architectural definitions * - * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_X86_UV_UV_HUB_H @@ -204,16 +204,6 @@ static inline int is_uvx_hub(void) return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE; } -static inline int is_uv2_1_hub(void) -{ - return uv_hub_info->hub_revision == UV2_HUB_REVISION_BASE; -} - -static inline int is_uv2_2_hub(void) -{ - return uv_hub_info->hub_revision == UV2_HUB_REVISION_BASE + 1; -} - union uvh_apicid { unsigned long v; struct uvh_apicid_s { diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index e42249bcf7e1..ddd8db6b6e70 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -5,7 +5,7 @@ * * SGI UV MMR definitions * - * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_X86_UV_UV_MMRS_H @@ -2802,6 +2802,46 @@ union uv1h_lb_target_physical_apic_id_mask_u { } s1; }; +/* ========================================================================= */ +/* UV3H_GR0_GAM_GR_CONFIG */ +/* ========================================================================= */ +#define UV3H_GR0_GAM_GR_CONFIG 0xc00028UL + +#define UV3H_GR0_GAM_GR_CONFIG_M_SKT_SHFT 0 +#define UV3H_GR0_GAM_GR_CONFIG_SUBSPACE_SHFT 10 +#define UV3H_GR0_GAM_GR_CONFIG_M_SKT_MASK 0x000000000000003fUL +#define UV3H_GR0_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL + +union uv3h_gr0_gam_gr_config_u { + unsigned long v; + struct uv3h_gr0_gam_gr_config_s { + unsigned long m_skt:6; /* RW */ + unsigned long undef_6_9:4; /* Undefined */ + unsigned long subspace:1; /* RW */ + unsigned long reserved:53; + } s3; +}; + +/* ========================================================================= */ +/* UV3H_GR1_GAM_GR_CONFIG */ +/* ========================================================================= */ +#define UV3H_GR1_GAM_GR_CONFIG 0x1000028UL + +#define UV3H_GR1_GAM_GR_CONFIG_M_SKT_SHFT 0 +#define UV3H_GR1_GAM_GR_CONFIG_SUBSPACE_SHFT 10 +#define UV3H_GR1_GAM_GR_CONFIG_M_SKT_MASK 0x000000000000003fUL +#define UV3H_GR1_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL + +union uv3h_gr1_gam_gr_config_u { + unsigned long v; + struct uv3h_gr1_gam_gr_config_s { + unsigned long m_skt:6; /* RW */ + unsigned long undef_6_9:4; /* Undefined */ + unsigned long subspace:1; /* RW */ + unsigned long reserved:53; + } s3; +}; + /* ========================================================================= */ /* UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR */ /* ========================================================================= */ diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 7834389ba5be..293b41df54ef 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -5,7 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * - * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. */ #include #include @@ -440,6 +440,20 @@ static __initdata struct redir_addr redir_addrs[] = { {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR}, }; +static unsigned char get_n_lshift(int m_val) +{ + union uv3h_gr0_gam_gr_config_u m_gr_config; + + if (is_uv1_hub()) + return m_val; + + if (is_uv2_hub()) + return m_val == 40 ? 40 : 39; + + m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); + return m_gr_config.s3.m_skt; +} + static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) { union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias; @@ -849,6 +863,7 @@ void __init uv_system_init(void) int gnode_extra, min_pnode = 999999, max_pnode = -1; unsigned long mmr_base, present, paddr; unsigned short pnode_mask; + unsigned char n_lshift; char *hub = (is_uv1_hub() ? "UV1" : (is_uv2_hub() ? "UV2" : "UV3")); @@ -860,6 +875,7 @@ void __init uv_system_init(void) m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; pnode_mask = (1 << n_val) - 1; + n_lshift = get_n_lshift(m_val); mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; @@ -867,8 +883,9 @@ void __init uv_system_init(void) node_id.v = uv_read_local_mmr(UVH_NODE_ID); gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; gnode_upper = ((unsigned long)gnode_extra << m_val); - pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x\n", - n_val, m_val, pnode_mask, gnode_upper, gnode_extra); + pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x n_lshift 0x%x\n", + n_val, m_val, pnode_mask, gnode_upper, gnode_extra, + n_lshift); pr_info("UV: global MMR base 0x%lx\n", mmr_base); @@ -935,8 +952,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision; uv_cpu_hub_info(cpu)->m_shift = 64 - m_val; - uv_cpu_hub_info(cpu)->n_lshift = is_uv2_1_hub() ? - (m_val == 40 ? 40 : 39) : m_val; + uv_cpu_hub_info(cpu)->n_lshift = n_lshift; pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); -- cgit v1.2.3 From 0ea481466d1c7cbd9d8f70ddc17a443a6c6fc09b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 4 Apr 2014 20:24:03 +0800 Subject: crypto: ghash-clmulni-intel - Use u128 instead of be128 for internal key The internal key isn't actually in big-endian format so let's switch to u128 which also happens to allow us to remove a sparse warning. Based on suggestion by Ard Biesheuvel. Signed-off-by: Herbert Xu Acked-by: Ard Biesheuvel --- arch/x86/crypto/ghash-clmulni-intel_asm.S | 4 ++-- arch/x86/crypto/ghash-clmulni-intel_glue.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 185fad49d86f..5d1e0075ac24 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -92,7 +92,7 @@ __clmul_gf128mul_ble: ret ENDPROC(__clmul_gf128mul_ble) -/* void clmul_ghash_mul(char *dst, const be128 *shash) */ +/* void clmul_ghash_mul(char *dst, const u128 *shash) */ ENTRY(clmul_ghash_mul) movups (%rdi), DATA movups (%rsi), SHASH @@ -106,7 +106,7 @@ ENDPROC(clmul_ghash_mul) /* * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, - * const be128 *shash); + * const u128 *shash); */ ENTRY(clmul_ghash_update) cmp $16, %rdx diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index d785cf2c529c..88bb7ba8b175 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -25,17 +25,17 @@ #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 -void clmul_ghash_mul(char *dst, const be128 *shash); +void clmul_ghash_mul(char *dst, const u128 *shash); void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, - const be128 *shash); + const u128 *shash); struct ghash_async_ctx { struct cryptd_ahash *cryptd_tfm; }; struct ghash_ctx { - be128 shash; + u128 shash; }; struct ghash_desc_ctx { @@ -68,11 +68,11 @@ static int ghash_setkey(struct crypto_shash *tfm, a = be64_to_cpu(x->a); b = be64_to_cpu(x->b); - ctx->shash.a = (__be64)((b << 1) | (a >> 63)); - ctx->shash.b = (__be64)((a << 1) | (b >> 63)); + ctx->shash.a = (b << 1) | (a >> 63); + ctx->shash.b = (a << 1) | (b >> 63); if (a >> 63) - ctx->shash.b ^= cpu_to_be64(0xc2); + ctx->shash.b ^= ((u64)0xc2) << 56; return 0; } -- cgit v1.2.3 From 79a51b25badae79d2da6f7b54530adf56697f669 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 2 Apr 2014 08:13:47 -0400 Subject: x86/irq: Clean up VECTOR_UNDEFINED and VECTOR_RETRIGGERED definition During another patch review, David Rientjes noted that VECTOR_UNDEFINED and VECTOR_RETRIGGERED should be defined with ()s so that they are not erroneously used in an arithmetic operation. Suggested-by: David Rientjes Signed-off-by: Prarit Bhargava Cc: Seiji Aguchi Cc: Yang Zhang Link: http://lkml.kernel.org/r/1396440827-18352-1-git-send-email-prarit@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index a307b7530e54..4615906d83df 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -190,8 +190,8 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); #define trace_interrupt interrupt #endif -#define VECTOR_UNDEFINED -1 -#define VECTOR_RETRIGGERED -2 +#define VECTOR_UNDEFINED (-1) +#define VECTOR_RETRIGGERED (-2) typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); -- cgit v1.2.3 From b13b1d2d8692b437203de7a404c6b809d2cc4d99 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 8 Apr 2014 15:58:09 +0800 Subject: x86/mm: In the PTE swapout page reclaim case clear the accessed bit instead of flushing the TLB We use the accessed bit to age a page at page reclaim time, and currently we also flush the TLB when doing so. But in some workloads TLB flush overhead is very heavy. In my simple multithreaded app with a lot of swap to several pcie SSDs, removing the tlb flush gives about 20% ~ 30% swapout speedup. Fortunately just removing the TLB flush is a valid optimization: on x86 CPUs, clearing the accessed bit without a TLB flush doesn't cause data corruption. It could cause incorrect page aging and the (mistaken) reclaim of hot pages, but the chance of that should be relatively low. So as a performance optimization don't flush the TLB when clearing the accessed bit, it will eventually be flushed by a context switch or a VM operation anyway. [ In the rare event of it not getting flushed for a long time the delay shouldn't really matter because there's no real memory pressure for swapout to react to. ] Suggested-by: Linus Torvalds Signed-off-by: Shaohua Li Acked-by: Rik van Riel Acked-by: Mel Gorman Acked-by: Hugh Dickins Acked-by: Johannes Weiner Cc: linux-mm@kvack.org Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140408075809.GA1764@kernel.org [ Rewrote the changelog and the code comments. ] Signed-off-by: Ingo Molnar --- arch/x86/mm/pgtable.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index c96314abd144..0004ac72dbdd 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -399,13 +399,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - int young; - - young = ptep_test_and_clear_young(vma, address, ptep); - if (young) - flush_tlb_page(vma, address); - - return young; + /* + * On x86 CPUs, clearing the accessed bit without a TLB flush + * doesn't cause data corruption. [ It could cause incorrect + * page aging and the (mistaken) reclaim of hot pages, but the + * chance of that should be relatively low. ] + * + * So as a performance optimization don't flush the TLB when + * clearing the accessed bit, it will eventually be flushed by + * a context switch or a VM operation anyway. [ In the rare + * event of it not getting flushed for a long time the delay + * shouldn't really matter because there's no real memory + * pressure for swapout to react to. ] + */ + return ptep_test_and_clear_young(vma, address, ptep); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -- cgit v1.2.3 From a9358bc3531901a15e2f7cd48550c9555fc0288f Mon Sep 17 00:00:00 2001 From: Peter Foley Date: Tue, 15 Apr 2014 13:58:13 -0400 Subject: x86/build: Supress realmode.bin is up to date message Supress this unnecessary message during kernel re-build: make[3]: 'arch/x86/realmode/rm/realmode.bin' is up to date. Signed-off-by: Peter Foley Link: http://lkml.kernel.org/r/1397584693-15902-1-git-send-email-pefoley2@pefoley.com Signed-off-by: Ingo Molnar --- arch/x86/realmode/rm/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 3497f14e4dea..7c0d7be176a5 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -52,8 +52,9 @@ $(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE OBJCOPYFLAGS_realmode.bin := -O binary targets += realmode.bin -$(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs +$(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs FORCE $(call if_changed,objcopy) + @: quiet_cmd_relocs = RELOCS $@ cmd_relocs = arch/x86/tools/relocs --realmode $< > $@ -- cgit v1.2.3 From cd9ae5fe47dfb9820976c3c38c70f4b07a5a1c36 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 4 Apr 2014 06:31:04 +0300 Subject: KVM: x86: Fix page-tables reserved bits KVM does not handle the reserved bits of x86 page tables correctly: In PAE, bits 5:8 are reserved in the PDPTE. In IA-32e, bit 8 is not reserved. Signed-off-by: Nadav Amit Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 813d31038b93..668ae5916de9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3538,7 +3538,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, case PT32E_ROOT_LEVEL: context->rsvd_bits_mask[0][2] = rsvd_bits(maxphyaddr, 63) | - rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ + rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ context->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62); /* PDE */ context->rsvd_bits_mask[0][0] = exb_bit_rsvd | @@ -3550,9 +3550,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, break; case PT64_ROOT_LEVEL: context->rsvd_bits_mask[0][3] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7); context->rsvd_bits_mask[0][2] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7); context->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][0] = exb_bit_rsvd | -- cgit v1.2.3 From c625d1c203941fad755eb4eb729db1f65d6e9836 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Sep 2013 09:55:39 -0500 Subject: efi: x86: Handle arbitrary Unicode characters Instead of truncating UTF-16 assuming all characters is ASCII, properly convert it to UTF-8. Signed-off-by: H. Peter Anvin [ Bug and style fixes. ] Signed-off-by: Roy Franz Signed-off-by: Leif Lindholm Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/eboot.c | 3 +- drivers/firmware/efi/efi-stub-helper.c | 87 ++++++++++++++++++++++++++-------- 2 files changed, 68 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 4703a6c4b8e3..0331d765c2bb 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1087,8 +1087,7 @@ struct boot_params *make_boot_params(struct efi_config *c) hdr->type_of_loader = 0x21; /* Convert unicode cmdline to ascii */ - cmdline_ptr = efi_convert_cmdline_to_ascii(sys_table, image, - &options_size); + cmdline_ptr = efi_convert_cmdline(sys_table, image, &options_size); if (!cmdline_ptr) goto fail; hdr->cmd_line_ptr = (unsigned long)cmdline_ptr; diff --git a/drivers/firmware/efi/efi-stub-helper.c b/drivers/firmware/efi/efi-stub-helper.c index a168dd20511f..eb6d4be9e722 100644 --- a/drivers/firmware/efi/efi-stub-helper.c +++ b/drivers/firmware/efi/efi-stub-helper.c @@ -535,53 +535,100 @@ static efi_status_t efi_relocate_kernel(efi_system_table_t *sys_table_arg, return status; } +/* + * Get the number of UTF-8 bytes corresponding to an UTF-16 character. + * This overestimates for surrogates, but that is okay. + */ +static int efi_utf8_bytes(u16 c) +{ + return 1 + (c >= 0x80) + (c >= 0x800); +} + +/* + * Convert an UTF-16 string, not necessarily null terminated, to UTF-8. + */ +static u8 *efi_utf16_to_utf8(u8 *dst, const u16 *src, int n) +{ + unsigned int c; + + while (n--) { + c = *src++; + if (n && c >= 0xd800 && c <= 0xdbff && + *src >= 0xdc00 && *src <= 0xdfff) { + c = 0x10000 + ((c & 0x3ff) << 10) + (*src & 0x3ff); + src++; + n--; + } + if (c >= 0xd800 && c <= 0xdfff) + c = 0xfffd; /* Unmatched surrogate */ + if (c < 0x80) { + *dst++ = c; + continue; + } + if (c < 0x800) { + *dst++ = 0xc0 + (c >> 6); + goto t1; + } + if (c < 0x10000) { + *dst++ = 0xe0 + (c >> 12); + goto t2; + } + *dst++ = 0xf0 + (c >> 18); + *dst++ = 0x80 + ((c >> 12) & 0x3f); + t2: + *dst++ = 0x80 + ((c >> 6) & 0x3f); + t1: + *dst++ = 0x80 + (c & 0x3f); + } + + return dst; +} + /* * Convert the unicode UEFI command line to ASCII to pass to kernel. * Size of memory allocated return in *cmd_line_len. * Returns NULL on error. */ -static char *efi_convert_cmdline_to_ascii(efi_system_table_t *sys_table_arg, - efi_loaded_image_t *image, - int *cmd_line_len) +static char *efi_convert_cmdline(efi_system_table_t *sys_table_arg, + efi_loaded_image_t *image, + int *cmd_line_len) { - u16 *s2; + const u16 *s2; u8 *s1 = NULL; unsigned long cmdline_addr = 0; - int load_options_size = image->load_options_size / 2; /* ASCII */ - void *options = image->load_options; - int options_size = 0; + int load_options_chars = image->load_options_size / 2; /* UTF-16 */ + const u16 *options = image->load_options; + int options_bytes = 0; /* UTF-8 bytes */ + int options_chars = 0; /* UTF-16 chars */ efi_status_t status; - int i; u16 zero = 0; if (options) { s2 = options; - while (*s2 && *s2 != '\n' && options_size < load_options_size) { - s2++; - options_size++; + while (*s2 && *s2 != '\n' + && options_chars < load_options_chars) { + options_bytes += efi_utf8_bytes(*s2++); + options_chars++; } } - if (options_size == 0) { + if (!options_chars) { /* No command line options, so return empty string*/ - options_size = 1; options = &zero; } - options_size++; /* NUL termination */ + options_bytes++; /* NUL termination */ - status = efi_low_alloc(sys_table_arg, options_size, 0, &cmdline_addr); + status = efi_low_alloc(sys_table_arg, options_bytes, 0, &cmdline_addr); if (status != EFI_SUCCESS) return NULL; s1 = (u8 *)cmdline_addr; - s2 = (u16 *)options; - - for (i = 0; i < options_size - 1; i++) - *s1++ = *s2++; + s2 = (const u16 *)options; + s1 = efi_utf16_to_utf8(s1, s2, options_chars); *s1 = '\0'; - *cmd_line_len = options_size; + *cmd_line_len = options_bytes; return (char *)cmdline_addr; } -- cgit v1.2.3 From 62fa6e69a436f662090f3996538adb9e568817f6 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Thu, 27 Mar 2014 15:10:39 -0700 Subject: x86/efi: Delete most of the efi_call* macros We really only need one phys and one virt function call, and then only one assembly function to make firmware calls. Since we are not using the C type system anyway, we're not really losing much by deleting the macros apart from no longer having a check that we are passing the correct number of parameters. The lack of duplicated code seems like a worthwhile trade-off. Cc: Ricardo Neri Cc: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/head_64.S | 2 +- arch/x86/include/asm/efi.h | 74 ++++----------------------------- arch/x86/platform/efi/efi.c | 48 +++++++++++----------- arch/x86/platform/efi/efi_stub_64.S | 81 +------------------------------------ arch/x86/platform/uv/bios_uv.c | 2 +- 5 files changed, 35 insertions(+), 172 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 0d558ee899ae..2884e0c3e8a5 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -452,7 +452,7 @@ efi32_config: .global efi64_config efi64_config: .fill 11,8,0 - .quad efi_call6 + .quad efi_call .byte 1 #endif /* CONFIG_EFI_STUB */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 0869434eaf72..e3975597b289 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -27,16 +27,6 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); -#define efi_call_phys0(f) efi_call_phys(f) -#define efi_call_phys1(f, a1) efi_call_phys(f, a1) -#define efi_call_phys2(f, a1, a2) efi_call_phys(f, a1, a2) -#define efi_call_phys3(f, a1, a2, a3) efi_call_phys(f, a1, a2, a3) -#define efi_call_phys4(f, a1, a2, a3, a4) \ - efi_call_phys(f, a1, a2, a3, a4) -#define efi_call_phys5(f, a1, a2, a3, a4, a5) \ - efi_call_phys(f, a1, a2, a3, a4, a5) -#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \ - efi_call_phys(f, a1, a2, a3, a4, a5, a6) /* * Wrap all the virtual calls in a way that forces the parameters on the stack. */ @@ -44,75 +34,27 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #define efi_call_virt(f, args...) \ ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args) -#define efi_call_virt0(f) efi_call_virt(f) -#define efi_call_virt1(f, a1) efi_call_virt(f, a1) -#define efi_call_virt2(f, a1, a2) efi_call_virt(f, a1, a2) -#define efi_call_virt3(f, a1, a2, a3) efi_call_virt(f, a1, a2, a3) -#define efi_call_virt4(f, a1, a2, a3, a4) \ - efi_call_virt(f, a1, a2, a3, a4) -#define efi_call_virt5(f, a1, a2, a3, a4, a5) \ - efi_call_virt(f, a1, a2, a3, a4, a5) -#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ - efi_call_virt(f, a1, a2, a3, a4, a5, a6) - #define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size) #else /* !CONFIG_X86_32 */ -extern u64 efi_call0(void *fp); -extern u64 efi_call1(void *fp, u64 arg1); -extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); -extern u64 efi_call3(void *fp, u64 arg1, u64 arg2, u64 arg3); -extern u64 efi_call4(void *fp, u64 arg1, u64 arg2, u64 arg3, u64 arg4); -extern u64 efi_call5(void *fp, u64 arg1, u64 arg2, u64 arg3, - u64 arg4, u64 arg5); -extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3, - u64 arg4, u64 arg5, u64 arg6); - -#define efi_call_phys0(f) \ - efi_call0((f)) -#define efi_call_phys1(f, a1) \ - efi_call1((f), (u64)(a1)) -#define efi_call_phys2(f, a1, a2) \ - efi_call2((f), (u64)(a1), (u64)(a2)) -#define efi_call_phys3(f, a1, a2, a3) \ - efi_call3((f), (u64)(a1), (u64)(a2), (u64)(a3)) -#define efi_call_phys4(f, a1, a2, a3, a4) \ - efi_call4((f), (u64)(a1), (u64)(a2), (u64)(a3), \ - (u64)(a4)) -#define efi_call_phys5(f, a1, a2, a3, a4, a5) \ - efi_call5((f), (u64)(a1), (u64)(a2), (u64)(a3), \ - (u64)(a4), (u64)(a5)) -#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \ - efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3), \ - (u64)(a4), (u64)(a5), (u64)(a6)) - -#define _efi_call_virtX(x, f, ...) \ +#define EFI_LOADER_SIGNATURE "EL64" + +extern u64 asmlinkage efi_call(void *fp, ...); + +#define efi_call_phys(f, args...) efi_call((f), args) + +#define efi_call_virt(f, ...) \ ({ \ efi_status_t __s; \ \ efi_sync_low_kernel_mappings(); \ preempt_disable(); \ - __s = efi_call##x((void *)efi.systab->runtime->f, __VA_ARGS__); \ + __s = efi_call((void *)efi.systab->runtime->f, __VA_ARGS__); \ preempt_enable(); \ __s; \ }) -#define efi_call_virt0(f) \ - _efi_call_virtX(0, f) -#define efi_call_virt1(f, a1) \ - _efi_call_virtX(1, f, (u64)(a1)) -#define efi_call_virt2(f, a1, a2) \ - _efi_call_virtX(2, f, (u64)(a1), (u64)(a2)) -#define efi_call_virt3(f, a1, a2, a3) \ - _efi_call_virtX(3, f, (u64)(a1), (u64)(a2), (u64)(a3)) -#define efi_call_virt4(f, a1, a2, a3, a4) \ - _efi_call_virtX(4, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4)) -#define efi_call_virt5(f, a1, a2, a3, a4, a5) \ - _efi_call_virtX(5, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5)) -#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ - _efi_call_virtX(6, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) - extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, u32 type, u64 attribute); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 3781dd39e8bd..8f6531e8d32f 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -110,7 +110,7 @@ static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) efi_status_t status; spin_lock_irqsave(&rtc_lock, flags); - status = efi_call_virt2(get_time, tm, tc); + status = efi_call_virt(get_time, tm, tc); spin_unlock_irqrestore(&rtc_lock, flags); return status; } @@ -121,7 +121,7 @@ static efi_status_t virt_efi_set_time(efi_time_t *tm) efi_status_t status; spin_lock_irqsave(&rtc_lock, flags); - status = efi_call_virt1(set_time, tm); + status = efi_call_virt(set_time, tm); spin_unlock_irqrestore(&rtc_lock, flags); return status; } @@ -134,8 +134,7 @@ static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled, efi_status_t status; spin_lock_irqsave(&rtc_lock, flags); - status = efi_call_virt3(get_wakeup_time, - enabled, pending, tm); + status = efi_call_virt(get_wakeup_time, enabled, pending, tm); spin_unlock_irqrestore(&rtc_lock, flags); return status; } @@ -146,8 +145,7 @@ static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) efi_status_t status; spin_lock_irqsave(&rtc_lock, flags); - status = efi_call_virt2(set_wakeup_time, - enabled, tm); + status = efi_call_virt(set_wakeup_time, enabled, tm); spin_unlock_irqrestore(&rtc_lock, flags); return status; } @@ -158,17 +156,17 @@ static efi_status_t virt_efi_get_variable(efi_char16_t *name, unsigned long *data_size, void *data) { - return efi_call_virt5(get_variable, - name, vendor, attr, - data_size, data); + return efi_call_virt(get_variable, + name, vendor, attr, + data_size, data); } static efi_status_t virt_efi_get_next_variable(unsigned long *name_size, efi_char16_t *name, efi_guid_t *vendor) { - return efi_call_virt3(get_next_variable, - name_size, name, vendor); + return efi_call_virt(get_next_variable, + name_size, name, vendor); } static efi_status_t virt_efi_set_variable(efi_char16_t *name, @@ -177,9 +175,9 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name, unsigned long data_size, void *data) { - return efi_call_virt5(set_variable, - name, vendor, attr, - data_size, data); + return efi_call_virt(set_variable, + name, vendor, attr, + data_size, data); } static efi_status_t virt_efi_query_variable_info(u32 attr, @@ -190,13 +188,13 @@ static efi_status_t virt_efi_query_variable_info(u32 attr, if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; - return efi_call_virt4(query_variable_info, attr, storage_space, - remaining_space, max_variable_size); + return efi_call_virt(query_variable_info, attr, storage_space, + remaining_space, max_variable_size); } static efi_status_t virt_efi_get_next_high_mono_count(u32 *count) { - return efi_call_virt1(get_next_high_mono_count, count); + return efi_call_virt(get_next_high_mono_count, count); } static void virt_efi_reset_system(int reset_type, @@ -204,8 +202,8 @@ static void virt_efi_reset_system(int reset_type, unsigned long data_size, efi_char16_t *data) { - efi_call_virt4(reset_system, reset_type, status, - data_size, data); + efi_call_virt(reset_system, reset_type, status, + data_size, data); } static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules, @@ -215,7 +213,7 @@ static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules, if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; - return efi_call_virt3(update_capsule, capsules, count, sg_list); + return efi_call_virt(update_capsule, capsules, count, sg_list); } static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules, @@ -226,8 +224,8 @@ static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules, if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; - return efi_call_virt4(query_capsule_caps, capsules, count, max_size, - reset_type); + return efi_call_virt(query_capsule_caps, capsules, count, max_size, + reset_type); } static efi_status_t __init phys_efi_set_virtual_address_map( @@ -239,9 +237,9 @@ static efi_status_t __init phys_efi_set_virtual_address_map( efi_status_t status; efi_call_phys_prelog(); - status = efi_call_phys4(efi_phys.set_virtual_address_map, - memory_map_size, descriptor_size, - descriptor_version, virtual_map); + status = efi_call_phys(efi_phys.set_virtual_address_map, + memory_map_size, descriptor_size, + descriptor_version, virtual_map); efi_call_phys_epilog(); return status; } diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index e0984ef0374b..5fcda7272550 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -73,84 +73,7 @@ 2: .endm -ENTRY(efi_call0) - SAVE_XMM - subq $32, %rsp - SWITCH_PGT - call *%rdi - RESTORE_PGT - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call0) - -ENTRY(efi_call1) - SAVE_XMM - subq $32, %rsp - mov %rsi, %rcx - SWITCH_PGT - call *%rdi - RESTORE_PGT - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call1) - -ENTRY(efi_call2) - SAVE_XMM - subq $32, %rsp - mov %rsi, %rcx - SWITCH_PGT - call *%rdi - RESTORE_PGT - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call2) - -ENTRY(efi_call3) - SAVE_XMM - subq $32, %rsp - mov %rcx, %r8 - mov %rsi, %rcx - SWITCH_PGT - call *%rdi - RESTORE_PGT - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call3) - -ENTRY(efi_call4) - SAVE_XMM - subq $32, %rsp - mov %r8, %r9 - mov %rcx, %r8 - mov %rsi, %rcx - SWITCH_PGT - call *%rdi - RESTORE_PGT - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call4) - -ENTRY(efi_call5) - SAVE_XMM - subq $48, %rsp - mov %r9, 32(%rsp) - mov %r8, %r9 - mov %rcx, %r8 - mov %rsi, %rcx - SWITCH_PGT - call *%rdi - RESTORE_PGT - addq $48, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call5) - -ENTRY(efi_call6) +ENTRY(efi_call) SAVE_XMM mov (%rsp), %rax mov 8(%rax), %rax @@ -166,7 +89,7 @@ ENTRY(efi_call6) addq $48, %rsp RESTORE_XMM ret -ENDPROC(efi_call6) +ENDPROC(efi_call) #ifdef CONFIG_EFI_MIXED diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 766612137a62..1584cbed0dce 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -39,7 +39,7 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) */ return BIOS_STATUS_UNIMPLEMENTED; - ret = efi_call6((void *)__va(tab->function), (u64)which, + ret = efi_call((void *)__va(tab->function), (u64)which, a1, a2, a3, a4, a5); return ret; } -- cgit v1.2.3 From c6b406919288a617815f710175da20f3fca72065 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Thu, 27 Mar 2014 15:10:40 -0700 Subject: x86, fpu: Extend the use of static_cpu_has_safe It may be necessary to save and restore the FPU context during EFI runtime system services calls. However, this may happen during boot and before alternatives have run. Thus, we need to use static_cpu_has_safe instead. The rationale behind the use of static_cpu_has_safe is the same as in commit 5f8c4218148822fde6ee ("x86, fpu: Use static_cpu_has_safe before alternatives") by Borislav Petkov. Signed-off-by: Matt Fleming Signed-off-by: Ricardo Neri Cc: Borislav Petkov --- arch/x86/include/asm/fpu-internal.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index cea1c76d49bf..115e3689cd53 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -87,22 +87,22 @@ static inline int is_x32_frame(void) static __always_inline __pure bool use_eager_fpu(void) { - return static_cpu_has(X86_FEATURE_EAGER_FPU); + return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); } static __always_inline __pure bool use_xsaveopt(void) { - return static_cpu_has(X86_FEATURE_XSAVEOPT); + return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); } static __always_inline __pure bool use_xsave(void) { - return static_cpu_has(X86_FEATURE_XSAVE); + return static_cpu_has_safe(X86_FEATURE_XSAVE); } static __always_inline __pure bool use_fxsr(void) { - return static_cpu_has(X86_FEATURE_FXSR); + return static_cpu_has_safe(X86_FEATURE_FXSR); } static inline void fx_finit(struct i387_fxsave_struct *fx) @@ -293,7 +293,7 @@ static inline int restore_fpu_checking(struct task_struct *tsk) /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. Clear the x87 state here by setting it to fixed values. "m" is a random variable that should be in L1 */ - if (unlikely(static_cpu_has(X86_FEATURE_FXSAVE_LEAK))) { + if (unlikely(static_cpu_has_safe(X86_FEATURE_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" "emms\n\t" -- cgit v1.2.3 From 982e239cd2c73d2c70e14615a42c0c7391415a44 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 27 Mar 2014 15:10:41 -0700 Subject: x86/efi: Implement a __efi_call_virt macro For i386, all the EFI system runtime services functions return efi_status_t except efi_reset_system_system. Therefore, not all functions can be covered by the same macro in case the macro needs to do more than calling the function (i.e., return a value). The purpose of the __efi_call_virt macro is to be used when no return value is expected. For x86_64, this macro would not be needed as all the runtime services return u64. However, the same code is used for both x86_64 and i386. Thus, the macro __efi_call_virt is also defined to not break compilation. Signed-off-by: Ricardo Neri Cc: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 10 ++++++++++ arch/x86/platform/efi/efi.c | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index e3975597b289..19292e7cae58 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -31,9 +31,13 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); * Wrap all the virtual calls in a way that forces the parameters on the stack. */ +/* Use this macro if your virtual returns a non-void value */ #define efi_call_virt(f, args...) \ ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args) +/* Use this macro if your virtual call does not return any value */ +#define __efi_call_virt(f, args...) efi_call_virt(f, args) + #define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size) #else /* !CONFIG_X86_32 */ @@ -55,6 +59,12 @@ extern u64 asmlinkage efi_call(void *fp, ...); __s; \ }) +/* + * All X86_64 virt calls return non-void values. Thus, use non-void call for + * virt calls that would be void on X86_32. + */ +#define __efi_call_virt(f, args...) efi_call_virt(f, args) + extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, u32 type, u64 attribute); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 8f6531e8d32f..835b24820eaa 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -202,8 +202,8 @@ static void virt_efi_reset_system(int reset_type, unsigned long data_size, efi_char16_t *data) { - efi_call_virt(reset_system, reset_type, status, - data_size, data); + __efi_call_virt(reset_system, reset_type, status, + data_size, data); } static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules, -- cgit v1.2.3 From de05764e0b2a3d9559e099a2e134f8cef4500fdd Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 27 Mar 2014 15:10:42 -0700 Subject: x86/efi: Save and restore FPU context around efi_calls (x86_64) Do a complete FPU context save/restore around the EFI calls. This required as runtime EFI firmware may potentially use the FPU. This change covers only the x86_64 configuration. Signed-off-by: Ricardo Neri Cc: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 19292e7cae58..0b19187cc882 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_EFI_H #define _ASM_X86_EFI_H +#include /* * We map the EFI regions needed for runtime services non-contiguously, * with preserved alignment on virtual addresses starting from -4G down @@ -54,7 +55,9 @@ extern u64 asmlinkage efi_call(void *fp, ...); \ efi_sync_low_kernel_mappings(); \ preempt_disable(); \ + __kernel_fpu_begin(); \ __s = efi_call((void *)efi.systab->runtime->f, __VA_ARGS__); \ + __kernel_fpu_end(); \ preempt_enable(); \ __s; \ }) -- cgit v1.2.3 From b738c6ea49b4e98e6ca0651da82a610f996a16ae Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 27 Mar 2014 15:10:43 -0700 Subject: x86/efi: Save and restore FPU context around efi_calls (i386) Do a complete FPU context save/restore around the EFI calls. This required as runtime EFI firmware may potentially use the FPU. This change covers only the i386 configuration. Signed-off-by: Ricardo Neri Cc: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 0b19187cc882..1eb5f6433ad8 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -34,10 +34,23 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); /* Use this macro if your virtual returns a non-void value */ #define efi_call_virt(f, args...) \ - ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args) +({ \ + efi_status_t __s; \ + kernel_fpu_begin(); \ + __s = ((efi_##f##_t __attribute__((regparm(0)))*) \ + efi.systab->runtime->f)(args); \ + kernel_fpu_end(); \ + __s; \ +}) /* Use this macro if your virtual call does not return any value */ -#define __efi_call_virt(f, args...) efi_call_virt(f, args) +#define __efi_call_virt(f, args...) \ +({ \ + kernel_fpu_begin(); \ + ((efi_##f##_t __attribute__((regparm(0)))*) \ + efi.systab->runtime->f)(args); \ + kernel_fpu_end(); \ +}) #define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size) -- cgit v1.2.3 From f848a5a8dcb655553423f77cc98909a04e64173d Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 31 Mar 2014 21:50:38 +0300 Subject: KVM: support any-length wildcard ioeventfd It is sometimes benefitial to ignore IO size, and only match on address. In hindsight this would have been a better default than matching length when KVM_IOEVENTFD_FLAG_DATAMATCH is not set, In particular, this kind of access can be optimized on VMX: there no need to do page lookups. This can currently be done with many ioeventfds but in a suboptimal way. However we can't change kernel/userspace ABI without risk of breaking some applications. Use len = 0 to mean "ignore length for matching" in a more optimal way. Signed-off-by: Michael S. Tsirkin Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 1 + include/uapi/linux/kvm.h | 3 ++- virt/kvm/eventfd.c | 27 ++++++++++++++++++++++----- 3 files changed, 25 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8b8fc0b792ba..bc4aaf68190c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2644,6 +2644,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IRQ_INJECT_STATUS: case KVM_CAP_IRQFD: case KVM_CAP_IOEVENTFD: + case KVM_CAP_IOEVENTFD_NO_LENGTH: case KVM_CAP_PIT2: case KVM_CAP_PIT_STATE2: case KVM_CAP_SET_IDENTITY_MAP_ADDR: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a8f4ee5d2e82..39098a61f41c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -529,7 +529,7 @@ enum { struct kvm_ioeventfd { __u64 datamatch; __u64 addr; /* legal pio/mmio address */ - __u32 len; /* 1, 2, 4, or 8 bytes */ + __u32 len; /* 1, 2, 4, or 8 bytes; or 0 to ignore length */ __s32 fd; __u32 flags; __u8 pad[36]; @@ -743,6 +743,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_IOAPIC_POLARITY_IGNORED 97 #define KVM_CAP_ENABLE_CAP_VM 98 #define KVM_CAP_S390_IRQCHIP 99 +#define KVM_CAP_IOEVENTFD_NO_LENGTH 100 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 29c2a04e036e..2721996bb9c2 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -600,7 +600,15 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val) { u64 _val; - if (!(addr == p->addr && len == p->length)) + if (addr != p->addr) + /* address must be precise for a hit */ + return false; + + if (!p->length) + /* length = 0 means only look at the address, so always a hit */ + return true; + + if (len != p->length) /* address-range must be precise for a hit */ return false; @@ -671,9 +679,11 @@ ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p) list_for_each_entry(_p, &kvm->ioeventfds, list) if (_p->bus_idx == p->bus_idx && - _p->addr == p->addr && _p->length == p->length && - (_p->wildcard || p->wildcard || - _p->datamatch == p->datamatch)) + _p->addr == p->addr && + (!_p->length || !p->length || + (_p->length == p->length && + (_p->wildcard || p->wildcard || + _p->datamatch == p->datamatch)))) return true; return false; @@ -697,8 +707,9 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) int ret; bus_idx = ioeventfd_bus_from_flags(args->flags); - /* must be natural-word sized */ + /* must be natural-word sized, or 0 to ignore length */ switch (args->len) { + case 0: case 1: case 2: case 4: @@ -716,6 +727,12 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK) return -EINVAL; + /* ioeventfd with no length can't be combined with DATAMATCH */ + if (!args->len && + args->flags & (KVM_IOEVENTFD_FLAG_PIO | + KVM_IOEVENTFD_FLAG_DATAMATCH)) + return -EINVAL; + eventfd = eventfd_ctx_fdget(args->fd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd); -- cgit v1.2.3 From 68c3b4d1676d870f0453c31d5a52e7e65c7448ae Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 31 Mar 2014 21:50:44 +0300 Subject: KVM: VMX: speed up wildcard MMIO EVENTFD With KVM, MMIO is much slower than PIO, due to the need to do page walk and emulation. But with EPT, it does not have to be: we know the address from the VMCS so if the address is unique, we can look up the eventfd directly, bypassing emulation. Unfortunately, this only works if userspace does not need to match on access length and data. The implementation adds a separate FAST_MMIO bus internally. This serves two purposes: - minimize overhead for old userspace that does not use eventfd with lengtth = 0 - minimize disruption in other code (since we don't know the length, devices on the MMIO bus only get a valid address in write, this way we don't need to touch all devices to teach them to handle an invalid length) At the moment, this optimization only has effect for EPT on x86. It will be possible to speed up MMIO for NPT and MMU using the same idea in the future. With this patch applied, on VMX MMIO EVENTFD is essentially as fast as PIO. I was unable to detect any measureable slowdown to non-eventfd MMIO. Making MMIO faster is important for the upcoming virtio 1.0 which includes an MMIO signalling capability. The idea was suggested by Peter Anvin. Lots of thanks to Gleb for pre-review and suggestions. Signed-off-by: Michael S. Tsirkin Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 4 ++++ include/linux/kvm_host.h | 1 + include/uapi/linux/kvm.h | 1 + virt/kvm/eventfd.c | 16 ++++++++++++++++ virt/kvm/kvm_main.c | 1 + 5 files changed, 23 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1f68c5831924..eb3f2b1b764c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5528,6 +5528,10 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa_t gpa; gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { + skip_emulated_instruction(vcpu); + return 1; + } ret = handle_mmio_page_fault_common(vcpu, gpa, true); if (likely(ret == RET_MMIO_PF_EMULATE)) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7d21cf9f4380..6c3c2eb96d06 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -163,6 +163,7 @@ enum kvm_bus { KVM_MMIO_BUS, KVM_PIO_BUS, KVM_VIRTIO_CCW_NOTIFY_BUS, + KVM_FAST_MMIO_BUS, KVM_NR_BUSES }; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 39098a61f41c..d8a6ce4c2a83 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -515,6 +515,7 @@ enum { kvm_ioeventfd_flag_nr_pio, kvm_ioeventfd_flag_nr_deassign, kvm_ioeventfd_flag_nr_virtio_ccw_notify, + kvm_ioeventfd_flag_nr_fast_mmio, kvm_ioeventfd_flag_nr_max, }; diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 2721996bb9c2..912ec5a95e2c 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -770,6 +770,16 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) if (ret < 0) goto unlock_fail; + /* When length is ignored, MMIO is also put on a separate bus, for + * faster lookups. + */ + if (!args->len && !(args->flags & KVM_IOEVENTFD_FLAG_PIO)) { + ret = kvm_io_bus_register_dev(kvm, KVM_FAST_MMIO_BUS, + p->addr, 0, &p->dev); + if (ret < 0) + goto register_fail; + } + kvm->buses[bus_idx]->ioeventfd_count++; list_add_tail(&p->list, &kvm->ioeventfds); @@ -777,6 +787,8 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) return 0; +register_fail: + kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); unlock_fail: mutex_unlock(&kvm->slots_lock); @@ -816,6 +828,10 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) continue; kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); + if (!p->length) { + kvm_io_bus_unregister_dev(kvm, KVM_FAST_MMIO_BUS, + &p->dev); + } kvm->buses[bus_idx]->ioeventfd_count--; ioeventfd_release(p); ret = 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 56baae8c2f56..96456ac888ba 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2922,6 +2922,7 @@ static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range, return -EOPNOTSUPP; } +EXPORT_SYMBOL_GPL(kvm_io_bus_write); /* kvm_io_bus_read - called under kvm->slots_lock */ int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, -- cgit v1.2.3 From ddb69f276c4af8bb47ad4f24a72f72ddf58c228a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Mar 2014 15:16:22 +0200 Subject: uprobes/x86: Fold prepare_fixups() into arch_uprobe_analyze_insn() No functional changes, preparation. Shift the code from prepare_fixups() to arch_uprobe_analyze_insn() with the following modifications: - Do not call insn_get_opcode() again, it was already called by validate_insn_bits(). - Move "case 0xea" up. This way "case 0xff" can fall through to default case. - change "case 0xff" to use the nested "switch (MODRM_REG)", this way the code looks a bit simpler. - Make the comments look consistent. While at it, kill the initialization of rip_rela_target_address and ->fixups, we can rely on kzalloc(). We will add the new members into arch_uprobe, it would be better to assume that everything is zero by default. TODO: cleanup/fix the mess in validate_insn_bits() paths: - validate_insn_64bits() and validate_insn_32bits() should be unified. - "ifdef" is not used consistently; if good_insns_64 depends on CONFIG_X86_64, then probably good_insns_32 should depend on CONFIG_X86_32/EMULATION - the usage of mm->context.ia32_compat looks wrong if the task is TIF_X32. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: Jim Keniston Acked-by: Srikar Dronamraju --- arch/x86/kernel/uprobes.c | 110 ++++++++++++++++++++-------------------------- 1 file changed, 47 insertions(+), 63 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 2ed845928b5f..098e56ec7954 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -53,7 +53,7 @@ #define OPCODE1(insn) ((insn)->opcode.bytes[0]) #define OPCODE2(insn) ((insn)->opcode.bytes[1]) #define OPCODE3(insn) ((insn)->opcode.bytes[2]) -#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value) +#define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value) #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ @@ -229,63 +229,6 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn) return -ENOTSUPP; } -/* - * Figure out which fixups arch_uprobe_post_xol() will need to perform, and - * annotate arch_uprobe->fixups accordingly. To start with, - * arch_uprobe->fixups is either zero or it reflects rip-related fixups. - */ -static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) -{ - bool fix_ip = true, fix_call = false; /* defaults */ - int reg; - - insn_get_opcode(insn); /* should be a nop */ - - switch (OPCODE1(insn)) { - case 0x9d: - /* popf */ - auprobe->fixups |= UPROBE_FIX_SETF; - break; - case 0xc3: /* ret/lret */ - case 0xcb: - case 0xc2: - case 0xca: - /* ip is correct */ - fix_ip = false; - break; - case 0xe8: /* call relative - Fix return addr */ - fix_call = true; - break; - case 0x9a: /* call absolute - Fix return addr, not ip */ - fix_call = true; - fix_ip = false; - break; - case 0xff: - insn_get_modrm(insn); - reg = MODRM_REG(insn); - if (reg == 2 || reg == 3) { - /* call or lcall, indirect */ - /* Fix return addr; ip is correct. */ - fix_call = true; - fix_ip = false; - } else if (reg == 4 || reg == 5) { - /* jmp or ljmp, indirect */ - /* ip is correct. */ - fix_ip = false; - } - break; - case 0xea: /* jmp absolute -- ip is correct */ - fix_ip = false; - break; - default: - break; - } - if (fix_ip) - auprobe->fixups |= UPROBE_FIX_IP; - if (fix_call) - auprobe->fixups |= UPROBE_FIX_CALL; -} - #ifdef CONFIG_X86_64 /* * If arch_uprobe->insn doesn't use rip-relative addressing, return @@ -318,7 +261,6 @@ handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct ins if (mm->context.ia32_compat) return; - auprobe->rip_rela_target_address = 0x0; if (!insn_rip_relative(insn)) return; @@ -421,16 +363,58 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, */ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { - int ret; struct insn insn; + bool fix_ip = true, fix_call = false; + int ret; - auprobe->fixups = 0; ret = validate_insn_bits(auprobe, mm, &insn); - if (ret != 0) + if (ret) return ret; + /* + * Figure out which fixups arch_uprobe_post_xol() will need to perform, + * and annotate arch_uprobe->fixups accordingly. To start with, ->fixups + * is either zero or it reflects rip-related fixups. + */ handle_riprel_insn(auprobe, mm, &insn); - prepare_fixups(auprobe, &insn); + + switch (OPCODE1(&insn)) { + case 0x9d: /* popf */ + auprobe->fixups |= UPROBE_FIX_SETF; + break; + case 0xc3: /* ret or lret -- ip is correct */ + case 0xcb: + case 0xc2: + case 0xca: + fix_ip = false; + break; + case 0xe8: /* call relative - Fix return addr */ + fix_call = true; + break; + case 0x9a: /* call absolute - Fix return addr, not ip */ + fix_call = true; + fix_ip = false; + break; + case 0xea: /* jmp absolute -- ip is correct */ + fix_ip = false; + break; + case 0xff: + insn_get_modrm(&insn); + switch (MODRM_REG(&insn)) { + case 2: case 3: /* call or lcall, indirect */ + fix_call = true; + case 4: case 5: /* jmp or ljmp, indirect */ + fix_ip = false; + } + break; + default: + break; + } + + if (fix_ip) + auprobe->fixups |= UPROBE_FIX_IP; + if (fix_call) + auprobe->fixups |= UPROBE_FIX_CALL; return 0; } -- cgit v1.2.3 From 59078d4b96bb548f97d9fb429b929a289e4884d9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Mar 2014 18:09:36 +0200 Subject: uprobes/x86: Kill the "ia32_compat" check in handle_riprel_insn(), remove "mm" arg Kill the "mm->context.ia32_compat" check in handle_riprel_insn(), if it is true insn_rip_relative() must return false. validate_insn_bits() passed "ia32_compat" as !x86_64 to insn_init(), and insn_rip_relative() checks insn->x86_64. Also, remove the no longer needed "struct mm_struct *mm" argument and the unnecessary "return" at the end. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: Jim Keniston Acked-by: Srikar Dronamraju --- arch/x86/kernel/uprobes.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 098e56ec7954..963c121c0307 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -253,14 +253,11 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn) * - The displacement is always 4 bytes. */ static void -handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) { u8 *cursor; u8 reg; - if (mm->context.ia32_compat) - return; - if (!insn_rip_relative(insn)) return; @@ -314,7 +311,6 @@ handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct ins cursor++; memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes); } - return; } static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn) @@ -343,7 +339,7 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, return validate_insn_64bits(auprobe, insn); } #else /* 32-bit: */ -static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +static void handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) { /* No RIP-relative addressing on 32-bit */ } @@ -376,7 +372,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, * and annotate arch_uprobe->fixups accordingly. To start with, ->fixups * is either zero or it reflects rip-related fixups. */ - handle_riprel_insn(auprobe, mm, &insn); + handle_riprel_insn(auprobe, &insn); switch (OPCODE1(&insn)) { case 0x9d: /* popf */ -- cgit v1.2.3 From d20737c07a1063d681fe9fb86f3da369da1edab7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Mar 2014 18:35:09 +0200 Subject: uprobes/x86: Gather "riprel" functions together Cosmetic. Move pre_xol_rip_insn() and handle_riprel_post_xol() up to the closely related handle_riprel_insn(). This way it is simpler to read and understand this code, and this lessens the number of ifdef's. While at it, update the comment in handle_riprel_post_xol() as Jim suggested. TODO: rename them somehow to make the naming consistent. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 118 +++++++++++++++++++++------------------------- 1 file changed, 53 insertions(+), 65 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 963c121c0307..c52c30fa7871 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -313,6 +313,48 @@ handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) } } +/* + * If we're emulating a rip-relative instruction, save the contents + * of the scratch register and store the target address in that register. + */ +static void +pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, + struct arch_uprobe_task *autask) +{ + if (auprobe->fixups & UPROBE_FIX_RIP_AX) { + autask->saved_scratch_register = regs->ax; + regs->ax = current->utask->vaddr; + regs->ax += auprobe->rip_rela_target_address; + } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { + autask->saved_scratch_register = regs->cx; + regs->cx = current->utask->vaddr; + regs->cx += auprobe->rip_rela_target_address; + } +} + +static void +handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) +{ + if (auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) { + struct arch_uprobe_task *autask; + + autask = ¤t->utask->autask; + if (auprobe->fixups & UPROBE_FIX_RIP_AX) + regs->ax = autask->saved_scratch_register; + else + regs->cx = autask->saved_scratch_register; + + /* + * The original instruction includes a displacement, and so + * is 4 bytes longer than what we've just single-stepped. + * Caller may need to apply other fixups to handle stuff + * like "jmpq *...(%rip)" and "callq *...(%rip)". + */ + if (correction) + *correction += 4; + } +} + static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn) { insn_init(insn, auprobe->insn, true); @@ -339,9 +381,19 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, return validate_insn_64bits(auprobe, insn); } #else /* 32-bit: */ +/* + * No RIP-relative addressing on 32-bit + */ static void handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) { - /* No RIP-relative addressing on 32-bit */ +} +static void pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, + struct arch_uprobe_task *autask) +{ +} +static void handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, + long *correction) +{ } static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) @@ -415,34 +467,6 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, return 0; } -#ifdef CONFIG_X86_64 -/* - * If we're emulating a rip-relative instruction, save the contents - * of the scratch register and store the target address in that register. - */ -static void -pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, - struct arch_uprobe_task *autask) -{ - if (auprobe->fixups & UPROBE_FIX_RIP_AX) { - autask->saved_scratch_register = regs->ax; - regs->ax = current->utask->vaddr; - regs->ax += auprobe->rip_rela_target_address; - } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { - autask->saved_scratch_register = regs->cx; - regs->cx = current->utask->vaddr; - regs->cx += auprobe->rip_rela_target_address; - } -} -#else -static void -pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, - struct arch_uprobe_task *autask) -{ - /* No RIP-relative addressing on 32-bit */ -} -#endif - /* * arch_uprobe_pre_xol - prepare to execute out of line. * @auprobe: the probepoint information. @@ -492,42 +516,6 @@ static int adjust_ret_addr(unsigned long sp, long correction) return 0; } -#ifdef CONFIG_X86_64 -static bool is_riprel_insn(struct arch_uprobe *auprobe) -{ - return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0); -} - -static void -handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) -{ - if (is_riprel_insn(auprobe)) { - struct arch_uprobe_task *autask; - - autask = ¤t->utask->autask; - if (auprobe->fixups & UPROBE_FIX_RIP_AX) - regs->ax = autask->saved_scratch_register; - else - regs->cx = autask->saved_scratch_register; - - /* - * The original instruction includes a displacement, and so - * is 4 bytes longer than what we've just single-stepped. - * Fall through to handle stuff like "jmpq *...(%rip)" and - * "callq *...(%rip)". - */ - if (correction) - *correction += 4; - } -} -#else -static void -handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) -{ - /* No RIP-relative addressing on 32-bit */ -} -#endif - /* * If xol insn itself traps and generates a signal(Say, * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped -- cgit v1.2.3 From 34e7317d6ae8f6111ac449444f22e14f4a14ebfd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Mar 2014 19:38:09 +0200 Subject: uprobes/x86: move the UPROBE_FIX_{RIP,IP,CALL} code at the end of pre/post hooks No functional changes. Preparation to simplify the review of the next change. Just reorder the code in arch_uprobe_pre/post_xol() functions so that UPROBE_FIX_{RIP_*,IP,CALL} logic goes to the end. Also change arch_uprobe_pre_xol() to use utask instead of autask, to make the code more symmetrical with arch_uprobe_post_xol(). Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: Jim Keniston Acked-by: Srikar Dronamraju --- arch/x86/kernel/uprobes.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index c52c30fa7871..3bb4198aa588 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -474,19 +474,18 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, */ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - struct arch_uprobe_task *autask; + struct uprobe_task *utask = current->utask; - autask = ¤t->utask->autask; - autask->saved_trap_nr = current->thread.trap_nr; + regs->ip = utask->xol_vaddr; + utask->autask.saved_trap_nr = current->thread.trap_nr; current->thread.trap_nr = UPROBE_TRAP_NR; - regs->ip = current->utask->xol_vaddr; - pre_xol_rip_insn(auprobe, regs, autask); - autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); + utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF); regs->flags |= X86_EFLAGS_TF; if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) set_task_blockstep(current, false); + pre_xol_rip_insn(auprobe, regs, &utask->autask); return 0; } @@ -560,22 +559,13 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) */ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - struct uprobe_task *utask; + struct uprobe_task *utask = current->utask; long correction; int result = 0; WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); - utask = current->utask; current->thread.trap_nr = utask->autask.saved_trap_nr; - correction = (long)(utask->vaddr - utask->xol_vaddr); - handle_riprel_post_xol(auprobe, regs, &correction); - if (auprobe->fixups & UPROBE_FIX_IP) - regs->ip += correction; - - if (auprobe->fixups & UPROBE_FIX_CALL) - result = adjust_ret_addr(regs->sp, correction); - /* * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP * so we can get an extra SIGTRAP if we do not clear TF. We need @@ -586,6 +576,14 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) else if (!(auprobe->fixups & UPROBE_FIX_SETF)) regs->flags &= ~X86_EFLAGS_TF; + correction = (long)(utask->vaddr - utask->xol_vaddr); + handle_riprel_post_xol(auprobe, regs, &correction); + if (auprobe->fixups & UPROBE_FIX_IP) + regs->ip += correction; + + if (auprobe->fixups & UPROBE_FIX_CALL) + result = adjust_ret_addr(regs->sp, correction); + return result; } -- cgit v1.2.3 From 8ad8e9d3fd64f101eed6652964670672d699e563 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Mar 2014 21:01:31 +0200 Subject: uprobes/x86: Introduce uprobe_xol_ops and arch_uprobe->ops Introduce arch_uprobe->ops pointing to the "struct uprobe_xol_ops", move the current UPROBE_FIX_{RIP*,IP,CALL} code into the default set of methods and change arch_uprobe_pre/post_xol() accordingly. This way we can add the new uprobe_xol_ops's to handle the insns which need the special processing (rip-relative jmp/call at least). Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston Reviewed-by: Masami Hiramatsu --- arch/x86/include/asm/uprobes.h | 7 ++- arch/x86/kernel/uprobes.c | 107 ++++++++++++++++++++++++++--------------- 2 files changed, 74 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 3087ea9c5f2e..9f8210bcbb49 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -33,12 +33,17 @@ typedef u8 uprobe_opcode_t; #define UPROBE_SWBP_INSN 0xcc #define UPROBE_SWBP_INSN_SIZE 1 +struct uprobe_xol_ops; + struct arch_uprobe { - u16 fixups; union { u8 insn[MAX_UINSN_BYTES]; u8 ixol[MAX_UINSN_BYTES]; }; + + u16 fixups; + const struct uprobe_xol_ops *ops; + #ifdef CONFIG_X86_64 unsigned long rip_rela_target_address; #endif diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 3bb4198aa588..13ad8a38c2d9 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -402,6 +402,64 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, } #endif /* CONFIG_X86_64 */ +struct uprobe_xol_ops { + bool (*emulate)(struct arch_uprobe *, struct pt_regs *); + int (*pre_xol)(struct arch_uprobe *, struct pt_regs *); + int (*post_xol)(struct arch_uprobe *, struct pt_regs *); +}; + +static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + pre_xol_rip_insn(auprobe, regs, ¤t->utask->autask); + return 0; +} + +/* + * Adjust the return address pushed by a call insn executed out of line. + */ +static int adjust_ret_addr(unsigned long sp, long correction) +{ + int rasize, ncopied; + long ra = 0; + + if (is_ia32_task()) + rasize = 4; + else + rasize = 8; + + ncopied = copy_from_user(&ra, (void __user *)sp, rasize); + if (unlikely(ncopied)) + return -EFAULT; + + ra += correction; + ncopied = copy_to_user((void __user *)sp, &ra, rasize); + if (unlikely(ncopied)) + return -EFAULT; + + return 0; +} + +static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + long correction = (long)(utask->vaddr - utask->xol_vaddr); + int ret = 0; + + handle_riprel_post_xol(auprobe, regs, &correction); + if (auprobe->fixups & UPROBE_FIX_IP) + regs->ip += correction; + + if (auprobe->fixups & UPROBE_FIX_CALL) + ret = adjust_ret_addr(regs->sp, correction); + + return ret; +} + +static struct uprobe_xol_ops default_xol_ops = { + .pre_xol = default_pre_xol_op, + .post_xol = default_post_xol_op, +}; + /** * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. * @mm: the probed address space. @@ -464,6 +522,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, if (fix_call) auprobe->fixups |= UPROBE_FIX_CALL; + auprobe->ops = &default_xol_ops; return 0; } @@ -485,33 +544,8 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) set_task_blockstep(current, false); - pre_xol_rip_insn(auprobe, regs, &utask->autask); - return 0; -} - -/* - * This function is called by arch_uprobe_post_xol() to adjust the return - * address pushed by a call instruction executed out of line. - */ -static int adjust_ret_addr(unsigned long sp, long correction) -{ - int rasize, ncopied; - long ra = 0; - - if (is_ia32_task()) - rasize = 4; - else - rasize = 8; - - ncopied = copy_from_user(&ra, (void __user *)sp, rasize); - if (unlikely(ncopied)) - return -EFAULT; - - ra += correction; - ncopied = copy_to_user((void __user *)sp, &ra, rasize); - if (unlikely(ncopied)) - return -EFAULT; - + if (auprobe->ops->pre_xol) + return auprobe->ops->pre_xol(auprobe, regs); return 0; } @@ -560,11 +594,8 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; - long correction; - int result = 0; WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); - current->thread.trap_nr = utask->autask.saved_trap_nr; /* * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP @@ -576,15 +607,9 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) else if (!(auprobe->fixups & UPROBE_FIX_SETF)) regs->flags &= ~X86_EFLAGS_TF; - correction = (long)(utask->vaddr - utask->xol_vaddr); - handle_riprel_post_xol(auprobe, regs, &correction); - if (auprobe->fixups & UPROBE_FIX_IP) - regs->ip += correction; - - if (auprobe->fixups & UPROBE_FIX_CALL) - result = adjust_ret_addr(regs->sp, correction); - - return result; + if (auprobe->ops->post_xol) + return auprobe->ops->post_xol(auprobe, regs); + return 0; } /* callback routine for handling exceptions. */ @@ -642,6 +667,10 @@ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { int i; + if (auprobe->ops->emulate) + return auprobe->ops->emulate(auprobe, regs); + + /* TODO: move this code into ->emulate() hook */ for (i = 0; i < MAX_UINSN_BYTES; i++) { if (auprobe->insn[i] == 0x66) continue; -- cgit v1.2.3 From e55848a4f8ee52465771983e144f0c3337776eda Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Mar 2014 17:24:14 +0200 Subject: uprobes/x86: Conditionalize the usage of handle_riprel_insn() arch_uprobe_analyze_insn() calls handle_riprel_insn() at the start, but only "0xff" and "default" cases need the UPROBE_FIX_RIP_ logic. Move the callsite into "default" case and change the "0xff" case to fall-through. We are going to add the various hooks to handle the rip-relative jmp/call instructions (and more), we need this change to enforce the fact that the new code can not conflict with is_riprel_insn() logic which, after this change, can only be used by default_xol_ops. Note: arch_uprobe_abort_xol() still calls handle_riprel_post_xol() directly. This is fine unless another _xol_ops we may add later will need to reuse "UPROBE_FIX_RIP_AX|UPROBE_FIX_RIP_CX" bits in ->fixup. In this case we can add uprobe_xol_ops->abort() hook, which (perhaps) we will need anyway in the long term. Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston Reviewed-by: Masami Hiramatsu --- arch/x86/kernel/uprobes.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 13ad8a38c2d9..08cdb82815fe 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -482,8 +482,6 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, * and annotate arch_uprobe->fixups accordingly. To start with, ->fixups * is either zero or it reflects rip-related fixups. */ - handle_riprel_insn(auprobe, &insn); - switch (OPCODE1(&insn)) { case 0x9d: /* popf */ auprobe->fixups |= UPROBE_FIX_SETF; @@ -512,9 +510,9 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, case 4: case 5: /* jmp or ljmp, indirect */ fix_ip = false; } - break; + /* fall through */ default: - break; + handle_riprel_insn(auprobe, &insn); } if (fix_ip) -- cgit v1.2.3 From 014940bad8e46ca7bd0483f760f9cba60088a3d4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Apr 2014 20:20:10 +0200 Subject: uprobes/x86: Send SIGILL if arch_uprobe_post_xol() fails Currently the error from arch_uprobe_post_xol() is silently ignored. This doesn't look good and this can lead to the hard-to-debug problems. 1. Change handle_singlestep() to loudly complain and send SIGILL. Note: this only affects x86, ppc/arm can't fail. 2. Change arch_uprobe_post_xol() to call arch_uprobe_abort_xol() and avoid TF games if it is going to return an error. This can help to to analyze the problem, if nothing else we should not report ->ip = xol_slot in the core-file. Note: this means that handle_riprel_post_xol() can be called twice, but this is fine because it is idempotent. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 16 ++++++++++++---- kernel/events/uprobes.c | 8 +++++++- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 08cdb82815fe..e72903eacd43 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -594,6 +594,15 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) struct uprobe_task *utask = current->utask; WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); + + if (auprobe->ops->post_xol) { + int err = auprobe->ops->post_xol(auprobe, regs); + if (err) { + arch_uprobe_abort_xol(auprobe, regs); + return err; + } + } + current->thread.trap_nr = utask->autask.saved_trap_nr; /* * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP @@ -605,8 +614,6 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) else if (!(auprobe->fixups & UPROBE_FIX_SETF)) regs->flags &= ~X86_EFLAGS_TF; - if (auprobe->ops->post_xol) - return auprobe->ops->post_xol(auprobe, regs); return 0; } @@ -641,8 +648,9 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, /* * This function gets called when XOL instruction either gets trapped or - * the thread has a fatal signal, so reset the instruction pointer to its - * probed address. + * the thread has a fatal signal, or if arch_uprobe_post_xol() failed. + * Reset the instruction pointer to its probed address for the potential + * restart or for post mortem analysis. */ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ea2a7c0728bb..d1edc5e6fd03 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1867,10 +1867,11 @@ out: static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) { struct uprobe *uprobe; + int err = 0; uprobe = utask->active_uprobe; if (utask->state == UTASK_SSTEP_ACK) - arch_uprobe_post_xol(&uprobe->arch, regs); + err = arch_uprobe_post_xol(&uprobe->arch, regs); else if (utask->state == UTASK_SSTEP_TRAPPED) arch_uprobe_abort_xol(&uprobe->arch, regs); else @@ -1884,6 +1885,11 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) spin_lock_irq(¤t->sighand->siglock); recalc_sigpending(); /* see uprobe_deny_signal() */ spin_unlock_irq(¤t->sighand->siglock); + + if (unlikely(err)) { + uprobe_warn(current, "execute the probed insn, sending SIGILL."); + force_sig_info(SIGILL, SEND_SIG_FORCED, current); + } } /* -- cgit v1.2.3 From 75f9ef0b7f1aae33b7be7ba8d9c23c8cb48c2212 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Apr 2014 20:52:19 +0200 Subject: uprobes/x86: Teach arch_uprobe_post_xol() to restart if possible SIGILL after the failed arch_uprobe_post_xol() should only be used as a last resort, we should try to restart the probed insn if possible. Currently only adjust_ret_addr() can fail, and this can only happen if another thread unmapped our stack after we executed "call" out-of-line. Most probably the application if buggy, but even in this case it can have a handler for SIGSEGV/etc. And in theory it can be even correct and do something non-trivial with its memory. Of course we can't restart unconditionally, so arch_uprobe_post_xol() does this only if ->post_xol() returns -ERESTART even if currently this is the only possible error. default_post_xol_op(UPROBE_FIX_CALL) can always restart, but as Jim pointed out it should not forget to pop off the return address pushed by this insn executed out-of-line. Note: this is not "perfect", we do not want the extra handler_chain() after restart, but I think this is the best solution we can realistically do without too much uglifications. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index e72903eacd43..cdd6909affcb 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -443,16 +443,22 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs { struct uprobe_task *utask = current->utask; long correction = (long)(utask->vaddr - utask->xol_vaddr); - int ret = 0; handle_riprel_post_xol(auprobe, regs, &correction); if (auprobe->fixups & UPROBE_FIX_IP) regs->ip += correction; - if (auprobe->fixups & UPROBE_FIX_CALL) - ret = adjust_ret_addr(regs->sp, correction); + if (auprobe->fixups & UPROBE_FIX_CALL) { + if (adjust_ret_addr(regs->sp, correction)) { + if (is_ia32_task()) + regs->sp += 4; + else + regs->sp += 8; + return -ERESTART; + } + } - return ret; + return 0; } static struct uprobe_xol_ops default_xol_ops = { @@ -599,6 +605,12 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) int err = auprobe->ops->post_xol(auprobe, regs); if (err) { arch_uprobe_abort_xol(auprobe, regs); + /* + * Restart the probed insn. ->post_xol() must ensure + * this is really possible if it returns -ERESTART. + */ + if (err == -ERESTART) + return 0; return err; } } -- cgit v1.2.3 From 8faaed1b9f500d6cf32702716733a645c9b0727a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 6 Apr 2014 17:16:10 +0200 Subject: uprobes/x86: Introduce sizeof_long(), cleanup adjust_ret_addr() and arch_uretprobe_hijack_return_addr() 1. Add the trivial sizeof_long() helper and change other callers of is_ia32_task() to use it. TODO: is_ia32_task() is not what we actually want, TS_COMPAT does not necessarily mean 32bit. Fortunately syscall-like insns can't be probed so it actually works, but it would be better to rename and use is_ia32_frame(). 2. As Jim pointed out "ncopied" in arch_uretprobe_hijack_return_addr() and adjust_ret_addr() should be named "nleft". And in fact only the last copy_to_user() in arch_uretprobe_hijack_return_addr() actually needs to inspect the non-zero error code. TODO: adjust_ret_addr() should die. We can always calculate the value we need to write into *regs->sp, just UPROBE_FIX_CALL should record insn->length. Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index cdd6909affcb..aecc22054384 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -408,6 +408,11 @@ struct uprobe_xol_ops { int (*post_xol)(struct arch_uprobe *, struct pt_regs *); }; +static inline int sizeof_long(void) +{ + return is_ia32_task() ? 4 : 8; +} + static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { pre_xol_rip_insn(auprobe, regs, ¤t->utask->autask); @@ -419,21 +424,14 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) */ static int adjust_ret_addr(unsigned long sp, long correction) { - int rasize, ncopied; - long ra = 0; - - if (is_ia32_task()) - rasize = 4; - else - rasize = 8; + int rasize = sizeof_long(); + long ra; - ncopied = copy_from_user(&ra, (void __user *)sp, rasize); - if (unlikely(ncopied)) + if (copy_from_user(&ra, (void __user *)sp, rasize)) return -EFAULT; ra += correction; - ncopied = copy_to_user((void __user *)sp, &ra, rasize); - if (unlikely(ncopied)) + if (copy_to_user((void __user *)sp, &ra, rasize)) return -EFAULT; return 0; @@ -450,10 +448,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs if (auprobe->fixups & UPROBE_FIX_CALL) { if (adjust_ret_addr(regs->sp, correction)) { - if (is_ia32_task()) - regs->sp += 4; - else - regs->sp += 8; + regs->sp += sizeof_long(); return -ERESTART; } } @@ -714,23 +709,21 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) { - int rasize, ncopied; + int rasize = sizeof_long(), nleft; unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */ - rasize = is_ia32_task() ? 4 : 8; - ncopied = copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize); - if (unlikely(ncopied)) + if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize)) return -1; /* check whether address has been already hijacked */ if (orig_ret_vaddr == trampoline_vaddr) return orig_ret_vaddr; - ncopied = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); - if (likely(!ncopied)) + nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); + if (likely(!nleft)) return orig_ret_vaddr; - if (ncopied != rasize) { + if (nleft != rasize) { pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, " "%%ip=%#lx\n", current->pid, regs->sp, regs->ip); -- cgit v1.2.3 From 7ba6db2d688bdf83049a18c8e55b2d1e58e8b0bc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 5 Apr 2014 20:05:02 +0200 Subject: uprobes/x86: Emulate unconditional relative jmp's Currently we always execute all insns out-of-line, including relative jmp's and call's. This assumes that even if regs->ip points to nowhere after the single-step, default_post_xol_op(UPROBE_FIX_IP) logic will update it correctly. However, this doesn't work if this regs->ip == xol_vaddr + insn_offset is not canonical. In this case CPU generates #GP and general_protection() kills the task which tries to execute this insn out-of-line. Now that we have uprobe_xol_ops we can teach uprobes to emulate these insns and solve the problem. This patch adds branch_xol_ops which has a single branch_emulate_op() hook, so far it can only handle rel8/32 relative jmp's. TODO: move ->fixup into the union along with rip_rela_target_address. Signed-off-by: Oleg Nesterov Reported-by: Jonathan Lebon Reviewed-by: Jim Keniston --- arch/x86/include/asm/uprobes.h | 8 +++++++- arch/x86/kernel/uprobes.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 9f8210bcbb49..e9fd4d5537ed 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -44,9 +44,15 @@ struct arch_uprobe { u16 fixups; const struct uprobe_xol_ops *ops; + union { #ifdef CONFIG_X86_64 - unsigned long rip_rela_target_address; + unsigned long rip_rela_target_address; #endif + struct { + s32 offs; + u8 ilen; + } branch; + }; }; struct arch_uprobe_task { diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index aecc22054384..c3baeaacf1b6 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -461,6 +461,40 @@ static struct uprobe_xol_ops default_xol_ops = { .post_xol = default_post_xol_op, }; +static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + regs->ip += auprobe->branch.ilen + auprobe->branch.offs; + return true; +} + +static struct uprobe_xol_ops branch_xol_ops = { + .emulate = branch_emulate_op, +}; + +/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */ +static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) +{ + + switch (OPCODE1(insn)) { + case 0xeb: /* jmp 8 */ + case 0xe9: /* jmp 32 */ + break; + default: + return -ENOSYS; + } + + /* has the side-effect of processing the entire instruction */ + insn_get_length(insn); + if (WARN_ON_ONCE(!insn_complete(insn))) + return -ENOEXEC; + + auprobe->branch.ilen = insn->length; + auprobe->branch.offs = insn->immediate.value; + + auprobe->ops = &branch_xol_ops; + return 0; +} + /** * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. * @mm: the probed address space. @@ -478,6 +512,10 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, if (ret) return ret; + ret = branch_setup_xol_ops(auprobe, &insn); + if (ret != -ENOSYS) + return ret; + /* * Figure out which fixups arch_uprobe_post_xol() will need to perform, * and annotate arch_uprobe->fixups accordingly. To start with, ->fixups -- cgit v1.2.3 From d241006354c550c7d22f304e2fdf90137fb8eaab Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 5 Apr 2014 21:06:10 +0200 Subject: uprobes/x86: Emulate nop's using ops->emulate() Finally we can kill the ugly (and very limited) code in __skip_sstep(). Just change branch_setup_xol_ops() to treat "nop" as jmp to the next insn. Thanks to lib/insn.c, it is clever enough. OPCODE1() == 0x90 includes "(rep;)+ nop;" at least, and (afaics) much more. Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index c3baeaacf1b6..f3c4212f3819 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -478,6 +478,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) switch (OPCODE1(insn)) { case 0xeb: /* jmp 8 */ case 0xe9: /* jmp 32 */ + case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ break; default: return -ENOSYS; @@ -710,29 +711,10 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) regs->flags &= ~X86_EFLAGS_TF; } -/* - * Skip these instructions as per the currently known x86 ISA. - * rep=0x66*; nop=0x90 - */ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { - int i; - if (auprobe->ops->emulate) return auprobe->ops->emulate(auprobe, regs); - - /* TODO: move this code into ->emulate() hook */ - for (i = 0; i < MAX_UINSN_BYTES; i++) { - if (auprobe->insn[i] == 0x66) - continue; - - if (auprobe->insn[i] == 0x90) { - regs->ip += i + 1; - return true; - } - - break; - } return false; } -- cgit v1.2.3 From 8e89c0be171b1a9ed2ba67168733ca811bb45d5c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 6 Apr 2014 18:11:02 +0200 Subject: uprobes/x86: Emulate relative call's See the previous "Emulate unconditional relative jmp's" which explains why we can not execute "jmp" out-of-line, the same applies to "call". Emulating of rip-relative call is trivial, we only need to additionally push the ret-address. If this fails, we execute this instruction out of line and this should trigger the trap, the probed application should die or the same insn will be restarted if a signal handler expands the stack. We do not even need ->post_xol() for this case. But there is a corner (and almost theoretical) case: another thread can expand the stack right before we execute this insn out of line. In this case it hit the same problem we are trying to solve. So we simply turn the probed insn into "call 1f; 1:" and add ->post_xol() which restores ->sp and restarts. Many thanks to Jonathan who finally found the standalone reproducer, otherwise I would never resolve the "random SIGSEGV's under systemtap" bug-report. Now that the problem is clear we can write the simplified test-case: void probe_func(void), callee(void); int failed = 1; asm ( ".text\n" ".align 4096\n" ".globl probe_func\n" "probe_func:\n" "call callee\n" "ret" ); /* * This assumes that: * * - &probe_func = 0x401000 + a_bit, aligned = 0x402000 * * - xol_vma->vm_start = TASK_SIZE_MAX - PAGE_SIZE = 0x7fffffffe000 * as xol_add_vma() asks; the 1st slot = 0x7fffffffe080 * * so we can target the non-canonical address from xol_vma using * the simple math below, 100 * 4096 is just the random offset */ asm (".org . + 0x800000000000 - 0x7fffffffe080 - 5 - 1 + 100 * 4096\n"); void callee(void) { failed = 0; } int main(void) { probe_func(); return failed; } It SIGSEGV's if you probe "probe_func" (although this is not very reliable, randomize_va_space/etc can change the placement of xol area). Note: as Denys Vlasenko pointed out, amd and intel treat "callw" (0x66 0xe8) differently. This patch relies on lib/insn.c and thus implements the intel's behaviour: 0x66 is simply ignored. Fortunately nothing sane should ever use this insn, so we postpone the fix until we decide what should we do; emulate or not, support or not, etc. Reported-by: Jonathan Lebon Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston --- arch/x86/include/asm/uprobes.h | 1 + arch/x86/kernel/uprobes.c | 80 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 71 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index e9fd4d5537ed..93bee7b93854 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -51,6 +51,7 @@ struct arch_uprobe { struct { s32 offs; u8 ilen; + u8 opc1; } branch; }; }; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index f3c4212f3819..0914435001f5 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -461,34 +461,97 @@ static struct uprobe_xol_ops default_xol_ops = { .post_xol = default_post_xol_op, }; +static bool branch_is_call(struct arch_uprobe *auprobe) +{ + return auprobe->branch.opc1 == 0xe8; +} + static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { - regs->ip += auprobe->branch.ilen + auprobe->branch.offs; + unsigned long new_ip = regs->ip += auprobe->branch.ilen; + + if (branch_is_call(auprobe)) { + unsigned long new_sp = regs->sp - sizeof_long(); + /* + * If it fails we execute this (mangled, see the comment in + * branch_clear_offset) insn out-of-line. In the likely case + * this should trigger the trap, and the probed application + * should die or restart the same insn after it handles the + * signal, arch_uprobe_post_xol() won't be even called. + * + * But there is corner case, see the comment in ->post_xol(). + */ + if (copy_to_user((void __user *)new_sp, &new_ip, sizeof_long())) + return false; + regs->sp = new_sp; + } + + regs->ip = new_ip + auprobe->branch.offs; return true; } +static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + BUG_ON(!branch_is_call(auprobe)); + /* + * We can only get here if branch_emulate_op() failed to push the ret + * address _and_ another thread expanded our stack before the (mangled) + * "call" insn was executed out-of-line. Just restore ->sp and restart. + * We could also restore ->ip and try to call branch_emulate_op() again. + */ + regs->sp += sizeof_long(); + return -ERESTART; +} + +static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn) +{ + /* + * Turn this insn into "call 1f; 1:", this is what we will execute + * out-of-line if ->emulate() fails. We only need this to generate + * a trap, so that the probed task receives the correct signal with + * the properly filled siginfo. + * + * But see the comment in ->post_xol(), in the unlikely case it can + * succeed. So we need to ensure that the new ->ip can not fall into + * the non-canonical area and trigger #GP. + * + * We could turn it into (say) "pushf", but then we would need to + * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte + * of ->insn[] for set_orig_insn(). + */ + memset(auprobe->insn + insn_offset_immediate(insn), + 0, insn->immediate.nbytes); +} + static struct uprobe_xol_ops branch_xol_ops = { .emulate = branch_emulate_op, + .post_xol = branch_post_xol_op, }; /* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) { + u8 opc1 = OPCODE1(insn); + + /* has the side-effect of processing the entire instruction */ + insn_get_length(insn); + if (WARN_ON_ONCE(!insn_complete(insn))) + return -ENOEXEC; - switch (OPCODE1(insn)) { + switch (opc1) { case 0xeb: /* jmp 8 */ case 0xe9: /* jmp 32 */ case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ break; + + case 0xe8: /* call relative */ + branch_clear_offset(auprobe, insn); + break; default: return -ENOSYS; } - /* has the side-effect of processing the entire instruction */ - insn_get_length(insn); - if (WARN_ON_ONCE(!insn_complete(insn))) - return -ENOEXEC; - + auprobe->branch.opc1 = opc1; auprobe->branch.ilen = insn->length; auprobe->branch.offs = insn->immediate.value; @@ -532,9 +595,6 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, case 0xca: fix_ip = false; break; - case 0xe8: /* call relative - Fix return addr */ - fix_call = true; - break; case 0x9a: /* call absolute - Fix return addr, not ip */ fix_call = true; fix_ip = false; -- cgit v1.2.3 From 8f95505bc18a026ef7d3dfdbce4e5b31b3e4fc1b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 6 Apr 2014 21:53:47 +0200 Subject: uprobes/x86: Emulate relative conditional "short" jmp's Teach branch_emulate_op() to emulate the conditional "short" jmp's which check regs->flags. Note: this doesn't support jcxz/jcexz, loope/loopz, and loopne/loopnz. They all are rel8 and thus they can't trigger the problem, but perhaps we will add the support in future just for completeness. Reported-by: Jonathan Lebon Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 57 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 0914435001f5..0460d04f0acc 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -466,9 +466,58 @@ static bool branch_is_call(struct arch_uprobe *auprobe) return auprobe->branch.opc1 == 0xe8; } +#define CASE_COND \ + COND(70, 71, XF(OF)) \ + COND(72, 73, XF(CF)) \ + COND(74, 75, XF(ZF)) \ + COND(78, 79, XF(SF)) \ + COND(7a, 7b, XF(PF)) \ + COND(76, 77, XF(CF) || XF(ZF)) \ + COND(7c, 7d, XF(SF) != XF(OF)) \ + COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF)) + +#define COND(op_y, op_n, expr) \ + case 0x ## op_y: DO((expr) != 0) \ + case 0x ## op_n: DO((expr) == 0) + +#define XF(xf) (!!(flags & X86_EFLAGS_ ## xf)) + +static bool is_cond_jmp_opcode(u8 opcode) +{ + switch (opcode) { + #define DO(expr) \ + return true; + CASE_COND + #undef DO + + default: + return false; + } +} + +static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + unsigned long flags = regs->flags; + + switch (auprobe->branch.opc1) { + #define DO(expr) \ + return expr; + CASE_COND + #undef DO + + default: /* not a conditional jmp */ + return true; + } +} + +#undef XF +#undef COND +#undef CASE_COND + static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { unsigned long new_ip = regs->ip += auprobe->branch.ilen; + unsigned long offs = (long)auprobe->branch.offs; if (branch_is_call(auprobe)) { unsigned long new_sp = regs->sp - sizeof_long(); @@ -484,9 +533,11 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) if (copy_to_user((void __user *)new_sp, &new_ip, sizeof_long())) return false; regs->sp = new_sp; + } else if (!check_jmp_cond(auprobe, regs)) { + offs = 0; } - regs->ip = new_ip + auprobe->branch.offs; + regs->ip = new_ip + offs; return true; } @@ -547,8 +598,10 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) case 0xe8: /* call relative */ branch_clear_offset(auprobe, insn); break; + default: - return -ENOSYS; + if (!is_cond_jmp_opcode(opc1)) + return -ENOSYS; } auprobe->branch.opc1 = opc1; -- cgit v1.2.3 From 6cc5e7ff2c38641060f20786a5caf2815edbca5f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 16:22:58 +0200 Subject: uprobes/x86: Emulate relative conditional "near" jmp's Change branch_setup_xol_ops() to simply use opc1 = OPCODE2(insn) - 0x10 if OPCODE1() == 0x0f; this matches the "short" jmp which checks the same condition. Thanks to lib/insn.c, it does the rest correctly. branch->ilen/offs are correct no matter if this jmp is "near" or "short". Reported-by: Jonathan Lebon Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 0460d04f0acc..ace22916ade3 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -599,6 +599,14 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) branch_clear_offset(auprobe, insn); break; + case 0x0f: + if (insn->opcode.nbytes != 2) + return -ENOSYS; + /* + * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches + * OPCODE1() of the "short" jmp which checks the same condition. + */ + opc1 = OPCODE2(insn) - 0x10; default: if (!is_cond_jmp_opcode(opc1)) return -ENOSYS; -- cgit v1.2.3 From 4a3dc121d3c370625575247bf714db3f601d83e9 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 18 Mar 2014 16:56:43 +0800 Subject: perf/x86: Export perf_assign_events() export perf_assign_events to allow building perf Intel uncore driver as module Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1395133004-23205-3-git-send-email-zheng.z.yan@intel.com Cc: Arnaldo Carvalho de Melo Cc: eranian@google.com Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ae407f7226c8..89f3b7c1af20 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -721,6 +721,7 @@ int perf_assign_events(struct perf_event **events, int n, return sched.state.unassigned; } +EXPORT_SYMBOL_GPL(perf_assign_events); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { -- cgit v1.2.3 From d00a569284b1340c16fe2c148099e077ea09ebc9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Mar 2014 19:00:35 +0100 Subject: arch,x86: Convert smp_mb__*() x86 is strongly ordered and all its atomic ops imply a full barrier. Implement the two new primitives as the old ones were. Signed-off-by: Peter Zijlstra Acked-by: Paul E. McKenney Link: http://lkml.kernel.org/n/tip-knswsr5mldkr0w1lrdxvc81w@git.kernel.org Cc: Dave Jones Cc: Jesse Brandeburg Cc: Linus Torvalds Cc: Michel Lespinasse Cc: Will Deacon Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic.h | 7 +------ arch/x86/include/asm/barrier.h | 4 ++++ arch/x86/include/asm/bitops.h | 6 ++---- arch/x86/include/asm/sync_bitops.h | 2 +- arch/x86/kernel/apic/hw_nmi.c | 2 +- 5 files changed, 9 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index b17f4f48ecd7..6dd1c7dd0473 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -7,6 +7,7 @@ #include #include #include +#include /* * Atomic operations that C can't guarantee us. Useful for @@ -243,12 +244,6 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2) : : "r" ((unsigned)(mask)), "m" (*(addr)) \ : "memory") -/* Atomic operations are already serializing on x86 */ -#define smp_mb__before_atomic_dec() barrier() -#define smp_mb__after_atomic_dec() barrier() -#define smp_mb__before_atomic_inc() barrier() -#define smp_mb__after_atomic_inc() barrier() - #ifdef CONFIG_X86_32 # include #else diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 69bbb4845020..5c7198cca5ed 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -137,6 +137,10 @@ do { \ #endif +/* Atomic operations are already serializing on x86 */ +#define smp_mb__before_atomic() barrier() +#define smp_mb__after_atomic() barrier() + /* * Stop RDTSC speculation. This is needed when you need to use RDTSC * (or get_cycles or vread that possibly accesses the TSC) in a defined diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 9fc1af74dc83..afcd35d331de 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -15,6 +15,7 @@ #include #include #include +#include #if BITS_PER_LONG == 32 # define _BITOPS_LONG_SHIFT 5 @@ -102,7 +103,7 @@ static inline void __set_bit(long nr, volatile unsigned long *addr) * * clear_bit() is atomic and may not be reordered. However, it does * not contain a memory barrier, so if it is used for locking purposes, - * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic() * in order to ensure changes are visible on other processors. */ static __always_inline void @@ -156,9 +157,6 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) __clear_bit(nr, addr); } -#define smp_mb__before_clear_bit() barrier() -#define smp_mb__after_clear_bit() barrier() - /** * __change_bit - Toggle a bit in memory * @nr: the bit to change diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h index 05af3b31d522..f28a24b51dc7 100644 --- a/arch/x86/include/asm/sync_bitops.h +++ b/arch/x86/include/asm/sync_bitops.h @@ -41,7 +41,7 @@ static inline void sync_set_bit(long nr, volatile unsigned long *addr) * * sync_clear_bit() is atomic and may not be reordered. However, it does * not contain a memory barrier, so if it is used for locking purposes, - * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic() * in order to ensure changes are visible on other processors. */ static inline void sync_clear_bit(long nr, volatile unsigned long *addr) diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index a698d7165c96..eab67047dec3 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -57,7 +57,7 @@ void arch_trigger_all_cpu_backtrace(void) } clear_bit(0, &backtrace_flag); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static int __kprobes -- cgit v1.2.3 From 5c7411e2937401bf4d024744032f879475364996 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 7 Apr 2014 18:37:47 +0300 Subject: KVM: x86: Fix CR3 and LDT sel should not be saved in TSS According to Intel specifications, only general purpose registers and segment selectors should be saved in the old TSS during 32-bit task-switch. Signed-off-by: Nadav Amit Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 205b17eed93c..0dec502d20be 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2496,7 +2496,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, struct tss_segment_32 *tss) { - tss->cr3 = ctxt->ops->get_cr(ctxt, 3); + /* CR3 and ldt selector are not saved intentionally */ tss->eip = ctxt->_eip; tss->eflags = ctxt->eflags; tss->eax = reg_read(ctxt, VCPU_REGS_RAX); @@ -2514,7 +2514,6 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS); tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS); - tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR); } static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, @@ -2604,6 +2603,8 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, struct tss_segment_32 tss_seg; int ret; u32 new_tss_base = get_desc_base(new_desc); + u32 eip_offset = offsetof(struct tss_segment_32, eip); + u32 ldt_sel_offset = offsetof(struct tss_segment_32, ldt_selector); ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); @@ -2613,8 +2614,9 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, save_state_to_tss32(ctxt, &tss_seg); - ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, - &ctxt->exception); + /* Only GP registers and segment selectors are saved */ + ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip, + ldt_sel_offset - eip_offset, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; -- cgit v1.2.3 From fd2a445a94d2ab6b39fb623dc02fee48d01a565a Mon Sep 17 00:00:00 2001 From: Huw Davies Date: Wed, 16 Apr 2014 10:02:51 +0100 Subject: KVM: VMX: Advance rip to after an ICEBP instruction When entering an exception after an ICEBP, the saved instruction pointer should point to after the instruction. This fixes the bug here: https://bugs.launchpad.net/qemu/+bug/1119686 Signed-off-by: Huw Davies Reviewed-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index eb3f2b1b764c..8fb56e4cdf91 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4841,6 +4841,9 @@ static int handle_exception(struct kvm_vcpu *vcpu) (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { vcpu->arch.dr6 &= ~15; vcpu->arch.dr6 |= dr6; + if (!(dr6 & ~DR6_RESERVED)) /* icebp */ + skip_emulated_instruction(vcpu); + kvm_queue_exception(vcpu, DB_VECTOR); return 1; } -- cgit v1.2.3 From 4b855078601fc422dbac3059f2215e776f49780f Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Sat, 19 Apr 2014 18:17:44 -0400 Subject: KVM: nVMX: Don't advertise single context invalidation for invept For single context invalidation, we fall through to global invalidation in handle_invept() except for one case - when the operand supplied by L1 is different from what we have in vmcs12. However, typically hypervisors will only call invept for the currently loaded eptp, so the condition will never be true. Signed-off-by: Bandan Das Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8fb56e4cdf91..f00a6e9021b0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2353,12 +2353,11 @@ static __init void nested_vmx_setup_ctls_msrs(void) VMX_EPT_INVEPT_BIT; nested_vmx_ept_caps &= vmx_capability.ept; /* - * Since invept is completely emulated we support both global - * and context invalidation independent of what host cpu - * supports + * For nested guests, we don't do anything specific + * for single context invalidation. Hence, only advertise + * support for global context invalidation. */ - nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | - VMX_EPT_EXTENT_CONTEXT_BIT; + nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; } else nested_vmx_ept_caps = 0; @@ -6441,7 +6440,6 @@ static int handle_invept(struct kvm_vcpu *vcpu) struct { u64 eptp, gpa; } operand; - u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK; if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { @@ -6481,16 +6479,13 @@ static int handle_invept(struct kvm_vcpu *vcpu) } switch (type) { - case VMX_EPT_EXTENT_CONTEXT: - if ((operand.eptp & eptp_mask) != - (nested_ept_get_cr3(vcpu) & eptp_mask)) - break; case VMX_EPT_EXTENT_GLOBAL: kvm_mmu_sync_roots(vcpu); kvm_mmu_flush_tlb(vcpu); nested_vmx_succeed(vcpu); break; default: + /* Trap single context invalidation invept calls */ BUG_ON(1); break; } -- cgit v1.2.3 From 77b0f5d67ff2781f36831cba79674c3e97bd7acf Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Sat, 19 Apr 2014 18:17:45 -0400 Subject: KVM: nVMX: Ack and write vector info to intr_info if L1 asks us to This feature emulates the "Acknowledge interrupt on exit" behavior. We can safely emulate it for L1 to run L2 even if L0 itself has it disabled (to run L1). Signed-off-by: Bandan Das Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/irq.c | 1 + arch/x86/kvm/vmx.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 484bc874688b..bd0da433e6d7 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -113,6 +113,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) return kvm_get_apic_interrupt(v); /* APIC */ } +EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f00a6e9021b0..4378014a41a9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4526,6 +4526,16 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) PIN_BASED_EXT_INTR_MASK; } +/* + * In nested virtualization, check if L1 has set + * VM_EXIT_ACK_INTR_ON_EXIT + */ +static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) +{ + return get_vmcs12(vcpu)->vm_exit_controls & + VM_EXIT_ACK_INTR_ON_EXIT; +} + static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) { return get_vmcs12(vcpu)->pin_based_vm_exec_control & @@ -8563,6 +8573,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); + if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + && nested_exit_intr_ack_set(vcpu)) { + int irq = kvm_cpu_get_interrupt(vcpu); + WARN_ON(irq < 0); + vmcs12->vm_exit_intr_info = irq | + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; + } + trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, vmcs12->exit_qualification, vmcs12->idt_vectoring_info_field, -- cgit v1.2.3 From e0ba1a6ffcfe8dc95586943bbe56badb1459bf25 Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Sat, 19 Apr 2014 18:17:46 -0400 Subject: KVM: nVMX: Advertise support for interrupt acknowledgement Some Type 1 hypervisors such as XEN won't enable VMX without it present Signed-off-by: Bandan Das Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4378014a41a9..72b8012991b9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2283,7 +2283,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) rdmsr(MSR_IA32_VMX_EXIT_CTLS, nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; - /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ + nested_vmx_exit_ctls_high &= #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | @@ -2291,7 +2291,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | - VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; + if (vmx_mpx_supported()) nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; -- cgit v1.2.3 From 671bd9934a861288a248b051751061b11654aef9 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 18 Apr 2014 03:35:08 +0300 Subject: KVM: x86: Fix wrong/stuck PMU when guest does not use PMI If a guest enables a performance counter but does not enable PMI, the hypervisor currently does not reprogram the performance counter once it overflows. As a result the host performance counter is kept with the original sampling period which was configured according to the value of the guest's counter when the counter was enabled. Such behaviour can cause very bad consequences. The most distrubing one can cause the guest not to make any progress at all, and keep exiting due to host PMI before any guest instructions is exeucted. This situation occurs when the performance counter holds a very high value when the guest enables the performance counter. As a result the host's sampling period is configured to be very short. The host then never reconfigures the sampling period and get stuck at entry->PMI->exit loop. We encountered such a scenario in our experiments. The solution is to reprogram the counter even if the guest does not use PMI. Signed-off-by: Nadav Amit Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/pmu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 5c4f63151b4d..cbecaa90399c 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -108,7 +108,10 @@ static void kvm_perf_overflow(struct perf_event *perf_event, { struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); + } } static void kvm_perf_overflow_intr(struct perf_event *perf_event, @@ -117,7 +120,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { - kvm_perf_overflow(perf_event, data, regs); + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); /* * Inject PMI. If vcpu was in a guest mode during NMI PMI -- cgit v1.2.3 From 346874c9507a2582d0c00021f848de6e115f276c Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 18 Apr 2014 03:35:09 +0300 Subject: KVM: x86: Fix CR3 reserved bits According to Intel specifications, PAE and non-PAE does not have any reserved bits. In long-mode, regardless to PCIDE, only the high bits (above the physical address) are reserved. Signed-off-by: Nadav Amit Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 6 +----- arch/x86/kvm/emulate.c | 4 ---- arch/x86/kvm/x86.c | 25 +++++-------------------- 3 files changed, 6 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7de069afb382..e21aee98a5c2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -50,11 +50,7 @@ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) -#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) -#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) -#define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL -#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ - 0xFFFFFF0000000000ULL) +#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0dec502d20be..f3834bbca1d7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3388,10 +3388,6 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if (efer & EFER_LMA) rsvd = CR3_L_MODE_RESERVED_BITS; - else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE) - rsvd = CR3_PAE_RESERVED_BITS; - else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG) - rsvd = CR3_NONPAE_RESERVED_BITS; if (new_val & rsvd) return emulate_gp(ctxt, 0); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bc4aaf68190c..e4ccc6cf4108 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -701,26 +701,11 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) return 0; } - if (is_long_mode(vcpu)) { - if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) { - if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) - return 1; - } else - if (cr3 & CR3_L_MODE_RESERVED_BITS) - return 1; - } else { - if (is_pae(vcpu)) { - if (cr3 & CR3_PAE_RESERVED_BITS) - return 1; - if (is_paging(vcpu) && - !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) - return 1; - } - /* - * We don't check reserved bits in nonpae mode, because - * this isn't enforced, and VMware depends on this. - */ - } + if (is_long_mode(vcpu) && (cr3 & CR3_L_MODE_RESERVED_BITS)) + return 1; + if (is_pae(vcpu) && is_paging(vcpu) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + return 1; vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); -- cgit v1.2.3 From e6e39f0438bc4b0da9334ca42337775c7a00db21 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 18 Apr 2014 03:35:10 +0300 Subject: KVM: x86: IN instruction emulation should ignore REP-prefix The IN instruction is not be affected by REP-prefix as INS is. Therefore, the emulation should ignore the REP prefix as well. The current emulator implementation tries to perform writeback when IN instruction with REP-prefix is emulated. This causes it to perform wrong memory write or spurious #GP exception to be injected to the guest. Signed-off-by: Nadav Amit Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f3834bbca1d7..e8a58409b5ac 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1324,7 +1324,8 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, rc->end = n * size; } - if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) { + if (ctxt->rep_prefix && (ctxt->d & String) && + !(ctxt->eflags & EFLG_DF)) { ctxt->dst.data = rc->data + rc->pos; ctxt->dst.type = OP_MEM_STR; ctxt->dst.count = (rc->end - rc->pos) / size; -- cgit v1.2.3 From 42bf549f3c672006ba18e97152cbc563315ba4e6 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 18 Apr 2014 07:11:34 +0300 Subject: KVM: x86: Processor mode may be determined incorrectly If EFER.LMA is off, cs.l does not determine execution mode. Currently, the emulation engine assumes differently. Signed-off-by: Nadav Amit Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e4ccc6cf4108..7cc646626afd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4888,7 +4888,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) ctxt->eip = kvm_rip_read(vcpu); ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : - cs_l ? X86EMUL_MODE_PROT64 : + (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; ctxt->guest_mode = is_guest_mode(vcpu); -- cgit v1.2.3 From a086f6a1ebc9d8d2d028b99e779ce0dbd9691dea Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 17 Apr 2014 17:06:12 +0800 Subject: Revert "KVM: Simplify kvm->tlbs_dirty handling" This reverts commit 5befdc385ddb2d5ae8995ad89004529a3acf58fc. Since we will allow flush tlb out of mmu-lock in the later patch Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 7 +++---- include/linux/kvm_host.h | 4 +--- virt/kvm/kvm_main.c | 5 ++++- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 123efd3ec29f..410776528265 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -913,8 +913,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't * used by guest then tlbs are not flushed, so guest is allowed to access the * freed pages. - * We set tlbs_dirty to let the notifier know this change and delay the flush - * until such a case actually happens. + * And we increase kvm->tlbs_dirty to delay tlbs flush in this case. */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -943,7 +942,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return -EINVAL; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - vcpu->kvm->tlbs_dirty = true; + vcpu->kvm->tlbs_dirty++; continue; } @@ -958,7 +957,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); - vcpu->kvm->tlbs_dirty = true; + vcpu->kvm->tlbs_dirty++; continue; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 32d263f683dc..820fc2e1d9df 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -411,9 +411,7 @@ struct kvm { unsigned long mmu_notifier_seq; long mmu_notifier_count; #endif - /* Protected by mmu_lock */ - bool tlbs_dirty; - + long tlbs_dirty; struct list_head devices; }; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ea46d64c8e75..fa70c6e642b4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -186,9 +186,12 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) void kvm_flush_remote_tlbs(struct kvm *kvm) { + long dirty_count = kvm->tlbs_dirty; + + smp_mb(); if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.remote_tlb_flush; - kvm->tlbs_dirty = false; + cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); } EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); -- cgit v1.2.3 From 92a476cbfc476c63ee982dd33d15a8c88b4d51b9 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 17 Apr 2014 17:06:13 +0800 Subject: KVM: MMU: properly check last spte in fast_page_fault() Using sp->role.level instead of @level since @level is not got from the page table hierarchy There is no issue in current code since the fast page fault currently only fixes the fault caused by dirty-log that is always on the last level (level = 1) This patch makes the code more readable and avoids potential issue in the further development Reviewed-by: Marcelo Tosatti Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 668ae5916de9..63107049249d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2802,9 +2802,9 @@ static bool page_fault_can_be_fast(u32 error_code) } static bool -fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte) +fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *sptep, u64 spte) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); gfn_t gfn; WARN_ON(!sp->role.direct); @@ -2830,6 +2830,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, u32 error_code) { struct kvm_shadow_walk_iterator iterator; + struct kvm_mmu_page *sp; bool ret = false; u64 spte = 0ull; @@ -2853,7 +2854,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, goto exit; } - if (!is_last_spte(spte, level)) + sp = page_header(__pa(iterator.sptep)); + if (!is_last_spte(spte, sp->role.level)) goto exit; /* @@ -2879,7 +2881,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * the gfn is not stable for indirect shadow page. * See Documentation/virtual/kvm/locking.txt to get more detail. */ - ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte); + ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte); exit: trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, spte, ret); -- cgit v1.2.3 From c126d94f2c90ed9daee24a94f1c67aff7e9bf387 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 17 Apr 2014 17:06:14 +0800 Subject: KVM: MMU: lazily drop large spte Currently, kvm zaps the large spte if write-protected is needed, the later read can fault on that spte. Actually, we can make the large spte readonly instead of making them un-present, the page fault caused by read access can be avoided The idea is from Avi: | As I mentioned before, write-protecting a large spte is a good idea, | since it moves some work from protect-time to fault-time, so it reduces | jitter. This removes the need for the return value. This version has fixed the issue reported in 6b73a9606, the reason of that issue is that fast_page_fault() directly sets the readonly large spte to writable but only dirty the first page into the dirty-bitmap that means other pages are missed. Fixed it by only the normal sptes (on the PT_PAGE_TABLE_LEVEL level) can be fast fixed Reviewed-by: Marcelo Tosatti Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 34 ++++++++++++++++++---------------- arch/x86/kvm/x86.c | 8 ++++++-- 2 files changed, 24 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 63107049249d..ddf06963a74c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1176,8 +1176,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) /* * Write-protect on the specified @sptep, @pt_protect indicates whether - * spte writ-protection is caused by protecting shadow page table. - * @flush indicates whether tlb need be flushed. + * spte write-protection is caused by protecting shadow page table. * * Note: write protection is difference between drity logging and spte * protection: @@ -1186,10 +1185,9 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) * - for spte protection, the spte can be writable only after unsync-ing * shadow page. * - * Return true if the spte is dropped. + * Return true if tlb need be flushed. */ -static bool -spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect) +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect) { u64 spte = *sptep; @@ -1199,17 +1197,11 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect) rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); - if (__drop_large_spte(kvm, sptep)) { - *flush |= true; - return true; - } - if (pt_protect) spte &= ~SPTE_MMU_WRITEABLE; spte = spte & ~PT_WRITABLE_MASK; - *flush |= mmu_spte_update(sptep, spte); - return false; + return mmu_spte_update(sptep, spte); } static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, @@ -1221,11 +1213,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); - if (spte_write_protect(kvm, sptep, &flush, pt_protect)) { - sptep = rmap_get_first(*rmapp, &iter); - continue; - } + flush |= spte_write_protect(kvm, sptep, pt_protect); sptep = rmap_get_next(&iter); } @@ -2876,6 +2865,19 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, if (!spte_is_locklessly_modifiable(spte)) goto exit; + /* + * Do not fix write-permission on the large spte since we only dirty + * the first page into the dirty-bitmap in fast_pf_fix_direct_spte() + * that means other pages are missed if its slot is dirty-logged. + * + * Instead, we let the slow page fault path create a normal spte to + * fix the access. + * + * See the comments in kvm_arch_commit_memory_region(). + */ + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + goto exit; + /* * Currently, fast page fault only works for direct mapping since * the gfn is not stable for indirect shadow page. diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7cc646626afd..63a828d206c5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7315,8 +7315,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); /* * Write protect all pages for dirty logging. - * Existing largepage mappings are destroyed here and new ones will - * not be created until the end of the logging. + * + * All the sptes including the large sptes which point to this + * slot are set to readonly. We can not create any new large + * spte on this slot until the end of the logging. + * + * See the comments in fast_page_fault(). */ if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) kvm_mmu_slot_remove_write_access(kvm, mem->slot); -- cgit v1.2.3 From 7f31c9595e3c87f68dc54b3269e900f3017ed405 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 17 Apr 2014 17:06:15 +0800 Subject: KVM: MMU: flush tlb if the spte can be locklessly modified Relax the tlb flush condition since we will write-protect the spte out of mmu lock. Note lockless write-protection only marks the writable spte to readonly and the spte can be writable only if both SPTE_HOST_WRITEABLE and SPTE_MMU_WRITEABLE are set (that are tested by spte_is_locklessly_modifiable) This patch is used to avoid this kind of race: VCPU 0 VCPU 1 lockless wirte protection: set spte.w = 0 lock mmu-lock write protection the spte to sync shadow page, see spte.w = 0, then without flush tlb unlock mmu-lock !!! At this point, the shadow page can still be writable due to the corrupt tlb entry Flush all TLB Reviewed-by: Marcelo Tosatti Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ddf06963a74c..388a2ef83911 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -595,7 +595,8 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * we always atomicly update it, see the comments in * spte_has_volatile_bits(). */ - if (is_writable_pte(old_spte) && !is_writable_pte(new_spte)) + if (spte_is_locklessly_modifiable(old_spte) && + !is_writable_pte(new_spte)) ret = true; if (!shadow_accessed_mask) -- cgit v1.2.3 From 198c74f43f0f5473f99967aead30ddc622804bc1 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 17 Apr 2014 17:06:16 +0800 Subject: KVM: MMU: flush tlb out of mmu lock when write-protect the sptes Now we can flush all the TLBs out of the mmu lock without TLB corruption when write-proect the sptes, it is because: - we have marked large sptes readonly instead of dropping them that means we just change the spte from writable to readonly so that we only need to care the case of changing spte from present to present (changing the spte from present to nonpresent will flush all the TLBs immediately), in other words, the only case we need to care is mmu_spte_update() - in mmu_spte_update(), we haved checked SPTE_HOST_WRITEABLE | PTE_MMU_WRITEABLE instead of PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK anymore Acked-by: Marcelo Tosatti Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 25 +++++++++++++++++++++---- arch/x86/kvm/mmu.h | 33 +++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 12 ++++++++++-- 3 files changed, 64 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 388a2ef83911..65f2400b8268 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4309,15 +4309,32 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) if (*rmapp) __rmap_write_protect(kvm, rmapp, false); - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - kvm_flush_remote_tlbs(kvm); + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) cond_resched_lock(&kvm->mmu_lock); - } } } - kvm_flush_remote_tlbs(kvm); spin_unlock(&kvm->mmu_lock); + + /* + * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log() + * which do tlb flush out of mmu-lock should be serialized by + * kvm->slots_lock otherwise tlb flush would be missed. + */ + lockdep_assert_held(&kvm->slots_lock); + + /* + * We can flush all the TLBs out of the mmu lock without TLB + * corruption since we just change the spte from writable to + * readonly so that we only need to care the case of changing + * spte from present to present (changing the spte from present + * to nonpresent will flush all the TLBs immediately), in other + * words, the only case we care is mmu_spte_update() where we + * haved checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE + * instead of PT_WRITABLE_MASK, that means it does not depend + * on PT_WRITABLE_MASK anymore. + */ + kvm_flush_remote_tlbs(kvm); } #define BATCH_ZAP_PAGES 10 diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 3842e70bdb7c..b982112d2ca5 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -104,6 +104,39 @@ static inline int is_present_gpte(unsigned long pte) return pte & PT_PRESENT_MASK; } +/* + * Currently, we have two sorts of write-protection, a) the first one + * write-protects guest page to sync the guest modification, b) another one is + * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences + * between these two sorts are: + * 1) the first case clears SPTE_MMU_WRITEABLE bit. + * 2) the first case requires flushing tlb immediately avoiding corrupting + * shadow page table between all vcpus so it should be in the protection of + * mmu-lock. And the another case does not need to flush tlb until returning + * the dirty bitmap to userspace since it only write-protects the page + * logged in the bitmap, that means the page in the dirty bitmap is not + * missed, so it can flush tlb out of mmu-lock. + * + * So, there is the problem: the first case can meet the corrupted tlb caused + * by another case which write-protects pages but without flush tlb + * immediately. In order to making the first case be aware this problem we let + * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit + * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit. + * + * Anyway, whenever a spte is updated (only permission and status bits are + * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes + * readonly, if that happens, we need to flush tlb. Fortunately, + * mmu_spte_update() has already handled it perfectly. + * + * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK: + * - if we want to see if it has writable tlb entry or if the spte can be + * writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most + * case, otherwise + * - if we fix page fault on the spte or do write-protection by dirty logging, + * check PT_WRITABLE_MASK. + * + * TODO: introduce APIs to split these two cases. + */ static inline int is_writable_pte(unsigned long pte) { return pte & PT_WRITABLE_MASK; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 63a828d206c5..c5582c385bc0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3632,11 +3632,19 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) offset = i * BITS_PER_LONG; kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); } - if (is_dirty) - kvm_flush_remote_tlbs(kvm); spin_unlock(&kvm->mmu_lock); + /* See the comments in kvm_mmu_slot_remove_write_access(). */ + lockdep_assert_held(&kvm->slots_lock); + + /* + * All the TLBs can be flushed out of mmu lock, see the comments in + * kvm_mmu_slot_remove_write_access(). + */ + if (is_dirty) + kvm_flush_remote_tlbs(kvm); + r = -EFAULT; if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) goto out; -- cgit v1.2.3 From 5e40704ed2c69425bcb076fb1890417ef137e6c8 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 17 Apr 2014 13:57:37 +0100 Subject: arm: xen: implement multicall hypercall support. As part of this make the usual change to xen_ulong_t in place of unsigned long. This change has no impact on x86. The Linux definition of struct multicall_entry.result differs from the Xen definition, I think for good reasons, and used a long rather than an unsigned long. Therefore introduce a xen_long_t, which is a long on x86 architectures and a signed 64-bit integer on ARM. Use uint32_t nr_calls on x86 for consistency with the ARM definition. Build tested on amd64 and i386 builds. Runtime tested on ARM. Signed-off-by: Ian Campbell Cc: Stefano Stabellini Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Signed-off-by: David Vrabel --- arch/arm/include/asm/xen/hypercall.h | 6 +----- arch/arm/include/asm/xen/interface.h | 2 ++ arch/arm/xen/hypercall.S | 1 + arch/arm64/xen/hypercall.S | 1 + arch/x86/include/asm/xen/hypercall.h | 2 +- arch/x86/include/asm/xen/interface.h | 3 +++ include/xen/interface/xen.h | 6 +++--- 7 files changed, 12 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/include/asm/xen/hypercall.h b/arch/arm/include/asm/xen/hypercall.h index 7704e28c3483..7658150488d6 100644 --- a/arch/arm/include/asm/xen/hypercall.h +++ b/arch/arm/include/asm/xen/hypercall.h @@ -48,6 +48,7 @@ int HYPERVISOR_memory_op(unsigned int cmd, void *arg); int HYPERVISOR_physdev_op(int cmd, void *arg); int HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args); int HYPERVISOR_tmem_op(void *arg); +int HYPERVISOR_multicall(struct multicall_entry *calls, uint32_t nr); static inline void MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, @@ -63,9 +64,4 @@ MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req, BUG(); } -static inline int -HYPERVISOR_multicall(void *call_list, int nr_calls) -{ - BUG(); -} #endif /* _ASM_ARM_XEN_HYPERCALL_H */ diff --git a/arch/arm/include/asm/xen/interface.h b/arch/arm/include/asm/xen/interface.h index 1151188bcd83..50066006e6bd 100644 --- a/arch/arm/include/asm/xen/interface.h +++ b/arch/arm/include/asm/xen/interface.h @@ -40,6 +40,8 @@ typedef uint64_t xen_pfn_t; #define PRI_xen_pfn "llx" typedef uint64_t xen_ulong_t; #define PRI_xen_ulong "llx" +typedef int64_t xen_long_t; +#define PRI_xen_long "llx" /* Guest handles for primitive C types. */ __DEFINE_GUEST_HANDLE(uchar, unsigned char); __DEFINE_GUEST_HANDLE(uint, unsigned int); diff --git a/arch/arm/xen/hypercall.S b/arch/arm/xen/hypercall.S index d1cf7b7c2200..44e3a5f10c4c 100644 --- a/arch/arm/xen/hypercall.S +++ b/arch/arm/xen/hypercall.S @@ -89,6 +89,7 @@ HYPERCALL2(memory_op); HYPERCALL2(physdev_op); HYPERCALL3(vcpu_op); HYPERCALL1(tmem_op); +HYPERCALL2(multicall); ENTRY(privcmd_call) stmdb sp!, {r4} diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index 531342ec4bcf..8bbe9401f4f0 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -80,6 +80,7 @@ HYPERCALL2(memory_op); HYPERCALL2(physdev_op); HYPERCALL3(vcpu_op); HYPERCALL1(tmem_op); +HYPERCALL2(multicall); ENTRY(privcmd_call) mov x16, x0 diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index e709884d0ef9..ca08a27b90b3 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -343,7 +343,7 @@ HYPERVISOR_memory_op(unsigned int cmd, void *arg) } static inline int -HYPERVISOR_multicall(void *call_list, int nr_calls) +HYPERVISOR_multicall(void *call_list, uint32_t nr_calls) { return _hypercall2(int, multicall, call_list, nr_calls); } diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index fd9cb7695b5f..3400dbaec3c3 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -54,6 +54,9 @@ typedef unsigned long xen_pfn_t; #define PRI_xen_pfn "lx" typedef unsigned long xen_ulong_t; #define PRI_xen_ulong "lx" +typedef long xen_long_t; +#define PRI_xen_long "lx" + /* Guest handles for primitive C types. */ __DEFINE_GUEST_HANDLE(uchar, unsigned char); __DEFINE_GUEST_HANDLE(uint, unsigned int); diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 0cd5ca333fac..de082130ba4b 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -275,9 +275,9 @@ DEFINE_GUEST_HANDLE_STRUCT(mmu_update); * NB. The fields are natural register size for this architecture. */ struct multicall_entry { - unsigned long op; - long result; - unsigned long args[6]; + xen_ulong_t op; + xen_long_t result; + xen_ulong_t args[6]; }; DEFINE_GUEST_HANDLE_STRUCT(multicall_entry); -- cgit v1.2.3 From d20642f0a32575605f152a1cb7753bdfca5fc94b Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 18 Apr 2014 17:19:54 -0500 Subject: x86: move FIX_EARLYCON_MEM kconfig into x86 In preparation to support FIX_EARLYCON_MEM on other arches, make the option per arch. Signed-off-by: Rob Herring Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 3 +++ drivers/tty/serial/8250/Kconfig | 5 ----- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 25d2c6f7325e..0fb6cfd50e75 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -261,6 +261,9 @@ config ARCH_HWEIGHT_CFLAGS config ARCH_SUPPORTS_UPROBES def_bool y +config FIX_EARLYCON_MEM + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig index 23329918f229..91f1d8332b86 100644 --- a/drivers/tty/serial/8250/Kconfig +++ b/drivers/tty/serial/8250/Kconfig @@ -90,11 +90,6 @@ config SERIAL_8250_CONSOLE If unsure, say N. -config FIX_EARLYCON_MEM - bool - depends on X86 - default y - config SERIAL_8250_GSC tristate depends on SERIAL_8250 && GSC -- cgit v1.2.3 From 0b2d70764bb39242dcc49c0ebd10fcb8258ce5fa Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 25 Apr 2014 11:01:08 -0600 Subject: x86/PCI: Fix Broadcom CNB20LE unintended sign extension In the expression "word1 << 16", word1 starts as u16, but is promoted to a signed int, then sign-extended to resource_size_t, which is probably not what was intended. Cast to resource_size_t to avoid the sign extension. Found by Coverity (CID 138749, 138750). Signed-off-by: Bjorn Helgaas --- arch/x86/pci/broadcom_bus.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c index 614392ced7d6..bb461cfd01ab 100644 --- a/arch/x86/pci/broadcom_bus.c +++ b/arch/x86/pci/broadcom_bus.c @@ -60,8 +60,8 @@ static void __init cnb20le_res(u8 bus, u8 slot, u8 func) word1 = read_pci_config_16(bus, slot, func, 0xc4); word2 = read_pci_config_16(bus, slot, func, 0xc6); if (word1 != word2) { - res.start = (word1 << 16) | 0x0000; - res.end = (word2 << 16) | 0xffff; + res.start = ((resource_size_t) word1 << 16) | 0x0000; + res.end = ((resource_size_t) word2 << 16) | 0xffff; res.flags = IORESOURCE_MEM | IORESOURCE_PREFETCH; update_res(info, res.start, res.end, res.flags, 0); } -- cgit v1.2.3 From 4e4ba9441fb431f3996de2454522342e0b1d9263 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 14 Apr 2014 15:30:09 -0600 Subject: x86/PCI: Don't try to move IORESOURCE_PCI_FIXED resources Don't attempt to move resource marked IORESOURCE_PCI_FIXED, even if pci_claim_resource() fails. In some cases, these are legacy resources that cannot be moved. Signed-off-by: Bjorn Helgaas --- arch/x86/pci/i386.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index db6b1ab43255..6db58d68feb5 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -271,11 +271,16 @@ static void pcibios_allocate_dev_resources(struct pci_dev *dev, int pass) "BAR %d: reserving %pr (d=%d, p=%d)\n", idx, r, disabled, pass); if (pci_claim_resource(dev, idx) < 0) { - /* We'll assign a new address later */ - pcibios_save_fw_addr(dev, - idx, r->start); - r->end -= r->start; - r->start = 0; + if (r->flags & IORESOURCE_PCI_FIXED) { + dev_info(&dev->dev, "BAR %d %pR is immovable\n", + idx, r); + } else { + /* We'll assign a new address later */ + pcibios_save_fw_addr(dev, + idx, r->start); + r->end -= r->start; + r->start = 0; + } } } } -- cgit v1.2.3 From 44c8bdbe32aa51fe673c7a86b1e8f45b952d7759 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 14 Apr 2014 15:35:21 -0600 Subject: x86/PCI: Mark ATI SBx00 HPET BAR as IORESOURCE_PCI_FIXED Bodo reported that on the Asrock M3A UCC, v3.12.6 hangs during boot unless he uses "pci=nocrs". This regression was caused by 7bc5e3f2be32 ("x86/PCI: use host bridge _CRS info by default on 2008 and newer machines"), which appeared in v2.6.34. The reason is that the HPET address appears in a PCI device BAR, and this address is not contained in any of the host bridge windows. Linux moves the PCI BAR into a window, but the original address was published via the HPET table and an ACPI device, so changing the BAR is a bad idea. Here's the dmesg info: ACPI: HPET id: 0x43538301 base: 0xfed00000 pci_root PNP0A03:00: host bridge window [mem 0xd0000000-0xdfffffff] pci_root PNP0A03:00: host bridge window [mem 0xf0000000-0xfebfffff] pci 0000:00:14.0: [1002:4385] type 0 class 0x000c05 pci 0000:00:14.0: reg 14: [mem 0xfed00000-0xfed003ff] hpet0: at MMIO 0xfed00000, IRQs 2, 8, 0, 0 pnp 00:06: Plug and Play ACPI device, IDs PNP0103 (active) pnp 00:06: [mem 0xfed00000-0xfed003ff] When we notice the BAR is not in a host bridge window, we try to move it, but that causes a hang shortly thereafter: pci 0000:00:14.0: no compatible bridge window for [mem 0xfed00000-0xfed003ff] pci 0000:00:14.0: BAR 1: assigned [mem 0xf0000000-0xf00003ff] This patch marks the BAR as IORESOURCE_PCI_FIXED to prevent Linux from moving it. This depends on a previous patch ("x86/PCI: Don't try to move IORESOURCE_PCI_FIXED resources") to check for this flag when pci_claim_resource() fails. Link: https://bugzilla.kernel.org/show_bug.cgi?id=68591 Reported-and-tested-by: Bodo Eggert <7eggert@gmx.de> Signed-off-by: Bjorn Helgaas --- arch/x86/pci/fixup.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 94ae9ae9574f..ef334a003f3c 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -6,6 +6,7 @@ #include #include #include +#include #include static void pci_fixup_i450nx(struct pci_dev *d) @@ -526,6 +527,19 @@ static void sb600_disable_hpet_bar(struct pci_dev *dev) } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar); +#ifdef CONFIG_HPET_TIMER +static void sb600_hpet_quirk(struct pci_dev *dev) +{ + struct resource *r = &dev->resource[1]; + + if (r->flags & IORESOURCE_MEM && r->start == hpet_address) { + r->flags |= IORESOURCE_PCI_FIXED; + dev_info(&dev->dev, "reg 0x14 contains HPET; making it immovable\n"); + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, 0x4385, sb600_hpet_quirk); +#endif + /* * Twinhead H12Y needs us to block out a region otherwise we map devices * there and any access kills the box. -- cgit v1.2.3 From e4c9a5a17567f8ea975bdcfdd1bf9d63965de6c9 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Sat, 26 Apr 2014 22:30:23 -0300 Subject: KVM: x86: expose invariant tsc cpuid bit (v2) Invariant TSC is a property of TSC, no additional support code necessary. Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index f47a104a749c..333b88db22fe 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -495,6 +495,13 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= kvm_supported_word6_x86_features; cpuid_mask(&entry->ecx, 6); break; + case 0x80000007: /* Advanced power management */ + /* invariant TSC is CPUID.80000007H:EDX[8] */ + entry->edx &= (1 << 8); + /* mask against host */ + entry->edx &= boot_cpu_data.x86_power; + entry->eax = entry->ebx = entry->ecx = 0; + break; case 0x80000008: { unsigned g_phys_as = (entry->eax >> 16) & 0xff; unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); @@ -525,7 +532,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ case 6: /* Thermal management */ - case 0x80000007: /* Advanced power management */ case 0xC0000002: case 0xC0000003: case 0xC0000004: -- cgit v1.2.3 From 1bac1869947ee3866c6d687b99e4283d37bb499b Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 31 Mar 2014 15:17:31 -0500 Subject: x86: use FDT accessors for FDT blob header data Remove the direct accesses to FDT header data using accessor function instead. This makes the code more readable and makes the FDT blob structure more opaque to the arch code. This also prepares for removing struct boot_param_header completely. Signed-off-by: Rob Herring Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Tested-by: Grant Likely --- arch/x86/kernel/devicetree.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index d35078ea1446..7db54b5d5f86 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -206,23 +206,21 @@ static void __init dtb_apic_setup(void) static void __init x86_flattree_get_config(void) { u32 size, map_len; - struct boot_param_header *dt; + void *dt; if (!initial_dtb) return; - map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), - (u64)sizeof(struct boot_param_header)); + map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128); - dt = early_memremap(initial_dtb, map_len); - size = be32_to_cpu(dt->totalsize); + initial_boot_params = dt = early_memremap(initial_dtb, map_len); + size = of_get_flat_dt_size(); if (map_len < size) { early_iounmap(dt, map_len); - dt = early_memremap(initial_dtb, size); + initial_boot_params = dt = early_memremap(initial_dtb, size); map_len = size; } - initial_boot_params = dt; unflatten_and_copy_device_tree(); early_iounmap(dt, map_len); } -- cgit v1.2.3 From 3891a04aafd668686239349ea58f3314ea2af86b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 29 Apr 2014 16:46:09 -0700 Subject: x86-64, espfix: Don't leak bits 31:16 of %esp returning to 16-bit stack The IRET instruction, when returning to a 16-bit segment, only restores the bottom 16 bits of the user space stack pointer. This causes some 16-bit software to break, but it also leaks kernel state to user space. We have a software workaround for that ("espfix") for the 32-bit kernel, but it relies on a nonzero stack segment base which is not available in 64-bit mode. In checkin: b3b42ac2cbae x86-64, modify_ldt: Ban 16-bit segments on 64-bit kernels we "solved" this by forbidding 16-bit segments on 64-bit kernels, with the logic that 16-bit support is crippled on 64-bit kernels anyway (no V86 support), but it turns out that people are doing stuff like running old Win16 binaries under Wine and expect it to work. This works around this by creating percpu "ministacks", each of which is mapped 2^16 times 64K apart. When we detect that the return SS is on the LDT, we copy the IRET frame to the ministack and use the relevant alias to return to userspace. The ministacks are mapped readonly, so if IRET faults we promote #GP to #DF which is an IST vector and thus has its own stack; we then do the fixup in the #DF handler. (Making #GP an IST exception would make the msr_safe functions unsafe in NMI/MC context, and quite possibly have other effects.) Special thanks to: - Andy Lutomirski, for the suggestion of using very small stack slots and copy (as opposed to map) the IRET frame there, and for the suggestion to mark them readonly and let the fault promote to #DF. - Konrad Wilk for paravirt fixup and testing. - Borislav Petkov for testing help and useful comments. Reported-by: Brian Gerst Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1398816946-3351-1-git-send-email-hpa@linux.intel.com Cc: Konrad Rzeszutek Wilk Cc: Borislav Petkov Cc: Andrew Lutomriski Cc: Linus Torvalds Cc: Dirk Hohndel Cc: Arjan van de Ven Cc: comex Cc: Alexander van Heukelum Cc: Boris Ostrovsky Cc: # consider after upstream merge --- Documentation/x86/x86_64/mm.txt | 2 + arch/x86/include/asm/pgtable_64_types.h | 2 + arch/x86/include/asm/setup.h | 3 + arch/x86/kernel/Makefile | 1 + arch/x86/kernel/entry_64.S | 73 ++++++++++- arch/x86/kernel/espfix_64.c | 208 ++++++++++++++++++++++++++++++++ arch/x86/kernel/ldt.c | 11 -- arch/x86/kernel/smpboot.c | 7 ++ arch/x86/mm/dump_pagetables.c | 44 +++++-- init/main.c | 4 + 10 files changed, 329 insertions(+), 26 deletions(-) create mode 100644 arch/x86/kernel/espfix_64.c (limited to 'arch/x86') diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index c584a51add15..afe68ddbe6a4 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -12,6 +12,8 @@ ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) ... unused hole ... +ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks +... unused hole ... ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0 ffffffffa0000000 - ffffffffff5fffff (=1525 MB) module mapping space ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index c883bf726398..7166e25ecb57 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -61,6 +61,8 @@ typedef struct { pteval_t pte; } pte_t; #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) +#define ESPFIX_PGD_ENTRY _AC(-2, UL) +#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT) #define EARLY_DYNAMIC_PAGE_TABLES 64 diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 9264f04a4c55..9e3be3329a7e 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -57,6 +57,9 @@ extern void x86_ce4100_early_setup(void); static inline void x86_ce4100_early_setup(void) { } #endif +extern void init_espfix_bsp(void); +extern void init_espfix_ap(void); + #ifndef _SETUP /* diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f4d96000d33a..1cc3789d99d9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-y += syscall_$(BITS).o vsyscall_gtod.o obj-$(CONFIG_X86_64) += vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o +obj-$(CONFIG_X86_64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1e96c3628bf2..bffaa986cafc 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -58,6 +58,7 @@ #include #include #include +#include #include /* Avoid __ASSEMBLER__'ifying just for this. */ @@ -1040,8 +1041,16 @@ restore_args: RESTORE_ARGS 1,8,1 irq_return: + /* + * Are we returning to a stack segment from the LDT? Note: in + * 64-bit mode SS:RSP on the exception stack is always valid. + */ + testb $4,(SS-RIP)(%rsp) + jnz irq_return_ldt + +irq_return_iret: INTERRUPT_RETURN - _ASM_EXTABLE(irq_return, bad_iret) + _ASM_EXTABLE(irq_return_iret, bad_iret) #ifdef CONFIG_PARAVIRT ENTRY(native_iret) @@ -1049,6 +1058,30 @@ ENTRY(native_iret) _ASM_EXTABLE(native_iret, bad_iret) #endif +irq_return_ldt: + pushq_cfi %rax + pushq_cfi %rdi + SWAPGS + movq PER_CPU_VAR(espfix_waddr),%rdi + movq %rax,(0*8)(%rdi) /* RAX */ + movq (2*8)(%rsp),%rax /* RIP */ + movq %rax,(1*8)(%rdi) + movq (3*8)(%rsp),%rax /* CS */ + movq %rax,(2*8)(%rdi) + movq (4*8)(%rsp),%rax /* RFLAGS */ + movq %rax,(3*8)(%rdi) + movq (6*8)(%rsp),%rax /* SS */ + movq %rax,(5*8)(%rdi) + movq (5*8)(%rsp),%rax /* RSP */ + movq %rax,(4*8)(%rdi) + andl $0xffff0000,%eax + popq_cfi %rdi + orq PER_CPU_VAR(espfix_stack),%rax + SWAPGS + movq %rax,%rsp + popq_cfi %rax + jmp irq_return_iret + .section .fixup,"ax" bad_iret: /* @@ -1110,9 +1143,41 @@ ENTRY(retint_kernel) call preempt_schedule_irq jmp exit_intr #endif - CFI_ENDPROC END(common_interrupt) + + /* + * If IRET takes a fault on the espfix stack, then we + * end up promoting it to a doublefault. In that case, + * modify the stack to make it look like we just entered + * the #GP handler from user space, similar to bad_iret. + */ + ALIGN +__do_double_fault: + XCPT_FRAME 1 RDI+8 + movq RSP(%rdi),%rax /* Trap on the espfix stack? */ + sarq $PGDIR_SHIFT,%rax + cmpl $ESPFIX_PGD_ENTRY,%eax + jne do_double_fault /* No, just deliver the fault */ + cmpl $__KERNEL_CS,CS(%rdi) + jne do_double_fault + movq RIP(%rdi),%rax + cmpq $irq_return_iret,%rax +#ifdef CONFIG_PARAVIRT + je 1f + cmpq $native_iret,%rax +#endif + jne do_double_fault /* This shouldn't happen... */ +1: + movq PER_CPU_VAR(kernel_stack),%rax + subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */ + movq %rax,RSP(%rdi) + movq $0,(%rax) /* Missing (lost) #GP error code */ + movq $general_protection,RIP(%rdi) + retq + CFI_ENDPROC +END(__do_double_fault) + /* * End of kprobes section */ @@ -1314,7 +1379,7 @@ zeroentry overflow do_overflow zeroentry bounds do_bounds zeroentry invalid_op do_invalid_op zeroentry device_not_available do_device_not_available -paranoiderrorentry double_fault do_double_fault +paranoiderrorentry double_fault __do_double_fault zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun errorentry invalid_TSS do_invalid_TSS errorentry segment_not_present do_segment_not_present @@ -1601,7 +1666,7 @@ error_sti: */ error_kernelspace: incl %ebx - leaq irq_return(%rip),%rcx + leaq irq_return_iret(%rip),%rcx cmpq %rcx,RIP+8(%rsp) je error_swapgs movl %ecx,%eax /* zero extend */ diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c new file mode 100644 index 000000000000..8a64da36310f --- /dev/null +++ b/arch/x86/kernel/espfix_64.c @@ -0,0 +1,208 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2014 Intel Corporation; author: H. Peter Anvin + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * ----------------------------------------------------------------------- */ + +/* + * The IRET instruction, when returning to a 16-bit segment, only + * restores the bottom 16 bits of the user space stack pointer. This + * causes some 16-bit software to break, but it also leaks kernel state + * to user space. + * + * This works around this by creating percpu "ministacks", each of which + * is mapped 2^16 times 64K apart. When we detect that the return SS is + * on the LDT, we copy the IRET frame to the ministack and use the + * relevant alias to return to userspace. The ministacks are mapped + * readonly, so if the IRET fault we promote #GP to #DF which is an IST + * vector and thus has its own stack; we then do the fixup in the #DF + * handler. + * + * This file sets up the ministacks and the related page tables. The + * actual ministack invocation is in entry_64.S. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Note: we only need 6*8 = 48 bytes for the espfix stack, but round + * it up to a cache line to avoid unnecessary sharing. + */ +#define ESPFIX_STACK_SIZE (8*8UL) +#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE) + +/* There is address space for how many espfix pages? */ +#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16)) + +#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE) +#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS +# error "Need more than one PGD for the ESPFIX hack" +#endif + +#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) + +/* This contains the *bottom* address of the espfix stack */ +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); + +/* Initialization mutex - should this be a spinlock? */ +static DEFINE_MUTEX(espfix_init_mutex); + +/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */ +#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE) +static void *espfix_pages[ESPFIX_MAX_PAGES]; + +static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD] + __aligned(PAGE_SIZE); + +static unsigned int page_random, slot_random; + +/* + * This returns the bottom address of the espfix stack for a specific CPU. + * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case + * we have to account for some amount of padding at the end of each page. + */ +static inline unsigned long espfix_base_addr(unsigned int cpu) +{ + unsigned long page, slot; + unsigned long addr; + + page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random; + slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE; + addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE); + addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16); + addr += ESPFIX_BASE_ADDR; + return addr; +} + +#define PTE_STRIDE (65536/PAGE_SIZE) +#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE) +#define ESPFIX_PMD_CLONES PTRS_PER_PMD +#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES)) + +#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX) + +static void init_espfix_random(void) +{ + unsigned long rand; + + /* + * This is run before the entropy pools are initialized, + * but this is hopefully better than nothing. + */ + if (!arch_get_random_long(&rand)) { + /* The constant is an arbitrary large prime */ + rdtscll(rand); + rand *= 0xc345c6b72fd16123UL; + } + + slot_random = rand % ESPFIX_STACKS_PER_PAGE; + page_random = (rand / ESPFIX_STACKS_PER_PAGE) + & (ESPFIX_PAGE_SPACE - 1); +} + +void __init init_espfix_bsp(void) +{ + pgd_t *pgd_p; + pteval_t ptemask; + + ptemask = __supported_pte_mask; + + /* Install the espfix pud into the kernel page directory */ + pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; + pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); + + /* Randomize the locations */ + init_espfix_random(); + + /* The rest is the same as for any other processor */ + init_espfix_ap(); +} + +void init_espfix_ap(void) +{ + unsigned int cpu, page; + unsigned long addr; + pud_t pud, *pud_p; + pmd_t pmd, *pmd_p; + pte_t pte, *pte_p; + int n; + void *stack_page; + pteval_t ptemask; + + /* We only have to do this once... */ + if (likely(this_cpu_read(espfix_stack))) + return; /* Already initialized */ + + cpu = smp_processor_id(); + addr = espfix_base_addr(cpu); + page = cpu/ESPFIX_STACKS_PER_PAGE; + + /* Did another CPU already set this up? */ + stack_page = ACCESS_ONCE(espfix_pages[page]); + if (likely(stack_page)) + goto done; + + mutex_lock(&espfix_init_mutex); + + /* Did we race on the lock? */ + stack_page = ACCESS_ONCE(espfix_pages[page]); + if (stack_page) + goto unlock_done; + + ptemask = __supported_pte_mask; + + pud_p = &espfix_pud_page[pud_index(addr)]; + pud = *pud_p; + if (!pud_present(pud)) { + pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP); + pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); + paravirt_alloc_pud(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PUD_CLONES; n++) + set_pud(&pud_p[n], pud); + } + + pmd_p = pmd_offset(&pud, addr); + pmd = *pmd_p; + if (!pmd_present(pmd)) { + pte_p = (pte_t *)__get_free_page(PGALLOC_GFP); + pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); + paravirt_alloc_pmd(&init_mm, __pa(pte_p) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PMD_CLONES; n++) + set_pmd(&pmd_p[n], pmd); + } + + pte_p = pte_offset_kernel(&pmd, addr); + stack_page = (void *)__get_free_page(GFP_KERNEL); + pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); + paravirt_alloc_pte(&init_mm, __pa(stack_page) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PTE_CLONES; n++) + set_pte(&pte_p[n*PTE_STRIDE], pte); + + /* Job is done for this CPU and any CPU which shares this page */ + ACCESS_ONCE(espfix_pages[page]) = stack_page; + +unlock_done: + mutex_unlock(&espfix_init_mutex); +done: + this_cpu_write(espfix_stack, addr); + this_cpu_write(espfix_waddr, (unsigned long)stack_page + + (addr & ~PAGE_MASK)); +} diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index af1d14a9ebda..ebc987398923 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -229,17 +229,6 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) } } - /* - * On x86-64 we do not support 16-bit segments due to - * IRET leaking the high bits of the kernel stack address. - */ -#ifdef CONFIG_X86_64 - if (!ldt_info.seg_32bit) { - error = -EINVAL; - goto out_unlock; - } -#endif - fill_ldt(&ldt, &ldt_info); if (oldmode) ldt.avl = 0; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 34826934d4a7..61a5350850fb 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -243,6 +243,13 @@ static void notrace start_secondary(void *unused) */ check_tsc_sync_target(); + /* + * Enable the espfix hack for this CPU + */ +#ifdef CONFIG_X86_64 + init_espfix_ap(); +#endif + /* * We need to hold vector_lock so there the set of online cpus * does not change while we are assigning vectors to cpus. Holding diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 20621d753d5f..167ffcac16ed 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -30,12 +30,14 @@ struct pg_state { unsigned long start_address; unsigned long current_address; const struct addr_marker *marker; + unsigned long lines; bool to_dmesg; }; struct addr_marker { unsigned long start_address; const char *name; + unsigned long max_lines; }; /* indices for address_markers; keep sync'd w/ address_markers below */ @@ -46,6 +48,7 @@ enum address_markers_idx { LOW_KERNEL_NR, VMALLOC_START_NR, VMEMMAP_START_NR, + ESPFIX_START_NR, HIGH_KERNEL_NR, MODULES_VADDR_NR, MODULES_END_NR, @@ -68,6 +71,7 @@ static struct addr_marker address_markers[] = { { PAGE_OFFSET, "Low Kernel Mapping" }, { VMALLOC_START, "vmalloc() Area" }, { VMEMMAP_START, "Vmemmap" }, + { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, { __START_KERNEL_map, "High Kernel Mapping" }, { MODULES_VADDR, "Modules" }, { MODULES_END, "End Modules" }, @@ -182,7 +186,7 @@ static void note_page(struct seq_file *m, struct pg_state *st, pgprot_t new_prot, int level) { pgprotval_t prot, cur; - static const char units[] = "KMGTPE"; + static const char units[] = "BKMGTPE"; /* * If we have a "break" in the series, we need to flush the state that @@ -197,6 +201,7 @@ static void note_page(struct seq_file *m, struct pg_state *st, st->current_prot = new_prot; st->level = level; st->marker = address_markers; + st->lines = 0; pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", st->marker->name); } else if (prot != cur || level != st->level || @@ -208,17 +213,24 @@ static void note_page(struct seq_file *m, struct pg_state *st, /* * Now print the actual finished series */ - pt_dump_seq_printf(m, st->to_dmesg, "0x%0*lx-0x%0*lx ", - width, st->start_address, - width, st->current_address); - - delta = (st->current_address - st->start_address) >> 10; - while (!(delta & 1023) && unit[1]) { - delta >>= 10; - unit++; + if (!st->marker->max_lines || + st->lines < st->marker->max_lines) { + pt_dump_seq_printf(m, st->to_dmesg, + "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, st->current_address); + + delta = st->current_address - st->start_address; + while (!(delta & 1023) && unit[1]) { + delta >>= 10; + unit++; + } + pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", + delta, *unit); + printk_prot(m, st->current_prot, st->level, + st->to_dmesg); } - pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", delta, *unit); - printk_prot(m, st->current_prot, st->level, st->to_dmesg); + st->lines++; /* * We print markers for special areas of address space, @@ -226,7 +238,17 @@ static void note_page(struct seq_file *m, struct pg_state *st, * This helps in the interpretation. */ if (st->current_address >= st->marker[1].start_address) { + if (st->marker->max_lines && + st->lines > st->marker->max_lines) { + unsigned long nskip = + st->lines - st->marker->max_lines; + pt_dump_seq_printf(m, st->to_dmesg, + "... %lu entr%s skipped ... \n", + nskip, + nskip == 1 ? "y" : "ies"); + } st->marker++; + st->lines = 0; pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", st->marker->name); } diff --git a/init/main.c b/init/main.c index 9c7fd4c9249f..70fc00e7db06 100644 --- a/init/main.c +++ b/init/main.c @@ -616,6 +616,10 @@ asmlinkage void __init start_kernel(void) #ifdef CONFIG_X86 if (efi_enabled(EFI_RUNTIME_SERVICES)) efi_enter_virtual_mode(); +#endif +#ifdef CONFIG_X86_64 + /* Should be run before the first non-init thread is created */ + init_espfix_bsp(); #endif thread_info_cache_init(); cred_init(); -- cgit v1.2.3 From 246f2d2ee1d715e1077fc47d61c394569c8ee692 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 30 Apr 2014 14:03:25 -0700 Subject: x86-32, espfix: Remove filter for espfix32 due to race It is not safe to use LAR to filter when to go down the espfix path, because the LDT is per-process (rather than per-thread) and another thread might change the descriptors behind our back. Fortunately it is always *safe* (if a bit slow) to go down the espfix path, and a 32-bit LDT stack segment is extremely rare. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1398816946-3351-1-git-send-email-hpa@linux.intel.com Cc: # consider after upstream merge --- arch/x86/kernel/entry_32.S | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index a2a4f4697889..2780b8f3b96c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -551,11 +551,6 @@ ENTRY(iret_exc) CFI_RESTORE_STATE ldt_ss: - larl PT_OLDSS(%esp), %eax - jnz restore_nocheck - testl $0x00400000, %eax # returning to 32bit stack? - jnz restore_nocheck # allright, normal return - #ifdef CONFIG_PARAVIRT /* * The kernel can't run on a non-flat stack if paravirt mode -- cgit v1.2.3 From e1fe9ed8d2a4937510d0d60e20705035c2609aea Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 1 May 2014 14:12:23 -0700 Subject: x86, espfix: Move espfix definitions into a separate header file Sparse warns that the percpu variables aren't declared before they are defined. Rather than hacking around it, move espfix definitions into a proper header file. Reported-by: Fengguang Wu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/espfix.h | 16 ++++++++++++++++ arch/x86/include/asm/setup.h | 5 ++--- arch/x86/kernel/espfix_64.c | 1 + 3 files changed, 19 insertions(+), 3 deletions(-) create mode 100644 arch/x86/include/asm/espfix.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h new file mode 100644 index 000000000000..729051c82b02 --- /dev/null +++ b/arch/x86/include/asm/espfix.h @@ -0,0 +1,16 @@ +#ifdef _ASM_X86_ESPFIX_H +#define _ASM_X86_ESPFIX_H + +#ifdef CONFIG_X86_64 + +#include + +DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); +DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); + +extern void init_espfix_bsp(void); +extern void init_espfix_ap(void); + +#endif /* CONFIG_X86_64 */ + +#endif /* _ASM_X86_ESPFIX_H */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 9e3be3329a7e..ff4e7b236e21 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -57,11 +57,10 @@ extern void x86_ce4100_early_setup(void); static inline void x86_ce4100_early_setup(void) { } #endif -extern void init_espfix_bsp(void); -extern void init_espfix_ap(void); - #ifndef _SETUP +#include + /* * This is set up by the setup-routine at boot-time */ diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 8a64da36310f..6afbb16e9b79 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -40,6 +40,7 @@ #include #include #include +#include /* * Note: we only need 6*8 = 48 bytes for the espfix stack, but round -- cgit v1.2.3 From 20b68535cd27183ebd3651ff313afb2b97dac941 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 2 May 2014 11:33:51 -0700 Subject: x86, espfix: Fix broken header guard Header guard is #ifndef, not #ifdef... Reported-by: Fengguang Wu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/espfix.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h index 729051c82b02..99efebb2f69d 100644 --- a/arch/x86/include/asm/espfix.h +++ b/arch/x86/include/asm/espfix.h @@ -1,4 +1,4 @@ -#ifdef _ASM_X86_ESPFIX_H +#ifndef _ASM_X86_ESPFIX_H #define _ASM_X86_ESPFIX_H #ifdef CONFIG_X86_64 -- cgit v1.2.3 From c81c8a1eeede61e92a15103748c23d100880cc8a Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Fri, 2 May 2014 11:18:41 -0700 Subject: x86, ioremap: Speed up check for RAM pages In __ioremap_caller() (the guts of ioremap), we loop over the range of pfns being remapped and checks each one individually with page_is_ram(). For large ioremaps, this can be very slow. For example, we have a device with a 256 GiB PCI BAR, and ioremapping this BAR can take 20+ seconds -- sometimes long enough to trigger the soft lockup detector! Internally, page_is_ram() calls walk_system_ram_range() on a single page. Instead, we can make a single call to walk_system_ram_range() from __ioremap_caller(), and do our further checks only for any RAM pages that we find. For the common case of MMIO, this saves an enormous amount of work, since the range being ioremapped doesn't intersect system RAM at all. With this change, ioremap on our 256 GiB BAR takes less than 1 second. Signed-off-by: Roland Dreier Link: http://lkml.kernel.org/r/1399054721-1331-1-git-send-email-roland@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/ioremap.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 597ac155c91c..bc7527e109c8 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -50,6 +50,21 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, return err; } +static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, + void *arg) +{ + unsigned long i; + + for (i = 0; i < nr_pages; ++i) + if (pfn_valid(start_pfn + i) && + !PageReserved(pfn_to_page(start_pfn + i))) + return 1; + + WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn); + + return 0; +} + /* * Remap an arbitrary physical address space into the kernel virtual * address space. Needed when the kernel wants to access high addresses @@ -93,14 +108,11 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, /* * Don't allow anybody to remap normal RAM that we're using.. */ + pfn = phys_addr >> PAGE_SHIFT; last_pfn = last_addr >> PAGE_SHIFT; - for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) { - int is_ram = page_is_ram(pfn); - - if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) - return NULL; - WARN_ON_ONCE(is_ram); - } + if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, + __ioremap_check_ram) == 1) + return NULL; /* * Mappings have to be page-aligned -- cgit v1.2.3 From 197725de65477bc8509b41388157c1a2283542bb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 4 May 2014 10:00:49 -0700 Subject: x86, espfix: Make espfix64 a Kconfig option, fix UML Make espfix64 a hidden Kconfig option. This fixes the x86-64 UML build which had broken due to the non-existence of init_espfix_bsp() in UML: since UML uses its own Kconfig, this option does not appear in the UML build. This also makes it possible to make support for 16-bit segments a configuration option, for the people who want to minimize the size of the kernel. Reported-by: Ingo Molnar Signed-off-by: H. Peter Anvin Cc: Richard Weinberger Link: http://lkml.kernel.org/r/1398816946-3351-1-git-send-email-hpa@linux.intel.com --- arch/x86/Kconfig | 4 ++++ arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/smpboot.c | 2 +- init/main.c | 2 +- 4 files changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 25d2c6f7325e..9a952a572585 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -914,6 +914,10 @@ config VM86 XFree86 to initialize some video cards via BIOS. Disabling this option saves about 6k. +config X86_ESPFIX64 + def_bool y + depends on X86_64 + config TOSHIBA tristate "Toshiba Laptop support" depends on X86_32 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 1cc3789d99d9..491ef3e59850 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -29,7 +29,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-y += syscall_$(BITS).o vsyscall_gtod.o obj-$(CONFIG_X86_64) += vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o -obj-$(CONFIG_X86_64) += espfix_64.o +obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 61a5350850fb..5d93ac1b72db 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -246,7 +246,7 @@ static void notrace start_secondary(void *unused) /* * Enable the espfix hack for this CPU */ -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_ESPFIX64 init_espfix_ap(); #endif diff --git a/init/main.c b/init/main.c index 70fc00e7db06..58c132d7de4b 100644 --- a/init/main.c +++ b/init/main.c @@ -617,7 +617,7 @@ asmlinkage void __init start_kernel(void) if (efi_enabled(EFI_RUNTIME_SERVICES)) efi_enter_virtual_mode(); #endif -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_ESPFIX64 /* Should be run before the first non-init thread is created */ init_espfix_bsp(); #endif -- cgit v1.2.3 From 34273f41d57ee8d854dcd2a1d754cbb546cb548f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 4 May 2014 10:36:22 -0700 Subject: x86, espfix: Make it possible to disable 16-bit support Embedded systems, which may be very memory-size-sensitive, are extremely unlikely to ever encounter any 16-bit software, so make it a CONFIG_EXPERT option to turn off support for any 16-bit software whatsoever. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1398816946-3351-1-git-send-email-hpa@linux.intel.com --- arch/x86/Kconfig | 23 ++++++++++++++++++----- arch/x86/kernel/entry_32.S | 12 ++++++++++++ arch/x86/kernel/entry_64.S | 8 ++++++++ arch/x86/kernel/ldt.c | 5 +++++ 4 files changed, 43 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9a952a572585..956c7702471e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -909,14 +909,27 @@ config VM86 default y depends on X86_32 ---help--- - This option is required by programs like DOSEMU to run 16-bit legacy - code on X86 processors. It also may be needed by software like - XFree86 to initialize some video cards via BIOS. Disabling this - option saves about 6k. + This option is required by programs like DOSEMU to run + 16-bit real mode legacy code on x86 processors. It also may + be needed by software like XFree86 to initialize some video + cards via BIOS. Disabling this option saves about 6K. + +config X86_16BIT + bool "Enable support for 16-bit segments" if EXPERT + default y + ---help--- + This option is required by programs like Wine to run 16-bit + protected mode legacy code on x86 processors. Disabling + this option saves about 300 bytes on i386, or around 6K text + plus 16K runtime memory on x86-64, + +config X86_ESPFIX32 + def_bool y + depends on X86_16BIT && X86_32 config X86_ESPFIX64 def_bool y - depends on X86_64 + depends on X86_16BIT && X86_64 config TOSHIBA tristate "Toshiba Laptop support" diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 2780b8f3b96c..98313ffaae6a 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -527,6 +527,7 @@ syscall_exit: restore_all: TRACE_IRQS_IRET restore_all_notrace: +#ifdef CONFIG_X86_ESPFIX32 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS # Warning: PT_OLDSS(%esp) contains the wrong/random values if we # are returning to the kernel. @@ -537,6 +538,7 @@ restore_all_notrace: cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS +#endif restore_nocheck: RESTORE_REGS 4 # skip orig_eax/error_code irq_return: @@ -549,6 +551,7 @@ ENTRY(iret_exc) .previous _ASM_EXTABLE(irq_return,iret_exc) +#ifdef CONFIG_X86_ESPFIX32 CFI_RESTORE_STATE ldt_ss: #ifdef CONFIG_PARAVIRT @@ -592,6 +595,7 @@ ldt_ss: lss (%esp), %esp /* switch to espfix segment */ CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck +#endif CFI_ENDPROC ENDPROC(system_call) @@ -699,6 +703,7 @@ END(syscall_badsys) * the high word of the segment base from the GDT and swiches to the * normal stack and adjusts ESP with the matching offset. */ +#ifdef CONFIG_X86_ESPFIX32 /* fixup the stack */ mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ @@ -708,8 +713,10 @@ END(syscall_badsys) pushl_cfi %eax lss (%esp), %esp /* switch to the normal stack segment */ CFI_ADJUST_CFA_OFFSET -8 +#endif .endm .macro UNWIND_ESPFIX_STACK +#ifdef CONFIG_X86_ESPFIX32 movl %ss, %eax /* see if on espfix stack */ cmpw $__ESPFIX_SS, %ax @@ -720,6 +727,7 @@ END(syscall_badsys) /* switch to normal stack */ FIXUP_ESPFIX_STACK 27: +#endif .endm /* @@ -1350,11 +1358,13 @@ END(debug) ENTRY(nmi) RING0_INT_FRAME ASM_CLAC +#ifdef CONFIG_X86_ESPFIX32 pushl_cfi %eax movl %ss, %eax cmpw $__ESPFIX_SS, %ax popl_cfi %eax je nmi_espfix_stack +#endif cmpl $ia32_sysenter_target,(%esp) je nmi_stack_fixup pushl_cfi %eax @@ -1394,6 +1404,7 @@ nmi_debug_stack_check: FIX_STACK 24, nmi_stack_correct, 1 jmp nmi_stack_correct +#ifdef CONFIG_X86_ESPFIX32 nmi_espfix_stack: /* We have a RING0_INT_FRAME here. * @@ -1415,6 +1426,7 @@ nmi_espfix_stack: lss 12+4(%esp), %esp # back to espfix stack CFI_ADJUST_CFA_OFFSET -24 jmp irq_return +#endif CFI_ENDPROC END(nmi) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index bffaa986cafc..da0b9bdcc32e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1045,8 +1045,10 @@ irq_return: * Are we returning to a stack segment from the LDT? Note: in * 64-bit mode SS:RSP on the exception stack is always valid. */ +#ifdef CONFIG_X86_ESPFIX64 testb $4,(SS-RIP)(%rsp) jnz irq_return_ldt +#endif irq_return_iret: INTERRUPT_RETURN @@ -1058,6 +1060,7 @@ ENTRY(native_iret) _ASM_EXTABLE(native_iret, bad_iret) #endif +#ifdef CONFIG_X86_ESPFIX64 irq_return_ldt: pushq_cfi %rax pushq_cfi %rdi @@ -1081,6 +1084,7 @@ irq_return_ldt: movq %rax,%rsp popq_cfi %rax jmp irq_return_iret +#endif .section .fixup,"ax" bad_iret: @@ -1152,6 +1156,7 @@ END(common_interrupt) * modify the stack to make it look like we just entered * the #GP handler from user space, similar to bad_iret. */ +#ifdef CONFIG_X86_ESPFIX64 ALIGN __do_double_fault: XCPT_FRAME 1 RDI+8 @@ -1177,6 +1182,9 @@ __do_double_fault: retq CFI_ENDPROC END(__do_double_fault) +#else +# define __do_double_fault do_double_fault +#endif /* * End of kprobes section diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ebc987398923..c37886d759cc 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -229,6 +229,11 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) } } + if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) { + error = -EINVAL; + goto out_unlock; + } + fill_ldt(&ldt, &ldt_info); if (oldmode) ldt.avl = 0; -- cgit v1.2.3 From 7fd44dacdd803c0bbf38bf478d51d280902bb0f1 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Sun, 4 May 2014 20:43:15 -0400 Subject: x86, x32: Use compat shims for io_{setup,submit} The io_setup takes a pointer to a context id of type aio_context_t. This in turn is typed to a __kernel_ulong_t. We could tweak the exported headers to define this as a 64bit quantity for specific ABIs, but since we already have a 32bit compat shim for the x86 ABI, let's just re-use that logic. The libaio package is also written to expect this as a pointer type, so a compat shim would simplify that. The io_submit func operates on an array of pointers to iocb structs. Padding out the array to be 64bit aligned is a huge pain, so convert it over to the existing compat shim too. We don't convert io_getevents to the compat func as its only purpose is to handle the timespec struct, and the x32 ABI uses 64bit times. With this change, the libaio package can now pass its testsuite when built for the x32 ABI. Signed-off-by: Mike Frysinger Link: http://lkml.kernel.org/r/1399250595-5005-1-git-send-email-vapier@gentoo.org Cc: H.J. Lu Signed-off-by: H. Peter Anvin Cc: # v3.4+ --- arch/x86/syscalls/syscall_64.tbl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 04376ac3d9ef..ec255a1646d2 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -212,10 +212,10 @@ 203 common sched_setaffinity sys_sched_setaffinity 204 common sched_getaffinity sys_sched_getaffinity 205 64 set_thread_area -206 common io_setup sys_io_setup +206 64 io_setup sys_io_setup 207 common io_destroy sys_io_destroy 208 common io_getevents sys_io_getevents -209 common io_submit sys_io_submit +209 64 io_submit sys_io_submit 210 common io_cancel sys_io_cancel 211 64 get_thread_area 212 common lookup_dcookie sys_lookup_dcookie @@ -359,3 +359,5 @@ 540 x32 process_vm_writev compat_sys_process_vm_writev 541 x32 setsockopt compat_sys_setsockopt 542 x32 getsockopt compat_sys_getsockopt +543 x32 io_setup compat_sys_io_setup +544 x32 io_submit compat_sys_io_submit -- cgit v1.2.3 From 73159fdcdb9be3eda61b846864352c29371baeb6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 May 2014 12:19:31 -0700 Subject: x86, mm: Ensure correct alignment of the fixmap The early_ioremap code requires that its buffers not span a PMD boundary. The logic for ensuring that only works if the fixmap is aligned, so assert that it's aligned correctly. To make this work reliably, reserve_top_address needs to be adjusted. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/e59a5f4362661f75dd4841fa74e1f2448045e245.1399317206.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/mm/ioremap.c | 6 ++++++ arch/x86/mm/pgtable.c | 6 +++--- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 597ac155c91c..6ef98c55a899 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -355,6 +355,12 @@ void __init early_ioremap_init(void) { pmd_t *pmd; +#ifdef CONFIG_X86_64 + BUILD_BUG_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1)); +#else + WARN_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1)); +#endif + early_ioremap_setup(); pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index c96314abd144..5f8bdda1d1ba 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -449,9 +449,9 @@ void __init reserve_top_address(unsigned long reserve) { #ifdef CONFIG_X86_32 BUG_ON(fixmaps_set > 0); - printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", - (int)-reserve); - __FIXADDR_TOP = -reserve - PAGE_SIZE; + __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; + printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", + -reserve, __FIXADDR_TOP + PAGE_SIZE); #endif } -- cgit v1.2.3 From 3d7ee969bffcc984c8aeaffc6ac6816fd929ace1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 May 2014 12:19:32 -0700 Subject: x86, vdso: Clean up 32-bit vs 64-bit vdso params Rather than using 'vdso_enabled' and an awful #define, just call the parameters vdso32_enabled and vdso64_enabled. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/87913de56bdcbae3d93917938302fc369b05caee.1399317206.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/elf.h | 20 +++++++++++++------- arch/x86/um/vdso/vma.c | 2 +- arch/x86/vdso/vdso32-setup.c | 19 ++++++++----------- arch/x86/vdso/vma.c | 6 +++--- kernel/sysctl.c | 5 +++++ 5 files changed, 30 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 2c71182d30ef..e96df2c0dd69 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -75,7 +75,12 @@ typedef struct user_fxsr_struct elf_fpxregset_t; #include -extern unsigned int vdso_enabled; +#ifdef CONFIG_X86_64 +extern unsigned int vdso64_enabled; +#endif +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +extern unsigned int vdso32_enabled; +#endif /* * This is used to ensure we don't load something for the wrong architecture. @@ -269,9 +274,9 @@ extern int force_personality32; struct task_struct; -#define ARCH_DLINFO_IA32(vdso_enabled) \ +#define ARCH_DLINFO_IA32 \ do { \ - if (vdso_enabled) { \ + if (vdso32_enabled) { \ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ } \ @@ -281,7 +286,7 @@ do { \ #define STACK_RND_MASK (0x7ff) -#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled) +#define ARCH_DLINFO ARCH_DLINFO_IA32 /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ @@ -292,14 +297,15 @@ do { \ #define ARCH_DLINFO \ do { \ - if (vdso_enabled) \ + if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long)current->mm->context.vdso); \ } while (0) +/* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. */ #define ARCH_DLINFO_X32 \ do { \ - if (vdso_enabled) \ + if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long)current->mm->context.vdso); \ } while (0) @@ -310,7 +316,7 @@ do { \ if (test_thread_flag(TIF_X32)) \ ARCH_DLINFO_X32; \ else \ - ARCH_DLINFO_IA32(sysctl_vsyscall32) + ARCH_DLINFO_IA32 #define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index af91901babb8..916cda4cd5b4 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -12,7 +12,7 @@ #include #include -unsigned int __read_mostly vdso_enabled = 1; +static unsigned int __read_mostly vdso_enabled = 1; unsigned long um_vdso_addr; extern unsigned long task_size; diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 00348980a3a6..5a657d93c6e0 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -37,7 +37,6 @@ #endif #ifdef CONFIG_X86_64 -#define vdso_enabled sysctl_vsyscall32 #define arch_setup_additional_pages syscall32_setup_pages #endif @@ -45,13 +44,13 @@ * Should the kernel map a VDSO page into processes and pass its * address down to glibc upon exec()? */ -unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT; +unsigned int __read_mostly vdso32_enabled = VDSO_DEFAULT; -static int __init vdso_setup(char *s) +static int __init vdso32_setup(char *s) { - vdso_enabled = simple_strtoul(s, NULL, 0); + vdso32_enabled = simple_strtoul(s, NULL, 0); - if (vdso_enabled > 1) + if (vdso32_enabled > 1) pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n"); return 1; @@ -62,12 +61,10 @@ static int __init vdso_setup(char *s) * behavior on both 64-bit and 32-bit kernels. * On 32-bit kernels, vdso=[012] means the same thing. */ -__setup("vdso32=", vdso_setup); +__setup("vdso32=", vdso32_setup); #ifdef CONFIG_X86_32 -__setup_param("vdso=", vdso32_setup, vdso_setup, 0); - -EXPORT_SYMBOL_GPL(vdso_enabled); +__setup_param("vdso=", vdso_setup, vdso32_setup, 0); #endif static struct page **vdso32_pages; @@ -160,7 +157,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) return x32_setup_additional_pages(bprm, uses_interp); #endif - if (vdso_enabled != 1) /* Other values all mean "disabled" */ + if (vdso32_enabled != 1) /* Other values all mean "disabled" */ return 0; down_write(&mm->mmap_sem); @@ -244,7 +241,7 @@ subsys_initcall(sysenter_setup); static struct ctl_table abi_table2[] = { { .procname = "vsyscall32", - .data = &sysctl_vsyscall32, + .data = &vdso32_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 1ad102613127..8b790398ed1d 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -17,7 +17,7 @@ #include #if defined(CONFIG_X86_64) -unsigned int __read_mostly vdso_enabled = 1; +unsigned int __read_mostly vdso64_enabled = 1; DECLARE_VDSO_IMAGE(vdso); extern unsigned short vdso_sync_cpuid; @@ -160,7 +160,7 @@ static int setup_additional_pages(struct linux_binprm *bprm, unsigned long addr; int ret; - if (!vdso_enabled) + if (!vdso64_enabled) return 0; down_write(&mm->mmap_sem); @@ -203,7 +203,7 @@ int x32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) static __init int vdso_setup(char *s) { - vdso_enabled = simple_strtoul(s, NULL, 0); + vdso64_enabled = simple_strtoul(s, NULL, 0); return 0; } __setup("vdso=", vdso_setup); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe34..420d77afa8fd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1418,8 +1418,13 @@ static struct ctl_table vm_table[] = { (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { .procname = "vdso_enabled", +#ifdef CONFIG_X86_32 + .data = &vdso32_enabled, + .maxlen = sizeof(vdso32_enabled), +#else .data = &vdso_enabled, .maxlen = sizeof(vdso_enabled), +#endif .mode = 0644, .proc_handler = proc_dointvec, .extra1 = &zero, -- cgit v1.2.3 From cfda7bb9ecbf9d96264bb5bade33a842966d1062 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 May 2014 12:19:33 -0700 Subject: x86, vdso: Move syscall and sysenter setup into kernel/cpu/common.c This code is used during CPU setup, and it isn't strictly speaking related to the 32-bit vdso. It's easier to understand how this works when the code is closer to its callers. This also lets syscall32_cpu_init be static, which might save some trivial amount of kernel text. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/4e466987204e232d7b55a53ff6b9739f12237461.1399317206.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/proto.h | 2 -- arch/x86/kernel/cpu/common.c | 32 ++++++++++++++++++++++++++++++++ arch/x86/vdso/vdso32-setup.c | 30 ------------------------------ 3 files changed, 32 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 6fd3fd769796..a90f8972dad5 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -12,8 +12,6 @@ void ia32_syscall(void); void ia32_cstar_target(void); void ia32_sysenter_target(void); -void syscall32_cpu_init(void); - void x86_configure_nx(void); void x86_report_nx(void); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a135239badb7..7c65b4666c24 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -953,6 +953,38 @@ static void vgetcpu_set_mode(void) else vgetcpu_mode = VGETCPU_LSL; } + +/* May not be __init: called during resume */ +static void syscall32_cpu_init(void) +{ + /* Load these always in case some future AMD CPU supports + SYSENTER from compat mode too. */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + + wrmsrl(MSR_CSTAR, ia32_cstar_target); +} +#endif + +#ifdef CONFIG_X86_32 +void enable_sep_cpu(void) +{ + int cpu = get_cpu(); + struct tss_struct *tss = &per_cpu(init_tss, cpu); + + if (!boot_cpu_has(X86_FEATURE_SEP)) { + put_cpu(); + return; + } + + tss->x86_tss.ss1 = __KERNEL_CS; + tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; + wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); + wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); + put_cpu(); +} #endif void __init identify_boot_cpu(void) diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 5a657d93c6e0..9c78d5b24874 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -75,41 +75,11 @@ static unsigned vdso32_size; #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) #define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32)) -/* May not be __init: called during resume */ -void syscall32_cpu_init(void) -{ - /* Load these always in case some future AMD CPU supports - SYSENTER from compat mode too. */ - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); - - wrmsrl(MSR_CSTAR, ia32_cstar_target); -} - #else /* CONFIG_X86_32 */ #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) #define vdso32_syscall() (0) -void enable_sep_cpu(void) -{ - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); - - if (!boot_cpu_has(X86_FEATURE_SEP)) { - put_cpu(); - return; - } - - tss->x86_tss.ss1 = __KERNEL_CS; - tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; - wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); - put_cpu(); -} - #endif /* CONFIG_X86_64 */ int __init sysenter_setup(void) -- cgit v1.2.3 From 6f121e548f83674ab4920a4e60afb58d4f61b829 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 May 2014 12:19:34 -0700 Subject: x86, vdso: Reimplement vdso.so preparation in build-time C Currently, vdso.so files are prepared and analyzed by a combination of objcopy, nm, some linker script tricks, and some simple ELF parsers in the kernel. Replace all of that with plain C code that runs at build time. All five vdso images now generate .c files that are compiled and linked in to the kernel image. This should cause only one userspace-visible change: the loaded vDSO images are stripped more heavily than they used to be. Everything outside the loadable segment is dropped. In particular, this causes the section table and section name strings to be missing. This should be fine: real dynamic loaders don't load or inspect these tables anyway. The result is roughly equivalent to eu-strip's --strip-sections option. The purpose of this change is to enable the vvar and hpet mappings to be moved to the page following the vDSO load segment. Currently, it is possible for the section table to extend into the page after the load segment, so, if we map it, it risks overlapping the vvar or hpet page. This happens whenever the load segment is just under a multiple of PAGE_SIZE. The only real subtlety here is that the old code had a C file with inline assembler that did 'call VDSO32_vsyscall' and a linker script that defined 'VDSO32_vsyscall = __kernel_vsyscall'. This most likely worked by accident: the linker script entry defines a symbol associated with an address as opposed to an alias for the real dynamic symbol __kernel_vsyscall. That caused ld to relocate the reference at link time instead of leaving an interposable dynamic relocation. Since the VDSO32_vsyscall hack is no longer needed, I now use 'call __kernel_vsyscall', and I added -Bsymbolic to make it work. vdso2c will generate an error and abort the build if the resulting image contains any dynamic relocations, so we won't silently generate bad vdso images. (Dynamic relocations are a problem because nothing will even attempt to relocate the vdso.) Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/2c4fcf45524162a34d87fdda1eb046b2a5cecee7.1399317206.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/ia32/ia32_signal.c | 8 +-- arch/x86/include/asm/elf.h | 7 +- arch/x86/include/asm/mmu.h | 2 +- arch/x86/include/asm/vdso.h | 70 +++++++------------ arch/x86/kernel/signal.c | 6 +- arch/x86/mm/init_64.c | 3 +- arch/x86/vdso/.gitignore | 5 +- arch/x86/vdso/Makefile | 90 +++++++++--------------- arch/x86/vdso/vclock_gettime.c | 4 +- arch/x86/vdso/vdso.S | 3 - arch/x86/vdso/vdso2c.c | 142 ++++++++++++++++++++++++++++++++++++++ arch/x86/vdso/vdso2c.h | 137 ++++++++++++++++++++++++++++++++++++ arch/x86/vdso/vdso32-setup.c | 50 ++++++-------- arch/x86/vdso/vdso32.S | 9 --- arch/x86/vdso/vdso32/vdso32.lds.S | 10 --- arch/x86/vdso/vdsox32.S | 3 - arch/x86/vdso/vma.c | 100 +++++---------------------- arch/x86/xen/setup.c | 11 ++- 18 files changed, 400 insertions(+), 260 deletions(-) delete mode 100644 arch/x86/vdso/vdso.S create mode 100644 arch/x86/vdso/vdso2c.c create mode 100644 arch/x86/vdso/vdso2c.h delete mode 100644 arch/x86/vdso/vdso32.S delete mode 100644 arch/x86/vdso/vdsox32.S (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 220675795e08..f9e181aaba97 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -383,8 +383,8 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, } else { /* Return stub is in 32bit vsyscall page */ if (current->mm->context.vdso) - restorer = VDSO32_SYMBOL(current->mm->context.vdso, - sigreturn); + restorer = current->mm->context.vdso + + selected_vdso32->sym___kernel_sigreturn; else restorer = &frame->retcode; } @@ -462,8 +462,8 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; else - restorer = VDSO32_SYMBOL(current->mm->context.vdso, - rt_sigreturn); + restorer = current->mm->context.vdso + + selected_vdso32->sym___kernel_rt_sigreturn; put_user_ex(ptr_to_compat(restorer), &frame->pretcode); /* diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index e96df2c0dd69..65b21bcbe9f7 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -299,7 +299,7 @@ do { \ do { \ if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ - (unsigned long)current->mm->context.vdso); \ + (unsigned long __force)current->mm->context.vdso); \ } while (0) /* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. */ @@ -307,7 +307,7 @@ do { \ do { \ if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ - (unsigned long)current->mm->context.vdso); \ + (unsigned long __force)current->mm->context.vdso); \ } while (0) #define AT_SYSINFO 32 @@ -325,7 +325,8 @@ else \ #define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso) #define VDSO_ENTRY \ - ((unsigned long)VDSO32_SYMBOL(VDSO_CURRENT_BASE, vsyscall)) + ((unsigned long)current->mm->context.vdso + \ + selected_vdso32->sym___kernel_vsyscall) struct linux_binprm; diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 5f55e6962769..876e74e8eec7 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -18,7 +18,7 @@ typedef struct { #endif struct mutex lock; - void *vdso; + void __user *vdso; } mm_context_t; #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index d1dc55404ff1..389fe2ca27c2 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -3,63 +3,43 @@ #include #include +#include -#ifdef __ASSEMBLER__ +#ifndef __ASSEMBLER__ -#define DEFINE_VDSO_IMAGE(symname, filename) \ -__PAGE_ALIGNED_DATA ; \ - .globl symname##_start, symname##_end ; \ - .align PAGE_SIZE ; \ - symname##_start: ; \ - .incbin filename ; \ - symname##_end: ; \ - .align PAGE_SIZE /* extra data here leaks to userspace. */ ; \ - \ -.previous ; \ - \ - .globl symname##_pages ; \ - .bss ; \ - .align 8 ; \ - .type symname##_pages, @object ; \ - symname##_pages: ; \ - .zero (symname##_end - symname##_start + PAGE_SIZE - 1) / PAGE_SIZE * (BITS_PER_LONG / 8) ; \ - .size symname##_pages, .-symname##_pages +struct vdso_image { + void *data; + unsigned long size; /* Always a multiple of PAGE_SIZE */ + struct page **pages; /* Big enough for data/size page pointers */ -#else + unsigned long alt, alt_len; -#define DECLARE_VDSO_IMAGE(symname) \ - extern char symname##_start[], symname##_end[]; \ - extern struct page *symname##_pages[] + unsigned long sym_VDSO32_NOTE_MASK; + unsigned long sym___kernel_sigreturn; + unsigned long sym___kernel_rt_sigreturn; + unsigned long sym___kernel_vsyscall; + unsigned long sym_VDSO32_SYSENTER_RETURN; +}; -#if defined CONFIG_X86_32 || defined CONFIG_COMPAT +#ifdef CONFIG_X86_64 +extern const struct vdso_image vdso_image_64; +#endif -#include +#ifdef CONFIG_X86_X32 +extern const struct vdso_image vdso_image_x32; +#endif -DECLARE_VDSO_IMAGE(vdso32_int80); +#if defined CONFIG_X86_32 || defined CONFIG_COMPAT +extern const struct vdso_image vdso_image_32_int80; #ifdef CONFIG_COMPAT -DECLARE_VDSO_IMAGE(vdso32_syscall); +extern const struct vdso_image vdso_image_32_syscall; #endif -DECLARE_VDSO_IMAGE(vdso32_sysenter); +extern const struct vdso_image vdso_image_32_sysenter; -/* - * Given a pointer to the vDSO image, find the pointer to VDSO32_name - * as that symbol is defined in the vDSO sources or linker script. - */ -#define VDSO32_SYMBOL(base, name) \ -({ \ - extern const char VDSO32_##name[]; \ - (void __user *)(VDSO32_##name + (unsigned long)(base)); \ -}) +extern const struct vdso_image *selected_vdso32; #endif -/* - * These symbols are defined with the addresses in the vsyscall page. - * See vsyscall-sigreturn.S. - */ -extern void __user __kernel_sigreturn; -extern void __user __kernel_rt_sigreturn; - -void __init patch_vdso32(void *vdso, size_t len); +extern void __init init_vdso_image(const struct vdso_image *image); #endif /* __ASSEMBLER__ */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 9e5de6813e1f..a0da58db43a8 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -298,7 +298,8 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, } if (current->mm->context.vdso) - restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn); + restorer = current->mm->context.vdso + + selected_vdso32->sym___kernel_sigreturn; else restorer = &frame->retcode; if (ksig->ka.sa.sa_flags & SA_RESTORER) @@ -361,7 +362,8 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, save_altstack_ex(&frame->uc.uc_stack, regs->sp); /* Set up to return from userspace. */ - restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); + restorer = current->mm->context.vdso + + selected_vdso32->sym___kernel_sigreturn; if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; put_user_ex(restorer, &frame->pretcode); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index f35c66c5959a..563849600d3e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1223,7 +1223,8 @@ int in_gate_area_no_mm(unsigned long addr) const char *arch_vma_name(struct vm_area_struct *vma) { - if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) + if (vma->vm_mm && vma->vm_start == + (long __force)vma->vm_mm->context.vdso) return "[vdso]"; if (vma == &gate_vma) return "[vsyscall]"; diff --git a/arch/x86/vdso/.gitignore b/arch/x86/vdso/.gitignore index 3282874bc61d..aae8ffdd5880 100644 --- a/arch/x86/vdso/.gitignore +++ b/arch/x86/vdso/.gitignore @@ -1,8 +1,7 @@ vdso.lds -vdso-syms.lds vdsox32.lds -vdsox32-syms.lds -vdso32-syms.lds vdso32-syscall-syms.lds vdso32-sysenter-syms.lds vdso32-int80-syms.lds +vdso-image-*.c +vdso2c diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index c580d1210ffe..895d4b16b7e3 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -24,15 +24,30 @@ vobj64s := $(filter-out $(vobjx32s-compat),$(vobjs-y)) # files to link into kernel obj-y += vma.o -obj-$(VDSO64-y) += vdso.o -obj-$(VDSOX32-y) += vdsox32.o -obj-$(VDSO32-y) += vdso32.o vdso32-setup.o + +# vDSO images to build +vdso_img-$(VDSO64-y) += 64 +vdso_img-$(VDSOX32-y) += x32 +vdso_img-$(VDSO32-y) += 32-int80 +vdso_img-$(CONFIG_COMPAT) += 32-syscall +vdso_img-$(VDSO32-y) += 32-sysenter + +obj-$(VDSO32-y) += vdso32-setup.o vobjs := $(foreach F,$(vobj64s),$(obj)/$F) $(obj)/vdso.o: $(obj)/vdso.so -targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) +targets += vdso.lds $(vobjs-y) + +# Build the vDSO image C files and link them in. +vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o) +vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c) +vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg) +obj-y += $(vdso_img_objs) +targets += $(vdso_img_cfiles) +targets += $(vdso_img_sodbg) +.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) export CPPFLAGS_vdso.lds += -P -C @@ -41,14 +56,18 @@ VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \ $(DISABLE_LTO) -$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so - -$(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE +$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE $(call if_changed,vdso) -$(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg FORCE - $(call if_changed,objcopy) +hostprogs-y += vdso2c + +quiet_cmd_vdso2c = VDSO2C $@ +define cmd_vdso2c + $(obj)/vdso2c $< $@ +endef + +$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso2c FORCE + $(call if_changed,vdso2c) # # Don't omit frame pointers for ease of userspace debugging, but do @@ -68,22 +87,6 @@ CFLAGS_REMOVE_vclock_gettime.o = -pg CFLAGS_REMOVE_vgetcpu.o = -pg CFLAGS_REMOVE_vvar.o = -pg -targets += vdso-syms.lds -obj-$(VDSO64-y) += vdso-syms.lds - -# -# Match symbols in the DSO that look like VDSO*; produce a file of constants. -# -sed-vdsosym := -e 's/^00*/0/' \ - -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p' -quiet_cmd_vdsosym = VDSOSYM $@ -define cmd_vdsosym - $(NM) $< | LC_ALL=C sed -n $(sed-vdsosym) | LC_ALL=C sort > $@ -endef - -$(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE - $(call if_changed,vdsosym) - # # X32 processes use x32 vDSO to access 64bit kernel data. # @@ -94,9 +97,6 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE # so that it can reach 64bit address space with 64bit pointers. # -targets += vdsox32-syms.lds -obj-$(VDSOX32-y) += vdsox32-syms.lds - CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds) VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \ -Wl,-soname=linux-vdso.so.1 \ @@ -113,9 +113,7 @@ quiet_cmd_x32 = X32 $@ $(obj)/%-x32.o: $(obj)/%.o FORCE $(call if_changed,x32) -targets += vdsox32.so vdsox32.so.dbg vdsox32.lds $(vobjx32s-y) - -$(obj)/vdsox32.o: $(src)/vdsox32.S $(obj)/vdsox32.so +targets += vdsox32.lds $(vobjx32s-y) $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE $(call if_changed,vdso) @@ -123,7 +121,6 @@ $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE # # Build multiple 32-bit vDSO images to choose from at boot time. # -obj-$(VDSO32-y) += vdso32-syms.lds vdso32.so-$(VDSO32-y) += int80 vdso32.so-$(CONFIG_COMPAT) += syscall vdso32.so-$(VDSO32-y) += sysenter @@ -138,10 +135,8 @@ VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ targets += vdso32/vdso32.lds -targets += $(vdso32-images) $(vdso32-images:=.dbg) targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o) - -extra-y += $(vdso32-images) +targets += vdso32/vclock_gettime.o $(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%) @@ -166,27 +161,6 @@ $(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ $(obj)/vdso32/%.o $(call if_changed,vdso) -# Make vdso32-*-syms.lds from each image, and then make sure they match. -# The only difference should be that some do not define VDSO32_SYSENTER_RETURN. - -targets += vdso32-syms.lds $(vdso32.so-y:%=vdso32-%-syms.lds) - -quiet_cmd_vdso32sym = VDSOSYM $@ -define cmd_vdso32sym - if LC_ALL=C sort -u $(filter-out FORCE,$^) > $(@D)/.tmp_$(@F) && \ - $(foreach H,$(filter-out FORCE,$^),\ - if grep -q VDSO32_SYSENTER_RETURN $H; \ - then diff -u $(@D)/.tmp_$(@F) $H; \ - else sed /VDSO32_SYSENTER_RETURN/d $(@D)/.tmp_$(@F) | \ - diff -u - $H; fi &&) : ;\ - then mv -f $(@D)/.tmp_$(@F) $@; \ - else rm -f $(@D)/.tmp_$(@F); exit 1; \ - fi -endef - -$(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE - $(call if_changed,vdso32sym) - # # The DSO images are built using a special linker script. # @@ -197,7 +171,7 @@ quiet_cmd_vdso = VDSO $@ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ - $(LTO_CFLAGS) + -Wl,-Bsymbolic $(LTO_CFLAGS) GCOV_PROFILE := n # diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 16d686171e9a..091554c20bc9 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -154,7 +154,7 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) asm( "mov %%ebx, %%edx \n" "mov %2, %%ebx \n" - "call VDSO32_vsyscall \n" + "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" : "=a" (ret) : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) @@ -169,7 +169,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) asm( "mov %%ebx, %%edx \n" "mov %2, %%ebx \n" - "call VDSO32_vsyscall \n" + "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" : "=a" (ret) : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S deleted file mode 100644 index be3f23b09af5..000000000000 --- a/arch/x86/vdso/vdso.S +++ /dev/null @@ -1,3 +0,0 @@ -#include - -DEFINE_VDSO_IMAGE(vdso, "arch/x86/vdso/vdso.so") diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c new file mode 100644 index 000000000000..976e8e4ced92 --- /dev/null +++ b/arch/x86/vdso/vdso2c.c @@ -0,0 +1,142 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +/* Symbols that we need in vdso2c. */ +char const * const required_syms[] = { + "VDSO32_NOTE_MASK", + "VDSO32_SYSENTER_RETURN", + "__kernel_vsyscall", + "__kernel_sigreturn", + "__kernel_rt_sigreturn", +}; + +__attribute__((format(printf, 1, 2))) __attribute__((noreturn)) +static void fail(const char *format, ...) +{ + va_list ap; + va_start(ap, format); + fprintf(stderr, "Error: "); + vfprintf(stderr, format, ap); + exit(1); + va_end(ap); +} + +#define NSYMS (sizeof(required_syms) / sizeof(required_syms[0])) + +#define BITS 64 +#define GOFUNC go64 +#define Elf_Ehdr Elf64_Ehdr +#define Elf_Shdr Elf64_Shdr +#define Elf_Phdr Elf64_Phdr +#define Elf_Sym Elf64_Sym +#define Elf_Dyn Elf64_Dyn +#include "vdso2c.h" +#undef BITS +#undef GOFUNC +#undef Elf_Ehdr +#undef Elf_Shdr +#undef Elf_Phdr +#undef Elf_Sym +#undef Elf_Dyn + +#define BITS 32 +#define GOFUNC go32 +#define Elf_Ehdr Elf32_Ehdr +#define Elf_Shdr Elf32_Shdr +#define Elf_Phdr Elf32_Phdr +#define Elf_Sym Elf32_Sym +#define Elf_Dyn Elf32_Dyn +#include "vdso2c.h" +#undef BITS +#undef GOFUNC +#undef Elf_Ehdr +#undef Elf_Shdr +#undef Elf_Phdr +#undef Elf_Sym +#undef Elf_Dyn + +static int go(void *addr, size_t len, FILE *outfile, const char *name) +{ + Elf64_Ehdr *hdr = (Elf64_Ehdr *)addr; + + if (hdr->e_ident[EI_CLASS] == ELFCLASS64) { + return go64(addr, len, outfile, name); + } else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) { + return go32(addr, len, outfile, name); + } else { + fprintf(stderr, "Error: unknown ELF class\n"); + return 1; + } +} + +int main(int argc, char **argv) +{ + int fd; + off_t len; + void *addr; + FILE *outfile; + int ret; + char *name, *tmp; + int namelen; + + if (argc != 3) { + printf("Usage: vdso2c INPUT OUTPUT\n"); + return 1; + } + + /* + * Figure out the struct name. If we're writing to a .so file, + * generate raw output insted. + */ + name = strdup(argv[2]); + namelen = strlen(name); + if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) { + name = NULL; + } else { + tmp = strrchr(name, '/'); + if (tmp) + name = tmp + 1; + tmp = strchr(name, '.'); + if (tmp) + *tmp = '\0'; + for (tmp = name; *tmp; tmp++) + if (*tmp == '-') + *tmp = '_'; + } + + fd = open(argv[1], O_RDONLY); + if (fd == -1) + err(1, "%s", argv[1]); + + len = lseek(fd, 0, SEEK_END); + if (len == (off_t)-1) + err(1, "lseek"); + + addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (addr == MAP_FAILED) + err(1, "mmap"); + + outfile = fopen(argv[2], "w"); + if (!outfile) + err(1, "%s", argv[2]); + + ret = go(addr, (size_t)len, outfile, name); + + munmap(addr, len); + fclose(outfile); + + return ret; +} diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h new file mode 100644 index 000000000000..9276e5207620 --- /dev/null +++ b/arch/x86/vdso/vdso2c.h @@ -0,0 +1,137 @@ +/* + * This file is included twice from vdso2c.c. It generates code for 32-bit + * and 64-bit vDSOs. We need both for 64-bit builds, since 32-bit vDSOs + * are built for 32-bit userspace. + */ + +static int GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) +{ + int found_load = 0; + unsigned long load_size = -1; /* Work around bogus warning */ + unsigned long data_size; + Elf_Ehdr *hdr = (Elf_Ehdr *)addr; + int i; + unsigned long j; + Elf_Shdr *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, + *alt_sec = NULL; + Elf_Dyn *dyn = 0, *dyn_end = 0; + const char *secstrings; + uint64_t syms[NSYMS] = {}; + + Elf_Phdr *pt = (Elf_Phdr *)(addr + hdr->e_phoff); + + /* Walk the segment table. */ + for (i = 0; i < hdr->e_phnum; i++) { + if (pt[i].p_type == PT_LOAD) { + if (found_load) + fail("multiple PT_LOAD segs\n"); + + if (pt[i].p_offset != 0 || pt[i].p_vaddr != 0) + fail("PT_LOAD in wrong place\n"); + + if (pt[i].p_memsz != pt[i].p_filesz) + fail("cannot handle memsz != filesz\n"); + + load_size = pt[i].p_memsz; + found_load = 1; + } else if (pt[i].p_type == PT_DYNAMIC) { + dyn = addr + pt[i].p_offset; + dyn_end = addr + pt[i].p_offset + pt[i].p_memsz; + } + } + if (!found_load) + fail("no PT_LOAD seg\n"); + data_size = (load_size + 4095) / 4096 * 4096; + + /* Walk the dynamic table */ + for (i = 0; dyn + i < dyn_end && dyn[i].d_tag != DT_NULL; i++) { + if (dyn[i].d_tag == DT_REL || dyn[i].d_tag == DT_RELSZ || + dyn[i].d_tag == DT_RELENT || dyn[i].d_tag == DT_TEXTREL) + fail("vdso image contains dynamic relocations\n"); + } + + /* Walk the section table */ + secstrings_hdr = addr + hdr->e_shoff + hdr->e_shentsize*hdr->e_shstrndx; + secstrings = addr + secstrings_hdr->sh_offset; + for (i = 0; i < hdr->e_shnum; i++) { + Elf_Shdr *sh = addr + hdr->e_shoff + hdr->e_shentsize * i; + if (sh->sh_type == SHT_SYMTAB) + symtab_hdr = sh; + + if (!strcmp(secstrings + sh->sh_name, ".altinstructions")) + alt_sec = sh; + } + + if (!symtab_hdr) { + fail("no symbol table\n"); + return 1; + } + + strtab_hdr = addr + hdr->e_shoff + + hdr->e_shentsize * symtab_hdr->sh_link; + + /* Walk the symbol table */ + for (i = 0; i < symtab_hdr->sh_size / symtab_hdr->sh_entsize; i++) { + int k; + Elf_Sym *sym = addr + symtab_hdr->sh_offset + + symtab_hdr->sh_entsize * i; + const char *name = addr + strtab_hdr->sh_offset + sym->st_name; + for (k = 0; k < NSYMS; k++) { + if (!strcmp(name, required_syms[k])) { + if (syms[k]) { + fail("duplicate symbol %s\n", + required_syms[k]); + } + syms[k] = sym->st_value; + } + } + } + + /* Remove sections. */ + hdr->e_shoff = 0; + hdr->e_shentsize = 0; + hdr->e_shnum = 0; + hdr->e_shstrndx = SHN_UNDEF; + + if (!name) { + fwrite(addr, load_size, 1, outfile); + return 0; + } + + fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n"); + fprintf(outfile, "#include \n"); + fprintf(outfile, "#include \n"); + fprintf(outfile, "#include \n"); + fprintf(outfile, "\n"); + fprintf(outfile, + "static unsigned char raw_data[%lu] __page_aligned_data = {", + data_size); + for (j = 0; j < load_size; j++) { + if (j % 10 == 0) + fprintf(outfile, "\n\t"); + fprintf(outfile, "0x%02X, ", (int)((unsigned char *)addr)[j]); + } + fprintf(outfile, "\n};\n\n"); + + fprintf(outfile, "static struct page *pages[%lu];\n\n", + data_size / 4096); + + fprintf(outfile, "const struct vdso_image %s = {\n", name); + fprintf(outfile, "\t.data = raw_data,\n"); + fprintf(outfile, "\t.size = %lu,\n", data_size); + fprintf(outfile, "\t.pages = pages,\n"); + if (alt_sec) { + fprintf(outfile, "\t.alt = %lu,\n", + (unsigned long)alt_sec->sh_offset); + fprintf(outfile, "\t.alt_len = %lu,\n", + (unsigned long)alt_sec->sh_size); + } + for (i = 0; i < NSYMS; i++) { + if (syms[i]) + fprintf(outfile, "\t.sym_%s = 0x%" PRIx64 ",\n", + required_syms[i], syms[i]); + } + fprintf(outfile, "};\n"); + + return 0; +} diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 9c78d5b24874..d41460118a28 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -29,6 +29,7 @@ #include #include #include +#include #ifdef CONFIG_COMPAT_VDSO #define VDSO_DEFAULT 0 @@ -67,9 +68,6 @@ __setup("vdso32=", vdso32_setup); __setup_param("vdso=", vdso_setup, vdso32_setup, 0); #endif -static struct page **vdso32_pages; -static unsigned vdso32_size; - #ifdef CONFIG_X86_64 #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) @@ -82,34 +80,23 @@ static unsigned vdso32_size; #endif /* CONFIG_X86_64 */ +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +const struct vdso_image *selected_vdso32; +#endif + int __init sysenter_setup(void) { - char *vdso32_start, *vdso32_end; - int npages, i; - #ifdef CONFIG_COMPAT - if (vdso32_syscall()) { - vdso32_start = vdso32_syscall_start; - vdso32_end = vdso32_syscall_end; - vdso32_pages = vdso32_syscall_pages; - } else + if (vdso32_syscall()) + selected_vdso32 = &vdso_image_32_syscall; + else #endif - if (vdso32_sysenter()) { - vdso32_start = vdso32_sysenter_start; - vdso32_end = vdso32_sysenter_end; - vdso32_pages = vdso32_sysenter_pages; - } else { - vdso32_start = vdso32_int80_start; - vdso32_end = vdso32_int80_end; - vdso32_pages = vdso32_int80_pages; - } - - npages = ((vdso32_end - vdso32_start) + PAGE_SIZE - 1) / PAGE_SIZE; - vdso32_size = npages << PAGE_SHIFT; - for (i = 0; i < npages; i++) - vdso32_pages[i] = virt_to_page(vdso32_start + i*PAGE_SIZE); + if (vdso32_sysenter()) + selected_vdso32 = &vdso_image_32_sysenter; + else + selected_vdso32 = &vdso_image_32_int80; - patch_vdso32(vdso32_start, vdso32_size); + init_vdso_image(selected_vdso32); return 0; } @@ -121,6 +108,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) unsigned long addr; int ret = 0; struct vm_area_struct *vma; + unsigned long vdso32_size = selected_vdso32->size; #ifdef CONFIG_X86_X32_ABI if (test_thread_flag(TIF_X32)) @@ -140,7 +128,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) addr += VDSO_OFFSET(VDSO_PREV_PAGES); - current->mm->context.vdso = (void *)addr; + current->mm->context.vdso = (void __user *)addr; /* * MAYWRITE to allow gdb to COW and set breakpoints @@ -150,7 +138,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) vdso32_size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - vdso32_pages); + selected_vdso32->pages); if (ret) goto up_fail; @@ -188,8 +176,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) } #endif - current_thread_info()->sysenter_return = - VDSO32_SYMBOL(addr, SYSENTER_RETURN); + if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN) + current_thread_info()->sysenter_return = + current->mm->context.vdso + + selected_vdso32->sym_VDSO32_SYSENTER_RETURN; up_fail: if (ret) diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S deleted file mode 100644 index 018bcd9f97b4..000000000000 --- a/arch/x86/vdso/vdso32.S +++ /dev/null @@ -1,9 +0,0 @@ -#include - -DEFINE_VDSO_IMAGE(vdso32_int80, "arch/x86/vdso/vdso32-int80.so") - -#ifdef CONFIG_COMPAT -DEFINE_VDSO_IMAGE(vdso32_syscall, "arch/x86/vdso/vdso32-syscall.so") -#endif - -DEFINE_VDSO_IMAGE(vdso32_sysenter, "arch/x86/vdso/vdso32-sysenter.so") diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S index aadb8b9994cd..f072095d6427 100644 --- a/arch/x86/vdso/vdso32/vdso32.lds.S +++ b/arch/x86/vdso/vdso32/vdso32.lds.S @@ -38,13 +38,3 @@ VERSION local: *; }; } - -/* - * Symbols we define here called VDSO* get their values into vdso32-syms.h. - */ -VDSO32_vsyscall = __kernel_vsyscall; -VDSO32_sigreturn = __kernel_sigreturn; -VDSO32_rt_sigreturn = __kernel_rt_sigreturn; -VDSO32_clock_gettime = clock_gettime; -VDSO32_gettimeofday = gettimeofday; -VDSO32_time = time; diff --git a/arch/x86/vdso/vdsox32.S b/arch/x86/vdso/vdsox32.S deleted file mode 100644 index f4aa34e7f370..000000000000 --- a/arch/x86/vdso/vdsox32.S +++ /dev/null @@ -1,3 +0,0 @@ -#include - -DEFINE_VDSO_IMAGE(vdsox32, "arch/x86/vdso/vdsox32.so") diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 8b790398ed1d..cf217626fb47 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -19,99 +19,31 @@ #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; -DECLARE_VDSO_IMAGE(vdso); extern unsigned short vdso_sync_cpuid; -static unsigned vdso_size; - -#ifdef CONFIG_X86_X32_ABI -DECLARE_VDSO_IMAGE(vdsox32); -static unsigned vdsox32_size; -#endif #endif -#if defined(CONFIG_X86_32) || defined(CONFIG_X86_X32_ABI) || \ - defined(CONFIG_COMPAT) -void __init patch_vdso32(void *vdso, size_t len) +void __init init_vdso_image(const struct vdso_image *image) { - Elf32_Ehdr *hdr = vdso; - Elf32_Shdr *sechdrs, *alt_sec = 0; - char *secstrings; - void *alt_data; int i; + int npages = (image->size) / PAGE_SIZE; - BUG_ON(len < sizeof(Elf32_Ehdr)); - BUG_ON(memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0); - - sechdrs = (void *)hdr + hdr->e_shoff; - secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (i = 1; i < hdr->e_shnum; i++) { - Elf32_Shdr *shdr = &sechdrs[i]; - if (!strcmp(secstrings + shdr->sh_name, ".altinstructions")) { - alt_sec = shdr; - goto found; - } - } - - /* If we get here, it's probably a bug. */ - pr_warning("patch_vdso32: .altinstructions not found\n"); - return; /* nothing to patch */ + BUG_ON(image->size % PAGE_SIZE != 0); + for (i = 0; i < npages; i++) + image->pages[i] = virt_to_page(image->data + i*PAGE_SIZE); -found: - alt_data = (void *)hdr + alt_sec->sh_offset; - apply_alternatives(alt_data, alt_data + alt_sec->sh_size); + apply_alternatives((struct alt_instr *)(image->data + image->alt), + (struct alt_instr *)(image->data + image->alt + + image->alt_len)); } -#endif - -#if defined(CONFIG_X86_64) -static void __init patch_vdso64(void *vdso, size_t len) -{ - Elf64_Ehdr *hdr = vdso; - Elf64_Shdr *sechdrs, *alt_sec = 0; - char *secstrings; - void *alt_data; - int i; - BUG_ON(len < sizeof(Elf64_Ehdr)); - BUG_ON(memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0); - - sechdrs = (void *)hdr + hdr->e_shoff; - secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (i = 1; i < hdr->e_shnum; i++) { - Elf64_Shdr *shdr = &sechdrs[i]; - if (!strcmp(secstrings + shdr->sh_name, ".altinstructions")) { - alt_sec = shdr; - goto found; - } - } - - /* If we get here, it's probably a bug. */ - pr_warning("patch_vdso64: .altinstructions not found\n"); - return; /* nothing to patch */ - -found: - alt_data = (void *)hdr + alt_sec->sh_offset; - apply_alternatives(alt_data, alt_data + alt_sec->sh_size); -} +#if defined(CONFIG_X86_64) static int __init init_vdso(void) { - int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; - int i; - - patch_vdso64(vdso_start, vdso_end - vdso_start); - - vdso_size = npages << PAGE_SHIFT; - for (i = 0; i < npages; i++) - vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE); + init_vdso_image(&vdso_image_64); #ifdef CONFIG_X86_X32_ABI - patch_vdso32(vdsox32_start, vdsox32_end - vdsox32_start); - npages = (vdsox32_end - vdsox32_start + PAGE_SIZE - 1) / PAGE_SIZE; - vdsox32_size = npages << PAGE_SHIFT; - for (i = 0; i < npages; i++) - vdsox32_pages[i] = virt_to_page(vdsox32_start + i*PAGE_SIZE); + init_vdso_image(&vdso_image_x32); #endif return 0; @@ -171,7 +103,7 @@ static int setup_additional_pages(struct linux_binprm *bprm, goto up_fail; } - current->mm->context.vdso = (void *)addr; + current->mm->context.vdso = (void __user *)addr; ret = install_special_mapping(mm, addr, size, VM_READ|VM_EXEC| @@ -189,15 +121,15 @@ up_fail: int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - return setup_additional_pages(bprm, uses_interp, vdso_pages, - vdso_size); + return setup_additional_pages(bprm, uses_interp, vdso_image_64.pages, + vdso_image_64.size); } #ifdef CONFIG_X86_X32_ABI int x32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - return setup_additional_pages(bprm, uses_interp, vdsox32_pages, - vdsox32_size); + return setup_additional_pages(bprm, uses_interp, vdso_image_x32.pages, + vdso_image_x32.size); } #endif diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 0982233b9b84..7225a9557ee2 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -516,10 +516,17 @@ char * __init xen_memory_setup(void) static void __init fiddle_vdso(void) { #ifdef CONFIG_X86_32 + /* + * This could be called before selected_vdso32 is initialized, so + * just fiddle with both possible images. vdso_image_32_syscall + * can't be selected, since it only exists on 64-bit systems. + */ u32 *mask; - mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK); + mask = vdso_image_32_int80.data + + vdso_image_32_int80.sym_VDSO32_NOTE_MASK; *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; - mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK); + mask = vdso_image_32_sysenter.data + + vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK; *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; #endif } -- cgit v1.2.3 From 18d0a6fd227177fd243993179c90e454d0638b06 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 May 2014 12:19:35 -0700 Subject: x86, vdso: Move the 32-bit vdso special pages after the text This unifies the vdso mapping code and teaches it how to map special pages at addresses corresponding to symbols in the vdso image. The new code is used for all vdso variants, but so far only the 32-bit variants use the new vvar page position. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/b6d7858ad7b5ac3fd3c29cab6d6d769bc45d195e.1399317206.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/elf.h | 8 +-- arch/x86/include/asm/vdso.h | 4 ++ arch/x86/include/asm/vdso32.h | 11 ---- arch/x86/vdso/vdso-layout.lds.S | 42 ++++++++----- arch/x86/vdso/vdso2c.c | 14 +++++ arch/x86/vdso/vdso2c.h | 17 ++++++ arch/x86/vdso/vdso32-setup.c | 115 +----------------------------------- arch/x86/vdso/vma.c | 128 +++++++++++++++++++++++++++++++++------- 8 files changed, 173 insertions(+), 166 deletions(-) delete mode 100644 arch/x86/include/asm/vdso32.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 65b21bcbe9f7..1a055c81d864 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -333,11 +333,9 @@ struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); -extern int x32_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp); - -extern int syscall32_setup_pages(struct linux_binprm *, int exstack); -#define compat_arch_setup_additional_pages syscall32_setup_pages +extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, + int uses_interp); +#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages extern unsigned long arch_randomize_brk(struct mm_struct *mm); #define arch_randomize_brk arch_randomize_brk diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 389fe2ca27c2..d0a2c909c72d 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -14,6 +14,10 @@ struct vdso_image { unsigned long alt, alt_len; + unsigned long sym_end_mapping; /* Total size of the mapping */ + + unsigned long sym_vvar_page; + unsigned long sym_hpet_page; unsigned long sym_VDSO32_NOTE_MASK; unsigned long sym___kernel_sigreturn; unsigned long sym___kernel_rt_sigreturn; diff --git a/arch/x86/include/asm/vdso32.h b/arch/x86/include/asm/vdso32.h deleted file mode 100644 index 7efb7018406e..000000000000 --- a/arch/x86/include/asm/vdso32.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef _ASM_X86_VDSO32_H -#define _ASM_X86_VDSO32_H - -#define VDSO_BASE_PAGE 0 -#define VDSO_VVAR_PAGE 1 -#define VDSO_HPET_PAGE 2 -#define VDSO_PAGES 3 -#define VDSO_PREV_PAGES 2 -#define VDSO_OFFSET(x) ((x) * PAGE_SIZE) - -#endif diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S index 9df017ab2285..e177c08bb4bc 100644 --- a/arch/x86/vdso/vdso-layout.lds.S +++ b/arch/x86/vdso/vdso-layout.lds.S @@ -1,3 +1,5 @@ +#include + /* * Linker script for vDSO. This is an ELF shared object prelinked to * its virtual address, and with only one read-only segment. @@ -6,20 +8,6 @@ SECTIONS { -#ifdef BUILD_VDSO32 -#include - - hpet_page = . - VDSO_OFFSET(VDSO_HPET_PAGE); - - vvar = . - VDSO_OFFSET(VDSO_VVAR_PAGE); - - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) vvar_ ## name = vvar + offset; -#define __VVAR_KERNEL_LDS -#include -#undef __VVAR_KERNEL_LDS -#undef EMIT_VVAR -#endif . = SIZEOF_HEADERS; .hash : { *(.hash) } :text @@ -59,11 +47,33 @@ SECTIONS .text : { *(.text*) } :text =0x90909090, +#ifdef BUILD_VDSO32 /* - * The comma above works around a bug in gold: - * https://sourceware.org/bugzilla/show_bug.cgi?id=16804 + * The remainder of the vDSO consists of special pages that are + * shared between the kernel and userspace. It needs to be at the + * end so that it doesn't overlap the mapping of the actual + * vDSO image. */ + . = ALIGN(PAGE_SIZE); + vvar_page = .; + + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; +#define __VVAR_KERNEL_LDS +#include +#undef __VVAR_KERNEL_LDS +#undef EMIT_VVAR + + . = vvar_page + PAGE_SIZE; + + hpet_page = .; + . = . + PAGE_SIZE; +#endif + + . = ALIGN(PAGE_SIZE); + end_mapping = .; + /DISCARD/ : { *(.discard) *(.discard.*) diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c index 976e8e4ced92..81edd1ec9df8 100644 --- a/arch/x86/vdso/vdso2c.c +++ b/arch/x86/vdso/vdso2c.c @@ -15,7 +15,21 @@ #include /* Symbols that we need in vdso2c. */ +enum { + sym_vvar_page, + sym_hpet_page, + sym_end_mapping, +}; + +const int special_pages[] = { + sym_vvar_page, + sym_hpet_page, +}; + char const * const required_syms[] = { + [sym_vvar_page] = "vvar_page", + [sym_hpet_page] = "hpet_page", + [sym_end_mapping] = "end_mapping", "VDSO32_NOTE_MASK", "VDSO32_SYSENTER_RETURN", "__kernel_vsyscall", diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h index 9276e5207620..ed2e894e89ab 100644 --- a/arch/x86/vdso/vdso2c.h +++ b/arch/x86/vdso/vdso2c.h @@ -87,6 +87,23 @@ static int GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) } } + /* Validate mapping addresses. */ + for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) { + if (!syms[i]) + continue; /* The mapping isn't used; ignore it. */ + + if (syms[i] % 4096) + fail("%s must be a multiple of 4096\n", + required_syms[i]); + if (syms[i] < data_size) + fail("%s must be after the text mapping\n", + required_syms[i]); + if (syms[sym_end_mapping] < syms[i] + 4096) + fail("%s overruns end_mapping\n", required_syms[i]); + } + if (syms[sym_end_mapping] % 4096) + fail("end_mapping must be a multiple of 4096\n"); + /* Remove sections. */ hdr->e_shoff = 0; hdr->e_shentsize = 0; diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index d41460118a28..c3ed708e50f4 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -8,28 +8,12 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include #include -#include -#include -#include -#include -#include +#include #include -#include -#include -#include -#include -#include #ifdef CONFIG_COMPAT_VDSO #define VDSO_DEFAULT 0 @@ -37,10 +21,6 @@ #define VDSO_DEFAULT 1 #endif -#ifdef CONFIG_X86_64 -#define arch_setup_additional_pages syscall32_setup_pages -#endif - /* * Should the kernel map a VDSO page into processes and pass its * address down to glibc upon exec()? @@ -101,95 +81,6 @@ int __init sysenter_setup(void) return 0; } -/* Setup a VMA at program startup for the vsyscall page */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) -{ - struct mm_struct *mm = current->mm; - unsigned long addr; - int ret = 0; - struct vm_area_struct *vma; - unsigned long vdso32_size = selected_vdso32->size; - -#ifdef CONFIG_X86_X32_ABI - if (test_thread_flag(TIF_X32)) - return x32_setup_additional_pages(bprm, uses_interp); -#endif - - if (vdso32_enabled != 1) /* Other values all mean "disabled" */ - return 0; - - down_write(&mm->mmap_sem); - - addr = get_unmapped_area(NULL, 0, vdso32_size + VDSO_OFFSET(VDSO_PREV_PAGES), 0, 0); - if (IS_ERR_VALUE(addr)) { - ret = addr; - goto up_fail; - } - - addr += VDSO_OFFSET(VDSO_PREV_PAGES); - - current->mm->context.vdso = (void __user *)addr; - - /* - * MAYWRITE to allow gdb to COW and set breakpoints - */ - ret = install_special_mapping(mm, - addr, - vdso32_size, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - selected_vdso32->pages); - - if (ret) - goto up_fail; - - vma = _install_special_mapping(mm, - addr - VDSO_OFFSET(VDSO_PREV_PAGES), - VDSO_OFFSET(VDSO_PREV_PAGES), - VM_READ, - NULL); - - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); - goto up_fail; - } - - ret = remap_pfn_range(vma, - addr - VDSO_OFFSET(VDSO_VVAR_PAGE), - __pa_symbol(&__vvar_page) >> PAGE_SHIFT, - PAGE_SIZE, - PAGE_READONLY); - - if (ret) - goto up_fail; - -#ifdef CONFIG_HPET_TIMER - if (hpet_address) { - ret = io_remap_pfn_range(vma, - addr - VDSO_OFFSET(VDSO_HPET_PAGE), - hpet_address >> PAGE_SHIFT, - PAGE_SIZE, - pgprot_noncached(PAGE_READONLY)); - - if (ret) - goto up_fail; - } -#endif - - if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN) - current_thread_info()->sysenter_return = - current->mm->context.vdso + - selected_vdso32->sym_VDSO32_SYSENTER_RETURN; - - up_fail: - if (ret) - current->mm->context.vdso = NULL; - - up_write(&mm->mmap_sem); - - return ret; -} - #ifdef CONFIG_X86_64 subsys_initcall(sysenter_setup); diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index cf217626fb47..e915eaec4f96 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -15,6 +15,7 @@ #include #include #include +#include #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; @@ -36,7 +37,6 @@ void __init init_vdso_image(const struct vdso_image *image) image->alt_len)); } - #if defined(CONFIG_X86_64) static int __init init_vdso(void) { @@ -49,13 +49,16 @@ static int __init init_vdso(void) return 0; } subsys_initcall(init_vdso); +#endif struct linux_binprm; /* Put the vdso above the (randomized) stack with another randomized offset. This way there is no hole in the middle of address space. To save memory make sure it is still in the same PTE as the stack top. - This doesn't give that many random bits */ + This doesn't give that many random bits. + + Only used for the 64-bit and x32 vdsos. */ static unsigned long vdso_addr(unsigned long start, unsigned len) { unsigned long addr, end; @@ -81,23 +84,23 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) return addr; } -/* Setup a VMA at program startup for the vsyscall page. - Not called for compat tasks */ -static int setup_additional_pages(struct linux_binprm *bprm, - int uses_interp, - struct page **pages, - unsigned size) +static int map_vdso(const struct vdso_image *image, bool calculate_addr) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; unsigned long addr; - int ret; + int ret = 0; - if (!vdso64_enabled) - return 0; + if (calculate_addr) { + addr = vdso_addr(current->mm->start_stack, + image->sym_end_mapping); + } else { + addr = 0; + } down_write(&mm->mmap_sem); - addr = vdso_addr(mm->start_stack, size); - addr = get_unmapped_area(NULL, addr, size, 0, 0); + + addr = get_unmapped_area(NULL, addr, image->sym_end_mapping, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; @@ -105,34 +108,115 @@ static int setup_additional_pages(struct linux_binprm *bprm, current->mm->context.vdso = (void __user *)addr; - ret = install_special_mapping(mm, addr, size, + /* + * MAYWRITE to allow gdb to COW and set breakpoints + */ + ret = install_special_mapping(mm, + addr, + image->size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - pages); - if (ret) { - current->mm->context.vdso = NULL; + image->pages); + + if (ret) + goto up_fail; + + vma = _install_special_mapping(mm, + addr + image->size, + image->sym_end_mapping - image->size, + VM_READ, + NULL); + + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto up_fail; } + if (image->sym_vvar_page) + ret = remap_pfn_range(vma, + addr + image->sym_vvar_page, + __pa_symbol(&__vvar_page) >> PAGE_SHIFT, + PAGE_SIZE, + PAGE_READONLY); + + if (ret) + goto up_fail; + +#ifdef CONFIG_HPET_TIMER + if (hpet_address && image->sym_hpet_page) { + ret = io_remap_pfn_range(vma, + addr + image->sym_hpet_page, + hpet_address >> PAGE_SHIFT, + PAGE_SIZE, + pgprot_noncached(PAGE_READONLY)); + + if (ret) + goto up_fail; + } +#endif + up_fail: + if (ret) + current->mm->context.vdso = NULL; + up_write(&mm->mmap_sem); return ret; } +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +static int load_vdso32(void) +{ + int ret; + + if (vdso32_enabled != 1) /* Other values all mean "disabled" */ + return 0; + + ret = map_vdso(selected_vdso32, false); + if (ret) + return ret; + + if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN) + current_thread_info()->sysenter_return = + current->mm->context.vdso + + selected_vdso32->sym_VDSO32_SYSENTER_RETURN; + + return 0; +} +#endif + +#ifdef CONFIG_X86_64 int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - return setup_additional_pages(bprm, uses_interp, vdso_image_64.pages, - vdso_image_64.size); + if (!vdso64_enabled) + return 0; + + return map_vdso(&vdso_image_64, true); } +#ifdef CONFIG_COMPAT +int compat_arch_setup_additional_pages(struct linux_binprm *bprm, + int uses_interp) +{ #ifdef CONFIG_X86_X32_ABI -int x32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) + if (test_thread_flag(TIF_X32)) { + if (!vdso64_enabled) + return 0; + + return map_vdso(&vdso_image_x32, true); + } +#endif + + return load_vdso32(); +} +#endif +#else +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - return setup_additional_pages(bprm, uses_interp, vdso_image_x32.pages, - vdso_image_x32.size); + return load_vdso32(); } #endif +#ifdef CONFIG_X86_64 static __init int vdso_setup(char *s) { vdso64_enabled = simple_strtoul(s, NULL, 0); -- cgit v1.2.3 From f40c330091c7aa9956ab66f97a3abc8a68b67240 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 May 2014 12:19:36 -0700 Subject: x86, vdso: Move the vvar and hpet mappings next to the 64-bit vDSO This makes the 64-bit and x32 vdsos use the same mechanism as the 32-bit vdso. Most of the churn is deleting all the old fixmap code. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/8af87023f57f6bb96ec8d17fce3f88018195b49b.1399317206.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/fixmap.h | 11 ++++------- arch/x86/include/asm/vvar.h | 20 +------------------- arch/x86/include/uapi/asm/vsyscall.h | 7 +------ arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/hpet.c | 3 --- arch/x86/kernel/vsyscall_64.c | 15 ++++----------- arch/x86/mm/fault.c | 5 +++-- arch/x86/mm/init_64.c | 10 +++++----- arch/x86/vdso/vclock_gettime.c | 22 +++++----------------- arch/x86/vdso/vdso-layout.lds.S | 2 -- arch/x86/xen/mmu.c | 8 +++----- 11 files changed, 27 insertions(+), 77 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 43f482a0db37..b0910f97a3ea 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -24,7 +24,7 @@ #include #include #else -#include +#include #endif /* @@ -41,7 +41,8 @@ extern unsigned long __FIXADDR_TOP; #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) #else -#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) +#define FIXADDR_TOP (round_up(VSYSCALL_ADDR + PAGE_SIZE, 1<> PAGE_SHIFT) - 1, - VVAR_PAGE, - VSYSCALL_HPET, + VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT, #ifdef CONFIG_PARAVIRT_CLOCK PVCLOCK_FIXMAP_BEGIN, PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1, diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index 081d909bc495..5d2b9ad2c6d2 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -29,31 +29,13 @@ #else -#ifdef BUILD_VDSO32 +extern char __vvar_page; #define DECLARE_VVAR(offset, type, name) \ extern type vvar_ ## name __attribute__((visibility("hidden"))); #define VVAR(name) (vvar_ ## name) -#else - -extern char __vvar_page; - -/* Base address of vvars. This is not ABI. */ -#ifdef CONFIG_X86_64 -#define VVAR_ADDRESS (-10*1024*1024 - 4096) -#else -#define VVAR_ADDRESS (&__vvar_page) -#endif - -#define DECLARE_VVAR(offset, type, name) \ - static type const * const vvaraddr_ ## name = \ - (void *)(VVAR_ADDRESS + (offset)); - -#define VVAR(name) (*vvaraddr_ ## name) -#endif - #define DEFINE_VVAR(type, name) \ type name \ __attribute__((section(".vvar_" #name), aligned(16))) __visible diff --git a/arch/x86/include/uapi/asm/vsyscall.h b/arch/x86/include/uapi/asm/vsyscall.h index 85dc1b3825ab..b97dd6e263d2 100644 --- a/arch/x86/include/uapi/asm/vsyscall.h +++ b/arch/x86/include/uapi/asm/vsyscall.h @@ -7,11 +7,6 @@ enum vsyscall_num { __NR_vgetcpu, }; -#define VSYSCALL_START (-10UL << 20) -#define VSYSCALL_SIZE 1024 -#define VSYSCALL_END (-2UL << 20) -#define VSYSCALL_MAPPED_PAGES 1 -#define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr)) - +#define VSYSCALL_ADDR (-10UL << 20) #endif /* _UAPI_ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7c65b4666c24..2cbbf88d8f2c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 8d80ae011603..bbc15a0362d2 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -74,9 +74,6 @@ static inline void hpet_writel(unsigned int d, unsigned int a) static inline void hpet_set_mapping(void) { hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); -#ifdef CONFIG_X86_64 - __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE); -#endif } static inline void hpet_clear_mapping(void) diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 8b3b3eb3cead..ea5b5709aa76 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -91,7 +91,7 @@ static int addr_to_vsyscall_nr(unsigned long addr) { int nr; - if ((addr & ~0xC00UL) != VSYSCALL_START) + if ((addr & ~0xC00UL) != VSYSCALL_ADDR) return -EINVAL; nr = (addr & 0xC00UL) >> 10; @@ -330,24 +330,17 @@ void __init map_vsyscall(void) { extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); - __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall, + __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, vsyscall_mode == NATIVE ? PAGE_KERNEL_VSYSCALL : PAGE_KERNEL_VVAR); - BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) != - (unsigned long)VSYSCALL_START); - - __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR); - BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != - (unsigned long)VVAR_ADDRESS); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != + (unsigned long)VSYSCALL_ADDR); } static int __init vsyscall_init(void) { - BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)); - cpu_notifier_register_begin(); on_each_cpu(cpu_vsyscall_init, NULL, 1); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8e5722992677..858b47b5221b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -18,7 +18,8 @@ #include /* dotraplinkage, ... */ #include /* pgd_*(), ... */ #include /* kmemcheck_*(), ... */ -#include /* VSYSCALL_START */ +#include /* VSYSCALL_ADDR */ +#include /* emulate_vsyscall */ #define CREATE_TRACE_POINTS #include @@ -771,7 +772,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, * emulation. */ if (unlikely((error_code & PF_INSTR) && - ((address & ~0xfff) == VSYSCALL_START))) { + ((address & ~0xfff) == VSYSCALL_ADDR))) { if (emulate_vsyscall(regs, address)) return; } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 563849600d3e..6f881842116c 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1055,8 +1055,8 @@ void __init mem_init(void) after_bootmem = 1; /* Register memory areas for /proc/kcore */ - kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, - VSYSCALL_END - VSYSCALL_START, KCORE_OTHER); + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, + PAGE_SIZE, KCORE_OTHER); mem_init_print_info(NULL); } @@ -1186,8 +1186,8 @@ int kern_addr_valid(unsigned long addr) * not need special handling anymore: */ static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_START, - .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE), + .vm_start = VSYSCALL_ADDR, + .vm_end = VSYSCALL_ADDR + PAGE_SIZE, .vm_page_prot = PAGE_READONLY_EXEC, .vm_flags = VM_READ | VM_EXEC }; @@ -1218,7 +1218,7 @@ int in_gate_area(struct mm_struct *mm, unsigned long addr) */ int in_gate_area_no_mm(unsigned long addr) { - return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); + return (addr & PAGE_MASK) == VSYSCALL_ADDR; } const char *arch_vma_name(struct vm_area_struct *vma) diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 091554c20bc9..b2e4f493e5b0 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -30,9 +30,12 @@ extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); extern time_t __vdso_time(time_t *t); #ifdef CONFIG_HPET_TIMER -static inline u32 read_hpet_counter(const volatile void *addr) +extern u8 hpet_page + __attribute__((visibility("hidden"))); + +static notrace cycle_t vread_hpet(void) { - return *(const volatile u32 *) (addr + HPET_COUNTER); + return *(const volatile u32 *)(&hpet_page + HPET_COUNTER); } #endif @@ -43,11 +46,6 @@ static inline u32 read_hpet_counter(const volatile void *addr) #include #include -static notrace cycle_t vread_hpet(void) -{ - return read_hpet_counter((const void *)fix_to_virt(VSYSCALL_HPET)); -} - notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; @@ -137,16 +135,6 @@ static notrace cycle_t vread_pvclock(int *mode) #else -extern u8 hpet_page - __attribute__((visibility("hidden"))); - -#ifdef CONFIG_HPET_TIMER -static notrace cycle_t vread_hpet(void) -{ - return read_hpet_counter((const void *)(&hpet_page)); -} -#endif - notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S index e177c08bb4bc..2ec72f651ebf 100644 --- a/arch/x86/vdso/vdso-layout.lds.S +++ b/arch/x86/vdso/vdso-layout.lds.S @@ -47,7 +47,6 @@ SECTIONS .text : { *(.text*) } :text =0x90909090, -#ifdef BUILD_VDSO32 /* * The remainder of the vDSO consists of special pages that are * shared between the kernel and userspace. It needs to be at the @@ -69,7 +68,6 @@ SECTIONS hpet_page = .; . = . + PAGE_SIZE; -#endif . = ALIGN(PAGE_SIZE); end_mapping = .; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 86e02eabb640..3060568248d3 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1494,7 +1494,7 @@ static int xen_pgd_alloc(struct mm_struct *mm) page->private = (unsigned long)user_pgd; if (user_pgd != NULL) { - user_pgd[pgd_index(VSYSCALL_START)] = + user_pgd[pgd_index(VSYSCALL_ADDR)] = __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); ret = 0; } @@ -2062,8 +2062,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) case FIX_KMAP_BEGIN ... FIX_KMAP_END: # endif #else - case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: - case VVAR_PAGE: + case VSYSCALL_PAGE: #endif case FIX_TEXT_POKE0: case FIX_TEXT_POKE1: @@ -2104,8 +2103,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #ifdef CONFIG_X86_64 /* Replicate changes to map the vsyscall page into the user pagetable vsyscall mapping. */ - if ((idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) || - idx == VVAR_PAGE) { + if (idx == VSYSCALL_PAGE) { unsigned long vaddr = __fix_to_virt(idx); set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); } -- cgit v1.2.3 From 2b6f2e649f7376d9871df63a9aa0b41efd464d74 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 May 2014 12:19:37 -0700 Subject: x86, vdso: Remove vestiges of VDSO_PRELINK and some outdated comments These definitions had no effect. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/946c104e40c47319f8ab406e54118799cb55bd99.1399317206.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vdso.lds.S | 7 +------ arch/x86/vdso/vdso32/vdso32.lds.S | 5 +---- arch/x86/vdso/vdsox32.lds.S | 7 +------ 3 files changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S index b96b2677cad8..75e3404c83b1 100644 --- a/arch/x86/vdso/vdso.lds.S +++ b/arch/x86/vdso/vdso.lds.S @@ -1,14 +1,11 @@ /* * Linker script for 64-bit vDSO. * We #include the file to define the layout details. - * Here we only choose the prelinked virtual address. * * This file defines the version script giving the user-exported symbols in - * the DSO. We can define local symbols here called VDSO* to make their - * values visible using the asm-x86/vdso.h macros from the kernel proper. + * the DSO. */ -#define VDSO_PRELINK 0xffffffffff700000 #include "vdso-layout.lds.S" /* @@ -28,5 +25,3 @@ VERSION { local: *; }; } - -VDSO64_PRELINK = VDSO_PRELINK; diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S index f072095d6427..31056cf294bf 100644 --- a/arch/x86/vdso/vdso32/vdso32.lds.S +++ b/arch/x86/vdso/vdso32/vdso32.lds.S @@ -1,17 +1,14 @@ /* * Linker script for 32-bit vDSO. * We #include the file to define the layout details. - * Here we only choose the prelinked virtual address. * * This file defines the version script giving the user-exported symbols in - * the DSO. We can define local symbols here called VDSO* to make their - * values visible using the asm-x86/vdso.h macros from the kernel proper. + * the DSO. */ #include #define BUILD_VDSO32 -#define VDSO_PRELINK 0 #include "../vdso-layout.lds.S" diff --git a/arch/x86/vdso/vdsox32.lds.S b/arch/x86/vdso/vdsox32.lds.S index 62272aa2ae0a..46b991b578a8 100644 --- a/arch/x86/vdso/vdsox32.lds.S +++ b/arch/x86/vdso/vdsox32.lds.S @@ -1,14 +1,11 @@ /* * Linker script for x32 vDSO. * We #include the file to define the layout details. - * Here we only choose the prelinked virtual address. * * This file defines the version script giving the user-exported symbols in - * the DSO. We can define local symbols here called VDSO* to make their - * values visible using the asm-x86/vdso.h macros from the kernel proper. + * the DSO. */ -#define VDSO_PRELINK 0 #include "vdso-layout.lds.S" /* @@ -24,5 +21,3 @@ VERSION { local: *; }; } - -VDSOX32_PRELINK = VDSO_PRELINK; -- cgit v1.2.3 From 1171903d899b1930f502b4c10a2a3565d6603c71 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Fri, 2 May 2014 17:57:47 +0200 Subject: KVM: x86: improve the usability of the 'kvm_pio' tracepoint This patch moves the 'kvm_pio' tracepoint to emulator_pio_in_emulated() and emulator_pio_out_emulated(), and it adds an argument (a pointer to the 'pio_data'). A single 8-bit or 16-bit or 32-bit data item is fetched from 'pio_data' (depending on 'size'), and the value is included in the trace record ('val'). If 'count' is greater than one, this is indicated by the string "(...)" in the trace output. Signed-off-by: Ulrich Obergfell Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 20 ++++++++++++++++---- arch/x86/kvm/x86.c | 4 ++-- 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 545245d7cc63..33574c95220d 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -91,16 +91,21 @@ TRACE_EVENT(kvm_hv_hypercall, /* * Tracepoint for PIO. */ + +#define KVM_PIO_IN 0 +#define KVM_PIO_OUT 1 + TRACE_EVENT(kvm_pio, TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, - unsigned int count), - TP_ARGS(rw, port, size, count), + unsigned int count, void *data), + TP_ARGS(rw, port, size, count, data), TP_STRUCT__entry( __field( unsigned int, rw ) __field( unsigned int, port ) __field( unsigned int, size ) __field( unsigned int, count ) + __field( unsigned int, val ) ), TP_fast_assign( @@ -108,11 +113,18 @@ TRACE_EVENT(kvm_pio, __entry->port = port; __entry->size = size; __entry->count = count; + if (size == 1) + __entry->val = *(unsigned char *)data; + else if (size == 2) + __entry->val = *(unsigned short *)data; + else + __entry->val = *(unsigned int *)data; ), - TP_printk("pio_%s at 0x%x size %d count %d", + TP_printk("pio_%s at 0x%x size %d count %d val 0x%x %s", __entry->rw ? "write" : "read", - __entry->port, __entry->size, __entry->count) + __entry->port, __entry->size, __entry->count, __entry->val, + __entry->count > 1 ? "(...)" : "") ); /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c5582c385bc0..de0931cb3f58 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4480,8 +4480,6 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, unsigned short port, void *val, unsigned int count, bool in) { - trace_kvm_pio(!in, port, size, count); - vcpu->arch.pio.port = port; vcpu->arch.pio.in = in; vcpu->arch.pio.count = count; @@ -4516,6 +4514,7 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, if (ret) { data_avail: memcpy(val, vcpu->arch.pio_data, size * count); + trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data); vcpu->arch.pio.count = 0; return 1; } @@ -4530,6 +4529,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); memcpy(vcpu->arch.pio_data, val, size * count); + trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } -- cgit v1.2.3 From 19677e32fe7d6913e07ce80f6f3dc7663ac7fe67 Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Tue, 6 May 2014 02:19:15 -0400 Subject: KVM: nVMX: rearrange get_vmx_mem_address Our common function for vmptr checks (in 2/4) needs to fetch the memory address Signed-off-by: Bandan Das Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 106 ++++++++++++++++++++++++++--------------------------- 1 file changed, 53 insertions(+), 53 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 72b8012991b9..917a15efc45b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5791,6 +5791,59 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +/* + * Decode the memory-address operand of a vmx instruction, as recorded on an + * exit caused by such an instruction (run by a guest hypervisor). + * On success, returns 0. When the operand is invalid, returns 1 and throws + * #UD or #GP. + */ +static int get_vmx_mem_address(struct kvm_vcpu *vcpu, + unsigned long exit_qualification, + u32 vmx_instruction_info, gva_t *ret) +{ + /* + * According to Vol. 3B, "Information for VM Exits Due to Instruction + * Execution", on an exit, vmx_instruction_info holds most of the + * addressing components of the operand. Only the displacement part + * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). + * For how an actual address is calculated from all these components, + * refer to Vol. 1, "Operand Addressing". + */ + int scaling = vmx_instruction_info & 3; + int addr_size = (vmx_instruction_info >> 7) & 7; + bool is_reg = vmx_instruction_info & (1u << 10); + int seg_reg = (vmx_instruction_info >> 15) & 7; + int index_reg = (vmx_instruction_info >> 18) & 0xf; + bool index_is_valid = !(vmx_instruction_info & (1u << 22)); + int base_reg = (vmx_instruction_info >> 23) & 0xf; + bool base_is_valid = !(vmx_instruction_info & (1u << 27)); + + if (is_reg) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + /* Addr = segment_base + offset */ + /* offset = base + [index * scale] + displacement */ + *ret = vmx_get_segment_base(vcpu, seg_reg); + if (base_is_valid) + *ret += kvm_register_read(vcpu, base_reg); + if (index_is_valid) + *ret += kvm_register_read(vcpu, index_reg)<> 7) & 7; - bool is_reg = vmx_instruction_info & (1u << 10); - int seg_reg = (vmx_instruction_info >> 15) & 7; - int index_reg = (vmx_instruction_info >> 18) & 0xf; - bool index_is_valid = !(vmx_instruction_info & (1u << 22)); - int base_reg = (vmx_instruction_info >> 23) & 0xf; - bool base_is_valid = !(vmx_instruction_info & (1u << 27)); - - if (is_reg) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - /* Addr = segment_base + offset */ - /* offset = base + [index * scale] + displacement */ - *ret = vmx_get_segment_base(vcpu, seg_reg); - if (base_is_valid) - *ret += kvm_register_read(vcpu, base_reg); - if (index_is_valid) - *ret += kvm_register_read(vcpu, index_reg)< Date: Tue, 6 May 2014 02:19:16 -0400 Subject: KVM: nVMX: additional checks on vmxon region Currently, the vmxon region isn't used in the nested case. However, according to the spec, the vmxon instruction performs additional sanity checks on this region and the associated pointer. Modify emulated vmxon to better adhere to the spec requirements Signed-off-by: Bandan Das Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 1 + arch/x86/kvm/vmx.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 333b88db22fe..17b42fabc842 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -732,6 +732,7 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) not_found: return 36; } +EXPORT_SYMBOL_GPL(cpuid_maxphyaddr); /* * If no match is found, check whether we exceed the vCPU's limit diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 917a15efc45b..0f7934767a2f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -354,6 +354,7 @@ struct vmcs02_list { struct nested_vmx { /* Has the level1 guest done vmxon? */ bool vmxon; + gpa_t vmxon_ptr; /* The guest-physical address of the current VMCS L1 keeps for L2 */ gpa_t current_vmptr; @@ -5844,6 +5845,68 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, return 0; } +/* + * This function performs the various checks including + * - if it's 4KB aligned + * - No bits beyond the physical address width are set + * - Returns 0 on success or else 1 + */ +static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason) +{ + gva_t gva; + gpa_t vmptr; + struct x86_exception e; + struct page *page; + struct vcpu_vmx *vmx = to_vmx(vcpu); + int maxphyaddr = cpuid_maxphyaddr(vcpu); + + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) + return 1; + + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, + sizeof(vmptr), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (exit_reason) { + case EXIT_REASON_VMON: + /* + * SDM 3: 24.11.5 + * The first 4 bytes of VMXON region contain the supported + * VMCS revision identifier + * + * Note - IA32_VMX_BASIC[48] will never be 1 + * for the nested case; + * which replaces physical address width with 32 + * + */ + if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + nested_vmx_failInvalid(vcpu); + skip_emulated_instruction(vcpu); + return 1; + } + + page = nested_get_page(vcpu, vmptr); + if (page == NULL || + *(u32 *)kmap(page) != VMCS12_REVISION) { + nested_vmx_failInvalid(vcpu); + kunmap(page); + skip_emulated_instruction(vcpu); + return 1; + } + kunmap(page); + vmx->nested.vmxon_ptr = vmptr; + break; + + default: + return 1; /* shouldn't happen */ + } + + return 0; +} + /* * Emulate the VMXON instruction. * Currently, we just remember that VMX is active, and do not save or even @@ -5882,6 +5945,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu) kvm_inject_gp(vcpu, 0); return 1; } + + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON)) + return 1; + if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); skip_emulated_instruction(vcpu); -- cgit v1.2.3 From 96ec146330d18a938b4773be8d6dd1f93399507c Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Tue, 6 May 2014 02:19:17 -0400 Subject: KVM: nVMX: fail on invalid vmclear/vmptrld pointer The spec mandates that if the vmptrld or vmclear address is equal to the vmxon region pointer, the instruction should fail with error "VMPTRLD with VMXON pointer" or "VMCLEAR with VMXON pointer" Signed-off-by: Bandan Das Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0f7934767a2f..1d7e7279f1b4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6100,6 +6100,12 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) return 1; } + if (vmptr == vmx->nested.vmxon_ptr) { + nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); + skip_emulated_instruction(vcpu); + return 1; + } + if (vmptr == vmx->nested.current_vmptr) { nested_release_vmcs12(vmx); vmx->nested.current_vmptr = -1ull; @@ -6443,6 +6449,12 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return 1; } + if (vmptr == vmx->nested.vmxon_ptr) { + nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); + skip_emulated_instruction(vcpu); + return 1; + } + if (vmx->nested.current_vmptr != vmptr) { struct vmcs12 *new_vmcs12; struct page *page; -- cgit v1.2.3 From 4291b58885f5af560488a5b9667ca6930b9fdc3d Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Tue, 6 May 2014 02:19:18 -0400 Subject: KVM: nVMX: move vmclear and vmptrld pre-checks to nested_vmx_check_vmptr Some checks are common to all, and moreover, according to the spec, the check for whether any bits beyond the physical address width are set are also applicable to all of them Signed-off-by: Bandan Das Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 83 ++++++++++++++++++++++++------------------------------ 1 file changed, 37 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1d7e7279f1b4..a5fd47e4abfc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5850,8 +5850,10 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, * - if it's 4KB aligned * - No bits beyond the physical address width are set * - Returns 0 on success or else 1 + * (Intel SDM Section 30.3) */ -static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason) +static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, + gpa_t *vmpointer) { gva_t gva; gpa_t vmptr; @@ -5899,11 +5901,42 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason) kunmap(page); vmx->nested.vmxon_ptr = vmptr; break; + case EXIT_REASON_VMCLEAR: + if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_INVALID_ADDRESS); + skip_emulated_instruction(vcpu); + return 1; + } + if (vmptr == vmx->nested.vmxon_ptr) { + nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_VMXON_POINTER); + skip_emulated_instruction(vcpu); + return 1; + } + break; + case EXIT_REASON_VMPTRLD: + if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_INVALID_ADDRESS); + skip_emulated_instruction(vcpu); + return 1; + } + + if (vmptr == vmx->nested.vmxon_ptr) { + nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_VMXON_POINTER); + skip_emulated_instruction(vcpu); + return 1; + } + break; default: return 1; /* shouldn't happen */ } + if (vmpointer) + *vmpointer = vmptr; return 0; } @@ -5946,7 +5979,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON)) + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) return 1; if (vmx->nested.vmxon) { @@ -6075,37 +6108,16 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) static int handle_vmclear(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - gva_t gva; gpa_t vmptr; struct vmcs12 *vmcs12; struct page *page; - struct x86_exception e; if (!nested_vmx_check_permission(vcpu)) return 1; - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) return 1; - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, - sizeof(vmptr), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - - if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { - nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); - return 1; - } - - if (vmptr == vmx->nested.vmxon_ptr) { - nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - skip_emulated_instruction(vcpu); - return 1; - } - if (vmptr == vmx->nested.current_vmptr) { nested_release_vmcs12(vmx); vmx->nested.current_vmptr = -1ull; @@ -6425,35 +6437,14 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) static int handle_vmptrld(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - gva_t gva; gpa_t vmptr; - struct x86_exception e; u32 exec_control; if (!nested_vmx_check_permission(vcpu)) return 1; - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) - return 1; - - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, - sizeof(vmptr), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - - if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { - nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr)) return 1; - } - - if (vmptr == vmx->nested.vmxon_ptr) { - nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - skip_emulated_instruction(vcpu); - return 1; - } if (vmx->nested.current_vmptr != vmptr) { struct vmcs12 *new_vmcs12; -- cgit v1.2.3 From a4ab9d0cf1ef0bf521bb69099aa464f38c71393c Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 7 May 2014 15:32:49 +0300 Subject: KVM: vmx: handle_dr does not handle RSP correctly The RSP register is not automatically cached, causing mov DR instruction with RSP to fail. Instead the regular register accessing interface should be used. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a5fd47e4abfc..61e818d80732 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5143,7 +5143,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) return 1; kvm_register_write(vcpu, reg, val); } else - if (kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg])) + if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg))) return 1; skip_emulated_instruction(vcpu); -- cgit v1.2.3 From 5f7dde7bbb3c628766676cbd63c0a1834035d6fa Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 7 May 2014 15:32:50 +0300 Subject: KVM: x86: Mark bit 7 in long-mode PDPTE according to 1GB pages support In long-mode, bit 7 in the PDPTE is not reserved only if 1GB pages are supported by the CPU. Currently the bit is considered by KVM as always reserved. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.h | 7 +++++++ arch/x86/kvm/mmu.c | 8 ++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index eeecbed26ac7..f9087315e0cd 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -88,4 +88,11 @@ static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu) return best && (best->ecx & bit(X86_FEATURE_X2APIC)); } +static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->edx & bit(X86_FEATURE_GBPAGES)); +} #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 65f2400b8268..931467881da7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -22,6 +22,7 @@ #include "mmu.h" #include "x86.h" #include "kvm_cache_regs.h" +#include "cpuid.h" #include #include @@ -3516,11 +3517,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, { int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; + u64 gbpages_bit_rsvd = 0; context->bad_mt_xwr = 0; if (!context->nx) exb_bit_rsvd = rsvd_bits(63, 63); + if (!guest_cpuid_has_gbpages(vcpu)) + gbpages_bit_rsvd = rsvd_bits(7, 7); switch (context->root_level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ @@ -3557,14 +3561,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, context->rsvd_bits_mask[0][3] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7); context->rsvd_bits_mask[0][2] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7); + gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][0] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; context->rsvd_bits_mask[1][2] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | + gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 29); context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | -- cgit v1.2.3 From b63cf42fd1d8c18fab71222321aaf356f63089c9 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 7 May 2014 16:29:48 +0300 Subject: kvm/x86: implement hv EOI assist It seems that it's easy to implement the EOI assist on top of the PV EOI feature: simply convert the page address to the format expected by PV EOI. Notes: -"No EOI required" is set only if interrupt injected is edge triggered; this is true because level interrupts are going through IOAPIC which disables PV EOI. In any case, if guest triggers EOI the bit will get cleared on exit. -For migration, set of HV_X64_MSR_APIC_ASSIST_PAGE sets KVM_PV_EOI_EN internally, so restoring HV_X64_MSR_APIC_ASSIST_PAGE seems sufficient In any case, bit is cleared on exit so worst case it's never re-enabled -no handling of PV EOI data is performed at HV_X64_MSR_EOI write; HV_X64_MSR_EOI is a separate optimization - it's an X2APIC replacement that lets you do EOI with an MSR and not IO. Signed-off-by: Michael S. Tsirkin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index de0931cb3f58..41f673facf2f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1917,6 +1917,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { vcpu->arch.hv_vapic = data; + if (kvm_lapic_enable_pv_eoi(vcpu, 0)) + return 1; break; } gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; @@ -1927,6 +1929,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 1; vcpu->arch.hv_vapic = data; mark_page_dirty(vcpu->kvm, gfn); + if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) + return 1; break; } case HV_X64_MSR_EOI: -- cgit v1.2.3 From f80c5b39b80ab4d52acd940c03b89948cafdbd18 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Apr 2014 09:59:05 +0200 Subject: sched/idle, x86: Switch from TS_POLLING to TIF_POLLING_NRFLAG Standardize the idle polling indicator to TIF_POLLING_NRFLAG such that both TIF_NEED_RESCHED and TIF_POLLING_NRFLAG are in the same word. This will allow us, using fetch_or(), to both set NEED_RESCHED and check for POLLING_NRFLAG in a single operation and avoid pointless wakeups. Changing from the non-atomic thread_info::status flags to the atomic thread_info::flags shouldn't be a big issue since most polling state changes were followed/preceded by a full memory barrier anyway. Also, fix up the apm_32 idle function, clearly that was forgotten in the last conversion. The default idle state is !POLLING so just kill the lot. Signed-off-by: Peter Zijlstra Cc: Andy Lutomirski Cc: Jiri Kosina Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Link: http://lkml.kernel.org/n/tip-7yksmqtlv4nfowmlqr1rifoi@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/thread_info.h | 4 ++-- arch/x86/kernel/apm_32.c | 11 ----------- 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 47e5de25ba79..854053889d4d 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -83,6 +83,7 @@ struct thread_info { #define TIF_FORK 18 /* ret_from_fork */ #define TIF_NOHZ 19 /* in adaptive nohz mode */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ +#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ @@ -106,6 +107,7 @@ struct thread_info { #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) #define _TIF_NOHZ (1 << TIF_NOHZ) +#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) @@ -191,8 +193,6 @@ static inline struct thread_info *current_thread_info(void) * have to worry about atomic accesses. */ #define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ -#define TS_POLLING 0x0004 /* idle task polling need_resched, - skip sending interrupt */ #define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */ #ifndef __ASSEMBLY__ diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 3ab03430211d..f3a1f04ed4cb 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -844,21 +844,10 @@ static int apm_do_idle(void) int polling; int err = 0; - polling = !!(current_thread_info()->status & TS_POLLING); - if (polling) { - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); - } if (!need_resched()) { idled = 1; ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err); } - if (polling) - current_thread_info()->status |= TS_POLLING; if (!idled) return 0; -- cgit v1.2.3 From 87c00572ba05aa8c9db118da75c608f47eb10b9e Mon Sep 17 00:00:00 2001 From: "Gabriel L. Somlo" Date: Wed, 7 May 2014 16:52:13 -0400 Subject: kvm: x86: emulate monitor and mwait instructions as nop Treat monitor and mwait instructions as nop, which is architecturally correct (but inefficient) behavior. We do this to prevent misbehaving guests (e.g. OS X <= 10.7) from crashing after they fail to check for monitor/mwait availability via cpuid. Since mwait-based idle loops relying on these nop-emulated instructions would keep the host CPU pegged at 100%, do NOT advertise their presence via cpuid, to prevent compliant guests from using them inadvertently. Signed-off-by: Gabriel L. Somlo Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 ++ arch/x86/kvm/svm.c | 28 ++++++++++++++++++++-------- arch/x86/kvm/vmx.c | 20 ++++++++++++++++---- 3 files changed, 38 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 17b42fabc842..38a0afe83c6b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -283,6 +283,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = + /* NOTE: MONITOR (and MWAIT) are emulated as NOP, + * but *not* advertised to guests via CPUID ! */ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7f4f9c2badae..0b7d58d0c5fb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2770,12 +2770,6 @@ static int xsetbv_interception(struct vcpu_svm *svm) return 1; } -static int invalid_op_interception(struct vcpu_svm *svm) -{ - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; -} - static int task_switch_interception(struct vcpu_svm *svm) { u16 tss_selector; @@ -3287,6 +3281,24 @@ static int pause_interception(struct vcpu_svm *svm) return 1; } +static int nop_interception(struct vcpu_svm *svm) +{ + skip_emulated_instruction(&(svm->vcpu)); + return 1; +} + +static int monitor_interception(struct vcpu_svm *svm) +{ + printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); + return nop_interception(svm); +} + +static int mwait_interception(struct vcpu_svm *svm) +{ + printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); + return nop_interception(svm); +} + static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -3344,8 +3356,8 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_CLGI] = clgi_interception, [SVM_EXIT_SKINIT] = skinit_interception, [SVM_EXIT_WBINVD] = emulate_on_interception, - [SVM_EXIT_MONITOR] = invalid_op_interception, - [SVM_EXIT_MWAIT] = invalid_op_interception, + [SVM_EXIT_MONITOR] = monitor_interception, + [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_NPF] = pf_interception, }; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 61e818d80732..6f7463f53ed9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5650,12 +5650,24 @@ static int handle_pause(struct kvm_vcpu *vcpu) return 1; } -static int handle_invalid_op(struct kvm_vcpu *vcpu) +static int handle_nop(struct kvm_vcpu *vcpu) { - kvm_queue_exception(vcpu, UD_VECTOR); + skip_emulated_instruction(vcpu); return 1; } +static int handle_mwait(struct kvm_vcpu *vcpu) +{ + printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); + return handle_nop(vcpu); +} + +static int handle_monitor(struct kvm_vcpu *vcpu) +{ + printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); + return handle_nop(vcpu); +} + /* * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. * We could reuse a single VMCS for all the L2 guests, but we also want the @@ -6617,8 +6629,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, - [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, - [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, [EXIT_REASON_INVEPT] = handle_invept, }; -- cgit v1.2.3 From 3d379225c458097d41d104b4f78f40ee97719333 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 25 Apr 2014 13:46:11 -0400 Subject: x86, boot: Do not include boot.h in string.c string.c does not require whole of boot.h. Just inclusion of linux/types.h and ctypes.h seems to be sufficient. Keep list of stuff being included in string.c to bare minimal so that string.c can be included in other places easily. For example, Currently boot/compressed/string.c includes boot/string.c but looks like it does not want boot/boot.h. Hence there is a define in boot/compressed/misc.h "define BOOT_BOOT_H" which prevents inclusion of boot.h in compressed/string.c. And compressed/string.c is forced to include misc.h just for that reason. So by removing inclusion of boot.h, we can also get rid of inclusion of misch.h in compressed/misc.c. This also enables including of boot/string.c in purgatory/ code relatively easily. Signed-off-by: Vivek Goyal Link: http://lkml.kernel.org/r/1398447972-27896-2-git-send-email-vgoyal@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/boot/string.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 5339040ef86e..aca52b83e31d 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -12,7 +12,8 @@ * Very basic string functions */ -#include "boot.h" +#include +#include "ctype.h" /* * This file gets included in compressed/string.c which might pull in -- cgit v1.2.3 From a9a17104a112a67a7bf0679b734704c130eb5faa Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 25 Apr 2014 13:46:12 -0400 Subject: x86, boot: Remove misc.h inclusion from compressed/string.c Given the fact that we removed inclusion of boot.h from boot/string.c does not look like we need misc.h inclusion in compressed/string.c. So remove it. misc.h was also pulling in string_32.h which in turn had macros for memcmp and memcpy. So we don't need to #undef memcmp and memcpy anymore. Signed-off-by: Vivek Goyal Link: http://lkml.kernel.org/r/1398447972-27896-3-git-send-email-vgoyal@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/string.c | 4 ---- arch/x86/boot/string.c | 6 ------ 2 files changed, 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index f3c57e341402..00e788be1db9 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,9 +1,5 @@ -#include "misc.h" #include "../string.c" -/* misc.h might pull in string_32.h which has a macro for memcpy. undef that */ -#undef memcpy - #ifdef CONFIG_X86_32 void *memcpy(void *dest, const void *src, size_t n) { diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index aca52b83e31d..493f3fd9f139 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -15,12 +15,6 @@ #include #include "ctype.h" -/* - * This file gets included in compressed/string.c which might pull in - * string_32.h and which in turn maps memcmp to __builtin_memcmp(). Undo - * that first. - */ -#undef memcmp int memcmp(const void *s1, const void *s2, size_t len) { u8 diff; -- cgit v1.2.3 From 6b8f0c8780c71d78624f736d7849645b64cc88b7 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Fri, 9 May 2014 13:44:05 -0700 Subject: x86, iosf: Make IOSF driver modular and usable by more drivers Currently drivers that run on non-IOSF systems (Core/Xeon) can't use the IOSF driver on SOC's without selecting it which forces an unnecessary and limiting dependency. Provides dummy functions to allow these modules to conditionally use the driver on IOSF equipped platforms without impacting their ability to compile and load on non-IOSF platforms. Build default m to ensure availability on x86 SOC's. Signed-off-by: David E. Box Link: http://lkml.kernel.org/r/1399668248-24199-2-git-send-email-david.e.box@linux.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 7 ++----- arch/x86/include/asm/iosf_mbi.h | 33 +++++++++++++++++++++++++++++++++ arch/x86/kernel/iosf_mbi.c | 7 +++++++ 3 files changed, 42 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 25d2c6f7325e..f1304d38aa21 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2375,12 +2375,9 @@ config X86_DMA_REMAP depends on STA2X11 config IOSF_MBI - bool + tristate + default m depends on PCI - ---help--- - To be selected by modules requiring access to the Intel OnChip System - Fabric (IOSF) Sideband MailBox Interface (MBI). For MBI platforms - enumerable by PCI. source "net/Kconfig" diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h index 8e71c7941767..1a91a3698b1e 100644 --- a/arch/x86/include/asm/iosf_mbi.h +++ b/arch/x86/include/asm/iosf_mbi.h @@ -50,6 +50,10 @@ #define BT_MBI_PCIE_READ 0x00 #define BT_MBI_PCIE_WRITE 0x01 +#if IS_ENABLED(CONFIG_IOSF_MBI) + +bool iosf_mbi_available(void); + /** * iosf_mbi_read() - MailBox Interface read command * @port: port indicating subunit being accessed @@ -87,4 +91,33 @@ int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr); */ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask); +#else /* CONFIG_IOSF_MBI is not enabled */ +static inline +bool iosf_mbi_available(void) +{ + return false; +} + +static inline +int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr) +{ + WARN(1, "IOSF_MBI driver not available"); + return -EPERM; +} + +static inline +int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr) +{ + WARN(1, "IOSF_MBI driver not available"); + return -EPERM; +} + +static inline +int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask) +{ + WARN(1, "IOSF_MBI driver not available"); + return -EPERM; +} +#endif /* CONFIG_IOSF_MBI */ + #endif /* IOSF_MBI_SYMS_H */ diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c index c3aae6672843..f4ff9786a620 100644 --- a/arch/x86/kernel/iosf_mbi.c +++ b/arch/x86/kernel/iosf_mbi.c @@ -177,6 +177,13 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask) } EXPORT_SYMBOL(iosf_mbi_modify); +bool iosf_mbi_available(void) +{ + /* Mbi isn't hot-pluggable. No remove routine is provided */ + return mbi_pdev; +} +EXPORT_SYMBOL(iosf_mbi_available); + static int iosf_mbi_probe(struct pci_dev *pdev, const struct pci_device_id *unused) { -- cgit v1.2.3 From 7ef1def800e907edd28ddb1a5c64bae6b8749cdd Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Fri, 9 May 2014 13:44:06 -0700 Subject: x86, iosf: Added Quark MBI identifiers Added all the MBI units below and their associated read/write opcodes: - Host Bridge Arbiter - Host Bridge - Remote Management Unit - Memory Manager & eSRAM - SoC Unit Signed-off-by: Ong Boon Leong Link: http://lkml.kernel.org/r/1399668248-24199-3-git-send-email-david.e.box@linux.intel.com Signed-off-by: David E. Box Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/iosf_mbi.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h index 1a91a3698b1e..57995f0596a6 100644 --- a/arch/x86/include/asm/iosf_mbi.h +++ b/arch/x86/include/asm/iosf_mbi.h @@ -50,6 +50,28 @@ #define BT_MBI_PCIE_READ 0x00 #define BT_MBI_PCIE_WRITE 0x01 +/* Quark available units */ +#define QRK_MBI_UNIT_HBA 0x00 +#define QRK_MBI_UNIT_HB 0x03 +#define QRK_MBI_UNIT_RMU 0x04 +#define QRK_MBI_UNIT_MM 0x05 +#define QRK_MBI_UNIT_MMESRAM 0x05 +#define QRK_MBI_UNIT_SOC 0x31 + +/* Quark read/write opcodes */ +#define QRK_MBI_HBA_READ 0x10 +#define QRK_MBI_HBA_WRITE 0x11 +#define QRK_MBI_HB_READ 0x10 +#define QRK_MBI_HB_WRITE 0x11 +#define QRK_MBI_RMU_READ 0x10 +#define QRK_MBI_RMU_WRITE 0x11 +#define QRK_MBI_MM_READ 0x10 +#define QRK_MBI_MM_WRITE 0x11 +#define QRK_MBI_MMESRAM_READ 0x12 +#define QRK_MBI_MMESRAM_WRITE 0x13 +#define QRK_MBI_SOC_READ 0x06 +#define QRK_MBI_SOC_WRITE 0x07 + #if IS_ENABLED(CONFIG_IOSF_MBI) bool iosf_mbi_available(void); -- cgit v1.2.3 From 90916e048c1e0c1d379577e43ab9b8e331490cfb Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Fri, 9 May 2014 13:44:07 -0700 Subject: x86, iosf: Add Quark X1000 PCI ID Add PCI device ID, i.e. that of the Host Bridge, for IOSF MBI driver. Signed-off-by: Ong Boon Leong Link: http://lkml.kernel.org/r/1399668248-24199-4-git-send-email-david.e.box@linux.intel.com Signed-off-by: David E. Box Signed-off-by: H. Peter Anvin --- arch/x86/kernel/iosf_mbi.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c index f4ff9786a620..201a7abb1ab1 100644 --- a/arch/x86/kernel/iosf_mbi.c +++ b/arch/x86/kernel/iosf_mbi.c @@ -201,6 +201,7 @@ static int iosf_mbi_probe(struct pci_dev *pdev, static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0F00) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0958) }, { 0, }, }; MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids); -- cgit v1.2.3 From 04725ad59474d24553d526fa774179ecd2922342 Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Fri, 9 May 2014 13:44:08 -0700 Subject: x86, iosf: Add PCI ID macros for better readability Introduce PCI IDs macro for the list of supported product: BayTrail & Quark X1000. Signed-off-by: Ong Boon Leong Link: http://lkml.kernel.org/r/1399668248-24199-5-git-send-email-david.e.box@linux.intel.com Signed-off-by: David E. Box Signed-off-by: H. Peter Anvin --- arch/x86/kernel/iosf_mbi.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c index 201a7abb1ab1..d30acdc1229d 100644 --- a/arch/x86/kernel/iosf_mbi.c +++ b/arch/x86/kernel/iosf_mbi.c @@ -25,6 +25,9 @@ #include +#define PCI_DEVICE_ID_BAYTRAIL 0x0F00 +#define PCI_DEVICE_ID_QUARK_X1000 0x0958 + static DEFINE_SPINLOCK(iosf_mbi_lock); static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset) @@ -200,8 +203,8 @@ static int iosf_mbi_probe(struct pci_dev *pdev, } static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0F00) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0958) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) }, { 0, }, }; MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids); -- cgit v1.2.3 From aa8532c32216ae07c3813b9aeb774517878a7573 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 8 May 2014 11:09:23 +0100 Subject: xen: refactor suspend pre/post hooks New architectures currently have to provide implementations of 5 different functions: xen_arch_pre_suspend(), xen_arch_post_suspend(), xen_arch_hvm_post_suspend(), xen_mm_pin_all(), and xen_mm_unpin_all(). Refactor the suspend code to only require xen_arch_pre_suspend() and xen_arch_post_suspend(). Signed-off-by: David Vrabel Reviewed-by: Boris Ostrovsky --- arch/x86/xen/suspend.c | 23 ++++++++++++++++++++--- arch/x86/xen/xen-ops.h | 2 ++ drivers/xen/manage.c | 45 +++++++-------------------------------------- include/xen/xen-ops.h | 4 ---- 4 files changed, 29 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 45329c8c226e..c4df9dbd63b7 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -12,8 +12,10 @@ #include "xen-ops.h" #include "mmu.h" -void xen_arch_pre_suspend(void) +static void xen_pv_pre_suspend(void) { + xen_mm_pin_all(); + xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); xen_start_info->console.domU.mfn = mfn_to_pfn(xen_start_info->console.domU.mfn); @@ -26,7 +28,7 @@ void xen_arch_pre_suspend(void) BUG(); } -void xen_arch_hvm_post_suspend(int suspend_cancelled) +static void xen_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; @@ -41,7 +43,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled) #endif } -void xen_arch_post_suspend(int suspend_cancelled) +static void xen_pv_post_suspend(int suspend_cancelled) { xen_build_mfn_list_list(); @@ -60,6 +62,21 @@ void xen_arch_post_suspend(int suspend_cancelled) xen_vcpu_restore(); } + xen_mm_unpin_all(); +} + +void xen_arch_pre_suspend(void) +{ + if (xen_pv_domain()) + xen_pv_pre_suspend(); +} + +void xen_arch_post_suspend(int cancelled) +{ + if (xen_pv_domain()) + xen_pv_post_suspend(cancelled); + else + xen_hvm_post_suspend(cancelled); } static void xen_vcpu_notify_restore(void *data) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 1cb6f4c37300..c834d4b231f0 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -31,6 +31,8 @@ void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_reserve_top(void); extern unsigned long xen_max_p2m_pfn; +void xen_mm_pin_all(void); +void xen_mm_unpin_all(void); void xen_set_pat(u64); char * __init xen_memory_setup(void); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 32f9236c959f..c3667b202f2f 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -41,9 +41,6 @@ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; struct suspend_info { int cancelled; - unsigned long arg; /* extra hypercall argument */ - void (*pre)(void); - void (*post)(int cancelled); }; static RAW_NOTIFIER_HEAD(xen_resume_notifier); @@ -61,26 +58,6 @@ void xen_resume_notifier_unregister(struct notifier_block *nb) EXPORT_SYMBOL_GPL(xen_resume_notifier_unregister); #ifdef CONFIG_HIBERNATE_CALLBACKS -static void xen_hvm_post_suspend(int cancelled) -{ - xen_arch_hvm_post_suspend(cancelled); - gnttab_resume(); -} - -static void xen_pre_suspend(void) -{ - xen_mm_pin_all(); - gnttab_suspend(); - xen_arch_pre_suspend(); -} - -static void xen_post_suspend(int cancelled) -{ - xen_arch_post_suspend(cancelled); - gnttab_resume(); - xen_mm_unpin_all(); -} - static int xen_suspend(void *data) { struct suspend_info *si = data; @@ -94,18 +71,20 @@ static int xen_suspend(void *data) return err; } - if (si->pre) - si->pre(); + gnttab_suspend(); + xen_arch_pre_suspend(); /* * This hypercall returns 1 if suspend was cancelled * or the domain was merely checkpointed, and 0 if it * is resuming in a new domain. */ - si->cancelled = HYPERVISOR_suspend(si->arg); + si->cancelled = HYPERVISOR_suspend(xen_pv_domain() + ? virt_to_mfn(xen_start_info) + : 0); - if (si->post) - si->post(si->cancelled); + xen_arch_post_suspend(si->cancelled); + gnttab_resume(); if (!si->cancelled) { xen_irq_resume(); @@ -154,16 +133,6 @@ static void do_suspend(void) si.cancelled = 1; - if (xen_hvm_domain()) { - si.arg = 0UL; - si.pre = NULL; - si.post = &xen_hvm_post_suspend; - } else { - si.arg = virt_to_mfn(xen_start_info); - si.pre = &xen_pre_suspend; - si.post = &xen_post_suspend; - } - err = stop_machine(xen_suspend, &si, cpumask_of(0)); raw_notifier_call_chain(&xen_resume_notifier, 0, NULL); diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 2cf47175b12b..0b3149ed7eaa 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -9,10 +9,6 @@ DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); void xen_arch_pre_suspend(void); void xen_arch_post_suspend(int suspend_cancelled); -void xen_arch_hvm_post_suspend(int suspend_cancelled); - -void xen_mm_pin_all(void); -void xen_mm_unpin_all(void); void xen_timer_resume(void); void xen_arch_resume(void); -- cgit v1.2.3 From d9f89b88f5102ce235b75a5907838e3c7ed84b97 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 10 May 2014 09:24:34 +0200 Subject: KVM: x86: Fix CR3 reserved bits check in long mode Regression of 346874c9: PAE is set in long mode, but that does not mean we have valid PDPTRs. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 41f673facf2f..fb313fc896dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -701,10 +701,11 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) return 0; } - if (is_long_mode(vcpu) && (cr3 & CR3_L_MODE_RESERVED_BITS)) - return 1; - if (is_pae(vcpu) && is_paging(vcpu) && - !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + if (is_long_mode(vcpu)) { + if (cr3 & CR3_L_MODE_RESERVED_BITS) + return 1; + } else if (is_pae(vcpu) && is_paging(vcpu) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; vcpu->arch.cr3 = cr3; -- cgit v1.2.3 From bc5eb20161ad742a38b7b4deda393e577ade669b Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Tue, 13 May 2014 18:56:25 +0200 Subject: xen/x86: set panic notifier priority to minimum MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Execution is not going to continue after telling Xen about the crash. Let other panic notifiers run by postponing the final hypercall as much as possible. Signed-off-by: Andrew Jones Signed-off-by: Radim Krčmář Signed-off-by: David Vrabel --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 201d09a7c46b..662bdcbbbd18 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1339,6 +1339,7 @@ xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) static struct notifier_block xen_panic_block = { .notifier_call= xen_panic_event, + .priority = INT_MIN }; int xen_panic_handler_init(void) -- cgit v1.2.3 From fcca2e3119f3771dcfd0b03b3718b3c51b5f21c3 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Tue, 7 Jan 2014 11:53:35 +0000 Subject: x86/xen: rename early_p2m_alloc() and early_p2m_alloc_middle() early_p2m_alloc_middle() allocates a new leaf page and early_p2m_alloc() allocates a new middle page. This is confusing. Swap the names so they match what the functions actually do. Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 85e5d78c9874..4fc71cc1705b 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -596,7 +596,7 @@ static bool alloc_p2m(unsigned long pfn) return true; } -static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary) +static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary) { unsigned topidx, mididx, idx; unsigned long *p2m; @@ -638,7 +638,7 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary return true; } -static bool __init early_alloc_p2m(unsigned long pfn) +static bool __init early_alloc_p2m_middle(unsigned long pfn) { unsigned topidx = p2m_top_index(pfn); unsigned long *mid_mfn_p; @@ -663,7 +663,7 @@ static bool __init early_alloc_p2m(unsigned long pfn) p2m_top_mfn_p[topidx] = mid_mfn_p; p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); /* Note: we don't set mid_mfn_p[midix] here, - * look in early_alloc_p2m_middle */ + * look in early_alloc_p2m() */ } return true; } @@ -739,7 +739,7 @@ found: /* This shouldn't happen */ if (WARN_ON(p2m_top[topidx] == p2m_mid_missing)) - early_alloc_p2m(set_pfn); + early_alloc_p2m_middle(set_pfn); if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing)) return false; @@ -754,13 +754,13 @@ found: bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - if (!early_alloc_p2m(pfn)) + if (!early_alloc_p2m_middle(pfn)) return false; if (early_can_reuse_p2m_middle(pfn, mfn)) return __set_phys_to_machine(pfn, mfn); - if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/)) + if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/)) return false; if (!__set_phys_to_machine(pfn, mfn)) -- cgit v1.2.3 From a9b5bff66b2a63f7d0f42434f5da9024b442159c Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 6 Jan 2014 11:55:13 +0000 Subject: x86/xen: fix set_phys_range_identity() if pfn_e > MAX_P2M_PFN Allow set_phys_range_identity() to work with a range that overlaps MAX_P2M_PFN by clamping pfn_e to MAX_P2M_PFN. Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 4fc71cc1705b..82c8c9305510 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -774,7 +774,7 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, { unsigned long pfn; - if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN)) + if (unlikely(pfn_s >= MAX_P2M_PFN)) return 0; if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) @@ -783,6 +783,9 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, if (pfn_s > pfn_e) return 0; + if (pfn_e > MAX_P2M_PFN) + pfn_e = MAX_P2M_PFN; + for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1)); pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE)); pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) -- cgit v1.2.3 From 3cb83e46d032505016ab2565f067e24c8cba9a9d Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Tue, 7 Jan 2014 11:44:32 +0000 Subject: x86/xen: compactly store large identity ranges in the p2m Large (multi-GB) identity ranges currently require a unique middle page (filled with p2m_identity entries) per 1 GB region. Similar to the common p2m_mid_missing middle page for large missing regions, introduce a p2m_mid_identity page (filled with p2m_identity entries) which can be used instead. set_phys_range_identity() thus only needs to allocate new middle pages at the beginning and end of the range. Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 155 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 105 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 82c8c9305510..57001443231e 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -36,7 +36,7 @@ * pfn_to_mfn(0xc0000)=0xc0000 * * The benefit of this is, that we can assume for non-RAM regions (think - * PCI BARs, or ACPI spaces), we can create mappings easily b/c we + * PCI BARs, or ACPI spaces), we can create mappings easily because we * get the PFN value to match the MFN. * * For this to work efficiently we have one new page p2m_identity and @@ -60,7 +60,7 @@ * There is also a digram of the P2M at the end that can help. * Imagine your E820 looking as so: * - * 1GB 2GB + * 1GB 2GB 4GB * /-------------------+---------\/----\ /----------\ /---+-----\ * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM | * \-------------------+---------/\----/ \----------/ \---+-----/ @@ -77,9 +77,8 @@ * of the PFN and the end PFN (263424 and 512256 respectively). The first step * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page * covers 512^2 of page estate (1GB) and in case the start or end PFN is not - * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn - * to end pfn. We reserve_brk top leaf pages if they are missing (means they - * point to p2m_mid_missing). + * aligned on 512^2*PAGE_SIZE (1GB) we reserve_brk new middle and leaf pages as + * required to split any existing p2m_mid_missing middle pages. * * With the E820 example above, 263424 is not 1GB aligned so we allocate a * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000. @@ -88,7 +87,7 @@ * Next stage is to determine if we need to do a more granular boundary check * on the 4MB (or 2MB depending on architecture) off the start and end pfn's. * We check if the start pfn and end pfn violate that boundary check, and if - * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer + * so reserve_brk a (p2m[x][y]) leaf page. This way we have a much finer * granularity of setting which PFNs are missing and which ones are identity. * In our example 263424 and 512256 both fail the check so we reserve_brk two * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing" @@ -102,9 +101,10 @@ * * The next step is to walk from the start pfn to the end pfn setting * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity. - * If we find that the middle leaf is pointing to p2m_missing we can swap it - * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this - * point we do not need to worry about boundary aligment (so no need to + * If we find that the middle entry is pointing to p2m_missing we can swap it + * over to p2m_identity - this way covering 4MB (or 2MB) PFN space (and + * similarly swapping p2m_mid_missing for p2m_mid_identity for larger regions). + * At this point we do not need to worry about boundary aligment (so no need to * reserve_brk a middle page, figure out which PFNs are "missing" and which * ones are identity), as that has been done earlier. If we find that the * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference @@ -118,6 +118,9 @@ * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511] * contain the INVALID_P2M_ENTRY value and are considered "missing." * + * Finally, the region beyond the end of of the E820 (4 GB in this example) + * is set to be identity (in case there are MMIO regions placed here). + * * This is what the p2m ends up looking (for the E820 above) with this * fabulous drawing: * @@ -129,21 +132,27 @@ * |-----| \ | [p2m_identity]+\\ | .... | * | 2 |--\ \-------------------->| ... | \\ \----------------/ * |-----| \ \---------------/ \\ - * | 3 |\ \ \\ p2m_identity - * |-----| \ \-------------------->/---------------\ /-----------------\ - * | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... | - * \-----/ / | [p2m_identity]+-->| ..., ~0 | - * / /---------------\ | .... | \-----------------/ - * / | IDENTITY[@0] | /-+-[x], ~0, ~0.. | - * / | IDENTITY[@256]|<----/ \---------------/ - * / | ~0, ~0, .... | - * | \---------------/ - * | - * p2m_mid_missing p2m_missing - * /-----------------\ /------------\ - * | [p2m_missing] +---->| ~0, ~0, ~0 | - * | [p2m_missing] +---->| ..., ~0 | - * \-----------------/ \------------/ + * | 3 |-\ \ \\ p2m_identity [1] + * |-----| \ \-------------------->/---------------\ /-----------------\ + * | .. |\ | | [p2m_identity]+-->| ~0, ~0, ~0, ... | + * \-----/ | | | [p2m_identity]+-->| ..., ~0 | + * | | | .... | \-----------------/ + * | | +-[x], ~0, ~0.. +\ + * | | \---------------/ \ + * | | \-> /---------------\ + * | V p2m_mid_missing p2m_missing | IDENTITY[@0] | + * | /-----------------\ /------------\ | IDENTITY[@256]| + * | | [p2m_missing] +---->| ~0, ~0, ...| | ~0, ~0, .... | + * | | [p2m_missing] +---->| ..., ~0 | \---------------/ + * | | ... | \------------/ + * | \-----------------/ + * | + * | p2m_mid_identity + * | /-----------------\ + * \-->| [p2m_identity] +---->[1] + * | [p2m_identity] +---->[1] + * | ... | + * \-----------------/ * * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT) */ @@ -187,13 +196,15 @@ static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE); RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); /* We might hit two boundary violations at the start and end, at max each * boundary violation will require three middle nodes. */ -RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3); +RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3); /* When we populate back during bootup, the amount of pages can vary. The * max we have is seen is 395979, but that does not mean it can't be more. @@ -242,20 +253,20 @@ static void p2m_top_mfn_p_init(unsigned long **top) top[i] = p2m_mid_missing_mfn; } -static void p2m_mid_init(unsigned long **mid) +static void p2m_mid_init(unsigned long **mid, unsigned long *leaf) { unsigned i; for (i = 0; i < P2M_MID_PER_PAGE; i++) - mid[i] = p2m_missing; + mid[i] = leaf; } -static void p2m_mid_mfn_init(unsigned long *mid) +static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf) { unsigned i; for (i = 0; i < P2M_MID_PER_PAGE; i++) - mid[i] = virt_to_mfn(p2m_missing); + mid[i] = virt_to_mfn(leaf); } static void p2m_init(unsigned long *p2m) @@ -286,7 +297,9 @@ void __ref xen_build_mfn_list_list(void) /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(p2m_mid_missing_mfn); + p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); + p2m_mid_identity_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity); p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_top_mfn_p_init(p2m_top_mfn_p); @@ -295,7 +308,8 @@ void __ref xen_build_mfn_list_list(void) p2m_top_mfn_init(p2m_top_mfn); } else { /* Reinitialise, mfn's all change after migration */ - p2m_mid_mfn_init(p2m_mid_missing_mfn); + p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); + p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity); } for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { @@ -327,7 +341,7 @@ void __ref xen_build_mfn_list_list(void) * it too late. */ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(mid_mfn_p); + p2m_mid_mfn_init(mid_mfn_p, p2m_missing); p2m_top_mfn_p[topidx] = mid_mfn_p; } @@ -365,16 +379,17 @@ void __init xen_build_dynamic_phys_to_machine(void) p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_init(p2m_missing); + p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_identity); p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(p2m_mid_missing); + p2m_mid_init(p2m_mid_missing, p2m_missing); + p2m_mid_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(p2m_mid_identity, p2m_identity); p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_top_init(p2m_top); - p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_init(p2m_identity); - /* * The domain builder gives us a pre-constructed p2m array in * mfn_list for all the pages initially given to us, so we just @@ -386,7 +401,7 @@ void __init xen_build_dynamic_phys_to_machine(void) if (p2m_top[topidx] == p2m_mid_missing) { unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(mid); + p2m_mid_init(mid, p2m_missing); p2m_top[topidx] = mid; } @@ -545,7 +560,7 @@ static bool alloc_p2m(unsigned long pfn) if (!mid) return false; - p2m_mid_init(mid); + p2m_mid_init(mid, p2m_missing); if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) free_p2m_page(mid); @@ -565,7 +580,7 @@ static bool alloc_p2m(unsigned long pfn) if (!mid_mfn) return false; - p2m_mid_mfn_init(mid_mfn); + p2m_mid_mfn_init(mid_mfn, p2m_missing); missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); mid_mfn_mfn = virt_to_mfn(mid_mfn); @@ -649,7 +664,7 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn) if (mid == p2m_mid_missing) { mid = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(mid); + p2m_mid_init(mid, p2m_missing); p2m_top[topidx] = mid; @@ -658,7 +673,7 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn) /* And the save/restore P2M tables.. */ if (mid_mfn_p == p2m_mid_missing_mfn) { mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(mid_mfn_p); + p2m_mid_mfn_init(mid_mfn_p, p2m_missing); p2m_top_mfn_p[topidx] = mid_mfn_p; p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); @@ -769,6 +784,24 @@ bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) return true; } + +static void __init early_split_p2m(unsigned long pfn) +{ + unsigned long mididx, idx; + + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + /* + * Allocate new middle and leaf pages if this pfn lies in the + * middle of one. + */ + if (mididx || idx) + early_alloc_p2m_middle(pfn); + if (idx) + early_alloc_p2m(pfn, false); +} + unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) { @@ -786,19 +819,27 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, if (pfn_e > MAX_P2M_PFN) pfn_e = MAX_P2M_PFN; - for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1)); - pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE)); - pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) - { - WARN_ON(!early_alloc_p2m(pfn)); - } + early_split_p2m(pfn_s); + early_split_p2m(pfn_e); - early_alloc_p2m_middle(pfn_s, true); - early_alloc_p2m_middle(pfn_e, true); + for (pfn = pfn_s; pfn < pfn_e;) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); - for (pfn = pfn_s; pfn < pfn_e; pfn++) if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) break; + pfn++; + + /* + * If the PFN was set to a middle or leaf identity + * page the remainder must also be identity, so skip + * ahead to the next middle or leaf entry. + */ + if (p2m_top[topidx] == p2m_mid_identity) + pfn = ALIGN(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE); + else if (p2m_top[topidx][mididx] == p2m_identity) + pfn = ALIGN(pfn, P2M_PER_PAGE); + } if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s), "Identity mapping failed. We are %ld short of 1-1 mappings!\n", @@ -828,8 +869,22 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) /* For sparse holes were the p2m leaf has real PFN along with * PCI holes, stick in the PFN as the MFN value. + * + * set_phys_range_identity() will have allocated new middle + * and leaf pages as required so an existing p2m_mid_missing + * or p2m_missing mean that whole range will be identity so + * these can be switched to p2m_mid_identity or p2m_identity. */ if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) { + if (p2m_top[topidx] == p2m_mid_identity) + return true; + + if (p2m_top[topidx] == p2m_mid_missing) { + WARN_ON(cmpxchg(&p2m_top[topidx], p2m_mid_missing, + p2m_mid_identity) != p2m_mid_missing); + return true; + } + if (p2m_top[topidx][mididx] == p2m_identity) return true; -- cgit v1.2.3 From 2dcc9a3de1d7f77bb4dbc108358a75776328e887 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Tue, 7 Jan 2014 11:36:53 +0000 Subject: x86/xen: only warn once if bad MFNs are found during setup In xen_add_extra_mem(), if the WARN() checks for bad MFNs trigger it is likely that they will trigger at lot, spamming the log. Use WARN_ONCE() instead. Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 0982233b9b84..2afe55e21d59 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -89,10 +89,10 @@ static void __init xen_add_extra_mem(u64 start, u64 size) for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); - if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) + if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) continue; - WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", - pfn, mfn); + WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", + pfn, mfn); __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } -- cgit v1.2.3 From 25b884a83d487fd62c3de7ac1ab5549979188482 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Fri, 3 Jan 2014 15:46:10 +0000 Subject: x86/xen: set regions above the end of RAM as 1:1 PCI devices may have BARs located above the end of RAM so mark such frames as identity frames in the p2m (instead of the default of missing). PFNs outside the p2m (above MAX_P2M_PFN) are also considered to be identity frames for the same reason. Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 2 +- arch/x86/xen/setup.c | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 57001443231e..9bb3d82ffec8 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -507,7 +507,7 @@ unsigned long get_phys_to_machine(unsigned long pfn) unsigned topidx, mididx, idx; if (unlikely(pfn >= MAX_P2M_PFN)) - return INVALID_P2M_ENTRY; + return IDENTITY_FRAME(pfn); topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 2afe55e21d59..210426a26cc0 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -468,6 +468,15 @@ char * __init xen_memory_setup(void) i++; } + /* + * Set the rest as identity mapped, in case PCI BARs are + * located here. + * + * PFNs above MAX_P2M_PFN are considered identity mapped as + * well. + */ + set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul); + /* * In domU, the ISA region is normal, usable memory, but we * reserve ISA memory anyway because too many things poke -- cgit v1.2.3 From f59c5145dc6a079b14b349c388d44362eb813cdf Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 8 Jan 2014 14:00:01 +0000 Subject: x86/xen: do not use _PAGE_IOMAP in xen_remap_domain_mfn_range() _PAGE_IOMAP is used in xen_remap_domain_mfn_range() to prevent the pfn_pte() call in remap_area_mfn_pte_fn() from using the p2m to translate the MFN. If mfn_pte() is used instead, the p2m look up is avoided and the use of _PAGE_IOMAP is no longer needed. Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 86e02eabb640..d91602424b39 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2522,7 +2522,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, void *data) { struct remap_data *rmd = data; - pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); + pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot)); rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; rmd->mmu_update->val = pte_val_ma(pte); @@ -2547,8 +2547,6 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, if (xen_feature(XENFEAT_auto_translated_physmap)) return -EINVAL; - prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); - BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); rmd.mfn = mfn; -- cgit v1.2.3 From 9b17aeec232a5f0a61ce3952c2e728a0eeddda8b Mon Sep 17 00:00:00 2001 From: Alan Date: Mon, 12 May 2014 16:55:35 +0100 Subject: goldfish: Allow 64bit builds We can now enable the 64bit option for the Goldfish 64bit emulator. Signed-off-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 25d2c6f7325e..9c43ea3cf510 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -415,7 +415,6 @@ config X86_UV config X86_GOLDFISH bool "Goldfish (Virtual Platform)" - depends on X86_32 depends on X86_EXTENDED_PLATFORM ---help--- Enable support for the Goldfish virtual platform used primarily -- cgit v1.2.3 From b1ee544174fd0eb28a7770403b9577fd70f1cd3d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 May 2014 15:44:06 +0000 Subject: x86: Implement arch_setup/teardown_hwirq() This is just a cleanup to get rid of the create/destroy_irq variants which were designed in hell. The long term solution for x86 is to switch over to irq domains and cleanup the whole vector allocation mess. The generic irq_alloc_hwirqs() interface deliberately prevents multi-MSI vector allocation to further enforce the irq domain conversion (aside of the desire to support ioapic hotplug). Signed-off-by: Thomas Gleixner Reviewed-by: Grant Likely Cc: Tony Luck Cc: Peter Zijlstra Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20140507154334.482904047@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 1 + arch/x86/kernel/apic/io_apic.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 25d2c6f7325e..47247708c9eb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -831,6 +831,7 @@ config X86_LOCAL_APIC config X86_IO_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC || PCI_MSI + select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS bool "Reroute for broken boot IRQs" diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 992060e09897..b7175c0c552c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3010,6 +3010,39 @@ void destroy_irqs(unsigned int irq, unsigned int count) destroy_irq(irq + i); } +int arch_setup_hwirq(unsigned int irq, int node) +{ + struct irq_cfg *cfg; + unsigned long flags; + int ret; + + cfg = alloc_irq_cfg(irq, node); + if (!cfg) + return -ENOMEM; + + raw_spin_lock_irqsave(&vector_lock, flags); + ret = __assign_irq_vector(irq, cfg, apic->target_cpus()); + raw_spin_unlock_irqrestore(&vector_lock, flags); + + if (!ret) + irq_set_chip_data(irq, cfg); + else + free_irq_cfg(irq, cfg); + return ret; +} + +void arch_teardown_hwirq(unsigned int irq) +{ + struct irq_cfg *cfg = irq_get_chip_data(irq); + unsigned long flags; + + free_remapped_irq(irq); + raw_spin_lock_irqsave(&vector_lock, flags); + __clear_irq_vector(irq, cfg); + raw_spin_unlock_irqrestore(&vector_lock, flags); + free_irq_cfg(irq, cfg); +} + /* * MSI message composition */ -- cgit v1.2.3 From 499c2b75e9c695b57faaf6a63fde391ff9e523a3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 May 2014 15:44:07 +0000 Subject: x86: hpet: Use irq_alloc/free_hwirq() Use the new interfaces. No functional change. This does not replace the requirement to move x86 to irq domains, but it limits the mess to some degree. Signed-off-by: Thomas Gleixner Reviewed-by: Grant Likely Cc: Tony Luck Cc: Peter Zijlstra Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20140507154334.991589924@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 4177bfbc80b0..5f5a147d1cd2 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -479,7 +479,7 @@ static int hpet_msi_next_event(unsigned long delta, static int hpet_setup_msi_irq(unsigned int irq) { if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) { - destroy_irq(irq); + irq_free_hwirq(irq); return -EINVAL; } return 0; @@ -487,9 +487,8 @@ static int hpet_setup_msi_irq(unsigned int irq) static int hpet_assign_irq(struct hpet_dev *dev) { - unsigned int irq; + unsigned int irq = irq_alloc_hwirq(-1); - irq = create_irq_nr(0, -1); if (!irq) return -EINVAL; -- cgit v1.2.3 From 0a2db49dc4fe2873f857617d320c37b6bfe40255 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 May 2014 15:44:08 +0000 Subject: x86: uv: Use irq_alloc/free_hwirq() No functional change. The request to allocate the irq above NR_IRQS_LEGACY is completely pointless as the implementation enforces that the dynamic allocations are above the GSI interrupts, which includes the legacy PIT irqs. This does not replace the requirement to move x86 to irq domains, but it limits the mess to some degree. Signed-off-by: Thomas Gleixner Reviewed-by: Grant Likely Cc: Tony Luck Cc: Peter Zijlstra Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20140507154335.252789823@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/platform/uv/uv_irq.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index acf7752da952..b233681af4de 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -238,11 +238,9 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, unsigned long mmr_offset, int limit) { - int irq, ret; + int ret, irq = irq_alloc_hwirq(uv_blade_to_memory_nid(mmr_blade)); - irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade)); - - if (irq <= 0) + if (!irq) return -EBUSY; ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, @@ -250,7 +248,7 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, if (ret == irq) uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); else - destroy_irq(irq); + irq_free_hwirq(irq); return ret; } @@ -285,6 +283,6 @@ void uv_teardown_irq(unsigned int irq) n = n->rb_right; } spin_unlock_irqrestore(&uv_irq_lock, irqflags); - destroy_irq(irq); + irq_free_hwirq(irq); } EXPORT_SYMBOL_GPL(uv_teardown_irq); -- cgit v1.2.3 From be47be6c28a83dd8b3c5540d0be3675af1ac7b2e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 May 2014 15:44:09 +0000 Subject: x86: ioapic: Use irq_alloc/free_hwirq() No functional change just less crap. This does not replace the requirement to move x86 to irq domains, but it limits the mess to some degree. Signed-off-by: Thomas Gleixner Reviewed-by: Grant Likely Cc: Tony Luck Cc: Peter Zijlstra Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20140507154335.749579081@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b7175c0c552c..3c17b25d159d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3169,8 +3169,8 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - unsigned int irq, irq_want; struct msi_desc *msidesc; + unsigned int irq; int node, ret; /* Multiple MSI vectors only supported with interrupt remapping */ @@ -3178,28 +3178,25 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 1; node = dev_to_node(&dev->dev); - irq_want = nr_irqs_gsi; + list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want, node); - if (irq == 0) + irq = irq_alloc_hwirq(node); + if (!irq) return -ENOSPC; - irq_want = irq + 1; - ret = setup_msi_irq(dev, msidesc, irq, 0); - if (ret < 0) - goto error; + if (ret < 0) { + irq_free_hwirq(irq); + return ret; + } + } return 0; - -error: - destroy_irq(irq); - return ret; } void native_teardown_msi_irq(unsigned int irq) { - destroy_irq(irq); + irq_free_hwirq(irq); } #ifdef CONFIG_DMAR_TABLE -- cgit v1.2.3 From d07c9f18756e8231909a9bbcbfa7502c60cbc810 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 May 2014 15:44:10 +0000 Subject: x86: Get rid of get_nr_irqs_gsi() No need to expose this outside of the ioapic code. The dynamic allocations are guaranteed not to happen in the gsi space. See commit 62a08ae2a. Signed-off-by: Thomas Gleixner Reviewed-by: Grant Likely Cc: Tony Luck Cc: Peter Zijlstra Cc: x86@kernel.org Cc: Konrad Rzeszutek Wilk Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/20140507154335.959870037@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 2 -- arch/x86/kernel/apic/io_apic.c | 5 ----- drivers/xen/events/events_base.c | 17 +---------------- 3 files changed, 1 insertion(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 459e50a424d1..90f97b4b9347 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -168,8 +168,6 @@ extern int save_ioapic_entries(void); extern void mask_ioapic_entries(void); extern int restore_ioapic_entries(void); -extern int get_nr_irqs_gsi(void); - extern void setup_ioapic_ids_from_mpc(void); extern void setup_ioapic_ids_from_mpc_nocheck(void); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3c17b25d159d..be3b5741badb 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3450,11 +3450,6 @@ static void __init probe_nr_irqs_gsi(void) printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } -int get_nr_irqs_gsi(void) -{ - return nr_irqs_gsi; -} - unsigned int arch_dynirq_lower_bound(unsigned int from) { return from < nr_irqs_gsi ? nr_irqs_gsi : from; diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index dfa12a4a0a48..c919d3d5c845 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -390,22 +390,7 @@ static void xen_irq_init(unsigned irq) static int __must_check xen_allocate_irqs_dynamic(int nvec) { - int first = 0; - int i, irq; - -#ifdef CONFIG_X86_IO_APIC - /* - * For an HVM guest or domain 0 which see "real" (emulated or - * actual respectively) GSIs we allocate dynamic IRQs - * e.g. those corresponding to event channels or MSIs - * etc. from the range above those "real" GSIs to avoid - * collisions. - */ - if (xen_initial_domain() || xen_hvm_domain()) - first = get_nr_irqs_gsi(); -#endif - - irq = irq_alloc_descs_from(first, nvec, -1); + int i, irq = irq_alloc_descs(-1, 0, nvec, -1); if (irq >= 0) { for (i = 0; i < nvec; i++) -- cgit v1.2.3 From a553b142b8effbfcbba24ebbf8c07a1a86d32ce6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 May 2014 15:44:11 +0000 Subject: iommu: dmar: Provide arch specific irq allocation ia64 and x86 share this driver. x86 is moving to a different irq allocation and ia64 keeps its private irq_create/destroy stuff. Use macros to redirect to one or the other. Yes, macros to avoid include hell. Signed-off-by: Thomas Gleixner Reviewed-by: Grant Likely Cc: Tony Luck Cc: Peter Zijlstra Cc: Fenghua Yu Acked-by: Joerg Roedel Cc: x86@kernel.org Cc: linux-ia64@vger.kernel.org Cc: iommu@lists.linux-foundation.org Link: http://lkml.kernel.org/r/20140507154336.372289825@linutronix.de Signed-off-by: Thomas Gleixner --- arch/ia64/include/asm/irq_remapping.h | 2 ++ arch/x86/include/asm/irq_remapping.h | 3 +++ drivers/iommu/dmar.c | 6 +++--- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/include/asm/irq_remapping.h b/arch/ia64/include/asm/irq_remapping.h index a8687b1d8906..e3b3556e2e1b 100644 --- a/arch/ia64/include/asm/irq_remapping.h +++ b/arch/ia64/include/asm/irq_remapping.h @@ -1,4 +1,6 @@ #ifndef __IA64_INTR_REMAPPING_H #define __IA64_INTR_REMAPPING_H #define irq_remapping_enabled 0 +#define dmar_alloc_hwirq create_irq +#define dmar_free_hwirq destroy_irq #endif diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index d806b228d2c0..b7747c4c2cf2 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -103,4 +103,7 @@ static inline bool setup_remapped_irq(int irq, } #endif /* CONFIG_IRQ_REMAP */ +#define dmar_alloc_hwirq() irq_alloc_hwirq(-1) +#define dmar_free_hwirq irq_free_hwirq + #endif /* __X86_IRQ_REMAPPING_H */ diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 3ce1f62a091f..9a4f05e5b23f 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -994,7 +994,7 @@ static void free_iommu(struct intel_iommu *iommu) if (iommu->irq) { free_irq(iommu->irq, iommu); irq_set_handler_data(iommu->irq, NULL); - destroy_irq(iommu->irq); + dmar_free_hwirq(iommu->irq); } if (iommu->qi) { @@ -1550,7 +1550,7 @@ int dmar_set_interrupt(struct intel_iommu *iommu) if (iommu->irq) return 0; - irq = create_irq(); + irq = dmar_alloc_hwirq(); if (irq <= 0) { pr_err("IOMMU: no free vectors\n"); return -EINVAL; @@ -1563,7 +1563,7 @@ int dmar_set_interrupt(struct intel_iommu *iommu) if (ret) { irq_set_handler_data(irq, NULL); iommu->irq = 0; - destroy_irq(irq); + dmar_free_hwirq(irq); return ret; } -- cgit v1.2.3 From 54859f59fc18e5c104a4095420b3fcef8bc3ae63 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 May 2014 15:44:12 +0000 Subject: x86: Remove create/destroy_irq() No more users. Remove the cruft Signed-off-by: Thomas Gleixner Reviewed-by: Grant Likely Cc: Tony Luck Cc: Peter Zijlstra Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20140507154336.760446122@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 106 +---------------------------------------- include/linux/irq.h | 4 -- 2 files changed, 1 insertion(+), 109 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index be3b5741badb..efda2f648f59 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -281,18 +281,6 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) return cfg; } -static int alloc_irqs_from(unsigned int from, unsigned int count, int node) -{ - return irq_alloc_descs_from(from, count, node); -} - -static void free_irq_at(unsigned int at, struct irq_cfg *cfg) -{ - free_irq_cfg(at, cfg); - irq_free_desc(at); -} - - struct io_apic { unsigned int index; unsigned int unused[3]; @@ -2916,100 +2904,8 @@ static int __init ioapic_init_ops(void) device_initcall(ioapic_init_ops); /* - * Dynamic irq allocate and deallocation + * Dynamic irq allocate and deallocation. Should be replaced by irq domains! */ -unsigned int __create_irqs(unsigned int from, unsigned int count, int node) -{ - struct irq_cfg **cfg; - unsigned long flags; - int irq, i; - - if (from < nr_irqs_gsi) - from = nr_irqs_gsi; - - cfg = kzalloc_node(count * sizeof(cfg[0]), GFP_KERNEL, node); - if (!cfg) - return 0; - - irq = alloc_irqs_from(from, count, node); - if (irq < 0) - goto out_cfgs; - - for (i = 0; i < count; i++) { - cfg[i] = alloc_irq_cfg(irq + i, node); - if (!cfg[i]) - goto out_irqs; - } - - raw_spin_lock_irqsave(&vector_lock, flags); - for (i = 0; i < count; i++) - if (__assign_irq_vector(irq + i, cfg[i], apic->target_cpus())) - goto out_vecs; - raw_spin_unlock_irqrestore(&vector_lock, flags); - - for (i = 0; i < count; i++) { - irq_set_chip_data(irq + i, cfg[i]); - irq_clear_status_flags(irq + i, IRQ_NOREQUEST); - } - - kfree(cfg); - return irq; - -out_vecs: - for (i--; i >= 0; i--) - __clear_irq_vector(irq + i, cfg[i]); - raw_spin_unlock_irqrestore(&vector_lock, flags); -out_irqs: - for (i = 0; i < count; i++) - free_irq_at(irq + i, cfg[i]); -out_cfgs: - kfree(cfg); - return 0; -} - -unsigned int create_irq_nr(unsigned int from, int node) -{ - return __create_irqs(from, 1, node); -} - -int create_irq(void) -{ - int node = cpu_to_node(0); - unsigned int irq_want; - int irq; - - irq_want = nr_irqs_gsi; - irq = create_irq_nr(irq_want, node); - - if (irq == 0) - irq = -1; - - return irq; -} - -void destroy_irq(unsigned int irq) -{ - struct irq_cfg *cfg = irq_get_chip_data(irq); - unsigned long flags; - - irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); - - free_remapped_irq(irq); - - raw_spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq, cfg); - raw_spin_unlock_irqrestore(&vector_lock, flags); - free_irq_at(irq, cfg); -} - -void destroy_irqs(unsigned int irq, unsigned int count) -{ - unsigned int i; - - for (i = 0; i < count; i++) - destroy_irq(irq + i); -} - int arch_setup_hwirq(unsigned int irq, int node) { struct irq_cfg *cfg; diff --git a/include/linux/irq.h b/include/linux/irq.h index c75dd161d37f..7549ed59d3d4 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -526,12 +526,8 @@ static inline void irq_set_percpu_devid_flags(unsigned int irq) } /* Handle dynamic irq creation and destruction */ -extern unsigned int create_irq_nr(unsigned int irq_want, int node); -extern unsigned int __create_irqs(unsigned int from, unsigned int count, - int node); extern int create_irq(void); extern void destroy_irq(unsigned int irq); -extern void destroy_irqs(unsigned int irq, unsigned int count); /* * Dynamic irq helper functions. Obsolete. Use irq_alloc_desc* and -- cgit v1.2.3 From 18a67d32c31b88679467e93e1825010d245b9bf4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 May 2014 15:44:18 +0000 Subject: x86, irq: Remove pointless irq_reserve_irqs() call That's a leftover from the time where x86 supported SPARSE_IRQ=n. Signed-off-by: Thomas Gleixner Reviewed-by: Grant Likely Cc: Tony Luck Cc: Peter Zijlstra Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20140507154338.967285614@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index efda2f648f59..9d0a9795a0f8 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -206,9 +206,6 @@ int __init arch_early_irq_init(void) count = ARRAY_SIZE(irq_cfgx); node = cpu_to_node(0); - /* Make sure the legacy interrupts are marked in the bitmap */ - irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs); - for (i = 0; i < count; i++) { irq_set_chip_data(i, &cfg[i]); zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); -- cgit v1.2.3 From 722e76e60f2775c21b087ff12c5e678cf0ebcaaf Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 15 May 2014 17:56:44 +0200 Subject: fix Haswell precise store data source encoding This patch fixes a bug in precise_store_data_hsw() whereby it would set the data source memory level to the wrong value. As per the the SDM Vol 3b Table 18-41 (Layout of Data Linear Address Information in PEBS Record), when status bit 0 is set this is a L1 hit, otherwise this is a L1 miss. This patch encodes the memory level according to the specification. In V2, we added the filtering on the store events. Only the following events produce L1 information: * MEM_UOPS_RETIRED.STLB_MISS_STORES * MEM_UOPS_RETIRED.LOCK_STORES * MEM_UOPS_RETIRED.SPLIT_STORES * MEM_UOPS_RETIRED.ALL_STORES Cc: mingo@elte.hu Cc: acme@ghostprotocols.net Cc: jolsa@redhat.com Cc: jmario@redhat.com Cc: ak@linux.intel.com Tested-and-Reviewed-by: Don Zickus Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140515155644.GA3884@quad Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index ae96cfa5eddd..980970cb744d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -108,15 +108,31 @@ static u64 precise_store_data(u64 status) return val; } -static u64 precise_store_data_hsw(u64 status) +static u64 precise_store_data_hsw(struct perf_event *event, u64 status) { union perf_mem_data_src dse; + u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK; dse.val = 0; dse.mem_op = PERF_MEM_OP_STORE; dse.mem_lvl = PERF_MEM_LVL_NA; + + /* + * L1 info only valid for following events: + * + * MEM_UOPS_RETIRED.STLB_MISS_STORES + * MEM_UOPS_RETIRED.LOCK_STORES + * MEM_UOPS_RETIRED.SPLIT_STORES + * MEM_UOPS_RETIRED.ALL_STORES + */ + if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0) + return dse.mem_lvl; + if (status & 1) - dse.mem_lvl = PERF_MEM_LVL_L1; + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + else + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; + /* Nothing else supported. Sorry. */ return dse.val; } @@ -887,7 +903,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, data.data_src.val = load_latency_data(pebs->dse); else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) data.data_src.val = - precise_store_data_hsw(pebs->dse); + precise_store_data_hsw(event, pebs->dse); else data.data_src.val = precise_store_data(pebs->dse); } -- cgit v1.2.3 From 1e844fb43c96dcdba3b578918f5c485d88750891 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 19 May 2014 15:58:31 -0700 Subject: x86, vdso: Fix an OOPS accessing the HPET mapping w/o an HPET The oops can be triggered in qemu using -no-hpet (but not nohpet) by reading a couple of pages past the end of the vdso text. This should send SIGBUS instead of OOPSing. The bug was introduced by: commit 7a59ed415f5b57469e22e41fc4188d5399e0b194 Author: Stefani Seibold Date: Mon Mar 17 23:22:09 2014 +0100 x86, vdso: Add 32 bit VDSO time support for 32 bit kernel which is new in 3.15. This will be fixed separately in 3.15, but that patch will not apply to tip/x86/vdso. This is the equivalent fix for tip/x86/vdso and, presumably, 3.16. Cc: Stefani Seibold Reported-by: Sasha Levin Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/c8b0a9a0b8d011a8b273cbb2de88d37190ed2751.1400538962.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index e915eaec4f96..8ad0081df7a8 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -90,6 +90,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) struct vm_area_struct *vma; unsigned long addr; int ret = 0; + static struct page *no_pages[] = {NULL}; if (calculate_addr) { addr = vdso_addr(current->mm->start_stack, @@ -125,7 +126,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) addr + image->size, image->sym_end_mapping - image->size, VM_READ, - NULL); + no_pages); if (IS_ERR(vma)) { ret = PTR_ERR(vma); -- cgit v1.2.3 From a62c34bd2a8a3f159945becd57401e478818d51c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 19 May 2014 15:58:33 -0700 Subject: x86, mm: Improve _install_special_mapping and fix x86 vdso naming Using arch_vma_name to give special mappings a name is awkward. x86 currently implements it by comparing the start address of the vma to the expected address of the vdso. This requires tracking the start address of special mappings and is probably buggy if a special vma is split or moved. Improve _install_special_mapping to just name the vma directly. Use it to give the x86 vvar area a name, which should make CRIU's life easier. As a side effect, the vvar area will show up in core dumps. This could be considered weird and is fixable. [hpa: I say we accept this as-is but be prepared to deal with knocking out the vvars from core dumps if this becomes a problem.] Cc: Cyrill Gorcunov Cc: Pavel Emelyanov Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/276b39b6b645fb11e345457b503f17b83c2c6fd0.1400538962.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/vdso.h | 6 ++- arch/x86/mm/init_64.c | 3 -- arch/x86/vdso/vdso2c.h | 5 ++- arch/x86/vdso/vdso32-setup.c | 7 ---- arch/x86/vdso/vma.c | 25 ++++++++----- include/linux/mm.h | 4 +- include/linux/mm_types.h | 6 +++ mm/mmap.c | 89 +++++++++++++++++++++++++++++--------------- 8 files changed, 94 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index d0a2c909c72d..30be253dd283 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -7,10 +7,14 @@ #ifndef __ASSEMBLER__ +#include + struct vdso_image { void *data; unsigned long size; /* Always a multiple of PAGE_SIZE */ - struct page **pages; /* Big enough for data/size page pointers */ + + /* text_mapping.pages is big enough for data/size page pointers */ + struct vm_special_mapping text_mapping; unsigned long alt, alt_len; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6f881842116c..9deb59b0baea 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1223,9 +1223,6 @@ int in_gate_area_no_mm(unsigned long addr) const char *arch_vma_name(struct vm_area_struct *vma) { - if (vma->vm_mm && vma->vm_start == - (long __force)vma->vm_mm->context.vdso) - return "[vdso]"; if (vma == &gate_vma) return "[vsyscall]"; return NULL; diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h index ed2e894e89ab..3dcc61e796e9 100644 --- a/arch/x86/vdso/vdso2c.h +++ b/arch/x86/vdso/vdso2c.h @@ -136,7 +136,10 @@ static int GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) fprintf(outfile, "const struct vdso_image %s = {\n", name); fprintf(outfile, "\t.data = raw_data,\n"); fprintf(outfile, "\t.size = %lu,\n", data_size); - fprintf(outfile, "\t.pages = pages,\n"); + fprintf(outfile, "\t.text_mapping = {\n"); + fprintf(outfile, "\t\t.name = \"[vdso]\",\n"); + fprintf(outfile, "\t\t.pages = pages,\n"); + fprintf(outfile, "\t},\n"); if (alt_sec) { fprintf(outfile, "\t.alt = %lu,\n", (unsigned long)alt_sec->sh_offset); diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index c3ed708e50f4..e4f7781ee162 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -119,13 +119,6 @@ __initcall(ia32_binfmt_init); #else /* CONFIG_X86_32 */ -const char *arch_vma_name(struct vm_area_struct *vma) -{ - if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) - return "[vdso]"; - return NULL; -} - struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { return NULL; diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 8ad0081df7a8..e1513c47872a 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -30,7 +30,8 @@ void __init init_vdso_image(const struct vdso_image *image) BUG_ON(image->size % PAGE_SIZE != 0); for (i = 0; i < npages; i++) - image->pages[i] = virt_to_page(image->data + i*PAGE_SIZE); + image->text_mapping.pages[i] = + virt_to_page(image->data + i*PAGE_SIZE); apply_alternatives((struct alt_instr *)(image->data + image->alt), (struct alt_instr *)(image->data + image->alt + @@ -91,6 +92,10 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) unsigned long addr; int ret = 0; static struct page *no_pages[] = {NULL}; + static struct vm_special_mapping vvar_mapping = { + .name = "[vvar]", + .pages = no_pages, + }; if (calculate_addr) { addr = vdso_addr(current->mm->start_stack, @@ -112,21 +117,23 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) /* * MAYWRITE to allow gdb to COW and set breakpoints */ - ret = install_special_mapping(mm, - addr, - image->size, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - image->pages); + vma = _install_special_mapping(mm, + addr, + image->size, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + &image->text_mapping); - if (ret) + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto up_fail; + } vma = _install_special_mapping(mm, addr + image->size, image->sym_end_mapping - image->size, VM_READ, - no_pages); + &vvar_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); diff --git a/include/linux/mm.h b/include/linux/mm.h index 63f8d4efe303..05aab09803e6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1782,7 +1782,9 @@ extern struct file *get_mm_exe_file(struct mm_struct *mm); extern int may_expand_vm(struct mm_struct *mm, unsigned long npages); extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, - unsigned long flags, struct page **pages); + unsigned long flags, + const struct vm_special_mapping *spec); +/* This is an obsolete alternative to _install_special_mapping. */ extern int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, struct page **pages); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8967e20cbe57..22c6f4e16d10 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -510,4 +510,10 @@ static inline void clear_tlb_flush_pending(struct mm_struct *mm) } #endif +struct vm_special_mapping +{ + const char *name; + struct page **pages; +}; + #endif /* _LINUX_MM_TYPES_H */ diff --git a/mm/mmap.c b/mm/mmap.c index b1202cf81f4b..52bbc9514d9d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2872,6 +2872,31 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages) return 1; } +static int special_mapping_fault(struct vm_area_struct *vma, + struct vm_fault *vmf); + +/* + * Having a close hook prevents vma merging regardless of flags. + */ +static void special_mapping_close(struct vm_area_struct *vma) +{ +} + +static const char *special_mapping_name(struct vm_area_struct *vma) +{ + return ((struct vm_special_mapping *)vma->vm_private_data)->name; +} + +static const struct vm_operations_struct special_mapping_vmops = { + .close = special_mapping_close, + .fault = special_mapping_fault, + .name = special_mapping_name, +}; + +static const struct vm_operations_struct legacy_special_mapping_vmops = { + .close = special_mapping_close, + .fault = special_mapping_fault, +}; static int special_mapping_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -2887,7 +2912,13 @@ static int special_mapping_fault(struct vm_area_struct *vma, */ pgoff = vmf->pgoff - vma->vm_pgoff; - for (pages = vma->vm_private_data; pgoff && *pages; ++pages) + if (vma->vm_ops == &legacy_special_mapping_vmops) + pages = vma->vm_private_data; + else + pages = ((struct vm_special_mapping *)vma->vm_private_data)-> + pages; + + for (; pgoff && *pages; ++pages) pgoff--; if (*pages) { @@ -2900,30 +2931,11 @@ static int special_mapping_fault(struct vm_area_struct *vma, return VM_FAULT_SIGBUS; } -/* - * Having a close hook prevents vma merging regardless of flags. - */ -static void special_mapping_close(struct vm_area_struct *vma) -{ -} - -static const struct vm_operations_struct special_mapping_vmops = { - .close = special_mapping_close, - .fault = special_mapping_fault, -}; - -/* - * Called with mm->mmap_sem held for writing. - * Insert a new vma covering the given region, with the given flags. - * Its pages are supplied by the given array of struct page *. - * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. - * The region past the last page supplied will always produce SIGBUS. - * The array pointer and the pages it points to are assumed to stay alive - * for as long as this mapping might exist. - */ -struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, - unsigned long addr, unsigned long len, - unsigned long vm_flags, struct page **pages) +static struct vm_area_struct *__install_special_mapping( + struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, const struct vm_operations_struct *ops, + void *priv) { int ret; struct vm_area_struct *vma; @@ -2940,8 +2952,8 @@ struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - vma->vm_ops = &special_mapping_vmops; - vma->vm_private_data = pages; + vma->vm_ops = ops; + vma->vm_private_data = priv; ret = insert_vm_struct(mm, vma); if (ret) @@ -2958,12 +2970,31 @@ out: return ERR_PTR(ret); } +/* + * Called with mm->mmap_sem held for writing. + * Insert a new vma covering the given region, with the given flags. + * Its pages are supplied by the given array of struct page *. + * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. + * The region past the last page supplied will always produce SIGBUS. + * The array pointer and the pages it points to are assumed to stay alive + * for as long as this mapping might exist. + */ +struct vm_area_struct *_install_special_mapping( + struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, const struct vm_special_mapping *spec) +{ + return __install_special_mapping(mm, addr, len, vm_flags, + &special_mapping_vmops, (void *)spec); +} + int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, struct page **pages) { - struct vm_area_struct *vma = _install_special_mapping(mm, - addr, len, vm_flags, pages); + struct vm_area_struct *vma = __install_special_mapping( + mm, addr, len, vm_flags, &legacy_special_mapping_vmops, + (void *)pages); if (IS_ERR(vma)) return PTR_ERR(vma); -- cgit v1.2.3 From ac49b9a9f26b6c42585f87857722085ef4b19c13 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 19 May 2014 15:58:34 -0700 Subject: x86, mm: Replace arch_vma_name with vm_ops->name for vsyscalls This removes the last vestiges of arch_vma_name from x86, replacing it with vm_ops->name. Good riddance. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/e681cb56096eee5b8b8767093a4f6fb82839f0a4.1400538962.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_64.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9deb59b0baea..bdcde58ca9ed 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1185,11 +1185,19 @@ int kern_addr_valid(unsigned long addr) * covers the 64bit vsyscall page now. 32bit has a real VMA now and does * not need special handling anymore: */ +static const char *gate_vma_name(struct vm_area_struct *vma) +{ + return "[vsyscall]"; +} +static struct vm_operations_struct gate_vma_ops = { + .name = gate_vma_name, +}; static struct vm_area_struct gate_vma = { .vm_start = VSYSCALL_ADDR, .vm_end = VSYSCALL_ADDR + PAGE_SIZE, .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC + .vm_flags = VM_READ | VM_EXEC, + .vm_ops = &gate_vma_ops, }; struct vm_area_struct *get_gate_vma(struct mm_struct *mm) @@ -1221,13 +1229,6 @@ int in_gate_area_no_mm(unsigned long addr) return (addr & PAGE_MASK) == VSYSCALL_ADDR; } -const char *arch_vma_name(struct vm_area_struct *vma) -{ - if (vma == &gate_vma) - return "[vsyscall]"; - return NULL; -} - #ifdef CONFIG_X86_UV unsigned long memory_block_size_bytes(void) { -- cgit v1.2.3 From 1b1ded57a4f2f4420b4de7c395d1b841d8b3c41a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 19 May 2014 20:59:16 +0200 Subject: x86, boot: Carve out early cmdline parsing function Carve out early cmdline parsing function into .../lib/cmdline.c so it can be used by early code in the kernel proper as well. Adapted from arch/x86/boot/cmdline.c. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1400525957-11525-2-git-send-email-bp@alien8.de Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cmdline.h | 6 +++ arch/x86/lib/Makefile | 2 +- arch/x86/lib/cmdline.c | 84 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/cmdline.h create mode 100644 arch/x86/lib/cmdline.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h new file mode 100644 index 000000000000..e01f7f7ccb0c --- /dev/null +++ b/arch/x86/include/asm/cmdline.h @@ -0,0 +1,6 @@ +#ifndef _ASM_X86_CMDLINE_H +#define _ASM_X86_CMDLINE_H + +int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); + +#endif /* _ASM_X86_CMDLINE_H */ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index eabcb6e6a900..4d4f96a27638 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -16,7 +16,7 @@ clean-files := inat-tables.c obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o -lib-y := delay.o misc.o +lib-y := delay.o misc.o cmdline.o lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c new file mode 100644 index 000000000000..422db000d727 --- /dev/null +++ b/arch/x86/lib/cmdline.c @@ -0,0 +1,84 @@ +/* + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * Misc librarized functions for cmdline poking. + */ +#include +#include +#include +#include + +static inline int myisspace(u8 c) +{ + return c <= ' '; /* Close enough approximation */ +} + +/** + * Find a boolean option (like quiet,noapic,nosmp....) + * + * @cmdline: the cmdline string + * @option: option string to look for + * + * Returns the position of that @option (starts counting with 1) + * or 0 on not found. + */ +int cmdline_find_option_bool(const char *cmdline, const char *option) +{ + char c; + int len, pos = 0, wstart = 0; + const char *opptr = NULL; + enum { + st_wordstart = 0, /* Start of word/after whitespace */ + st_wordcmp, /* Comparing this word */ + st_wordskip, /* Miscompare, skip */ + } state = st_wordstart; + + if (!cmdline) + return -1; /* No command line */ + + len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE); + if (!len) + return 0; + + while (len--) { + c = *(char *)cmdline++; + pos++; + + switch (state) { + case st_wordstart: + if (!c) + return 0; + else if (myisspace(c)) + break; + + state = st_wordcmp; + opptr = option; + wstart = pos; + /* fall through */ + + case st_wordcmp: + if (!*opptr) + if (!c || myisspace(c)) + return wstart; + else + state = st_wordskip; + else if (!c) + return 0; + else if (c != *opptr++) + state = st_wordskip; + else if (!len) /* last word and is matching */ + return wstart; + break; + + case st_wordskip: + if (!c) + return 0; + else if (myisspace(c)) + state = st_wordstart; + break; + } + } + + return 0; /* Buffer overrun */ +} -- cgit v1.2.3 From 65cef1311d5d212fd3d48a43678536dc878ca288 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 19 May 2014 20:59:17 +0200 Subject: x86, microcode: Add a disable chicken bit Add a cmdline param which disables the microcode loader. This is useful mostly in debugging situations where we want to turn off microcode loading, both early from the initrd and late, as a means to be able to rule out its influence on the machine. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1400525957-11525-3-git-send-email-bp@alien8.de Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/microcode.h | 1 + arch/x86/kernel/cpu/microcode/core.c | 6 +++++ arch/x86/kernel/cpu/microcode/core_early.c | 37 ++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index b59827e76529..64dc362506b7 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -25,6 +25,7 @@ struct cpu_signature { struct device; enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; +extern bool dis_ucode_ldr; struct microcode_ops { enum ucode_state (*request_microcode_user) (int cpu, diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 15c987698b0f..dd9d6190b08d 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -97,6 +97,9 @@ MODULE_LICENSE("GPL"); static struct microcode_ops *microcode_ops; +bool dis_ucode_ldr; +module_param(dis_ucode_ldr, bool, 0); + /* * Synchronization. * @@ -546,6 +549,9 @@ static int __init microcode_init(void) struct cpuinfo_x86 *c = &cpu_data(0); int error; + if (dis_ucode_ldr) + return 0; + if (c->x86_vendor == X86_VENDOR_INTEL) microcode_ops = init_intel_microcode(); else if (c->x86_vendor == X86_VENDOR_AMD) diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c index be7f8514f577..5f28a64e71ea 100644 --- a/arch/x86/kernel/cpu/microcode/core_early.c +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -17,9 +17,11 @@ * 2 of the License, or (at your option) any later version. */ #include +#include #include #include #include +#include #define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) #define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') @@ -72,10 +74,33 @@ static int x86_family(void) return x86; } +static bool __init check_loader_disabled_bsp(void) +{ +#ifdef CONFIG_X86_32 + const char *cmdline = (const char *)__pa_nodebug(boot_command_line); + const char *opt = "dis_ucode_ldr"; + const char *option = (const char *)__pa_nodebug(opt); + bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); + +#else /* CONFIG_X86_64 */ + const char *cmdline = boot_command_line; + const char *option = "dis_ucode_ldr"; + bool *res = &dis_ucode_ldr; +#endif + + if (cmdline_find_option_bool(cmdline, option)) + *res = true; + + return *res; +} + void __init load_ucode_bsp(void) { int vendor, x86; + if (check_loader_disabled_bsp()) + return; + if (!have_cpuid_p()) return; @@ -96,10 +121,22 @@ void __init load_ucode_bsp(void) } } +static bool check_loader_disabled_ap(void) +{ +#ifdef CONFIG_X86_32 + return __pa_nodebug(dis_ucode_ldr); +#else + return dis_ucode_ldr; +#endif +} + void load_ucode_ap(void) { int vendor, x86; + if (check_loader_disabled_ap()) + return; + if (!have_cpuid_p()) return; -- cgit v1.2.3 From 7ed6fb9b5a5510e4ef78ab27419184741169978a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 21 May 2014 10:22:59 -0700 Subject: Revert "x86-64, modify_ldt: Make support for 16-bit segments a runtime option" This reverts commit fa81511bb0bbb2b1aace3695ce869da9762624ff in preparation of merging in the proper fix (espfix64). Signed-off-by: H. Peter Anvin --- arch/x86/kernel/ldt.c | 4 +--- arch/x86/vdso/vdso32-setup.c | 8 -------- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index dcbbaa165bde..af1d14a9ebda 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -20,8 +20,6 @@ #include #include -int sysctl_ldt16 = 0; - #ifdef CONFIG_SMP static void flush_ldt(void *current_mm) { @@ -236,7 +234,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) * IRET leaking the high bits of the kernel stack address. */ #ifdef CONFIG_X86_64 - if (!ldt_info.seg_32bit && !sysctl_ldt16) { + if (!ldt_info.seg_32bit) { error = -EINVAL; goto out_unlock; } diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index e1f220e3ca68..00348980a3a6 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -39,7 +39,6 @@ #ifdef CONFIG_X86_64 #define vdso_enabled sysctl_vsyscall32 #define arch_setup_additional_pages syscall32_setup_pages -extern int sysctl_ldt16; #endif /* @@ -250,13 +249,6 @@ static struct ctl_table abi_table2[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "ldt16", - .data = &sysctl_ldt16, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, {} }; -- cgit v1.2.3 From 33673101335b19913745ff0e9d2946b490608e5f Mon Sep 17 00:00:00 2001 From: Myron Stowe Date: Thu, 8 May 2014 11:44:20 -0500 Subject: x86/PCI: Warn if we have to "guess" host bridge node information The vast majority of platforms are not supplying ACPI _PXM (proximity) information corresponding to host bridge (PNP0A03/PNP0A08) devices resulting in sysfs "numa_node" values of -1 (NUMA_NO_NODE): # for i in /sys/devices/pci0000\:00/*/numa_node; do cat $i; done | uniq -1 # find /sys/ -name "numa_node" | while read fname; do cat $fname; \ done | uniq -1 AMD based platforms provide a fall-back for this situation via amd_bus.c. These platforms snoop out the information by directly reading specific registers from the Northbridge and caching them via alloc_pci_root_info(). Later during boot processing when host bridges are discovered - pci_acpi_scan_root() - the kernel looks for their corresponding ACPI _PXM method - drivers/acpi/numa.c::acpi_get_node(). If the BIOS supplied a _PXM method then that node (proximity) value is associated. If the BIOS did not supply a _PXM method *and* the platform is AMD-based, the fall-back cached values obtained directly from the Northbridge are used; otherwise, "NUMA_NO_NODE" is associated. There are a number of issues with this fall-back mechanism the most notable being that amd_bus.c extracts a 3-bit number from a CPU register and uses it as the node number. The node numbers used by Linux are logical and there's no reason they need to be identical to settings in the CPU registers. So if we have some node information obtained in the normal way (from _PXM, SLIT, SRAT, etc.) and some from amd_bus.c, there's no reason to believe they will be compatible. This patch warns when this situation occurs: pci_root PNP0A08:00: [Firmware Bug]: no _PXM; falling back to node 0 from hardware (may be inconsistent with ACPI node numbers) Link: https://bugzilla.kernel.org/show_bug.cgi?id=72051 Signed-off-by: Myron Stowe Signed-off-by: Suravee Suthikulpanit Signed-off-by: Bjorn Helgaas --- arch/x86/pci/acpi.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 01edac6c5e18..5075371ab593 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -489,8 +489,12 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) } node = acpi_get_node(device->handle); - if (node == NUMA_NO_NODE) + if (node == NUMA_NO_NODE) { node = x86_pci_root_bus_node(busnum); + if (node != 0 && node != NUMA_NO_NODE) + dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n", + node); + } if (node != NUMA_NO_NODE && !node_online(node)) node = NUMA_NO_NODE; -- cgit v1.2.3 From 94d4bb5b13978127d12860bc6b7071a784144e2f Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 8 May 2014 11:44:18 -0500 Subject: x86/PCI: Work around AMD Fam15h BIOSes that fail to provide _PXM The BIOS is supposed to provide ACPI _PXM methods for PCI host bridges if it cares about platform topology. But some BIOSes do not, so add Fam15h to the list of CPUs for which we fall back to reading node numbers from the hardware. Note that pci_acpi_scan_root() warns about the BIOS bug if we use this information because (1) the hardware node numbers are not necessarily compatible with other logical node numbers from ACPI, and (2) the lack of _PXM forces OS updates that would not otherwise be required. [bhelgaas: changelog, comments] Link: https://bugzilla.kernel.org/show_bug.cgi?id=72051 Tested-by: Aravind Gopalakrishnan Signed-off-by: Suravee Suthikulpanit Signed-off-by: Myron Stowe Signed-off-by: Bjorn Helgaas Cc: Borislav Petkov Cc: Robert Richter Cc: Daniel J Blueman Cc: Andreas Herrmann --- arch/x86/pci/amd_bus.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index e88f4c53d7f6..aa936e3a2019 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -24,10 +24,11 @@ struct pci_hostbridge_probe { }; static struct pci_hostbridge_probe pci_probes[] __initdata = { - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1100 }, - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, - { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 }, + { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1100 }, /* K8 */ + { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, /* Fam10h */ + { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, /* Fam10h */ + { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 }, /* Fam11h */ + { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1600 }, /* Fam15h */ }; #define RANGE_NUM 16 @@ -96,6 +97,11 @@ static int __init early_fill_mp_bus_info(void) if (!found) return 0; + /* + * We should learn topology and routing information from _PXM and + * _CRS methods in the ACPI namespace. We extract node numbers + * here to work around BIOSes that don't supply _PXM. + */ for (i = 0; i < 4; i++) { int min_bus; int max_bus; @@ -113,6 +119,17 @@ static int __init early_fill_mp_bus_info(void) info = alloc_pci_root_info(min_bus, max_bus, node, link); } + /* + * The following code extracts routing information for use on old + * systems where Linux doesn't automatically use host bridge _CRS + * methods (or when the user specifies "pci=nocrs"). + * + * We only do this through Fam11h, because _CRS should be enough on + * newer systems. + */ + if (boot_cpu_data.x86 > 0x11) + return 0; + /* get the default node and link for left over res */ reg = read_pci_config(bus, slot, 0, 0x60); def_node = (reg >> 8) & 0x07; -- cgit v1.2.3 From 1bd24efc8bd1e644b115cae51048c008bea76de1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 21 May 2014 15:07:07 -0700 Subject: x86_64, entry: Add missing 'DEFAULT_FRAME 0' entry annotations The paranoidzeroentry macros were missing them. I'm not at all convinced that these annotations are correct and/or necessary, but this makes the macros more consistent with each other. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/10ad65f534f8bc62e77f74fe15f68e8d4a59d8b3.1400709717.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_64.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1e96c3628bf2..39372ec11f94 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1230,6 +1230,7 @@ ENTRY(\sym) subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid + DEFAULT_FRAME 0 TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ @@ -1249,6 +1250,7 @@ ENTRY(\sym) subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid + DEFAULT_FRAME 0 TRACE_IRQS_OFF_DEBUG movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ -- cgit v1.2.3 From cb5dd2c5eed155919f626684691cf525234ecda1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 21 May 2014 15:07:08 -0700 Subject: x86_64, entry: Merge most 64-bit asm entry macros I haven't touched the device interrupt code, which is different enough that it's probably not worth merging, and I haven't done anything about paranoidzeroentry_ist yet. This appears to produce an entry_64.o file that differs only in the debug info line numbers. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/e7a6acfb130471700370e77af9e4b4b6ed46f5ef.1400709717.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_64.S | 152 +++++++++++++++++++-------------------------- 1 file changed, 64 insertions(+), 88 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 39372ec11f94..eac9b81edfe1 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -36,7 +36,7 @@ * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack * frame that is otherwise undefined after a SYSCALL * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. - * - errorentry/paranoidentry/zeroentry - Define exception entry points. + * - idtentry - Define exception entry points. */ #include @@ -1203,39 +1203,53 @@ apicinterrupt IRQ_WORK_VECTOR \ /* * Exception entry points. */ -.macro zeroentry sym do_sym +.macro idtentry sym do_sym has_error_code:req paranoid=0 ENTRY(\sym) + .if \has_error_code + XCPT_FRAME + .else INTR_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call error_entry - DEFAULT_FRAME 0 - movq %rsp,%rdi /* pt_regs pointer */ - xorl %esi,%esi /* no error code */ - call \do_sym - jmp error_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm + .endif -.macro paranoidzeroentry sym do_sym -ENTRY(\sym) - INTR_FRAME ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + + .ifeq \has_error_code + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + .endif + subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + + .if \paranoid call save_paranoid + .else + call error_entry + .endif + DEFAULT_FRAME 0 + + .if \paranoid TRACE_IRQS_OFF - movq %rsp,%rdi /* pt_regs pointer */ - xorl %esi,%esi /* no error code */ + .endif + + movq %rsp,%rdi /* pt_regs pointer */ + + .if \has_error_code + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi,%esi /* no error code */ + .endif + call \do_sym - jmp paranoid_exit /* %ebx: no swapgs flag */ + + .if \paranoid + jmp paranoid_exit /* %ebx: no swapgs flag */ + .else + jmp error_exit /* %ebx: no swapgs flag */ + .endif + CFI_ENDPROC END(\sym) .endm @@ -1262,68 +1276,30 @@ ENTRY(\sym) END(\sym) .endm -.macro errorentry sym do_sym -ENTRY(\sym) - XCPT_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call error_entry - DEFAULT_FRAME 0 - movq %rsp,%rdi /* pt_regs pointer */ - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ - call \do_sym - jmp error_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm - #ifdef CONFIG_TRACING -.macro trace_errorentry sym do_sym -errorentry trace(\sym) trace(\do_sym) -errorentry \sym \do_sym +.macro trace_idtentry sym do_sym has_error_code:req +idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code +idtentry \sym \do_sym has_error_code=\has_error_code .endm #else -.macro trace_errorentry sym do_sym -errorentry \sym \do_sym +.macro trace_idtentry sym do_sym has_error_code:req +idtentry \sym \do_sym has_error_code=\has_error_code .endm #endif - /* error code is on the stack already */ -.macro paranoiderrorentry sym do_sym -ENTRY(\sym) - XCPT_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call save_paranoid - DEFAULT_FRAME 0 - TRACE_IRQS_OFF - movq %rsp,%rdi /* pt_regs pointer */ - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ - call \do_sym - jmp paranoid_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm - -zeroentry divide_error do_divide_error -zeroentry overflow do_overflow -zeroentry bounds do_bounds -zeroentry invalid_op do_invalid_op -zeroentry device_not_available do_device_not_available -paranoiderrorentry double_fault do_double_fault -zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun -errorentry invalid_TSS do_invalid_TSS -errorentry segment_not_present do_segment_not_present -zeroentry spurious_interrupt_bug do_spurious_interrupt_bug -zeroentry coprocessor_error do_coprocessor_error -errorentry alignment_check do_alignment_check -zeroentry simd_coprocessor_error do_simd_coprocessor_error +idtentry divide_error do_divide_error has_error_code=0 +idtentry overflow do_overflow has_error_code=0 +idtentry bounds do_bounds has_error_code=0 +idtentry invalid_op do_invalid_op has_error_code=0 +idtentry device_not_available do_device_not_available has_error_code=0 +idtentry double_fault do_double_fault has_error_code=1 paranoid=1 +idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry invalid_TSS do_invalid_TSS has_error_code=1 +idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry coprocessor_error do_coprocessor_error has_error_code=0 +idtentry alignment_check do_alignment_check has_error_code=1 +idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 /* Reload gs selector with exception handling */ @@ -1373,7 +1349,7 @@ ENTRY(do_softirq_own_stack) END(do_softirq_own_stack) #ifdef CONFIG_XEN -zeroentry xen_hypervisor_callback xen_do_hypervisor_callback +idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 /* * A note on the "critical region" in our callback handler. @@ -1486,19 +1462,19 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ paranoidzeroentry_ist debug do_debug DEBUG_STACK paranoidzeroentry_ist int3 do_int3 DEBUG_STACK -paranoiderrorentry stack_segment do_stack_segment +idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1 #ifdef CONFIG_XEN -zeroentry xen_debug do_debug -zeroentry xen_int3 do_int3 -errorentry xen_stack_segment do_stack_segment +idtentry xen_debug do_debug has_error_code=0 +idtentry xen_int3 do_int3 has_error_code=0 +idtentry xen_stack_segment do_stack_segment has_error_code=1 #endif -errorentry general_protection do_general_protection -trace_errorentry page_fault do_page_fault +idtentry general_protection do_general_protection has_error_code=1 +trace_idtentry page_fault do_page_fault has_error_code=1 #ifdef CONFIG_KVM_GUEST -errorentry async_page_fault do_async_page_fault +idtentry async_page_fault do_async_page_fault has_error_code=1 #endif #ifdef CONFIG_X86_MCE -paranoidzeroentry machine_check *machine_check_vector(%rip) +idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) #endif /* -- cgit v1.2.3 From 577ed45ec56e9c43269df1c4a05c59ed19ea8e6a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 21 May 2014 15:07:09 -0700 Subject: x86_64, entry: Merge paranoidzeroentry_ist into idtentry One more specialized entry function is now gone. Again, this seems to only change line numbers in entry_64.o. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/f54854f07ff3be8162b166124dbead23feeefe10.1400709717.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_64.S | 47 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index eac9b81edfe1..be846d2468f7 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1203,8 +1203,15 @@ apicinterrupt IRQ_WORK_VECTOR \ /* * Exception entry points. */ -.macro idtentry sym do_sym has_error_code:req paranoid=0 +#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) + +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) + /* Sanity check */ + .if \shift_ist != -1 && \paranoid == 0 + .error "using shift_ist requires paranoid=1" + .endif + .if \has_error_code XCPT_FRAME .else @@ -1230,8 +1237,12 @@ ENTRY(\sym) DEFAULT_FRAME 0 .if \paranoid + .if \shift_ist != -1 + TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ + .else TRACE_IRQS_OFF .endif + .endif movq %rsp,%rdi /* pt_regs pointer */ @@ -1242,8 +1253,16 @@ ENTRY(\sym) xorl %esi,%esi /* no error code */ .endif + .if \shift_ist != -1 + subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + .endif + call \do_sym + .if \shift_ist != -1 + addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + .endif + .if \paranoid jmp paranoid_exit /* %ebx: no swapgs flag */ .else @@ -1254,28 +1273,6 @@ ENTRY(\sym) END(\sym) .endm -#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) -.macro paranoidzeroentry_ist sym do_sym ist -ENTRY(\sym) - INTR_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call save_paranoid - DEFAULT_FRAME 0 - TRACE_IRQS_OFF_DEBUG - movq %rsp,%rdi /* pt_regs pointer */ - xorl %esi,%esi /* no error code */ - subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) - call \do_sym - addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) - jmp paranoid_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm - #ifdef CONFIG_TRACING .macro trace_idtentry sym do_sym has_error_code:req idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code @@ -1460,8 +1457,8 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ */ .pushsection .kprobes.text, "ax" -paranoidzeroentry_ist debug do_debug DEBUG_STACK -paranoidzeroentry_ist int3 do_int3 DEBUG_STACK +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1 #ifdef CONFIG_XEN idtentry xen_debug do_debug has_error_code=0 -- cgit v1.2.3 From 2356aaeb2f58f491679dc0c38bc3f6dbe54e7ded Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 15 May 2014 17:56:57 +0200 Subject: KVM: x86: use new CS.RPL as CPL during task switch During task switch, all of CS.DPL, CS.RPL, SS.DPL must match (in addition to all the other requirements) and will be the new CPL. So far this worked by carefully setting the CS selector and flag before doing the task switch; setting CS.selector will already change the CPL. However, this will not work once we get the CPL from SS.DPL, because then you will have to set the full segment descriptor cache to change the CPL. ctxt->ops->cpl(ctxt) will then return the old CPL during the task switch, and the check that SS.DPL == CPL will fail. Temporarily assume that the CPL comes from CS.RPL during task switch to a protected-mode task. This is the same approach used in QEMU's emulation code, which (until version 2.0) manually tracks the CPL. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 60 +++++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e8a58409b5ac..47e716ef46b7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1410,11 +1410,11 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, } /* Does not support long mode */ -static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, int seg) +static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, + u16 selector, int seg, u8 cpl) { struct desc_struct seg_desc, old_desc; - u8 dpl, rpl, cpl; + u8 dpl, rpl; unsigned err_vec = GP_VECTOR; u32 err_code = 0; bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ @@ -1442,7 +1442,6 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, } rpl = selector & 3; - cpl = ctxt->ops->cpl(ctxt); /* NULL selector is not valid for TR, CS and SS (except for long mode) */ if ((seg == VCPU_SREG_CS @@ -1544,6 +1543,13 @@ exception: return X86EMUL_PROPAGATE_FAULT; } +static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, + u16 selector, int seg) +{ + u8 cpl = ctxt->ops->cpl(ctxt); + return __load_segment_descriptor(ctxt, selector, seg, cpl); +} + static void write_register_operand(struct operand *op) { /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ @@ -2405,6 +2411,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, struct tss_segment_16 *tss) { int ret; + u8 cpl; ctxt->_eip = tss->ip; ctxt->eflags = tss->flag | 2; @@ -2427,23 +2434,25 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); + cpl = tss->cs & 3; + /* * Now load segment descriptors. If fault happens at this stage * it is handled in a context of new task */ - ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR); + ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl); if (ret != X86EMUL_CONTINUE) return ret; @@ -2521,6 +2530,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, struct tss_segment_32 *tss) { int ret; + u8 cpl; if (ctxt->ops->set_cr(ctxt, 3, tss->cr3)) return emulate_gp(ctxt, 0); @@ -2539,7 +2549,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, /* * SDM says that segment selectors are loaded before segment - * descriptors + * descriptors. This is important because CPL checks will + * use CS.RPL. */ set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); @@ -2553,43 +2564,38 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, * If we're switching between Protected Mode and VM86, we need to make * sure to update the mode before loading the segment descriptors so * that the selectors are interpreted correctly. - * - * Need to get rflags to the vcpu struct immediately because it - * influences the CPL which is checked at least when loading the segment - * descriptors and when pushing an error code to the new kernel stack. - * - * TODO Introduce a separate ctxt->ops->set_cpl callback */ - if (ctxt->eflags & X86_EFLAGS_VM) + if (ctxt->eflags & X86_EFLAGS_VM) { ctxt->mode = X86EMUL_MODE_VM86; - else + cpl = 3; + } else { ctxt->mode = X86EMUL_MODE_PROT32; - - ctxt->ops->set_rflags(ctxt, ctxt->eflags); + cpl = tss->cs & 3; + } /* * Now load segment descriptors. If fault happenes at this stage * it is handled in a context of new task */ - ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); + ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS); + ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS); + ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl); if (ret != X86EMUL_CONTINUE) return ret; -- cgit v1.2.3 From fb5e336b977086557739791ed51955c5913dc773 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 15 May 2014 18:02:50 +0200 Subject: KVM: x86: drop set_rflags callback Not needed anymore now that the CPL is computed directly during task switch. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 1 - arch/x86/kvm/x86.c | 6 ------ 2 files changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 24ec1216596e..a04fe4eb237d 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -189,7 +189,6 @@ struct x86_emulate_ops { void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); - void (*set_rflags)(struct x86_emulate_ctxt *ctxt, ulong val); int (*cpl)(struct x86_emulate_ctxt *ctxt); int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fb313fc896dd..57eac309c22e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4646,11 +4646,6 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) return res; } -static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val) -{ - kvm_set_rflags(emul_to_vcpu(ctxt), val); -} - static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); @@ -4835,7 +4830,6 @@ static const struct x86_emulate_ops emulate_ops = { .set_idt = emulator_set_idt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, - .set_rflags = emulator_set_rflags, .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, -- cgit v1.2.3 From 5045b468037dfe1c848827ce10e99d87f5669160 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 15 May 2014 18:09:29 +0200 Subject: KVM: x86: check CS.DPL against RPL during task switch Table 7-1 of the SDM mentions a check that the code segment's DPL must match the selector's RPL. This was not done by KVM, fix it. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 47e716ef46b7..2fa7ab069817 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1411,7 +1411,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, /* Does not support long mode */ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, int seg, u8 cpl) + u16 selector, int seg, u8 cpl, bool in_task_switch) { struct desc_struct seg_desc, old_desc; u8 dpl, rpl; @@ -1486,6 +1486,9 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, goto exception; break; case VCPU_SREG_CS: + if (in_task_switch && rpl != dpl) + goto exception; + if (!(seg_desc.type & 8)) goto exception; @@ -1547,7 +1550,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg) { u8 cpl = ctxt->ops->cpl(ctxt); - return __load_segment_descriptor(ctxt, selector, seg, cpl); + return __load_segment_descriptor(ctxt, selector, seg, cpl, false); } static void write_register_operand(struct operand *op) @@ -2440,19 +2443,19 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, * Now load segment descriptors. If fault happens at this stage * it is handled in a context of new task */ - ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl); + ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl); + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl); + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl); + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl); + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; @@ -2577,25 +2580,25 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, * Now load segment descriptors. If fault happenes at this stage * it is handled in a context of new task */ - ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl); + ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl); + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl); + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl); + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl); + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl); + ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl); + ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, true); if (ret != X86EMUL_CONTINUE) return ret; -- cgit v1.2.3 From ae9fedc793c4d98aa9bb298585b2b9246096ce65 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 May 2014 09:39:49 +0200 Subject: KVM: x86: get CPL from SS.DPL CS.RPL is not equal to the CPL in the few instructions between setting CR0.PE and reloading CS. And CS.DPL is also not equal to the CPL for conforming code segments. However, SS.DPL *is* always equal to the CPL except for the weird case of SYSRET on AMD processors, which sets SS.DPL=SS.RPL from the value in the STAR MSR, but force CPL=3 (Intel instead forces SS.DPL=SS.RPL=CPL=3). So this patch: - modifies SVM to update the CPL from SS.DPL rather than CS.RPL; the above case with SYSRET is not broken further, and the way to fix it would be to pass the CPL to userspace and back - modifies VMX to always return the CPL from SS.DPL (except forcing it to 0 if we are emulating real mode via vm86 mode; in vm86 mode all DPLs have to be 3, but real mode does allow privileged instructions). It also removes the CPL cache, which becomes a duplicate of the SS access rights cache. This fixes doing KVM_IOCTL_SET_SREGS exactly after setting CR0.PE=1 but before CS has been reloaded. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/svm.c | 35 ++++++++++++++--------------------- arch/x86/kvm/vmx.c | 24 ++++-------------------- 3 files changed, 18 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e21aee98a5c2..49314155b66c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -130,7 +130,6 @@ enum kvm_reg_ex { VCPU_EXREG_PDPTR = NR_VCPU_REGS, VCPU_EXREG_CR3, VCPU_EXREG_RFLAGS, - VCPU_EXREG_CPL, VCPU_EXREG_SEGMENTS, }; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0b7d58d0c5fb..ec8366c5cfea 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1338,21 +1338,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); } -static void svm_update_cpl(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - int cpl; - - if (!is_protmode(vcpu)) - cpl = 0; - else if (svm->vmcb->save.rflags & X86_EFLAGS_VM) - cpl = 3; - else - cpl = svm->vmcb->save.cs.selector & 0x3; - - svm->vmcb->save.cpl = cpl; -} - static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { return to_svm(vcpu)->vmcb->save.rflags; @@ -1360,11 +1345,12 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags; - + /* + * Any change of EFLAGS.VM is accompained by a reload of SS + * (caused by either a task switch or an inter-privilege IRET), + * so we do not need to update the CPL here. + */ to_svm(vcpu)->vmcb->save.rflags = rflags; - if ((old_rflags ^ rflags) & X86_EFLAGS_VM) - svm_update_cpl(vcpu); } static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) @@ -1631,8 +1617,15 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; } - if (seg == VCPU_SREG_CS) - svm_update_cpl(vcpu); + + /* + * This is always accurate, except if SYSRET returned to a segment + * with SS.DPL != 3. Intel does not have this quirk, and always + * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it + * would entail passing the CPL to userspace and back. + */ + if (seg == VCPU_SREG_SS) + svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; mark_dirty(svm->vmcb, VMCB_SEG); } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6f7463f53ed9..a267108403f5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -414,7 +414,6 @@ struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; u8 fail; - u8 cpl; bool nmi_known_unmasked; u32 exit_intr_info; u32 idt_vectoring_info; @@ -3150,10 +3149,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu) fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); - - /* CPL is always 0 when CPU enters protected mode */ - __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); - vmx->cpl = 0; } static void fix_rmode_seg(int seg, struct kvm_segment *save) @@ -3555,22 +3550,14 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!is_protmode(vcpu)) + if (unlikely(vmx->rmode.vm86_active)) return 0; - - if (!is_long_mode(vcpu) - && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ - return 3; - - if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { - __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); - vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3; + else { + int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); + return AR_DPL(ar); } - - return vmx->cpl; } - static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; @@ -3598,8 +3585,6 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; vmx_segment_cache_clear(vmx); - if (seg == VCPU_SREG_CS) - __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { vmx->rmode.segs[seg] = *var; @@ -7471,7 +7456,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) | (1 << VCPU_EXREG_RFLAGS) - | (1 << VCPU_EXREG_CPL) | (1 << VCPU_EXREG_PDPTR) | (1 << VCPU_EXREG_SEGMENTS) | (1 << VCPU_EXREG_CR3)); -- cgit v1.2.3 From 65a7f03f6b534916e279e403dff41e1015dd0dce Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 16 May 2014 12:45:15 -0700 Subject: x86: fix page fault tracing when KVM guest support enabled I noticed on some of my systems that page fault tracing doesn't work: cd /sys/kernel/debug/tracing echo 1 > events/exceptions/enable cat trace; # nothing shows up I eventually traced it down to CONFIG_KVM_GUEST. At least in a KVM VM, enabling that option breaks page fault tracing, and disabling fixes it. I tried on some old kernels and this does not appear to be a regression: it never worked. There are two page-fault entry functions today. One when tracing is on and another when it is off. The KVM code calls do_page_fault() directly instead of calling the traced version: > dotraplinkage void __kprobes > do_async_page_fault(struct pt_regs *regs, unsigned long > error_code) > { > enum ctx_state prev_state; > > switch (kvm_read_and_reset_pf_reason()) { > default: > do_page_fault(regs, error_code); > break; > case KVM_PV_REASON_PAGE_NOT_PRESENT: I'm also having problems with the page fault tracing on bare metal (same symptom of no trace output). I'm unsure if it's related. Steven had an alternative to this which has zero overhead when tracing is off where this includes the standard noops even when tracing is disabled. I'm unconvinced that the extra complexity of his apporach: http://lkml.kernel.org/r/20140508194508.561ed220@gandalf.local.home is worth it, expecially considering that the KVM code is already making page fault entry slower here. This solution is dirt-simple. Cc: Thomas Gleixner Cc: x86@kernel.org Cc: Peter Zijlstra Cc: Gleb Natapov Cc: kvm@vger.kernel.org Cc: Paolo Bonzini Signed-off-by: Dave Hansen Acked-by: "H. Peter Anvin" Acked-by: Steven Rostedt Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/traps.h | 5 +++++ arch/x86/kernel/kvm.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 58d66fe06b61..8ba18842c48e 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -74,6 +74,11 @@ dotraplinkage void do_general_protection(struct pt_regs *, long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); #ifdef CONFIG_TRACING dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long); +#else +static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error) +{ + do_page_fault(regs, error); +} #endif dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); dotraplinkage void do_coprocessor_error(struct pt_regs *, long); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 0331cb389d68..7e97371387fd 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -259,7 +259,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) switch (kvm_read_and_reset_pf_reason()) { default: - do_page_fault(regs, error_code); + trace_do_page_fault(regs, error_code); break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ -- cgit v1.2.3 From 1f854112553a1d65363ab27d4ee3dfb4b27075fb Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 19 May 2014 09:50:50 +0300 Subject: KVM: vmx: DR7 masking on task switch emulation is wrong The DR7 masking which is done on task switch emulation should be in hex format (clearing the local breakpoints enable bits 0,2,4 and 6). Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a267108403f5..248287cefa7a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5401,7 +5401,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) } /* clear all local breakpoint enable flags */ - vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); + vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55); /* * TODO: What about debug traps on tss switch? -- cgit v1.2.3 From adc429d69937b3ef6fba2f8e8de9b7ce8347b662 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 14 Apr 2014 16:13:48 -0600 Subject: x86/PCI: Move pcibios_assign_resources() annotation to definition Move the pcibios_assign_resources() fs_initcall annotation next to the function definition. No functional change. Signed-off-by: Bjorn Helgaas --- arch/x86/pci/i386.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 6db58d68feb5..a19ed92e74e4 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -361,6 +361,12 @@ static int __init pcibios_assign_resources(void) return 0; } +/** + * called in fs_initcall (one below subsys_initcall), + * give a chance for motherboard reserve resources + */ +fs_initcall(pcibios_assign_resources); + void pcibios_resource_survey_bus(struct pci_bus *bus) { dev_printk(KERN_DEBUG, &bus->dev, "Allocating resources\n"); @@ -397,12 +403,6 @@ void __init pcibios_resource_survey(void) ioapic_insert_resources(); } -/** - * called in fs_initcall (one below subsys_initcall), - * give a chance for motherboard reserve resources - */ -fs_initcall(pcibios_assign_resources); - static const struct vm_operations_struct pci_mmap_ops = { .access = generic_access_phys, }; -- cgit v1.2.3 From a5d3244a0b1c16963fd7ceadf76da843df27c3c5 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 28 Apr 2014 15:16:33 -0600 Subject: x86/gart: Replace printk() with pr_info() Replace printk() with pr_info(), pr_err(), etc. Define pr_fmt() to prefix output with "AGP: ". No functional change except the addition of "AGP: " prefix in dmesg output. Signed-off-by: Bjorn Helgaas --- arch/x86/kernel/aperture_64.c | 54 +++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 9fa8aa051f54..b11edf2b656d 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -10,6 +10,8 @@ * * Copyright 2002 Andi Kleen, SuSE Labs. */ +#define pr_fmt(fmt) "AGP: " fmt + #include #include #include @@ -75,14 +77,13 @@ static u32 __init allocate_aperture(void) addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, aper_size, aper_size); if (!addr) { - printk(KERN_ERR - "Cannot allocate aperture memory hole (%lx,%uK)\n", - addr, aper_size>>10); + pr_err("Cannot allocate aperture memory hole (%lx,%uK)\n", + addr, aper_size>>10); return 0; } memblock_reserve(addr, aper_size); - printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", - aper_size >> 10, addr); + pr_info("Mapping aperture over %d KB of RAM @ %lx\n", aper_size >> 10, + addr); register_nosave_region(addr >> PAGE_SHIFT, (addr+aper_size) >> PAGE_SHIFT); @@ -126,10 +127,10 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) u64 aper; u32 old_order; - printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func); + pr_info("AGP bridge at %02x:%02x:%02x\n", bus, slot, func); apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14); if (apsizereg == 0xffffffff) { - printk(KERN_ERR "APSIZE in AGP bridge unreadable\n"); + pr_err("APSIZE in AGP bridge unreadable\n"); return 0; } @@ -153,16 +154,16 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) * On some sick chips, APSIZE is 0. It means it wants 4G * so let double check that order, and lets trust AMD NB settings: */ - printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n", - aper, 32 << old_order); + pr_info("Aperture from AGP @ %Lx old size %u MB\n", + aper, 32 << old_order); if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) { - printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n", - 32 << *order, apsizereg); + pr_info("Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n", + 32 << *order, apsizereg); *order = old_order; } - printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", - aper, 32 << *order, apsizereg); + pr_info("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", aper, + 32 << *order, apsizereg); if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20)) return 0; @@ -218,7 +219,7 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp) } } } - printk(KERN_INFO "No AGP bridge found\n"); + pr_info("No AGP bridge found\n"); return 0; } @@ -310,7 +311,7 @@ void __init early_gart_iommu_check(void) if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { /* reserve it, so we can reuse it in second kernel */ - printk(KERN_INFO "update e820 for GART\n"); + pr_info("update e820 for GART\n"); e820_add_region(aper_base, aper_size, E820_RESERVED); update_e820(); } @@ -354,7 +355,7 @@ int __init gart_iommu_hole_init(void) !early_pci_allowed()) return -ENODEV; - printk(KERN_INFO "Checking aperture...\n"); + pr_info("Checking aperture...\n"); if (!fallback_aper_force) agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); @@ -395,8 +396,8 @@ int __init gart_iommu_hole_init(void) aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; aper_base <<= 25; - printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n", - node, aper_base, aper_size >> 20); + pr_info("Node %d: aperture @ %Lx size %u MB\n", + node, aper_base, aper_size >> 20); node++; if (!aperture_valid(aper_base, aper_size, 64<<20)) { @@ -407,9 +408,9 @@ int __init gart_iommu_hole_init(void) if (!no_iommu && max_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) { - printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n"); - printk(KERN_ERR "please increase GART size in your BIOS setup\n"); - printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n"); + pr_err("you are using iommu with agp, but GART size is less than 64M\n"); + pr_err("please increase GART size in your BIOS setup\n"); + pr_err("if BIOS doesn't have that option, contact your HW vendor!\n"); printed_gart_size_msg = 1; } } else { @@ -446,13 +447,10 @@ out: force_iommu || valid_agp || fallback_aper_force) { - printk(KERN_INFO - "Your BIOS doesn't leave a aperture memory hole\n"); - printk(KERN_INFO - "Please enable the IOMMU option in the BIOS setup\n"); - printk(KERN_INFO - "This costs you %d MB of RAM\n", - 32 << fallback_aper_order); + pr_info("Your BIOS doesn't leave a aperture memory hole\n"); + pr_info("Please enable the IOMMU option in the BIOS setup\n"); + pr_info("This costs you %d MB of RAM\n", + 32 << fallback_aper_order); aper_order = fallback_aper_order; aper_alloc = allocate_aperture(); -- cgit v1.2.3 From c96ec95315b9242ec423b8348984c394d27a8135 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 14 Apr 2014 15:29:19 -0600 Subject: x86/gart: Tidy messages and add bridge device info Print the AGP bridge info the same way as the rest of the kernel, e.g., "0000:00:04.0" instead of "00:04:00". Also print the AGP aperture address range the same way we print resources, and label it explicitly as a bus address range. No functional change except the message changes. Signed-off-by: Bjorn Helgaas --- arch/x86/kernel/aperture_64.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index b11edf2b656d..76164e173a24 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -77,13 +77,13 @@ static u32 __init allocate_aperture(void) addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, aper_size, aper_size); if (!addr) { - pr_err("Cannot allocate aperture memory hole (%lx,%uK)\n", - addr, aper_size>>10); + pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n", + addr, addr + aper_size - 1, aper_size >> 10); return 0; } memblock_reserve(addr, aper_size); - pr_info("Mapping aperture over %d KB of RAM @ %lx\n", aper_size >> 10, - addr); + pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n", + addr, addr + aper_size - 1, aper_size >> 10); register_nosave_region(addr >> PAGE_SHIFT, (addr+aper_size) >> PAGE_SHIFT); @@ -127,10 +127,11 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) u64 aper; u32 old_order; - pr_info("AGP bridge at %02x:%02x:%02x\n", bus, slot, func); + pr_info("pci 0000:%02x:%02x:%02x: AGP bridge\n", bus, slot, func); apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14); if (apsizereg == 0xffffffff) { - pr_err("APSIZE in AGP bridge unreadable\n"); + pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n", + bus, slot, func); return 0; } @@ -154,15 +155,17 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) * On some sick chips, APSIZE is 0. It means it wants 4G * so let double check that order, and lets trust AMD NB settings: */ - pr_info("Aperture from AGP @ %Lx old size %u MB\n", - aper, 32 << old_order); + pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n", + bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1, + 32 << old_order); if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) { - pr_info("Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n", - 32 << *order, apsizereg); + pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n", + bus, slot, func, 32 << *order, apsizereg); *order = old_order; } - pr_info("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", aper, + pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n", + bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1, 32 << *order, apsizereg); if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20)) @@ -311,7 +314,8 @@ void __init early_gart_iommu_check(void) if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { /* reserve it, so we can reuse it in second kernel */ - pr_info("update e820 for GART\n"); + pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n", + aper_base, aper_base + aper_size - 1); e820_add_region(aper_base, aper_size, E820_RESERVED); update_e820(); } @@ -396,8 +400,9 @@ int __init gart_iommu_hole_init(void) aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; aper_base <<= 25; - pr_info("Node %d: aperture @ %Lx size %u MB\n", - node, aper_base, aper_size >> 20); + pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n", + node, aper_base, aper_base + aper_size - 1, + aper_size >> 20); node++; if (!aperture_valid(aper_base, aper_size, 64<<20)) { @@ -408,7 +413,7 @@ int __init gart_iommu_hole_init(void) if (!no_iommu && max_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) { - pr_err("you are using iommu with agp, but GART size is less than 64M\n"); + pr_err("you are using iommu with agp, but GART size is less than 64MB\n"); pr_err("please increase GART size in your BIOS setup\n"); pr_err("if BIOS doesn't have that option, contact your HW vendor!\n"); printed_gart_size_msg = 1; @@ -449,7 +454,7 @@ out: fallback_aper_force) { pr_info("Your BIOS doesn't leave a aperture memory hole\n"); pr_info("Please enable the IOMMU option in the BIOS setup\n"); - pr_info("This costs you %d MB of RAM\n", + pr_info("This costs you %dMB of RAM\n", 32 << fallback_aper_order); aper_order = fallback_aper_order; -- cgit v1.2.3 From fc57ac2c9ca8109ea97fcc594f4be436944230cc Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 May 2014 17:40:58 +0200 Subject: KVM: lapic: sync highest ISR to hardware apic on EOI When Hyper-V enlightenments are in effect, Windows prefers to issue an Hyper-V MSR write to issue an EOI rather than an x2apic MSR write. The Hyper-V MSR write is not handled by the processor, and besides being slower, this also causes bugs with APIC virtualization. The reason is that on EOI the processor will modify the highest in-service interrupt (SVI) field of the VMCS, as explained in section 29.1.4 of the SDM; every other step in EOI virtualization is already done by apic_send_eoi or on VM entry, but this one is missing. We need to do the same, and be careful not to muck with the isr_count and highest_isr_cache fields that are unused when virtual interrupt delivery is enabled. Cc: stable@vger.kernel.org Reviewed-by: Yang Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 62 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9736529ade08..006911858174 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -360,6 +360,8 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) static inline void apic_set_isr(int vec, struct kvm_lapic *apic) { + /* Note that we never get here with APIC virtualization enabled. */ + if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -371,12 +373,48 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) apic->highest_isr_cache = vec; } +static inline int apic_find_highest_isr(struct kvm_lapic *apic) +{ + int result; + + /* + * Note that isr_count is always 1, and highest_isr_cache + * is always -1, with APIC virtualization enabled. + */ + if (!apic->isr_count) + return -1; + if (likely(apic->highest_isr_cache != -1)) + return apic->highest_isr_cache; + + result = find_highest_vector(apic->regs + APIC_ISR); + ASSERT(result == -1 || result >= 16); + + return result; +} + static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) { - if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) + struct kvm_vcpu *vcpu; + if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) + return; + + vcpu = apic->vcpu; + + /* + * We do get here for APIC virtualization enabled if the guest + * uses the Hyper-V APIC enlightenment. In this case we may need + * to trigger a new interrupt delivery by writing the SVI field; + * on the other hand isr_count and highest_isr_cache are unused + * and must be left alone. + */ + if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, + apic_find_highest_isr(apic)); + else { --apic->isr_count; - BUG_ON(apic->isr_count < 0); - apic->highest_isr_cache = -1; + BUG_ON(apic->isr_count < 0); + apic->highest_isr_cache = -1; + } } int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) @@ -456,22 +494,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -static inline int apic_find_highest_isr(struct kvm_lapic *apic) -{ - int result; - - /* Note that isr_count is always 1 with vid enabled */ - if (!apic->isr_count) - return -1; - if (likely(apic->highest_isr_cache != -1)) - return apic->highest_isr_cache; - - result = find_highest_vector(apic->regs + APIC_ISR); - ASSERT(result == -1 || result >= 16); - - return result; -} - void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -1605,6 +1627,8 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) int vector = kvm_apic_has_interrupt(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; + /* Note that we never get here with APIC virtualization enabled. */ + if (vector == -1) return -1; -- cgit v1.2.3 From 9b88ae99d2fe11e359b3b3992aff953e28b0b43a Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sun, 25 May 2014 23:05:21 +0300 Subject: KVM: x86: MOV CR/DR emulation should ignore mod MOV CR/DR instructions ignore the mod field (in the ModR/M byte). As the SDM states: "The 2 bits in the mod field are ignored". Accordingly, the second operand of these instructions is always a general purpose register. The current emulator implementation does not do so. If the mod bits do not equal 3, it expects the second operand to be in memory. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2fa7ab069817..e4e833d3d7d7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -161,6 +161,7 @@ #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ #define NoWrite ((u64)1 << 45) /* No writeback */ #define SrcWrite ((u64)1 << 46) /* Write back src operand */ +#define NoMod ((u64)1 << 47) /* Mod field is ignored */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -1077,7 +1078,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, ctxt->modrm_rm |= (ctxt->modrm & 0x07); ctxt->modrm_seg = VCPU_SREG_DS; - if (ctxt->modrm_mod == 3) { + if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) { op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, @@ -3877,10 +3878,12 @@ static const struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, D(ImplicitOps | ModRM), /* 0x20 - 0x2F */ - DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read), - DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read), - IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), - IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), + DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read), + DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read), + IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write, + check_cr_write), + IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write, + check_dr_write), N, N, N, N, GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29), GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29), -- cgit v1.2.3 From 77945ca73e9a66cae25882fcab33ae0c6692763f Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Fri, 23 May 2014 19:33:44 -0700 Subject: x86/xen: map foreign pfns for autotranslated guests When running as a dom0 in PVH mode, foreign pfns that are accessed must be added to our p2m which is managed by xen. This is done via XENMEM_add_to_physmap_range hypercall. This is needed for toolstack building guests and mapping guest memory, xentrace mapping xen pages, etc. Signed-off-by: Mukesh Rathor Signed-off-by: David Vrabel --- arch/x86/xen/mmu.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 118 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index d91602424b39..6f6e15d28466 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2510,6 +2510,95 @@ void __init xen_hvm_init_mmu_ops(void) } #endif +#ifdef CONFIG_XEN_PVH +/* + * Map foreign gfn (fgfn), to local pfn (lpfn). This for the user + * space creating new guest on pvh dom0 and needing to map domU pages. + */ +static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn, + unsigned int domid) +{ + int rc, err = 0; + xen_pfn_t gpfn = lpfn; + xen_ulong_t idx = fgfn; + + struct xen_add_to_physmap_range xatp = { + .domid = DOMID_SELF, + .foreign_domid = domid, + .size = 1, + .space = XENMAPSPACE_gmfn_foreign, + }; + set_xen_guest_handle(xatp.idxs, &idx); + set_xen_guest_handle(xatp.gpfns, &gpfn); + set_xen_guest_handle(xatp.errs, &err); + + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); + if (rc < 0) + return rc; + return err; +} + +static int xlate_remove_from_p2m(unsigned long spfn, int count) +{ + struct xen_remove_from_physmap xrp; + int i, rc; + + for (i = 0; i < count; i++) { + xrp.domid = DOMID_SELF; + xrp.gpfn = spfn+i; + rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); + if (rc) + break; + } + return rc; +} + +struct xlate_remap_data { + unsigned long fgfn; /* foreign domain's gfn */ + pgprot_t prot; + domid_t domid; + int index; + struct page **pages; +}; + +static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, + void *data) +{ + int rc; + struct xlate_remap_data *remap = data; + unsigned long pfn = page_to_pfn(remap->pages[remap->index++]); + pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot)); + + rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid); + if (rc) + return rc; + native_set_pte(ptep, pteval); + + return 0; +} + +static int xlate_remap_gfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long mfn, + int nr, pgprot_t prot, unsigned domid, + struct page **pages) +{ + int err; + struct xlate_remap_data pvhdata; + + BUG_ON(!pages); + + pvhdata.fgfn = mfn; + pvhdata.prot = prot; + pvhdata.domid = domid; + pvhdata.index = 0; + pvhdata.pages = pages; + err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, + xlate_map_pte_fn, &pvhdata); + flush_tlb_all(); + return err; +} +#endif + #define REMAP_BATCH_SIZE 16 struct remap_data { @@ -2544,11 +2633,18 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, unsigned long range; int err = 0; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return -EINVAL; - BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); + if (xen_feature(XENFEAT_auto_translated_physmap)) { +#ifdef CONFIG_XEN_PVH + /* We need to update the local page tables and the xen HAP */ + return xlate_remap_gfn_range(vma, addr, mfn, nr, prot, + domid, pages); +#else + return -EINVAL; +#endif + } + rmd.mfn = mfn; rmd.prot = prot; @@ -2586,6 +2682,25 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) return 0; +#ifdef CONFIG_XEN_PVH + while (numpgs--) { + /* + * The mmu has already cleaned up the process mmu + * resources at this point (lookup_address will return + * NULL). + */ + unsigned long pfn = page_to_pfn(pages[numpgs]); + + xlate_remove_from_p2m(pfn, 1); + } + /* + * We don't need to flush tlbs because as part of + * xlate_remove_from_p2m, the hypervisor will do tlb flushes + * after removing the p2m entries from the EPT/NPT + */ + return 0; +#else return -EINVAL; +#endif } EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range); -- cgit v1.2.3 From 07d8391433380fc72f999d99c554b1cfedea9778 Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Mon, 12 May 2014 15:46:38 +0800 Subject: ACPICA: Linux headers: Add to remove mis-ordered inclusion of There is a mis-order inclusion for . As we will enforce including for all Linux ACPI users, we can find the inclusion order is as follows: (acenv.h before including aclinux.h) ........................................................................... (aclinux.h before including asm/acpi.h) @Redundant@ (ACPICA specific stuff) ........................................................................... ........................................................................... (Linux ACPI specific stuff) ? - - - - - - - - - - - - + (aclinux.h after including asm/acpi.h) @Invisible@ | (acenv.h after including aclinux.h) @Invisible@ | other ACPICA headers @Invisible@ | ............................................................|.............. | | (Excluded) | (Linux ACPI specific stuff) ! <- - - - - - - - - - - - - + NOTE that, in ACPICA, is more like Kconfig generated for Linux, it is meant to be included before including any ACPICA code. In the above figure, there is a question mark for "Linux ACPI specific stuff" in which should be included after including all other ACPICA header files. Thus they really need to be moved to the position marked with exclaimation mark or the definitions in the blocks marked with "@Invisible@" will be invisible to such architecture specific "Linux ACPI specific stuff" header blocks. This leaves 2 issues: 1. All environmental definitions in these blocks should have a copy in the area marked with "@Redundant@" if they are required by the "Linux ACPI specific stuff". 2. We cannot use any ACPICA defined types in . This patch splits architecture specific ACPICA stuff from to fix this issue. Signed-off-by: Lv Zheng Cc: Tony Luck Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Rafael J. Wysocki --- arch/ia64/include/asm/acenv.h | 71 +++++++++++++++++++++++++++++++++++++++++ arch/ia64/include/asm/acpi.h | 50 ----------------------------- arch/x86/include/asm/acenv.h | 65 +++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/acpi.h | 45 -------------------------- include/acpi/platform/aclinux.h | 2 +- 5 files changed, 137 insertions(+), 96 deletions(-) create mode 100644 arch/ia64/include/asm/acenv.h create mode 100644 arch/x86/include/asm/acenv.h (limited to 'arch/x86') diff --git a/arch/ia64/include/asm/acenv.h b/arch/ia64/include/asm/acenv.h new file mode 100644 index 000000000000..e0896eb26bf1 --- /dev/null +++ b/arch/ia64/include/asm/acenv.h @@ -0,0 +1,71 @@ +/* + * IA64 specific ACPICA environments and implementation + * + * Copyright (C) 2014, Intel Corporation + * Author: Lv Zheng + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_IA64_ACENV_H +#define _ASM_IA64_ACENV_H + +#include + +#define COMPILER_DEPENDENT_INT64 long +#define COMPILER_DEPENDENT_UINT64 unsigned long + +/* + * Calling conventions: + * + * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads) + * ACPI_EXTERNAL_XFACE - External ACPI interfaces + * ACPI_INTERNAL_XFACE - Internal ACPI interfaces + * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces + */ +#define ACPI_SYSTEM_XFACE +#define ACPI_EXTERNAL_XFACE +#define ACPI_INTERNAL_XFACE +#define ACPI_INTERNAL_VAR_XFACE + +/* Asm macros */ + +#define ACPI_FLUSH_CPU_CACHE() + +#ifdef CONFIG_ACPI + +static inline int +ia64_acpi_acquire_global_lock(unsigned int *lock) +{ + unsigned int old, new, val; + do { + old = *lock; + new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1)); + val = ia64_cmpxchg4_acq(lock, new, old); + } while (unlikely (val != old)); + return (new < 3) ? -1 : 0; +} + +static inline int +ia64_acpi_release_global_lock(unsigned int *lock) +{ + unsigned int old, new, val; + do { + old = *lock; + new = old & ~0x3; + val = ia64_cmpxchg4_acq(lock, new, old); + } while (unlikely (val != old)); + return old & 0x1; +} + +#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \ + ((Acq) = ia64_acpi_acquire_global_lock(&facs->global_lock)) + +#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \ + ((Acq) = ia64_acpi_release_global_lock(&facs->global_lock)) + +#endif + +#endif /* _ASM_IA64_ACENV_H */ diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h index d651102a4d45..b0ddcfd384a4 100644 --- a/arch/ia64/include/asm/acpi.h +++ b/arch/ia64/include/asm/acpi.h @@ -34,56 +34,6 @@ #include #include -#define COMPILER_DEPENDENT_INT64 long -#define COMPILER_DEPENDENT_UINT64 unsigned long - -/* - * Calling conventions: - * - * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads) - * ACPI_EXTERNAL_XFACE - External ACPI interfaces - * ACPI_INTERNAL_XFACE - Internal ACPI interfaces - * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces - */ -#define ACPI_SYSTEM_XFACE -#define ACPI_EXTERNAL_XFACE -#define ACPI_INTERNAL_XFACE -#define ACPI_INTERNAL_VAR_XFACE - -/* Asm macros */ - -#define ACPI_FLUSH_CPU_CACHE() - -static inline int -ia64_acpi_acquire_global_lock (unsigned int *lock) -{ - unsigned int old, new, val; - do { - old = *lock; - new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1)); - val = ia64_cmpxchg4_acq(lock, new, old); - } while (unlikely (val != old)); - return (new < 3) ? -1 : 0; -} - -static inline int -ia64_acpi_release_global_lock (unsigned int *lock) -{ - unsigned int old, new, val; - do { - old = *lock; - new = old & ~0x3; - val = ia64_cmpxchg4_acq(lock, new, old); - } while (unlikely (val != old)); - return old & 0x1; -} - -#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \ - ((Acq) = ia64_acpi_acquire_global_lock(&facs->global_lock)) - -#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \ - ((Acq) = ia64_acpi_release_global_lock(&facs->global_lock)) - #ifdef CONFIG_ACPI #define acpi_disabled 0 /* ACPI always enabled on IA64 */ #define acpi_noirq 0 /* ACPI always enabled on IA64 */ diff --git a/arch/x86/include/asm/acenv.h b/arch/x86/include/asm/acenv.h new file mode 100644 index 000000000000..6978584934f7 --- /dev/null +++ b/arch/x86/include/asm/acenv.h @@ -0,0 +1,65 @@ +/* + * X86 specific ACPICA environments and implementation + * + * Copyright (C) 2014, Intel Corporation + * Author: Lv Zheng + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_X86_ACENV_H +#define _ASM_X86_ACENV_H + +#include + +#define COMPILER_DEPENDENT_INT64 long long +#define COMPILER_DEPENDENT_UINT64 unsigned long long + +/* + * Calling conventions: + * + * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads) + * ACPI_EXTERNAL_XFACE - External ACPI interfaces + * ACPI_INTERNAL_XFACE - Internal ACPI interfaces + * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces + */ +#define ACPI_SYSTEM_XFACE +#define ACPI_EXTERNAL_XFACE +#define ACPI_INTERNAL_XFACE +#define ACPI_INTERNAL_VAR_XFACE + +/* Asm macros */ + +#define ACPI_FLUSH_CPU_CACHE() wbinvd() + +#ifdef CONFIG_ACPI + +int __acpi_acquire_global_lock(unsigned int *lock); +int __acpi_release_global_lock(unsigned int *lock); + +#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \ + ((Acq) = __acpi_acquire_global_lock(&facs->global_lock)) + +#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \ + ((Acq) = __acpi_release_global_lock(&facs->global_lock)) + +/* + * Math helper asm macros + */ +#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \ + asm("divl %2;" \ + : "=a"(q32), "=d"(r32) \ + : "r"(d32), \ + "0"(n_lo), "1"(n_hi)) + +#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \ + asm("shrl $1,%2 ;" \ + "rcrl $1,%3;" \ + : "=r"(n_hi), "=r"(n_lo) \ + : "0"(n_hi), "1"(n_lo)) + +#endif + +#endif /* _ASM_X86_ACENV_H */ diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index c8c1e700c26e..e06225eda635 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -32,51 +32,6 @@ #include #include -#define COMPILER_DEPENDENT_INT64 long long -#define COMPILER_DEPENDENT_UINT64 unsigned long long - -/* - * Calling conventions: - * - * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads) - * ACPI_EXTERNAL_XFACE - External ACPI interfaces - * ACPI_INTERNAL_XFACE - Internal ACPI interfaces - * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces - */ -#define ACPI_SYSTEM_XFACE -#define ACPI_EXTERNAL_XFACE -#define ACPI_INTERNAL_XFACE -#define ACPI_INTERNAL_VAR_XFACE - -/* Asm macros */ - -#define ACPI_FLUSH_CPU_CACHE() wbinvd() - -int __acpi_acquire_global_lock(unsigned int *lock); -int __acpi_release_global_lock(unsigned int *lock); - -#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \ - ((Acq) = __acpi_acquire_global_lock(&facs->global_lock)) - -#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \ - ((Acq) = __acpi_release_global_lock(&facs->global_lock)) - -/* - * Math helper asm macros - */ -#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \ - asm("divl %2;" \ - : "=a"(q32), "=d"(r32) \ - : "r"(d32), \ - "0"(n_lo), "1"(n_hi)) - - -#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \ - asm("shrl $1,%2 ;" \ - "rcrl $1,%3;" \ - : "=r"(n_hi), "=r"(n_lo) \ - : "0"(n_hi), "1"(n_lo)) - #ifdef CONFIG_ACPI extern int acpi_lapic; extern int acpi_ioapic; diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h index 0ab05d95e3a3..5d27e560a48e 100644 --- a/include/acpi/platform/aclinux.h +++ b/include/acpi/platform/aclinux.h @@ -71,7 +71,7 @@ #ifdef EXPORT_ACPI_INTERFACES #include #endif -#include +#include #ifndef CONFIG_ACPI -- cgit v1.2.3 From 92985ef1db428cc6129a1d375a68c277aa05820b Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Mon, 12 May 2014 15:46:49 +0800 Subject: ACPICA: Clean up redudant definitions already defined elsewhere Since mis-order issues have been solved, we can cleanup redundant definitions that already have defaults in . This patch removes redudant environments for __KERNEL__ surrounded code. Signed-off-by: Lv Zheng Cc: Tony Luck Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Rafael J. Wysocki --- arch/ia64/include/asm/acenv.h | 15 --------------- arch/x86/include/asm/acenv.h | 16 ---------------- include/acpi/platform/aclinux.h | 1 - 3 files changed, 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/include/asm/acenv.h b/arch/ia64/include/asm/acenv.h index e0896eb26bf1..3f9eaeec9873 100644 --- a/arch/ia64/include/asm/acenv.h +++ b/arch/ia64/include/asm/acenv.h @@ -17,23 +17,8 @@ #define COMPILER_DEPENDENT_INT64 long #define COMPILER_DEPENDENT_UINT64 unsigned long -/* - * Calling conventions: - * - * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads) - * ACPI_EXTERNAL_XFACE - External ACPI interfaces - * ACPI_INTERNAL_XFACE - Internal ACPI interfaces - * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces - */ -#define ACPI_SYSTEM_XFACE -#define ACPI_EXTERNAL_XFACE -#define ACPI_INTERNAL_XFACE -#define ACPI_INTERNAL_VAR_XFACE - /* Asm macros */ -#define ACPI_FLUSH_CPU_CACHE() - #ifdef CONFIG_ACPI static inline int diff --git a/arch/x86/include/asm/acenv.h b/arch/x86/include/asm/acenv.h index 6978584934f7..66873297e9f5 100644 --- a/arch/x86/include/asm/acenv.h +++ b/arch/x86/include/asm/acenv.h @@ -14,22 +14,6 @@ #include -#define COMPILER_DEPENDENT_INT64 long long -#define COMPILER_DEPENDENT_UINT64 unsigned long long - -/* - * Calling conventions: - * - * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads) - * ACPI_EXTERNAL_XFACE - External ACPI interfaces - * ACPI_INTERNAL_XFACE - Internal ACPI interfaces - * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces - */ -#define ACPI_SYSTEM_XFACE -#define ACPI_EXTERNAL_XFACE -#define ACPI_INTERNAL_XFACE -#define ACPI_INTERNAL_VAR_XFACE - /* Asm macros */ #define ACPI_FLUSH_CPU_CACHE() wbinvd() diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h index 5d27e560a48e..e70012956db3 100644 --- a/include/acpi/platform/aclinux.h +++ b/include/acpi/platform/aclinux.h @@ -48,7 +48,6 @@ #define ACPI_USE_SYSTEM_CLIBRARY #define ACPI_USE_DO_WHILE_0 -#define ACPI_MUTEX_TYPE ACPI_BINARY_SEMAPHORE #ifdef __KERNEL__ -- cgit v1.2.3 From 9e7f7231e451aa7a64edb9d1a84d6dd231019c5a Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 8 May 2014 11:44:19 -0500 Subject: x86/PCI: Clean up and mark early_root_info_init() as deprecated early_root_info_init() is now deprecated in favor of info in ACPI. Add a note to that effect. Also, clean up the code a bit. There is no functional change. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Bjorn Helgaas --- arch/x86/pci/amd_bus.c | 68 ++++++++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index aa936e3a2019..c20d2cc7ef64 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -11,28 +11,33 @@ #include "bus_numa.h" -/* - * This discovers the pcibus <-> node mapping on AMD K8. - * also get peer root bus resource for io,mmio - */ +#define AMD_NB_F0_NODE_ID 0x60 +#define AMD_NB_F0_UNIT_ID 0x64 +#define AMD_NB_F1_CONFIG_MAP_REG 0xe0 + +#define RANGE_NUM 16 +#define AMD_NB_F1_CONFIG_MAP_RANGES 4 -struct pci_hostbridge_probe { +struct amd_hostbridge { u32 bus; u32 slot; - u32 vendor; u32 device; }; -static struct pci_hostbridge_probe pci_probes[] __initdata = { - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1100 }, /* K8 */ - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, /* Fam10h */ - { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, /* Fam10h */ - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 }, /* Fam11h */ - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1600 }, /* Fam15h */ +/* + * IMPORTANT NOTE: + * hb_probes[] and early_root_info_init() is in maintenance mode. + * It only supports K8, Fam10h, Fam11h, and Fam15h_00h-0fh . + * Future processor will rely on information in ACPI. + */ +static struct amd_hostbridge hb_probes[] __initdata = { + { 0, 0x18, 0x1100 }, /* K8 */ + { 0, 0x18, 0x1200 }, /* Family10h */ + { 0xff, 0, 0x1200 }, /* Family10h */ + { 0, 0x18, 0x1300 }, /* Family11h */ + { 0, 0x18, 0x1600 }, /* Family15h */ }; -#define RANGE_NUM 16 - static struct pci_root_info __init *find_pci_root_info(int node, int link) { struct pci_root_info *info; @@ -46,12 +51,12 @@ static struct pci_root_info __init *find_pci_root_info(int node, int link) } /** - * early_fill_mp_bus_to_node() + * early_root_info_init() * called before pcibios_scan_root and pci_scan_bus - * fills the mp_bus_to_cpumask array based according to the LDT Bus Number - * Registers found in the K8 northbridge + * fills the mp_bus_to_cpumask array based according + * to the LDT Bus Number Registers found in the northbridge. */ -static int __init early_fill_mp_bus_info(void) +static int __init early_root_info_init(void) { int i; unsigned bus; @@ -76,19 +81,21 @@ static int __init early_fill_mp_bus_info(void) return -1; found = false; - for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { + for (i = 0; i < ARRAY_SIZE(hb_probes); i++) { u32 id; u16 device; u16 vendor; - bus = pci_probes[i].bus; - slot = pci_probes[i].slot; + bus = hb_probes[i].bus; + slot = hb_probes[i].slot; id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID); - vendor = id & 0xffff; device = (id>>16) & 0xffff; - if (pci_probes[i].vendor == vendor && - pci_probes[i].device == device) { + + if (vendor != PCI_VENDOR_ID_AMD) + continue; + + if (hb_probes[i].device == device) { found = true; break; } @@ -102,10 +109,11 @@ static int __init early_fill_mp_bus_info(void) * _CRS methods in the ACPI namespace. We extract node numbers * here to work around BIOSes that don't supply _PXM. */ - for (i = 0; i < 4; i++) { + for (i = 0; i < AMD_NB_F1_CONFIG_MAP_RANGES; i++) { int min_bus; int max_bus; - reg = read_pci_config(bus, slot, 1, 0xe0 + (i << 2)); + reg = read_pci_config(bus, slot, 1, + AMD_NB_F1_CONFIG_MAP_REG + (i << 2)); /* Check if that register is enabled for bus range */ if ((reg & 7) != 3) @@ -131,9 +139,9 @@ static int __init early_fill_mp_bus_info(void) return 0; /* get the default node and link for left over res */ - reg = read_pci_config(bus, slot, 0, 0x60); + reg = read_pci_config(bus, slot, 0, AMD_NB_F0_NODE_ID); def_node = (reg >> 8) & 0x07; - reg = read_pci_config(bus, slot, 0, 0x64); + reg = read_pci_config(bus, slot, 0, AMD_NB_F0_UNIT_ID); def_link = (reg >> 8) & 0x03; memset(range, 0, sizeof(range)); @@ -380,7 +388,7 @@ static int __init pci_io_ecs_init(void) int cpu; /* assume all cpus from fam10h have IO ECS */ - if (boot_cpu_data.x86 < 0x10) + if (boot_cpu_data.x86 < 0x10) return 0; /* Try the PCI method first. */ @@ -404,7 +412,7 @@ static int __init amd_postcore_init(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return 0; - early_fill_mp_bus_info(); + early_root_info_init(); pci_io_ecs_init(); return 0; -- cgit v1.2.3 From 56a41f9949f13dfcf6ceca15723d88d9f5288bb9 Mon Sep 17 00:00:00 2001 From: Yijing Wang Date: Sun, 4 May 2014 12:23:39 +0800 Subject: x86/PCI: Use pci_is_bridge() to simplify code Use pci_is_bridge() to simplify code. No functional change. Requires: 326c1cdae741 PCI: Rename pci_is_bridge() to pci_has_subordinate() Requires: 1c86438c9423 PCI: Add new pci_is_bridge() interface Signed-off-by: Yijing Wang Signed-off-by: Bjorn Helgaas --- arch/x86/pci/fixup.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 94ae9ae9574f..e5f000c02177 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -337,9 +337,7 @@ static void pci_fixup_video(struct pci_dev *pdev) * type BRIDGE, or CARDBUS. Host to PCI controllers use * PCI header type NORMAL. */ - if (bridge - && ((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE) - || (bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) { + if (bridge && (pci_is_bridge(bridge))) { pci_read_config_word(bridge, PCI_BRIDGE_CONTROL, &config); if (!(config & PCI_BRIDGE_CTL_VGA)) -- cgit v1.2.3 From a43ae58c848cfbadaba81c8d63202b4487f922a0 Mon Sep 17 00:00:00 2001 From: Hanjun Guo Date: Tue, 6 May 2014 11:29:52 +0800 Subject: PCI: Turn pcibios_penalize_isa_irq() into a weak function pcibios_penalize_isa_irq() is only implemented by x86 now, and legacy ISA is not used by some architectures. Make pcibios_penalize_isa_irq() a __weak function to simplify the code. This removes the need for new platforms to add stub implementations of pcibios_penalize_isa_irq(). [bhelgaas: changelog, comments] Signed-off-by: Hanjun Guo Signed-off-by: Bjorn Helgaas Acked-by: Arnd Bergmann --- arch/alpha/include/asm/pci.h | 5 ----- arch/arm/include/asm/pci.h | 5 ----- arch/blackfin/include/asm/pci.h | 5 ----- arch/cris/include/asm/pci.h | 1 - arch/frv/include/asm/pci.h | 2 -- arch/frv/mb93090-mb00/pci-irq.c | 4 ---- arch/ia64/include/asm/pci.h | 6 ------ arch/microblaze/include/asm/pci.h | 5 ----- arch/mips/include/asm/pci.h | 5 ----- arch/mn10300/include/asm/pci.h | 1 - arch/mn10300/unit-asb2305/pci-irq.c | 4 ---- arch/parisc/include/asm/pci.h | 5 ----- arch/powerpc/include/asm/pci.h | 5 ----- arch/sh/include/asm/pci.h | 5 ----- arch/sparc/include/asm/pci_32.h | 5 ----- arch/sparc/include/asm/pci_64.h | 5 ----- arch/unicore32/include/asm/pci.h | 5 ----- arch/x86/include/asm/pci.h | 1 - arch/xtensa/include/asm/pci.h | 5 ----- drivers/pci/pci.c | 11 +++++++++++ include/linux/pci.h | 1 + 21 files changed, 12 insertions(+), 79 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/pci.h b/arch/alpha/include/asm/pci.h index d01afb78919c..f7f680f7457d 100644 --- a/arch/alpha/include/asm/pci.h +++ b/arch/alpha/include/asm/pci.h @@ -59,11 +59,6 @@ struct pci_controller { extern void pcibios_set_master(struct pci_dev *dev); -extern inline void pcibios_penalize_isa_irq(int irq, int active) -{ - /* We don't do dynamic PCI IRQ allocation */ -} - /* IOMMU controls. */ /* The PCI address space does not equal the physical memory address space. diff --git a/arch/arm/include/asm/pci.h b/arch/arm/include/asm/pci.h index 680a83e94467..7e95d8535e24 100644 --- a/arch/arm/include/asm/pci.h +++ b/arch/arm/include/asm/pci.h @@ -31,11 +31,6 @@ static inline int pci_proc_domain(struct pci_bus *bus) } #endif /* CONFIG_PCI_DOMAINS */ -static inline void pcibios_penalize_isa_irq(int irq, int active) -{ - /* We don't do dynamic PCI IRQ allocation */ -} - /* * The PCI address space does equal the physical memory address space. * The networking and block device layers use this boolean for bounce diff --git a/arch/blackfin/include/asm/pci.h b/arch/blackfin/include/asm/pci.h index 74352c4597d9..c737909fba47 100644 --- a/arch/blackfin/include/asm/pci.h +++ b/arch/blackfin/include/asm/pci.h @@ -10,9 +10,4 @@ #define PCIBIOS_MIN_IO 0x00001000 #define PCIBIOS_MIN_MEM 0x10000000 -static inline void pcibios_penalize_isa_irq(int irq) -{ - /* We don't do dynamic PCI IRQ allocation */ -} - #endif /* _ASM_BFIN_PCI_H */ diff --git a/arch/cris/include/asm/pci.h b/arch/cris/include/asm/pci.h index f666734926d5..cc2399c175e9 100644 --- a/arch/cris/include/asm/pci.h +++ b/arch/cris/include/asm/pci.h @@ -20,7 +20,6 @@ void pcibios_config_init(void); struct pci_bus * pcibios_scan_root(int bus); void pcibios_set_master(struct pci_dev *dev); -void pcibios_penalize_isa_irq(int irq); struct irq_routing_table *pcibios_get_irq_routing_table(void); int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); diff --git a/arch/frv/include/asm/pci.h b/arch/frv/include/asm/pci.h index ef03baf5d89d..2035a4d3f9b9 100644 --- a/arch/frv/include/asm/pci.h +++ b/arch/frv/include/asm/pci.h @@ -24,8 +24,6 @@ struct pci_dev; extern void pcibios_set_master(struct pci_dev *dev); -extern void pcibios_penalize_isa_irq(int irq); - #ifdef CONFIG_MMU extern void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *dma_handle); extern void consistent_free(void *vaddr); diff --git a/arch/frv/mb93090-mb00/pci-irq.c b/arch/frv/mb93090-mb00/pci-irq.c index c677b9d81d30..1c35c93f942b 100644 --- a/arch/frv/mb93090-mb00/pci-irq.c +++ b/arch/frv/mb93090-mb00/pci-irq.c @@ -55,10 +55,6 @@ void __init pcibios_fixup_irqs(void) } } -void __init pcibios_penalize_isa_irq(int irq) -{ -} - void pcibios_enable_irq(struct pci_dev *dev) { pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq); diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h index 7d41cc089822..52af5ed9f60b 100644 --- a/arch/ia64/include/asm/pci.h +++ b/arch/ia64/include/asm/pci.h @@ -50,12 +50,6 @@ struct pci_dev; extern unsigned long ia64_max_iommu_merge_mask; #define PCI_DMA_BUS_IS_PHYS (ia64_max_iommu_merge_mask == ~0UL) -static inline void -pcibios_penalize_isa_irq (int irq, int active) -{ - /* We don't do dynamic PCI IRQ allocation */ -} - #include #ifdef CONFIG_PCI diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h index 935f9bec414a..335524040fff 100644 --- a/arch/microblaze/include/asm/pci.h +++ b/arch/microblaze/include/asm/pci.h @@ -44,11 +44,6 @@ struct pci_dev; */ #define pcibios_assign_all_busses() 0 -static inline void pcibios_penalize_isa_irq(int irq, int active) -{ - /* We don't do dynamic PCI IRQ allocation */ -} - #ifdef CONFIG_PCI extern void set_pci_dma_ops(struct dma_map_ops *dma_ops); extern struct dma_map_ops *get_pci_dma_ops(void); diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h index 12d6842962be..974b0e308963 100644 --- a/arch/mips/include/asm/pci.h +++ b/arch/mips/include/asm/pci.h @@ -73,11 +73,6 @@ extern unsigned long PCIBIOS_MIN_MEM; extern void pcibios_set_master(struct pci_dev *dev); -static inline void pcibios_penalize_isa_irq(int irq, int active) -{ - /* We don't do dynamic PCI IRQ allocation */ -} - #define HAVE_PCI_MMAP extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, diff --git a/arch/mn10300/include/asm/pci.h b/arch/mn10300/include/asm/pci.h index 166323824683..5f70af25c7d0 100644 --- a/arch/mn10300/include/asm/pci.h +++ b/arch/mn10300/include/asm/pci.h @@ -48,7 +48,6 @@ extern void unit_pci_init(void); #define PCIBIOS_MIN_MEM 0xB8000000 void pcibios_set_master(struct pci_dev *dev); -void pcibios_penalize_isa_irq(int irq); /* Dynamic DMA mapping stuff. * i386 has everything mapped statically. diff --git a/arch/mn10300/unit-asb2305/pci-irq.c b/arch/mn10300/unit-asb2305/pci-irq.c index 77439da04671..fcb28ceb824d 100644 --- a/arch/mn10300/unit-asb2305/pci-irq.c +++ b/arch/mn10300/unit-asb2305/pci-irq.c @@ -40,10 +40,6 @@ void __init pcibios_fixup_irqs(void) } } -void __init pcibios_penalize_isa_irq(int irq) -{ -} - void pcibios_enable_irq(struct pci_dev *dev) { pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq); diff --git a/arch/parisc/include/asm/pci.h b/arch/parisc/include/asm/pci.h index 465154076d23..20df2b04fc09 100644 --- a/arch/parisc/include/asm/pci.h +++ b/arch/parisc/include/asm/pci.h @@ -215,11 +215,6 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev, } #endif -static inline void pcibios_penalize_isa_irq(int irq, int active) -{ - /* We don't need to penalize isa irq's */ -} - static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) { return channel ? 15 : 14; diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 95145a15c708..1b0739bc14b5 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -46,11 +46,6 @@ struct pci_dev; #define pcibios_assign_all_busses() \ (pci_has_flag(PCI_REASSIGN_ALL_BUS)) -static inline void pcibios_penalize_isa_irq(int irq, int active) -{ - /* We don't do dynamic PCI IRQ allocation */ -} - #define HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) { diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index bff96c2e7d25..5b4511552998 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -70,11 +70,6 @@ extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine); extern void pcibios_set_master(struct pci_dev *dev); -static inline void pcibios_penalize_isa_irq(int irq, int active) -{ - /* We don't do dynamic PCI IRQ allocation */ -} - /* Dynamic DMA mapping stuff. * SuperH has everything mapped statically like x86. */ diff --git a/arch/sparc/include/asm/pci_32.h b/arch/sparc/include/asm/pci_32.h index dc503297481f..53e9b4987db0 100644 --- a/arch/sparc/include/asm/pci_32.h +++ b/arch/sparc/include/asm/pci_32.h @@ -16,11 +16,6 @@ #define PCI_IRQ_NONE 0xffffffff -static inline void pcibios_penalize_isa_irq(int irq, int active) -{ - /* We don't do dynamic PCI IRQ allocation */ -} - /* Dynamic DMA mapping stuff. */ #define PCI_DMA_BUS_IS_PHYS (0) diff --git a/arch/sparc/include/asm/pci_64.h b/arch/sparc/include/asm/pci_64.h index 1633b718d3bc..c6c7396e7627 100644 --- a/arch/sparc/include/asm/pci_64.h +++ b/arch/sparc/include/asm/pci_64.h @@ -16,11 +16,6 @@ #define PCI_IRQ_NONE 0xffffffff -static inline void pcibios_penalize_isa_irq(int irq, int active) -{ - /* We don't do dynamic PCI IRQ allocation */ -} - /* The PCI address space does not equal the physical memory * address space. The networking and block device layers use * this boolean for bounce buffer decisions. diff --git a/arch/unicore32/include/asm/pci.h b/arch/unicore32/include/asm/pci.h index f5e108f4a151..654407e98619 100644 --- a/arch/unicore32/include/asm/pci.h +++ b/arch/unicore32/include/asm/pci.h @@ -18,11 +18,6 @@ #include #include /* for PCIBIOS_MIN_* */ -static inline void pcibios_penalize_isa_irq(int irq, int active) -{ - /* We don't do dynamic PCI IRQ allocation */ -} - #ifdef CONFIG_PCI static inline void pci_dma_burst_advice(struct pci_dev *pdev, enum pci_dma_burst_strategy *strat, diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 96ae4f4040bb..0892ea0e683f 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -68,7 +68,6 @@ void pcibios_config_init(void); void pcibios_scan_root(int bus); void pcibios_set_master(struct pci_dev *dev); -void pcibios_penalize_isa_irq(int irq, int active); struct irq_routing_table *pcibios_get_irq_routing_table(void); int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); diff --git a/arch/xtensa/include/asm/pci.h b/arch/xtensa/include/asm/pci.h index 614be031a79a..5d52dc43dfe7 100644 --- a/arch/xtensa/include/asm/pci.h +++ b/arch/xtensa/include/asm/pci.h @@ -22,11 +22,6 @@ extern struct pci_controller* pcibios_alloc_controller(void); -static inline void pcibios_penalize_isa_irq(int irq) -{ - /* We don't do dynamic PCI IRQ allocation */ -} - /* Assume some values. (We should revise them, if necessary) */ #define PCIBIOS_MIN_IO 0x2000 diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 39012831867e..11f24912523c 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1468,6 +1468,17 @@ void __weak pcibios_release_device(struct pci_dev *dev) {} */ void __weak pcibios_disable_device (struct pci_dev *dev) {} +/** + * pcibios_penalize_isa_irq - penalize an ISA IRQ + * @irq: ISA IRQ to penalize + * @active: IRQ active or not + * + * Permits the platform to provide architecture-specific functionality when + * penalizing ISA IRQs. This is the default implementation. Architecture + * implementations can override this. + */ +void __weak pcibios_penalize_isa_irq(int irq, int active) {} + static void do_pci_disable_device(struct pci_dev *dev) { u16 pci_command; diff --git a/include/linux/pci.h b/include/linux/pci.h index 84182b153b21..018877b8b4e8 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1578,6 +1578,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state); int pcibios_add_device(struct pci_dev *dev); void pcibios_release_device(struct pci_dev *dev); +void pcibios_penalize_isa_irq(int irq, int active); #ifdef CONFIG_HIBERNATE_CALLBACKS extern struct dev_pm_ops pcibios_pm_ops; -- cgit v1.2.3 From 7ca22cfa0f04917c6dc22b9587f6a6448a7b67d3 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 28 May 2014 09:40:45 -0700 Subject: USB: remove CONFIG_USB_DEBUG from defconfig files Now that CONFIG_USB_DEBUG is gone, remove it from a number of defconfig files that were enabling it. Signed-off-by: Greg Kroah-Hartman --- arch/arm/configs/badge4_defconfig | 1 - arch/arm/configs/colibri_pxa300_defconfig | 1 - arch/arm/configs/ep93xx_defconfig | 1 - arch/arm/configs/footbridge_defconfig | 1 - arch/arm/configs/keystone_defconfig | 1 - arch/arm/configs/neponset_defconfig | 1 - arch/arm/configs/omap1_defconfig | 1 - arch/arm/configs/omap2plus_defconfig | 1 - arch/arm/configs/raumfeld_defconfig | 1 - arch/arm/configs/tct_hammer_defconfig | 1 - arch/parisc/configs/c3000_defconfig | 1 - arch/parisc/configs/generic-64bit_defconfig | 1 - arch/powerpc/configs/44x/currituck_defconfig | 1 - arch/powerpc/configs/ppc6xx_defconfig | 1 - arch/sh/configs/apsh4ad0a_defconfig | 1 - arch/sh/configs/rsk7264_defconfig | 1 - arch/sh/configs/rsk7269_defconfig | 1 - arch/sh/configs/sdk7780_defconfig | 1 - arch/sh/configs/se7343_defconfig | 1 - arch/x86/configs/i386_defconfig | 1 - arch/x86/configs/x86_64_defconfig | 1 - 21 files changed, 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/configs/badge4_defconfig b/arch/arm/configs/badge4_defconfig index 60b0b9368b1b..0494c8f229a2 100644 --- a/arch/arm/configs/badge4_defconfig +++ b/arch/arm/configs/badge4_defconfig @@ -73,7 +73,6 @@ CONFIG_SA1100_WATCHDOG=m CONFIG_SOUND=y CONFIG_SOUND_PRIME=y CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_MON=y CONFIG_USB_ACM=m CONFIG_USB_PRINTER=m diff --git a/arch/arm/configs/colibri_pxa300_defconfig b/arch/arm/configs/colibri_pxa300_defconfig index 2641dd6ed2f5..be02fe2b14cb 100644 --- a/arch/arm/configs/colibri_pxa300_defconfig +++ b/arch/arm/configs/colibri_pxa300_defconfig @@ -47,7 +47,6 @@ CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y # CONFIG_HID_SUPPORT is not set CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y CONFIG_USB_STORAGE=y diff --git a/arch/arm/configs/ep93xx_defconfig b/arch/arm/configs/ep93xx_defconfig index 6ac5ea73bd0a..1b650c85bdd0 100644 --- a/arch/arm/configs/ep93xx_defconfig +++ b/arch/arm/configs/ep93xx_defconfig @@ -80,7 +80,6 @@ CONFIG_I2C_DEBUG_BUS=y CONFIG_WATCHDOG=y CONFIG_EP93XX_WATCHDOG=y CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_DYNAMIC_MINORS=y CONFIG_USB_OHCI_HCD=y CONFIG_USB_OHCI_HCD_PLATFORM=y diff --git a/arch/arm/configs/footbridge_defconfig b/arch/arm/configs/footbridge_defconfig index 17bb3f7b5802..87e020f303ab 100644 --- a/arch/arm/configs/footbridge_defconfig +++ b/arch/arm/configs/footbridge_defconfig @@ -100,7 +100,6 @@ CONFIG_FB_CYBER2000=y CONFIG_SOUND=m # CONFIG_USB_HID is not set CONFIG_USB=m -CONFIG_USB_DEBUG=y CONFIG_USB_MON=m CONFIG_USB_PRINTER=m CONFIG_EXT2_FS=y diff --git a/arch/arm/configs/keystone_defconfig b/arch/arm/configs/keystone_defconfig index ec9a41d50680..095bb52671f6 100644 --- a/arch/arm/configs/keystone_defconfig +++ b/arch/arm/configs/keystone_defconfig @@ -135,7 +135,6 @@ CONFIG_WATCHDOG=y CONFIG_WATCHDOG_CORE=y CONFIG_DAVINCI_WATCHDOG=y CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y CONFIG_USB_XHCI_HCD=y diff --git a/arch/arm/configs/neponset_defconfig b/arch/arm/configs/neponset_defconfig index 63aa319b44bb..460dca4a4f98 100644 --- a/arch/arm/configs/neponset_defconfig +++ b/arch/arm/configs/neponset_defconfig @@ -68,7 +68,6 @@ CONFIG_SOUND=y CONFIG_SOUND_PRIME=y # CONFIG_USB_HID is not set CONFIG_USB=m -CONFIG_USB_DEBUG=y CONFIG_USB_MON=m CONFIG_USB_OHCI_HCD=m CONFIG_USB_STORAGE=m diff --git a/arch/arm/configs/omap1_defconfig b/arch/arm/configs/omap1_defconfig index 0f258d555fbc..ce541bb3c2de 100644 --- a/arch/arm/configs/omap1_defconfig +++ b/arch/arm/configs/omap1_defconfig @@ -197,7 +197,6 @@ CONFIG_SND_OMAP_SOC=y # CONFIG_USB_HID is not set CONFIG_USB=y CONFIG_USB_PHY=y -CONFIG_USB_DEBUG=y # CONFIG_USB_DEVICE_CLASS is not set CONFIG_USB_MON=y CONFIG_USB_OHCI_HCD=y diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index a4e8d017f25b..b793a5362cf2 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -217,7 +217,6 @@ CONFIG_SND_OMAP_SOC_OMAP_TWL4030=m CONFIG_SND_OMAP_SOC_OMAP_ABE_TWL6040=m CONFIG_SND_OMAP_SOC_OMAP3_PANDORA=m CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y CONFIG_USB_WDM=y diff --git a/arch/arm/configs/raumfeld_defconfig b/arch/arm/configs/raumfeld_defconfig index f7caa909b40d..3d833aea545a 100644 --- a/arch/arm/configs/raumfeld_defconfig +++ b/arch/arm/configs/raumfeld_defconfig @@ -122,7 +122,6 @@ CONFIG_HID_TOPSEED=y CONFIG_HID_THRUSTMASTER=y CONFIG_HID_ZEROPLUS=y CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y CONFIG_USB_OHCI_HCD=y diff --git a/arch/arm/configs/tct_hammer_defconfig b/arch/arm/configs/tct_hammer_defconfig index 71277a1591ba..7209a2caefcf 100644 --- a/arch/arm/configs/tct_hammer_defconfig +++ b/arch/arm/configs/tct_hammer_defconfig @@ -47,7 +47,6 @@ CONFIG_BLK_DEV_RAM_SIZE=10240 # CONFIG_VGA_CONSOLE is not set # CONFIG_HID_SUPPORT is not set CONFIG_USB=y -CONFIG_USB_DEBUG=y # CONFIG_USB_DEVICE_CLASS is not set CONFIG_USB_MON=y CONFIG_USB_OHCI_HCD=y diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig index b2c3905682d9..fb92b8920785 100644 --- a/arch/parisc/configs/c3000_defconfig +++ b/arch/parisc/configs/c3000_defconfig @@ -127,7 +127,6 @@ CONFIG_SND_SEQUENCER_OSS=y CONFIG_SND_AD1889=y CONFIG_USB_HIDDEV=y CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_OHCI_HCD=y CONFIG_USB_PRINTER=m CONFIG_USB_STORAGE=m diff --git a/arch/parisc/configs/generic-64bit_defconfig b/arch/parisc/configs/generic-64bit_defconfig index 28c1b5de044e..dc0d7ce71ea7 100644 --- a/arch/parisc/configs/generic-64bit_defconfig +++ b/arch/parisc/configs/generic-64bit_defconfig @@ -219,7 +219,6 @@ CONFIG_HIDRAW=y CONFIG_HID_PID=y CONFIG_USB_HIDDEV=y CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_DYNAMIC_MINORS=y CONFIG_USB_MON=m diff --git a/arch/powerpc/configs/44x/currituck_defconfig b/arch/powerpc/configs/44x/currituck_defconfig index 4192322f8a7f..47de68261443 100644 --- a/arch/powerpc/configs/44x/currituck_defconfig +++ b/arch/powerpc/configs/44x/currituck_defconfig @@ -71,7 +71,6 @@ CONFIG_I2C_IBM_IIC=y # CONFIG_HWMON is not set CONFIG_THERMAL=y CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_OHCI_HCD=y CONFIG_RTC_CLASS=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index b21156372b24..c91066944842 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -964,7 +964,6 @@ CONFIG_HID_SAMSUNG=y CONFIG_HID_SONY=y CONFIG_HID_SUNPLUS=y CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y # CONFIG_USB_DEVICE_CLASS is not set CONFIG_USB_MON=y diff --git a/arch/sh/configs/apsh4ad0a_defconfig b/arch/sh/configs/apsh4ad0a_defconfig index 95ae23fcfdd6..ec70475da890 100644 --- a/arch/sh/configs/apsh4ad0a_defconfig +++ b/arch/sh/configs/apsh4ad0a_defconfig @@ -93,7 +93,6 @@ CONFIG_FONT_8x8=y CONFIG_FONT_8x16=y CONFIG_LOGO=y CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_MON=y CONFIG_USB_OHCI_HCD=y CONFIG_USB_STORAGE=y diff --git a/arch/sh/configs/rsk7264_defconfig b/arch/sh/configs/rsk7264_defconfig index 1600426224c2..eecdf65bb789 100644 --- a/arch/sh/configs/rsk7264_defconfig +++ b/arch/sh/configs/rsk7264_defconfig @@ -60,7 +60,6 @@ CONFIG_SERIAL_SH_SCI_NR_UARTS=8 CONFIG_SERIAL_SH_SCI_CONSOLE=y # CONFIG_HWMON is not set CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y # CONFIG_USB_DEVICE_CLASS is not set CONFIG_USB_R8A66597_HCD=y diff --git a/arch/sh/configs/rsk7269_defconfig b/arch/sh/configs/rsk7269_defconfig index 9f062b5837d7..8370b10df357 100644 --- a/arch/sh/configs/rsk7269_defconfig +++ b/arch/sh/configs/rsk7269_defconfig @@ -43,7 +43,6 @@ CONFIG_SERIAL_SH_SCI_NR_UARTS=8 CONFIG_SERIAL_SH_SCI_CONSOLE=y # CONFIG_HWMON is not set CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y # CONFIG_USB_DEVICE_CLASS is not set CONFIG_USB_R8A66597_HCD=y diff --git a/arch/sh/configs/sdk7780_defconfig b/arch/sh/configs/sdk7780_defconfig index fa37e897399b..6a96b9a2f7a5 100644 --- a/arch/sh/configs/sdk7780_defconfig +++ b/arch/sh/configs/sdk7780_defconfig @@ -100,7 +100,6 @@ CONFIG_HID_SAMSUNG=y CONFIG_HID_SONY=y CONFIG_HID_SUNPLUS=y CONFIG_USB=y -CONFIG_USB_DEBUG=y # CONFIG_USB_DEVICE_CLASS is not set CONFIG_USB_MON=y CONFIG_USB_EHCI_HCD=y diff --git a/arch/sh/configs/se7343_defconfig b/arch/sh/configs/se7343_defconfig index af3fe7352471..201acb4652f7 100644 --- a/arch/sh/configs/se7343_defconfig +++ b/arch/sh/configs/se7343_defconfig @@ -92,7 +92,6 @@ CONFIG_HID_SAMSUNG=y CONFIG_HID_SONY=y CONFIG_HID_SUNPLUS=y CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_ISP116X_HCD=y CONFIG_UIO=y diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 619e7f7426c6..32d2e7056c87 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -244,7 +244,6 @@ CONFIG_HID_TOPSEED=y CONFIG_HID_PID=y CONFIG_USB_HIDDEV=y CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y CONFIG_USB_EHCI_HCD=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 6181c69b786b..a481dd4755d5 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -239,7 +239,6 @@ CONFIG_HID_TOPSEED=y CONFIG_HID_PID=y CONFIG_USB_HIDDEV=y CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y CONFIG_USB_EHCI_HCD=y -- cgit v1.2.3 From 9c15a24b038f4d8da93a2bc2554731f8953a7c17 Mon Sep 17 00:00:00 2001 From: Mathieu Souchaud Date: Wed, 28 May 2014 09:12:37 +0200 Subject: x86/mce: Improve mcheck_init_device() error handling Check return code of every function called by mcheck_init_device(). Signed-off-by: Mathieu Souchaud Link: http://lkml.kernel.org/r/1399151031-19905-1-git-send-email-mattieu.souchaud@free.fr Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 47 ++++++++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 68317c80de7f..0078761219a2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2437,32 +2437,65 @@ static __init int mcheck_init_device(void) int err; int i = 0; - if (!mce_available(&boot_cpu_data)) - return -EIO; + if (!mce_available(&boot_cpu_data)) { + err = -EIO; + goto err_out; + } - zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); + if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) { + err = -ENOMEM; + goto err_out; + } mce_init_banks(); err = subsys_system_register(&mce_subsys, NULL); if (err) - return err; + goto err_out_mem; cpu_notifier_register_begin(); for_each_online_cpu(i) { err = mce_device_create(i); if (err) { cpu_notifier_register_done(); - return err; + goto err_device_create; } } - register_syscore_ops(&mce_syscore_ops); __register_hotcpu_notifier(&mce_cpu_notifier); cpu_notifier_register_done(); + register_syscore_ops(&mce_syscore_ops); + /* register character device /dev/mcelog */ - misc_register(&mce_chrdev_device); + err = misc_register(&mce_chrdev_device); + if (err) + goto err_register; + + return 0; + +err_register: + unregister_syscore_ops(&mce_syscore_ops); + + cpu_notifier_register_begin(); + __unregister_hotcpu_notifier(&mce_cpu_notifier); + cpu_notifier_register_done(); + +err_device_create: + /* + * We didn't keep track of which devices were created above, but + * even if we had, the set of online cpus might have changed. + * Play safe and remove for every possible cpu, since + * mce_device_remove() will do the right thing. + */ + for_each_possible_cpu(i) + mce_device_remove(i); + +err_out_mem: + free_cpumask_var(mce_device_initialized); + +err_out: + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); return err; } -- cgit v1.2.3 From 716079f66eacd31d040db9cd0627ca0d625d6126 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 23 May 2014 11:06:35 +0200 Subject: mce: Panic when a core has reached a timeout There is very little and maybe practically nothing we can do to recover from a system where at least one core has reached a timeout during the whole monarch cores gathering. So panic when that happens. Link: http://lkml.kernel.org/r/20140523091041.GA21332@pd.tnic Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0078761219a2..6cc800381d14 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -704,8 +704,7 @@ static int mce_timed_out(u64 *t) if (!mca_cfg.monarch_timeout) goto out; if ((s64)*t < SPINUNIT) { - /* CHECKME: Make panic default for 1 too? */ - if (mca_cfg.tolerant < 1) + if (mca_cfg.tolerant <= 1) mce_panic("Timeout synchronizing machine check over CPUs", NULL, NULL); cpu_missing = 1; -- cgit v1.2.3 From 011561837dad082a92c0537db2d134e66419c6ad Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 30 May 2014 08:48:48 -0700 Subject: x86/vdso, build: When vdso2c fails, unlink the output This avoids bizarre failures if make is run again. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/1764385fe9931e8940b9d001132515448ea89523.1401464755.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vdso2c.c | 20 +++++++++++--------- arch/x86/vdso/vdso2c.h | 10 +++------- 2 files changed, 14 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c index 81edd1ec9df8..fe8bfbf62612 100644 --- a/arch/x86/vdso/vdso2c.c +++ b/arch/x86/vdso/vdso2c.c @@ -14,6 +14,8 @@ #include #include +const char *outfilename; + /* Symbols that we need in vdso2c. */ enum { sym_vvar_page, @@ -44,6 +46,7 @@ static void fail(const char *format, ...) va_start(ap, format); fprintf(stderr, "Error: "); vfprintf(stderr, format, ap); + unlink(outfilename); exit(1); va_end(ap); } @@ -82,17 +85,16 @@ static void fail(const char *format, ...) #undef Elf_Sym #undef Elf_Dyn -static int go(void *addr, size_t len, FILE *outfile, const char *name) +static void go(void *addr, size_t len, FILE *outfile, const char *name) { Elf64_Ehdr *hdr = (Elf64_Ehdr *)addr; if (hdr->e_ident[EI_CLASS] == ELFCLASS64) { - return go64(addr, len, outfile, name); + go64(addr, len, outfile, name); } else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) { - return go32(addr, len, outfile, name); + go32(addr, len, outfile, name); } else { - fprintf(stderr, "Error: unknown ELF class\n"); - return 1; + fail("unknown ELF class\n"); } } @@ -102,7 +104,6 @@ int main(int argc, char **argv) off_t len; void *addr; FILE *outfile; - int ret; char *name, *tmp; int namelen; @@ -143,14 +144,15 @@ int main(int argc, char **argv) if (addr == MAP_FAILED) err(1, "mmap"); - outfile = fopen(argv[2], "w"); + outfilename = argv[2]; + outfile = fopen(outfilename, "w"); if (!outfile) err(1, "%s", argv[2]); - ret = go(addr, (size_t)len, outfile, name); + go(addr, (size_t)len, outfile, name); munmap(addr, len); fclose(outfile); - return ret; + return 0; } diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h index 3dcc61e796e9..26a7c1fa7452 100644 --- a/arch/x86/vdso/vdso2c.h +++ b/arch/x86/vdso/vdso2c.h @@ -4,7 +4,7 @@ * are built for 32-bit userspace. */ -static int GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) +static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) { int found_load = 0; unsigned long load_size = -1; /* Work around bogus warning */ @@ -62,10 +62,8 @@ static int GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) alt_sec = sh; } - if (!symtab_hdr) { + if (!symtab_hdr) fail("no symbol table\n"); - return 1; - } strtab_hdr = addr + hdr->e_shoff + hdr->e_shentsize * symtab_hdr->sh_link; @@ -112,7 +110,7 @@ static int GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) if (!name) { fwrite(addr, load_size, 1, outfile); - return 0; + return; } fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n"); @@ -152,6 +150,4 @@ static int GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) required_syms[i], syms[i]); } fprintf(outfile, "};\n"); - - return 0; } -- cgit v1.2.3 From add4eed0a2abea3951206f504330ee5daf8c178a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 30 May 2014 08:48:49 -0700 Subject: x86/vdso, build: Fix cross-compilation from big-endian architectures This adds a macro GET(x) to convert x from big-endian to little-endian. Hopefully I put it everywhere it needs to go and got all the cases needed for everyone's linux/elf.h. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/2cf258df123cb24bad63c274c8563c050547d99d.1401464755.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vdso2c.c | 15 ++++++++++++ arch/x86/vdso/vdso2c.h | 63 ++++++++++++++++++++++++++++---------------------- 2 files changed, 50 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c index fe8bfbf62612..de19ced6c87d 100644 --- a/arch/x86/vdso/vdso2c.c +++ b/arch/x86/vdso/vdso2c.c @@ -51,6 +51,21 @@ static void fail(const char *format, ...) va_end(ap); } +/* + * Evil macros to do a little-endian read. + */ +#define __GET_TYPE(x, type, bits, ifnot) \ + __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(x), type), \ + le##bits##toh((x)), ifnot) + +extern void bad_get(uint64_t); + +#define GET(x) \ + __GET_TYPE((x), __u32, 32, __GET_TYPE((x), __u64, 64, \ + __GET_TYPE((x), __s32, 32, __GET_TYPE((x), __s64, 64, \ + __GET_TYPE((x), __u16, 16, bad_get(x)))))) + #define NSYMS (sizeof(required_syms) / sizeof(required_syms[0])) #define BITS 64 diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h index 26a7c1fa7452..f0475dad2286 100644 --- a/arch/x86/vdso/vdso2c.h +++ b/arch/x86/vdso/vdso2c.h @@ -18,25 +18,27 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) const char *secstrings; uint64_t syms[NSYMS] = {}; - Elf_Phdr *pt = (Elf_Phdr *)(addr + hdr->e_phoff); + Elf_Phdr *pt = (Elf_Phdr *)(addr + GET(hdr->e_phoff)); /* Walk the segment table. */ - for (i = 0; i < hdr->e_phnum; i++) { - if (pt[i].p_type == PT_LOAD) { + for (i = 0; i < GET(hdr->e_phnum); i++) { + if (GET(pt[i].p_type) == PT_LOAD) { if (found_load) fail("multiple PT_LOAD segs\n"); - if (pt[i].p_offset != 0 || pt[i].p_vaddr != 0) + if (GET(pt[i].p_offset) != 0 || + GET(pt[i].p_vaddr) != 0) fail("PT_LOAD in wrong place\n"); - if (pt[i].p_memsz != pt[i].p_filesz) + if (GET(pt[i].p_memsz) != GET(pt[i].p_filesz)) fail("cannot handle memsz != filesz\n"); - load_size = pt[i].p_memsz; + load_size = GET(pt[i].p_memsz); found_load = 1; - } else if (pt[i].p_type == PT_DYNAMIC) { - dyn = addr + pt[i].p_offset; - dyn_end = addr + pt[i].p_offset + pt[i].p_memsz; + } else if (GET(pt[i].p_type) == PT_DYNAMIC) { + dyn = addr + GET(pt[i].p_offset); + dyn_end = addr + GET(pt[i].p_offset) + + GET(pt[i].p_memsz); } } if (!found_load) @@ -44,43 +46,48 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) data_size = (load_size + 4095) / 4096 * 4096; /* Walk the dynamic table */ - for (i = 0; dyn + i < dyn_end && dyn[i].d_tag != DT_NULL; i++) { - if (dyn[i].d_tag == DT_REL || dyn[i].d_tag == DT_RELSZ || - dyn[i].d_tag == DT_RELENT || dyn[i].d_tag == DT_TEXTREL) + for (i = 0; dyn + i < dyn_end && GET(dyn[i].d_tag) != DT_NULL; i++) { + typeof(dyn[i].d_tag) tag = GET(dyn[i].d_tag); + if (tag == DT_REL || tag == DT_RELSZ || + tag == DT_RELENT || tag == DT_TEXTREL) fail("vdso image contains dynamic relocations\n"); } /* Walk the section table */ - secstrings_hdr = addr + hdr->e_shoff + hdr->e_shentsize*hdr->e_shstrndx; - secstrings = addr + secstrings_hdr->sh_offset; - for (i = 0; i < hdr->e_shnum; i++) { - Elf_Shdr *sh = addr + hdr->e_shoff + hdr->e_shentsize * i; - if (sh->sh_type == SHT_SYMTAB) + secstrings_hdr = addr + GET(hdr->e_shoff) + + GET(hdr->e_shentsize)*GET(hdr->e_shstrndx); + secstrings = addr + GET(secstrings_hdr->sh_offset); + for (i = 0; i < GET(hdr->e_shnum); i++) { + Elf_Shdr *sh = addr + GET(hdr->e_shoff) + + GET(hdr->e_shentsize) * i; + if (GET(sh->sh_type) == SHT_SYMTAB) symtab_hdr = sh; - if (!strcmp(secstrings + sh->sh_name, ".altinstructions")) + if (!strcmp(secstrings + GET(sh->sh_name), ".altinstructions")) alt_sec = sh; } if (!symtab_hdr) fail("no symbol table\n"); - strtab_hdr = addr + hdr->e_shoff + - hdr->e_shentsize * symtab_hdr->sh_link; + strtab_hdr = addr + GET(hdr->e_shoff) + + GET(hdr->e_shentsize) * GET(symtab_hdr->sh_link); /* Walk the symbol table */ - for (i = 0; i < symtab_hdr->sh_size / symtab_hdr->sh_entsize; i++) { + for (i = 0; i < GET(symtab_hdr->sh_size) / GET(symtab_hdr->sh_entsize); + i++) { int k; - Elf_Sym *sym = addr + symtab_hdr->sh_offset + - symtab_hdr->sh_entsize * i; - const char *name = addr + strtab_hdr->sh_offset + sym->st_name; + Elf_Sym *sym = addr + GET(symtab_hdr->sh_offset) + + GET(symtab_hdr->sh_entsize) * i; + const char *name = addr + GET(strtab_hdr->sh_offset) + + GET(sym->st_name); for (k = 0; k < NSYMS; k++) { if (!strcmp(name, required_syms[k])) { if (syms[k]) { fail("duplicate symbol %s\n", required_syms[k]); } - syms[k] = sym->st_value; + syms[k] = GET(sym->st_value); } } } @@ -106,7 +113,7 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) hdr->e_shoff = 0; hdr->e_shentsize = 0; hdr->e_shnum = 0; - hdr->e_shstrndx = SHN_UNDEF; + hdr->e_shstrndx = htole16(SHN_UNDEF); if (!name) { fwrite(addr, load_size, 1, outfile); @@ -140,9 +147,9 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) fprintf(outfile, "\t},\n"); if (alt_sec) { fprintf(outfile, "\t.alt = %lu,\n", - (unsigned long)alt_sec->sh_offset); + (unsigned long)GET(alt_sec->sh_offset)); fprintf(outfile, "\t.alt_len = %lu,\n", - (unsigned long)alt_sec->sh_size); + (unsigned long)GET(alt_sec->sh_size)); } for (i = 0; i < NSYMS; i++) { if (syms[i]) -- cgit v1.2.3 From c191920f737a09a7252088f018f6747f0d2f484d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 30 May 2014 17:03:22 -0700 Subject: x86/vdso, build: Make LE access macros clearer, host-safe Make it a little clearer what the littleendian access macros in vdso2c.[ch] actually do. This way they can probably also be moved to a central location (e.g. tools/include) for the benefit of other host tools. We should avoid implementation namespace symbols when writing code that is compiling for the compiler host, so avoid names starting with double underscore or underscore-capital. Signed-off-by: H. Peter Anvin Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/2cf258df123cb24bad63c274c8563c050547d99d.1401464755.git.luto@amacapital.net --- arch/x86/vdso/vdso2c.c | 16 ++++++------- arch/x86/vdso/vdso2c.h | 65 ++++++++++++++++++++++++++------------------------ 2 files changed, 42 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c index de19ced6c87d..deabaf5bfb89 100644 --- a/arch/x86/vdso/vdso2c.c +++ b/arch/x86/vdso/vdso2c.c @@ -54,17 +54,17 @@ static void fail(const char *format, ...) /* * Evil macros to do a little-endian read. */ -#define __GET_TYPE(x, type, bits, ifnot) \ +#define GLE(x, bits, ifnot) \ __builtin_choose_expr( \ - __builtin_types_compatible_p(typeof(x), type), \ - le##bits##toh((x)), ifnot) + (sizeof(x) == bits/8), \ + (__typeof__(x))le##bits##toh(x), ifnot) -extern void bad_get(uint64_t); +extern void bad_get_le(uint64_t); +#define LAST_LE(x) \ + __builtin_choose_expr(sizeof(x) == 1, (x), bad_get_le(x)) -#define GET(x) \ - __GET_TYPE((x), __u32, 32, __GET_TYPE((x), __u64, 64, \ - __GET_TYPE((x), __s32, 32, __GET_TYPE((x), __s64, 64, \ - __GET_TYPE((x), __u16, 16, bad_get(x)))))) +#define GET_LE(x) \ + GLE(x, 64, GLE(x, 32, GLE(x, 16, LAST_LE(x)))) #define NSYMS (sizeof(required_syms) / sizeof(required_syms[0])) diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h index f0475dad2286..d1e99e1892c4 100644 --- a/arch/x86/vdso/vdso2c.h +++ b/arch/x86/vdso/vdso2c.h @@ -18,27 +18,27 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) const char *secstrings; uint64_t syms[NSYMS] = {}; - Elf_Phdr *pt = (Elf_Phdr *)(addr + GET(hdr->e_phoff)); + Elf_Phdr *pt = (Elf_Phdr *)(addr + GET_LE(hdr->e_phoff)); /* Walk the segment table. */ - for (i = 0; i < GET(hdr->e_phnum); i++) { - if (GET(pt[i].p_type) == PT_LOAD) { + for (i = 0; i < GET_LE(hdr->e_phnum); i++) { + if (GET_LE(pt[i].p_type) == PT_LOAD) { if (found_load) fail("multiple PT_LOAD segs\n"); - if (GET(pt[i].p_offset) != 0 || - GET(pt[i].p_vaddr) != 0) + if (GET_LE(pt[i].p_offset) != 0 || + GET_LE(pt[i].p_vaddr) != 0) fail("PT_LOAD in wrong place\n"); - if (GET(pt[i].p_memsz) != GET(pt[i].p_filesz)) + if (GET_LE(pt[i].p_memsz) != GET_LE(pt[i].p_filesz)) fail("cannot handle memsz != filesz\n"); - load_size = GET(pt[i].p_memsz); + load_size = GET_LE(pt[i].p_memsz); found_load = 1; - } else if (GET(pt[i].p_type) == PT_DYNAMIC) { - dyn = addr + GET(pt[i].p_offset); - dyn_end = addr + GET(pt[i].p_offset) + - GET(pt[i].p_memsz); + } else if (GET_LE(pt[i].p_type) == PT_DYNAMIC) { + dyn = addr + GET_LE(pt[i].p_offset); + dyn_end = addr + GET_LE(pt[i].p_offset) + + GET_LE(pt[i].p_memsz); } } if (!found_load) @@ -46,48 +46,51 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) data_size = (load_size + 4095) / 4096 * 4096; /* Walk the dynamic table */ - for (i = 0; dyn + i < dyn_end && GET(dyn[i].d_tag) != DT_NULL; i++) { - typeof(dyn[i].d_tag) tag = GET(dyn[i].d_tag); + for (i = 0; dyn + i < dyn_end && + GET_LE(dyn[i].d_tag) != DT_NULL; i++) { + typeof(dyn[i].d_tag) tag = GET_LE(dyn[i].d_tag); if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELENT || tag == DT_TEXTREL) fail("vdso image contains dynamic relocations\n"); } /* Walk the section table */ - secstrings_hdr = addr + GET(hdr->e_shoff) + - GET(hdr->e_shentsize)*GET(hdr->e_shstrndx); - secstrings = addr + GET(secstrings_hdr->sh_offset); - for (i = 0; i < GET(hdr->e_shnum); i++) { - Elf_Shdr *sh = addr + GET(hdr->e_shoff) + - GET(hdr->e_shentsize) * i; - if (GET(sh->sh_type) == SHT_SYMTAB) + secstrings_hdr = addr + GET_LE(hdr->e_shoff) + + GET_LE(hdr->e_shentsize)*GET_LE(hdr->e_shstrndx); + secstrings = addr + GET_LE(secstrings_hdr->sh_offset); + for (i = 0; i < GET_LE(hdr->e_shnum); i++) { + Elf_Shdr *sh = addr + GET_LE(hdr->e_shoff) + + GET_LE(hdr->e_shentsize) * i; + if (GET_LE(sh->sh_type) == SHT_SYMTAB) symtab_hdr = sh; - if (!strcmp(secstrings + GET(sh->sh_name), ".altinstructions")) + if (!strcmp(secstrings + GET_LE(sh->sh_name), + ".altinstructions")) alt_sec = sh; } if (!symtab_hdr) fail("no symbol table\n"); - strtab_hdr = addr + GET(hdr->e_shoff) + - GET(hdr->e_shentsize) * GET(symtab_hdr->sh_link); + strtab_hdr = addr + GET_LE(hdr->e_shoff) + + GET_LE(hdr->e_shentsize) * GET_LE(symtab_hdr->sh_link); /* Walk the symbol table */ - for (i = 0; i < GET(symtab_hdr->sh_size) / GET(symtab_hdr->sh_entsize); + for (i = 0; + i < GET_LE(symtab_hdr->sh_size) / GET_LE(symtab_hdr->sh_entsize); i++) { int k; - Elf_Sym *sym = addr + GET(symtab_hdr->sh_offset) + - GET(symtab_hdr->sh_entsize) * i; - const char *name = addr + GET(strtab_hdr->sh_offset) + - GET(sym->st_name); + Elf_Sym *sym = addr + GET_LE(symtab_hdr->sh_offset) + + GET_LE(symtab_hdr->sh_entsize) * i; + const char *name = addr + GET_LE(strtab_hdr->sh_offset) + + GET_LE(sym->st_name); for (k = 0; k < NSYMS; k++) { if (!strcmp(name, required_syms[k])) { if (syms[k]) { fail("duplicate symbol %s\n", required_syms[k]); } - syms[k] = GET(sym->st_value); + syms[k] = GET_LE(sym->st_value); } } } @@ -147,9 +150,9 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) fprintf(outfile, "\t},\n"); if (alt_sec) { fprintf(outfile, "\t.alt = %lu,\n", - (unsigned long)GET(alt_sec->sh_offset)); + (unsigned long)GET_LE(alt_sec->sh_offset)); fprintf(outfile, "\t.alt_len = %lu,\n", - (unsigned long)GET(alt_sec->sh_size)); + (unsigned long)GET_LE(alt_sec->sh_size)); } for (i = 0; i < NSYMS; i++) { if (syms[i]) -- cgit v1.2.3 From c177c81e09e517bbf75b67762cdab1b83aba6976 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 4 Jun 2014 16:05:35 -0700 Subject: hugetlb: restrict hugepage_migration_support() to x86_64 Currently hugepage migration is available for all archs which support pmd-level hugepage, but testing is done only for x86_64 and there're bugs for other archs. So to avoid breaking such archs, this patch limits the availability strictly to x86_64 until developers of other archs get interested in enabling this feature. Simply disabling hugepage migration on non-x86_64 archs is not enough to fix the reported problem where sys_move_pages() hits the BUG_ON() in follow_page(FOLL_GET), so let's fix this by checking if hugepage migration is supported in vma_migratable(). Signed-off-by: Naoya Horiguchi Reported-by: Michael Ellerman Tested-by: Michael Ellerman Acked-by: Hugh Dickins Cc: Benjamin Herrenschmidt Cc: Tony Luck Cc: Russell King Cc: Martin Schwidefsky Cc: James Hogan Cc: Ralf Baechle Cc: David Miller Cc: [3.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/hugetlbpage.c | 5 ----- arch/arm64/mm/hugetlbpage.c | 5 ----- arch/ia64/mm/hugetlbpage.c | 5 ----- arch/metag/mm/hugetlbpage.c | 5 ----- arch/mips/mm/hugetlbpage.c | 5 ----- arch/powerpc/mm/hugetlbpage.c | 10 ---------- arch/s390/mm/hugetlbpage.c | 5 ----- arch/sh/mm/hugetlbpage.c | 5 ----- arch/sparc/mm/hugetlbpage.c | 5 ----- arch/tile/mm/hugetlbpage.c | 5 ----- arch/x86/Kconfig | 4 ++++ arch/x86/mm/hugetlbpage.c | 10 ---------- include/linux/hugetlb.h | 13 +++++-------- include/linux/mempolicy.h | 6 ++++++ mm/Kconfig | 3 +++ 15 files changed, 18 insertions(+), 73 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c index 54ee6163c181..66781bf34077 100644 --- a/arch/arm/mm/hugetlbpage.c +++ b/arch/arm/mm/hugetlbpage.c @@ -56,8 +56,3 @@ int pmd_huge(pmd_t pmd) { return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); } - -int pmd_huge_support(void) -{ - return 1; -} diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 31eb959e9aa8..023747bf4dd7 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -58,11 +58,6 @@ int pud_huge(pud_t pud) #endif } -int pmd_huge_support(void) -{ - return 1; -} - static __init int setup_hugepagesz(char *opt) { unsigned long ps = memparse(opt, &opt); diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 68232db98baa..76069c18ee42 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -114,11 +114,6 @@ int pud_huge(pud_t pud) return 0; } -int pmd_huge_support(void) -{ - return 0; -} - struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c index 042431509b56..3c52fa6d0f8e 100644 --- a/arch/metag/mm/hugetlbpage.c +++ b/arch/metag/mm/hugetlbpage.c @@ -110,11 +110,6 @@ int pud_huge(pud_t pud) return 0; } -int pmd_huge_support(void) -{ - return 1; -} - struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index 77e0ae036e7c..4ec8ee10d371 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c @@ -84,11 +84,6 @@ int pud_huge(pud_t pud) return (pud_val(pud) & _PAGE_HUGE) != 0; } -int pmd_huge_support(void) -{ - return 1; -} - struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index eb923654ba80..7e70ae968e5f 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -86,11 +86,6 @@ int pgd_huge(pgd_t pgd) */ return ((pgd_val(pgd) & 0x3) != 0x0); } - -int pmd_huge_support(void) -{ - return 1; -} #else int pmd_huge(pmd_t pmd) { @@ -106,11 +101,6 @@ int pgd_huge(pgd_t pgd) { return 0; } - -int pmd_huge_support(void) -{ - return 0; -} #endif pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 0727a55d87d9..0ff66a7e29bb 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -220,11 +220,6 @@ int pud_huge(pud_t pud) return 0; } -int pmd_huge_support(void) -{ - return 1; -} - struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmdp, int write) { diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 0d676a41081e..d7762349ea48 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -83,11 +83,6 @@ int pud_huge(pud_t pud) return 0; } -int pmd_huge_support(void) -{ - return 0; -} - struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 9bd9ce80bf77..d329537739c6 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -231,11 +231,6 @@ int pud_huge(pud_t pud) return 0; } -int pmd_huge_support(void) -{ - return 0; -} - struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index 0cb3bbaa580c..e514899e1100 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c @@ -166,11 +166,6 @@ int pud_huge(pud_t pud) return !!(pud_val(pud) & _PAGE_HUGE_PAGE); } -int pmd_huge_support(void) -{ - return 1; -} - struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7d5feb5908dd..e41b258ad040 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1873,6 +1873,10 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK def_bool y depends on X86_64 || X86_PAE +config ARCH_ENABLE_HUGEPAGE_MIGRATION + def_bool y + depends on X86_64 && HUGETLB_PAGE && MIGRATION + menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 8c9f647ff9e1..8b977ebf9388 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -58,11 +58,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, { return NULL; } - -int pmd_huge_support(void) -{ - return 0; -} #else struct page * @@ -80,11 +75,6 @@ int pud_huge(pud_t pud) { return !!(pud_val(pud) & _PAGE_PSE); } - -int pmd_huge_support(void) -{ - return 1; -} #endif #ifdef CONFIG_HUGETLB_PAGE diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index b65166de1d9d..d0bad1a8b0bd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -392,15 +392,13 @@ static inline pgoff_t basepage_index(struct page *page) extern void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); -int pmd_huge_support(void); -/* - * Currently hugepage migration is enabled only for pmd-based hugepage. - * This function will be updated when hugepage migration is more widely - * supported. - */ static inline int hugepage_migration_support(struct hstate *h) { - return pmd_huge_support() && (huge_page_shift(h) == PMD_SHIFT); +#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION + return huge_page_shift(h) == PMD_SHIFT; +#else + return 0; +#endif } static inline spinlock_t *huge_pte_lockptr(struct hstate *h, @@ -450,7 +448,6 @@ static inline pgoff_t basepage_index(struct page *page) return page->index; } #define dissolve_free_huge_pages(s, e) do {} while (0) -#define pmd_huge_support() 0 #define hugepage_migration_support(h) 0 static inline spinlock_t *huge_pte_lockptr(struct hstate *h, diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 3c1b968da0ca..f230a978e6ba 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -175,6 +175,12 @@ static inline int vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return 0; + +#ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION + if (vma->vm_flags & VM_HUGETLB) + return 0; +#endif + /* * Migration allocates pages in the highest zone. If we cannot * do so then migration (at least from node to node) is not diff --git a/mm/Kconfig b/mm/Kconfig index 28cec518f4d4..75ac479cbacd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -267,6 +267,9 @@ config MIGRATION pages as migration can relocate pages to satisfy a huge page allocation instead of reclaiming. +config ARCH_ENABLE_HUGEPAGE_MIGRATION + boolean + config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT -- cgit v1.2.3 From 4468dd76f51f8be75d4f04f1d721e379596e7262 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:06:29 -0700 Subject: x86: require x86-64 for automatic NUMA balancing 32-bit support for NUMA is an oddity on its own but with automatic NUMA balancing on top there is a reasonable risk that the CPUPID information cannot be stored in the page flags. This patch removes support for automatic NUMA support on 32-bit x86. Signed-off-by: Mel Gorman Cc: David Vrabel Cc: Ingo Molnar Cc: Peter Anvin Cc: Fengguang Wu Cc: Linus Torvalds Cc: Steven Noonan Cc: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Dave Hansen Cc: Srikar Dronamraju Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e41b258ad040..896a411a4584 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -26,7 +26,7 @@ config X86 select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK - select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_SUPPORTS_INT128 if X86_64 select ARCH_WANTS_PROT_NUMA_PROT_NONE select HAVE_IDE -- cgit v1.2.3 From c46a7c817e662a820373bb76b88d0ad67d6abe5d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:06:30 -0700 Subject: x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels _PAGE_NUMA is currently an alias of _PROT_PROTNONE to trap NUMA hinting faults on x86. Care is taken such that _PAGE_NUMA is used only in situations where the VMA flags distinguish between NUMA hinting faults and prot_none faults. This decision was x86-specific and conceptually it is difficult requiring special casing to distinguish between PROTNONE and NUMA ptes based on context. Fundamentally, we only need the _PAGE_NUMA bit to tell the difference between an entry that is really unmapped and a page that is protected for NUMA hinting faults as if the PTE is not present then a fault will be trapped. Swap PTEs on x86-64 use the bits after _PAGE_GLOBAL for the offset. This patch shrinks the maximum possible swap size and uses the bit to uniquely distinguish between NUMA hinting ptes and swap ptes. Signed-off-by: Mel Gorman Cc: David Vrabel Cc: Ingo Molnar Cc: Peter Anvin Cc: Fengguang Wu Cc: Linus Torvalds Cc: Steven Noonan Cc: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Dave Hansen Cc: Srikar Dronamraju Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/pgtable.h | 6 ++++ arch/x86/include/asm/pgtable.h | 15 +++++--- arch/x86/include/asm/pgtable_64.h | 8 +++++ arch/x86/include/asm/pgtable_types.h | 66 +++++++++++++++++++----------------- arch/x86/mm/pageattr-test.c | 2 +- include/asm-generic/pgtable.h | 8 +++-- include/linux/swapops.h | 2 +- mm/memory.c | 17 ++++------ 8 files changed, 75 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 3ebb188c3ff5..d98c1ecc3266 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -44,6 +44,12 @@ static inline int pte_present(pte_t pte) return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA); } +#define pte_present_nonuma pte_present_nonuma +static inline int pte_present_nonuma(pte_t pte) +{ + return pte_val(pte) & (_PAGE_PRESENT); +} + #define pte_numa pte_numa static inline int pte_numa(pte_t pte) { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index b459ddf27d64..66276c1d23bb 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -131,7 +131,8 @@ static inline int pte_exec(pte_t pte) static inline int pte_special(pte_t pte) { - return pte_flags(pte) & _PAGE_SPECIAL; + return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) == + (_PAGE_PRESENT|_PAGE_SPECIAL); } static inline unsigned long pte_pfn(pte_t pte) @@ -452,6 +453,12 @@ static inline int pte_present(pte_t a) _PAGE_NUMA); } +#define pte_present_nonuma pte_present_nonuma +static inline int pte_present_nonuma(pte_t a) +{ + return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); +} + #define pte_accessible pte_accessible static inline bool pte_accessible(struct mm_struct *mm, pte_t a) { @@ -860,19 +867,19 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { - VM_BUG_ON(pte_present(pte)); + VM_BUG_ON(pte_present_nonuma(pte)); return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); } static inline int pte_swp_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present(pte)); + VM_BUG_ON(pte_present_nonuma(pte)); return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present(pte)); + VM_BUG_ON(pte_present_nonuma(pte)); return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e22c1dbf7feb..6d6ecd09883c 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -145,8 +145,16 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#ifdef CONFIG_NUMA_BALANCING +/* Automatic NUMA balancing needs to be distinguishable from swap entries */ +#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2) +#else #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) +#endif #else +#ifdef CONFIG_NUMA_BALANCING +#error Incompatible format for automatic NUMA balancing +#endif #define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) #define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) #endif diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index eb3d44945133..f216963760e5 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -16,15 +16,26 @@ #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ #define _PAGE_BIT_PAT 7 /* on 4KB pages */ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ -#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ -#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ -#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ +#define _PAGE_BIT_SOFTW1 9 /* available for programmer */ +#define _PAGE_BIT_SOFTW2 10 /* " */ +#define _PAGE_BIT_SOFTW3 11 /* " */ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ -#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 -#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 -#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */ +#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 +#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 +#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */ +#define _PAGE_BIT_IOMAP _PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */ +#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ +#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ +/* + * Swap offsets on configurations that allow automatic NUMA balancing use the + * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from + * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the + * maximum possible swap space from 16TB to 8TB. + */ +#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1) + /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL @@ -40,7 +51,7 @@ #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) -#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) +#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) @@ -61,14 +72,27 @@ * they do not conflict with each other. */ -#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN - #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) #else #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif +/* + * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page + * that is not present. The hinting fault gathers numa placement statistics + * (see pte_numa()). The bit is always zero when the PTE is not present. + * + * The bit picked must be always zero when the pmd is present and not + * present, so that we don't lose information when we set it while + * atomically clearing the present bit. + */ +#ifdef CONFIG_NUMA_BALANCING +#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA) +#else +#define _PAGE_NUMA (_AT(pteval_t, 0)) +#endif + /* * Tracking soft dirty bit when a page goes to a swap is tricky. * We need a bit which can be stored in pte _and_ not conflict @@ -94,26 +118,6 @@ #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) -/* - * _PAGE_NUMA indicates that this page will trigger a numa hinting - * minor page fault to gather numa placement statistics (see - * pte_numa()). The bit picked (8) is within the range between - * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't - * require changes to the swp entry format because that bit is always - * zero when the pte is not present. - * - * The bit picked must be always zero when the pmd is present and not - * present, so that we don't lose information when we set it while - * atomically clearing the present bit. - * - * Because we shared the same bit (8) with _PAGE_PROTNONE this can be - * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE - * couldn't reach, like handle_mm_fault() (see access_error in - * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for - * handle_mm_fault() to be invoked). - */ -#define _PAGE_NUMA _PAGE_PROTNONE - #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ @@ -122,8 +126,8 @@ /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY) -#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) + _PAGE_SOFT_DIRTY | _PAGE_NUMA) +#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) #define _PAGE_CACHE_WB (0) diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 461bc8289024..6629f397b467 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -35,7 +35,7 @@ enum { static int pte_testbit(pte_t pte) { - return pte_flags(pte) & _PAGE_UNUSED1; + return pte_flags(pte) & _PAGE_SOFTW1; } struct split_state { diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index a8015a7a55bb..53b2acc38213 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -233,6 +233,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) # define pte_accessible(mm, pte) ((void)(pte), 1) #endif +#ifndef pte_present_nonuma +#define pte_present_nonuma(pte) pte_present(pte) +#endif + #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) #endif @@ -670,7 +674,7 @@ static inline int pmd_trans_unstable(pmd_t *pmd) static inline int pte_numa(pte_t pte) { return (pte_flags(pte) & - (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; + (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA; } #endif @@ -678,7 +682,7 @@ static inline int pte_numa(pte_t pte) static inline int pmd_numa(pmd_t pmd) { return (pmd_flags(pmd) & - (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; + (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA; } #endif diff --git a/include/linux/swapops.h b/include/linux/swapops.h index c0f75261a728..6adfb7bfbf44 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry) /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { - return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); + return !pte_none(pte) && !pte_present_nonuma(pte) && !pte_file(pte); } #endif diff --git a/mm/memory.c b/mm/memory.c index e302ae1dcce0..0897830011f3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -756,7 +756,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn = pte_pfn(pte); if (HAVE_PTE_SPECIAL) { - if (likely(!pte_special(pte))) + if (likely(!pte_special(pte) || pte_numa(pte))) goto check_pfn; if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; @@ -782,14 +782,15 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, } } - if (is_zero_pfn(pfn)) - return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); return NULL; } + if (is_zero_pfn(pfn)) + return NULL; + /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. @@ -1722,13 +1723,9 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); /* - * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault - * would be called on PROT_NONE ranges. We must never invoke - * handle_mm_fault on PROT_NONE ranges or the NUMA hinting - * page faults would unprotect the PROT_NONE ranges if - * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd - * bitflag. So to avoid that, don't set FOLL_NUMA if - * FOLL_FORCE is set. + * If FOLL_FORCE is set then do not force a full fault as the hinting + * fault information is unrelated to the reference behaviour of a task + * using the address space */ if (!(gup_flags & FOLL_FORCE)) gup_flags |= FOLL_NUMA; -- cgit v1.2.3 From 982792c782ef337381e982fd2047391886f89693 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 4 Jun 2014 16:06:32 -0700 Subject: x86, mm: probe memory block size for generic x86 64bit On system with 2TiB ram, current x86_64 have 128M as section size, and one memory_block only include one section. So will have 16400 entries under /sys/devices/system/memory/. Current code try to use block id to find block pointer in /sys for any section, and reuse that block pointer. that finding will take some time even after commit 7c243c7168dc ("mm: speedup in __early_pfn_to_nid") that will skip the search in that case during booting up. So solution could be increase block size just like SGI UV system did. (harded code to 2g). This patch is trying to probe the block size to make it match mmio remap size. for example, Intel Nehalem later system will have memory range [0, TOML), [4g, TOMH]. If the memory hole is 2g and total is 128g, TOM will be 2g, and TOM2 will be 130g. We could use 2g as block size instead of default 128M. That will reduce number of entries in /sys/devices/system/memory/ On system 6TiB system will reduce boot time by 35 seconds. Signed-off-by: Yinghai Lu Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index f35c66c5959a..b92591fa8970 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1230,17 +1230,43 @@ const char *arch_vma_name(struct vm_area_struct *vma) return NULL; } -#ifdef CONFIG_X86_UV -unsigned long memory_block_size_bytes(void) +static unsigned long probe_memory_block_size(void) { + /* start from 2g */ + unsigned long bz = 1UL<<31; + +#ifdef CONFIG_X86_UV if (is_uv_system()) { printk(KERN_INFO "UV: memory block size 2GB\n"); return 2UL * 1024 * 1024 * 1024; } - return MIN_MEMORY_BLOCK_SIZE; -} #endif + /* less than 64g installed */ + if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) + return MIN_MEMORY_BLOCK_SIZE; + + /* get the tail size */ + while (bz > MIN_MEMORY_BLOCK_SIZE) { + if (!((max_pfn << PAGE_SHIFT) & (bz - 1))) + break; + bz >>= 1; + } + + printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20); + + return bz; +} + +static unsigned long memory_block_size_probed; +unsigned long memory_block_size_bytes(void) +{ + if (!memory_block_size_probed) + memory_block_size_probed = probe_memory_block_size(); + + return memory_block_size_probed; +} + #ifdef CONFIG_SPARSEMEM_VMEMMAP /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. -- cgit v1.2.3 From d92ef66c4f8fdf7a24736b1ab6c48d32de9bfc07 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 4 Jun 2014 16:06:48 -0700 Subject: x86: make dma_alloc_coherent() return zeroed memory if CMA is enabled This patchset enhances the DMA Contiguous Memory Allocator on x86. Currently the DMA CMA is only supported with pci-nommu dma_map_ops and furthermore it can't be enabled on x86_64. But I would like to allocate big contiguous memory with dma_alloc_coherent() and tell it to the device that requires it, regardless of which dma mapping implementation is actually used in the system. So this makes it work with swiotlb and intel-iommu dma_map_ops, too. And this also extends "cma=" kernel parameter to specify placement constraint by the physical address range of memory allocations. For example, CMA allocates memory below 4GB by "cma=64M@0-4G", it is required for the devices only supporting 32-bit addressing on 64-bit systems without iommu. This patch (of 5): Calling dma_alloc_coherent() with __GFP_ZERO must return zeroed memory. But when the contiguous memory allocator (CMA) is enabled on x86 and the memory region is allocated by dma_alloc_from_contiguous(), it doesn't return zeroed memory. Because dma_generic_alloc_coherent() forgot to fill the memory region with zero if it was allocated by dma_alloc_from_contiguous() Most implementations of dma_alloc_coherent() return zeroed memory regardless of whether __GFP_ZERO is specified. So this fixes it by unconditionally zeroing the allocated memory region. Alternatively, we could fix dma_alloc_from_contiguous() to return zeroed out memory and remove memset() from all caller of it. But we can't simply remove the memset on arm because __dma_clear_buffer() is used there for ensuring cache flushing and it is used in many places. Of course we can do redundant memset in dma_alloc_from_contiguous(), but I think this patch is less impact for fixing this problem. Signed-off-by: Akinobu Mita Cc: Marek Szyprowski Cc: Konrad Rzeszutek Wilk Cc: David Woodhouse Cc: Don Dutile Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Andi Kleen Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/pci-dma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f7d0672481fd..e5f4e9629e61 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -97,7 +97,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_mask = dma_alloc_coherent_mask(dev, flag); - flag |= __GFP_ZERO; + flag &= ~__GFP_ZERO; again: page = NULL; /* CMA can be used only in the context which permits sleeping */ @@ -120,7 +120,7 @@ again: return NULL; } - + memset(page_address(page), 0, size); *dma_addr = addr; return page_address(page); } -- cgit v1.2.3 From 9c5a3621427da68afe6a078cadf807d2c8cc1d12 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 4 Jun 2014 16:06:50 -0700 Subject: x86: enable DMA CMA with swiotlb The DMA Contiguous Memory Allocator support on x86 is disabled when swiotlb config option is enabled. So DMA CMA is always disabled on x86_64 because swiotlb is always enabled. This attempts to support for DMA CMA with enabling swiotlb config option. The contiguous memory allocator on x86 is integrated in the function dma_generic_alloc_coherent() which is .alloc callback in nommu_dma_ops for dma_alloc_coherent(). x86_swiotlb_alloc_coherent() which is .alloc callback in swiotlb_dma_ops tries to allocate with dma_generic_alloc_coherent() firstly and then swiotlb_alloc_coherent() is called as a fallback. The main part of supporting DMA CMA with swiotlb is that changing x86_swiotlb_free_coherent() which is .free callback in swiotlb_dma_ops for dma_free_coherent() so that it can distinguish memory allocated by dma_generic_alloc_coherent() from one allocated by swiotlb_alloc_coherent() and release it with dma_generic_free_coherent() which can handle contiguous memory. This change requires making is_swiotlb_buffer() global function. This also needs to change .free callback in the dma_map_ops for amd_gart and sta2x11, because these dma_ops are also using dma_generic_alloc_coherent(). Signed-off-by: Akinobu Mita Acked-by: Marek Szyprowski Acked-by: Konrad Rzeszutek Wilk Cc: David Woodhouse Cc: Don Dutile Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Andi Kleen Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/swiotlb.h | 7 +++++++ arch/x86/kernel/amd_gart_64.c | 2 +- arch/x86/kernel/pci-swiotlb.c | 9 ++++++--- arch/x86/pci/sta2x11-fixup.c | 6 ++---- include/linux/swiotlb.h | 2 ++ lib/swiotlb.c | 2 +- 7 files changed, 20 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 896a411a4584..4a0137f6f032 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -41,7 +41,7 @@ config X86 select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_FRAME_POINTERS select HAVE_DMA_ATTRS - select HAVE_DMA_CONTIGUOUS if !SWIOTLB + select HAVE_DMA_CONTIGUOUS select HAVE_KRETPROBES select GENERIC_EARLY_IOREMAP select HAVE_OPTPROBES diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index 977f1761a25d..ab05d73e2bb7 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h @@ -29,4 +29,11 @@ static inline void pci_swiotlb_late_init(void) static inline void dma_mark_clean(void *addr, size_t size) {} +extern void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs); +extern void x86_swiotlb_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_addr, + struct dma_attrs *attrs); + #endif /* _ASM_X86_SWIOTLB_H */ diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index b574b295a2f9..8e3842fc8bea 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -512,7 +512,7 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr, struct dma_attrs *attrs) { gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); - free_pages((unsigned long)vaddr, get_order(size)); + dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); } static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 6c483ba98b9c..77dd0ad58be4 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -14,7 +14,7 @@ #include int swiotlb __read_mostly; -static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, +void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags, struct dma_attrs *attrs) { @@ -28,11 +28,14 @@ static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); } -static void x86_swiotlb_free_coherent(struct device *dev, size_t size, +void x86_swiotlb_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr, struct dma_attrs *attrs) { - swiotlb_free_coherent(dev, size, vaddr, dma_addr); + if (is_swiotlb_buffer(dma_to_phys(dev, dma_addr))) + swiotlb_free_coherent(dev, size, vaddr, dma_addr); + else + dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); } static struct dma_map_ops swiotlb_dma_ops = { diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 9d8a509c9730..5ceda85b8687 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -173,9 +173,7 @@ static void *sta2x11_swiotlb_alloc_coherent(struct device *dev, { void *vaddr; - vaddr = dma_generic_alloc_coherent(dev, size, dma_handle, flags, attrs); - if (!vaddr) - vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, flags); + vaddr = x86_swiotlb_alloc_coherent(dev, size, dma_handle, flags, attrs); *dma_handle = p2a(*dma_handle, to_pci_dev(dev)); return vaddr; } @@ -183,7 +181,7 @@ static void *sta2x11_swiotlb_alloc_coherent(struct device *dev, /* We have our own dma_ops: the same as swiotlb but from alloc (above) */ static struct dma_map_ops sta2x11_dma_ops = { .alloc = sta2x11_swiotlb_alloc_coherent, - .free = swiotlb_free_coherent, + .free = x86_swiotlb_free_coherent, .map_page = swiotlb_map_page, .unmap_page = swiotlb_unmap_page, .map_sg = swiotlb_map_sg_attrs, diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index a5ffd32642fd..e7a018eaf3a2 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -116,4 +116,6 @@ static inline void swiotlb_free(void) { } #endif extern void swiotlb_print_info(void); +extern int is_swiotlb_buffer(phys_addr_t paddr); + #endif /* __LINUX_SWIOTLB_H */ diff --git a/lib/swiotlb.c b/lib/swiotlb.c index b604b831f4d1..649d097853a1 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -374,7 +374,7 @@ void __init swiotlb_free(void) io_tlb_nslabs = 0; } -static int is_swiotlb_buffer(phys_addr_t paddr) +int is_swiotlb_buffer(phys_addr_t paddr) { return paddr >= io_tlb_start && paddr < io_tlb_end; } -- cgit v1.2.3 From 5ea3b1b2f8ad9162684431ce6188102ca4c64b7a Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 4 Jun 2014 16:06:54 -0700 Subject: cma: add placement specifier for "cma=" kernel parameter Currently, "cma=" kernel parameter is used to specify the size of CMA, but we can't specify where it is located. We want to locate CMA below 4GB for devices only supporting 32-bit addressing on 64-bit systems without iommu. This enables to specify the placement of CMA by extending "cma=" kernel parameter. Examples: 1. locate 64MB CMA below 4GB by "cma=64M@0-4G" 2. locate 64MB CMA exact at 512MB by "cma=64M@512M" Note that the DMA contiguous memory allocator on x86 assumes that page_address() works for the pages to allocate. So this change requires to limit end address of contiguous memory area upto max_pfn_mapped to prevent from locating it on highmem area by the argument of dma_contiguous_reserve(). Signed-off-by: Akinobu Mita Cc: Marek Szyprowski Cc: Konrad Rzeszutek Wilk Cc: David Woodhouse Cc: Don Dutile Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Andi Kleen Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 7 +++++-- arch/x86/kernel/setup.c | 2 +- drivers/base/dma-contiguous.c | 42 ++++++++++++++++++++++++++++--------- include/linux/dma-contiguous.h | 9 +++++--- 4 files changed, 44 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index af55e13ace8f..adea3a22fa00 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -630,8 +630,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Also note the kernel might malfunction if you disable some critical bits. - cma=nn[MG] [ARM,KNL] - Sets the size of kernel global memory area for contiguous + cma=nn[MG]@[start[MG][-end[MG]]] + [ARM,X86,KNL] + Sets the size of kernel global memory area for + contiguous memory allocations and optionally the + placement constraint by the physical address range of memory allocations. For more information, see include/linux/dma-contiguous.h diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 09c76d265550..78a0e6298922 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1119,7 +1119,7 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); memblock_set_current_limit(get_max_mapped()); - dma_contiguous_reserve(0); + dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT); /* * NOTE: On x86-32, only from this point on, fixmaps are ready for use. diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index c34ec3364243..83969f8c5727 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -60,11 +60,22 @@ struct cma *dma_contiguous_default_area; */ static const phys_addr_t size_bytes = CMA_SIZE_MBYTES * SZ_1M; static phys_addr_t size_cmdline = -1; +static phys_addr_t base_cmdline; +static phys_addr_t limit_cmdline; static int __init early_cma(char *p) { pr_debug("%s(%s)\n", __func__, p); size_cmdline = memparse(p, &p); + if (*p != '@') + return 0; + base_cmdline = memparse(p + 1, &p); + if (*p != '-') { + limit_cmdline = base_cmdline + size_cmdline; + return 0; + } + limit_cmdline = memparse(p + 1, &p); + return 0; } early_param("cma", early_cma); @@ -108,11 +119,18 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void) void __init dma_contiguous_reserve(phys_addr_t limit) { phys_addr_t selected_size = 0; + phys_addr_t selected_base = 0; + phys_addr_t selected_limit = limit; + bool fixed = false; pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit); if (size_cmdline != -1) { selected_size = size_cmdline; + selected_base = base_cmdline; + selected_limit = min_not_zero(limit_cmdline, limit); + if (base_cmdline + size_cmdline == limit_cmdline) + fixed = true; } else { #ifdef CONFIG_CMA_SIZE_SEL_MBYTES selected_size = size_bytes; @@ -129,10 +147,12 @@ void __init dma_contiguous_reserve(phys_addr_t limit) pr_debug("%s: reserving %ld MiB for global area\n", __func__, (unsigned long)selected_size / SZ_1M); - dma_contiguous_reserve_area(selected_size, 0, limit, - &dma_contiguous_default_area); + dma_contiguous_reserve_area(selected_size, selected_base, + selected_limit, + &dma_contiguous_default_area, + fixed); } -}; +} static DEFINE_MUTEX(cma_mutex); @@ -189,15 +209,20 @@ core_initcall(cma_init_reserved_areas); * @base: Base address of the reserved area optional, use 0 for any * @limit: End address of the reserved memory (optional, 0 for any). * @res_cma: Pointer to store the created cma region. + * @fixed: hint about where to place the reserved area * * This function reserves memory from early allocator. It should be * called by arch specific code once the early allocator (memblock or bootmem) * has been activated and all other subsystems have already allocated/reserved * memory. This function allows to create custom reserved areas for specific * devices. + * + * If @fixed is true, reserve contiguous area at exactly @base. If false, + * reserve in range from @base to @limit. */ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, - phys_addr_t limit, struct cma **res_cma) + phys_addr_t limit, struct cma **res_cma, + bool fixed) { struct cma *cma = &cma_areas[cma_area_count]; phys_addr_t alignment; @@ -223,18 +248,15 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, limit &= ~(alignment - 1); /* Reserve memory */ - if (base) { + if (base && fixed) { if (memblock_is_region_reserved(base, size) || memblock_reserve(base, size) < 0) { ret = -EBUSY; goto err; } } else { - /* - * Use __memblock_alloc_base() since - * memblock_alloc_base() panic()s. - */ - phys_addr_t addr = __memblock_alloc_base(size, alignment, limit); + phys_addr_t addr = memblock_alloc_range(size, alignment, base, + limit); if (!addr) { ret = -ENOMEM; goto err; diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h index 3b28f937d959..772eab5d524a 100644 --- a/include/linux/dma-contiguous.h +++ b/include/linux/dma-contiguous.h @@ -88,7 +88,8 @@ static inline void dma_contiguous_set_default(struct cma *cma) void dma_contiguous_reserve(phys_addr_t addr_limit); int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, - phys_addr_t limit, struct cma **res_cma); + phys_addr_t limit, struct cma **res_cma, + bool fixed); /** * dma_declare_contiguous() - reserve area for contiguous memory handling @@ -108,7 +109,7 @@ static inline int dma_declare_contiguous(struct device *dev, phys_addr_t size, { struct cma *cma; int ret; - ret = dma_contiguous_reserve_area(size, base, limit, &cma); + ret = dma_contiguous_reserve_area(size, base, limit, &cma, true); if (ret == 0) dev_set_cma_area(dev, cma); @@ -136,7 +137,9 @@ static inline void dma_contiguous_set_default(struct cma *cma) { } static inline void dma_contiguous_reserve(phys_addr_t limit) { } static inline int dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, - phys_addr_t limit, struct cma **res_cma) { + phys_addr_t limit, struct cma **res_cma, + bool fixed) +{ return -ENOSYS; } -- cgit v1.2.3 From 38f7ea5a082bbde9e64b7ece389f20e71a9806f4 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 4 Jun 2014 16:06:56 -0700 Subject: arch/x86/kernel/pci-dma.c: fix dma_generic_alloc_coherent() when CONFIG_DMA_CMA is enabled dma_generic_alloc_coherent() firstly attempts to allocate by dma_alloc_from_contiguous() if CONFIG_DMA_CMA is enabled. But the memory region allocated by it may not fit within the device's DMA mask. This change makes it fall back to usual alloc_pages_node() allocation for such cases. Signed-off-by: Akinobu Mita Cc: Marek Szyprowski Cc: Konrad Rzeszutek Wilk Cc: David Woodhouse Cc: Don Dutile Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/pci-dma.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index e5f4e9629e61..a25e202bb319 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -101,8 +101,13 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, again: page = NULL; /* CMA can be used only in the context which permits sleeping */ - if (flag & __GFP_WAIT) + if (flag & __GFP_WAIT) { page = dma_alloc_from_contiguous(dev, count, get_order(size)); + if (page && page_to_phys(page) + size > dma_mask) { + dma_release_from_contiguous(dev, page, count); + page = NULL; + } + } /* fallback */ if (!page) page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); -- cgit v1.2.3 From 2373eaecff33db5972bde9418f92d6401b4a945c Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 4 Jun 2014 16:08:14 -0700 Subject: mm: x86 pgtable: drop unneeded preprocessor ifdef _PAGE_BIT_FILE (bit 6) is always less than _PAGE_BIT_PROTNONE (bit 8), so drop redundant #ifdef. Signed-off-by: Cyrill Gorcunov Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Anvin Cc: Ingo Molnar Cc: Steven Noonan Cc: Rik van Riel Cc: David Vrabel Cc: Peter Zijlstra Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable-2level.h | 10 ---------- arch/x86/include/asm/pgtable_64.h | 8 -------- 2 files changed, 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 0d193e234647..eec82d4c4473 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -115,13 +115,8 @@ static __always_inline pte_t pgoff_to_pte(pgoff_t off) */ #define PTE_FILE_MAX_BITS 29 #define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) -#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) #define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) -#else -#define PTE_FILE_SHIFT2 (_PAGE_BIT_PROTNONE + 1) -#define PTE_FILE_SHIFT3 (_PAGE_BIT_FILE + 1) -#endif #define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) #define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) @@ -153,13 +148,8 @@ static __always_inline pte_t pgoff_to_pte(pgoff_t off) #endif /* CONFIG_MEM_SOFT_DIRTY */ /* Encode and de-code a swap entry */ -#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) -#else -#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) -#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) -#endif #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 6d6ecd09883c..5be9063545d2 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -143,7 +143,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define pte_unmap(pte) ((void)(pte))/* NOP */ /* Encode and de-code a swap entry */ -#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) #ifdef CONFIG_NUMA_BALANCING /* Automatic NUMA balancing needs to be distinguishable from swap entries */ @@ -151,13 +150,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #else #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) #endif -#else -#ifdef CONFIG_NUMA_BALANCING -#error Incompatible format for automatic NUMA balancing -#endif -#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) -#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) -#endif #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) -- cgit v1.2.3 From 2bf01f9f0cf07b231c90e5d56266e84fe17cec79 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 4 Jun 2014 16:08:16 -0700 Subject: mm: x86 pgtable: require X86_64 for soft-dirty tracker Tracking dirty status on 2 level pages requires very ugly macros and taking into account how old the machines who can operate without PAE mode only are, lets drop soft dirty tracker from them for code simplicity (note I can't drop all the macros from 2 level pages by now since _PAGE_BIT_PROTNONE and _PAGE_BIT_FILE are still used even without tracker). Linus proposed to completely rip off softdirty support on x86-32 (even with PAE) and since for CRIU we're not planning to support native x86-32 mode, lets do that. (Softdirty tracker is relatively new feature which is mostly used by CRIU so I don't expect if such API change would cause problems for userspace). Signed-off-by: Cyrill Gorcunov Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Anvin Cc: Ingo Molnar Cc: Steven Noonan Cc: Rik van Riel Cc: David Vrabel Cc: Peter Zijlstra Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/pgtable-2level.h | 49 ----------------------------------- arch/x86/include/asm/pgtable.h | 5 ++++ 3 files changed, 6 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4a0137f6f032..69086a3c58e0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -105,7 +105,7 @@ config X86 select HAVE_ARCH_SECCOMP_FILTER select BUILDTIME_EXTABLE_SORT select GENERIC_CMOS_UPDATE - select HAVE_ARCH_SOFT_DIRTY + select HAVE_ARCH_SOFT_DIRTY if X86_64 select CLOCKSOURCE_WATCHDOG select GENERIC_CLOCKEVENTS select ARCH_CLOCKSOURCE_DATA diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index eec82d4c4473..206a87fdd22d 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -62,53 +62,6 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi return ((value >> rightshift) & mask) << leftshift; } -#ifdef CONFIG_MEM_SOFT_DIRTY - -/* - * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE, _PAGE_BIT_SOFT_DIRTY and - * _PAGE_BIT_PROTNONE are taken, split up the 28 bits of offset - * into this range. - */ -#define PTE_FILE_MAX_BITS 28 -#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) -#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) -#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) -#define PTE_FILE_SHIFT4 (_PAGE_BIT_SOFT_DIRTY + 1) -#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) -#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) -#define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1) - -#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1) -#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1) -#define PTE_FILE_MASK3 ((1U << PTE_FILE_BITS3) - 1) - -#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1) -#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) -#define PTE_FILE_LSHIFT4 (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) - -static __always_inline pgoff_t pte_to_pgoff(pte_t pte) -{ - return (pgoff_t) - (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + - pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + - pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, PTE_FILE_MASK3, PTE_FILE_LSHIFT3) + - pte_bitop(pte.pte_low, PTE_FILE_SHIFT4, -1UL, PTE_FILE_LSHIFT4)); -} - -static __always_inline pte_t pgoff_to_pte(pgoff_t off) -{ - return (pte_t){ - .pte_low = - pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + - pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + - pte_bitop(off, PTE_FILE_LSHIFT3, PTE_FILE_MASK3, PTE_FILE_SHIFT3) + - pte_bitop(off, PTE_FILE_LSHIFT4, -1UL, PTE_FILE_SHIFT4) + - _PAGE_FILE, - }; -} - -#else /* CONFIG_MEM_SOFT_DIRTY */ - /* * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, * split up the 29 bits of offset into this range. @@ -145,8 +98,6 @@ static __always_inline pte_t pgoff_to_pte(pgoff_t off) }; } -#endif /* CONFIG_MEM_SOFT_DIRTY */ - /* Encode and de-code a swap entry */ #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 66276c1d23bb..0ec056012618 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -297,6 +297,7 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_PRESENT); } +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline int pte_soft_dirty(pte_t pte) { return pte_flags(pte) & _PAGE_SOFT_DIRTY; @@ -332,6 +333,8 @@ static inline int pte_file_soft_dirty(pte_t pte) return pte_flags(pte) & _PAGE_SOFT_DIRTY; } +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. @@ -865,6 +868,7 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, { } +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { VM_BUG_ON(pte_present_nonuma(pte)); @@ -882,6 +886,7 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) VM_BUG_ON(pte_present_nonuma(pte)); return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } +#endif #include #endif /* __ASSEMBLY__ */ -- cgit v1.2.3 From af4459d3636790735fccd83f0337c8380a0a4cc2 Mon Sep 17 00:00:00 2001 From: Emil Medve Date: Wed, 4 Jun 2014 16:08:19 -0700 Subject: arch/x86/mm/numa.c: use for_each_memblock() Signed-off-by: Emil Medve Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/numa.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 1d045f9c390f..a32b706c401a 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -559,7 +559,7 @@ static void __init numa_clear_kernel_node_hotplug(void) int i, nid; nodemask_t numa_kernel_nodes = NODE_MASK_NONE; unsigned long start, end; - struct memblock_type *type = &memblock.reserved; + struct memblock_region *r; /* * At this time, all memory regions reserved by memblock are @@ -573,8 +573,8 @@ static void __init numa_clear_kernel_node_hotplug(void) } /* Mark all kernel nodes. */ - for (i = 0; i < type->cnt; i++) - node_set(type->regions[i].nid, numa_kernel_nodes); + for_each_memblock(reserved, r) + node_set(r->nid, numa_kernel_nodes); /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ for (i = 0; i < numa_meminfo.nr_blks; i++) { -- cgit v1.2.3 From 65eb71823b01051ca6e256e9cc8259141a849052 Mon Sep 17 00:00:00 2001 From: Chen Yucong Date: Wed, 4 Jun 2014 16:10:43 -0700 Subject: hwpoison: remove unused global variable in do_machine_check() Remove an unused global variable mce_entry and relative operations in do_machine_check(). Signed-off-by: Chen Yucong Cc: Naoya Horiguchi Cc: Wu Fengguang Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/mce.h | 2 -- arch/x86/kernel/cpu/mcheck/mce.c | 5 ----- 2 files changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6e4ce2df87cf..958b90f761e5 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -176,8 +176,6 @@ int mce_available(struct cpuinfo_x86 *c); DECLARE_PER_CPU(unsigned, mce_exception_count); DECLARE_PER_CPU(unsigned, mce_poll_count); -extern atomic_t mce_entry; - typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 6cc800381d14..bb92f38153b2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -60,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); #define SPINUNIT 100 /* 100ns */ -atomic_t mce_entry; - DEFINE_PER_CPU(unsigned, mce_exception_count); struct mce_bank *mce_banks __read_mostly; @@ -1040,8 +1038,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; - atomic_inc(&mce_entry); - this_cpu_inc(mce_exception_count); if (!cfg->banks) @@ -1171,7 +1167,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_report_event(regs); mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); out: - atomic_dec(&mce_entry); sync_core(); } EXPORT_SYMBOL_GPL(do_machine_check); -- cgit v1.2.3 From f6187769dae48234f3877df3c4d99294cc2254fa Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:12 -0700 Subject: sys_sgetmask/sys_ssetmask: add CONFIG_SGETMASK_SYSCALL sys_sgetmask and sys_ssetmask are obsolete system calls no longer supported in libc. This patch replaces architecture related __ARCH_WANT_SYS_SGETMAX by expert mode configuration.That option is enabled by default for those architectures. Signed-off-by: Fabian Frederick Cc: Steven Miao Cc: Mikael Starvik Cc: Jesper Nilsson Cc: David Howells Cc: Geert Uytterhoeven Cc: Michal Simek Cc: Ralf Baechle Cc: Koichi Yasutake Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Greg Ungerer Cc: Heiko Carstens Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/blackfin/include/asm/unistd.h | 1 - arch/cris/include/asm/unistd.h | 1 - arch/frv/include/asm/unistd.h | 1 - arch/m68k/include/asm/unistd.h | 1 - arch/microblaze/include/asm/unistd.h | 1 - arch/mips/include/asm/unistd.h | 1 - arch/mn10300/include/asm/unistd.h | 1 - arch/parisc/include/asm/unistd.h | 1 - arch/powerpc/include/asm/unistd.h | 1 - arch/sh/include/asm/unistd.h | 1 - arch/sparc/include/asm/unistd.h | 1 - arch/x86/include/asm/unistd.h | 1 - init/Kconfig | 10 ++++++++++ kernel/signal.c | 4 ++-- kernel/sys_ni.c | 2 ++ 15 files changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h index c35414bdf7bd..c8c8ff9eff61 100644 --- a/arch/blackfin/include/asm/unistd.h +++ b/arch/blackfin/include/asm/unistd.h @@ -12,7 +12,6 @@ #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_FADVISE64 #define __ARCH_WANT_SYS_GETPGRP diff --git a/arch/cris/include/asm/unistd.h b/arch/cris/include/asm/unistd.h index 5cc7d1991e48..0f40fed1ba25 100644 --- a/arch/cris/include/asm/unistd.h +++ b/arch/cris/include/asm/unistd.h @@ -15,7 +15,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/frv/include/asm/unistd.h b/arch/frv/include/asm/unistd.h index 70ec7293dce7..17b5df8fc28a 100644 --- a/arch/frv/include/asm/unistd.h +++ b/arch/frv/include/asm/unistd.h @@ -13,7 +13,6 @@ /* #define __ARCH_WANT_SYS_GETHOSTNAME */ #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -/* #define __ARCH_WANT_SYS_SGETMASK */ /* #define __ARCH_WANT_SYS_SIGNAL */ #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h index 33afa56ad47a..1fcdd344c7ad 100644 --- a/arch/m68k/include/asm/unistd.h +++ b/arch/m68k/include/asm/unistd.h @@ -13,7 +13,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h index b14232b6878f..fd56a8f66489 100644 --- a/arch/microblaze/include/asm/unistd.h +++ b/arch/microblaze/include/asm/unistd.h @@ -19,7 +19,6 @@ #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h index 413d6c612bec..e55813029d5a 100644 --- a/arch/mips/include/asm/unistd.h +++ b/arch/mips/include/asm/unistd.h @@ -29,7 +29,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_UTIME #define __ARCH_WANT_SYS_WAITPID #define __ARCH_WANT_SYS_SOCKETCALL diff --git a/arch/mn10300/include/asm/unistd.h b/arch/mn10300/include/asm/unistd.h index 9d4e2d1ef90e..0522468f488b 100644 --- a/arch/mn10300/include/asm/unistd.h +++ b/arch/mn10300/include/asm/unistd.h @@ -26,7 +26,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h index 74d835820ee7..5f4c68daa261 100644 --- a/arch/parisc/include/asm/unistd.h +++ b/arch/parisc/include/asm/unistd.h @@ -145,7 +145,6 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \ #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_COMPAT_SYS_TIME diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index 9b892bbd9d84..5ce5552ab9f5 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -29,7 +29,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/sh/include/asm/unistd.h b/arch/sh/include/asm/unistd.h index e77816c4b9bc..126fe8340b22 100644 --- a/arch/sh/include/asm/unistd.h +++ b/arch/sh/include/asm/unistd.h @@ -11,7 +11,6 @@ # define __ARCH_WANT_SYS_GETHOSTNAME # define __ARCH_WANT_SYS_IPC # define __ARCH_WANT_SYS_PAUSE -# define __ARCH_WANT_SYS_SGETMASK # define __ARCH_WANT_SYS_SIGNAL # define __ARCH_WANT_SYS_TIME # define __ARCH_WANT_SYS_UTIME diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index dfa53fdd5cbc..0aac1e8f2968 100644 --- a/arch/sparc/include/asm/unistd.h +++ b/arch/sparc/include/asm/unistd.h @@ -25,7 +25,6 @@ #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 3f556c6a0157..2b19caa4081c 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -41,7 +41,6 @@ # define __ARCH_WANT_SYS_OLD_GETRLIMIT # define __ARCH_WANT_SYS_OLD_UNAME # define __ARCH_WANT_SYS_PAUSE -# define __ARCH_WANT_SYS_SGETMASK # define __ARCH_WANT_SYS_SIGNAL # define __ARCH_WANT_SYS_SIGPENDING # define __ARCH_WANT_SYS_SIGPROCMASK diff --git a/init/Kconfig b/init/Kconfig index ce034ad4a162..9d76b99af1b9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1313,6 +1313,16 @@ config UID16 help This enables the legacy 16-bit UID syscall wrappers. +config SGETMASK_SYSCALL + bool "sgetmask/ssetmask syscalls support" if EXPERT + def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH + ---help--- + sys_sgetmask and sys_ssetmask are obsolete system calls + no longer supported in libc but still enabled by default in some + architectures. + + If unsure, leave the default option here. + config SYSFS_SYSCALL bool "Sysfs syscall support" if EXPERT default y diff --git a/kernel/signal.c b/kernel/signal.c index 6ea13c09ae56..6e600aaa2af4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3496,7 +3496,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, } #endif -#ifdef __ARCH_WANT_SYS_SGETMASK +#ifdef CONFIG_SGETMASK_SYSCALL /* * For backwards compatibility. Functionality superseded by sigprocmask. @@ -3517,7 +3517,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) return old; } -#endif /* __ARCH_WANT_SGETMASK */ +#endif /* CONFIG_SGETMASK_SYSCALL */ #ifdef __ARCH_WANT_SYS_SIGNAL /* diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bc8d1b74a6b9..36441b51b5df 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -135,6 +135,8 @@ cond_syscall(sys_setresgid16); cond_syscall(sys_setresuid16); cond_syscall(sys_setreuid16); cond_syscall(sys_setuid16); +cond_syscall(sys_sgetmask); +cond_syscall(sys_ssetmask); cond_syscall(sys_vm86old); cond_syscall(sys_vm86); cond_syscall(sys_ipc); -- cgit v1.2.3 From a8fe19ebfbfd90ec17c02284717238b02efb9580 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 4 Jun 2014 16:11:46 -0700 Subject: kernel/printk: use symbolic defines for console loglevels ... instead of naked numbers. Stuff in sysrq.c used to set it to 8 which is supposed to mean above default level so set it to DEBUG instead as we're terminating/killing all tasks and we want to be verbose there. Also, correct the check in x86_64_start_kernel which should be >= as we're clearly issuing the string there for all debug levels, not only the magical 10. Signed-off-by: Borislav Petkov Acked-by: Kees Cook Acked-by: Randy Dunlap Cc: Joe Perches Cc: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/head64.c | 2 +- arch/x86/platform/uv/uv_nmi.c | 2 +- drivers/nubus/nubus.c | 18 +++++++++--------- drivers/tty/sysrq.c | 8 ++++---- include/linux/printk.h | 15 +++++++++++++-- init/main.c | 4 ++-- kernel/debug/kdb/kdb_bt.c | 2 +- kernel/debug/kdb/kdb_io.c | 2 +- kernel/debug/kdb/kdb_main.c | 2 +- kernel/printk/printk.c | 13 +++---------- 10 files changed, 36 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 068054f4bf20..eda1a865641e 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -172,7 +172,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) */ load_ucode_bsp(); - if (console_loglevel == 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) early_printk("Kernel alive\n"); clear_page(init_level4_pgt); diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index be27da60dc8f..c89c93320c12 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -85,7 +85,7 @@ static cpumask_var_t uv_nmi_cpu_mask; * Default is all stack dumps go to the console and buffer. * Lower level to send to log buffer only. */ -static int uv_nmi_loglevel = 7; +static int uv_nmi_loglevel = CONSOLE_LOGLEVEL_DEFAULT; module_param_named(dump_loglevel, uv_nmi_loglevel, int, 0644); /* diff --git a/drivers/nubus/nubus.c b/drivers/nubus/nubus.c index 43926cd25ae8..5066a7ef7b6c 100644 --- a/drivers/nubus/nubus.c +++ b/drivers/nubus/nubus.c @@ -473,7 +473,7 @@ static struct nubus_dev* __init if (slot == 0 && (unsigned long)dir.base % 2) dir.base += 1; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_functional_resource: parent is 0x%p, dir is 0x%p\n", parent->base, dir.base); @@ -568,7 +568,7 @@ static int __init nubus_get_vidnames(struct nubus_board* board, printk(KERN_INFO " video modes supported:\n"); nubus_get_subdir(parent, &dir); - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_vidnames: parent is 0x%p, dir is 0x%p\n", parent->base, dir.base); @@ -629,7 +629,7 @@ static int __init nubus_get_vendorinfo(struct nubus_board* board, printk(KERN_INFO " vendor info:\n"); nubus_get_subdir(parent, &dir); - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_vendorinfo: parent is 0x%p, dir is 0x%p\n", parent->base, dir.base); @@ -654,7 +654,7 @@ static int __init nubus_get_board_resource(struct nubus_board* board, int slot, struct nubus_dirent ent; nubus_get_subdir(parent, &dir); - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_board_resource: parent is 0x%p, dir is 0x%p\n", parent->base, dir.base); @@ -753,19 +753,19 @@ static void __init nubus_find_rom_dir(struct nubus_board* board) if (nubus_readdir(&dir, &ent) == -1) goto badrom; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_INFO "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); /* This one takes us to where we want to go. */ if (nubus_readdir(&dir, &ent) == -1) goto badrom; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); nubus_get_subdir(&ent, &dir); /* Resource ID 01, also an "Unknown Macintosh" */ if (nubus_readdir(&dir, &ent) == -1) goto badrom; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); /* FIXME: the first one is *not* always the right one. We @@ -780,7 +780,7 @@ static void __init nubus_find_rom_dir(struct nubus_board* board) path to that address... */ if (nubus_readdir(&dir, &ent) == -1) goto badrom; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); /* Bwahahahaha... */ @@ -816,7 +816,7 @@ static struct nubus_board* __init nubus_add_board(int slot, int bytelanes) board->fblock = rp; /* Dump the format block for debugging purposes */ - if (console_loglevel >= 10) { + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) { int i; printk(KERN_DEBUG "Slot %X, format block at 0x%p\n", slot, rp); diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index ce396ecdf412..b767a64e49d9 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -88,7 +88,7 @@ static void sysrq_handle_loglevel(int key) int i; i = key - '0'; - console_loglevel = 7; + console_loglevel = CONSOLE_LOGLEVEL_DEFAULT; printk("Loglevel set to %d\n", i); console_loglevel = i; } @@ -343,7 +343,7 @@ static void send_sig_all(int sig) static void sysrq_handle_term(int key) { send_sig_all(SIGTERM); - console_loglevel = 8; + console_loglevel = CONSOLE_LOGLEVEL_DEBUG; } static struct sysrq_key_op sysrq_term_op = { .handler = sysrq_handle_term, @@ -387,7 +387,7 @@ static struct sysrq_key_op sysrq_thaw_op = { static void sysrq_handle_kill(int key) { send_sig_all(SIGKILL); - console_loglevel = 8; + console_loglevel = CONSOLE_LOGLEVEL_DEBUG; } static struct sysrq_key_op sysrq_kill_op = { .handler = sysrq_handle_kill, @@ -520,7 +520,7 @@ void __handle_sysrq(int key, bool check_mask) * routing in the consumers of /proc/kmsg. */ orig_log_level = console_loglevel; - console_loglevel = 7; + console_loglevel = CONSOLE_LOGLEVEL_DEFAULT; printk(KERN_INFO "SysRq : "); op_p = __sysrq_get_key_op(key); diff --git a/include/linux/printk.h b/include/linux/printk.h index 37f3a6589c1c..319ff7e53efb 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -30,6 +30,17 @@ static inline const char *printk_skip_level(const char *buffer) return buffer; } +/* printk's without a loglevel use this.. */ +#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL + +/* We show everything that is MORE important than this.. */ +#define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */ +#define CONSOLE_LOGLEVEL_MIN 1 /* Minimum loglevel we let people use */ +#define CONSOLE_LOGLEVEL_QUIET 4 /* Shhh ..., when booted with "quiet" */ +#define CONSOLE_LOGLEVEL_DEFAULT 7 /* anything MORE serious than KERN_DEBUG */ +#define CONSOLE_LOGLEVEL_DEBUG 10 /* issue debug messages */ +#define CONSOLE_LOGLEVEL_MOTORMOUTH 15 /* You can't shut this one up */ + extern int console_printk[]; #define console_loglevel (console_printk[0]) @@ -39,13 +50,13 @@ extern int console_printk[]; static inline void console_silent(void) { - console_loglevel = 0; + console_loglevel = CONSOLE_LOGLEVEL_SILENT; } static inline void console_verbose(void) { if (console_loglevel) - console_loglevel = 15; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; } struct va_format { diff --git a/init/main.c b/init/main.c index e08c0b2065a1..04fab8d74c89 100644 --- a/init/main.c +++ b/init/main.c @@ -203,13 +203,13 @@ EXPORT_SYMBOL(loops_per_jiffy); static int __init debug_kernel(char *str) { - console_loglevel = 10; + console_loglevel = CONSOLE_LOGLEVEL_DEBUG; return 0; } static int __init quiet_kernel(char *str) { - console_loglevel = 4; + console_loglevel = CONSOLE_LOGLEVEL_QUIET; return 0; } diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index b03e0e814e43..fe15fff5df53 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -21,7 +21,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr) { int old_lvl = console_loglevel; - console_loglevel = 15; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; kdb_set_current_task(p); if (addr) { diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 14ff4849262c..7c70812caea5 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -710,7 +710,7 @@ kdb_printit: } if (logging) { saved_loglevel = console_loglevel; - console_loglevel = 0; + console_loglevel = CONSOLE_LOGLEVEL_SILENT; printk(KERN_INFO "%s", kdb_buffer); } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 0b097c8a1e50..2f7c760305ca 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1091,7 +1091,7 @@ static int kdb_reboot(int argc, const char **argv) static void kdb_dumpregs(struct pt_regs *regs) { int old_lvl = console_loglevel; - console_loglevel = 15; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; show_regs(regs); kdb_trap_printk--; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 923c5d4e4202..ea2d5f6962ed 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -54,18 +54,11 @@ #include "console_cmdline.h" #include "braille.h" -/* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL - -/* We show everything that is MORE important than this.. */ -#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ -#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ - int console_printk[4] = { - DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ + CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ - MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ - DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ + CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ + CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; /* Deferred messaged from sched code are marked by this special level */ -- cgit v1.2.3 From 36fac0a214805bd7c8307cad1cde60a7b833266d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:36:45 -0700 Subject: signals: kill sigfindinword() It has no users and it doesn't look useful. I do not know why/when it was introduced, I can't even find any user in the git history. Signed-off-by: Oleg Nesterov Acked-by: Geert Uytterhoeven Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m68k/include/asm/signal.h | 9 --------- arch/x86/include/asm/signal.h | 6 ------ include/linux/signal.h | 5 ----- 3 files changed, 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/m68k/include/asm/signal.h b/arch/m68k/include/asm/signal.h index 214320b50384..8c8ce5e1ee0e 100644 --- a/arch/m68k/include/asm/signal.h +++ b/arch/m68k/include/asm/signal.h @@ -60,15 +60,6 @@ static inline int __gen_sigismember(sigset_t *set, int _sig) __const_sigismember(set,sig) : \ __gen_sigismember(set,sig)) -static inline int sigfindinword(unsigned long word) -{ - asm ("bfffo %1{#0,#0},%0" - : "=d" (word) - : "d" (word & -word) - : "cc"); - return word ^ 31; -} - #endif /* !CONFIG_CPU_HAS_NO_BITFIELDS */ #ifndef __uClinux__ diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 35e67a457182..31eab867e6d3 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -92,12 +92,6 @@ static inline int __gen_sigismember(sigset_t *set, int _sig) ? __const_sigismember((set), (sig)) \ : __gen_sigismember((set), (sig))) -static inline int sigfindinword(unsigned long word) -{ - asm("bsfl %1,%0" : "=r"(word) : "rm"(word) : "cc"); - return word; -} - struct pt_regs; #else /* __i386__ */ diff --git a/include/linux/signal.h b/include/linux/signal.h index 2ac423bdb676..ae744c314630 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -63,11 +63,6 @@ static inline int sigismember(sigset_t *set, int _sig) return 1 & (set->sig[sig / _NSIG_BPW] >> (sig % _NSIG_BPW)); } -static inline int sigfindinword(unsigned long word) -{ - return ffz(~word); -} - #endif /* __HAVE_ARCH_SIG_BITOPS */ static inline int sigisemptyset(sigset_t *set) -- cgit v1.2.3