From 1db491f77b6ed0f32f1d4a3ac40a5be9524f1914 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 15 Jan 2015 20:30:01 -0800 Subject: x86/mm: Reduce PAE-mode per task pgd allocation overhead from 4K to 32 bytes With more embedded systems emerging using Quark, among other things, the 32-bit kernel matters again. A 32-bit machine and kernel use PAE paging, which currently wastes at least 4K of memory per process on Linux, because we have to reserve an entire page to hold a single 32-byte PGD structure. It would be a very good thing if we could eliminate that wastage. PAE paging is used to access more than 4GB of memory on x86-32, and it is required for NX. In this patch, we still allocate one page for the pgd for a Xen domain and for 64-bit kernels, because a one-page pgd is assumed in these cases. But we can save memory by allocating only a 32-byte pgd for a 32-bit PAE kernel when it is not running as a Xen domain. Signed-off-by: Fenghua Yu Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Christoph Lameter Cc: Dave Hansen Cc: Glenn Williamson Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1421382601-46912-1-git-send-email-fenghua.yu@intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pgtable.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 6fb6927f9e76..d223e1fe1dd2 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -271,12 +271,87 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) } } +/* + * Xen paravirt assumes the pgd table should be in one page. A 64-bit kernel + * also assumes that the pgd should be in one page. + * + * But a kernel with PAE paging that is not running as a Xen domain + * only needs to allocate 32 bytes for the pgd instead of one page. + */ +#ifdef CONFIG_X86_PAE + +#include + +#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) +#define PGD_ALIGN 32 + +static struct kmem_cache *pgd_cache; + +static int __init pgd_cache_init(void) +{ + /* + * When a PAE kernel is running as a Xen domain, it does not use a + * shared kernel pmd, and this requires a whole page for the pgd. + */ + if (!SHARED_KERNEL_PMD) + return 0; + + /* + * When a PAE kernel is not running as a Xen domain, it uses a + * shared kernel pmd, which does not require a whole + * page for the pgd. We are able to just allocate 32 bytes for the pgd. + * During boot, we create a 32-byte slab for pgd table allocation. + */ + pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, + SLAB_PANIC, NULL); + if (!pgd_cache) + return -ENOMEM; + + return 0; +} +core_initcall(pgd_cache_init); + +static inline pgd_t *_pgd_alloc(void) +{ + /* + * If no SHARED_KERNEL_PMD, the PAE kernel is running as a Xen domain, + * and we allocate one page for the pgd. + */ + if (!SHARED_KERNEL_PMD) + return (pgd_t *)__get_free_page(PGALLOC_GFP); + + /* + * Now the PAE kernel is not running as a Xen domain. We can allocate + * a 32-byte slab object for the pgd to save memory.
+ */ + return kmem_cache_alloc(pgd_cache, PGALLOC_GFP); +} + +static inline void _pgd_free(pgd_t *pgd) +{ + if (!SHARED_KERNEL_PMD) + free_page((unsigned long)pgd); + else + kmem_cache_free(pgd_cache, pgd); +} +#else +static inline pgd_t *_pgd_alloc(void) +{ + return (pgd_t *)__get_free_page(PGALLOC_GFP); +} + +static inline void _pgd_free(pgd_t *pgd) +{ + free_page((unsigned long)pgd); +} +#endif /* CONFIG_X86_PAE */ + pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *pmds[PREALLOCATED_PMDS]; - pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); + pgd = _pgd_alloc(); if (pgd == NULL) goto out; @@ -306,7 +381,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) out_free_pmds: free_pmds(pmds); out_free_pgd: - free_page((unsigned long)pgd); + _pgd_free(pgd); out: return NULL; } @@ -316,7 +391,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) pgd_mop_up_pmds(mm, pgd); pgd_dtor(pgd); paravirt_pgd_free(mm, pgd); - free_page((unsigned long)pgd); + _pgd_free(pgd); } /* -- cgit v1.2.3 From 1f40a8bfa91adfa8d1ac5013c08c76f6fd6691ad Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sun, 28 Dec 2014 17:15:24 +0100 Subject: x86/mm/pat: Ensure different messages in STRICT_DEVMEM and PAT cases STRICT_DEVMEM and PAT produce the same failure when accessing /dev/mem, which is quite confusing to the user. Make the printk messages different to lessen confusion. Signed-off-by: Pavel Machek Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 7ac68698406c..35af6771a95a 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -610,7 +610,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, } #ifdef CONFIG_STRICT_DEVMEM -/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/ +/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */ static inline int range_is_allowed(unsigned long pfn, unsigned long size) { return 1; @@ -628,8 +628,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) while (cursor < to) { if (!devmem_is_allowed(pfn)) { - printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n", - current->comm, from, to - 1); + printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n", + current->comm, from, to - 1); return 0; } cursor += PAGE_SIZE; -- cgit v1.2.3 From 8d4a40bc0651ea51c196a3d3016d041c41ec19a2 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 24 Feb 2015 10:13:28 +0100 Subject: x86/mm: Use early_memunmap() instead of early_iounmap() Memory mapped via early_memremap() should be unmapped with early_memunmap() instead of early_iounmap().
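The intended pairing looks like this; a minimal sketch only, where the parse_boot_blob() helper and its arguments are illustrative and not code from this patch:

	#include <linux/io.h>	/* early_memremap()/early_memunmap() on x86 */

	static void __init parse_boot_blob(u64 phys, u32 len)
	{
		/* Temporarily map a blob of ordinary RAM during early boot. */
		void *p = early_memremap(phys, len);

		if (!p)
			return;
		/* ... read the blob through the plain pointer p ... */
		early_memunmap(p, len);		/* pairs with early_memremap() */
	}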
Signed-off-by: Juergen Gross Cc: matt.fleming@intel.com Link: http://lkml.kernel.org/r/1424769211-11378-2-git-send-email-jgross@suse.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/devicetree.c | 4 ++-- arch/x86/kernel/e820.c | 2 +- arch/x86/kernel/setup.c | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 3d3503351242..6367a780cc8c 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -286,13 +286,13 @@ static void __init x86_flattree_get_config(void) initial_boot_params = dt = early_memremap(initial_dtb, map_len); size = of_get_flat_dt_size(); if (map_len < size) { - early_iounmap(dt, map_len); + early_memunmap(dt, map_len); initial_boot_params = dt = early_memremap(initial_dtb, size); map_len = size; } unflatten_and_copy_device_tree(); - early_iounmap(dt, map_len); + early_memunmap(dt, map_len); } #else static inline void x86_flattree_get_config(void) { } diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 46201deee923..7d46bb260334 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -661,7 +661,7 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len) extmap = (struct e820entry *)(sdata->data); __append_e820_map(extmap, entries); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - early_iounmap(sdata, data_len); + early_memunmap(sdata, data_len); printk(KERN_INFO "e820: extended physical RAM map:\n"); e820_print_map("extended"); } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 98dc9317286e..733864a653ab 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -356,7 +356,7 @@ static void __init relocate_initrd(void) mapaddr = ramdisk_image & PAGE_MASK; p = early_memremap(mapaddr, clen+slop); memcpy(q, p+slop, clen); - early_iounmap(p, clen+slop); + early_memunmap(p, clen+slop); q += clen; ramdisk_image += clen; ramdisk_size -= clen; @@ -445,7 +445,7 @@ static void __init parse_setup_data(void) data_len = data->len + sizeof(struct setup_data); data_type = data->type; pa_next = data->next; - early_iounmap(data, sizeof(*data)); + early_memunmap(data, sizeof(*data)); switch (data_type) { case SETUP_E820_EXT: @@ -480,7 +480,7 @@ static void __init e820_reserve_setup_data(void) E820_RAM, E820_RESERVED_KERN); found = 1; pa_data = data->next; - early_iounmap(data, sizeof(*data)); + early_memunmap(data, sizeof(*data)); } if (!found) return; @@ -501,7 +501,7 @@ static void __init memblock_x86_reserve_range_setup_data(void) data = early_memremap(pa_data, sizeof(*data)); memblock_reserve(pa_data, sizeof(*data) + data->len); pa_data = data->next; - early_iounmap(data, sizeof(*data)); + early_memunmap(data, sizeof(*data)); } } -- cgit v1.2.3 From 954e12f7a800ce38b4722ca1d7a6d0293d377b55 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 24 Feb 2015 10:13:31 +0100 Subject: x86/mm, efi: Use early_ioremap() in arch/x86/platform/efi/efi-bgrt.c Use early_ioremap() to map an I/O-area instead of early_memremap(). 
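The distinction matters for the backing type of the memory and for sparse annotations: early_memremap() returns a plain pointer for normal RAM, while early_ioremap() returns an __iomem pointer for device/I/O areas. A hedged sketch (the function name and parameters are illustrative, not from this patch):

	#include <linux/io.h>

	static void __init peek_io_area(resource_size_t phys, unsigned long len)
	{
		void __iomem *p = early_ioremap(phys, len);	/* I/O area */

		if (!p)
			return;
		/* ... access through readb()/memcpy_fromio(), not plain loads ... */
		early_iounmap(p, len);		/* pairs with early_ioremap() */
	}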
Signed-off-by: Juergen Gross Cc: matt.fleming@intel.com Link: http://lkml.kernel.org/r/1424769211-11378-5-git-send-email-jgross@suse.com Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi-bgrt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index d143d216d52b..d7f997f7c26d 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -67,7 +67,7 @@ void __init efi_bgrt_init(void) image = efi_lookup_mapped_addr(bgrt_tab->image_address); if (!image) { - image = early_memremap(bgrt_tab->image_address, + image = early_ioremap(bgrt_tab->image_address, sizeof(bmp_header)); ioremapped = true; if (!image) { @@ -89,7 +89,7 @@ void __init efi_bgrt_init(void) } if (ioremapped) { - image = early_memremap(bgrt_tab->image_address, + image = early_ioremap(bgrt_tab->image_address, bmp_header.size); if (!image) { pr_err("Ignoring BGRT: failed to map image memory\n"); -- cgit v1.2.3 From 6bbb614ec478961c7443086bdf7fd6784479c14a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 27 Feb 2015 15:55:40 +0100 Subject: x86/mm: Unexport set_memory_ro() and set_memory_rw() This effectively unexports the set_memory_ro() and set_memory_rw() functions, and thus reverts: a03352d2c1dc ("x86: export set_memory_ro and set_memory_rw"). They were introduced for debugging purposes in e1000e, but no module user is left in the mainline kernel, and we explicitly do not want modules to use these functions, as they e.g. protect eBPF (interpreted & JIT'ed) images from malicious modifications or bugs. Beyond the eBPF use case, I believe other set_memory_*() functions should be unexported on x86 for modules as well. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Bruce Allan Cc: H. Peter Anvin Cc: Jesse Brandeburg Cc: Thomas Gleixner Cc: davem@davemloft.net Link: http://lkml.kernel.org/r/a064393a0a5d319eebde5c761cfd743132d4f213.1425040940.git.daniel@iogearbox.net Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 536ea2fb6e33..81e8282d8c2f 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1654,13 +1654,11 @@ int set_memory_ro(unsigned long addr, int numpages) { return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); } -EXPORT_SYMBOL_GPL(set_memory_ro); int set_memory_rw(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); } -EXPORT_SYMBOL_GPL(set_memory_rw); int set_memory_np(unsigned long addr, int numpages) { -- cgit v1.2.3 From d9fd579c218e22c897f0f1b9e132af9b436cf445 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 4 Mar 2015 17:24:11 -0800 Subject: x86/mm: Use IS_ENABLED() for direct_gbpages Replace the #ifdef eyesore with IS_ENABLED() use. Signed-off-by: Luis R. Rodriguez Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dave Hansen Cc: David Vrabel Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: H.
Peter Anvin Cc: JBeulich@suse.com Cc: Jan Beulich Cc: Joonsoo Kim Cc: Juergen Gross Cc: Linus Torvalds Cc: Pavel Machek Cc: Thomas Gleixner Cc: Tony Lindgren Cc: Toshi Kani Cc: Vlastimil Babka Cc: Xishi Qiu Cc: julia.lawall@lip6.fr Link: http://lkml.kernel.org/r/1425518654-3403-2-git-send-email-mcgrof@do-not-panic.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index a110efca6d06..74f2b37fd073 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -131,11 +131,7 @@ void __init early_alloc_pgt_buf(void) int after_bootmem; -int direct_gbpages -#ifdef CONFIG_DIRECT_GBPAGES - = 1 -#endif -; +int direct_gbpages = IS_ENABLED(CONFIG_DIRECT_GBPAGES); static void __init init_gbpages(void) { -- cgit v1.2.3 From e5008abe929c160d36e44b8c2b644d4330d2e389 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 4 Mar 2015 17:24:12 -0800 Subject: x86/mm: Simplify enabling direct_gbpages direct_gbpages can be force-enabled via an early parameter, yet not actually take effect when DEBUG_PAGEALLOC or KMEMCHECK is enabled. You can also enable direct_gbpages right now on an x86_64 build even if your CPU has no support for the feature. In both cases PG_LEVEL_1G won't actually be enabled, but direct_gbpages is used in other areas under the assumption that PG_LEVEL_1G was set. Fix this by collecting all the requirements that make this feature sensible to enable into the new ENABLE_DIRECT_GBPAGES Kconfig symbol, and only flipping on PG_LEVEL_1G (and leaving it set) when all of them are met; the feature can then only be enabled on sensible builds as defined by ENABLE_DIRECT_GBPAGES. If the CPU supports it, you can enable this either through the DIRECT_GBPAGES option or through the early kernel parameter; likewise, on a capable platform you can always force-disable it. Signed-off-by: Luis R. Rodriguez Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dave Hansen Cc: David Vrabel Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: JBeulich@suse.com Cc: Jan Beulich Cc: Joonsoo Kim Cc: Juergen Gross Cc: Linus Torvalds Cc: Pavel Machek Cc: Thomas Gleixner Cc: Tony Lindgren Cc: Toshi Kani Cc: Vlastimil Babka Cc: Xishi Qiu Cc: julia.lawall@lip6.fr Link: http://lkml.kernel.org/r/1425518654-3403-3-git-send-email-mcgrof@do-not-panic.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 18 +++++++++++++----- arch/x86/mm/init.c | 17 +++++++++-------- arch/x86/mm/pageattr.c | 2 -- 3 files changed, 22 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c2fb8a87dccb..4d06e1c8294a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1299,14 +1299,22 @@ config ARCH_DMA_ADDR_T_64BIT def_bool y depends on X86_64 || HIGHMEM64G +config ENABLE_DIRECT_GBPAGES + def_bool y + depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK + config DIRECT_GBPAGES bool "Enable 1GB pages for kernel pagetables" if EXPERT default y - depends on X86_64 - ---help--- - Allow the kernel linear mapping to use 1GB pages on CPUs that - support it. This can improve the kernel's performance a tiny bit by - reducing TLB pressure. If in doubt, say "Y". + depends on ENABLE_DIRECT_GBPAGES + ---help--- + Enable by default the kernel linear mapping to use 1GB pages on CPUs + that support it. This can improve the kernel's performance a tiny bit + by reducing TLB pressure.
If in doubt, say "Y". If you've disabled + option but your platform is capable of handling support for this + you can use the gbpages kernel parameter. Likewise if you've enabled + this but you'd like to force disable this option you can use the + nogbpages kernel parameter. # Common NUMA Features config NUMA diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 74f2b37fd073..2ce2c8e8c99c 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -131,16 +131,21 @@ void __init early_alloc_pgt_buf(void) int after_bootmem; +static int page_size_mask; + int direct_gbpages = IS_ENABLED(CONFIG_DIRECT_GBPAGES); static void __init init_gbpages(void) { -#ifdef CONFIG_X86_64 - if (direct_gbpages && cpu_has_gbpages) + if (!IS_ENABLED(CONFIG_ENABLE_DIRECT_GBPAGES)) { + direct_gbpages = 0; + return; + } + if (direct_gbpages && cpu_has_gbpages) { printk(KERN_INFO "Using GB pages for direct mapping\n"); - else + page_size_mask |= 1 << PG_LEVEL_1G; + } else direct_gbpages = 0; -#endif } struct map_range { @@ -149,8 +154,6 @@ struct map_range { unsigned page_size_mask; }; -static int page_size_mask; - static void __init probe_page_size_mask(void) { init_gbpages(); @@ -161,8 +164,6 @@ static void __init probe_page_size_mask(void) * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. */ - if (direct_gbpages) - page_size_mask |= 1 << PG_LEVEL_1G; if (cpu_has_pse) page_size_mask |= 1 << PG_LEVEL_2M; #endif diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 81e8282d8c2f..89af288ec674 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -81,11 +81,9 @@ void arch_report_meminfo(struct seq_file *m) seq_printf(m, "DirectMap4M: %8lu kB\n", direct_pages_count[PG_LEVEL_2M] << 12); #endif -#ifdef CONFIG_X86_64 if (direct_gbpages) seq_printf(m, "DirectMap1G: %8lu kB\n", direct_pages_count[PG_LEVEL_1G] << 20); -#endif } #else static inline void split_page_count(int level) { } -- cgit v1.2.3 From 73c8c861dc5bddf1b24c6aeffee2292c96cf8db2 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 4 Mar 2015 17:24:14 -0800 Subject: x86/mm: Use early_param_on_off() for direct_gbpages The enabler/disabler is pretty simple: just use the provided wrappers. This lets us easily relate the variable to the associated Kconfig entry. Signed-off-by: Luis R. Rodriguez Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dave Hansen Cc: David Vrabel Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: H.
Peter Anvin Cc: JBeulich@suse.com Cc: Jan Beulich Cc: Joonsoo Kim Cc: Juergen Gross Cc: Linus Torvalds Cc: Pavel Machek Cc: Thomas Gleixner Cc: Tony Lindgren Cc: Toshi Kani Cc: Vlastimil Babka Cc: Xishi Qiu Cc: julia.lawall@lip6.fr Link: http://lkml.kernel.org/r/1425518654-3403-5-git-send-email-mcgrof@do-not-panic.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 3 ++- arch/x86/mm/init_64.c | 14 -------------- 2 files changed, 2 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 2ce2c8e8c99c..c35ba8bce7cb 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -133,7 +133,8 @@ int after_bootmem; static int page_size_mask; -int direct_gbpages = IS_ENABLED(CONFIG_DIRECT_GBPAGES); +early_param_on_off("gbpages", "nogbpages", + direct_gbpages, CONFIG_DIRECT_GBPAGES); static void __init init_gbpages(void) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 30eb05ae7061..3fba623e3ba5 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -130,20 +130,6 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, return 0; } -static int __init parse_direct_gbpages_off(char *arg) -{ - direct_gbpages = 0; - return 0; -} -early_param("nogbpages", parse_direct_gbpages_off); - -static int __init parse_direct_gbpages_on(char *arg) -{ - direct_gbpages = 1; - return 0; -} -early_param("gbpages", parse_direct_gbpages_on); - /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the * physical space so we can cache the place of the first one and move -- cgit v1.2.3 From 10971ab269bbf22120edac95fcfa3c873a549bea Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2015 08:18:23 +0100 Subject: x86/mm: Further simplify 1 GB kernel linear mappings handling It's a bit pointless to allow Kconfig configuration for 1GB kernel mappings, it's already hidden behind a 'default y' and CONFIG_EXPERT. Remove this complication and simplify the code by renaming CONFIG_ENABLE_DIRECT_GBPAGES to CONFIG_X86_DIRECT_GBPAGES and document the DEBUG_PAGE_ALLOC and KMEMCHECK quirks. Cc: Luis R. Rodriguez Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dave Hansen Cc: David Vrabel Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: JBeulich@suse.com Cc: Jan Beulich Cc: Joonsoo Kim Cc: Juergen Gross Cc: Linus Torvalds Cc: Pavel Machek Cc: Thomas Gleixner Cc: Tony Lindgren Cc: Toshi Kani Cc: Vlastimil Babka Cc: Xishi Qiu Cc: julia.lawall@lip6.fr Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 20 ++++++-------------- arch/x86/mm/init.c | 7 +------ 2 files changed, 7 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4d06e1c8294a..d03847513b6d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1299,22 +1299,14 @@ config ARCH_DMA_ADDR_T_64BIT def_bool y depends on X86_64 || HIGHMEM64G -config ENABLE_DIRECT_GBPAGES +config X86_DIRECT_GBPAGES def_bool y depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK - -config DIRECT_GBPAGES - bool "Enable 1GB pages for kernel pagetables" if EXPERT - default y - depends on ENABLE_DIRECT_GBPAGES - ---help--- - Enable by default the kernel linear mapping to use 1GB pages on CPUs - that support it. This can improve the kernel's performance a tiny bit - by reducing TLB pressure. If in doubt, say "Y". If you've disabled - option but your platform is capable of handling support for this - you can use the gbpages kernel parameter. 
Likewise if you've enabled - this but you'd like to force disable this option you can use the - nogbpages kernel parameter. + ---help--- + Certain kernel features effectively disable kernel + linear 1 GB mappings (even if the CPU otherwise + supports them), so don't confuse the user by printing + that we have them enabled. # Common NUMA Features config NUMA diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index c35ba8bce7cb..8704153f2675 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -133,15 +133,10 @@ int after_bootmem; static int page_size_mask; -early_param_on_off("gbpages", "nogbpages", - direct_gbpages, CONFIG_DIRECT_GBPAGES); +early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES); static void __init init_gbpages(void) { - if (!IS_ENABLED(CONFIG_ENABLE_DIRECT_GBPAGES)) { - direct_gbpages = 0; - return; - } if (direct_gbpages && cpu_has_gbpages) { printk(KERN_INFO "Using GB pages for direct mapping\n"); page_size_mask |= 1 << PG_LEVEL_1G; -- cgit v1.2.3 From e61980a70245715ab39cbee2b9d6e6afc1ec37d4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2015 08:25:01 +0100 Subject: x86/mm: Simplify probe_page_size_mask() Now that we've simplified the gbpages config space, move the 'page_size_mask' initialization into probe_page_size_mask(), right next to the PSE and PGE enablement lines. Cc: Luis R. Rodriguez Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dave Hansen Cc: David Vrabel Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: JBeulich@suse.com Cc: Jan Beulich Cc: Joonsoo Kim Cc: Juergen Gross Cc: Linus Torvalds Cc: Pavel Machek Cc: Thomas Gleixner Cc: Tony Lindgren Cc: Toshi Kani Cc: Vlastimil Babka Cc: Xishi Qiu Cc: julia.lawall@lip6.fr Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 8704153f2675..6dc85d51cd98 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -131,29 +131,18 @@ void __init early_alloc_pgt_buf(void) int after_bootmem; -static int page_size_mask; - early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES); -static void __init init_gbpages(void) -{ - if (direct_gbpages && cpu_has_gbpages) { - printk(KERN_INFO "Using GB pages for direct mapping\n"); - page_size_mask |= 1 << PG_LEVEL_1G; - } else - direct_gbpages = 0; -} - struct map_range { unsigned long start; unsigned long end; unsigned page_size_mask; }; +static int page_size_mask; + static void __init probe_page_size_mask(void) { - init_gbpages(); - #if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) /* * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. 
@@ -173,6 +162,14 @@ static void __init probe_page_size_mask(void) cr4_set_bits_and_update_boot(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } + + /* Enable 1 GB linear kernel mappings if available: */ + if (direct_gbpages && cpu_has_gbpages) { + printk(KERN_INFO "Using GB pages for direct mapping\n"); + page_size_mask |= 1 << PG_LEVEL_1G; + } else { + direct_gbpages = 0; + } } #ifdef CONFIG_X86_32 -- cgit v1.2.3 From c709feda56886c38af3116254f84cbe6a78b3a5d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2015 08:58:44 +0100 Subject: x86/mm/pat: Initialize __cachemode2pte_tbl[] and __pte2cachemode_tbl[] in a bit more readable fashion The initialization of these two arrays is a bit difficult to follow: restructure it optically so that a 2D structure shows which bit in the PTE is set and which not. Also improve on comments a bit. No code or data changed: # arch/x86/mm/init.o: text data bss dec hex filename 4585 424 29776 34785 87e1 init.o.before 4585 424 29776 34785 87e1 init.o.after md5: a82e11ff58bcfd0af3a94662a701f65d init.o.before.asm a82e11ff58bcfd0af3a94662a701f65d init.o.after.asm Reviewed-by: Juergen Gross Cc: Andy Lutomirski Cc: Dave Hansen Cc: Jan Beulich Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Toshi Kani Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150305082135.GB5969@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6dc85d51cd98..4469563f8c3b 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -29,29 +29,33 @@ /* * Tables translating between page_cache_type_t and pte encoding. - * Minimal supported modes are defined statically, modified if more supported - * cache modes are available. - * Index into __cachemode2pte_tbl is the cachemode. - * Index into __pte2cachemode_tbl are the caching attribute bits of the pte - * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. + * + * Minimal supported modes are defined statically, they are modified + * during bootup if more supported cache modes are available. + * + * Index into __cachemode2pte_tbl[] is the cachemode. + * + * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte + * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. 
*/ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { - [_PAGE_CACHE_MODE_WB] = 0, - [_PAGE_CACHE_MODE_WC] = _PAGE_PWT, - [_PAGE_CACHE_MODE_UC_MINUS] = _PAGE_PCD, - [_PAGE_CACHE_MODE_UC] = _PAGE_PCD | _PAGE_PWT, - [_PAGE_CACHE_MODE_WT] = _PAGE_PCD, - [_PAGE_CACHE_MODE_WP] = _PAGE_PCD, + [_PAGE_CACHE_MODE_WB ] = 0 | 0 , + [_PAGE_CACHE_MODE_WC ] = _PAGE_PWT | 0 , + [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, + [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, + [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, + [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, }; EXPORT_SYMBOL(__cachemode2pte_tbl); + uint8_t __pte2cachemode_tbl[8] = { - [__pte2cm_idx(0)] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT)] = _PAGE_CACHE_MODE_WC, - [__pte2cm_idx(_PAGE_PCD)] = _PAGE_CACHE_MODE_UC_MINUS, - [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD)] = _PAGE_CACHE_MODE_UC, - [__pte2cm_idx(_PAGE_PAT)] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC, - [__pte2cm_idx(_PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, + [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_WC, + [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, + [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, + [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC, + [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; EXPORT_SYMBOL(__pte2cachemode_tbl); -- cgit v1.2.3 From 4e26d11f52684dc8b1632a8cfe450cb5197a8464 Mon Sep 17 00:00:00 2001 From: Hector Marco-Gisbert Date: Fri, 27 Mar 2015 12:38:21 +0100 Subject: x86/mm: Improve AMD Bulldozer ASLR workaround The ASLR implementation needs to special-case AMD F15h processors by clearing out bits [14:12] of the virtual address in order to avoid I$ cross invalidations and thus performance penalty for certain workloads. For details, see: dfb09f9b7ab0 ("x86, amd: Avoid cache aliasing penalties on AMD family 15h") This special case reduces the mmapped file's entropy by 3 bits. The following output is the run on an AMD Opteron 62xx class CPU processor under x86_64 Linux 4.0.0: $ for i in `seq 1 10`; do cat /proc/self/maps | grep "r-xp.*libc" ; done b7588000-b7736000 r-xp 00000000 00:01 4924 /lib/i386-linux-gnu/libc.so.6 b7570000-b771e000 r-xp 00000000 00:01 4924 /lib/i386-linux-gnu/libc.so.6 b75d0000-b777e000 r-xp 00000000 00:01 4924 /lib/i386-linux-gnu/libc.so.6 b75b0000-b775e000 r-xp 00000000 00:01 4924 /lib/i386-linux-gnu/libc.so.6 b7578000-b7726000 r-xp 00000000 00:01 4924 /lib/i386-linux-gnu/libc.so.6 ... Bits [12:14] are always 0, i.e. the address always ends in 0x8000 or 0x0000. 32-bit systems, as in the example above, are especially sensitive to this issue because 32-bit randomness for VA space is 8 bits (see mmap_rnd()). With the Bulldozer special case, this diminishes to only 32 different slots of mmap virtual addresses. This patch randomizes per boot the three affected bits rather than setting them to zero. Since all the shared pages have the same value at bits [12..14], there is no cache aliasing problems. This value gets generated during system boot and it is thus not known to a potential remote attacker. Therefore, the impact from the Bulldozer workaround gets diminished and ASLR randomness increased. 
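To make the effect concrete, here is a worked example with illustrative numbers only (the real mask and random value depend on the CPU and on each boot):

	/* Typical F15h: va_align.mask ends up covering bits [14:12]. */
	unsigned long mask = 0x7000;
	unsigned long rnd  = 0x12345678;	/* example get_random_int() value */
	unsigned long bits = rnd & mask;	/* = 0x5000 for this boot */

Every file mapping then carries 0x5000 in bits [14:12] for this boot: all mappings still share one I$ slice, so the cache-aliasing workaround is preserved, but the slice is no longer predictably zero.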
More details at: http://hmarco.org/bugs/AMD-Bulldozer-linux-ASLR-weakness-reducing-mmaped-files-by-eight.html Original white paper by AMD dealing with the issue: http://developer.amd.com/wordpress/media/2012/10/SharedL1InstructionCacheonAMD15hCPU.pdf Mentored-by: Ismael Ripoll Signed-off-by: Hector Marco-Gisbert Signed-off-by: Borislav Petkov Acked-by: Kees Cook Cc: Alexander Viro Cc: Andrew Morton Cc: H. Peter Anvin Cc: Jan-Simon Cc: Thomas Gleixner Cc: linux-fsdevel@vger.kernel.org Link: http://lkml.kernel.org/r/1427456301-3764-1-git-send-email-hecmargi@upv.es Signed-off-by: Ingo Molnar --- arch/x86/include/asm/elf.h | 1 + arch/x86/kernel/cpu/amd.c | 4 ++++ arch/x86/kernel/sys_x86_64.c | 30 +++++++++++++++++++++++++++--- 3 files changed, 32 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index ca3347a9dab5..bd292ce9be0a 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -365,6 +365,7 @@ enum align_flags { struct va_alignment { int flags; unsigned long mask; + unsigned long bits; } ____cacheline_aligned; extern struct va_alignment va_align; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a220239cea65..ec6a61b21b41 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -488,6 +489,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) va_align.mask = (upperbit - 1) & PAGE_MASK; va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; + + /* A random value per boot for bit slice [12:upper_bit) */ + va_align.bits = get_random_int() & va_align.mask; } } diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 30277e27431a..10e0272d789a 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -34,10 +34,26 @@ static unsigned long get_align_mask(void) return va_align.mask; } +/* + * To avoid aliasing in the I$ on AMD F15h, the bits defined by the + * va_align.bits, [12:upper_bit), are set to a random value instead of + * zeroing them. This random value is computed once per boot. This form + * of ASLR is known as "per-boot ASLR". + * + * To achieve this, the random value is added to the info.align_offset + * value before calling vm_unmapped_area() or ORed directly to the + * address. + */ +static unsigned long get_align_bits(void) +{ + return va_align.bits & get_align_mask(); +} + unsigned long align_vdso_addr(unsigned long addr) { unsigned long align_mask = get_align_mask(); - return (addr + align_mask) & ~align_mask; + addr = (addr + align_mask) & ~align_mask; + return addr | get_align_bits(); } static int __init control_va_addr_alignment(char *str) @@ -135,8 +151,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, info.length = len; info.low_limit = begin; info.high_limit = end; - info.align_mask = filp ? get_align_mask() : 0; + info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; + if (filp) { + info.align_mask = get_align_mask(); + info.align_offset += get_align_bits(); + } return vm_unmapped_area(&info); } @@ -174,8 +194,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = mm->mmap_base; - info.align_mask = filp ? 
get_align_mask() : 0; + info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; + if (filp) { + info.align_mask = get_align_mask(); + info.align_offset += get_align_bits(); + } addr = vm_unmapped_area(&info); if (!(addr & ~PAGE_MASK)) return addr; -- cgit v1.2.3
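For reference, the alignment math added to align_vdso_addr() above can be sanity-checked in isolation. A standalone user-space sketch with made-up values (in the kernel the mask and bits come from va_align and are per-CPU and per-boot; assumes a 64-bit build):

	#include <stdio.h>

	int main(void)
	{
		unsigned long align_mask = 0x7000;	/* bits [14:12], as on F15h */
		unsigned long align_bits = 0x5000;	/* per-boot random, example */
		unsigned long addr = 0x7f1234561000UL;	/* candidate address */

		addr = (addr + align_mask) & ~align_mask;	/* round up... */
		addr |= align_bits;			/* ...then OR in the per-boot slice */

		printf("%#lx\n", addr);			/* prints 0x7f123456d000 */
		return 0;
	}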