From 0c133b1e78cd34dd9d18da707dc6f46170e9129e Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)"
Date: Wed, 23 Oct 2024 19:27:07 +0300
Subject: module: prepare to handle ROX allocations for text

In order to support ROX allocations for module text, it is necessary to
handle modifications to the code, such as relocations and alternatives
patching, without write access to that memory.

One option is to use text patching, but this would make module loading
extremely slow and would expose executable code that is not yet in its
final form.

A better way is to have memory allocated with ROX permissions contain
invalid instructions and to keep a writable, but not executable, copy of
the module text. The relocations and alternative patches are applied to
the writable copy using the addresses of the ROX memory. Once the module
is completely ready, the updated text is copied to the ROX memory using
text patching in one go and the writable copy is freed.

Add support for that to the module initialization code and provide the
necessary interfaces in execmem.

Link: https://lkml.kernel.org/r/20241023162711.2579610-5-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft)
Reviewed-by: Luis Chamberlain
Tested-by: kdevops
Cc: Andreas Larsson
Cc: Andy Lutomirski
Cc: Ard Biesheuvel
Cc: Arnd Bergmann
Cc: Borislav Petkov (AMD)
Cc: Brian Cain
Cc: Catalin Marinas
Cc: Christophe Leroy
Cc: Christoph Hellwig
Cc: Dave Hansen
Cc: Dinh Nguyen
Cc: Geert Uytterhoeven
Cc: Guo Ren
Cc: Helge Deller
Cc: Huacai Chen
Cc: Ingo Molnar
Cc: Johannes Berg
Cc: John Paul Adrian Glaubitz
Cc: Kent Overstreet
Cc: Liam R. Howlett
Cc: Mark Rutland
Cc: Masami Hiramatsu (Google)
Cc: Matt Turner
Cc: Max Filippov
Cc: Michael Ellerman
Cc: Michal Simek
Cc: Oleg Nesterov
Cc: Palmer Dabbelt
Cc: Peter Zijlstra
Cc: Richard Weinberger
Cc: Russell King
Cc: Song Liu
Cc: Stafford Horne
Cc: Steven Rostedt (Google)
Cc: Suren Baghdasaryan
Cc: Thomas Bogendoerfer
Cc: Thomas Gleixner
Cc: Uladzislau Rezki (Sony)
Cc: Vineet Gupta
Cc: Will Deacon
Signed-off-by: Andrew Morton
---
 include/linux/execmem.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
(limited to 'include/linux/execmem.h')

diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index 32cef1144117..dfdf19f8a5e8 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -46,9 +46,11 @@ enum execmem_type {
 /**
  * enum execmem_range_flags - options for executable memory allocations
  * @EXECMEM_KASAN_SHADOW: allocate kasan shadow
+ * @EXECMEM_ROX_CACHE: allocations should use ROX cache of huge pages
  */
 enum execmem_range_flags {
	EXECMEM_KASAN_SHADOW	= (1 << 0),
+	EXECMEM_ROX_CACHE	= (1 << 1),
 };
 
 /**
@@ -123,6 +125,27 @@ void *execmem_alloc(enum execmem_type type, size_t size);
  */
 void execmem_free(void *ptr);
 
+/**
+ * execmem_update_copy - copy an update to executable memory
+ * @dst:  destination address to update
+ * @src:  source address containing the data
+ * @size: how many bytes of memory should be copied
+ *
+ * Copy @size bytes from @src to @dst using text poking if the memory at
+ * @dst is read-only.
+ *
+ * Return: a pointer to @dst or NULL on error
+ */
+void *execmem_update_copy(void *dst, const void *src, size_t size);
+
+/**
+ * execmem_is_rox - check if execmem is read-only
+ * @type: the execmem type to check
+ *
+ * Return: %true if the @type is read-only, %false if it's writable
+ */
+bool execmem_is_rox(enum execmem_type type);
+
 #if defined(CONFIG_EXECMEM) && !defined(CONFIG_ARCH_WANTS_EXECMEM_LATE)
 void execmem_init(void);
 #else
--
cgit v1.2.3
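Taken together, the interfaces above are meant to be used along these lines.
This is a minimal sketch, not code from the series: the loader_apply_*()
helpers are hypothetical stand-ins for the module loader's relocation pass,
and error handling is reduced to the essentials.

	/* Sketch: preparing module text when execmem may return ROX memory. */
	static void *load_text(const void *text, size_t size)
	{
		void *dst = execmem_alloc(EXECMEM_MODULE_TEXT, size);
		void *buf;

		if (!dst)
			return NULL;

		if (!execmem_is_rox(EXECMEM_MODULE_TEXT)) {
			/* Writable memory: patch it directly, as before. */
			memcpy(dst, text, size);
			loader_apply_relocations(dst, size);	/* hypothetical */
			return dst;
		}

		/* ROX memory: prepare everything in a writable copy ... */
		buf = vzalloc(size);
		if (!buf) {
			execmem_free(dst);
			return NULL;
		}
		memcpy(buf, text, size);
		/* ... relocating against the final, ROX addresses ... */
		loader_apply_relocations_to_copy(buf, dst, size); /* hypothetical */

		/* ... then text-poke the finished image into place in one go. */
		if (!execmem_update_copy(dst, buf, size)) {
			vfree(buf);
			execmem_free(dst);
			return NULL;
		}

		vfree(buf);
		return dst;
	}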
From 2e45474ab14f0f17c1091c503a13ff2fe2a84486 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)"
Date: Wed, 23 Oct 2024 19:27:10 +0300
Subject: execmem: add support for cache of large ROX pages

Using large pages to map text areas reduces iTLB pressure and improves
performance.

Extend execmem_alloc() with the ability to use huge pages with ROX
permissions as a cache for smaller allocations.

To populate the cache, a writable large page is allocated from vmalloc with
VM_ALLOW_HUGE_VMAP, filled with invalid instructions and then remapped as
ROX. The direct map alias of that large page is excluded from the direct
map.

Portions of that large page are handed out to execmem_alloc() callers
without any changes to the permissions. When the memory is freed with
execmem_free() it is invalidated again so that it won't contain stale
instructions.

An architecture has to implement the execmem_fill_trapping_insns()
callback and select the ARCH_HAS_EXECMEM_ROX configuration option to be
able to use the ROX cache. The cache is enabled on a per-range basis when
an architecture sets the EXECMEM_ROX_CACHE flag in the definition of an
execmem_range.

Link: https://lkml.kernel.org/r/20241023162711.2579610-8-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft)
Reviewed-by: Luis Chamberlain
Tested-by: kdevops
Cc: Andreas Larsson
Cc: Andy Lutomirski
Cc: Ard Biesheuvel
Cc: Arnd Bergmann
Cc: Borislav Petkov (AMD)
Cc: Brian Cain
Cc: Catalin Marinas
Cc: Christophe Leroy
Cc: Christoph Hellwig
Cc: Dave Hansen
Cc: Dinh Nguyen
Cc: Geert Uytterhoeven
Cc: Guo Ren
Cc: Helge Deller
Cc: Huacai Chen
Cc: Ingo Molnar
Cc: Johannes Berg
Cc: John Paul Adrian Glaubitz
Cc: Kent Overstreet
Cc: Liam R. Howlett
Cc: Mark Rutland
Cc: Masami Hiramatsu (Google)
Cc: Matt Turner
Cc: Max Filippov
Cc: Michael Ellerman
Cc: Michal Simek
Cc: Oleg Nesterov
Cc: Palmer Dabbelt
Cc: Peter Zijlstra
Cc: Richard Weinberger
Cc: Russell King
Cc: Song Liu
Cc: Stafford Horne
Cc: Steven Rostedt (Google)
Cc: Suren Baghdasaryan
Cc: Thomas Bogendoerfer
Cc: Thomas Gleixner
Cc: Uladzislau Rezki (Sony)
Cc: Vineet Gupta
Cc: Will Deacon
Signed-off-by: Andrew Morton
---
 arch/Kconfig            |   8 ++
 include/linux/execmem.h |  14 +++
 mm/execmem.c            | 325 ++++++++++++++++++++++++++++++++++++++++++++++--
 mm/internal.h           |   1 +
 mm/vmalloc.c            |   5 +
 5 files changed, 345 insertions(+), 8 deletions(-)
(limited to 'include/linux/execmem.h')

diff --git a/arch/Kconfig b/arch/Kconfig
index bd9f095d69fa..89b14e4edc61 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1024,6 +1024,14 @@ config ARCH_WANTS_EXECMEM_LATE
	  enough entropy for module space randomization, for
	  instance arm64.
 
+config ARCH_HAS_EXECMEM_ROX
+	bool
+	depends on MMU && !HIGHMEM
+	help
+	  For architectures that support allocations of executable memory
+	  with read-only execute permissions. The architecture must
+	  implement the execmem_fill_trapping_insns() callback to enable this.
+
 config HAVE_IRQ_EXIT_ON_IRQ_STACK
	bool
	help
diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index dfdf19f8a5e8..1517fa196bf7 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -53,6 +53,20 @@ enum execmem_range_flags {
	EXECMEM_ROX_CACHE	= (1 << 1),
 };
 
+#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
+/**
+ * execmem_fill_trapping_insns - set memory to contain instructions that
+ *				 will trap
+ * @ptr:      pointer to memory to fill
+ * @size:     size of the range to fill
+ * @writable: whether the memory pointed to by @ptr is writable or ROX
+ *
+ * A hook for architectures to fill execmem ranges with invalid instructions.
+ * Architectures that use EXECMEM_ROX_CACHE must implement this.
+ */
+void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable);
+#endif
+
 /**
  * struct execmem_range - definition of an address space suitable for code and
  *			  related data allocations
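For reference, an architecture hook can be as simple as filling the range
with a one-byte breakpoint instruction. The sketch below is modelled on
what x86 can do with INT3 and text_poke_set(); other architectures need
their own trapping pattern and poking primitive, so treat this as
illustrative rather than as the implementation from this series:

	void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable)
	{
		/* INT3 (0xcc) traps on execution anywhere in the range */
		if (writable)
			memset(ptr, INT3_INSN_OPCODE, size);
		else
			text_poke_set(ptr, INT3_INSN_OPCODE, size);
	}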
diff --git a/mm/execmem.c b/mm/execmem.c
index 0f6691e9ffe6..576a57e2161f 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -6,29 +6,41 @@
  * Copyright (C) 2024 Mike Rapoport IBM.
  */
 
+#define pr_fmt(fmt) "execmem: " fmt
+
 #include <linux/mm.h>
+#include <linux/mutex.h>
 #include <linux/vmalloc.h>
 #include <linux/execmem.h>
+#include <linux/maple_tree.h>
+#include <linux/set_memory.h>
 #include <linux/moduleloader.h>
 #include <linux/text-patching.h>
 
+#include "internal.h"
+
 static struct execmem_info *execmem_info __ro_after_init;
 static struct execmem_info default_execmem_info __ro_after_init;
 
-static void *__execmem_alloc(struct execmem_range *range, size_t size)
+#ifdef CONFIG_MMU
+static void *execmem_vmalloc(struct execmem_range *range, size_t size,
+			     pgprot_t pgprot, unsigned long vm_flags)
 {
	bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
-	unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
+	unsigned int align = range->alignment;
	unsigned long start = range->start;
	unsigned long end = range->end;
-	unsigned int align = range->alignment;
-	pgprot_t pgprot = range->pgprot;
	void *p;
 
	if (kasan)
		vm_flags |= VM_DEFER_KMEMLEAK;
 
+	if (vm_flags & VM_ALLOW_HUGE_VMAP)
+		align = PMD_SIZE;
+
	p = __vmalloc_node_range(size, align, start, end, gfp_flags,
				 pgprot, vm_flags, NUMA_NO_NODE,
				 __builtin_return_address(0));
@@ -41,7 +53,7 @@ static void *__execmem_alloc(struct execmem_range *range, size_t size)
	}
 
	if (!p) {
-		pr_warn_ratelimited("execmem: unable to allocate memory\n");
+		pr_warn_ratelimited("unable to allocate memory\n");
		return NULL;
	}
 
@@ -50,14 +62,298 @@ static void *__execmem_alloc(struct execmem_range *range, size_t size)
		return NULL;
	}
 
-	return kasan_reset_tag(p);
+	return p;
 }
+#else
+static void *execmem_vmalloc(struct execmem_range *range, size_t size,
+			     pgprot_t pgprot, unsigned long vm_flags)
+{
+	return vmalloc(size);
+}
+#endif /* CONFIG_MMU */
+
+#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
+struct execmem_cache {
+	struct mutex mutex;
+	struct maple_tree busy_areas;
+	struct maple_tree free_areas;
+};
+
+static struct execmem_cache execmem_cache = {
+	.mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
+	.busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
+				     execmem_cache.mutex),
+	.free_areas = MTREE_INIT_EXT(free_areas, MT_FLAGS_LOCK_EXTERN,
+				     execmem_cache.mutex),
+};
+
+static inline unsigned long mas_range_len(struct ma_state *mas)
+{
+	return mas->last - mas->index + 1;
+}
+
+static int execmem_set_direct_map_valid(struct vm_struct *vm, bool valid)
+{
+	unsigned int nr = (1 << get_vm_area_page_order(vm));
+	unsigned int updated = 0;
+	int err = 0;
+
+	for (int i = 0; i < vm->nr_pages; i += nr) {
+		err = set_direct_map_valid_noflush(vm->pages[i], nr, valid);
+		if (err)
+			goto err_restore;
+
+		updated += nr;
+	}
+
+	return 0;
+
+err_restore:
+	for (int i = 0; i < updated; i += nr)
+		set_direct_map_valid_noflush(vm->pages[i], nr, !valid);
+
+	return err;
+}
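The two maple trees above index cache areas by address range: an entry
spanning [index, last] stores a pointer to the area's start, so the area
length is always recoverable from the iterator state. A toy illustration of
that convention (standalone, with made-up addresses; not part of the patch):

	static DEFINE_MTREE(example_mt);

	static void maple_range_example(void)
	{
		unsigned long addr = 0x1000;	/* illustrative values */
		size_t size = 0x3000;
		MA_STATE(mas, &example_mt, addr, addr);
		void *entry;

		/* an area spanning [addr, addr + size - 1] stores its start */
		mtree_store_range(&example_mt, addr, addr + size - 1,
				  (void *)addr, GFP_KERNEL);

		rcu_read_lock();
		entry = mas_walk(&mas);		/* entry == (void *)addr */
		rcu_read_unlock();

		/*
		 * Now mas.index == addr and mas.last == addr + size - 1,
		 * so mas_range_len(&mas) == size, as relied on below.
		 */
	}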
+
+static void execmem_cache_clean(struct work_struct *work)
+{
+	struct maple_tree *free_areas = &execmem_cache.free_areas;
+	struct mutex *mutex = &execmem_cache.mutex;
+	MA_STATE(mas, free_areas, 0, ULONG_MAX);
+	void *area;
+
+	mutex_lock(mutex);
+	mas_for_each(&mas, area, ULONG_MAX) {
+		size_t size = mas_range_len(&mas);
+
+		if (IS_ALIGNED(size, PMD_SIZE) &&
+		    IS_ALIGNED(mas.index, PMD_SIZE)) {
+			struct vm_struct *vm = find_vm_area(area);
+
+			execmem_set_direct_map_valid(vm, true);
+			mas_store_gfp(&mas, NULL, GFP_KERNEL);
+			vfree(area);
+		}
+	}
+	mutex_unlock(mutex);
+}
+
+static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);
+
+static int execmem_cache_add(void *ptr, size_t size)
+{
+	struct maple_tree *free_areas = &execmem_cache.free_areas;
+	struct mutex *mutex = &execmem_cache.mutex;
+	unsigned long addr = (unsigned long)ptr;
+	MA_STATE(mas, free_areas, addr - 1, addr + 1);
+	unsigned long lower, upper;
+	void *area = NULL;
+	int err;
+
+	lower = addr;
+	upper = addr + size - 1;
+
+	mutex_lock(mutex);
+	area = mas_walk(&mas);
+	if (area && mas.last == addr - 1)
+		lower = mas.index;
+
+	area = mas_next(&mas, ULONG_MAX);
+	if (area && mas.index == addr + size)
+		upper = mas.last;
+
+	mas_set_range(&mas, lower, upper);
+	err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL);
+	mutex_unlock(mutex);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static bool within_range(struct execmem_range *range, struct ma_state *mas,
+			 size_t size)
+{
+	unsigned long addr = mas->index;
+
+	if (addr >= range->start && addr + size < range->end)
+		return true;
+
+	if (range->fallback_start &&
+	    addr >= range->fallback_start && addr + size < range->fallback_end)
+		return true;
+
+	return false;
+}
+
+static void *__execmem_cache_alloc(struct execmem_range *range, size_t size)
+{
+	struct maple_tree *free_areas = &execmem_cache.free_areas;
+	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+	MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
+	MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
+	struct mutex *mutex = &execmem_cache.mutex;
+	unsigned long addr, last, area_size = 0;
+	void *area, *ptr = NULL;
+	int err;
+
+	mutex_lock(mutex);
+	mas_for_each(&mas_free, area, ULONG_MAX) {
+		area_size = mas_range_len(&mas_free);
+
+		if (area_size >= size && within_range(range, &mas_free, size))
+			break;
+	}
+
+	if (area_size < size)
+		goto out_unlock;
+
+	addr = mas_free.index;
+	last = mas_free.last;
+
+	/* insert allocated size to busy_areas at range [addr, addr + size) */
+	mas_set_range(&mas_busy, addr, addr + size - 1);
+	err = mas_store_gfp(&mas_busy, (void *)addr, GFP_KERNEL);
+	if (err)
+		goto out_unlock;
+
+	mas_store_gfp(&mas_free, NULL, GFP_KERNEL);
+	if (area_size > size) {
+		void *ptr = (void *)(addr + size);
+
+		/*
+		 * re-insert remaining free size to free_areas at range
+		 * [addr + size, last]
+		 */
+		mas_set_range(&mas_free, addr + size, last);
+		err = mas_store_gfp(&mas_free, ptr, GFP_KERNEL);
+		if (err) {
+			mas_store_gfp(&mas_busy, NULL, GFP_KERNEL);
+			goto out_unlock;
+		}
+	}
+	ptr = (void *)addr;
+
+out_unlock:
+	mutex_unlock(mutex);
+	return ptr;
+}
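Concretely: if free_areas holds a single 2M area [A, A + 2M - 1] and a 24K
allocation arrives, __execmem_cache_alloc() stores [A, A + 24K - 1] in
busy_areas, erases the old free entry and re-inserts the remainder
[A + 24K, A + 2M - 1] into free_areas. The caller gets A back without any
page table or permission changes at all; only a cache miss pays the cost of
populating a new huge page, which is what happens next.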
+
+static int execmem_cache_populate(struct execmem_range *range, size_t size)
+{
+	unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
+	unsigned long start, end;
+	struct vm_struct *vm;
+	size_t alloc_size;
+	int err = -ENOMEM;
+	void *p;
+
+	alloc_size = round_up(size, PMD_SIZE);
+	p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
+	if (!p)
+		return err;
+
+	vm = find_vm_area(p);
+	if (!vm)
+		goto err_free_mem;
+
+	/* fill memory with instructions that will trap */
+	execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
+
+	start = (unsigned long)p;
+	end = start + alloc_size;
+
+	vunmap_range(start, end);
+
+	err = execmem_set_direct_map_valid(vm, false);
+	if (err)
+		goto err_free_mem;
+
+	err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages,
+				       PMD_SHIFT);
+	if (err)
+		goto err_free_mem;
+
+	err = execmem_cache_add(p, alloc_size);
+	if (err)
+		goto err_free_mem;
+
+	return 0;
+
+err_free_mem:
+	vfree(p);
+	return err;
+}
+
+static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
+{
+	void *p;
+	int err;
+
+	p = __execmem_cache_alloc(range, size);
+	if (p)
+		return p;
+
+	err = execmem_cache_populate(range, size);
+	if (err)
+		return NULL;
+
+	return __execmem_cache_alloc(range, size);
+}
+
+static bool execmem_cache_free(void *ptr)
+{
+	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+	struct mutex *mutex = &execmem_cache.mutex;
+	unsigned long addr = (unsigned long)ptr;
+	MA_STATE(mas, busy_areas, addr, addr);
+	size_t size;
+	void *area;
+
+	mutex_lock(mutex);
+	area = mas_walk(&mas);
+	if (!area) {
+		mutex_unlock(mutex);
+		return false;
+	}
+	size = mas_range_len(&mas);
+
+	mas_store_gfp(&mas, NULL, GFP_KERNEL);
+	mutex_unlock(mutex);
+
+	execmem_fill_trapping_insns(ptr, size, /* writable = */ false);
+
+	execmem_cache_add(ptr, size);
+
+	schedule_work(&execmem_cache_clean_work);
+
+	return true;
+}
+#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
+static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
+{
+	return NULL;
+}
+
+static bool execmem_cache_free(void *ptr)
+{
+	return false;
+}
+#endif /* CONFIG_ARCH_HAS_EXECMEM_ROX */
 
 void *execmem_alloc(enum execmem_type type, size_t size)
 {
	struct execmem_range *range = &execmem_info->ranges[type];
+	bool use_cache = range->flags & EXECMEM_ROX_CACHE;
+	unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
+	pgprot_t pgprot = range->pgprot;
+	void *p;
 
-	return __execmem_alloc(range, size);
+	if (use_cache)
+		p = execmem_cache_alloc(range, size);
+	else
+		p = execmem_vmalloc(range, size, pgprot, vm_flags);
+
+	return kasan_reset_tag(p);
 }
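With the dispatch above in place, an architecture opts a range into the
cache simply by setting EXECMEM_ROX_CACHE in its execmem_arch_setup(). A
sketch of such a setup follows; the ranges, pgprots and alignments are
illustrative placeholders, not values from this series:

	static struct execmem_info execmem_info __ro_after_init;

	struct execmem_info __init *execmem_arch_setup(void)
	{
		execmem_info = (struct execmem_info){
			.ranges = {
				[EXECMEM_DEFAULT] = {
					.start	   = MODULES_VADDR,
					.end	   = MODULES_END,
					.pgprot	   = PAGE_KERNEL_EXEC,
					.alignment = 1,
				},
				[EXECMEM_MODULE_TEXT] = {
					.flags	   = EXECMEM_ROX_CACHE,
					.start	   = MODULES_VADDR,
					.end	   = MODULES_END,
					.pgprot	   = PAGE_KERNEL_ROX,
					.alignment = PMD_SIZE,
				},
			},
		};

		return &execmem_info;
	}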
 
 void execmem_free(void *ptr)
@@ -67,7 +363,9 @@ void execmem_free(void *ptr)
	 * supported by vmalloc.
	 */
	WARN_ON(in_interrupt());
-	vfree(ptr);
+
+	if (!execmem_cache_free(ptr))
+		vfree(ptr);
 }
 
 void *execmem_update_copy(void *dst, const void *src, size_t size)
@@ -89,6 +387,17 @@ static bool execmem_validate(struct execmem_info *info)
		return false;
	}
 
+	if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX)) {
+		for (int i = EXECMEM_DEFAULT; i < EXECMEM_TYPE_MAX; i++) {
+			r = &info->ranges[i];
+
+			if (r->flags & EXECMEM_ROX_CACHE) {
+				pr_warn_once("ROX cache is not supported\n");
+				r->flags &= ~EXECMEM_ROX_CACHE;
+			}
+		}
+	}
+
	return true;
 }
 
diff --git a/mm/internal.h b/mm/internal.h
index c743c2b21dba..3dc745ba76dd 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1234,6 +1234,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
 void __init vmalloc_init(void);
 int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                 pgprot_t prot, struct page **pages, unsigned int page_shift);
+unsigned int get_vm_area_page_order(struct vm_struct *vm);
 #else
 static inline void vmalloc_init(void)
 {
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5c0ea4e2b17d..74c0a5eae210 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3023,6 +3023,11 @@ static inline unsigned int vm_area_page_order(struct vm_struct *vm)
 #endif
 }
 
+unsigned int get_vm_area_page_order(struct vm_struct *vm)
+{
+	return vm_area_page_order(vm);
+}
+
 static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
 {
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
--
cgit v1.2.3
From 0f9b685626daa2f8e19a9788625c9b624c223e45 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan
Date: Wed, 23 Oct 2024 10:07:57 -0700
Subject: alloc_tag: populate memory for module tags as needed

The memory reserved for module tags does not need to be backed by physical
pages until there are tags to store there. Change the way we reserve this
memory to allocate only the virtual area for the tags, and populate it
with physical pages as needed when a module is loaded.

[surenb@google.com: avoid execmem_vmap() when !MMU]
Link: https://lkml.kernel.org/r/20241031233611.3833002-1-surenb@google.com
Link: https://lkml.kernel.org/r/20241023170759.999909-5-surenb@google.com
Signed-off-by: Suren Baghdasaryan
Reviewed-by: Pasha Tatashin
Cc: Ard Biesheuvel
Cc: Arnd Bergmann
Cc: Borislav Petkov (AMD)
Cc: Christoph Hellwig
Cc: Daniel Gomez
Cc: David Hildenbrand
Cc: Davidlohr Bueso
Cc: David Rientjes
Cc: Dennis Zhou
Cc: Johannes Weiner
Cc: John Hubbard
Cc: Jonathan Corbet
Cc: Joonsoo Kim
Cc: Kalesh Singh
Cc: Kees Cook
Cc: Kent Overstreet
Cc: Liam R. Howlett
Cc: Luis Chamberlain
Cc: Matthew Wilcox
Cc: Michal Hocko
Cc: Mike Rapoport (Microsoft)
Cc: Minchan Kim
Cc: Paul E. McKenney
Cc: Petr Pavlu
Cc: Roman Gushchin
Cc: Sami Tolvanen
Cc: Sourav Panda
Cc: Steven Rostedt (Google)
Cc: Thomas Gleixner
Cc: Thomas Huth
Cc: Uladzislau Rezki (Sony)
Cc: Vlastimil Babka
Cc: Xiongwei Song
Cc: Yu Zhao
Signed-off-by: Andrew Morton
---
 include/linux/execmem.h | 12 ++++++++
 include/linux/vmalloc.h |  3 ++
 lib/Kconfig.debug       |  1 +
 lib/alloc_tag.c         | 73 +++++++++++++++++++++++++++++++++++++++++++------
 mm/execmem.c            | 16 +++++++++++
 mm/internal.h           |  6 ++++
 mm/vmalloc.c            |  4 +--
 7 files changed, 104 insertions(+), 11 deletions(-)
(limited to 'include/linux/execmem.h')

diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index 1517fa196bf7..64130ae19690 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -139,6 +139,18 @@ void *execmem_alloc(enum execmem_type type, size_t size);
  */
 void execmem_free(void *ptr);
 
+#ifdef CONFIG_MMU
+/**
+ * execmem_vmap - create virtual mapping for EXECMEM_MODULE_DATA memory
+ * @size: size of the virtual mapping in bytes
+ *
+ * Maps a virtually contiguous area in the range suitable for EXECMEM_MODULE_DATA.
+ *
+ * Return: the area descriptor on success or %NULL on failure.
+ */
+struct vm_struct *execmem_vmap(size_t size);
+#endif
+
 /**
  * execmem_update_copy - copy an update to executable memory
  * @dst: destination address to update
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 27408f21e501..31e9ffd936e3 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -202,6 +202,9 @@ extern int remap_vmalloc_range_partial(struct vm_area_struct *vma,
 extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
							unsigned long pgoff);
 
+int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot,
+		     struct page **pages, unsigned int page_shift);
+
 /*
  * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
  * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings()
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 7312ae7c3cc5..6798bbbcbd32 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -993,6 +993,7 @@ config CODE_TAGGING
 config MEM_ALLOC_PROFILING
	bool "Enable memory allocation profiling"
	default n
+	depends on MMU
	depends on PROC_FS
	depends on !DEBUG_FORCE_WEAK_PER_CPU
	select CODE_TAGGING
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 5f9cd1642d58..4a7fc081b789 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -8,14 +8,15 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_buf.h>
 #include <linux/seq_file.h>
+#include <linux/vmalloc.h>
 
 #define ALLOCINFO_FILE_NAME		"allocinfo"
 #define MODULE_ALLOC_TAG_VMAP_SIZE	(100000UL * sizeof(struct alloc_tag))
 
 #ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
-static bool mem_profiling_support __meminitdata = true;
+static bool mem_profiling_support = true;
 #else
-static bool mem_profiling_support __meminitdata;
+static bool mem_profiling_support;
 #endif
 
 static struct codetag_type *alloc_tag_cttype;
@@ -154,7 +155,7 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl
	return nr;
 }
 
-static void __init shutdown_mem_profiling(void)
+static void shutdown_mem_profiling(void)
 {
	if (mem_alloc_profiling_enabled())
		static_branch_disable(&mem_alloc_profiling_key);
@@ -179,6 +180,7 @@ static void __init procfs_init(void)
 
 #ifdef CONFIG_MODULES
 static struct maple_tree mod_area_mt = MTREE_INIT(mod_area_mt, MT_FLAGS_ALLOC_RANGE);
+static struct vm_struct *vm_module_tags;
 /* A dummy object used to indicate an unloaded module */
 static struct module unloaded_mod;
 /* A dummy object used to indicate a module prepended area */
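For scale: MODULE_ALLOC_TAG_VMAP_SIZE reserves room for 100000 struct
alloc_tag entries, which works out to a few megabytes of virtual address
space on a 64-bit kernel (the exact figure depends on sizeof(struct
alloc_tag)). After this change, a freshly booted kernel with no modules
loaded consumes no physical pages for that reservation at all.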
@@ -252,6 +254,33 @@ repeat:
	return false;
 }
 
+static int vm_module_tags_populate(void)
+{
+	unsigned long phys_size = vm_module_tags->nr_pages << PAGE_SHIFT;
+
+	if (phys_size < module_tags.size) {
+		struct page **next_page = vm_module_tags->pages + vm_module_tags->nr_pages;
+		unsigned long addr = module_tags.start_addr + phys_size;
+		unsigned long more_pages;
+		unsigned long nr;
+
+		more_pages = ALIGN(module_tags.size - phys_size, PAGE_SIZE) >> PAGE_SHIFT;
+		nr = alloc_pages_bulk_array_node(GFP_KERNEL | __GFP_NOWARN,
+						 NUMA_NO_NODE, more_pages, next_page);
+		if (nr < more_pages ||
+		    vmap_pages_range(addr, addr + (nr << PAGE_SHIFT), PAGE_KERNEL,
+				     next_page, PAGE_SHIFT) < 0) {
+			/* Clean up and error out */
+			for (int i = 0; i < nr; i++)
+				__free_page(next_page[i]);
+			return -ENOMEM;
+		}
+		vm_module_tags->nr_pages += nr;
+	}
+
+	return 0;
+}
+
 static void *reserve_module_tags(struct module *mod, unsigned long size,
				 unsigned int prepend, unsigned long align)
 {
@@ -310,8 +339,18 @@ unlock:
	if (IS_ERR(ret))
		return ret;
 
-	if (module_tags.size < offset + size)
+	if (module_tags.size < offset + size) {
+		int grow_res;
+
		module_tags.size = offset + size;
+		grow_res = vm_module_tags_populate();
+		if (grow_res) {
+			shutdown_mem_profiling();
+			pr_err("Failed to allocate memory for allocation tags in the module %s. Memory allocation profiling is disabled!\n",
+			       mod->name);
+			return ERR_PTR(grow_res);
+		}
+	}
 
	return (struct alloc_tag *)(module_tags.start_addr + offset);
 }
@@ -372,12 +411,23 @@ static void replace_module(struct module *mod, struct module *new_mod)
 
 static int __init alloc_mod_tags_mem(void)
 {
-	/* Allocate space to copy allocation tags */
-	module_tags.start_addr = (unsigned long)execmem_alloc(EXECMEM_MODULE_DATA,
-							      MODULE_ALLOC_TAG_VMAP_SIZE);
-	if (!module_tags.start_addr)
+	/* Map space to copy allocation tags */
+	vm_module_tags = execmem_vmap(MODULE_ALLOC_TAG_VMAP_SIZE);
+	if (!vm_module_tags) {
+		pr_err("Failed to map %lu bytes for module allocation tags\n",
+		       MODULE_ALLOC_TAG_VMAP_SIZE);
+		module_tags.start_addr = 0;
		return -ENOMEM;
+	}
 
+	vm_module_tags->pages = kmalloc_array(get_vm_area_size(vm_module_tags) >> PAGE_SHIFT,
+					      sizeof(struct page *), GFP_KERNEL | __GFP_ZERO);
+	if (!vm_module_tags->pages) {
+		free_vm_area(vm_module_tags);
+		return -ENOMEM;
+	}
+
+	module_tags.start_addr = (unsigned long)vm_module_tags->addr;
	module_tags.end_addr = module_tags.start_addr + MODULE_ALLOC_TAG_VMAP_SIZE;
 
	return 0;
@@ -385,8 +435,13 @@ static int __init alloc_mod_tags_mem(void)
 
 static void __init free_mod_tags_mem(void)
 {
-	execmem_free((void *)module_tags.start_addr);
+	int i;
+
	module_tags.start_addr = 0;
+	for (i = 0; i < vm_module_tags->nr_pages; i++)
+		__free_page(vm_module_tags->pages[i]);
+	kfree(vm_module_tags->pages);
+	free_vm_area(vm_module_tags);
 }
 
 #else /* CONFIG_MODULES */
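A worked example of the populate path above, assuming 4K pages: if tags
from a newly loaded module push module_tags.size from 16K to 40K while four
pages are currently mapped, phys_size is 16K, more_pages works out to
(40K - 16K) / 4K = 6, and six pages are bulk-allocated and mapped at
start_addr + 16K, bringing nr_pages to ten. If either the bulk allocation
or the mapping fails, the just-allocated pages are freed and the caller
shuts memory profiling down.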
diff --git a/mm/execmem.c b/mm/execmem.c
index 576a57e2161f..317b6a8d35be 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -64,6 +64,22 @@ static void *execmem_vmalloc(struct execmem_range *range, size_t size,
 
	return p;
 }
+
+struct vm_struct *execmem_vmap(size_t size)
+{
+	struct execmem_range *range = &execmem_info->ranges[EXECMEM_MODULE_DATA];
+	struct vm_struct *area;
+
+	area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC,
+				  range->start, range->end, NUMA_NO_NODE,
+				  GFP_KERNEL, __builtin_return_address(0));
+	if (!area && range->fallback_start)
+		area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC,
+					  range->fallback_start, range->fallback_end,
+					  NUMA_NO_NODE, GFP_KERNEL,
+					  __builtin_return_address(0));
+
+	return area;
+}
 #else
 static void *execmem_vmalloc(struct execmem_range *range, size_t size,
			     pgprot_t pgprot, unsigned long vm_flags)
diff --git a/mm/internal.h b/mm/internal.h
index 3dc745ba76dd..cd96848be245 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1263,6 +1263,12 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
 void free_zone_device_folio(struct folio *folio);
 int migrate_device_coherent_folio(struct folio *folio);
 
+struct vm_struct *__get_vm_area_node(unsigned long size,
+				     unsigned long align, unsigned long shift,
+				     unsigned long flags, unsigned long start,
+				     unsigned long end, int node, gfp_t gfp_mask,
+				     const void *caller);
+
 /*
  * mm/gup.c
  */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 74c0a5eae210..7ed39d104201 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -653,7 +653,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-static int vmap_pages_range(unsigned long addr, unsigned long end,
+int vmap_pages_range(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift)
 {
	int err;
@@ -3106,7 +3106,7 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
	vm->flags &= ~VM_UNINITIALIZED;
 }
 
-static struct vm_struct *__get_vm_area_node(unsigned long size,
+struct vm_struct *__get_vm_area_node(unsigned long size,
		unsigned long align, unsigned long shift, unsigned long flags,
		unsigned long start, unsigned long end, int node,
		gfp_t gfp_mask, const void *caller)
--
cgit v1.2.3
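Pairing execmem_vmap() with the now-exported vmap_pages_range() gives any
caller the same reserve-now, populate-later pattern. A condensed sketch,
mirroring alloc_mod_tags_mem() and vm_module_tags_populate() above (error
handling trimmed, names and sizes illustrative):

	static struct vm_struct *area;

	static int reserve_lazy_buffer(size_t size)
	{
		area = execmem_vmap(size);	/* virtual space only */
		if (!area)
			return -ENOMEM;

		area->pages = kcalloc(size >> PAGE_SHIFT,
				      sizeof(struct page *), GFP_KERNEL);
		if (!area->pages) {
			free_vm_area(area);
			return -ENOMEM;
		}
		return 0;	/* no physical pages consumed yet */
	}

	static int back_first_page(void)
	{
		unsigned long addr = (unsigned long)area->addr;

		area->pages[0] = alloc_page(GFP_KERNEL);
		if (!area->pages[0])
			return -ENOMEM;

		area->nr_pages = 1;
		/* map just one page; the rest stays unbacked for now */
		return vmap_pages_range(addr, addr + PAGE_SIZE, PAGE_KERNEL,
					area->pages, PAGE_SHIFT);
	}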