diff options
Diffstat (limited to 'mm/vmalloc.c')
| -rw-r--r-- | mm/vmalloc.c | 130 |
1 files changed, 107 insertions, 23 deletions
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bb6ae08d18f5..1afca3568b9b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3416,6 +3416,32 @@ void vfree_atomic(const void *addr) schedule_work(&p->wq); } +/* + * vm_area_free_pages - free a range of pages from a vmalloc allocation + * @vm: the vm_struct containing the pages + * @start_idx: first page index to free (inclusive) + * @end_idx: last page index to free (exclusive) + * + * Free pages [start_idx, end_idx) updating NR_VMALLOC stat accounting. + * Freed vm->pages[] entries are set to NULL. + * Caller is responsible for unmapping (vunmap_range) and KASAN + * poisoning before calling this. + */ +static void vm_area_free_pages(struct vm_struct *vm, unsigned int start_idx, + unsigned int end_idx) +{ + unsigned int i; + + if (!(vm->flags & VM_MAP_PUT_PAGES)) { + for (i = start_idx; i < end_idx; i++) + mod_lruvec_page_state(vm->pages[i], NR_VMALLOC, -1); + } + free_pages_bulk(vm->pages + start_idx, end_idx - start_idx); + + for (i = start_idx; i < end_idx; i++) + vm->pages[i] = NULL; +} + /** * vfree - Release memory allocated by vmalloc() * @addr: Memory base address @@ -3436,7 +3462,6 @@ void vfree_atomic(const void *addr) void vfree(const void *addr) { struct vm_struct *vm; - int i; if (unlikely(in_interrupt())) { vfree_atomic(addr); @@ -3459,19 +3484,8 @@ void vfree(const void *addr) if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) vm_reset_perms(vm); - for (i = 0; i < vm->nr_pages; i++) { - struct page *page = vm->pages[i]; - BUG_ON(!page); - /* - * High-order allocs for huge vmallocs are split, so - * can be freed as an array of order-0 allocations - */ - if (!(vm->flags & VM_MAP_PUT_PAGES)) - mod_lruvec_page_state(page, NR_VMALLOC, -1); - __free_page(page); - cond_resched(); - } + vm_area_free_pages(vm, 0, vm->nr_pages); kvfree(vm->pages); kfree(vm); } @@ -3939,7 +3953,7 @@ fail: __GFP_NOFAIL | __GFP_ZERO |\ __GFP_NORETRY | __GFP_RETRY_MAYFAIL |\ GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\ - GFP_USER | __GFP_NOLOCKDEP) + GFP_USER | __GFP_NOLOCKDEP | __GFP_SKIP_KASAN) static gfp_t vmalloc_fix_flags(gfp_t flags) { @@ -3980,6 +3994,9 @@ static gfp_t vmalloc_fix_flags(gfp_t flags) * * %__GFP_NOWARN can be used to suppress failure messages. * + * %__GFP_SKIP_KASAN can be used to skip unpoisoning of mapped pages + * (when prot=%PAGE_KERNEL). + * * Can not be called from interrupt nor NMI contexts. * Return: the address of the area or %NULL on failure */ @@ -3993,6 +4010,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; unsigned long original_align = align; unsigned int shift = PAGE_SHIFT; + bool skip_vmalloc_kasan = kasan_hw_tags_enabled() && (gfp_mask & __GFP_SKIP_KASAN); if (WARN_ON_ONCE(!size)) return NULL; @@ -4023,7 +4041,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, again: area = __get_vm_area_node(size, align, shift, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, - gfp_mask, caller); + gfp_mask & ~__GFP_SKIP_KASAN, caller); if (!area) { bool nofail = gfp_mask & __GFP_NOFAIL; warn_alloc(gfp_mask, NULL, @@ -4041,7 +4059,7 @@ again: * kasan_unpoison_vmalloc(). */ if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { - if (kasan_hw_tags_enabled()) { + if (kasan_hw_tags_enabled() && !skip_vmalloc_kasan) { /* * Modify protection bits to allow tagging. * This must be done before mapping. @@ -4078,7 +4096,8 @@ again: (gfp_mask & __GFP_SKIP_ZERO)) kasan_flags |= KASAN_VMALLOC_INIT; /* KASAN_VMALLOC_PROT_NORMAL already set if required. */ - area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags); + if (!skip_vmalloc_kasan) + area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags); /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED @@ -4324,16 +4343,70 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && nid != page_to_nid(vmalloc_to_page(p))) goto need_realloc; + } else { + /* + * If p is NULL, vrealloc behaves exactly like vmalloc. + * Skip the shrink and in-place grow paths. + */ + goto need_realloc; } - /* - * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What - * would be a good heuristic for when to shrink the vm_area? - */ if (size <= old_size) { + unsigned int new_nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; + /* Zero out "freed" memory, potentially for future realloc. */ if (want_init_on_free() || want_init_on_alloc(flags)) memset((void *)p + size, 0, old_size - size); + + /* + * Free tail pages when shrink crosses a page boundary. + * + * Skip huge page allocations (page_order > 0) as partial + * freeing would require splitting. + * + * Skip VM_FLUSH_RESET_PERMS, as direct-map permissions must + * be reset before pages are returned to the allocator. + * + * Skip VM_USERMAP, as remap_vmalloc_range_partial() validates + * mapping requests against the unchanged vm->size; freeing + * tail pages would cause vmalloc_to_page() to return NULL for + * the unmapped range. + * + * Skip if either GFP_NOFS or GFP_NOIO are used. + * kmemleak_free_part() internally allocates with + * GFP_KERNEL, which could trigger a recursive deadlock + * if we are under filesystem or I/O reclaim. + */ + if (new_nr_pages < vm->nr_pages && !vm_area_page_order(vm) && + !(vm->flags & (VM_FLUSH_RESET_PERMS | VM_USERMAP)) && + gfp_has_io_fs(flags)) { + unsigned long addr = (unsigned long)kasan_reset_tag(p); + unsigned int old_nr_pages = vm->nr_pages; + + /* + * Use the node lock to synchronize with concurrent + * readers (vmalloc_info_show). + */ + struct vmap_node *vn = addr_to_node(addr); + + spin_lock(&vn->busy.lock); + vm->nr_pages = new_nr_pages; + spin_unlock(&vn->busy.lock); + + /* Notify kmemleak of the reduced allocation size before unmapping. */ + kmemleak_free_part( + (void *)addr + ((unsigned long)new_nr_pages + << PAGE_SHIFT), + (unsigned long)(old_nr_pages - new_nr_pages) + << PAGE_SHIFT); + + vunmap_range(addr + ((unsigned long)new_nr_pages + << PAGE_SHIFT), + addr + ((unsigned long)old_nr_pages + << PAGE_SHIFT)); + + vm_area_free_pages(vm, new_nr_pages, old_nr_pages); + } vm->requested_size = size; kasan_vrealloc(p, old_size, size); return (void *)p; @@ -4342,7 +4415,7 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align /* * We already have the bytes available in the allocation; use them. */ - if (size <= alloced_size) { + if (size <= vm->nr_pages << PAGE_SHIFT) { /* * No need to zero memory here, as unused memory will have * already been zeroed at initial allocation time or during @@ -4641,7 +4714,18 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) smp_rmb(); vaddr = (char *) va->va_start; - size = vm ? get_vm_area_size(vm) : va_size(va); + if (vm) + /* + * For VM_ALLOC areas, use nr_pages rather than + * get_vm_area_size() because vrealloc() may shrink + * the mapping without updating area->size. Other + * mapping types (vmap, ioremap) don't set nr_pages. + */ + size = (vm->flags & VM_ALLOC && vm->nr_pages) ? + (vm->nr_pages << PAGE_SHIFT) : + get_vm_area_size(vm); + else + size = va_size(va); if (addr >= vaddr + size) goto next_va; |
