summaryrefslogtreecommitdiff
path: root/mm/vmalloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmalloc.c')
-rw-r--r--mm/vmalloc.c130
1 files changed, 107 insertions, 23 deletions
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index bb6ae08d18f5..1afca3568b9b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3416,6 +3416,32 @@ void vfree_atomic(const void *addr)
schedule_work(&p->wq);
}
+/*
+ * vm_area_free_pages - free a range of pages from a vmalloc allocation
+ * @vm: the vm_struct containing the pages
+ * @start_idx: first page index to free (inclusive)
+ * @end_idx: last page index to free (exclusive)
+ *
+ * Free pages [start_idx, end_idx) updating NR_VMALLOC stat accounting.
+ * Freed vm->pages[] entries are set to NULL.
+ * Caller is responsible for unmapping (vunmap_range) and KASAN
+ * poisoning before calling this.
+ */
+static void vm_area_free_pages(struct vm_struct *vm, unsigned int start_idx,
+ unsigned int end_idx)
+{
+ unsigned int i;
+
+ if (!(vm->flags & VM_MAP_PUT_PAGES)) {
+ for (i = start_idx; i < end_idx; i++)
+ mod_lruvec_page_state(vm->pages[i], NR_VMALLOC, -1);
+ }
+ free_pages_bulk(vm->pages + start_idx, end_idx - start_idx);
+
+ for (i = start_idx; i < end_idx; i++)
+ vm->pages[i] = NULL;
+}
+
/**
* vfree - Release memory allocated by vmalloc()
* @addr: Memory base address
@@ -3436,7 +3462,6 @@ void vfree_atomic(const void *addr)
void vfree(const void *addr)
{
struct vm_struct *vm;
- int i;
if (unlikely(in_interrupt())) {
vfree_atomic(addr);
@@ -3459,19 +3484,8 @@ void vfree(const void *addr)
if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
vm_reset_perms(vm);
- for (i = 0; i < vm->nr_pages; i++) {
- struct page *page = vm->pages[i];
- BUG_ON(!page);
- /*
- * High-order allocs for huge vmallocs are split, so
- * can be freed as an array of order-0 allocations
- */
- if (!(vm->flags & VM_MAP_PUT_PAGES))
- mod_lruvec_page_state(page, NR_VMALLOC, -1);
- __free_page(page);
- cond_resched();
- }
+ vm_area_free_pages(vm, 0, vm->nr_pages);
kvfree(vm->pages);
kfree(vm);
}
@@ -3939,7 +3953,7 @@ fail:
__GFP_NOFAIL | __GFP_ZERO |\
__GFP_NORETRY | __GFP_RETRY_MAYFAIL |\
GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\
- GFP_USER | __GFP_NOLOCKDEP)
+ GFP_USER | __GFP_NOLOCKDEP | __GFP_SKIP_KASAN)
static gfp_t vmalloc_fix_flags(gfp_t flags)
{
@@ -3980,6 +3994,9 @@ static gfp_t vmalloc_fix_flags(gfp_t flags)
*
* %__GFP_NOWARN can be used to suppress failure messages.
*
+ * %__GFP_SKIP_KASAN can be used to skip unpoisoning of mapped pages
+ * (when prot=%PAGE_KERNEL).
+ *
* Can not be called from interrupt nor NMI contexts.
* Return: the address of the area or %NULL on failure
*/
@@ -3993,6 +4010,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
unsigned long original_align = align;
unsigned int shift = PAGE_SHIFT;
+ bool skip_vmalloc_kasan = kasan_hw_tags_enabled() && (gfp_mask & __GFP_SKIP_KASAN);
if (WARN_ON_ONCE(!size))
return NULL;
@@ -4023,7 +4041,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
again:
area = __get_vm_area_node(size, align, shift, VM_ALLOC |
VM_UNINITIALIZED | vm_flags, start, end, node,
- gfp_mask, caller);
+ gfp_mask & ~__GFP_SKIP_KASAN, caller);
if (!area) {
bool nofail = gfp_mask & __GFP_NOFAIL;
warn_alloc(gfp_mask, NULL,
@@ -4041,7 +4059,7 @@ again:
* kasan_unpoison_vmalloc().
*/
if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
- if (kasan_hw_tags_enabled()) {
+ if (kasan_hw_tags_enabled() && !skip_vmalloc_kasan) {
/*
* Modify protection bits to allow tagging.
* This must be done before mapping.
@@ -4078,7 +4096,8 @@ again:
(gfp_mask & __GFP_SKIP_ZERO))
kasan_flags |= KASAN_VMALLOC_INIT;
/* KASAN_VMALLOC_PROT_NORMAL already set if required. */
- area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags);
+ if (!skip_vmalloc_kasan)
+ area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags);
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -4324,16 +4343,70 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align
if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
nid != page_to_nid(vmalloc_to_page(p)))
goto need_realloc;
+ } else {
+ /*
+ * If p is NULL, vrealloc behaves exactly like vmalloc.
+ * Skip the shrink and in-place grow paths.
+ */
+ goto need_realloc;
}
- /*
- * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What
- * would be a good heuristic for when to shrink the vm_area?
- */
if (size <= old_size) {
+ unsigned int new_nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
/* Zero out "freed" memory, potentially for future realloc. */
if (want_init_on_free() || want_init_on_alloc(flags))
memset((void *)p + size, 0, old_size - size);
+
+ /*
+ * Free tail pages when shrink crosses a page boundary.
+ *
+ * Skip huge page allocations (page_order > 0) as partial
+ * freeing would require splitting.
+ *
+ * Skip VM_FLUSH_RESET_PERMS, as direct-map permissions must
+ * be reset before pages are returned to the allocator.
+ *
+ * Skip VM_USERMAP, as remap_vmalloc_range_partial() validates
+ * mapping requests against the unchanged vm->size; freeing
+ * tail pages would cause vmalloc_to_page() to return NULL for
+ * the unmapped range.
+ *
+ * Skip if either GFP_NOFS or GFP_NOIO are used.
+ * kmemleak_free_part() internally allocates with
+ * GFP_KERNEL, which could trigger a recursive deadlock
+ * if we are under filesystem or I/O reclaim.
+ */
+ if (new_nr_pages < vm->nr_pages && !vm_area_page_order(vm) &&
+ !(vm->flags & (VM_FLUSH_RESET_PERMS | VM_USERMAP)) &&
+ gfp_has_io_fs(flags)) {
+ unsigned long addr = (unsigned long)kasan_reset_tag(p);
+ unsigned int old_nr_pages = vm->nr_pages;
+
+ /*
+ * Use the node lock to synchronize with concurrent
+ * readers (vmalloc_info_show).
+ */
+ struct vmap_node *vn = addr_to_node(addr);
+
+ spin_lock(&vn->busy.lock);
+ vm->nr_pages = new_nr_pages;
+ spin_unlock(&vn->busy.lock);
+
+ /* Notify kmemleak of the reduced allocation size before unmapping. */
+ kmemleak_free_part(
+ (void *)addr + ((unsigned long)new_nr_pages
+ << PAGE_SHIFT),
+ (unsigned long)(old_nr_pages - new_nr_pages)
+ << PAGE_SHIFT);
+
+ vunmap_range(addr + ((unsigned long)new_nr_pages
+ << PAGE_SHIFT),
+ addr + ((unsigned long)old_nr_pages
+ << PAGE_SHIFT));
+
+ vm_area_free_pages(vm, new_nr_pages, old_nr_pages);
+ }
vm->requested_size = size;
kasan_vrealloc(p, old_size, size);
return (void *)p;
@@ -4342,7 +4415,7 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align
/*
* We already have the bytes available in the allocation; use them.
*/
- if (size <= alloced_size) {
+ if (size <= vm->nr_pages << PAGE_SHIFT) {
/*
* No need to zero memory here, as unused memory will have
* already been zeroed at initial allocation time or during
@@ -4641,7 +4714,18 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
smp_rmb();
vaddr = (char *) va->va_start;
- size = vm ? get_vm_area_size(vm) : va_size(va);
+ if (vm)
+ /*
+ * For VM_ALLOC areas, use nr_pages rather than
+ * get_vm_area_size() because vrealloc() may shrink
+ * the mapping without updating area->size. Other
+ * mapping types (vmap, ioremap) don't set nr_pages.
+ */
+ size = (vm->flags & VM_ALLOC && vm->nr_pages) ?
+ (vm->nr_pages << PAGE_SHIFT) :
+ get_vm_area_size(vm);
+ else
+ size = va_size(va);
if (addr >= vaddr + size)
goto next_va;