summaryrefslogtreecommitdiff
path: root/mm/nommu.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-10-11 03:53:04 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2022-10-11 03:53:04 +0300
commit27bc50fc90647bbf7b734c3fc306a5e61350da53 (patch)
tree75fc525fbfec8c07a97a7875a89592317bcad4ca /mm/nommu.c
parent70442fc54e6889a2a77f0e9554e8188a1557f00e (diff)
parentbbff39cc6cbcb86ccfacb2dcafc79912a9f9df69 (diff)
downloadlinux-27bc50fc90647bbf7b734c3fc306a5e61350da53.tar.xz
Merge tag 'mm-stable-2022-10-08' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: - Yu Zhao's Multi-Gen LRU patches are here. They've been under test in linux-next for a couple of months without, to my knowledge, any negative reports (or any positive ones, come to that). - Also the Maple Tree from Liam Howlett. An overlapping range-based tree for vmas. It it apparently slightly more efficient in its own right, but is mainly targeted at enabling work to reduce mmap_lock contention. Liam has identified a number of other tree users in the kernel which could be beneficially onverted to mapletrees. Yu Zhao has identified a hard-to-hit but "easy to fix" lockdep splat at [1]. This has yet to be addressed due to Liam's unfortunately timed vacation. He is now back and we'll get this fixed up. - Dmitry Vyukov introduces KMSAN: the Kernel Memory Sanitizer. It uses clang-generated instrumentation to detect used-unintialized bugs down to the single bit level. KMSAN keeps finding bugs. New ones, as well as the legacy ones. - Yang Shi adds a userspace mechanism (madvise) to induce a collapse of memory into THPs. - Zach O'Keefe has expanded Yang Shi's madvise(MADV_COLLAPSE) to support file/shmem-backed pages. - userfaultfd updates from Axel Rasmussen - zsmalloc cleanups from Alexey Romanov - cleanups from Miaohe Lin: vmscan, hugetlb_cgroup, hugetlb and memory-failure - Huang Ying adds enhancements to NUMA balancing memory tiering mode's page promotion, with a new way of detecting hot pages. - memcg updates from Shakeel Butt: charging optimizations and reduced memory consumption. - memcg cleanups from Kairui Song. - memcg fixes and cleanups from Johannes Weiner. - Vishal Moola provides more folio conversions - Zhang Yi removed ll_rw_block() :( - migration enhancements from Peter Xu - migration error-path bugfixes from Huang Ying - Aneesh Kumar added ability for a device driver to alter the memory tiering promotion paths. For optimizations by PMEM drivers, DRM drivers, etc. - vma merging improvements from Jakub Matěn. - NUMA hinting cleanups from David Hildenbrand. - xu xin added aditional userspace visibility into KSM merging activity. - THP & KSM code consolidation from Qi Zheng. - more folio work from Matthew Wilcox. - KASAN updates from Andrey Konovalov. - DAMON cleanups from Kaixu Xia. - DAMON work from SeongJae Park: fixes, cleanups. - hugetlb sysfs cleanups from Muchun Song. - Mike Kravetz fixes locking issues in hugetlbfs and in hugetlb core. Link: https://lkml.kernel.org/r/CAOUHufZabH85CeUN-MEMgL8gJGzJEWUrkiM58JkTbBhh-jew0Q@mail.gmail.com [1] * tag 'mm-stable-2022-10-08' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (555 commits) hugetlb: allocate vma lock for all sharable vmas hugetlb: take hugetlb vma_lock when clearing vma_lock->vma pointer hugetlb: fix vma lock handling during split vma and range unmapping mglru: mm/vmscan.c: fix imprecise comments mm/mglru: don't sync disk for each aging cycle mm: memcontrol: drop dead CONFIG_MEMCG_SWAP config symbol mm: memcontrol: use do_memsw_account() in a few more places mm: memcontrol: deprecate swapaccounting=0 mode mm: memcontrol: don't allocate cgroup swap arrays when memcg is disabled mm/secretmem: remove reduntant return value mm/hugetlb: add available_huge_pages() func mm: remove unused inline functions from include/linux/mm_inline.h selftests/vm: add selftest for MADV_COLLAPSE of uffd-minor memory selftests/vm: add file/shmem MADV_COLLAPSE selftest for cleared pmd selftests/vm: add thp collapse shmem testing selftests/vm: add thp collapse file and tmpfs testing selftests/vm: modularize thp collapse memory operations selftests/vm: dedup THP helpers mm/khugepaged: add tracepoint to hpage_collapse_scan_file() mm/madvise: add file and shmem support to MADV_COLLAPSE ...
Diffstat (limited to 'mm/nommu.c')
-rw-r--r--mm/nommu.c260
1 files changed, 141 insertions, 119 deletions
diff --git a/mm/nommu.c b/mm/nommu.c
index e819cbc21b39..214c70e1d059 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -19,7 +19,6 @@
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
-#include <linux/vmacache.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/file.h>
@@ -545,26 +544,27 @@ static void put_nommu_region(struct vm_region *region)
__put_nommu_region(region);
}
-/*
- * add a VMA into a process's mm_struct in the appropriate place in the list
- * and tree and add to the address space's page tree also if not an anonymous
- * page
- * - should be called with mm->mmap_lock held writelocked
- */
-static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
+void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas)
{
- struct vm_area_struct *pvma, *prev;
- struct address_space *mapping;
- struct rb_node **p, *parent, *rb_prev;
+ mas_set_range(mas, vma->vm_start, vma->vm_end - 1);
+ mas_store_prealloc(mas, vma);
+}
- BUG_ON(!vma->vm_region);
+void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas)
+{
+ mas->index = vma->vm_start;
+ mas->last = vma->vm_end - 1;
+ mas_store_prealloc(mas, NULL);
+}
+static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm)
+{
mm->map_count++;
vma->vm_mm = mm;
/* add the VMA to the mapping */
if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
+ struct address_space *mapping = vma->vm_file->f_mapping;
i_mmap_lock_write(mapping);
flush_dcache_mmap_lock(mapping);
@@ -572,67 +572,51 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
flush_dcache_mmap_unlock(mapping);
i_mmap_unlock_write(mapping);
}
+}
- /* add the VMA to the tree */
- parent = rb_prev = NULL;
- p = &mm->mm_rb.rb_node;
- while (*p) {
- parent = *p;
- pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
-
- /* sort by: start addr, end addr, VMA struct addr in that order
- * (the latter is necessary as we may get identical VMAs) */
- if (vma->vm_start < pvma->vm_start)
- p = &(*p)->rb_left;
- else if (vma->vm_start > pvma->vm_start) {
- rb_prev = parent;
- p = &(*p)->rb_right;
- } else if (vma->vm_end < pvma->vm_end)
- p = &(*p)->rb_left;
- else if (vma->vm_end > pvma->vm_end) {
- rb_prev = parent;
- p = &(*p)->rb_right;
- } else if (vma < pvma)
- p = &(*p)->rb_left;
- else if (vma > pvma) {
- rb_prev = parent;
- p = &(*p)->rb_right;
- } else
- BUG();
- }
-
- rb_link_node(&vma->vm_rb, parent, p);
- rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+/*
+ * mas_add_vma_to_mm() - Maple state variant of add_mas_to_mm().
+ * @mas: The maple state with preallocations.
+ * @mm: The mm_struct
+ * @vma: The vma to add
+ *
+ */
+static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+ BUG_ON(!vma->vm_region);
- /* add VMA to the VMA list also */
- prev = NULL;
- if (rb_prev)
- prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
+ setup_vma_to_mm(vma, mm);
- __vma_link_list(mm, vma, prev);
+ /* add the VMA to the tree */
+ vma_mas_store(vma, mas);
}
/*
- * delete a VMA from its owning mm_struct and address space
+ * add a VMA into a process's mm_struct in the appropriate place in the list
+ * and tree and add to the address space's page tree also if not an anonymous
+ * page
+ * - should be called with mm->mmap_lock held writelocked
*/
-static void delete_vma_from_mm(struct vm_area_struct *vma)
-{
- int i;
- struct address_space *mapping;
- struct mm_struct *mm = vma->vm_mm;
- struct task_struct *curr = current;
-
- mm->map_count--;
- for (i = 0; i < VMACACHE_SIZE; i++) {
- /* if the vma is cached, invalidate the entire cache */
- if (curr->vmacache.vmas[i] == vma) {
- vmacache_invalidate(mm);
- break;
- }
+static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end);
+
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ return -ENOMEM;
}
+ mas_add_vma_to_mm(&mas, mm, vma);
+ return 0;
+}
+static void cleanup_vma_from_mm(struct vm_area_struct *vma)
+{
+ vma->vm_mm->map_count--;
/* remove the VMA from the mapping */
if (vma->vm_file) {
+ struct address_space *mapping;
mapping = vma->vm_file->f_mapping;
i_mmap_lock_write(mapping);
@@ -641,11 +625,24 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
flush_dcache_mmap_unlock(mapping);
i_mmap_unlock_write(mapping);
}
+}
+/*
+ * delete a VMA from its owning mm_struct and address space
+ */
+static int delete_vma_from_mm(struct vm_area_struct *vma)
+{
+ MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0);
- /* remove from the MM's tree and list */
- rb_erase(&vma->vm_rb, &mm->mm_rb);
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ return -ENOMEM;
+ }
+ cleanup_vma_from_mm(vma);
- __vma_unlink_list(mm, vma);
+ /* remove from the MM's tree and list */
+ vma_mas_remove(vma, &mas);
+ return 0;
}
/*
@@ -661,31 +658,26 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
vm_area_free(vma);
}
+struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
+ unsigned long start_addr,
+ unsigned long end_addr)
+{
+ unsigned long index = start_addr;
+
+ mmap_assert_locked(mm);
+ return mt_find(&mm->mm_mt, &index, end_addr - 1);
+}
+EXPORT_SYMBOL(find_vma_intersection);
+
/*
* look up the first VMA in which addr resides, NULL if none
* - should be called with mm->mmap_lock at least held readlocked
*/
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct *vma;
+ MA_STATE(mas, &mm->mm_mt, addr, addr);
- /* check the cache first */
- vma = vmacache_find(mm, addr);
- if (likely(vma))
- return vma;
-
- /* trawl the list (there may be multiple mappings in which addr
- * resides) */
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->vm_start > addr)
- return NULL;
- if (vma->vm_end > addr) {
- vmacache_update(addr, vma);
- return vma;
- }
- }
-
- return NULL;
+ return mas_walk(&mas);
}
EXPORT_SYMBOL(find_vma);
@@ -717,26 +709,17 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
{
struct vm_area_struct *vma;
unsigned long end = addr + len;
+ MA_STATE(mas, &mm->mm_mt, addr, addr);
- /* check the cache first */
- vma = vmacache_find_exact(mm, addr, end);
- if (vma)
- return vma;
-
- /* trawl the list (there may be multiple mappings in which addr
- * resides) */
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->vm_start < addr)
- continue;
- if (vma->vm_start > addr)
- return NULL;
- if (vma->vm_end == end) {
- vmacache_update(addr, vma);
- return vma;
- }
- }
+ vma = mas_walk(&mas);
+ if (!vma)
+ return NULL;
+ if (vma->vm_start != addr)
+ return NULL;
+ if (vma->vm_end != end)
+ return NULL;
- return NULL;
+ return vma;
}
/*
@@ -1069,6 +1052,7 @@ unsigned long do_mmap(struct file *file,
vm_flags_t vm_flags;
unsigned long capabilities, result;
int ret;
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
*populate = 0;
@@ -1087,6 +1071,7 @@ unsigned long do_mmap(struct file *file,
* now know into VMA flags */
vm_flags = determine_vm_flags(file, prot, flags, capabilities);
+
/* we're going to need to record the mapping */
region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
if (!region)
@@ -1096,6 +1081,9 @@ unsigned long do_mmap(struct file *file,
if (!vma)
goto error_getting_vma;
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ goto error_maple_preallocate;
+
region->vm_usage = 1;
region->vm_flags = vm_flags;
region->vm_pgoff = pgoff;
@@ -1236,7 +1224,7 @@ unsigned long do_mmap(struct file *file,
current->mm->total_vm += len >> PAGE_SHIFT;
share:
- add_vma_to_mm(current->mm, vma);
+ mas_add_vma_to_mm(&mas, current->mm, vma);
/* we flush the region from the icache only when the first executable
* mapping of it is made */
@@ -1262,6 +1250,7 @@ error:
sharing_violation:
up_write(&nommu_region_sem);
+ mas_destroy(&mas);
pr_warn("Attempt to share mismatched mappings\n");
ret = -EINVAL;
goto error;
@@ -1278,6 +1267,14 @@ error_getting_region:
len, current->pid);
show_free_areas(0, NULL);
return -ENOMEM;
+
+error_maple_preallocate:
+ kmem_cache_free(vm_region_jar, region);
+ vm_area_free(vma);
+ pr_warn("Allocation of vma tree for process %d failed\n", current->pid);
+ show_free_areas(0, NULL);
+ return -ENOMEM;
+
}
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
@@ -1343,6 +1340,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *new;
struct vm_region *region;
unsigned long npages;
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end);
/* we're only permitted to split anonymous regions (these should have
* only a single usage on the region) */
@@ -1357,9 +1355,13 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
return -ENOMEM;
new = vm_area_dup(vma);
- if (!new) {
- kmem_cache_free(vm_region_jar, region);
- return -ENOMEM;
+ if (!new)
+ goto err_vma_dup;
+
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ goto err_mas_preallocate;
}
/* most fields are the same, copy all, and then fixup */
@@ -1378,7 +1380,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
- delete_vma_from_mm(vma);
down_write(&nommu_region_sem);
delete_nommu_region(vma->vm_region);
if (new_below) {
@@ -1391,9 +1392,19 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
add_nommu_region(vma->vm_region);
add_nommu_region(new->vm_region);
up_write(&nommu_region_sem);
- add_vma_to_mm(mm, vma);
- add_vma_to_mm(mm, new);
+
+ setup_vma_to_mm(vma, mm);
+ setup_vma_to_mm(new, mm);
+ mas_set_range(&mas, vma->vm_start, vma->vm_end - 1);
+ mas_store(&mas, vma);
+ vma_mas_store(new, &mas);
return 0;
+
+err_mas_preallocate:
+ vm_area_free(new);
+err_vma_dup:
+ kmem_cache_free(vm_region_jar, region);
+ return -ENOMEM;
}
/*
@@ -1408,12 +1419,14 @@ static int shrink_vma(struct mm_struct *mm,
/* adjust the VMA's pointers, which may reposition it in the MM's tree
* and list */
- delete_vma_from_mm(vma);
+ if (delete_vma_from_mm(vma))
+ return -ENOMEM;
if (from > vma->vm_start)
vma->vm_end = from;
else
vma->vm_start = to;
- add_vma_to_mm(mm, vma);
+ if (add_vma_to_mm(mm, vma))
+ return -ENOMEM;
/* cut the backing region down to size */
region = vma->vm_region;
@@ -1441,9 +1454,10 @@ static int shrink_vma(struct mm_struct *mm,
*/
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf)
{
+ MA_STATE(mas, &mm->mm_mt, start, start);
struct vm_area_struct *vma;
unsigned long end;
- int ret;
+ int ret = 0;
len = PAGE_ALIGN(len);
if (len == 0)
@@ -1452,7 +1466,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
end = start + len;
/* find the first potentially overlapping VMA */
- vma = find_vma(mm, start);
+ vma = mas_find(&mas, end - 1);
if (!vma) {
static int limit;
if (limit < 5) {
@@ -1471,7 +1485,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
return -EINVAL;
if (end == vma->vm_end)
goto erase_whole_vma;
- vma = vma->vm_next;
+ vma = mas_next(&mas, end - 1);
} while (vma);
return -EINVAL;
} else {
@@ -1493,9 +1507,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
}
erase_whole_vma:
- delete_vma_from_mm(vma);
+ if (delete_vma_from_mm(vma))
+ ret = -ENOMEM;
delete_vma(mm, vma);
- return 0;
+ return ret;
}
int vm_munmap(unsigned long addr, size_t len)
@@ -1520,6 +1535,7 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
*/
void exit_mmap(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
if (!mm)
@@ -1527,12 +1543,18 @@ void exit_mmap(struct mm_struct *mm)
mm->total_vm = 0;
- while ((vma = mm->mmap)) {
- mm->mmap = vma->vm_next;
- delete_vma_from_mm(vma);
+ /*
+ * Lock the mm to avoid assert complaining even though this is the only
+ * user of the mm
+ */
+ mmap_write_lock(mm);
+ for_each_vma(vmi, vma) {
+ cleanup_vma_from_mm(vma);
delete_vma(mm, vma);
cond_resched();
}
+ __mt_destroy(&mm->mm_mt);
+ mmap_write_unlock(mm);
}
int vm_brk(unsigned long addr, unsigned long len)