Diffstat (limited to 'mm')
-rw-r--r--   mm/Kconfig            |   3
-rw-r--r--   mm/gup_benchmark.c    |  15
-rw-r--r--   mm/madvise.c          | 125
-rw-r--r--   mm/memcontrol.c       |  75
-rw-r--r--   mm/memory-failure.c   |  18
-rw-r--r--   mm/memory.c           |  16
-rw-r--r--   mm/memory_hotplug.c   |  46
-rw-r--r--   mm/migrate.c          |  71
-rw-r--r--   mm/mmap.c             |  74
-rw-r--r--   mm/nommu.c            |   7
-rw-r--r--   mm/percpu.c           |   3
-rw-r--r--   mm/slab.h             |   3
-rw-r--r--   mm/vmalloc.c          | 147
-rw-r--r--   mm/zsmalloc.c         |  10
14 files changed, 394 insertions, 219 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index c7f30f8b282b..d42423f884a7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -816,6 +816,9 @@ config DEVICE_PRIVATE memory; i.e., memory that is only accessible from the device (or group of devices). You likely also want to select HMM_MIRROR. +config VMAP_PFN + bool + config FRAME_VECTOR bool diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 464cae1fa3ea..8b3e5b5cd8fa 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -72,6 +72,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd, int nr; struct page **pages; int ret = 0; + bool needs_mmap_lock = + cmd != GUP_FAST_BENCHMARK && cmd != PIN_FAST_BENCHMARK; if (gup->size > ULONG_MAX) return -EINVAL; @@ -81,6 +83,11 @@ static int __gup_benchmark_ioctl(unsigned int cmd, if (!pages) return -ENOMEM; + if (needs_mmap_lock && mmap_read_lock_killable(current->mm)) { + ret = -EINTR; + goto free_pages; + } + i = 0; nr = gup->nr_pages_per_call; start_time = ktime_get(); @@ -120,9 +127,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd, pages + i, NULL); break; default: - kvfree(pages); ret = -EINVAL; - goto out; + goto unlock; } if (nr <= 0) @@ -150,8 +156,11 @@ static int __gup_benchmark_ioctl(unsigned int cmd, end_time = ktime_get(); gup->put_delta_usec = ktime_us_delta(end_time, start_time); +unlock: + if (needs_mmap_lock) + mmap_read_unlock(current->mm); +free_pages: kvfree(pages); -out: return ret; } diff --git a/mm/madvise.c b/mm/madvise.c index fd1f448b4e1d..416a56b8e757 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -17,6 +17,8 @@ #include <linux/falloc.h> #include <linux/fadvise.h> #include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/uio.h> #include <linux/ksm.h> #include <linux/fs.h> #include <linux/file.h> @@ -27,7 +29,6 @@ #include <linux/swapops.h> #include <linux/shmem_fs.h> #include <linux/mmu_notifier.h> -#include <linux/sched/mm.h> #include <asm/tlb.h> @@ -258,6 +259,7 @@ static long madvise_willneed(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { + struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; loff_t offset; @@ -294,10 +296,10 @@ static long madvise_willneed(struct vm_area_struct *vma, get_file(file); offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - mmap_read_unlock(current->mm); + mmap_read_unlock(mm); vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); fput(file); - mmap_read_lock(current->mm); + mmap_read_lock(mm); return 0; } @@ -766,6 +768,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, unsigned long start, unsigned long end, int behavior) { + struct mm_struct *mm = vma->vm_mm; + *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; @@ -773,8 +777,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, if (!userfaultfd_remove(vma, start, end)) { *prev = NULL; /* mmap_lock has been dropped, prev is stale */ - mmap_read_lock(current->mm); - vma = find_vma(current->mm, start); + mmap_read_lock(mm); + vma = find_vma(mm, start); if (!vma) return -ENOMEM; if (start < vma->vm_start) { @@ -828,6 +832,7 @@ static long madvise_remove(struct vm_area_struct *vma, loff_t offset; int error; struct file *f; + struct mm_struct *mm = vma->vm_mm; *prev = NULL; /* tell sys_madvise we drop mmap_lock */ @@ -855,13 +860,13 @@ static long madvise_remove(struct vm_area_struct *vma, get_file(f); if (userfaultfd_remove(vma, start, end)) { /* mmap_lock was not released by userfaultfd_remove() */ - mmap_read_unlock(current->mm); + 
mmap_read_unlock(mm); } error = vfs_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - start); fput(f); - mmap_read_lock(current->mm); + mmap_read_lock(mm); return error; } @@ -984,6 +989,18 @@ madvise_behavior_valid(int behavior) } } +static bool +process_madvise_behavior_valid(int behavior) +{ + switch (behavior) { + case MADV_COLD: + case MADV_PAGEOUT: + return true; + default: + return false; + } +} + /* * The madvise(2) system call. * @@ -1031,6 +1048,11 @@ madvise_behavior_valid(int behavior) * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. + * MADV_COLD - the application is not expected to use this memory soon, + * deactivate pages in this range so that they can be reclaimed + * easily if memory pressure hanppens. + * MADV_PAGEOUT - the application is not expected to use this memory soon, + * page out the pages in this range immediately. * * return values: * zero - success @@ -1045,7 +1067,7 @@ madvise_behavior_valid(int behavior) * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. */ -int do_madvise(unsigned long start, size_t len_in, int behavior) +int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { unsigned long end, tmp; struct vm_area_struct *vma, *prev; @@ -1083,10 +1105,10 @@ int do_madvise(unsigned long start, size_t len_in, int behavior) write = madvise_need_mmap_write(behavior); if (write) { - if (mmap_write_lock_killable(current->mm)) + if (mmap_write_lock_killable(mm)) return -EINTR; } else { - mmap_read_lock(current->mm); + mmap_read_lock(mm); } /* @@ -1094,7 +1116,7 @@ int do_madvise(unsigned long start, size_t len_in, int behavior) * ranges, just ignore them, but return -ENOMEM at the end. * - different from the way of handling in mlock etc. 
*/ - vma = find_vma_prev(current->mm, start, &prev); + vma = find_vma_prev(mm, start, &prev); if (vma && start > vma->vm_start) prev = vma; @@ -1131,19 +1153,92 @@ int do_madvise(unsigned long start, size_t len_in, int behavior) if (prev) vma = prev->vm_next; else /* madvise_remove dropped mmap_lock */ - vma = find_vma(current->mm, start); + vma = find_vma(mm, start); } out: blk_finish_plug(&plug); if (write) - mmap_write_unlock(current->mm); + mmap_write_unlock(mm); else - mmap_read_unlock(current->mm); + mmap_read_unlock(mm); return error; } SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { - return do_madvise(start, len_in, behavior); + return do_madvise(current->mm, start, len_in, behavior); +} + +SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, + size_t, vlen, int, behavior, unsigned int, flags) +{ + ssize_t ret; + struct iovec iovstack[UIO_FASTIOV], iovec; + struct iovec *iov = iovstack; + struct iov_iter iter; + struct pid *pid; + struct task_struct *task; + struct mm_struct *mm; + size_t total_len; + unsigned int f_flags; + + if (flags != 0) { + ret = -EINVAL; + goto out; + } + + ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + if (ret < 0) + goto out; + + pid = pidfd_get_pid(pidfd, &f_flags); + if (IS_ERR(pid)) { + ret = PTR_ERR(pid); + goto free_iov; + } + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) { + ret = -ESRCH; + goto put_pid; + } + + if (task->mm != current->mm && + !process_madvise_behavior_valid(behavior)) { + ret = -EINVAL; + goto release_task; + } + + mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); + if (IS_ERR_OR_NULL(mm)) { + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + goto release_task; + } + + total_len = iov_iter_count(&iter); + + while (iov_iter_count(&iter)) { + iovec = iov_iter_iovec(&iter); + ret = do_madvise(mm, (unsigned long)iovec.iov_base, + iovec.iov_len, behavior); + if (ret < 0) + break; + iov_iter_advance(&iter, iovec.iov_len); + } + + if (ret == 0) + ret = total_len - iov_iter_count(&iter); + + mmput(mm); + return ret; + +release_task: + put_task_struct(task); +put_pid: + put_pid(pid); +free_iov: + kfree(iov); +out: + return ret; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7f74a158cfa8..3a24e3b619f5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -73,6 +73,9 @@ EXPORT_SYMBOL(memory_cgrp_subsys); struct mem_cgroup *root_mem_cgroup __read_mostly; +/* Active memory cgroup to use from an interrupt context */ +DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); + /* Socket memory accounting disabled? */ static bool cgroup_memory_nosocket; @@ -1061,23 +1064,56 @@ struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) } EXPORT_SYMBOL(get_mem_cgroup_from_page); -/** - * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg. - */ -static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) +static __always_inline struct mem_cgroup *active_memcg(void) { - if (unlikely(current->active_memcg)) { - struct mem_cgroup *memcg; + if (in_interrupt()) + return this_cpu_read(int_active_memcg); + else + return current->active_memcg; +} - rcu_read_lock(); +static __always_inline struct mem_cgroup *get_active_memcg(void) +{ + struct mem_cgroup *memcg; + + rcu_read_lock(); + memcg = active_memcg(); + if (memcg) { /* current->active_memcg must hold a ref. 
*/ - if (WARN_ON_ONCE(!css_tryget(¤t->active_memcg->css))) + if (WARN_ON_ONCE(!css_tryget(&memcg->css))) memcg = root_mem_cgroup; else memcg = current->active_memcg; - rcu_read_unlock(); - return memcg; } + rcu_read_unlock(); + + return memcg; +} + +static __always_inline bool memcg_kmem_bypass(void) +{ + /* Allow remote memcg charging from any context. */ + if (unlikely(active_memcg())) + return false; + + /* Memcg to charge can't be determined. */ + if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) + return true; + + return false; +} + +/** + * If active memcg is set, do not fallback to current->mm->memcg. + */ +static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) +{ + if (memcg_kmem_bypass()) + return NULL; + + if (unlikely(active_memcg())) + return get_active_memcg(); + return get_mem_cgroup_from_mm(current->mm); } @@ -2933,12 +2969,12 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg; - if (unlikely(!current->mm && !current->active_memcg)) + if (memcg_kmem_bypass()) return NULL; rcu_read_lock(); - if (unlikely(current->active_memcg)) - memcg = rcu_dereference(current->active_memcg); + if (unlikely(active_memcg())) + memcg = active_memcg(); else memcg = mem_cgroup_from_task(current); @@ -3059,19 +3095,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) struct mem_cgroup *memcg; int ret = 0; - if (memcg_kmem_bypass()) - return 0; - memcg = get_mem_cgroup_from_current(); - if (!mem_cgroup_is_root(memcg)) { + if (memcg && !mem_cgroup_is_root(memcg)) { ret = __memcg_kmem_charge(memcg, gfp, 1 << order); if (!ret) { page->mem_cgroup = memcg; __SetPageKmemcg(page); return 0; } + css_put(&memcg->css); } - css_put(&memcg->css); return ret; } @@ -5290,12 +5323,12 @@ static struct cgroup_subsys_state * __ref mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); - struct mem_cgroup *memcg; + struct mem_cgroup *memcg, *old_memcg; long error = -ENOMEM; - memalloc_use_memcg(parent); + old_memcg = set_active_memcg(parent); memcg = mem_cgroup_alloc(); - memalloc_unuse_memcg(); + set_active_memcg(old_memcg); if (IS_ERR(memcg)) return ERR_CAST(memcg); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a2184b721fbf..c0bb186bba62 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1673,16 +1673,6 @@ int unpoison_memory(unsigned long pfn) } EXPORT_SYMBOL(unpoison_memory); -static struct page *new_page(struct page *p, unsigned long private) -{ - struct migration_target_control mtc = { - .nid = page_to_nid(p), - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, - }; - - return alloc_migration_target(p, (unsigned long)&mtc); -} - /* * Safely get reference count of an arbitrary page. 
* Returns 0 for a free page, -EIO for a zero refcount page @@ -1797,6 +1787,10 @@ static int __soft_offline_page(struct page *page) char const *msg_page[] = {"page", "hugepage"}; bool huge = PageHuge(page); LIST_HEAD(pagelist); + struct migration_target_control mtc = { + .nid = NUMA_NO_NODE, + .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + }; /* * Check PageHWPoison again inside page lock because PageHWPoison @@ -1833,8 +1827,8 @@ static int __soft_offline_page(struct page *page) } if (isolate_page(hpage, &pagelist)) { - ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, - MIGRATE_SYNC, MR_MEMORY_FAILURE); + ret = migrate_pages(&pagelist, alloc_migration_target, NULL, + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (!ret) { bool release = !huge; diff --git a/mm/memory.c b/mm/memory.c index 589afe45d0b3..c48f8df6e502 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2391,13 +2391,15 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, arch_enter_lazy_mmu_mode(); - do { - if (create || !pte_none(*pte)) { - err = fn(pte++, addr, data); - if (err) - break; - } - } while (addr += PAGE_SIZE, addr != end); + if (fn) { + do { + if (create || !pte_none(*pte)) { + err = fn(pte++, addr, data); + if (err) + break; + } + } while (addr += PAGE_SIZE, addr != end); + } *mask |= PGTBL_PTE_MODIFIED; arch_leave_lazy_mmu_mode(); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6f203574ca1d..b44d4c7ba73b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1290,27 +1290,6 @@ found: return 0; } -static struct page *new_node_page(struct page *page, unsigned long private) -{ - nodemask_t nmask = node_states[N_MEMORY]; - struct migration_target_control mtc = { - .nid = page_to_nid(page), - .nmask = &nmask, - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, - }; - - /* - * try to allocate from a different node but reuse this node if there - * are no other online nodes to be used (e.g. we are offlining a part - * of the only existing node) - */ - node_clear(mtc.nid, nmask); - if (nodes_empty(nmask)) - node_set(mtc.nid, nmask); - - return alloc_migration_target(page, (unsigned long)&mtc); -} - static int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) { @@ -1370,9 +1349,28 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) put_page(page); } if (!list_empty(&source)) { - /* Allocate a new page from the nearest neighbor node */ - ret = migrate_pages(&source, new_node_page, NULL, 0, - MIGRATE_SYNC, MR_MEMORY_HOTPLUG); + nodemask_t nmask = node_states[N_MEMORY]; + struct migration_target_control mtc = { + .nmask = &nmask, + .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + }; + + /* + * We have checked that migration range is on a single zone so + * we can use the nid of the first page to all the others. + */ + mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru)); + + /* + * try to allocate from a different node but reuse this node + * if there are no other online nodes to be used (e.g. 
we are + * offlining a part of the only existing node) + */ + node_clear(mtc.nid, nmask); + if (nodes_empty(nmask)) + node_set(mtc.nid, nmask); + ret = migrate_pages(&source, alloc_migration_target, NULL, + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) { list_for_each_entry(page, &source, lru) { pr_warn("migrating pfn %lx failed ret:%d ", diff --git a/mm/migrate.c b/mm/migrate.c index 4cf1af88c1dd..5ca5842df5db 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1864,33 +1864,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, return nr_pages ? -EFAULT : 0; } -/* - * Move a list of pages in the address space of the currently executing - * process. - */ -static int kernel_move_pages(pid_t pid, unsigned long nr_pages, - const void __user * __user *pages, - const int __user *nodes, - int __user *status, int flags) +static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes) { struct task_struct *task; struct mm_struct *mm; - int err; - nodemask_t task_nodes; - - /* Check flags */ - if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) - return -EINVAL; - if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) - return -EPERM; + /* + * There is no need to check if current process has the right to modify + * the specified process when they are same. + */ + if (!pid) { + mmget(current->mm); + *mem_nodes = cpuset_mems_allowed(current); + return current->mm; + } /* Find the mm_struct */ rcu_read_lock(); - task = pid ? find_task_by_vpid(pid) : current; + task = find_task_by_vpid(pid); if (!task) { rcu_read_unlock(); - return -ESRCH; + return ERR_PTR(-ESRCH); } get_task_struct(task); @@ -1900,22 +1894,47 @@ static int kernel_move_pages(pid_t pid, unsigned long nr_pages, */ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { rcu_read_unlock(); - err = -EPERM; + mm = ERR_PTR(-EPERM); goto out; } rcu_read_unlock(); - err = security_task_movememory(task); - if (err) + mm = ERR_PTR(security_task_movememory(task)); + if (IS_ERR(mm)) goto out; - - task_nodes = cpuset_mems_allowed(task); + *mem_nodes = cpuset_mems_allowed(task); mm = get_task_mm(task); +out: put_task_struct(task); - if (!mm) + mm = ERR_PTR(-EINVAL); + return mm; +} + +/* + * Move a list of pages in the address space of the currently executing + * process. + */ +static int kernel_move_pages(pid_t pid, unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, int flags) +{ + struct mm_struct *mm; + int err; + nodemask_t task_nodes; + + /* Check flags */ + if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) return -EINVAL; + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) + return -EPERM; + + mm = find_mm_struct(pid, &task_nodes); + if (IS_ERR(mm)) + return PTR_ERR(mm); + if (nodes) err = do_pages_move(mm, task_nodes, nr_pages, pages, nodes, status, flags); @@ -1924,10 +1943,6 @@ static int kernel_move_pages(pid_t pid, unsigned long nr_pages, mmput(mm); return err; - -out: - put_task_struct(task); - return err; } SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, diff --git a/mm/mmap.c b/mm/mmap.c index ebb92f5515a1..d91ecb00d38c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -558,6 +558,50 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr, return 0; } +/* + * vma_next() - Get the next VMA. + * @mm: The mm_struct. + * @vma: The current vma. + * + * If @vma is NULL, return the first vma in the mm. + * + * Returns: The next VMA after @vma. 
+ */ +static inline struct vm_area_struct *vma_next(struct mm_struct *mm, + struct vm_area_struct *vma) +{ + if (!vma) + return mm->mmap; + + return vma->vm_next; +} + +/* + * munmap_vma_range() - munmap VMAs that overlap a range. + * @mm: The mm struct + * @start: The start of the range. + * @len: The length of the range. + * @pprev: pointer to the pointer that will be set to previous vm_area_struct + * @rb_link: the rb_node + * @rb_parent: the parent rb_node + * + * Find all the vm_area_struct that overlap from @start to + * @end and munmap them. Set @pprev to the previous vm_area_struct. + * + * Returns: -ENOMEM on munmap failure or 0 on success. + */ +static inline int +munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len, + struct vm_area_struct **pprev, struct rb_node ***link, + struct rb_node **parent, struct list_head *uf) +{ + + while (find_vma_links(mm, start, start + len, pprev, link, parent)) + if (do_munmap(mm, start, len, uf)) + return -ENOMEM; + + return 0; +} static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long addr, unsigned long end) { @@ -1128,10 +1172,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (vm_flags & VM_SPECIAL) return NULL; - if (prev) - next = prev->vm_next; - else - next = mm->mmap; + next = vma_next(mm, prev); area = next; if (area && area->vm_end == end) /* cases 6, 7, 8 */ next = next->vm_next; @@ -1707,13 +1748,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return -ENOMEM; } - /* Clear old maps */ - while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, - &rb_parent)) { - if (do_munmap(mm, addr, len, uf)) - return -ENOMEM; - } - + /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ + if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) + return -ENOMEM; /* * Private writable mapping: check memory availability */ @@ -2632,7 +2669,7 @@ static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end) { - struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap; + struct vm_area_struct *next = vma_next(mm, prev); struct mmu_gather tlb; lru_add_drain(); @@ -2831,7 +2868,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, if (error) return error; } - vma = prev ? prev->vm_next : mm->mmap; + vma = vma_next(mm, prev); if (unlikely(uf)) { /* @@ -3049,14 +3086,9 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla if (error) return error; - /* - * Clear old maps. this also does some error checking for us - */ - while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, - &rb_parent)) { - if (do_munmap(mm, addr, len, uf)) - return -ENOMEM; - } + /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ + if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) + return -ENOMEM; /* Check against address space limits *after* clearing old maps... 
*/ if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) diff --git a/mm/nommu.c b/mm/nommu.c index 0df7ca321314..0faf39b32cdb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -354,13 +354,6 @@ void vm_unmap_aliases(void) } EXPORT_SYMBOL_GPL(vm_unmap_aliases); -struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) -{ - BUG(); - return NULL; -} -EXPORT_SYMBOL_GPL(alloc_vm_area); - void free_vm_area(struct vm_struct *area) { BUG(); diff --git a/mm/percpu.c b/mm/percpu.c index 1ed1a349eab8..66a93f096394 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1584,8 +1584,7 @@ static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, { struct obj_cgroup *objcg; - if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT) || - memcg_kmem_bypass()) + if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT)) return PCPU_CHUNK_ROOT; objcg = get_obj_cgroup_from_current(); diff --git a/mm/slab.h b/mm/slab.h index 06c6587765a3..6d7c6a5056ba 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -280,9 +280,6 @@ static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s, { struct obj_cgroup *objcg; - if (memcg_kmem_bypass()) - return NULL; - objcg = get_obj_cgroup_from_current(); if (!objcg) return NULL; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 04ac98bf5045..6ae491a8b210 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * linux/mm/vmalloc.c - * * Copyright (C) 1993 Linus Torvalds * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 @@ -2321,20 +2319,21 @@ static void __vfree(const void *addr) } /** - * vfree - release memory allocated by vmalloc() - * @addr: memory base address + * vfree - Release memory allocated by vmalloc() + * @addr: Memory base address * - * Free the virtually continuous memory area starting at @addr, as - * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is - * NULL, no operation is performed. + * Free the virtually continuous memory area starting at @addr, as obtained + * from one of the vmalloc() family of APIs. This will usually also free the + * physical memory underlying the virtual allocation, but that memory is + * reference counted, so it will not be freed until the last user goes away. * - * Must not be called in NMI context (strictly speaking, only if we don't - * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling - * conventions for vfree() arch-depenedent would be a really bad idea) + * If @addr is NULL, no operation is performed. * + * Context: * May sleep if called *not* from interrupt context. - * - * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) + * Must not be called in NMI context (strictly speaking, it could be + * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling + * conventions for vfree() arch-depenedent would be a really bad idea). */ void vfree(const void *addr) { @@ -2376,8 +2375,11 @@ EXPORT_SYMBOL(vunmap); * @flags: vm_area->flags * @prot: page protection for the mapping * - * Maps @count pages from @pages into contiguous kernel virtual - * space. + * Maps @count pages from @pages into contiguous kernel virtual space. + * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself + * (which must be kmalloc or vmalloc memory) and one reference per pages in it + * are transferred from the caller to vmap(), and will be freed / dropped when + * vfree() is called on the return value. 
* * Return: the address of the area or %NULL on failure */ @@ -2403,28 +2405,73 @@ void *vmap(struct page **pages, unsigned int count, return NULL; } + if (flags & VM_MAP_PUT_PAGES) + area->pages = pages; return area->addr; } EXPORT_SYMBOL(vmap); +#ifdef CONFIG_VMAP_PFN +struct vmap_pfn_data { + unsigned long *pfns; + pgprot_t prot; + unsigned int idx; +}; + +static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) +{ + struct vmap_pfn_data *data = private; + + if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx]))) + return -EINVAL; + *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot)); + return 0; +} + +/** + * vmap_pfn - map an array of PFNs into virtually contiguous space + * @pfns: array of PFNs + * @count: number of pages to map + * @prot: page protection for the mapping + * + * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns + * the start address of the mapping. + */ +void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) +{ + struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) }; + struct vm_struct *area; + + area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP, + __builtin_return_address(0)); + if (!area) + return NULL; + if (apply_to_page_range(&init_mm, (unsigned long)area->addr, + count * PAGE_SIZE, vmap_pfn_apply, &data)) { + free_vm_area(area); + return NULL; + } + return area->addr; +} +EXPORT_SYMBOL_GPL(vmap_pfn); +#endif /* CONFIG_VMAP_PFN */ + static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { - struct page **pages; - unsigned int nr_pages, array_size, i; const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; - const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ? - 0 : - __GFP_HIGHMEM; + unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; + unsigned int array_size = nr_pages * sizeof(struct page *), i; + struct page **pages; - nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; - array_size = (nr_pages * sizeof(struct page *)); + gfp_mask |= __GFP_NOWARN; + if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) + gfp_mask |= __GFP_HIGHMEM; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, - node, area->caller); + pages = __vmalloc_node(array_size, 1, nested_gfp, node, + area->caller); } else { pages = kmalloc_node(array_size, nested_gfp, node); } @@ -2442,9 +2489,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; if (node == NUMA_NO_NODE) - page = alloc_page(alloc_mask|highmem_mask); + page = alloc_page(gfp_mask); else - page = alloc_pages_node(node, alloc_mask|highmem_mask, 0); + page = alloc_pages_node(node, gfp_mask, 0); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vfree() */ @@ -3032,54 +3079,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range); -static int f(pte_t *pte, unsigned long addr, void *data) -{ - pte_t ***p = data; - - if (p) { - *(*p) = pte; - (*p)++; - } - return 0; -} - -/** - * alloc_vm_area - allocate a range of kernel address space - * @size: size of the area - * @ptes: returns the PTEs for the address space - * - * Returns: NULL on failure, vm_struct on success - * - * This function reserves a range of kernel address space, and - * allocates pagetables to map that range. No actual mappings - * are created. 
- * - * If @ptes is non-NULL, pointers to the PTEs (in init_mm) - * allocated for the VM area are returned. - */ -struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) -{ - struct vm_struct *area; - - area = get_vm_area_caller(size, VM_IOREMAP, - __builtin_return_address(0)); - if (area == NULL) - return NULL; - - /* - * This ensures that page tables are constructed for this region - * of kernel virtual address space and mapped into init_mm. - */ - if (apply_to_page_range(&init_mm, (unsigned long)area->addr, - size, f, ptes ? &ptes : NULL)) { - free_vm_area(area); - return NULL; - } - - return area; -} -EXPORT_SYMBOL_GPL(alloc_vm_area); - void free_vm_area(struct vm_struct *area) { struct vm_struct *ret; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c36fdff9a371..918c7b019b3d 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1122,10 +1122,16 @@ static inline int __zs_cpu_up(struct mapping_area *area) */ if (area->vm) return 0; - area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL); + area->vm = get_vm_area(PAGE_SIZE * 2, 0); if (!area->vm) return -ENOMEM; - return 0; + + /* + * Populate ptes in advance to avoid pte allocation with GFP_KERNEL + * in non-preemtible context of zs_map_object. + */ + return apply_to_page_range(&init_mm, (unsigned long)area->vm->addr, + PAGE_SIZE * 2, NULL, NULL); } static inline void __zs_cpu_down(struct mapping_area *area) |
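
The mm/madvise.c hunks above introduce the process_madvise() syscall, which applies MADV_COLD or MADV_PAGEOUT to address ranges of another process identified by a pidfd, and returns the number of bytes advised. A minimal userspace sketch of a caller follows; the pidfd_open()/pidfd_madvise() wrappers, the command-line interface, and the availability of __NR_process_madvise, __NR_pidfd_open and MADV_PAGEOUT in the installed headers are illustrative assumptions, not part of the patch.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

/* Thin wrappers; a libc may not expose these calls yet (assumption). */
static int pidfd_open(pid_t pid)
{
	return syscall(__NR_pidfd_open, pid, 0);
}

static ssize_t pidfd_madvise(int pidfd, const struct iovec *vec, size_t vlen,
			     int behavior)
{
	/* flags must currently be 0, mirroring the check in the syscall. */
	return syscall(__NR_process_madvise, pidfd, vec, vlen, behavior, 0);
}

int main(int argc, char **argv)
{
	struct iovec range;
	ssize_t advised;
	int pidfd;

	if (argc != 4) {
		fprintf(stderr, "usage: %s <pid> <hex-addr> <len>\n", argv[0]);
		return 1;
	}

	pidfd = pidfd_open((pid_t)atoi(argv[1]));
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	/* One address range per iovec entry. */
	range.iov_base = (void *)strtoul(argv[2], NULL, 16);
	range.iov_len = strtoul(argv[3], NULL, 0);

	advised = pidfd_madvise(pidfd, &range, 1, MADV_PAGEOUT);
	if (advised < 0)
		perror("process_madvise");
	else
		printf("paged out %zd bytes of pid %s\n", advised, argv[1]);

	close(pidfd);
	return advised < 0;
}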
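
The mm/memcontrol.c changes funnel remote charging through active_memcg(), which reads the new per-cpu int_active_memcg in interrupt context and current->active_memcg otherwise; the mem_cgroup_css_alloc() hunk shows the save/restore calling pattern. Below is a hedged kernel-side sketch of that pattern for an arbitrary accounted allocation; set_active_memcg() itself is defined outside this diff, so treating it as returning the previously active memcg is an assumption based on that hunk.

#include <linux/memcontrol.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>

/*
 * Sketch: charge an allocation to @memcg rather than to the current
 * task's cgroup, by temporarily making it the active memcg.
 */
static void *alloc_charged_to(struct mem_cgroup *memcg, size_t size)
{
	struct mem_cgroup *old_memcg;
	void *ptr;

	old_memcg = set_active_memcg(memcg);
	ptr = kmalloc(size, GFP_KERNEL_ACCOUNT);
	set_active_memcg(old_memcg);

	return ptr;
}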
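
The updated vmap() kerneldoc in mm/vmalloc.c documents VM_MAP_PUT_PAGES: ownership of the pages array (which must be kmalloc or vmalloc memory) and one reference per page are transferred to the mapping and released by vfree() on the returned address. A hedged sketch of a caller relying on that single-call teardown; the flag's definition lives outside this diff and the helper name is made up.

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

/*
 * Sketch: build a virtually contiguous buffer that a later vfree() frees
 * completely, pages and page array included, via VM_MAP_PUT_PAGES.
 */
static void *alloc_mapped_buffer(unsigned int nr_pages)
{
	struct page **pages;
	unsigned int i;
	void *vaddr;

	pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL | __GFP_ZERO);
	if (!pages)
		return NULL;

	for (i = 0; i < nr_pages; i++) {
		pages[i] = alloc_page(GFP_KERNEL);
		if (!pages[i])
			goto err_pages;
	}

	vaddr = vmap(pages, nr_pages, VM_MAP | VM_MAP_PUT_PAGES, PAGE_KERNEL);
	if (!vaddr)
		goto err_pages;
	return vaddr;	/* tear down with a single vfree(vaddr) */

err_pages:
	while (i--)
		__free_page(pages[i]);
	kvfree(pages);
	return NULL;
}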
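
vmap_pfn(), added behind the new CONFIG_VMAP_PFN option, maps an array of PFNs that have no struct page (the apply callback warns if any PFN is pfn_valid()). A hedged driver-side sketch; only the vmap_pfn() signature comes from the patch, while the helper name, the write-combining protection and unmapping with vunmap() are assumptions.

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

/*
 * Sketch: map @npfns page frames starting at @base_pfn (e.g. part of a
 * device BAR with no struct pages) into one kernel virtual range.
 */
static void *map_device_pfns(unsigned long base_pfn, unsigned int npfns)
{
	unsigned long *pfns;
	unsigned int i;
	void *vaddr;

	pfns = kmalloc_array(npfns, sizeof(*pfns), GFP_KERNEL);
	if (!pfns)
		return NULL;
	for (i = 0; i < npfns; i++)
		pfns[i] = base_pfn + i;

	vaddr = vmap_pfn(pfns, npfns, pgprot_writecombine(PAGE_KERNEL));

	/* vmap_pfn() only reads the array while building the mapping. */
	kfree(pfns);
	return vaddr;		/* unmap later with vunmap(vaddr) */
}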
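
Finally, the mm/memory.c hunk allows apply_to_page_range() to be called with a NULL callback, in which case it only allocates the intermediate page tables; the mm/zsmalloc.c hunk uses this to pre-fault its two-page per-cpu area now that alloc_vm_area() is gone. The same pattern as a standalone hedged sketch (the helper name is an assumption).

#include <linux/mm.h>
#include <linux/vmalloc.h>

/*
 * Sketch: reserve a kernel virtual range and populate its page tables up
 * front, so the range can later be mapped from a non-preemptible context
 * without risking a GFP_KERNEL pte allocation.
 */
static struct vm_struct *reserve_prefaulted_area(size_t size)
{
	struct vm_struct *area;

	area = get_vm_area(size, 0);
	if (!area)
		return NULL;

	/* A NULL callback now just builds the page tables for the range. */
	if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
				size, NULL, NULL)) {
		free_vm_area(area);
		return NULL;
	}
	return area;
}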