Diffstat (limited to 'fs/proc/task_mmu.c')
-rw-r--r-- | fs/proc/task_mmu.c | 261
1 file changed, 209 insertions(+), 52 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 38a5a3e9cba2..3d6d8a9f13fc 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -29,6 +29,9 @@
 #include <asm/tlbflush.h>
 #include "internal.h"
 
+#define SENTINEL_VMA_END -1
+#define SENTINEL_VMA_GATE -2
+
 #define SEQ_PUT_DEC(str, val) \
 		seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
 void task_mem(struct seq_file *m, struct mm_struct *mm)
@@ -36,9 +39,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	unsigned long text, lib, swap, anon, file, shmem;
 	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
 
-	anon = get_mm_counter(mm, MM_ANONPAGES);
-	file = get_mm_counter(mm, MM_FILEPAGES);
-	shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+	anon = get_mm_counter_sum(mm, MM_ANONPAGES);
+	file = get_mm_counter_sum(mm, MM_FILEPAGES);
+	shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES);
 
 	/*
 	 * Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -59,7 +62,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	text = min(text, mm->exec_vm << PAGE_SHIFT);
 	lib = (mm->exec_vm << PAGE_SHIFT) - text;
 
-	swap = get_mm_counter(mm, MM_SWAPENTS);
+	swap = get_mm_counter_sum(mm, MM_SWAPENTS);
 	SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
 	SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
 	SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
@@ -92,12 +95,12 @@ unsigned long task_statm(struct mm_struct *mm,
 			 unsigned long *shared, unsigned long *text,
 			 unsigned long *data, unsigned long *resident)
 {
-	*shared = get_mm_counter(mm, MM_FILEPAGES) +
-			get_mm_counter(mm, MM_SHMEMPAGES);
+	*shared = get_mm_counter_sum(mm, MM_FILEPAGES) +
+			get_mm_counter_sum(mm, MM_SHMEMPAGES);
 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 								>> PAGE_SHIFT;
 	*data = mm->data_vm + mm->stack_vm;
-	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
+	*resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES);
 	return mm->total_vm;
 }
 
@@ -127,15 +130,134 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
 }
 #endif
 
-static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
-						loff_t *ppos)
+#ifdef CONFIG_PER_VMA_LOCK
+
+static void unlock_vma(struct proc_maps_private *priv)
+{
+	if (priv->locked_vma) {
+		vma_end_read(priv->locked_vma);
+		priv->locked_vma = NULL;
+	}
+}
+
+static const struct seq_operations proc_pid_maps_op;
+
+static inline bool lock_vma_range(struct seq_file *m,
+				  struct proc_maps_private *priv)
 {
-	struct vm_area_struct *vma = vma_next(&priv->iter);
+	/*
+	 * smaps and numa_maps perform page table walk, therefore require
+	 * mmap_lock but maps can be read with locking just the vma and
+	 * walking the vma tree under rcu read protection.
+	 */
+	if (m->op != &proc_pid_maps_op) {
+		if (mmap_read_lock_killable(priv->mm))
+			return false;
+		priv->mmap_locked = true;
+	} else {
+		rcu_read_lock();
+		priv->locked_vma = NULL;
+		priv->mmap_locked = false;
+	}
+
+	return true;
+}
+
+static inline void unlock_vma_range(struct proc_maps_private *priv)
+{
+	if (priv->mmap_locked) {
+		mmap_read_unlock(priv->mm);
+	} else {
+		unlock_vma(priv);
+		rcu_read_unlock();
+	}
+}
+
+static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
+					   loff_t last_pos)
+{
+	struct vm_area_struct *vma;
+
+	if (priv->mmap_locked)
+		return vma_next(&priv->iter);
+
+	unlock_vma(priv);
+	vma = lock_next_vma(priv->mm, &priv->iter, last_pos);
+	if (!IS_ERR_OR_NULL(vma))
+		priv->locked_vma = vma;
+
+	return vma;
+}
+
+static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
+					 loff_t pos)
+{
+	if (priv->mmap_locked)
+		return false;
+
+	rcu_read_unlock();
+	mmap_read_lock(priv->mm);
+	/* Reinitialize the iterator after taking mmap_lock */
+	vma_iter_set(&priv->iter, pos);
+	priv->mmap_locked = true;
+
+	return true;
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool lock_vma_range(struct seq_file *m,
+				  struct proc_maps_private *priv)
+{
+	return mmap_read_lock_killable(priv->mm) == 0;
+}
+
+static inline void unlock_vma_range(struct proc_maps_private *priv)
+{
+	mmap_read_unlock(priv->mm);
+}
+
+static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
+					   loff_t last_pos)
+{
+	return vma_next(&priv->iter);
+}
+
+static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
+					 loff_t pos)
+{
+	return false;
+}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
+static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
+{
+	struct proc_maps_private *priv = m->private;
+	struct vm_area_struct *vma;
+
+retry:
+	vma = get_next_vma(priv, *ppos);
+	/* EINTR of EAGAIN is possible */
+	if (IS_ERR(vma)) {
+		if (PTR_ERR(vma) == -EAGAIN && fallback_to_mmap_lock(priv, *ppos))
+			goto retry;
+
+		return vma;
+	}
+
+	/* Store previous position to be able to restart if needed */
+	priv->last_pos = *ppos;
 
 	if (vma) {
-		*ppos = vma->vm_start;
+		/*
+		 * Track the end of the reported vma to ensure position changes
+		 * even if previous vma was merged with the next vma and we
+		 * found the extended vma with the same vm_start.
+		 */
+		*ppos = vma->vm_end;
 	} else {
-		*ppos = -2UL;
+		*ppos = SENTINEL_VMA_GATE;
 		vma = get_gate_vma(priv->mm);
 	}
 
@@ -145,11 +267,11 @@ static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
 static void *m_start(struct seq_file *m, loff_t *ppos)
 {
 	struct proc_maps_private *priv = m->private;
-	unsigned long last_addr = *ppos;
+	loff_t last_addr = *ppos;
 	struct mm_struct *mm;
 
 	/* See m_next(). Zero at the start or after lseek. */
-	if (last_addr == -1UL)
+	if (last_addr == SENTINEL_VMA_END)
 		return NULL;
 
 	priv->task = get_proc_task(priv->inode);
@@ -163,28 +285,34 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
 		return NULL;
 	}
 
-	if (mmap_read_lock_killable(mm)) {
+	if (!lock_vma_range(m, priv)) {
 		mmput(mm);
 		put_task_struct(priv->task);
 		priv->task = NULL;
 		return ERR_PTR(-EINTR);
 	}
 
-	vma_iter_init(&priv->iter, mm, last_addr);
+	/*
+	 * Reset current position if last_addr was set before
+	 * and it's not a sentinel.
+	 */
+	if (last_addr > 0)
+		*ppos = last_addr = priv->last_pos;
+	vma_iter_init(&priv->iter, mm, (unsigned long)last_addr);
 	hold_task_mempolicy(priv);
-	if (last_addr == -2UL)
+	if (last_addr == SENTINEL_VMA_GATE)
 		return get_gate_vma(mm);
 
-	return proc_get_vma(priv, ppos);
+	return proc_get_vma(m, ppos);
 }
 
 static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
 {
-	if (*ppos == -2UL) {
-		*ppos = -1UL;
+	if (*ppos == SENTINEL_VMA_GATE) {
+		*ppos = SENTINEL_VMA_END;
 		return NULL;
 	}
-	return proc_get_vma(m->private, ppos);
+	return proc_get_vma(m, ppos);
 }
 
 static void m_stop(struct seq_file *m, void *v)
@@ -196,7 +324,7 @@ static void m_stop(struct seq_file *m, void *v)
 		return;
 
 	release_task_mempolicy(priv);
-	mmap_read_unlock(mm);
+	unlock_vma_range(priv);
 	mmput(mm);
 	put_task_struct(priv->task);
 	priv->task = NULL;
@@ -212,8 +340,8 @@ static int proc_maps_open(struct inode *inode, struct file *file,
 
 	priv->inode = inode;
 	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
-	if (IS_ERR(priv->mm)) {
-		int err = PTR_ERR(priv->mm);
+	if (IS_ERR_OR_NULL(priv->mm)) {
+		int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH;
 
 		seq_release_private(inode, file);
 		return err;
@@ -707,6 +835,8 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
 	struct folio *folio = page_folio(page);
 	int i, nr = compound ? compound_nr(page) : 1;
 	unsigned long size = nr * PAGE_SIZE;
+	bool exclusive;
+	int mapcount;
 
 	/*
 	 * First accumulate quantities that depend only on |size| and the type
@@ -747,18 +877,29 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
 			dirty, locked, present);
 		return;
 	}
+
+	if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+		mapcount = folio_average_page_mapcount(folio);
+		exclusive = !folio_maybe_mapped_shared(folio);
+	}
+
 	/*
 	 * We obtain a snapshot of the mapcount. Without holding the folio lock
 	 * this snapshot can be slightly wrong as we cannot always read the
 	 * mapcount atomically.
 	 */
 	for (i = 0; i < nr; i++, page++) {
-		int mapcount = folio_precise_page_mapcount(folio, page);
 		unsigned long pss = PAGE_SIZE << PSS_SHIFT;
+
+		if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) {
+			mapcount = folio_precise_page_mapcount(folio, page);
+			exclusive = mapcount < 2;
+		}
+
 		if (mapcount >= 2)
 			pss /= mapcount;
 		smaps_page_accumulate(mss, folio, PAGE_SIZE, pss,
-				dirty, locked, mapcount < 2);
+				dirty, locked, exclusive);
 	}
 }
 
@@ -1023,7 +1164,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
 
 	if (folio) {
 		/* We treat non-present entries as "maybe shared". */
-		if (!present || folio_likely_mapped_shared(folio) ||
+		if (!present || folio_maybe_mapped_shared(folio) ||
 		    hugetlb_pmd_shared(pte))
 			mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
 		else
@@ -1312,8 +1453,8 @@ static int smaps_rollup_open(struct inode *inode, struct file *file)
 
 	priv->inode = inode;
 	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
-	if (IS_ERR(priv->mm)) {
-		ret = PTR_ERR(priv->mm);
+	if (IS_ERR_OR_NULL(priv->mm)) {
+		ret = priv->mm ? PTR_ERR(priv->mm) : -ESRCH;
 
 		single_release(inode, file);
 		goto out_free;
@@ -1632,6 +1773,7 @@ struct pagemapread {
 #define PM_SOFT_DIRTY		BIT_ULL(55)
 #define PM_MMAP_EXCLUSIVE	BIT_ULL(56)
 #define PM_UFFD_WP		BIT_ULL(57)
+#define PM_GUARD_REGION		BIT_ULL(58)
 #define PM_FILE			BIT_ULL(61)
 #define PM_SWAP			BIT_ULL(62)
 #define PM_PRESENT		BIT_ULL(63)
@@ -1651,6 +1793,13 @@ static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm)
 	return 0;
 }
 
+static bool __folio_page_mapped_exclusively(struct folio *folio, struct page *page)
+{
+	if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+		return folio_precise_page_mapcount(folio, page) == 1;
+	return !folio_maybe_mapped_shared(folio);
+}
+
 static int pagemap_pte_hole(unsigned long start, unsigned long end,
 			    __always_unused int depth, struct mm_walk *walk)
 {
@@ -1732,6 +1881,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 			page = pfn_swap_entry_to_page(entry);
 		if (pte_marker_entry_uffd_wp(entry))
 			flags |= PM_UFFD_WP;
+		if (is_guard_swp_entry(entry))
+			flags |= PM_GUARD_REGION;
 	}
 
 	if (page) {
@@ -1739,7 +1890,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 		if (!folio_test_anon(folio))
 			flags |= PM_FILE;
 		if ((flags & PM_PRESENT) &&
-		    folio_precise_page_mapcount(folio, page) == 1)
+		    __folio_page_mapped_exclusively(folio, page))
 			flags |= PM_MMAP_EXCLUSIVE;
 	}
 	if (vma->vm_flags & VM_SOFTDIRTY)
@@ -1810,11 +1961,11 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 	}
 
 	for (; addr != end; addr += PAGE_SIZE, idx++) {
-		unsigned long cur_flags = flags;
+		u64 cur_flags = flags;
 		pagemap_entry_t pme;
 
 		if (folio && (flags & PM_PRESENT) &&
-		    folio_precise_page_mapcount(folio, page + idx) == 1)
+		    __folio_page_mapped_exclusively(folio, page))
 			cur_flags |= PM_MMAP_EXCLUSIVE;
 
 		pme = make_pme(frame, cur_flags);
@@ -1879,7 +2030,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
 		if (!folio_test_anon(folio))
 			flags |= PM_FILE;
 
-		if (!folio_likely_mapped_shared(folio) &&
+		if (!folio_maybe_mapped_shared(folio) &&
 		    !hugetlb_pmd_shared(ptep))
 			flags |= PM_MMAP_EXCLUSIVE;
 
@@ -1931,7 +2082,8 @@ static const struct mm_walk_ops pagemap_ops = {
  * Bit  55    pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
  * Bit  56    page exclusively mapped
  * Bit  57    pte is uffd-wp write-protected
- * Bits 58-60 zero
+ * Bit  58    pte is a guard region
+ * Bits 59-60 zero
  * Bit  61    page is file-page or shared-anon
  * Bit  62    page swapped
  * Bit  63    page present
@@ -2045,8 +2197,8 @@ static int pagemap_open(struct inode *inode, struct file *file)
 	struct mm_struct *mm;
 
 	mm = proc_mem_open(inode, PTRACE_MODE_READ);
-	if (IS_ERR(mm))
-		return PTR_ERR(mm);
+	if (IS_ERR_OR_NULL(mm))
+		return mm ? PTR_ERR(mm) : -ESRCH;
 	file->private_data = mm;
 	return 0;
 }
@@ -2063,7 +2215,8 @@ static int pagemap_release(struct inode *inode, struct file *file)
 #define PM_SCAN_CATEGORIES	(PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \
 				 PAGE_IS_FILE | PAGE_IS_PRESENT | \
 				 PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \
-				 PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY)
+				 PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \
+				 PAGE_IS_GUARD)
 #define PM_SCAN_FLAGS		(PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
 
 struct pagemap_scan_private {
@@ -2104,12 +2257,14 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
 		if (!pte_swp_uffd_wp_any(pte))
 			categories |= PAGE_IS_WRITTEN;
 
-		if (p->masks_of_interest & PAGE_IS_FILE) {
-			swp = pte_to_swp_entry(pte);
-			if (is_pfn_swap_entry(swp) &&
-			    !folio_test_anon(pfn_swap_entry_folio(swp)))
-				categories |= PAGE_IS_FILE;
-		}
+		swp = pte_to_swp_entry(pte);
+		if (is_guard_swp_entry(swp))
+			categories |= PAGE_IS_GUARD;
+		else if ((p->masks_of_interest & PAGE_IS_FILE) &&
+			 is_pfn_swap_entry(swp) &&
+			 !folio_test_anon(pfn_swap_entry_folio(swp)))
+			categories |= PAGE_IS_FILE;
+
 		if (pte_swp_soft_dirty(pte))
 			categories |= PAGE_IS_SOFT_DIRTY;
 	}
@@ -2155,7 +2310,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
 			categories |= PAGE_IS_FILE;
 	}
 
-	if (is_zero_pfn(pmd_pfn(pmd)))
+	if (is_huge_zero_pmd(pmd))
 		categories |= PAGE_IS_PFNZERO;
 	if (pmd_soft_dirty(pmd))
 		categories |= PAGE_IS_SOFT_DIRTY;
@@ -2455,22 +2610,19 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
 	spinlock_t *ptl;
 	int ret;
 
-	arch_enter_lazy_mmu_mode();
-
 	ret = pagemap_scan_thp_entry(pmd, start, end, walk);
-	if (ret != -ENOENT) {
-		arch_leave_lazy_mmu_mode();
+	if (ret != -ENOENT)
 		return ret;
-	}
 
 	ret = 0;
 	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
 	if (!pte) {
-		arch_leave_lazy_mmu_mode();
 		walk->action = ACTION_AGAIN;
 		return 0;
 	}
 
+	arch_enter_lazy_mmu_mode();
+
 	if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
 		/* Fast path for performing exclusive WP */
 		for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
@@ -2539,8 +2691,8 @@ flush_and_return:
 	if (flush_end)
 		flush_tlb_range(vma, start, addr);
 
-	pte_unmap_unlock(start_pte, ptl);
 	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(start_pte, ptl);
 
 	cond_resched();
 	return ret;
@@ -2855,7 +3007,12 @@ static void gather_stats(struct page *page, struct numa_maps *md,
 		int pte_dirty, unsigned long nr_pages)
 {
 	struct folio *folio = page_folio(page);
-	int count = folio_precise_page_mapcount(folio, page);
+	int count;
+
+	if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+		count = folio_precise_page_mapcount(folio, page);
+	else
+		count = folio_average_page_mapcount(folio);
 
 	md->pages += nr_pages;
 	if (pte_dirty || folio_test_dirty(folio))
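
Not part of the diff above: a minimal user-space sketch, assuming a kernel that carries this change, showing how the newly documented /proc/pid/pagemap bit 58 (PM_GUARD_REGION) can be read next to the pre-existing bits. The bit positions are taken from the comment block updated in the hunk at -1931/+2082; everything else (file layout of pagemap as one 64-bit entry per page) is long-standing pagemap ABI.

/*
 * Hypothetical example, not from the patch: query pagemap bits for one
 * virtual address of the current process and report the guard-region bit.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* Address to query: first argument, or the address of a local. */
	unsigned long vaddr = argc > 1 ? strtoul(argv[1], NULL, 0)
				       : (unsigned long)&argc;
	long page_size = sysconf(_SC_PAGESIZE);
	uint64_t entry;
	int fd;

	fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0) {
		perror("open /proc/self/pagemap");
		return 1;
	}

	/* One 64-bit entry per page, indexed by virtual page number. */
	off_t offset = (off_t)(vaddr / page_size) * sizeof(entry);
	if (pread(fd, &entry, sizeof(entry), offset) != sizeof(entry)) {
		perror("pread");
		close(fd);
		return 1;
	}
	close(fd);

	printf("vaddr 0x%lx: present=%d swapped=%d guard=%d exclusive=%d\n",
	       vaddr,
	       (int)((entry >> 63) & 1),	/* Bit 63: page present */
	       (int)((entry >> 62) & 1),	/* Bit 62: page swapped */
	       (int)((entry >> 58) & 1),	/* Bit 58: guard region (new in this change) */
	       (int)((entry >> 56) & 1));	/* Bit 56: page exclusively mapped */
	return 0;
}

On older kernels bit 58 is simply reported as zero, so the same reader works before and after this change; only the interpretation of the bit is new.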