Diffstat (limited to 'mm/madvise.c')
-rw-r--r--	mm/madvise.c	343
1 file changed, 253 insertions(+), 90 deletions(-)
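The patch below threads a single mmu_gather through the MADV_DONTNEED, MADV_DONTNEED_LOCKED and MADV_FREE paths via the new struct madvise_behavior, so TLB flushes are batched across every VMA a single madvise() call covers (and, in vector_madvise(), across process_madvise() iovec entries) instead of being issued per range. A minimal sketch of the pattern, using only the mmu_gather calls that appear in the diff; the surrounding variables are illustrative, not part of the patch:

	struct mmu_gather tlb;
	struct madvise_behavior madv_behavior = {
		.behavior = MADV_DONTNEED,
		.tlb = &tlb,
	};

	tlb_gather_mmu(&tlb, mm);	/* madvise_init_tlb(): open the batch */
	/* ... unmap any number of ranges; flushes accumulate in tlb ... */
	tlb_finish_mmu(&tlb);		/* madvise_finish_tlb(): one flush */

Previously, madvise_free_single_vma() ran tlb_gather_mmu()/tlb_finish_mmu() once per VMA, and MADV_DONTNEED went through zap_page_range_single(), which likewise flushes on every call.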
diff --git a/mm/madvise.c b/mm/madvise.c
index 0ceae57da7da..1d44a35ae85c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -48,6 +48,11 @@ struct madvise_walk_private {
 	bool pageout;
 };
 
+struct madvise_behavior {
+	int behavior;
+	struct mmu_gather *tlb;
+};
+
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
  * take mmap_lock for writing. Others, which simply traverse vmas, need
@@ -387,7 +392,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 		folio = pmd_folio(orig_pmd);
 
 		/* Do not interfere with other mappings of this folio */
-		if (folio_likely_mapped_shared(folio))
+		if (folio_maybe_mapped_shared(folio))
 			goto huge_unlock;
 
 		if (pageout_anon_only_filter && !folio_test_anon(folio))
@@ -486,7 +491,7 @@ restart:
 		if (nr < folio_nr_pages(folio)) {
 			int err;
 
-			if (folio_likely_mapped_shared(folio))
+			if (folio_maybe_mapped_shared(folio))
 				continue;
 			if (pageout_anon_only_filter && !folio_test_anon(folio))
 				continue;
@@ -503,6 +508,7 @@ restart:
 				pte_offset_map_lock(mm, pmd, addr, &ptl);
 			if (!start_pte)
 				break;
+			flush_tlb_batched_pending(mm);
 			arch_enter_lazy_mmu_mode();
 			if (!err)
 				nr = 0;
@@ -721,7 +727,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		if (nr < folio_nr_pages(folio)) {
 			int err;
 
-			if (folio_likely_mapped_shared(folio))
+			if (folio_maybe_mapped_shared(folio))
 				continue;
 			if (!folio_trylock(folio))
 				continue;
@@ -736,6 +742,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 				start_pte = pte;
 			if (!start_pte)
 				break;
+			flush_tlb_batched_pending(mm);
 			arch_enter_lazy_mmu_mode();
 			if (!err)
 				nr = 0;
@@ -794,12 +801,13 @@ static const struct mm_walk_ops madvise_free_walk_ops = {
 	.walk_lock		= PGWALK_RDLOCK,
 };
 
-static int madvise_free_single_vma(struct vm_area_struct *vma,
+static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
+			struct vm_area_struct *vma,
 			unsigned long start_addr, unsigned long end_addr)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct mmu_notifier_range range;
-	struct mmu_gather tlb;
+	struct mmu_gather *tlb = madv_behavior->tlb;
 
 	/* MADV_FREE works for only anon vma at the moment */
 	if (!vma_is_anonymous(vma))
@@ -815,17 +823,14 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
 				range.start, range.end);
 
 	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm);
 	update_hiwater_rss(mm);
 
 	mmu_notifier_invalidate_range_start(&range);
-	tlb_start_vma(&tlb, vma);
+	tlb_start_vma(tlb, vma);
 	walk_page_range(vma->vm_mm, range.start, range.end,
-			&madvise_free_walk_ops, &tlb);
-	tlb_end_vma(&tlb, vma);
+			&madvise_free_walk_ops, tlb);
+	tlb_end_vma(tlb, vma);
 	mmu_notifier_invalidate_range_end(&range);
-	tlb_finish_mmu(&tlb);
-
 	return 0;
 }
@@ -848,10 +853,17 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
  * An interface that causes the system to free clean pages and flush
  * dirty pages is already available as msync(MS_INVALIDATE).
  */
-static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
+static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior,
+					struct vm_area_struct *vma,
 					unsigned long start, unsigned long end)
 {
-	zap_page_range_single(vma, start, end - start, NULL);
+	struct zap_details details = {
+		.reclaim_pt = true,
+		.even_cows = true,
+	};
+
+	zap_page_range_single_batched(
+			madv_behavior->tlb, vma, start, end - start, &details);
 	return 0;
 }
@@ -888,8 +900,9 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
 static long madvise_dontneed_free(struct vm_area_struct *vma,
 				  struct vm_area_struct **prev,
 				  unsigned long start, unsigned long end,
-				  int behavior)
+				  struct madvise_behavior *madv_behavior)
 {
+	int behavior = madv_behavior->behavior;
 	struct mm_struct *mm = vma->vm_mm;
 
 	*prev = vma;
@@ -928,13 +941,23 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
 			 */
 			end = vma->vm_end;
 		}
 
-		VM_WARN_ON(start >= end);
+		/*
+		 * If the memory region between start and end was
+		 * originally backed by 4kB pages and then remapped to
+		 * be backed by hugepages while mmap_lock was dropped,
+		 * the adjustment for hugetlb vma above may have rounded
+		 * end down to the start address.
+		 */
+		if (start == end)
+			return 0;
+		VM_WARN_ON(start > end);
 	}
 
 	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
-		return madvise_dontneed_single_vma(vma, start, end);
+		return madvise_dontneed_single_vma(
+				madv_behavior, vma, start, end);
 	else if (behavior == MADV_FREE)
-		return madvise_free_single_vma(vma, start, end);
+		return madvise_free_single_vma(madv_behavior, vma, start, end);
 	else
 		return -EINVAL;
 }
@@ -1037,13 +1060,7 @@ static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
 	if (!allow_locked)
 		disallowed |= VM_LOCKED;
 
-	if (!vma_is_anonymous(vma))
-		return false;
-
-	if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE)
-		return false;
-
-	return true;
+	return !(vma->vm_flags & disallowed);
 }
 
 static bool is_guard_pte_marker(pte_t ptent)
@@ -1241,8 +1258,10 @@ static long madvise_guard_remove(struct vm_area_struct *vma,
 static int madvise_vma_behavior(struct vm_area_struct *vma,
 				struct vm_area_struct **prev,
 				unsigned long start, unsigned long end,
-				unsigned long behavior)
+				void *behavior_arg)
 {
+	struct madvise_behavior *arg = behavior_arg;
+	int behavior = arg->behavior;
 	int error;
 	struct anon_vma_name *anon_name;
 	unsigned long new_flags = vma->vm_flags;
@@ -1262,7 +1281,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
 	case MADV_FREE:
 	case MADV_DONTNEED:
 	case MADV_DONTNEED_LOCKED:
-		return madvise_dontneed_free(vma, prev, start, end, behavior);
+		return madvise_dontneed_free(vma, prev, start, end, arg);
 	case MADV_NORMAL:
 		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
 		break;
@@ -1384,7 +1403,32 @@ static int madvise_inject_error(int behavior,
 	return 0;
 }
-#endif
+
+static bool is_memory_failure(int behavior)
+{
+	switch (behavior) {
+	case MADV_HWPOISON:
+	case MADV_SOFT_OFFLINE:
+		return true;
+	default:
+		return false;
+	}
+}
+
+#else
+
+static int madvise_inject_error(int behavior,
+		unsigned long start, unsigned long end)
+{
+	return 0;
+}
+
+static bool is_memory_failure(int behavior)
+{
+	return false;
+}
+
+#endif	/* CONFIG_MEMORY_FAILURE */
 
 static bool madvise_behavior_valid(int behavior)
@@ -1454,10 +1498,10 @@ static bool process_madvise_remote_valid(int behavior)
  */
 static
 int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
-		      unsigned long end, unsigned long arg,
+		      unsigned long end, void *arg,
 		      int (*visit)(struct vm_area_struct *vma,
				   struct vm_area_struct **prev, unsigned long start,
-				   unsigned long end, unsigned long arg))
+				   unsigned long end, void *arg))
 {
 	struct vm_area_struct *vma;
 	struct vm_area_struct *prev;
@@ -1515,7 +1559,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
 static int madvise_vma_anon_name(struct vm_area_struct *vma,
 				 struct vm_area_struct **prev,
 				 unsigned long start, unsigned long end,
-				 unsigned long anon_name)
+				 void *anon_name)
 {
 	int error;
 
@@ -1524,7 +1568,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma,
 		return -EBADF;
 
 	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
-				   (struct anon_vma_name *)anon_name);
+				   anon_name);
 
 	/*
 	 * madvise() returns EAGAIN if kernel resources, such as
@@ -1556,10 +1600,142 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 	if (end == start)
 		return 0;
 
-	return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
+	return madvise_walk_vmas(mm, start, end, anon_name,
 			madvise_vma_anon_name);
 }
 #endif /* CONFIG_ANON_VMA_NAME */
+
+static int madvise_lock(struct mm_struct *mm, int behavior)
+{
+	if (is_memory_failure(behavior))
+		return 0;
+
+	if (madvise_need_mmap_write(behavior)) {
+		if (mmap_write_lock_killable(mm))
+			return -EINTR;
+	} else {
+		mmap_read_lock(mm);
+	}
+	return 0;
+}
+
+static void madvise_unlock(struct mm_struct *mm, int behavior)
+{
+	if (is_memory_failure(behavior))
+		return;
+
+	if (madvise_need_mmap_write(behavior))
+		mmap_write_unlock(mm);
+	else
+		mmap_read_unlock(mm);
+}
+
+static bool madvise_batch_tlb_flush(int behavior)
+{
+	switch (behavior) {
+	case MADV_DONTNEED:
+	case MADV_DONTNEED_LOCKED:
+	case MADV_FREE:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static void madvise_init_tlb(struct madvise_behavior *madv_behavior,
+		struct mm_struct *mm)
+{
+	if (madvise_batch_tlb_flush(madv_behavior->behavior))
+		tlb_gather_mmu(madv_behavior->tlb, mm);
+}
+
+static void madvise_finish_tlb(struct madvise_behavior *madv_behavior)
+{
+	if (madvise_batch_tlb_flush(madv_behavior->behavior))
+		tlb_finish_mmu(madv_behavior->tlb);
+}
+
+static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
+{
+	size_t len;
+
+	if (!madvise_behavior_valid(behavior))
+		return false;
+
+	if (!PAGE_ALIGNED(start))
+		return false;
+	len = PAGE_ALIGN(len_in);
+
+	/* Check to see whether len was rounded up from small -ve to zero */
+	if (len_in && !len)
+		return false;
+
+	if (start + len < start)
+		return false;
+
+	return true;
+}
+
+/*
+ * madvise_should_skip() - Check if the request is invalid or a no-op.
+ * @start:	Start address of madvise-requested address range.
+ * @len_in:	Length of madvise-requested address range.
+ * @behavior:	Requested madvise behavior.
+ * @err:	Pointer to store an error code from the check.
+ *
+ * If the specified behaviour is invalid or nothing would occur, we skip the
+ * operation.  This function returns true in those cases, otherwise false.  In
+ * the former case we store an error in @err.
+ */
+static bool madvise_should_skip(unsigned long start, size_t len_in,
+		int behavior, int *err)
+{
+	if (!is_valid_madvise(start, len_in, behavior)) {
+		*err = -EINVAL;
+		return true;
+	}
+	if (start + PAGE_ALIGN(len_in) == start) {
+		*err = 0;
+		return true;
+	}
+	return false;
+}
+
+static bool is_madvise_populate(int behavior)
+{
+	switch (behavior) {
+	case MADV_POPULATE_READ:
+	case MADV_POPULATE_WRITE:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static int madvise_do_behavior(struct mm_struct *mm,
+		unsigned long start, size_t len_in,
+		struct madvise_behavior *madv_behavior)
+{
+	int behavior = madv_behavior->behavior;
+	struct blk_plug plug;
+	unsigned long end;
+	int error;
+
+	if (is_memory_failure(behavior))
+		return madvise_inject_error(behavior, start, start + len_in);
+	start = untagged_addr_remote(mm, start);
+	end = start + PAGE_ALIGN(len_in);
+
+	blk_start_plug(&plug);
+	if (is_madvise_populate(behavior))
+		error = madvise_populate(mm, start, end, behavior);
+	else
+		error = madvise_walk_vmas(mm, start, end, madv_behavior,
+				madvise_vma_behavior);
+	blk_finish_plug(&plug);
+	return error;
+}
+
 /*
  * The madvise(2) system call.
  *
@@ -1634,63 +1810,22 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
  */
 int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
 {
-	unsigned long end;
 	int error;
-	int write;
-	size_t len;
-	struct blk_plug plug;
-
-	if (!madvise_behavior_valid(behavior))
-		return -EINVAL;
-
-	if (!PAGE_ALIGNED(start))
-		return -EINVAL;
-	len = PAGE_ALIGN(len_in);
-
-	/* Check to see whether len was rounded up from small -ve to zero */
-	if (len_in && !len)
-		return -EINVAL;
-
-	end = start + len;
-	if (end < start)
-		return -EINVAL;
-
-	if (end == start)
-		return 0;
-
-#ifdef CONFIG_MEMORY_FAILURE
-	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
-		return madvise_inject_error(behavior, start, start + len_in);
-#endif
-
-	write = madvise_need_mmap_write(behavior);
-	if (write) {
-		if (mmap_write_lock_killable(mm))
-			return -EINTR;
-	} else {
-		mmap_read_lock(mm);
-	}
-
-	start = untagged_addr_remote(mm, start);
-	end = start + len;
-
-	blk_start_plug(&plug);
-	switch (behavior) {
-	case MADV_POPULATE_READ:
-	case MADV_POPULATE_WRITE:
-		error = madvise_populate(mm, start, end, behavior);
-		break;
-	default:
-		error = madvise_walk_vmas(mm, start, end, behavior,
-					  madvise_vma_behavior);
-		break;
-	}
-	blk_finish_plug(&plug);
+	struct mmu_gather tlb;
+	struct madvise_behavior madv_behavior = {
+		.behavior = behavior,
+		.tlb = &tlb,
+	};
 
-	if (write)
-		mmap_write_unlock(mm);
-	else
-		mmap_read_unlock(mm);
+	if (madvise_should_skip(start, len_in, behavior, &error))
+		return error;
+	error = madvise_lock(mm, behavior);
+	if (error)
+		return error;
+	madvise_init_tlb(&madv_behavior, mm);
+	error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
+	madvise_finish_tlb(&madv_behavior);
+	madvise_unlock(mm, behavior);
 
 	return error;
 }
@@ -1706,19 +1841,36 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 {
 	ssize_t ret = 0;
 	size_t total_len;
+	struct mmu_gather tlb;
+	struct madvise_behavior madv_behavior = {
+		.behavior = behavior,
+		.tlb = &tlb,
+	};
 
 	total_len = iov_iter_count(iter);
 
+	ret = madvise_lock(mm, behavior);
+	if (ret)
+		return ret;
+	madvise_init_tlb(&madv_behavior, mm);
+
 	while (iov_iter_count(iter)) {
-		ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter),
-				 iter_iov_len(iter), behavior);
+		unsigned long start = (unsigned long)iter_iov_addr(iter);
+		size_t len_in = iter_iov_len(iter);
+		int error;
+
+		if (madvise_should_skip(start, len_in, behavior, &error))
+			ret = error;
+		else
+			ret = madvise_do_behavior(mm, start, len_in,
+					&madv_behavior);
 		/*
 		 * An madvise operation is attempting to restart the syscall,
 		 * but we cannot proceed as it would not be correct to repeat
 		 * the operation in aggregate, and would be surprising to the
 		 * user.
 		 *
-		 * As we have already dropped locks, it is safe to just loop and
+		 * We drop and reacquire locks so it is safe to just loop and
 		 * try again. We check for fatal signals in case we need exit
 		 * early anyway.
 		 */
@@ -1727,13 +1879,24 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 				ret = -EINTR;
 				break;
 			}
+
+			/* Drop and reacquire lock to unwind race. */
+			madvise_finish_tlb(&madv_behavior);
+			madvise_unlock(mm, behavior);
+			ret = madvise_lock(mm, behavior);
+			if (ret)
+				goto out;
+			madvise_init_tlb(&madv_behavior, mm);
 			continue;
 		}
 		if (ret < 0)
 			break;
 		iov_iter_advance(iter, iter_iov_len(iter));
 	}
+	madvise_finish_tlb(&madv_behavior);
+	madvise_unlock(mm, behavior);
+
+out:
 	ret = (total_len - iov_iter_count(iter)) ? : ret;
 	return ret;
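
From user space the batching is most visible through process_madvise(), whose kernel-side handler is the vector_madvise() rewritten above: the lock is taken and the mmu_gather initialised once for the whole vector rather than once per entry. A hypothetical caller sketch, assuming a kernel carrying this patch where process_madvise() accepts MADV_DONTNEED; the raw syscall is used since not every libc ships a wrapper, and pidfd, iov and n are placeholders:

	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <sys/uio.h>
	#include <unistd.h>

	/*
	 * Discard several ranges of the target process with one call
	 * (and, with this patch, one batched TLB flush cycle).
	 */
	static long drop_ranges(int pidfd, struct iovec *iov, size_t n)
	{
		return syscall(SYS_process_madvise, pidfd, iov, n,
			       MADV_DONTNEED, 0);
	}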