summaryrefslogtreecommitdiff
path: root/mm/madvise.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/madvise.c')
-rw-r--r--mm/madvise.c343
1 files changed, 253 insertions, 90 deletions
diff --git a/mm/madvise.c b/mm/madvise.c
index 0ceae57da7da..1d44a35ae85c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -48,6 +48,11 @@ struct madvise_walk_private {
bool pageout;
};
+struct madvise_behavior {
+ int behavior;
+ struct mmu_gather *tlb;
+};
+
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
* take mmap_lock for writing. Others, which simply traverse vmas, need
@@ -387,7 +392,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
folio = pmd_folio(orig_pmd);
/* Do not interfere with other mappings of this folio */
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
goto huge_unlock;
if (pageout_anon_only_filter && !folio_test_anon(folio))
@@ -486,7 +491,7 @@ restart:
if (nr < folio_nr_pages(folio)) {
int err;
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
continue;
if (pageout_anon_only_filter && !folio_test_anon(folio))
continue;
@@ -503,6 +508,7 @@ restart:
pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;
@@ -721,7 +727,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (nr < folio_nr_pages(folio)) {
int err;
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
continue;
if (!folio_trylock(folio))
continue;
@@ -736,6 +742,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
start_pte = pte;
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;
@@ -794,12 +801,13 @@ static const struct mm_walk_ops madvise_free_walk_ops = {
.walk_lock = PGWALK_RDLOCK,
};
-static int madvise_free_single_vma(struct vm_area_struct *vma,
+static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
+ struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr)
{
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
- struct mmu_gather tlb;
+ struct mmu_gather *tlb = madv_behavior->tlb;
/* MADV_FREE works for only anon vma at the moment */
if (!vma_is_anonymous(vma))
@@ -815,17 +823,14 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
range.start, range.end);
lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(&range);
- tlb_start_vma(&tlb, vma);
+ tlb_start_vma(tlb, vma);
walk_page_range(vma->vm_mm, range.start, range.end,
- &madvise_free_walk_ops, &tlb);
- tlb_end_vma(&tlb, vma);
+ &madvise_free_walk_ops, tlb);
+ tlb_end_vma(tlb, vma);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb);
-
return 0;
}
@@ -848,10 +853,17 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
* An interface that causes the system to free clean pages and flush
* dirty pages is already available as msync(MS_INVALIDATE).
*/
-static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
+static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior,
+ struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- zap_page_range_single(vma, start, end - start, NULL);
+ struct zap_details details = {
+ .reclaim_pt = true,
+ .even_cows = true,
+ };
+
+ zap_page_range_single_batched(
+ madv_behavior->tlb, vma, start, end - start, &details);
return 0;
}
@@ -888,8 +900,9 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
static long madvise_dontneed_free(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
- int behavior)
+ struct madvise_behavior *madv_behavior)
{
+ int behavior = madv_behavior->behavior;
struct mm_struct *mm = vma->vm_mm;
*prev = vma;
@@ -928,13 +941,23 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
*/
end = vma->vm_end;
}
- VM_WARN_ON(start >= end);
+ /*
+ * If the memory region between start and end was
+ * originally backed by 4kB pages and then remapped to
+ * be backed by hugepages while mmap_lock was dropped,
+ * the adjustment for hugetlb vma above may have rounded
+ * end down to the start address.
+ */
+ if (start == end)
+ return 0;
+ VM_WARN_ON(start > end);
}
if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
- return madvise_dontneed_single_vma(vma, start, end);
+ return madvise_dontneed_single_vma(
+ madv_behavior, vma, start, end);
else if (behavior == MADV_FREE)
- return madvise_free_single_vma(vma, start, end);
+ return madvise_free_single_vma(madv_behavior, vma, start, end);
else
return -EINVAL;
}
@@ -1037,13 +1060,7 @@ static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
if (!allow_locked)
disallowed |= VM_LOCKED;
- if (!vma_is_anonymous(vma))
- return false;
-
- if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE)
- return false;
-
- return true;
+ return !(vma->vm_flags & disallowed);
}
static bool is_guard_pte_marker(pte_t ptent)
@@ -1241,8 +1258,10 @@ static long madvise_guard_remove(struct vm_area_struct *vma,
static int madvise_vma_behavior(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
- unsigned long behavior)
+ void *behavior_arg)
{
+ struct madvise_behavior *arg = behavior_arg;
+ int behavior = arg->behavior;
int error;
struct anon_vma_name *anon_name;
unsigned long new_flags = vma->vm_flags;
@@ -1262,7 +1281,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
case MADV_FREE:
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
- return madvise_dontneed_free(vma, prev, start, end, behavior);
+ return madvise_dontneed_free(vma, prev, start, end, arg);
case MADV_NORMAL:
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
break;
@@ -1384,7 +1403,32 @@ static int madvise_inject_error(int behavior,
return 0;
}
-#endif
+
+static bool is_memory_failure(int behavior)
+{
+ switch (behavior) {
+ case MADV_HWPOISON:
+ case MADV_SOFT_OFFLINE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+#else
+
+static int madvise_inject_error(int behavior,
+ unsigned long start, unsigned long end)
+{
+ return 0;
+}
+
+static bool is_memory_failure(int behavior)
+{
+ return false;
+}
+
+#endif /* CONFIG_MEMORY_FAILURE */
static bool
madvise_behavior_valid(int behavior)
@@ -1454,10 +1498,10 @@ static bool process_madvise_remote_valid(int behavior)
*/
static
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
- unsigned long end, unsigned long arg,
+ unsigned long end, void *arg,
int (*visit)(struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
- unsigned long end, unsigned long arg))
+ unsigned long end, void *arg))
{
struct vm_area_struct *vma;
struct vm_area_struct *prev;
@@ -1515,7 +1559,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
static int madvise_vma_anon_name(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
- unsigned long anon_name)
+ void *anon_name)
{
int error;
@@ -1524,7 +1568,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma,
return -EBADF;
error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
- (struct anon_vma_name *)anon_name);
+ anon_name);
/*
* madvise() returns EAGAIN if kernel resources, such as
@@ -1556,10 +1600,142 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
if (end == start)
return 0;
- return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
+ return madvise_walk_vmas(mm, start, end, anon_name,
madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
+
+static int madvise_lock(struct mm_struct *mm, int behavior)
+{
+ if (is_memory_failure(behavior))
+ return 0;
+
+ if (madvise_need_mmap_write(behavior)) {
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+ } else {
+ mmap_read_lock(mm);
+ }
+ return 0;
+}
+
+static void madvise_unlock(struct mm_struct *mm, int behavior)
+{
+ if (is_memory_failure(behavior))
+ return;
+
+ if (madvise_need_mmap_write(behavior))
+ mmap_write_unlock(mm);
+ else
+ mmap_read_unlock(mm);
+}
+
+static bool madvise_batch_tlb_flush(int behavior)
+{
+ switch (behavior) {
+ case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
+ case MADV_FREE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static void madvise_init_tlb(struct madvise_behavior *madv_behavior,
+ struct mm_struct *mm)
+{
+ if (madvise_batch_tlb_flush(madv_behavior->behavior))
+ tlb_gather_mmu(madv_behavior->tlb, mm);
+}
+
+static void madvise_finish_tlb(struct madvise_behavior *madv_behavior)
+{
+ if (madvise_batch_tlb_flush(madv_behavior->behavior))
+ tlb_finish_mmu(madv_behavior->tlb);
+}
+
+static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
+{
+ size_t len;
+
+ if (!madvise_behavior_valid(behavior))
+ return false;
+
+ if (!PAGE_ALIGNED(start))
+ return false;
+ len = PAGE_ALIGN(len_in);
+
+ /* Check to see whether len was rounded up from small -ve to zero */
+ if (len_in && !len)
+ return false;
+
+ if (start + len < start)
+ return false;
+
+ return true;
+}
+
+/*
+ * madvise_should_skip() - Return if the request is invalid or nothing.
+ * @start: Start address of madvise-requested address range.
+ * @len_in: Length of madvise-requested address range.
+ * @behavior: Requested madvise behavor.
+ * @err: Pointer to store an error code from the check.
+ *
+ * If the specified behaviour is invalid or nothing would occur, we skip the
+ * operation. This function returns true in the cases, otherwise false. In
+ * the former case we store an error on @err.
+ */
+static bool madvise_should_skip(unsigned long start, size_t len_in,
+ int behavior, int *err)
+{
+ if (!is_valid_madvise(start, len_in, behavior)) {
+ *err = -EINVAL;
+ return true;
+ }
+ if (start + PAGE_ALIGN(len_in) == start) {
+ *err = 0;
+ return true;
+ }
+ return false;
+}
+
+static bool is_madvise_populate(int behavior)
+{
+ switch (behavior) {
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int madvise_do_behavior(struct mm_struct *mm,
+ unsigned long start, size_t len_in,
+ struct madvise_behavior *madv_behavior)
+{
+ int behavior = madv_behavior->behavior;
+ struct blk_plug plug;
+ unsigned long end;
+ int error;
+
+ if (is_memory_failure(behavior))
+ return madvise_inject_error(behavior, start, start + len_in);
+ start = untagged_addr_remote(mm, start);
+ end = start + PAGE_ALIGN(len_in);
+
+ blk_start_plug(&plug);
+ if (is_madvise_populate(behavior))
+ error = madvise_populate(mm, start, end, behavior);
+ else
+ error = madvise_walk_vmas(mm, start, end, madv_behavior,
+ madvise_vma_behavior);
+ blk_finish_plug(&plug);
+ return error;
+}
+
/*
* The madvise(2) system call.
*
@@ -1634,63 +1810,22 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
*/
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
- unsigned long end;
int error;
- int write;
- size_t len;
- struct blk_plug plug;
-
- if (!madvise_behavior_valid(behavior))
- return -EINVAL;
-
- if (!PAGE_ALIGNED(start))
- return -EINVAL;
- len = PAGE_ALIGN(len_in);
-
- /* Check to see whether len was rounded up from small -ve to zero */
- if (len_in && !len)
- return -EINVAL;
-
- end = start + len;
- if (end < start)
- return -EINVAL;
-
- if (end == start)
- return 0;
-
-#ifdef CONFIG_MEMORY_FAILURE
- if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
- return madvise_inject_error(behavior, start, start + len_in);
-#endif
-
- write = madvise_need_mmap_write(behavior);
- if (write) {
- if (mmap_write_lock_killable(mm))
- return -EINTR;
- } else {
- mmap_read_lock(mm);
- }
-
- start = untagged_addr_remote(mm, start);
- end = start + len;
-
- blk_start_plug(&plug);
- switch (behavior) {
- case MADV_POPULATE_READ:
- case MADV_POPULATE_WRITE:
- error = madvise_populate(mm, start, end, behavior);
- break;
- default:
- error = madvise_walk_vmas(mm, start, end, behavior,
- madvise_vma_behavior);
- break;
- }
- blk_finish_plug(&plug);
+ struct mmu_gather tlb;
+ struct madvise_behavior madv_behavior = {
+ .behavior = behavior,
+ .tlb = &tlb,
+ };
- if (write)
- mmap_write_unlock(mm);
- else
- mmap_read_unlock(mm);
+ if (madvise_should_skip(start, len_in, behavior, &error))
+ return error;
+ error = madvise_lock(mm, behavior);
+ if (error)
+ return error;
+ madvise_init_tlb(&madv_behavior, mm);
+ error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
+ madvise_finish_tlb(&madv_behavior);
+ madvise_unlock(mm, behavior);
return error;
}
@@ -1706,19 +1841,36 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
{
ssize_t ret = 0;
size_t total_len;
+ struct mmu_gather tlb;
+ struct madvise_behavior madv_behavior = {
+ .behavior = behavior,
+ .tlb = &tlb,
+ };
total_len = iov_iter_count(iter);
+ ret = madvise_lock(mm, behavior);
+ if (ret)
+ return ret;
+ madvise_init_tlb(&madv_behavior, mm);
+
while (iov_iter_count(iter)) {
- ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter),
- iter_iov_len(iter), behavior);
+ unsigned long start = (unsigned long)iter_iov_addr(iter);
+ size_t len_in = iter_iov_len(iter);
+ int error;
+
+ if (madvise_should_skip(start, len_in, behavior, &error))
+ ret = error;
+ else
+ ret = madvise_do_behavior(mm, start, len_in,
+ &madv_behavior);
/*
* An madvise operation is attempting to restart the syscall,
* but we cannot proceed as it would not be correct to repeat
* the operation in aggregate, and would be surprising to the
* user.
*
- * As we have already dropped locks, it is safe to just loop and
+ * We drop and reacquire locks so it is safe to just loop and
* try again. We check for fatal signals in case we need exit
* early anyway.
*/
@@ -1727,13 +1879,24 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
ret = -EINTR;
break;
}
+
+ /* Drop and reacquire lock to unwind race. */
+ madvise_finish_tlb(&madv_behavior);
+ madvise_unlock(mm, behavior);
+ ret = madvise_lock(mm, behavior);
+ if (ret)
+ goto out;
+ madvise_init_tlb(&madv_behavior, mm);
continue;
}
if (ret < 0)
break;
iov_iter_advance(iter, iter_iov_len(iter));
}
+ madvise_finish_tlb(&madv_behavior);
+ madvise_unlock(mm, behavior);
+out:
ret = (total_len - iov_iter_count(iter)) ? : ret;
return ret;