summaryrefslogtreecommitdiff
path: root/mm/memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c239
1 files changed, 191 insertions, 48 deletions
diff --git a/mm/memory.c b/mm/memory.c
index f65beaad319b..13ee83b43878 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -68,6 +68,7 @@
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
+#include <linux/oom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
@@ -215,12 +216,8 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
return true;
}
-/* tlb_gather_mmu
- * Called to initialize an (on-stack) mmu_gather structure for page-table
- * tear-down from @mm. The @fullmm argument is used when @mm is without
- * users and we're going to destroy the full address space (exit/execve).
- */
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
tlb->mm = mm;
@@ -275,10 +272,14 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
* Called at the end of the shootdown operation to free up any resources
* that were required.
*/
-void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+void arch_tlb_finish_mmu(struct mmu_gather *tlb,
+ unsigned long start, unsigned long end, bool force)
{
struct mmu_gather_batch *batch, *next;
+ if (force)
+ __tlb_adjust_range(tlb, start, end - start);
+
tlb_flush_mmu(tlb);
/* keep the page table cache within bounds */
@@ -398,6 +399,34 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
+/* tlb_gather_mmu
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm. The @fullmm argument is used when @mm is without
+ * users and we're going to destroy the full address space (exit/execve).
+ */
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ arch_tlb_gather_mmu(tlb, mm, start, end);
+ inc_tlb_flush_pending(tlb->mm);
+}
+
+void tlb_finish_mmu(struct mmu_gather *tlb,
+ unsigned long start, unsigned long end)
+{
+ /*
+ * If there are parallel threads are doing PTE changes on same range
+ * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
+ * flush by batching, a thread has stable TLB entry can fail to flush
+ * the TLB by observing pte_none|!pte_dirty, for example so flush TLB
+ * forcefully if we detect parallel PTE batching threads.
+ */
+ bool force = mm_tlb_flush_nested(tlb->mm);
+
+ arch_tlb_finish_mmu(tlb, start, end, force);
+ dec_tlb_flush_pending(tlb->mm);
+}
+
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
@@ -1484,8 +1513,20 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, start, end);
- for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
+ for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
unmap_single_vma(&tlb, vma, start, end, NULL);
+
+ /*
+ * zap_page_range does not specify whether mmap_sem should be
+ * held for read or write. That allows parallel zap_page_range
+ * operations to unmap a PTE and defer a flush meaning that
+ * this call observes pte_none and fails to flush the TLB.
+ * Rather than adding a complex API, ensure that no stale
+ * TLB entries exist when this call returns.
+ */
+ flush_tlb_range(vma, start, end);
+ }
+
mmu_notifier_invalidate_range_end(mm, start, end);
tlb_finish_mmu(&tlb, start, end);
}
@@ -1647,7 +1688,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
EXPORT_SYMBOL(vm_insert_page);
static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, pgprot_t prot)
+ pfn_t pfn, pgprot_t prot, bool mkwrite)
{
struct mm_struct *mm = vma->vm_mm;
int retval;
@@ -1659,14 +1700,35 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
if (!pte)
goto out;
retval = -EBUSY;
- if (!pte_none(*pte))
- goto out_unlock;
+ if (!pte_none(*pte)) {
+ if (mkwrite) {
+ /*
+ * For read faults on private mappings the PFN passed
+ * in may not match the PFN we have mapped if the
+ * mapped PFN is a writeable COW page. In the mkwrite
+ * case we are creating a writable PTE for a shared
+ * mapping and we expect the PFNs to match.
+ */
+ if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
+ goto out_unlock;
+ entry = *pte;
+ goto out_mkwrite;
+ } else
+ goto out_unlock;
+ }
/* Ok, finally just insert the thing.. */
if (pfn_t_devmap(pfn))
entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
else
entry = pte_mkspecial(pfn_t_pte(pfn, prot));
+
+out_mkwrite:
+ if (mkwrite) {
+ entry = pte_mkyoung(entry);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ }
+
set_pte_at(mm, addr, pte, entry);
update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
@@ -1737,14 +1799,15 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
- ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
+ ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
+ false);
return ret;
}
EXPORT_SYMBOL(vm_insert_pfn_prot);
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn)
+static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn, bool mkwrite)
{
pgprot_t pgprot = vma->vm_page_prot;
@@ -1773,10 +1836,24 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
page = pfn_to_page(pfn_t_to_pfn(pfn));
return insert_page(vma, addr, page, pgprot);
}
- return insert_pfn(vma, addr, pfn, pgprot);
+ return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
+}
+
+int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn)
+{
+ return __vm_insert_mixed(vma, addr, pfn, false);
+
}
EXPORT_SYMBOL(vm_insert_mixed);
+int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn)
+{
+ return __vm_insert_mixed(vma, addr, pfn, true);
+}
+EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
+
/*
* maps a range of physical memory into the requested pages. the old
* mappings are removed. any references to nonexistent pages results
@@ -2542,7 +2619,7 @@ static int do_wp_page(struct vm_fault *vmf)
* not dirty accountable.
*/
if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
- int total_mapcount;
+ int total_map_swapcount;
if (!trylock_page(vmf->page)) {
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2557,8 +2634,8 @@ static int do_wp_page(struct vm_fault *vmf)
}
put_page(vmf->page);
}
- if (reuse_swap_page(vmf->page, &total_mapcount)) {
- if (total_mapcount == 1) {
+ if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
+ if (total_map_swapcount == 1) {
/*
* The page is all ours. Move it to
* our anon_vma so the rmap code will
@@ -2675,16 +2752,23 @@ EXPORT_SYMBOL(unmap_mapping_range);
int do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page, *swapcache;
+ struct page *page = NULL, *swapcache;
struct mem_cgroup *memcg;
+ struct vma_swap_readahead swap_ra;
swp_entry_t entry;
pte_t pte;
int locked;
int exclusive = 0;
int ret = 0;
+ bool vma_readahead = swap_use_vma_readahead();
- if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
+ if (vma_readahead)
+ page = swap_readahead_detect(vmf, &swap_ra);
+ if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
+ if (page)
+ put_page(page);
goto out;
+ }
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
@@ -2700,10 +2784,16 @@ int do_swap_page(struct vm_fault *vmf)
goto out;
}
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
- page = lookup_swap_cache(entry);
+ if (!page)
+ page = lookup_swap_cache(entry, vma_readahead ? vma : NULL,
+ vmf->address);
if (!page) {
- page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
- vmf->address);
+ if (vma_readahead)
+ page = do_swap_page_readahead(entry,
+ GFP_HIGHUSER_MOVABLE, vmf, &swap_ra);
+ else
+ page = swapin_readahead(entry,
+ GFP_HIGHUSER_MOVABLE, vma, vmf->address);
if (!page) {
/*
* Back out if somebody else faulted in this pte
@@ -2865,6 +2955,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
struct page *page;
+ int ret = 0;
pte_t entry;
/* File mapping without ->vm_ops ? */
@@ -2897,6 +2988,9 @@ static int do_anonymous_page(struct vm_fault *vmf)
vmf->address, &vmf->ptl);
if (!pte_none(*vmf->pte))
goto unlock;
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto unlock;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2931,6 +3025,10 @@ static int do_anonymous_page(struct vm_fault *vmf)
if (!pte_none(*vmf->pte))
goto release;
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto release;
+
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2950,7 +3048,7 @@ setpte:
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
- return 0;
+ return ret;
release:
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
@@ -3224,7 +3322,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
int finish_fault(struct vm_fault *vmf)
{
struct page *page;
- int ret;
+ int ret = 0;
/* Did we COW the page? */
if ((vmf->flags & FAULT_FLAG_WRITE) &&
@@ -3232,7 +3330,15 @@ int finish_fault(struct vm_fault *vmf)
page = vmf->cow_page;
else
page = vmf->page;
- ret = alloc_set_pte(vmf, vmf->memcg, page);
+
+ /*
+ * check even for read faults because we might have lost our CoWed
+ * page
+ */
+ if (!(vmf->vma->vm_flags & VM_SHARED))
+ ret = check_stable_address_space(vmf->vma->vm_mm);
+ if (!ret)
+ ret = alloc_set_pte(vmf, vmf->memcg, page);
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
@@ -3872,19 +3978,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
mem_cgroup_oom_synchronize(false);
}
- /*
- * This mm has been already reaped by the oom reaper and so the
- * refault cannot be trusted in general. Anonymous refaults would
- * lose data and give a zero page instead e.g. This is especially
- * problem for use_mm() because regular tasks will just die and
- * the corrupted data will not be visible anywhere while kthread
- * will outlive the oom victim and potentially propagate the date
- * further.
- */
- if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
- && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
- ret = VM_FAULT_SIGBUS;
-
return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
@@ -3976,7 +4069,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
#endif /* __PAGETABLE_PMD_FOLDED */
static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+ unsigned long *start, unsigned long *end,
+ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -4003,17 +4097,29 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
if (!pmdpp)
goto out;
+ if (start && end) {
+ *start = address & PMD_MASK;
+ *end = *start + PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, *start, *end);
+ }
*ptlp = pmd_lock(mm, pmd);
if (pmd_huge(*pmd)) {
*pmdpp = pmd;
return 0;
}
spin_unlock(*ptlp);
+ if (start && end)
+ mmu_notifier_invalidate_range_end(mm, *start, *end);
}
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
+ if (start && end) {
+ *start = address & PAGE_MASK;
+ *end = *start + PAGE_SIZE;
+ mmu_notifier_invalidate_range_start(mm, *start, *end);
+ }
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
goto unlock;
@@ -4021,6 +4127,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
return 0;
unlock:
pte_unmap_unlock(ptep, *ptlp);
+ if (start && end)
+ mmu_notifier_invalidate_range_end(mm, *start, *end);
out:
return -EINVAL;
}
@@ -4032,20 +4140,21 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address,
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, ptepp, NULL,
- ptlp)));
+ !(res = __follow_pte_pmd(mm, address, NULL, NULL,
+ ptepp, NULL, ptlp)));
return res;
}
int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+ unsigned long *start, unsigned long *end,
pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
int res;
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp,
- ptlp)));
+ !(res = __follow_pte_pmd(mm, address, start, end,
+ ptepp, pmdpp, ptlp)));
return res;
}
EXPORT_SYMBOL(follow_pte_pmd);
@@ -4308,19 +4417,53 @@ static void clear_gigantic_page(struct page *page,
}
}
void clear_huge_page(struct page *page,
- unsigned long addr, unsigned int pages_per_huge_page)
+ unsigned long addr_hint, unsigned int pages_per_huge_page)
{
- int i;
+ int i, n, base, l;
+ unsigned long addr = addr_hint &
+ ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
clear_gigantic_page(page, addr, pages_per_huge_page);
return;
}
+ /* Clear sub-page to access last to keep its cache lines hot */
might_sleep();
- for (i = 0; i < pages_per_huge_page; i++) {
+ n = (addr_hint - addr) / PAGE_SIZE;
+ if (2 * n <= pages_per_huge_page) {
+ /* If sub-page to access in first half of huge page */
+ base = 0;
+ l = n;
+ /* Clear sub-pages at the end of huge page */
+ for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
+ cond_resched();
+ clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ }
+ } else {
+ /* If sub-page to access in second half of huge page */
+ base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
+ l = pages_per_huge_page - n;
+ /* Clear sub-pages at the begin of huge page */
+ for (i = 0; i < base; i++) {
+ cond_resched();
+ clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ }
+ }
+ /*
+ * Clear remaining sub-pages in left-right-left-right pattern
+ * towards the sub-page to access
+ */
+ for (i = 0; i < l; i++) {
+ int left_idx = base + i;
+ int right_idx = base + 2 * l - 1 - i;
+
+ cond_resched();
+ clear_user_highpage(page + left_idx,
+ addr + left_idx * PAGE_SIZE);
cond_resched();
- clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ clear_user_highpage(page + right_idx,
+ addr + right_idx * PAGE_SIZE);
}
}