Diffstat (limited to 'mm/memory.c')
-rw-r--r--   mm/memory.c   188
1 file changed, 130 insertions, 58 deletions
diff --git a/mm/memory.c b/mm/memory.c
index e0a9b0ce4f10..ba94dec5b259 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,6 +69,10 @@
#include "internal.h"
+#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
+#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
+#endif
+
#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
@@ -184,10 +188,14 @@ static int tlb_next_batch(struct mmu_gather *tlb)
return 1;
}
+ if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
+ return 0;
+
batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
if (!batch)
return 0;
+ tlb->batch_count++;
batch->next = NULL;
batch->nr = 0;
batch->max = MAX_GATHER_BATCH;
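
Note: the new batch_count check bounds how much memory a single mmu_gather can pin while tearing down a large mapping. A minimal standalone sketch of that bound, with assumed values for PAGE_SIZE and the MAX_GATHER_BATCH* constants (the real definitions live in include/asm-generic/tlb.h and depend on the batch header size):

#include <stdio.h>

/* Assumed values, for illustration only: 4K pages, ~32 bytes of
 * mmu_gather_batch header, and a cap of roughly 10000 gathered pages. */
#define PAGE_SIZE              4096UL
#define MAX_GATHER_BATCH       ((PAGE_SIZE - 32) / sizeof(void *))
#define MAX_GATHER_BATCH_COUNT (10000UL / MAX_GATHER_BATCH)

int main(void)
{
	/* Worst case a single gather can accumulate before tlb_next_batch()
	 * starts returning 0 and forces a flush. */
	printf("batches: %lu, pages: ~%lu, batch memory: ~%lu KiB\n",
	       (unsigned long)MAX_GATHER_BATCH_COUNT,
	       (unsigned long)(MAX_GATHER_BATCH_COUNT * MAX_GATHER_BATCH),
	       (unsigned long)(MAX_GATHER_BATCH_COUNT * PAGE_SIZE / 1024));
	return 0;
}
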
@@ -208,6 +216,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
tlb->mm = mm;
tlb->fullmm = fullmm;
+ tlb->need_flush_all = 0;
tlb->start = -1UL;
tlb->end = 0;
tlb->need_flush = 0;
@@ -216,6 +225,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
tlb->local.nr = 0;
tlb->local.max = ARRAY_SIZE(tlb->__pages);
tlb->active = &tlb->local;
+ tlb->batch_count = 0;
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb->batch = NULL;
@@ -711,7 +721,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
(unsigned long)vma->vm_file->f_op->mmap);
dump_stack();
- add_taint(TAINT_BAD_PAGE);
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
static inline bool is_cow_mapping(vm_flags_t flags)
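
Note: the extra argument comes from the 3.9 taint rework, where add_taint() now states whether lockdep output can still be trusted after the taint. A hedged sketch of how callers pick the flag (the enum values are LOCKDEP_STILL_OK and LOCKDEP_NOW_UNRELIABLE, as best I recall):

/* Bad page state implies corrupted page tables, so lockdep reports
 * can no longer be trusted either. */
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

/* A taint that does not affect locking correctness keeps lockdep on. */
add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
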
@@ -1453,10 +1463,11 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
EXPORT_SYMBOL_GPL(zap_vma_ptes);
/**
- * follow_page - look up a page descriptor from a user-virtual address
+ * follow_page_mask - look up a page descriptor from a user-virtual address
* @vma: vm_area_struct mapping @address
* @address: virtual address to look up
* @flags: flags modifying lookup behaviour
+ * @page_mask: on output, *page_mask is set according to the size of the page
*
* @flags can have FOLL_ flags set, defined in <linux/mm.h>
*
@@ -1464,8 +1475,9 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes);
* an error pointer if there is a mapping to something not represented
* by a page descriptor (see also vm_normal_page()).
*/
-struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
- unsigned int flags)
+struct page *follow_page_mask(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags,
+ unsigned int *page_mask)
{
pgd_t *pgd;
pud_t *pud;
@@ -1475,6 +1487,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
struct page *page;
struct mm_struct *mm = vma->vm_mm;
+ *page_mask = 0;
+
page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
if (!IS_ERR(page)) {
BUG_ON(flags & FOLL_GET);
@@ -1521,6 +1535,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
page = follow_trans_huge_pmd(vma, address,
pmd, flags);
spin_unlock(&mm->page_table_lock);
+ *page_mask = HPAGE_PMD_NR - 1;
goto out;
}
} else
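
Note: callers that do not care about the page size keep the old follow_page() interface; a thin wrapper along the following lines (presumably in include/linux/mm.h, not shown in this diff) simply discards the mask:

static inline struct page *follow_page(struct vm_area_struct *vma,
		unsigned long address, unsigned int foll_flags)
{
	unsigned int unused_page_mask;

	return follow_page_mask(vma, address, foll_flags, &unused_page_mask);
}
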
@@ -1534,8 +1549,24 @@ split_fallthrough:
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
pte = *ptep;
- if (!pte_present(pte))
- goto no_page;
+ if (!pte_present(pte)) {
+ swp_entry_t entry;
+ /*
+ * KSM's break_ksm() relies upon recognizing a ksm page
+ * even while it is being migrated, so for that case we
+ * need migration_entry_wait().
+ */
+ if (likely(!(flags & FOLL_MIGRATION)))
+ goto no_page;
+ if (pte_none(pte) || pte_file(pte))
+ goto no_page;
+ entry = pte_to_swp_entry(pte);
+ if (!is_migration_entry(entry))
+ goto no_page;
+ pte_unmap_unlock(ptep, ptl);
+ migration_entry_wait(mm, pmd, address);
+ goto split_fallthrough;
+ }
if ((flags & FOLL_NUMA) && pte_numa(pte))
goto no_page;
if ((flags & FOLL_WRITE) && !pte_write(pte))
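
Note: the comment above names KSM's break_ksm() as the user of this path. As a sketch of the caller side, it is expected to pass the new FOLL_MIGRATION flag so a page under migration is waited for rather than treated as absent (the break_ksm() change itself lives in mm/ksm.c, outside this diff):

/* Sketch of the expected call in break_ksm(), mm/ksm.c: */
page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
if (IS_ERR_OR_NULL(page))
	break;
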
@@ -1668,15 +1699,16 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
* instead of __get_user_pages. __get_user_pages should be used only if
* you need some special @gup_flags.
*/
-int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, int nr_pages, unsigned int gup_flags,
- struct page **pages, struct vm_area_struct **vmas,
- int *nonblocking)
+long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *nonblocking)
{
- int i;
+ long i;
unsigned long vm_flags;
+ unsigned int page_mask;
- if (nr_pages <= 0)
+ if (!nr_pages)
return 0;
VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
@@ -1752,6 +1784,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
get_page(page);
}
pte_unmap(pte);
+ page_mask = 0;
goto next_page;
}
@@ -1769,6 +1802,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
do {
struct page *page;
unsigned int foll_flags = gup_flags;
+ unsigned int page_increm;
/*
* If we have a pending SIGKILL, don't keep faulting
@@ -1778,7 +1812,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
return i ? i : -ERESTARTSYS;
cond_resched();
- while (!(page = follow_page(vma, start, foll_flags))) {
+ while (!(page = follow_page_mask(vma, start,
+ foll_flags, &page_mask))) {
int ret;
unsigned int fault_flags = 0;
@@ -1852,13 +1887,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
flush_anon_page(vma, page, start);
flush_dcache_page(page);
+ page_mask = 0;
}
next_page:
- if (vmas)
+ if (vmas) {
vmas[i] = vma;
- i++;
- start += PAGE_SIZE;
- nr_pages--;
+ page_mask = 0;
+ }
+ page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
+ if (page_increm > nr_pages)
+ page_increm = nr_pages;
+ i += page_increm;
+ start += page_increm * PAGE_SIZE;
+ nr_pages -= page_increm;
} while (nr_pages && start < vma->vm_end);
} while (nr_pages);
return i;
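
Note: the page_increm expression is the core of the gup speed-up: when follow_page_mask() reports a huge page, the loop advances by all of the remaining subpages at once instead of one PAGE_SIZE per iteration. A standalone worked example of the arithmetic, assuming 4K base pages and 2M THP (so page_mask = 511):

#include <stdio.h>

/* Assumed geometry for illustration: 4K base pages, 2M huge pages. */
#define PAGE_SHIFT   12
#define HPAGE_PMD_NR 512

int main(void)
{
	/* Address 37 base pages into a 2M-aligned transparent huge page. */
	unsigned long start = (2UL << 20) + (37UL << PAGE_SHIFT);
	unsigned int page_mask = HPAGE_PMD_NR - 1;	/* as set by follow_page_mask() */

	/* Same expression as in __get_user_pages(): number of base pages
	 * from 'start' to the end of the huge page, inclusive. */
	unsigned long page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);

	printf("page_increm = %lu\n", page_increm);	/* prints 475 = 512 - 37 */
	return 0;
}
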
@@ -1972,9 +2013,9 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
*
* See also get_user_pages_fast, for performance critical applications.
*/
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, int nr_pages, int write, int force,
- struct page **pages, struct vm_area_struct **vmas)
+long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages, int write,
+ int force, struct page **pages, struct vm_area_struct **vmas)
{
int flags = FOLL_TOUCH;
@@ -2352,6 +2393,53 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(remap_pfn_range);
+/**
+ * vm_iomap_memory - remap memory to userspace
+ * @vma: user vma to map to
+ * @start: start of area
+ * @len: size of area
+ *
+ * This is a simplified io_remap_pfn_range() for common driver use. The
+ * driver just needs to give us the physical memory range to be mapped,
+ * we'll figure out the rest from the vma information.
+ *
+ * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
+ * whatever write-combining details or similar.
+ */
+int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
+{
+ unsigned long vm_len, pfn, pages;
+
+ /* Check that the physical memory area passed in looks valid */
+ if (start + len < start)
+ return -EINVAL;
+ /*
+ * You *really* shouldn't map things that aren't page-aligned,
+ * but we've historically allowed it because IO memory might
+ * just have smaller alignment.
+ */
+ len += start & ~PAGE_MASK;
+ pfn = start >> PAGE_SHIFT;
+ pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
+ if (pfn + pages < pfn)
+ return -EINVAL;
+
+ /* We start the mapping 'vm_pgoff' pages into the area */
+ if (vma->vm_pgoff > pages)
+ return -EINVAL;
+ pfn += vma->vm_pgoff;
+ pages -= vma->vm_pgoff;
+
+ /* Can we fit all of the mapping? */
+ vm_len = vma->vm_end - vma->vm_start;
+ if (vm_len >> PAGE_SHIFT > pages)
+ return -EINVAL;
+
+ /* Ok, let it rip */
+ return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_iomap_memory);
+
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end,
pte_fn_t fn, void *data)
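
Note: a typical consumer of the new helper, sketched as a driver mmap handler; mydev_phys_base and mydev_mmio_len are made-up names standing in for whatever the driver knows about its BAR or framebuffer:

/* Hypothetical device description, for illustration only. */
static phys_addr_t mydev_phys_base;	/* e.g. a PCI BAR or framebuffer base */
static unsigned long mydev_mmio_len;	/* size of that region in bytes */

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* Optional: pick caching attributes before mapping. */
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

	/* vm_iomap_memory() validates vm_pgoff and the vma size, then
	 * hands the aligned range to io_remap_pfn_range(). */
	return vm_iomap_memory(vma, mydev_phys_base, mydev_mmio_len);
}
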
@@ -2914,7 +3002,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned int flags, pte_t orig_pte)
{
spinlock_t *ptl;
- struct page *page, *swapcache = NULL;
+ struct page *page, *swapcache;
swp_entry_t entry;
pte_t pte;
int locked;
@@ -2965,9 +3053,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
ret = VM_FAULT_HWPOISON;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ swapcache = page;
goto out_release;
}
+ swapcache = page;
locked = lock_page_or_retry(page, mm, flags);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2985,16 +3075,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
goto out_page;
- if (ksm_might_need_to_copy(page, vma, address)) {
- swapcache = page;
- page = ksm_does_need_to_copy(page, vma, address);
-
- if (unlikely(!page)) {
- ret = VM_FAULT_OOM;
- page = swapcache;
- swapcache = NULL;
- goto out_page;
- }
+ page = ksm_might_need_to_copy(page, vma, address);
+ if (unlikely(!page)) {
+ ret = VM_FAULT_OOM;
+ page = swapcache;
+ goto out_page;
}
if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
@@ -3039,7 +3124,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
flush_icache_page(vma, page);
set_pte_at(mm, address, page_table, pte);
- do_page_add_anon_rmap(page, vma, address, exclusive);
+ if (page == swapcache)
+ do_page_add_anon_rmap(page, vma, address, exclusive);
+ else /* ksm created a completely new copy */
+ page_add_new_anon_rmap(page, vma, address);
/* It's better to call commit-charge after rmap is established */
mem_cgroup_commit_charge_swapin(page, ptr);
@@ -3047,7 +3135,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
- if (swapcache) {
+ if (page != swapcache) {
/*
* Hold the lock to avoid the swap entry to be reused
* until we take the PT lock for the pte_same() check
@@ -3080,7 +3168,7 @@ out_page:
unlock_page(page);
out_release:
page_cache_release(page);
- if (swapcache) {
+ if (page != swapcache) {
unlock_page(swapcache);
page_cache_release(swapcache);
}
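
Note: the helper's reworked contract (its body is in mm/ksm.c, not in this diff), as inferred from the call sites above, expressed as an annotated declaration:

/*
 * Assumed behaviour, inferred from do_swap_page():
 *   - returns @page itself when no KSM copy is needed;
 *   - returns a freshly allocated private copy when @page is a KSM
 *     page that cannot be mapped as-is;
 *   - returns NULL when a copy was needed but allocation failed, in
 *     which case the caller reports VM_FAULT_OOM.
 * Either way the caller keeps 'swapcache' pointing at the original
 * page, so the unlock/release paths can test "page != swapcache".
 */
struct page *ksm_might_need_to_copy(struct page *page,
			struct vm_area_struct *vma, unsigned long address);
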
@@ -3706,6 +3794,14 @@ retry:
if (pmd_trans_huge(orig_pmd)) {
unsigned int dirty = flags & FAULT_FLAG_WRITE;
+ /*
+ * If the pmd is splitting, return and retry the
+ * fault. Alternative: wait until the split
+ * is done, and goto retry.
+ */
+ if (pmd_trans_splitting(orig_pmd))
+ return 0;
+
if (pmd_numa(orig_pmd))
return do_huge_pmd_numa_page(mm, vma, address,
orig_pmd, pmd);
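
Note: the comment's "alternative" would look roughly like the sketch below (not what this patch does); it leans on the retry: label visible in the hunk header and on the existing wait_split_huge_page() helper:

/* Alternative sketch only: block until the split completes, then retry
 * the fault locally instead of returning 0 to the arch fault handler. */
if (pmd_trans_splitting(orig_pmd)) {
	wait_split_huge_page(vma->anon_vma, pmd);
	goto retry;
}
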
@@ -3808,30 +3904,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
}
#endif /* __PAGETABLE_PMD_FOLDED */
-int make_pages_present(unsigned long addr, unsigned long end)
-{
- int ret, len, write;
- struct vm_area_struct * vma;
-
- vma = find_vma(current->mm, addr);
- if (!vma)
- return -ENOMEM;
- /*
- * We want to touch writable mappings with a write fault in order
- * to break COW, except for shared mappings because these don't COW
- * and we would not want to dirty them for nothing.
- */
- write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
- BUG_ON(addr >= end);
- BUG_ON(end > vma->vm_end);
- len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
- ret = get_user_pages(current, current->mm, addr,
- len, write, 0, NULL, NULL);
- if (ret < 0)
- return ret;
- return ret == len ? 0 : -EFAULT;
-}
-
#if !defined(__HAVE_ARCH_GATE_AREA)
#if defined(AT_SYSINFO_EHDR)