summaryrefslogtreecommitdiff
path: root/mm/memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c240
1 files changed, 187 insertions, 53 deletions
diff --git a/mm/memory.c b/mm/memory.c
index cf6873e91c6a..d5d1653d60a6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1360,6 +1360,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
return i;
}
+/**
+ * get_user_pages() - pin user pages in memory
+ * @tsk: task_struct of target task
+ * @mm: mm_struct of target mm
+ * @start: starting user address
+ * @len: number of pages from start to pin
+ * @write: whether pages will be written to by the caller
+ * @force: whether to force write access even if user mapping is
+ * readonly. This will result in the page being COWed even
+ * in MAP_SHARED mappings. You do not want this.
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If len is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with. vmas will only
+ * remain valid while mmap_sem is held.
+ *
+ * Must be called with mmap_sem held for read or write.
+ *
+ * get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * get_user_pages returns, and there may even be a completely different
+ * page there in some cases (eg. if mmapped pagecache has been invalidated
+ * and subsequently re faulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If write=0, the page must not be written to. If the page is written to,
+ * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
+ * after the page is finished with, and before put_page is called.
+ *
+ * get_user_pages is typically used for fewer-copy IO operations, to get a
+ * handle on the memory by some means other than accesses via the user virtual
+ * addresses. The pages may be submitted for DMA to devices or accessed via
+ * their kernel linear mapping (via the kmap APIs). Care should be taken to
+ * use the correct cache flushing APIs.
+ *
+ * See also get_user_pages_fast, for performance critical applications.
+ */
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
struct page **pages, struct vm_area_struct **vmas)
@@ -1971,6 +2021,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
ret = tmp;
goto unwritable_page;
}
+ if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+ lock_page(old_page);
+ if (!old_page->mapping) {
+ ret = 0; /* retry the fault */
+ unlock_page(old_page);
+ goto unwritable_page;
+ }
+ } else
+ VM_BUG_ON(!PageLocked(old_page));
/*
* Since we dropped the lock we need to revalidate
@@ -1980,9 +2039,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
- page_cache_release(old_page);
- if (!pte_same(*page_table, orig_pte))
+ if (!pte_same(*page_table, orig_pte)) {
+ unlock_page(old_page);
+ page_cache_release(old_page);
goto unlock;
+ }
page_mkwrite = 1;
}
@@ -2094,9 +2155,6 @@ gotten:
unlock:
pte_unmap_unlock(page_table, ptl);
if (dirty_page) {
- if (vma->vm_file)
- file_update_time(vma->vm_file);
-
/*
* Yes, Virginia, this is actually required to prevent a race
* with clear_page_dirty_for_io() from clearing the page dirty
@@ -2105,16 +2163,41 @@ unlock:
*
* do_no_page is protected similarly.
*/
- wait_on_page_locked(dirty_page);
- set_page_dirty_balance(dirty_page, page_mkwrite);
+ if (!page_mkwrite) {
+ wait_on_page_locked(dirty_page);
+ set_page_dirty_balance(dirty_page, page_mkwrite);
+ }
put_page(dirty_page);
+ if (page_mkwrite) {
+ struct address_space *mapping = dirty_page->mapping;
+
+ set_page_dirty(dirty_page);
+ unlock_page(dirty_page);
+ page_cache_release(dirty_page);
+ if (mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ }
+ }
+
+ /* file_update_time outside page_lock */
+ if (vma->vm_file)
+ file_update_time(vma->vm_file);
}
return ret;
oom_free_new:
page_cache_release(new_page);
oom:
- if (old_page)
+ if (old_page) {
+ if (page_mkwrite) {
+ unlock_page(old_page);
+ page_cache_release(old_page);
+ }
page_cache_release(old_page);
+ }
return VM_FAULT_OOM;
unwritable_page:
@@ -2458,8 +2541,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
ret = VM_FAULT_OOM;
- unlock_page(page);
- goto out;
+ goto out_page;
}
/*
@@ -2521,6 +2603,7 @@ out:
out_nomap:
mem_cgroup_cancel_charge_swapin(ptr);
pte_unmap_unlock(page_table, ptl);
+out_page:
unlock_page(page);
page_cache_release(page);
return ret;
@@ -2664,27 +2747,22 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int tmp;
unlock_page(page);
- vmf.flags |= FAULT_FLAG_MKWRITE;
+ vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
if (unlikely(tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
ret = tmp;
- anon = 1; /* no anon but release vmf.page */
- goto out_unlocked;
- }
- lock_page(page);
- /*
- * XXX: this is not quite right (racy vs
- * invalidate) to unlock and relock the page
- * like this, however a better fix requires
- * reworking page_mkwrite locking API, which
- * is better done later.
- */
- if (!page->mapping) {
- ret = 0;
- anon = 1; /* no anon but release vmf.page */
- goto out;
+ goto unwritable_page;
}
+ if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+ lock_page(page);
+ if (!page->mapping) {
+ ret = 0; /* retry the fault */
+ unlock_page(page);
+ goto unwritable_page;
+ }
+ } else
+ VM_BUG_ON(!PageLocked(page));
page_mkwrite = 1;
}
}
@@ -2736,19 +2814,35 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(page_table, ptl);
out:
- unlock_page(vmf.page);
-out_unlocked:
- if (anon)
- page_cache_release(vmf.page);
- else if (dirty_page) {
- if (vma->vm_file)
- file_update_time(vma->vm_file);
+ if (dirty_page) {
+ struct address_space *mapping = page->mapping;
- set_page_dirty_balance(dirty_page, page_mkwrite);
+ if (set_page_dirty(dirty_page))
+ page_mkwrite = 1;
+ unlock_page(dirty_page);
put_page(dirty_page);
+ if (page_mkwrite && mapping) {
+ /*
+ * Some device drivers do not set page.mapping but still
+ * dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ }
+
+ /* file_update_time outside page_lock */
+ if (vma->vm_file)
+ file_update_time(vma->vm_file);
+ } else {
+ unlock_page(vmf.page);
+ if (anon)
+ page_cache_release(vmf.page);
}
return ret;
+
+unwritable_page:
+ page_cache_release(page);
+ return ret;
}
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -3009,22 +3103,13 @@ int in_gate_area_no_task(unsigned long addr)
#endif /* __HAVE_ARCH_GATE_AREA */
-#ifdef CONFIG_HAVE_IOREMAP_PROT
-int follow_phys(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags,
- unsigned long *prot, resource_size_t *phys)
+static int follow_pte(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, spinlock_t **ptlp)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- pte_t *ptep, pte;
- spinlock_t *ptl;
- resource_size_t phys_addr = 0;
- struct mm_struct *mm = vma->vm_mm;
- int ret = -EINVAL;
-
- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
- goto out;
+ pte_t *ptep;
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
@@ -3042,22 +3127,71 @@ int follow_phys(struct vm_area_struct *vma,
if (pmd_huge(*pmd))
goto out;
- ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+ ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!ptep)
goto out;
+ if (!pte_present(*ptep))
+ goto unlock;
+ *ptepp = ptep;
+ return 0;
+unlock:
+ pte_unmap_unlock(ptep, *ptlp);
+out:
+ return -EINVAL;
+}
+
+/**
+ * follow_pfn - look up PFN at a user virtual address
+ * @vma: memory mapping
+ * @address: user virtual address
+ * @pfn: location to store found PFN
+ *
+ * Only IO mappings and raw PFN mappings are allowed.
+ *
+ * Returns zero and the pfn at @pfn on success, -ve otherwise.
+ */
+int follow_pfn(struct vm_area_struct *vma, unsigned long address,
+ unsigned long *pfn)
+{
+ int ret = -EINVAL;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ return ret;
+
+ ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
+ if (ret)
+ return ret;
+ *pfn = pte_pfn(*ptep);
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+}
+EXPORT_SYMBOL(follow_pfn);
+
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+int follow_phys(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags,
+ unsigned long *prot, resource_size_t *phys)
+{
+ int ret = -EINVAL;
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ goto out;
+ if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
+ goto out;
pte = *ptep;
- if (!pte_present(pte))
- goto unlock;
+
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
- phys_addr = pte_pfn(pte);
- phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
*prot = pgprot_val(pte_pgprot(pte));
- *phys = phys_addr;
- ret = 0;
+ *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
+ ret = 0;
unlock:
pte_unmap_unlock(ptep, ptl);
out: