summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatt Mackall <mpm@selenic.com>2008-02-05 09:29:04 +0300
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2008-02-05 20:44:16 +0300
commit85863e475e59afb027b0113290e3796ee6020b7d (patch)
tree047cc687b98c0261bd3c083f17c090fbf082355f
parenta6198797cc3fd659b2d81cdf6bb6b9bba9cd93e9 (diff)
downloadlinux-85863e475e59afb027b0113290e3796ee6020b7d.tar.xz
maps4: add /proc/pid/pagemap interface
This interface provides a mapping for each page in an address space to its physical page frame number, allowing precise determination of what pages are mapped and what pages are shared between processes. New in this version: - headers gone again (as recommended by Dave Hansen and Alan Cox) - 64-bit entries (as per discussion with Andi Kleen) - swap pte information exported (from Dave Hansen) - page walker callback for holes (from Dave Hansen) - direct put_user I/O (as suggested by Rusty Russell) This patch folds in cleanups and swap PTE support from Dave Hansen <haveblue@us.ibm.com>. Signed-off-by: Matt Mackall <mpm@selenic.com> Cc: Dave Hansen <haveblue@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--fs/proc/base.c4
-rw-r--r--fs/proc/internal.h2
-rw-r--r--fs/proc/task_mmu.c200
3 files changed, 204 insertions, 2 deletions
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1bd646d3fe9a..9004db04efa0 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -783,7 +783,7 @@ out_no_task:
}
#endif
-static loff_t mem_lseek(struct file * file, loff_t offset, int orig)
+loff_t mem_lseek(struct file *file, loff_t offset, int orig)
{
switch (orig) {
case 0:
@@ -2252,6 +2252,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_MMU
REG("clear_refs", S_IWUSR, clear_refs),
REG("smaps", S_IRUGO, smaps),
+ REG("pagemap", S_IRUSR, pagemap),
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, attr_dir),
@@ -2580,6 +2581,7 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_MMU
REG("clear_refs", S_IWUSR, clear_refs),
REG("smaps", S_IRUGO, smaps),
+ REG("pagemap", S_IRUSR, pagemap),
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, attr_dir),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index ddfaeec37492..7d57e8069924 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -52,11 +52,13 @@ extern int proc_tid_stat(struct task_struct *, char *);
extern int proc_tgid_stat(struct task_struct *, char *);
extern int proc_pid_status(struct task_struct *, char *);
extern int proc_pid_statm(struct task_struct *, char *);
+extern loff_t mem_lseek(struct file *file, loff_t offset, int orig);
extern const struct file_operations proc_maps_operations;
extern const struct file_operations proc_numa_maps_operations;
extern const struct file_operations proc_smaps_operations;
extern const struct file_operations proc_clear_refs_operations;
+extern const struct file_operations proc_pagemap_operations;
void free_proc_entry(struct proc_dir_entry *de);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 308fc5451e43..bbd9b145051d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -5,7 +5,10 @@
#include <linux/highmem.h>
#include <linux/ptrace.h>
#include <linux/pagemap.h>
+#include <linux/ptrace.h>
#include <linux/mempolicy.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#include <asm/elf.h>
#include <asm/uaccess.h>
@@ -519,6 +522,202 @@ const struct file_operations proc_clear_refs_operations = {
.write = clear_refs_write,
};
+struct pagemapread {
+ char __user *out, *end;
+};
+
+#define PM_ENTRY_BYTES sizeof(u64)
+#define PM_RESERVED_BITS 3
+#define PM_RESERVED_OFFSET (64 - PM_RESERVED_BITS)
+#define PM_RESERVED_MASK (((1LL<<PM_RESERVED_BITS)-1) << PM_RESERVED_OFFSET)
+#define PM_SPECIAL(nr) (((nr) << PM_RESERVED_OFFSET) | PM_RESERVED_MASK)
+#define PM_NOT_PRESENT PM_SPECIAL(1LL)
+#define PM_SWAP PM_SPECIAL(2LL)
+#define PM_END_OF_BUFFER 1
+
+static int add_to_pagemap(unsigned long addr, u64 pfn,
+ struct pagemapread *pm)
+{
+ /*
+ * Make sure there's room in the buffer for an
+ * entire entry. Otherwise, only copy part of
+ * the pfn.
+ */
+ if (pm->out + PM_ENTRY_BYTES >= pm->end) {
+ if (copy_to_user(pm->out, &pfn, pm->end - pm->out))
+ return -EFAULT;
+ pm->out = pm->end;
+ return PM_END_OF_BUFFER;
+ }
+
+ if (put_user(pfn, pm->out))
+ return -EFAULT;
+ pm->out += PM_ENTRY_BYTES;
+ return 0;
+}
+
+static int pagemap_pte_hole(unsigned long start, unsigned long end,
+ void *private)
+{
+ struct pagemapread *pm = private;
+ unsigned long addr;
+ int err = 0;
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ err = add_to_pagemap(addr, PM_NOT_PRESENT, pm);
+ if (err)
+ break;
+ }
+ return err;
+}
+
+u64 swap_pte_to_pagemap_entry(pte_t pte)
+{
+ swp_entry_t e = pte_to_swp_entry(pte);
+ return PM_SWAP | swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
+}
+
+static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ void *private)
+{
+ struct pagemapread *pm = private;
+ pte_t *pte;
+ int err = 0;
+
+ for (; addr != end; addr += PAGE_SIZE) {
+ u64 pfn = PM_NOT_PRESENT;
+ pte = pte_offset_map(pmd, addr);
+ if (is_swap_pte(*pte))
+ pfn = swap_pte_to_pagemap_entry(*pte);
+ else if (pte_present(*pte))
+ pfn = pte_pfn(*pte);
+ /* unmap so we're not in atomic when we copy to userspace */
+ pte_unmap(pte);
+ err = add_to_pagemap(addr, pfn, pm);
+ if (err)
+ return err;
+ }
+
+ cond_resched();
+
+ return err;
+}
+
+static struct mm_walk pagemap_walk = {
+ .pmd_entry = pagemap_pte_range,
+ .pte_hole = pagemap_pte_hole
+};
+
+/*
+ * /proc/pid/pagemap - an array mapping virtual pages to pfns
+ *
+ * For each page in the address space, this file contains one 64-bit
+ * entry representing the corresponding physical page frame number
+ * (PFN) if the page is present. If there is a swap entry for the
+ * physical page, then an encoding of the swap file number and the
+ * page's offset into the swap file are returned. If no page is
+ * present at all, PM_NOT_PRESENT is returned. This allows determining
+ * precisely which pages are mapped (or in swap) and comparing mapped
+ * pages between processes.
+ *
+ * Efficient users of this interface will use /proc/pid/maps to
+ * determine which areas of memory are actually mapped and llseek to
+ * skip over unmapped regions.
+ */
+static ssize_t pagemap_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+ struct page **pages, *page;
+ unsigned long uaddr, uend;
+ struct mm_struct *mm;
+ struct pagemapread pm;
+ int pagecount;
+ int ret = -ESRCH;
+
+ if (!task)
+ goto out;
+
+ ret = -EACCES;
+ if (!ptrace_may_attach(task))
+ goto out;
+
+ ret = -EINVAL;
+ /* file position must be aligned */
+ if (*ppos % PM_ENTRY_BYTES)
+ goto out;
+
+ ret = 0;
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out;
+
+ ret = -ENOMEM;
+ uaddr = (unsigned long)buf & PAGE_MASK;
+ uend = (unsigned long)(buf + count);
+ pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
+ pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ goto out_task;
+
+ down_read(&current->mm->mmap_sem);
+ ret = get_user_pages(current, current->mm, uaddr, pagecount,
+ 1, 0, pages, NULL);
+ up_read(&current->mm->mmap_sem);
+
+ if (ret < 0)
+ goto out_free;
+
+ pm.out = buf;
+ pm.end = buf + count;
+
+ if (!ptrace_may_attach(task)) {
+ ret = -EIO;
+ } else {
+ unsigned long src = *ppos;
+ unsigned long svpfn = src / PM_ENTRY_BYTES;
+ unsigned long start_vaddr = svpfn << PAGE_SHIFT;
+ unsigned long end_vaddr = TASK_SIZE_OF(task);
+
+ /* watch out for wraparound */
+ if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
+ start_vaddr = end_vaddr;
+
+ /*
+ * The odds are that this will stop walking way
+ * before end_vaddr, because the length of the
+ * user buffer is tracked in "pm", and the walk
+ * will stop when we hit the end of the buffer.
+ */
+ ret = walk_page_range(mm, start_vaddr, end_vaddr,
+ &pagemap_walk, &pm);
+ if (ret == PM_END_OF_BUFFER)
+ ret = 0;
+ /* don't need mmap_sem for these, but this looks cleaner */
+ *ppos += pm.out - buf;
+ if (!ret)
+ ret = pm.out - buf;
+ }
+
+ for (; pagecount; pagecount--) {
+ page = pages[pagecount-1];
+ if (!PageReserved(page))
+ SetPageDirty(page);
+ page_cache_release(page);
+ }
+ mmput(mm);
+out_free:
+ kfree(pages);
+out_task:
+ put_task_struct(task);
+out:
+ return ret;
+}
+
+const struct file_operations proc_pagemap_operations = {
+ .llseek = mem_lseek, /* borrow this */
+ .read = pagemap_read,
+};
+
#ifdef CONFIG_NUMA
extern int show_numa_map(struct seq_file *m, void *v);
@@ -552,4 +751,3 @@ const struct file_operations proc_numa_maps_operations = {
.release = seq_release_private,
};
#endif
-