summaryrefslogtreecommitdiff
path: root/fs/proc
diff options
context:
space:
mode:
Diffstat (limited to 'fs/proc')
-rw-r--r-- fs/proc/base.c 3
-rw-r--r-- fs/proc/kcore.c 85
-rw-r--r-- fs/proc/meminfo.c 13
-rw-r--r-- fs/proc/page.c 9
-rw-r--r-- fs/proc/proc_sysctl.c 88
-rw-r--r-- fs/proc/task_mmu.c 3
6 files changed, 119 insertions, 82 deletions
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e34e06091775..05452c3b9872 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -96,6 +96,7 @@
#include <linux/time_namespace.h>
#include <linux/resctrl.h>
#include <linux/cn_proc.h>
+#include <linux/ksm.h>
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"
@@ -3206,6 +3207,8 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
mm = get_task_mm(task);
if (mm) {
seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
+ seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
+ seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
mmput(mm);
}
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 71157ee35c1a..25b44b303b35 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -24,7 +24,7 @@
#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/uaccess.h>
+#include <linux/uio.h>
#include <asm/io.h>
#include <linux/list.h>
#include <linux/ioport.h>
@@ -307,10 +307,9 @@ static void append_kcore_note(char *notes, size_t *i, const char *name,
*i = ALIGN(*i + descsz, 4);
}
-static ssize_t
-read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
+static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- char *buf = file->private_data;
+ loff_t *fpos = &iocb->ki_pos;
size_t phdrs_offset, notes_offset, data_offset;
size_t page_offline_frozen = 1;
size_t phdrs_len, notes_len;
@@ -318,6 +317,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
size_t tsz;
int nphdr;
unsigned long start;
+ size_t buflen = iov_iter_count(iter);
size_t orig_buflen = buflen;
int ret = 0;
@@ -356,12 +356,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
};
tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos);
- if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) {
+ if (copy_to_iter((char *)&ehdr + *fpos, tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
- buffer += tsz;
buflen -= tsz;
*fpos += tsz;
}
@@ -398,15 +397,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
}
tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos);
- if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset,
- tsz)) {
+ if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz,
+ iter) != tsz) {
kfree(phdrs);
ret = -EFAULT;
goto out;
}
kfree(phdrs);
- buffer += tsz;
buflen -= tsz;
*fpos += tsz;
}
@@ -448,14 +446,13 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
min(vmcoreinfo_size, notes_len - i));
tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
- if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
+ if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) {
kfree(notes);
ret = -EFAULT;
goto out;
}
kfree(notes);
- buffer += tsz;
buflen -= tsz;
*fpos += tsz;
}
@@ -497,7 +494,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
}
if (!m) {
- if (clear_user(buffer, tsz)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -506,16 +503,33 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
switch (m->type) {
case KCORE_VMALLOC:
- vread(buf, (char *)start, tsz);
- /* we have to zero-fill user buffer even if no read */
- if (copy_to_user(buffer, buf, tsz)) {
- ret = -EFAULT;
- goto out;
+ {
+ const char *src = (char *)start;
+ size_t read = 0, left = tsz;
+
+ /*
+ * vmalloc uses spinlocks, so we optimistically try to
+ * read memory. If this fails, fault pages in and try
+ * again until we are done.
+ */
+ while (true) {
+ read += vread_iter(iter, src, left);
+ if (read == tsz)
+ break;
+
+ src += read;
+ left -= read;
+
+ if (fault_in_iov_iter_writeable(iter, left)) {
+ ret = -EFAULT;
+ goto out;
+ }
}
break;
+ }
case KCORE_USER:
/* User page is handled prior to normal kernel page: */
- if (copy_to_user(buffer, (char *)start, tsz)) {
+ if (copy_to_iter((char *)start, tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -531,7 +545,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
*/
if (!page || PageOffline(page) ||
is_page_hwpoison(page) || !pfn_is_ram(pfn)) {
- if (clear_user(buffer, tsz)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -541,24 +555,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
case KCORE_VMEMMAP:
case KCORE_TEXT:
/*
- * Using bounce buffer to bypass the
- * hardened user copy kernel text checks.
+ * We use _copy_to_iter() to bypass usermode hardening
+ * which would otherwise prevent this operation.
*/
- if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
- if (clear_user(buffer, tsz)) {
- ret = -EFAULT;
- goto out;
- }
- } else {
- if (copy_to_user(buffer, buf, tsz)) {
- ret = -EFAULT;
- goto out;
- }
+ if (_copy_to_iter((char *)start, tsz, iter) != tsz) {
+ ret = -EFAULT;
+ goto out;
}
break;
default:
pr_warn_once("Unhandled KCORE type: %d\n", m->type);
- if (clear_user(buffer, tsz)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -566,7 +573,6 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
skip:
buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
start += tsz;
tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
}
@@ -589,10 +595,6 @@ static int open_kcore(struct inode *inode, struct file *filp)
if (ret)
return ret;
- filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!filp->private_data)
- return -ENOMEM;
-
if (kcore_need_update)
kcore_update_ram();
if (i_size_read(inode) != proc_root_kcore->size) {
@@ -603,16 +605,9 @@ static int open_kcore(struct inode *inode, struct file *filp)
return 0;
}
-static int release_kcore(struct inode *inode, struct file *file)
-{
- kfree(file->private_data);
- return 0;
-}
-
static const struct proc_ops kcore_proc_ops = {
- .proc_read = read_kcore,
+ .proc_read_iter = read_kcore_iter,
.proc_open = open_kcore,
- .proc_release = release_kcore,
.proc_lseek = default_llseek,
};
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 440960110a42..b43d0bd42762 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -6,6 +6,7 @@
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/mmzone.h>
+#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/percpu.h>
#include <linux/seq_file.h>
@@ -131,6 +132,18 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "VmallocChunk: ", 0ul);
show_val_kb(m, "Percpu: ", pcpu_nr_pages());
+#ifdef CONFIG_MEMTEST
+ if (early_memtest_done) {
+ unsigned long early_memtest_bad_size_kb;
+
+ early_memtest_bad_size_kb = early_memtest_bad_size>>10;
+ if (early_memtest_bad_size && !early_memtest_bad_size_kb)
+ early_memtest_bad_size_kb = 1;
+ /* When 0 is reported, it means there actually was a successful test */
+ seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb);
+ }
+#endif
+
#ifdef CONFIG_MEMORY_FAILURE
seq_printf(m, "HardwareCorrupted: %5lu kB\n",
atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10));
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 6249c347809a..195b077c0fac 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -125,7 +125,7 @@ u64 stable_page_flags(struct page *page)
/*
* pseudo flags for the well known (anonymous) memory mapped pages
*
- * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
+ * Note that page->_mapcount is overloaded in SLAB, so the
* simple test in page_mapped() is not enough.
*/
if (!PageSlab(page) && page_mapped(page))
@@ -165,9 +165,8 @@ u64 stable_page_flags(struct page *page)
/*
- * Caveats on high order pages: page->_refcount will only be set
- * -1 on the head page; SLUB/SLQB do the same for PG_slab;
- * SLOB won't set PG_slab at all on compound pages.
+ * Caveats on high order pages: PG_buddy and PG_slab will only be set
+ * on the head page.
*/
if (PageBuddy(page))
u |= 1 << KPF_BUDDY;
@@ -185,7 +184,7 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
- if (PageTail(page) && PageSlab(compound_head(page)))
+ if (PageTail(page) && PageSlab(page))
u |= 1 << KPF_SLAB;
u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 875771bf1f93..81dbb175017e 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1282,11 +1282,43 @@ out:
return err;
}
+/* Find the directory for the ctl_table. If one is not found create it. */
+static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path)
+{
+ const char *name, *nextname;
+
+ for (name = path; name; name = nextname) {
+ int namelen;
+ nextname = strchr(name, '/');
+ if (nextname) {
+ namelen = nextname - name;
+ nextname++;
+ } else {
+ namelen = strlen(name);
+ }
+ if (namelen == 0)
+ continue;
+
+ /*
+ * namelen ensures if name is "foo/bar/yay" only foo is
+ * registered first. We traverse as if using mkdir -p and
+ * return a ctl_dir for the last directory entry.
+ */
+ dir = get_subdir(dir, name, namelen);
+ if (IS_ERR(dir))
+ break;
+ }
+ return dir;
+}
+
/**
* __register_sysctl_table - register a leaf sysctl table
* @set: Sysctl tree to register on
* @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure
+ * @table: the top-level table structure without any child. This table
+ * should not be free'd after registration. So it should not be
+ * used on stack. It can either be a global or dynamically allocated
+ * by the caller and free'd later after sysctl unregistration.
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
* array. A completely 0 filled entry terminates the table.
@@ -1307,9 +1339,12 @@ out:
* proc_handler - the text handler routine (described below)
*
* extra1, extra2 - extra pointers usable by the proc handler routines
+ * XXX: we should eventually modify these to use long min / max [0]
+ * [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org
*
* Leaf nodes in the sysctl tree will be represented by a single file
- * under /proc; non-leaf nodes will be represented by directories.
+ * under /proc; non-leaf nodes (where child is not NULL) are not allowed,
+ * sysctl_check_table() verifies this.
*
* There must be a proc_handler routine for any terminal nodes.
* Several default handlers are available to cover common cases -
@@ -1330,7 +1365,6 @@ struct ctl_table_header *__register_sysctl_table(
{
struct ctl_table_root *root = set->dir.header.root;
struct ctl_table_header *header;
- const char *name, *nextname;
struct ctl_dir *dir;
struct ctl_table *entry;
struct ctl_node *node;
@@ -1351,28 +1385,13 @@ struct ctl_table_header *__register_sysctl_table(
spin_lock(&sysctl_lock);
dir = &set->dir;
- /* Reference moved down the diretory tree get_subdir */
+ /* Reference moved down the directory tree get_subdir */
dir->header.nreg++;
spin_unlock(&sysctl_lock);
- /* Find the directory for the ctl_table */
- for (name = path; name; name = nextname) {
- int namelen;
- nextname = strchr(name, '/');
- if (nextname) {
- namelen = nextname - name;
- nextname++;
- } else {
- namelen = strlen(name);
- }
- if (namelen == 0)
- continue;
-
- dir = get_subdir(dir, name, namelen);
- if (IS_ERR(dir))
- goto fail;
- }
-
+ dir = sysctl_mkdir_p(dir, path);
+ if (IS_ERR(dir))
+ goto fail;
spin_lock(&sysctl_lock);
if (insert_header(dir, header))
goto fail_put_dir_locked;
@@ -1393,8 +1412,15 @@ fail:
/**
* register_sysctl - register a sysctl table
- * @path: The path to the directory the sysctl table is in.
- * @table: the table structure
+ * @path: The path to the directory the sysctl table is in. If the path
+ * doesn't exist we will create it for you.
+ * @table: the table structure. The caller must ensure the life of the @table
+ * will be kept during the lifetime use of the sysctl. It must not be freed
+ * until unregister_sysctl_table() is called with the given returned table
+ * with this registration. If your code is non modular then you don't need
+ * to call unregister_sysctl_table() and can instead use something like
+ * register_sysctl_init() which does not care for the result of the sysctl
+ * registration.
*
* Register a sysctl table. @table should be a filled in ctl_table
* array. A completely 0 filled entry terminates the table.
@@ -1410,8 +1436,11 @@ EXPORT_SYMBOL(register_sysctl);
/**
* __register_sysctl_init() - register sysctl table to path
- * @path: path name for sysctl base
- * @table: This is the sysctl table that needs to be registered to the path
+ * @path: path name for sysctl base. If that path doesn't exist we will create
+ * it for you.
+ * @table: This is the sysctl table that needs to be registered to the path.
+ * The caller must ensure the life of the @table will be kept during the
+ * lifetime use of the sysctl.
* @table_name: The name of sysctl table, only used for log printing when
* registration fails
*
@@ -1423,10 +1452,7 @@ EXPORT_SYMBOL(register_sysctl);
* register_sysctl() failing on init are extremely low, and so for both reasons
* this function does not return any error as it is used by initialization code.
*
- * Context: Can only be called after your respective sysctl base path has been
- * registered. So for instance, most base directories are registered early on
- * init before init levels are processed through proc_sys_init() and
- * sysctl_init_bases().
+ * Context: if your base directory does not exist it will be created for you.
*/
void __init __register_sysctl_init(const char *path, struct ctl_table *table,
const char *table_name)
@@ -1556,6 +1582,7 @@ out:
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
* array. A completely 0 filled entry terminates the table.
+ * We are slowly deprecating this call so avoid its use.
*
* See __register_sysctl_table for more details.
*/
@@ -1627,6 +1654,7 @@ err_register_leaves:
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
* array. A completely 0 filled entry terminates the table.
+ * We are slowly deprecating this caller so avoid future uses of it.
*
* See __register_sysctl_paths for more details.
*/
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6a96e1713fd5..cb49479acd2e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -782,7 +782,6 @@ static void smap_gather_stats(struct vm_area_struct *vma,
if (start >= vma->vm_end)
return;
-#ifdef CONFIG_SHMEM
if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
/*
* For shared or readonly shmem mappings we know that all
@@ -803,7 +802,7 @@ static void smap_gather_stats(struct vm_area_struct *vma,
ops = &smaps_shmem_walk_ops;
}
}
-#endif
+
/* mmap_lock is held in m_start */
if (!start)
walk_page_vma(vma, ops, mss);