diff options
Diffstat (limited to 'kernel')
77 files changed, 7389 insertions, 1992 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 6c07f30fa9b7..c0cc67ad764c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,12 +5,12 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ - signal.o sys.o kmod.o workqueue.o pid.o \ + signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o \ - async.o range.o groups.o + async.o range.o groups.o lglock.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files @@ -25,6 +25,9 @@ endif obj-y += sched/ obj-y += power/ +ifeq ($(CONFIG_CHECKPOINT_RESTORE),y) +obj-$(CONFIG_X86) += kcmp.o +endif obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/kernel/capability.c b/kernel/capability.c index 3f1adb6c6470..493d97259484 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -419,3 +419,24 @@ bool nsown_capable(int cap) { return ns_capable(current_user_ns(), cap); } + +/** + * inode_capable - Check superior capability over inode + * @inode: The inode in question + * @cap: The capability in question + * + * Return true if the current task has the given superior capability + * targeted at it's own user namespace and that the given inode is owned + * by the current user namespace or a child namespace. + * + * Currently we check to see if an inode is owned by the current + * user namespace by seeing if the inode's owner maps into the + * current user namespace. + * + */ +bool inode_capable(const struct inode *inode, int cap) +{ + struct user_namespace *ns = current_user_ns(); + + return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); +} diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ad8eae5bb801..b303dfc7dce0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -255,12 +255,17 @@ int cgroup_lock_is_held(void) EXPORT_SYMBOL_GPL(cgroup_lock_is_held); +static int css_unbias_refcnt(int refcnt) +{ + return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; +} + /* the current nr of refs, always >= 0 whether @css is deactivated or not */ static int css_refcnt(struct cgroup_subsys_state *css) { int v = atomic_read(&css->refcnt); - return v >= 0 ? v : v - CSS_DEACT_BIAS; + return css_unbias_refcnt(v); } /* convenient tests for these bits */ @@ -2214,9 +2219,9 @@ retry_find_task: * only need to check permissions on one of them. */ tcred = __task_cred(tsk); - if (cred->euid && - cred->euid != tcred->uid && - cred->euid != tcred->suid) { + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) { rcu_read_unlock(); ret = -EACCES; goto out_unlock_cgroup; @@ -3878,8 +3883,12 @@ static void css_dput_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, dput_work); + struct dentry *dentry = css->cgroup->dentry; + struct super_block *sb = dentry->d_sb; - dput(css->cgroup->dentry); + atomic_inc(&sb->s_active); + dput(dentry); + deactivate_super(sb); } static void init_cgroup_css(struct cgroup_subsys_state *css, @@ -4971,10 +4980,12 @@ EXPORT_SYMBOL_GPL(__css_tryget); void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; + int v; rcu_read_lock(); - atomic_dec(&css->refcnt); - switch (css_refcnt(css)) { + v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); + + switch (v) { case 1: if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); @@ -5132,7 +5143,7 @@ EXPORT_SYMBOL_GPL(css_depth); * @root: the css supporsed to be an ancestor of the child. * * Returns true if "root" is an ancestor of "child" in its hierarchy. Because - * this function reads css->id, this use rcu_dereference() and rcu_read_lock(). + * this function reads css->id, the caller must hold rcu_read_lock(). * But, considering usual usage, the csses should be valid objects after test. * Assuming that the caller will do some action to the child if this returns * returns true, the caller must take "child";s reference count. @@ -5144,18 +5155,18 @@ bool css_is_ancestor(struct cgroup_subsys_state *child, { struct css_id *child_id; struct css_id *root_id; - bool ret = true; - rcu_read_lock(); child_id = rcu_dereference(child->id); + if (!child_id) + return false; root_id = rcu_dereference(root->id); - if (!child_id - || !root_id - || (child_id->depth < root_id->depth) - || (child_id->stack[root_id->depth] != root_id->id)) - ret = false; - rcu_read_unlock(); - return ret; + if (!root_id) + return false; + if (child_id->depth < root_id->depth) + return false; + if (child_id->stack[root_id->depth] != root_id->id) + return false; + return true; } void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) diff --git a/kernel/compat.c b/kernel/compat.c index d2c67aa49ae6..c28a306ae05c 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1073,15 +1073,7 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) return -EFAULT; sigset_from_compat(&newset, &newset32); - sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); - - current->saved_sigmask = current->blocked; - set_current_blocked(&newset); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_restore_sigmask(); - return -ERESTARTNOHAND; + return sigsuspend(&newset); } #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 0e6353cf147a..a4eb5227a19e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -10,7 +10,10 @@ #include <linux/sched.h> #include <linux/unistd.h> #include <linux/cpu.h> +#include <linux/oom.h> +#include <linux/rcupdate.h> #include <linux/export.h> +#include <linux/bug.h> #include <linux/kthread.h> #include <linux/stop_machine.h> #include <linux/mutex.h> @@ -173,6 +176,47 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); +/** + * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU + * @cpu: a CPU id + * + * This function walks all processes, finds a valid mm struct for each one and + * then clears a corresponding bit in mm's cpumask. While this all sounds + * trivial, there are various non-obvious corner cases, which this function + * tries to solve in a safe manner. + * + * Also note that the function uses a somewhat relaxed locking scheme, so it may + * be called only for an already offlined CPU. + */ +void clear_tasks_mm_cpumask(int cpu) +{ + struct task_struct *p; + + /* + * This function is called after the cpu is taken down and marked + * offline, so its not like new tasks will ever get this cpu set in + * their mm mask. -- Peter Zijlstra + * Thus, we may use rcu_read_lock() here, instead of grabbing + * full-fledged tasklist_lock. + */ + WARN_ON(cpu_online(cpu)); + rcu_read_lock(); + for_each_process(p) { + struct task_struct *t; + + /* + * Main thread might exit, but other threads may still have + * a valid mm. Find one. + */ + t = find_lock_task_mm(p); + if (!t) + continue; + cpumask_clear_cpu(cpu, mm_cpumask(t->mm)); + task_unlock(t); + } + rcu_read_unlock(); +} + static inline void check_for_tasks(int cpu) { struct task_struct *p; diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 249152e15308..9656a3c36503 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -81,7 +81,7 @@ int cpu_pm_unregister_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); /** - * cpm_pm_enter - CPU low power entry notifier + * cpu_pm_enter - CPU low power entry notifier * * Notifies listeners that a single CPU is entering a low power state that may * cause some blocks in the same power domain as the cpu to reset. @@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); * Must be called on the affected CPU with interrupts disabled. Platform is * responsible for ensuring that cpu_pm_enter is not called twice on the same * CPU before cpu_pm_exit is called. Notified drivers can include VFP - * co-processor, interrupt controller and it's PM extensions, local CPU + * co-processor, interrupt controller and its PM extensions, local CPU * timers context save/restore which shouldn't be interrupted. Hence it * must be called with interrupts disabled. * @@ -115,13 +115,13 @@ int cpu_pm_enter(void) EXPORT_SYMBOL_GPL(cpu_pm_enter); /** - * cpm_pm_exit - CPU low power exit notifier + * cpu_pm_exit - CPU low power exit notifier * * Notifies listeners that a single CPU is exiting a low power state that may * have caused some blocks in the same power domain as the cpu to reset. * * Notified drivers can include VFP co-processor, interrupt controller - * and it's PM extensions, local CPU timers context save/restore which + * and its PM extensions, local CPU timers context save/restore which * shouldn't be interrupted. Hence it must be called with interrupts disabled. * * Return conditions are same as __raw_notifier_call_chain. @@ -139,7 +139,7 @@ int cpu_pm_exit(void) EXPORT_SYMBOL_GPL(cpu_pm_exit); /** - * cpm_cluster_pm_enter - CPU cluster low power entry notifier + * cpu_cluster_pm_enter - CPU cluster low power entry notifier * * Notifies listeners that all cpus in a power domain are entering a low power * state that may cause some blocks in the same power domain to reset. @@ -147,7 +147,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit); * Must be called after cpu_pm_enter has been called on all cpus in the power * domain, and before cpu_pm_exit has been called on any cpu in the power * domain. Notified drivers can include VFP co-processor, interrupt controller - * and it's PM extensions, local CPU timers context save/restore which + * and its PM extensions, local CPU timers context save/restore which * shouldn't be interrupted. Hence it must be called with interrupts disabled. * * Must be called with interrupts disabled. @@ -174,7 +174,7 @@ int cpu_cluster_pm_enter(void) EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); /** - * cpm_cluster_pm_exit - CPU cluster low power exit notifier + * cpu_cluster_pm_exit - CPU cluster low power exit notifier * * Notifies listeners that all cpus in a power domain are exiting form a * low power state that may have caused some blocks in the same power domain @@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); * Must be called after cpu_pm_exit has been called on all cpus in the power * domain, and before cpu_pm_exit has been called on any cpu in the power * domain. Notified drivers can include VFP co-processor, interrupt controller - * and it's PM extensions, local CPU timers context save/restore which + * and its PM extensions, local CPU timers context save/restore which * shouldn't be interrupted. Hence it must be called with interrupts disabled. * * Return conditions are same as __raw_notifier_call_chain. diff --git a/kernel/cred.c b/kernel/cred.c index e70683d9ec32..de728ac50d82 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -49,6 +49,14 @@ struct cred init_cred = { .subscribers = ATOMIC_INIT(2), .magic = CRED_MAGIC, #endif + .uid = GLOBAL_ROOT_UID, + .gid = GLOBAL_ROOT_GID, + .suid = GLOBAL_ROOT_UID, + .sgid = GLOBAL_ROOT_GID, + .euid = GLOBAL_ROOT_UID, + .egid = GLOBAL_ROOT_GID, + .fsuid = GLOBAL_ROOT_UID, + .fsgid = GLOBAL_ROOT_GID, .securebits = SECUREBITS_DEFAULT, .cap_inheritable = CAP_EMPTY_SET, .cap_permitted = CAP_FULL_SET, @@ -148,6 +156,7 @@ static void put_cred_rcu(struct rcu_head *rcu) if (cred->group_info) put_group_info(cred->group_info); free_uid(cred->user); + put_user_ns(cred->user_ns); kmem_cache_free(cred_jar, cred); } @@ -198,13 +207,6 @@ void exit_creds(struct task_struct *tsk) validate_creds(cred); alter_cred_subscribers(cred, -1); put_cred(cred); - - cred = (struct cred *) tsk->replacement_session_keyring; - if (cred) { - tsk->replacement_session_keyring = NULL; - validate_creds(cred); - put_cred(cred); - } } /** @@ -303,6 +305,7 @@ struct cred *prepare_creds(void) set_cred_subscribers(new, 0); get_group_info(new->group_info); get_uid(new->user); + get_user_ns(new->user_ns); #ifdef CONFIG_KEYS key_get(new->thread_keyring); @@ -386,8 +389,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct cred *new; int ret; - p->replacement_session_keyring = NULL; - if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && @@ -414,11 +415,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) goto error_put; } - /* cache user_ns in cred. Doesn't need a refcount because it will - * stay pinned by cred->user - */ - new->user_ns = new->user->user_ns; - #ifdef CONFIG_KEYS /* new threads get their own thread keyrings if their parent already * had one */ @@ -493,10 +489,10 @@ int commit_creds(struct cred *new) get_cred(new); /* we will require a ref for the subj creds too */ /* dumpability changes */ - if (old->euid != new->euid || - old->egid != new->egid || - old->fsuid != new->fsuid || - old->fsgid != new->fsgid || + if (!uid_eq(old->euid, new->euid) || + !gid_eq(old->egid, new->egid) || + !uid_eq(old->fsuid, new->fsuid) || + !gid_eq(old->fsgid, new->fsgid) || !cap_issubset(new->cap_permitted, old->cap_permitted)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); @@ -505,9 +501,9 @@ int commit_creds(struct cred *new) } /* alter the thread keyring */ - if (new->fsuid != old->fsuid) + if (!uid_eq(new->fsuid, old->fsuid)) key_fsuid_changed(task); - if (new->fsgid != old->fsgid) + if (!gid_eq(new->fsgid, old->fsgid)) key_fsgid_changed(task); /* do it @@ -524,16 +520,16 @@ int commit_creds(struct cred *new) alter_cred_subscribers(old, -2); /* send notifications */ - if (new->uid != old->uid || - new->euid != old->euid || - new->suid != old->suid || - new->fsuid != old->fsuid) + if (!uid_eq(new->uid, old->uid) || + !uid_eq(new->euid, old->euid) || + !uid_eq(new->suid, old->suid) || + !uid_eq(new->fsuid, old->fsuid)) proc_id_connector(task, PROC_EVENT_UID); - if (new->gid != old->gid || - new->egid != old->egid || - new->sgid != old->sgid || - new->fsgid != old->fsgid) + if (!gid_eq(new->gid, old->gid) || + !gid_eq(new->egid, old->egid) || + !gid_eq(new->sgid, old->sgid) || + !gid_eq(new->fsgid, old->fsgid)) proc_id_connector(task, PROC_EVENT_GID); /* release the old obj and subj refs both */ @@ -678,6 +674,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) atomic_set(&new->usage, 1); set_cred_subscribers(new, 0); get_uid(new->user); + get_user_ns(new->user_ns); get_group_info(new->group_info); #ifdef CONFIG_KEYS diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 22d901f9caf4..103f5d147b2f 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -3,4 +3,7 @@ CFLAGS_REMOVE_core.o = -pg endif obj-y := core.o ring_buffer.o callchain.o + obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o +obj-$(CONFIG_UPROBES) += uprobes.o + diff --git a/kernel/events/core.c b/kernel/events/core.c index 91a445925855..d7d71d6ec972 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event) return !event->cgrp || event->cgrp == cpuctx->cgrp; } -static inline void perf_get_cgroup(struct perf_event *event) +static inline bool perf_tryget_cgroup(struct perf_event *event) { - css_get(&event->cgrp->css); + return css_tryget(&event->cgrp->css); } static inline void perf_put_cgroup(struct perf_event *event) @@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, event->cgrp = cgrp; /* must be done before we fput() the file */ - perf_get_cgroup(event); + if (!perf_tryget_cgroup(event)) { + event->cgrp = NULL; + ret = -ENOENT; + goto out; + } /* * all events in a group must monitor @@ -2039,8 +2043,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, * accessing the event control register. If a NMI hits, then it will * not restart the event. */ -static void __perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next) +void __perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next) { int ctxn; @@ -2279,8 +2283,8 @@ static void perf_branch_stack_sched_in(struct task_struct *prev, * accessing the event control register. If a NMI hits, then it will * keep the event running. */ -static void __perf_event_task_sched_in(struct task_struct *prev, - struct task_struct *task) +void __perf_event_task_sched_in(struct task_struct *prev, + struct task_struct *task) { struct perf_event_context *ctx; int ctxn; @@ -2305,12 +2309,6 @@ static void __perf_event_task_sched_in(struct task_struct *prev, perf_branch_stack_sched_in(prev, task); } -void __perf_event_task_sched(struct task_struct *prev, struct task_struct *next) -{ - __perf_event_task_sched_out(prev, next); - __perf_event_task_sched_in(prev, next); -} - static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) { u64 frequency = event->attr.sample_freq; @@ -3187,7 +3185,6 @@ static void perf_event_for_each(struct perf_event *event, event = event->group_leader; perf_event_for_each_child(event, func); - func(event); list_for_each_entry(sibling, &event->sibling_list, group_entry) perf_event_for_each_child(sibling, func); mutex_unlock(&ctx->mutex); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c new file mode 100644 index 000000000000..985be4d80fe8 --- /dev/null +++ b/kernel/events/uprobes.c @@ -0,0 +1,1667 @@ +/* + * User-space Probes (UProbes) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2008-2012 + * Authors: + * Srikar Dronamraju + * Jim Keniston + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + */ + +#include <linux/kernel.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> /* read_mapping_page */ +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/rmap.h> /* anon_vma_prepare */ +#include <linux/mmu_notifier.h> /* set_pte_at_notify */ +#include <linux/swap.h> /* try_to_free_swap */ +#include <linux/ptrace.h> /* user_enable_single_step */ +#include <linux/kdebug.h> /* notifier mechanism */ + +#include <linux/uprobes.h> + +#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) +#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE + +static struct srcu_struct uprobes_srcu; +static struct rb_root uprobes_tree = RB_ROOT; + +static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ + +#define UPROBES_HASH_SZ 13 + +/* serialize (un)register */ +static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; + +#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) + +/* serialize uprobe->pending_list */ +static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; +#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) + +/* + * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe + * events active at this time. Probably a fine grained per inode count is + * better? + */ +static atomic_t uprobe_events = ATOMIC_INIT(0); + +/* + * Maintain a temporary per vma info that can be used to search if a vma + * has already been handled. This structure is introduced since extending + * vm_area_struct wasnt recommended. + */ +struct vma_info { + struct list_head probe_list; + struct mm_struct *mm; + loff_t vaddr; +}; + +struct uprobe { + struct rb_node rb_node; /* node in the rb tree */ + atomic_t ref; + struct rw_semaphore consumer_rwsem; + struct list_head pending_list; + struct uprobe_consumer *consumers; + struct inode *inode; /* Also hold a ref to inode */ + loff_t offset; + int flags; + struct arch_uprobe arch; +}; + +/* + * valid_vma: Verify if the specified vma is an executable vma + * Relax restrictions while unregistering: vm_flags might have + * changed after breakpoint was inserted. + * - is_register: indicates if we are in register context. + * - Return 1 if the specified virtual address is in an + * executable vma. + */ +static bool valid_vma(struct vm_area_struct *vma, bool is_register) +{ + if (!vma->vm_file) + return false; + + if (!is_register) + return true; + + if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC)) + return true; + + return false; +} + +static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) +{ + loff_t vaddr; + + vaddr = vma->vm_start + offset; + vaddr -= vma->vm_pgoff << PAGE_SHIFT; + + return vaddr; +} + +/** + * __replace_page - replace page in vma by new page. + * based on replace_page in mm/ksm.c + * + * @vma: vma that holds the pte pointing to page + * @page: the cowed page we are replacing by kpage + * @kpage: the modified page we replace page by + * + * Returns 0 on success, -EFAULT on failure. + */ +static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + spinlock_t *ptl; + unsigned long addr; + int err = -EFAULT; + + addr = page_address_in_vma(page, vma); + if (addr == -EFAULT) + goto out; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + goto out; + + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!ptep) + goto out; + + get_page(kpage); + page_add_new_anon_rmap(kpage, vma, addr); + + if (!PageAnon(page)) { + dec_mm_counter(mm, MM_FILEPAGES); + inc_mm_counter(mm, MM_ANONPAGES); + } + + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); + + page_remove_rmap(page); + if (!page_mapped(page)) + try_to_free_swap(page); + put_page(page); + pte_unmap_unlock(ptep, ptl); + err = 0; + +out: + return err; +} + +/** + * is_swbp_insn - check if instruction is breakpoint instruction. + * @insn: instruction to be checked. + * Default implementation of is_swbp_insn + * Returns true if @insn is a breakpoint instruction. + */ +bool __weak is_swbp_insn(uprobe_opcode_t *insn) +{ + return *insn == UPROBE_SWBP_INSN; +} + +/* + * NOTE: + * Expect the breakpoint instruction to be the smallest size instruction for + * the architecture. If an arch has variable length instruction and the + * breakpoint instruction is not of the smallest length instruction + * supported by that architecture then we need to modify read_opcode / + * write_opcode accordingly. This would never be a problem for archs that + * have fixed length instructions. + */ + +/* + * write_opcode - write the opcode at a given virtual address. + * @auprobe: arch breakpointing information. + * @mm: the probed process address space. + * @vaddr: the virtual address to store the opcode. + * @opcode: opcode to be written at @vaddr. + * + * Called with mm->mmap_sem held (for read and with a reference to + * mm). + * + * For mm @mm, write the opcode at @vaddr. + * Return 0 (success) or a negative errno. + */ +static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long vaddr, uprobe_opcode_t opcode) +{ + struct page *old_page, *new_page; + struct address_space *mapping; + void *vaddr_old, *vaddr_new; + struct vm_area_struct *vma; + struct uprobe *uprobe; + loff_t addr; + int ret; + + /* Read the page with vaddr into memory */ + ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); + if (ret <= 0) + return ret; + + ret = -EINVAL; + + /* + * We are interested in text pages only. Our pages of interest + * should be mapped for read and execute only. We desist from + * adding probes in write mapped pages since the breakpoints + * might end up in the file copy. + */ + if (!valid_vma(vma, is_swbp_insn(&opcode))) + goto put_out; + + uprobe = container_of(auprobe, struct uprobe, arch); + mapping = uprobe->inode->i_mapping; + if (mapping != vma->vm_file->f_mapping) + goto put_out; + + addr = vma_address(vma, uprobe->offset); + if (vaddr != (unsigned long)addr) + goto put_out; + + ret = -ENOMEM; + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); + if (!new_page) + goto put_out; + + __SetPageUptodate(new_page); + + /* + * lock page will serialize against do_wp_page()'s + * PageAnon() handling + */ + lock_page(old_page); + /* copy the page now that we've got it stable */ + vaddr_old = kmap_atomic(old_page); + vaddr_new = kmap_atomic(new_page); + + memcpy(vaddr_new, vaddr_old, PAGE_SIZE); + + /* poke the new insn in, ASSUMES we don't cross page boundary */ + vaddr &= ~PAGE_MASK; + BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + + kunmap_atomic(vaddr_new); + kunmap_atomic(vaddr_old); + + ret = anon_vma_prepare(vma); + if (ret) + goto unlock_out; + + lock_page(new_page); + ret = __replace_page(vma, old_page, new_page); + unlock_page(new_page); + +unlock_out: + unlock_page(old_page); + page_cache_release(new_page); + +put_out: + put_page(old_page); + + return ret; +} + +/** + * read_opcode - read the opcode at a given virtual address. + * @mm: the probed process address space. + * @vaddr: the virtual address to read the opcode. + * @opcode: location to store the read opcode. + * + * Called with mm->mmap_sem held (for read and with a reference to + * mm. + * + * For mm @mm, read the opcode at @vaddr and store it in @opcode. + * Return 0 (success) or a negative errno. + */ +static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode) +{ + struct page *page; + void *vaddr_new; + int ret; + + ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL); + if (ret <= 0) + return ret; + + lock_page(page); + vaddr_new = kmap_atomic(page); + vaddr &= ~PAGE_MASK; + memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE); + kunmap_atomic(vaddr_new); + unlock_page(page); + + put_page(page); + + return 0; +} + +static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) +{ + uprobe_opcode_t opcode; + int result; + + result = read_opcode(mm, vaddr, &opcode); + if (result) + return result; + + if (is_swbp_insn(&opcode)) + return 1; + + return 0; +} + +/** + * set_swbp - store breakpoint at a given address. + * @auprobe: arch specific probepoint information. + * @mm: the probed process address space. + * @vaddr: the virtual address to insert the opcode. + * + * For mm @mm, store the breakpoint instruction at @vaddr. + * Return 0 (success) or a negative errno. + */ +int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +{ + int result; + + result = is_swbp_at_addr(mm, vaddr); + if (result == 1) + return -EEXIST; + + if (result) + return result; + + return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); +} + +/** + * set_orig_insn - Restore the original instruction. + * @mm: the probed process address space. + * @auprobe: arch specific probepoint information. + * @vaddr: the virtual address to insert the opcode. + * @verify: if true, verify existance of breakpoint instruction. + * + * For mm @mm, restore the original opcode (opcode) at @vaddr. + * Return 0 (success) or a negative errno. + */ +int __weak +set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify) +{ + if (verify) { + int result; + + result = is_swbp_at_addr(mm, vaddr); + if (!result) + return -EINVAL; + + if (result != 1) + return result; + } + return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); +} + +static int match_uprobe(struct uprobe *l, struct uprobe *r) +{ + if (l->inode < r->inode) + return -1; + + if (l->inode > r->inode) + return 1; + + if (l->offset < r->offset) + return -1; + + if (l->offset > r->offset) + return 1; + + return 0; +} + +static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) +{ + struct uprobe u = { .inode = inode, .offset = offset }; + struct rb_node *n = uprobes_tree.rb_node; + struct uprobe *uprobe; + int match; + + while (n) { + uprobe = rb_entry(n, struct uprobe, rb_node); + match = match_uprobe(&u, uprobe); + if (!match) { + atomic_inc(&uprobe->ref); + return uprobe; + } + + if (match < 0) + n = n->rb_left; + else + n = n->rb_right; + } + return NULL; +} + +/* + * Find a uprobe corresponding to a given inode:offset + * Acquires uprobes_treelock + */ +static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) +{ + struct uprobe *uprobe; + unsigned long flags; + + spin_lock_irqsave(&uprobes_treelock, flags); + uprobe = __find_uprobe(inode, offset); + spin_unlock_irqrestore(&uprobes_treelock, flags); + + return uprobe; +} + +static struct uprobe *__insert_uprobe(struct uprobe *uprobe) +{ + struct rb_node **p = &uprobes_tree.rb_node; + struct rb_node *parent = NULL; + struct uprobe *u; + int match; + + while (*p) { + parent = *p; + u = rb_entry(parent, struct uprobe, rb_node); + match = match_uprobe(uprobe, u); + if (!match) { + atomic_inc(&u->ref); + return u; + } + + if (match < 0) + p = &parent->rb_left; + else + p = &parent->rb_right; + + } + + u = NULL; + rb_link_node(&uprobe->rb_node, parent, p); + rb_insert_color(&uprobe->rb_node, &uprobes_tree); + /* get access + creation ref */ + atomic_set(&uprobe->ref, 2); + + return u; +} + +/* + * Acquire uprobes_treelock. + * Matching uprobe already exists in rbtree; + * increment (access refcount) and return the matching uprobe. + * + * No matching uprobe; insert the uprobe in rb_tree; + * get a double refcount (access + creation) and return NULL. + */ +static struct uprobe *insert_uprobe(struct uprobe *uprobe) +{ + unsigned long flags; + struct uprobe *u; + + spin_lock_irqsave(&uprobes_treelock, flags); + u = __insert_uprobe(uprobe); + spin_unlock_irqrestore(&uprobes_treelock, flags); + + /* For now assume that the instruction need not be single-stepped */ + uprobe->flags |= UPROBE_SKIP_SSTEP; + + return u; +} + +static void put_uprobe(struct uprobe *uprobe) +{ + if (atomic_dec_and_test(&uprobe->ref)) + kfree(uprobe); +} + +static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) +{ + struct uprobe *uprobe, *cur_uprobe; + + uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL); + if (!uprobe) + return NULL; + + uprobe->inode = igrab(inode); + uprobe->offset = offset; + init_rwsem(&uprobe->consumer_rwsem); + INIT_LIST_HEAD(&uprobe->pending_list); + + /* add to uprobes_tree, sorted on inode:offset */ + cur_uprobe = insert_uprobe(uprobe); + + /* a uprobe exists for this inode:offset combination */ + if (cur_uprobe) { + kfree(uprobe); + uprobe = cur_uprobe; + iput(inode); + } else { + atomic_inc(&uprobe_events); + } + + return uprobe; +} + +static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) +{ + struct uprobe_consumer *uc; + + if (!(uprobe->flags & UPROBE_RUN_HANDLER)) + return; + + down_read(&uprobe->consumer_rwsem); + for (uc = uprobe->consumers; uc; uc = uc->next) { + if (!uc->filter || uc->filter(uc, current)) + uc->handler(uc, regs); + } + up_read(&uprobe->consumer_rwsem); +} + +/* Returns the previous consumer */ +static struct uprobe_consumer * +consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) +{ + down_write(&uprobe->consumer_rwsem); + uc->next = uprobe->consumers; + uprobe->consumers = uc; + up_write(&uprobe->consumer_rwsem); + + return uc->next; +} + +/* + * For uprobe @uprobe, delete the consumer @uc. + * Return true if the @uc is deleted successfully + * or return false. + */ +static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) +{ + struct uprobe_consumer **con; + bool ret = false; + + down_write(&uprobe->consumer_rwsem); + for (con = &uprobe->consumers; *con; con = &(*con)->next) { + if (*con == uc) { + *con = uc->next; + ret = true; + break; + } + } + up_write(&uprobe->consumer_rwsem); + + return ret; +} + +static int +__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn, + unsigned long nbytes, unsigned long offset) +{ + struct file *filp = vma->vm_file; + struct page *page; + void *vaddr; + unsigned long off1; + unsigned long idx; + + if (!filp) + return -EINVAL; + + idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); + off1 = offset &= ~PAGE_MASK; + + /* + * Ensure that the page that has the original instruction is + * populated and in page-cache. + */ + page = read_mapping_page(mapping, idx, filp); + if (IS_ERR(page)) + return PTR_ERR(page); + + vaddr = kmap_atomic(page); + memcpy(insn, vaddr + off1, nbytes); + kunmap_atomic(vaddr); + page_cache_release(page); + + return 0; +} + +static int +copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) +{ + struct address_space *mapping; + unsigned long nbytes; + int bytes; + + addr &= ~PAGE_MASK; + nbytes = PAGE_SIZE - addr; + mapping = uprobe->inode->i_mapping; + + /* Instruction at end of binary; copy only available bytes */ + if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) + bytes = uprobe->inode->i_size - uprobe->offset; + else + bytes = MAX_UINSN_BYTES; + + /* Instruction at the page-boundary; copy bytes in second page */ + if (nbytes < bytes) { + if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes, + bytes - nbytes, uprobe->offset + nbytes)) + return -ENOMEM; + + bytes = nbytes; + } + return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); +} + +/* + * How mm->uprobes_state.count gets updated + * uprobe_mmap() increments the count if + * - it successfully adds a breakpoint. + * - it cannot add a breakpoint, but sees that there is a underlying + * breakpoint (via a is_swbp_at_addr()). + * + * uprobe_munmap() decrements the count if + * - it sees a underlying breakpoint, (via is_swbp_at_addr) + * (Subsequent uprobe_unregister wouldnt find the breakpoint + * unless a uprobe_mmap kicks in, since the old vma would be + * dropped just after uprobe_munmap.) + * + * uprobe_register increments the count if: + * - it successfully adds a breakpoint. + * + * uprobe_unregister decrements the count if: + * - it sees a underlying breakpoint and removes successfully. + * (via is_swbp_at_addr) + * (Subsequent uprobe_munmap wouldnt find the breakpoint + * since there is no underlying breakpoint after the + * breakpoint removal.) + */ +static int +install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, + struct vm_area_struct *vma, loff_t vaddr) +{ + unsigned long addr; + int ret; + + /* + * If probe is being deleted, unregister thread could be done with + * the vma-rmap-walk through. Adding a probe now can be fatal since + * nobody will be able to cleanup. Also we could be from fork or + * mremap path, where the probe might have already been inserted. + * Hence behave as if probe already existed. + */ + if (!uprobe->consumers) + return -EEXIST; + + addr = (unsigned long)vaddr; + + if (!(uprobe->flags & UPROBE_COPY_INSN)) { + ret = copy_insn(uprobe, vma, addr); + if (ret) + return ret; + + if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) + return -EEXIST; + + ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); + if (ret) + return ret; + + uprobe->flags |= UPROBE_COPY_INSN; + } + + /* + * Ideally, should be updating the probe count after the breakpoint + * has been successfully inserted. However a thread could hit the + * breakpoint we just inserted even before the probe count is + * incremented. If this is the first breakpoint placed, breakpoint + * notifier might ignore uprobes and pass the trap to the thread. + * Hence increment before and decrement on failure. + */ + atomic_inc(&mm->uprobes_state.count); + ret = set_swbp(&uprobe->arch, mm, addr); + if (ret) + atomic_dec(&mm->uprobes_state.count); + + return ret; +} + +static void +remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) +{ + if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true)) + atomic_dec(&mm->uprobes_state.count); +} + +/* + * There could be threads that have hit the breakpoint and are entering the + * notifier code and trying to acquire the uprobes_treelock. The thread + * calling delete_uprobe() that is removing the uprobe from the rb_tree can + * race with these threads and might acquire the uprobes_treelock compared + * to some of the breakpoint hit threads. In such a case, the breakpoint + * hit threads will not find the uprobe. The current unregistering thread + * waits till all other threads have hit a breakpoint, to acquire the + * uprobes_treelock before the uprobe is removed from the rbtree. + */ +static void delete_uprobe(struct uprobe *uprobe) +{ + unsigned long flags; + + synchronize_srcu(&uprobes_srcu); + spin_lock_irqsave(&uprobes_treelock, flags); + rb_erase(&uprobe->rb_node, &uprobes_tree); + spin_unlock_irqrestore(&uprobes_treelock, flags); + iput(uprobe->inode); + put_uprobe(uprobe); + atomic_dec(&uprobe_events); +} + +static struct vma_info * +__find_next_vma_info(struct address_space *mapping, struct list_head *head, + struct vma_info *vi, loff_t offset, bool is_register) +{ + struct prio_tree_iter iter; + struct vm_area_struct *vma; + struct vma_info *tmpvi; + unsigned long pgoff; + int existing_vma; + loff_t vaddr; + + pgoff = offset >> PAGE_SHIFT; + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + if (!valid_vma(vma, is_register)) + continue; + + existing_vma = 0; + vaddr = vma_address(vma, offset); + + list_for_each_entry(tmpvi, head, probe_list) { + if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { + existing_vma = 1; + break; + } + } + + /* + * Another vma needs a probe to be installed. However skip + * installing the probe if the vma is about to be unlinked. + */ + if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) { + vi->mm = vma->vm_mm; + vi->vaddr = vaddr; + list_add(&vi->probe_list, head); + + return vi; + } + } + + return NULL; +} + +/* + * Iterate in the rmap prio tree and find a vma where a probe has not + * yet been inserted. + */ +static struct vma_info * +find_next_vma_info(struct address_space *mapping, struct list_head *head, + loff_t offset, bool is_register) +{ + struct vma_info *vi, *retvi; + + vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); + if (!vi) + return ERR_PTR(-ENOMEM); + + mutex_lock(&mapping->i_mmap_mutex); + retvi = __find_next_vma_info(mapping, head, vi, offset, is_register); + mutex_unlock(&mapping->i_mmap_mutex); + + if (!retvi) + kfree(vi); + + return retvi; +} + +static int register_for_each_vma(struct uprobe *uprobe, bool is_register) +{ + struct list_head try_list; + struct vm_area_struct *vma; + struct address_space *mapping; + struct vma_info *vi, *tmpvi; + struct mm_struct *mm; + loff_t vaddr; + int ret; + + mapping = uprobe->inode->i_mapping; + INIT_LIST_HEAD(&try_list); + + ret = 0; + + for (;;) { + vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register); + if (!vi) + break; + + if (IS_ERR(vi)) { + ret = PTR_ERR(vi); + break; + } + + mm = vi->mm; + down_read(&mm->mmap_sem); + vma = find_vma(mm, (unsigned long)vi->vaddr); + if (!vma || !valid_vma(vma, is_register)) { + list_del(&vi->probe_list); + kfree(vi); + up_read(&mm->mmap_sem); + mmput(mm); + continue; + } + vaddr = vma_address(vma, uprobe->offset); + if (vma->vm_file->f_mapping->host != uprobe->inode || + vaddr != vi->vaddr) { + list_del(&vi->probe_list); + kfree(vi); + up_read(&mm->mmap_sem); + mmput(mm); + continue; + } + + if (is_register) + ret = install_breakpoint(uprobe, mm, vma, vi->vaddr); + else + remove_breakpoint(uprobe, mm, vi->vaddr); + + up_read(&mm->mmap_sem); + mmput(mm); + if (is_register) { + if (ret && ret == -EEXIST) + ret = 0; + if (ret) + break; + } + } + + list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { + list_del(&vi->probe_list); + kfree(vi); + } + + return ret; +} + +static int __uprobe_register(struct uprobe *uprobe) +{ + return register_for_each_vma(uprobe, true); +} + +static void __uprobe_unregister(struct uprobe *uprobe) +{ + if (!register_for_each_vma(uprobe, false)) + delete_uprobe(uprobe); + + /* TODO : cant unregister? schedule a worker thread */ +} + +/* + * uprobe_register - register a probe + * @inode: the file in which the probe has to be placed. + * @offset: offset from the start of the file. + * @uc: information on howto handle the probe.. + * + * Apart from the access refcount, uprobe_register() takes a creation + * refcount (thro alloc_uprobe) if and only if this @uprobe is getting + * inserted into the rbtree (i.e first consumer for a @inode:@offset + * tuple). Creation refcount stops uprobe_unregister from freeing the + * @uprobe even before the register operation is complete. Creation + * refcount is released when the last @uc for the @uprobe + * unregisters. + * + * Return errno if it cannot successully install probes + * else return 0 (success) + */ +int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +{ + struct uprobe *uprobe; + int ret; + + if (!inode || !uc || uc->next) + return -EINVAL; + + if (offset > i_size_read(inode)) + return -EINVAL; + + ret = 0; + mutex_lock(uprobes_hash(inode)); + uprobe = alloc_uprobe(inode, offset); + + if (uprobe && !consumer_add(uprobe, uc)) { + ret = __uprobe_register(uprobe); + if (ret) { + uprobe->consumers = NULL; + __uprobe_unregister(uprobe); + } else { + uprobe->flags |= UPROBE_RUN_HANDLER; + } + } + + mutex_unlock(uprobes_hash(inode)); + put_uprobe(uprobe); + + return ret; +} + +/* + * uprobe_unregister - unregister a already registered probe. + * @inode: the file in which the probe has to be removed. + * @offset: offset from the start of the file. + * @uc: identify which probe if multiple probes are colocated. + */ +void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +{ + struct uprobe *uprobe; + + if (!inode || !uc) + return; + + uprobe = find_uprobe(inode, offset); + if (!uprobe) + return; + + mutex_lock(uprobes_hash(inode)); + + if (consumer_del(uprobe, uc)) { + if (!uprobe->consumers) { + __uprobe_unregister(uprobe); + uprobe->flags &= ~UPROBE_RUN_HANDLER; + } + } + + mutex_unlock(uprobes_hash(inode)); + if (uprobe) + put_uprobe(uprobe); +} + +/* + * Of all the nodes that correspond to the given inode, return the node + * with the least offset. + */ +static struct rb_node *find_least_offset_node(struct inode *inode) +{ + struct uprobe u = { .inode = inode, .offset = 0}; + struct rb_node *n = uprobes_tree.rb_node; + struct rb_node *close_node = NULL; + struct uprobe *uprobe; + int match; + + while (n) { + uprobe = rb_entry(n, struct uprobe, rb_node); + match = match_uprobe(&u, uprobe); + + if (uprobe->inode == inode) + close_node = n; + + if (!match) + return close_node; + + if (match < 0) + n = n->rb_left; + else + n = n->rb_right; + } + + return close_node; +} + +/* + * For a given inode, build a list of probes that need to be inserted. + */ +static void build_probe_list(struct inode *inode, struct list_head *head) +{ + struct uprobe *uprobe; + unsigned long flags; + struct rb_node *n; + + spin_lock_irqsave(&uprobes_treelock, flags); + + n = find_least_offset_node(inode); + + for (; n; n = rb_next(n)) { + uprobe = rb_entry(n, struct uprobe, rb_node); + if (uprobe->inode != inode) + break; + + list_add(&uprobe->pending_list, head); + atomic_inc(&uprobe->ref); + } + + spin_unlock_irqrestore(&uprobes_treelock, flags); +} + +/* + * Called from mmap_region. + * called with mm->mmap_sem acquired. + * + * Return -ve no if we fail to insert probes and we cannot + * bail-out. + * Return 0 otherwise. i.e: + * + * - successful insertion of probes + * - (or) no possible probes to be inserted. + * - (or) insertion of probes failed but we can bail-out. + */ +int uprobe_mmap(struct vm_area_struct *vma) +{ + struct list_head tmp_list; + struct uprobe *uprobe, *u; + struct inode *inode; + int ret, count; + + if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) + return 0; + + inode = vma->vm_file->f_mapping->host; + if (!inode) + return 0; + + INIT_LIST_HEAD(&tmp_list); + mutex_lock(uprobes_mmap_hash(inode)); + build_probe_list(inode, &tmp_list); + + ret = 0; + count = 0; + + list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { + loff_t vaddr; + + list_del(&uprobe->pending_list); + if (!ret) { + vaddr = vma_address(vma, uprobe->offset); + + if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { + put_uprobe(uprobe); + continue; + } + + ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); + + /* Ignore double add: */ + if (ret == -EEXIST) { + ret = 0; + + if (!is_swbp_at_addr(vma->vm_mm, vaddr)) + continue; + + /* + * Unable to insert a breakpoint, but + * breakpoint lies underneath. Increment the + * probe count. + */ + atomic_inc(&vma->vm_mm->uprobes_state.count); + } + + if (!ret) + count++; + } + put_uprobe(uprobe); + } + + mutex_unlock(uprobes_mmap_hash(inode)); + + if (ret) + atomic_sub(count, &vma->vm_mm->uprobes_state.count); + + return ret; +} + +/* + * Called in context of a munmap of a vma. + */ +void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + struct list_head tmp_list; + struct uprobe *uprobe, *u; + struct inode *inode; + + if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) + return; + + if (!atomic_read(&vma->vm_mm->uprobes_state.count)) + return; + + inode = vma->vm_file->f_mapping->host; + if (!inode) + return; + + INIT_LIST_HEAD(&tmp_list); + mutex_lock(uprobes_mmap_hash(inode)); + build_probe_list(inode, &tmp_list); + + list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { + loff_t vaddr; + + list_del(&uprobe->pending_list); + vaddr = vma_address(vma, uprobe->offset); + + if (vaddr >= start && vaddr < end) { + /* + * An unregister could have removed the probe before + * unmap. So check before we decrement the count. + */ + if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) + atomic_dec(&vma->vm_mm->uprobes_state.count); + } + put_uprobe(uprobe); + } + mutex_unlock(uprobes_mmap_hash(inode)); +} + +/* Slot allocation for XOL */ +static int xol_add_vma(struct xol_area *area) +{ + struct mm_struct *mm; + int ret; + + area->page = alloc_page(GFP_HIGHUSER); + if (!area->page) + return -ENOMEM; + + ret = -EALREADY; + mm = current->mm; + + down_write(&mm->mmap_sem); + if (mm->uprobes_state.xol_area) + goto fail; + + ret = -ENOMEM; + + /* Try to map as high as possible, this is only a hint. */ + area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); + if (area->vaddr & ~PAGE_MASK) { + ret = area->vaddr; + goto fail; + } + + ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page); + if (ret) + goto fail; + + smp_wmb(); /* pairs with get_xol_area() */ + mm->uprobes_state.xol_area = area; + ret = 0; + +fail: + up_write(&mm->mmap_sem); + if (ret) + __free_page(area->page); + + return ret; +} + +static struct xol_area *get_xol_area(struct mm_struct *mm) +{ + struct xol_area *area; + + area = mm->uprobes_state.xol_area; + smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ + + return area; +} + +/* + * xol_alloc_area - Allocate process's xol_area. + * This area will be used for storing instructions for execution out of + * line. + * + * Returns the allocated area or NULL. + */ +static struct xol_area *xol_alloc_area(void) +{ + struct xol_area *area; + + area = kzalloc(sizeof(*area), GFP_KERNEL); + if (unlikely(!area)) + return NULL; + + area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); + + if (!area->bitmap) + goto fail; + + init_waitqueue_head(&area->wq); + if (!xol_add_vma(area)) + return area; + +fail: + kfree(area->bitmap); + kfree(area); + + return get_xol_area(current->mm); +} + +/* + * uprobe_clear_state - Free the area allocated for slots. + */ +void uprobe_clear_state(struct mm_struct *mm) +{ + struct xol_area *area = mm->uprobes_state.xol_area; + + if (!area) + return; + + put_page(area->page); + kfree(area->bitmap); + kfree(area); +} + +/* + * uprobe_reset_state - Free the area allocated for slots. + */ +void uprobe_reset_state(struct mm_struct *mm) +{ + mm->uprobes_state.xol_area = NULL; + atomic_set(&mm->uprobes_state.count, 0); +} + +/* + * - search for a free slot. + */ +static unsigned long xol_take_insn_slot(struct xol_area *area) +{ + unsigned long slot_addr; + int slot_nr; + + do { + slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); + if (slot_nr < UINSNS_PER_PAGE) { + if (!test_and_set_bit(slot_nr, area->bitmap)) + break; + + slot_nr = UINSNS_PER_PAGE; + continue; + } + wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE)); + } while (slot_nr >= UINSNS_PER_PAGE); + + slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES); + atomic_inc(&area->slot_count); + + return slot_addr; +} + +/* + * xol_get_insn_slot - If was not allocated a slot, then + * allocate a slot. + * Returns the allocated slot address or 0. + */ +static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr) +{ + struct xol_area *area; + unsigned long offset; + void *vaddr; + + area = get_xol_area(current->mm); + if (!area) { + area = xol_alloc_area(); + if (!area) + return 0; + } + current->utask->xol_vaddr = xol_take_insn_slot(area); + + /* + * Initialize the slot if xol_vaddr points to valid + * instruction slot. + */ + if (unlikely(!current->utask->xol_vaddr)) + return 0; + + current->utask->vaddr = slot_addr; + offset = current->utask->xol_vaddr & ~PAGE_MASK; + vaddr = kmap_atomic(area->page); + memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); + kunmap_atomic(vaddr); + + return current->utask->xol_vaddr; +} + +/* + * xol_free_insn_slot - If slot was earlier allocated by + * @xol_get_insn_slot(), make the slot available for + * subsequent requests. + */ +static void xol_free_insn_slot(struct task_struct *tsk) +{ + struct xol_area *area; + unsigned long vma_end; + unsigned long slot_addr; + + if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask) + return; + + slot_addr = tsk->utask->xol_vaddr; + + if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr))) + return; + + area = tsk->mm->uprobes_state.xol_area; + vma_end = area->vaddr + PAGE_SIZE; + if (area->vaddr <= slot_addr && slot_addr < vma_end) { + unsigned long offset; + int slot_nr; + + offset = slot_addr - area->vaddr; + slot_nr = offset / UPROBE_XOL_SLOT_BYTES; + if (slot_nr >= UINSNS_PER_PAGE) + return; + + clear_bit(slot_nr, area->bitmap); + atomic_dec(&area->slot_count); + if (waitqueue_active(&area->wq)) + wake_up(&area->wq); + + tsk->utask->xol_vaddr = 0; + } +} + +/** + * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs + * @regs: Reflects the saved state of the task after it has hit a breakpoint + * instruction. + * Return the address of the breakpoint instruction. + */ +unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) +{ + return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; +} + +/* + * Called with no locks held. + * Called in context of a exiting or a exec-ing thread. + */ +void uprobe_free_utask(struct task_struct *t) +{ + struct uprobe_task *utask = t->utask; + + if (t->uprobe_srcu_id != -1) + srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id); + + if (!utask) + return; + + if (utask->active_uprobe) + put_uprobe(utask->active_uprobe); + + xol_free_insn_slot(t); + kfree(utask); + t->utask = NULL; +} + +/* + * Called in context of a new clone/fork from copy_process. + */ +void uprobe_copy_process(struct task_struct *t) +{ + t->utask = NULL; + t->uprobe_srcu_id = -1; +} + +/* + * Allocate a uprobe_task object for the task. + * Called when the thread hits a breakpoint for the first time. + * + * Returns: + * - pointer to new uprobe_task on success + * - NULL otherwise + */ +static struct uprobe_task *add_utask(void) +{ + struct uprobe_task *utask; + + utask = kzalloc(sizeof *utask, GFP_KERNEL); + if (unlikely(!utask)) + return NULL; + + utask->active_uprobe = NULL; + current->utask = utask; + return utask; +} + +/* Prepare to single-step probed instruction out of line. */ +static int +pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) +{ + if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs)) + return 0; + + return -EFAULT; +} + +/* + * If we are singlestepping, then ensure this thread is not connected to + * non-fatal signals until completion of singlestep. When xol insn itself + * triggers the signal, restart the original insn even if the task is + * already SIGKILL'ed (since coredump should report the correct ip). This + * is even more important if the task has a handler for SIGSEGV/etc, The + * _same_ instruction should be repeated again after return from the signal + * handler, and SSTEP can never finish in this case. + */ +bool uprobe_deny_signal(void) +{ + struct task_struct *t = current; + struct uprobe_task *utask = t->utask; + + if (likely(!utask || !utask->active_uprobe)) + return false; + + WARN_ON_ONCE(utask->state != UTASK_SSTEP); + + if (signal_pending(t)) { + spin_lock_irq(&t->sighand->siglock); + clear_tsk_thread_flag(t, TIF_SIGPENDING); + spin_unlock_irq(&t->sighand->siglock); + + if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { + utask->state = UTASK_SSTEP_TRAPPED; + set_tsk_thread_flag(t, TIF_UPROBE); + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + } + } + + return true; +} + +/* + * Avoid singlestepping the original instruction if the original instruction + * is a NOP or can be emulated. + */ +static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) +{ + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) + return true; + + uprobe->flags &= ~UPROBE_SKIP_SSTEP; + return false; +} + +/* + * Run handler and ask thread to singlestep. + * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. + */ +static void handle_swbp(struct pt_regs *regs) +{ + struct vm_area_struct *vma; + struct uprobe_task *utask; + struct uprobe *uprobe; + struct mm_struct *mm; + unsigned long bp_vaddr; + + uprobe = NULL; + bp_vaddr = uprobe_get_swbp_addr(regs); + mm = current->mm; + down_read(&mm->mmap_sem); + vma = find_vma(mm, bp_vaddr); + + if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) { + struct inode *inode; + loff_t offset; + + inode = vma->vm_file->f_mapping->host; + offset = bp_vaddr - vma->vm_start; + offset += (vma->vm_pgoff << PAGE_SHIFT); + uprobe = find_uprobe(inode, offset); + } + + srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); + current->uprobe_srcu_id = -1; + up_read(&mm->mmap_sem); + + if (!uprobe) { + /* No matching uprobe; signal SIGTRAP. */ + send_sig(SIGTRAP, current, 0); + return; + } + + utask = current->utask; + if (!utask) { + utask = add_utask(); + /* Cannot allocate; re-execute the instruction. */ + if (!utask) + goto cleanup_ret; + } + utask->active_uprobe = uprobe; + handler_chain(uprobe, regs); + if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs)) + goto cleanup_ret; + + utask->state = UTASK_SSTEP; + if (!pre_ssout(uprobe, regs, bp_vaddr)) { + user_enable_single_step(current); + return; + } + +cleanup_ret: + if (utask) { + utask->active_uprobe = NULL; + utask->state = UTASK_RUNNING; + } + if (uprobe) { + if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) + + /* + * cannot singlestep; cannot skip instruction; + * re-execute the instruction. + */ + instruction_pointer_set(regs, bp_vaddr); + + put_uprobe(uprobe); + } +} + +/* + * Perform required fix-ups and disable singlestep. + * Allow pending signals to take effect. + */ +static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) +{ + struct uprobe *uprobe; + + uprobe = utask->active_uprobe; + if (utask->state == UTASK_SSTEP_ACK) + arch_uprobe_post_xol(&uprobe->arch, regs); + else if (utask->state == UTASK_SSTEP_TRAPPED) + arch_uprobe_abort_xol(&uprobe->arch, regs); + else + WARN_ON_ONCE(1); + + put_uprobe(uprobe); + utask->active_uprobe = NULL; + utask->state = UTASK_RUNNING; + user_disable_single_step(current); + xol_free_insn_slot(current); + + spin_lock_irq(¤t->sighand->siglock); + recalc_sigpending(); /* see uprobe_deny_signal() */ + spin_unlock_irq(¤t->sighand->siglock); +} + +/* + * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag. (and on + * subsequent probe hits on the thread sets the state to UTASK_BP_HIT) and + * allows the thread to return from interrupt. + * + * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag and + * also sets the state to UTASK_SSTEP_ACK and allows the thread to return from + * interrupt. + * + * While returning to userspace, thread notices the TIF_UPROBE flag and calls + * uprobe_notify_resume(). + */ +void uprobe_notify_resume(struct pt_regs *regs) +{ + struct uprobe_task *utask; + + utask = current->utask; + if (!utask || utask->state == UTASK_BP_HIT) + handle_swbp(regs); + else + handle_singlestep(utask, regs); +} + +/* + * uprobe_pre_sstep_notifier gets called from interrupt context as part of + * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit. + */ +int uprobe_pre_sstep_notifier(struct pt_regs *regs) +{ + struct uprobe_task *utask; + + if (!current->mm || !atomic_read(¤t->mm->uprobes_state.count)) + /* task is currently not uprobed */ + return 0; + + utask = current->utask; + if (utask) + utask->state = UTASK_BP_HIT; + + set_thread_flag(TIF_UPROBE); + current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu); + + return 1; +} + +/* + * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier + * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep. + */ +int uprobe_post_sstep_notifier(struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + if (!current->mm || !utask || !utask->active_uprobe) + /* task is currently not uprobed */ + return 0; + + utask->state = UTASK_SSTEP_ACK; + set_thread_flag(TIF_UPROBE); + return 1; +} + +static struct notifier_block uprobe_exception_nb = { + .notifier_call = arch_uprobe_exception_notify, + .priority = INT_MAX-1, /* notified after kprobes, kgdb */ +}; + +static int __init init_uprobes(void) +{ + int i; + + for (i = 0; i < UPROBES_HASH_SZ; i++) { + mutex_init(&uprobes_mutex[i]); + mutex_init(&uprobes_mmap_mutex[i]); + } + init_srcu_struct(&uprobes_srcu); + + return register_die_notifier(&uprobe_exception_nb); +} +module_init(init_uprobes); + +static void __exit exit_uprobes(void) +{ +} +module_exit(exit_uprobes); diff --git a/kernel/exit.c b/kernel/exit.c index d8bd3b425fa7..2f59cc334516 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -72,6 +72,18 @@ static void __unhash_process(struct task_struct *p, bool group_dead) list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); + /* + * If we are the last child process in a pid namespace to be + * reaped, notify the reaper sleeping zap_pid_ns_processes(). + */ + if (IS_ENABLED(CONFIG_PID_NS)) { + struct task_struct *parent = p->real_parent; + + if ((task_active_pid_ns(parent)->child_reaper == parent) && + list_empty(&parent->children) && + (parent->flags & PF_EXITING)) + wake_up_process(parent); + } } list_del_rcu(&p->thread_group); } @@ -643,6 +655,7 @@ static void exit_mm(struct task_struct * tsk) mm_release(tsk, mm); if (!mm) return; + sync_mm_rss(mm); /* * Serialize with any possible pending coredump. * We must hold mmap_sem around checking core_state @@ -719,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father) zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); - /* - * We can not clear ->child_reaper or leave it alone. - * There may by stealth EXIT_DEAD tasks on ->children, - * forget_original_parent() must move them somewhere. - */ - pid_ns->child_reaper = init_pid_ns.child_reaper; } else if (father->signal->has_child_subreaper) { struct task_struct *reaper; @@ -884,9 +891,9 @@ static void check_stack_usage(void) spin_lock(&low_water_lock); if (free < lowest_to_date) { - printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " - "left\n", - current->comm, free); + printk(KERN_WARNING "%s (%d) used greatest stack depth: " + "%lu bytes left\n", + current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); @@ -946,12 +953,13 @@ void do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes. + * an exiting task cleaning up the robust pi futexes, and in + * task_work_add() to avoid the race with exit_task_work(). */ smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); - exit_irq_thread(); + exit_task_work(tsk); if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", @@ -1214,7 +1222,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) unsigned long state; int retval, status, traced; pid_t pid = task_pid_vnr(p); - uid_t uid = __task_cred(p)->uid; + uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct siginfo __user *infop; if (!likely(wo->wo_flags & WEXITED)) @@ -1427,7 +1435,7 @@ static int wait_task_stopped(struct wait_opts *wo, if (!unlikely(wo->wo_flags & WNOWAIT)) *p_code = 0; - uid = task_uid(p); + uid = from_kuid_munged(current_user_ns(), task_uid(p)); unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) @@ -1500,7 +1508,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; - uid = task_uid(p); + uid = from_kuid_munged(current_user_ns(), task_uid(p)); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); diff --git a/kernel/extable.c b/kernel/extable.c index 5339705b8241..fe35a634bf76 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -35,10 +35,16 @@ DEFINE_MUTEX(text_mutex); extern struct exception_table_entry __start___ex_table[]; extern struct exception_table_entry __stop___ex_table[]; +/* Cleared by build time tools if the table is already sorted. */ +u32 __initdata main_extable_sort_needed = 1; + /* Sort the kernel's built-in exception table */ void __init sort_main_extable(void) { - sort_extable(__start___ex_table, __stop___ex_table); + if (main_extable_sort_needed) + sort_extable(__start___ex_table, __stop___ex_table); + else + pr_notice("__ex_table already sorted, skipping sort\n"); } /* Given an address, look for it in the exception tables. */ diff --git a/kernel/fork.c b/kernel/fork.c index ad54c833116a..f00e319d8376 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -69,6 +69,7 @@ #include <linux/oom.h> #include <linux/khugepaged.h> #include <linux/signalfd.h> +#include <linux/uprobes.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -292,8 +293,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) int node = tsk_fork_get_node(orig); int err; - prepare_to_copy(orig); - tsk = alloc_task_struct_node(node); if (!tsk) return NULL; @@ -305,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) } err = arch_dup_task_struct(tsk, orig); - if (err) - goto out; + /* + * We defer looking at err, because we will need this setup + * for the clean up path to work correctly. + */ tsk->stack = ti; - setup_thread_stack(tsk, orig); + + if (err) + goto out; + clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); stackend = end_of_stack(tsk); @@ -387,7 +391,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) } charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + unsigned long len; + len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; @@ -453,6 +458,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; + + if (file && uprobe_mmap(tmp)) + goto out; } /* a new mm has just been created */ arch_dup_mmap(oldmm, mm); @@ -601,6 +609,7 @@ void mmput(struct mm_struct *mm) might_sleep(); if (atomic_dec_and_test(&mm->mm_users)) { + uprobe_clear_state(mm); exit_aio(mm); ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ @@ -611,7 +620,6 @@ void mmput(struct mm_struct *mm) list_del(&mm->mmlist); spin_unlock(&mmlist_lock); } - put_swap_token(mm); if (mm->binfmt) module_put(mm->binfmt->module); mmdrop(mm); @@ -779,12 +787,11 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) exit_pi_state_list(tsk); #endif + uprobe_free_utask(tsk); + /* Get rid of any cached register state */ deactivate_mm(tsk, mm); - if (tsk->vfork_done) - complete_vfork_done(tsk); - /* * If we're exiting normally, clear a user-space tid field if * requested. We leave this alone when dying by signal, to leave @@ -805,6 +812,13 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) } tsk->clear_child_tid = NULL; } + + /* + * All done, finally we can wake up parent and return this mm to him. + * Also kthread_stop() uses this completion for synchronization. + */ + if (tsk->vfork_done) + complete_vfork_done(tsk); } /* @@ -826,13 +840,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); mm_init_cpumask(mm); - /* Initializing for Swap token stuff */ - mm->token_priority = 0; - mm->last_interval = 0; - #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; #endif + uprobe_reset_state(mm); if (!mm_init(mm, tsk)) goto fail_nomem; @@ -907,10 +918,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) goto fail_nomem; good_mm: - /* Initializing for Swap token stuff */ - mm->token_priority = 0; - mm->last_interval = 0; - tsk->mm = mm; tsk->active_mm = mm; return 0; @@ -978,9 +985,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) * Share io context with parent, if CLONE_IO is set */ if (clone_flags & CLONE_IO) { - tsk->io_context = ioc_task_link(ioc); - if (unlikely(!tsk->io_context)) - return -ENOMEM; + ioc_task_link(ioc); + tsk->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) { new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); if (unlikely(!new_ioc)) @@ -1375,6 +1381,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->pi_state_list); p->pi_state_cache = NULL; #endif + uprobe_copy_process(p); /* * sigaltstack should be cleared when sharing the same VM */ @@ -1413,6 +1420,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); + INIT_HLIST_HEAD(&p->task_works); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible diff --git a/kernel/groups.c b/kernel/groups.c index 99b53d1eb7ea..6b2588dd04ff 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -31,7 +31,7 @@ struct group_info *groups_alloc(int gidsetsize) group_info->blocks[0] = group_info->small_block; else { for (i = 0; i < nblocks; i++) { - gid_t *b; + kgid_t *b; b = (void *)__get_free_page(GFP_USER); if (!b) goto out_undo_partial_alloc; @@ -66,18 +66,15 @@ EXPORT_SYMBOL(groups_free); static int groups_to_user(gid_t __user *grouplist, const struct group_info *group_info) { + struct user_namespace *user_ns = current_user_ns(); int i; unsigned int count = group_info->ngroups; - for (i = 0; i < group_info->nblocks; i++) { - unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); - unsigned int len = cp_count * sizeof(*grouplist); - - if (copy_to_user(grouplist, group_info->blocks[i], len)) + for (i = 0; i < count; i++) { + gid_t gid; + gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i)); + if (put_user(gid, grouplist+i)) return -EFAULT; - - grouplist += NGROUPS_PER_BLOCK; - count -= cp_count; } return 0; } @@ -86,18 +83,21 @@ static int groups_to_user(gid_t __user *grouplist, static int groups_from_user(struct group_info *group_info, gid_t __user *grouplist) { + struct user_namespace *user_ns = current_user_ns(); int i; unsigned int count = group_info->ngroups; - for (i = 0; i < group_info->nblocks; i++) { - unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); - unsigned int len = cp_count * sizeof(*grouplist); - - if (copy_from_user(group_info->blocks[i], grouplist, len)) + for (i = 0; i < count; i++) { + gid_t gid; + kgid_t kgid; + if (get_user(gid, grouplist+i)) return -EFAULT; - grouplist += NGROUPS_PER_BLOCK; - count -= cp_count; + kgid = make_kgid(user_ns, gid); + if (!gid_valid(kgid)) + return -EINVAL; + + GROUP_AT(group_info, i) = kgid; } return 0; } @@ -117,9 +117,9 @@ static void groups_sort(struct group_info *group_info) for (base = 0; base < max; base++) { int left = base; int right = left + stride; - gid_t tmp = GROUP_AT(group_info, right); + kgid_t tmp = GROUP_AT(group_info, right); - while (left >= 0 && GROUP_AT(group_info, left) > tmp) { + while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) { GROUP_AT(group_info, right) = GROUP_AT(group_info, left); right = left; @@ -132,7 +132,7 @@ static void groups_sort(struct group_info *group_info) } /* a simple bsearch */ -int groups_search(const struct group_info *group_info, gid_t grp) +int groups_search(const struct group_info *group_info, kgid_t grp) { unsigned int left, right; @@ -143,9 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp) right = group_info->ngroups; while (left < right) { unsigned int mid = (left+right)/2; - if (grp > GROUP_AT(group_info, mid)) + if (gid_gt(grp, GROUP_AT(group_info, mid))) left = mid + 1; - else if (grp < GROUP_AT(group_info, mid)) + else if (gid_lt(grp, GROUP_AT(group_info, mid))) right = mid; else return 1; @@ -256,24 +256,24 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) /* * Check whether we're fsgid/egid or in the supplemental group.. */ -int in_group_p(gid_t grp) +int in_group_p(kgid_t grp) { const struct cred *cred = current_cred(); int retval = 1; - if (grp != cred->fsgid) + if (!gid_eq(grp, cred->fsgid)) retval = groups_search(cred->group_info, grp); return retval; } EXPORT_SYMBOL(in_group_p); -int in_egroup_p(gid_t grp) +int in_egroup_p(kgid_t grp) { const struct cred *cred = current_cred(); int retval = 1; - if (grp != cred->egid) + if (!gid_eq(grp, cred->egid)) retval = groups_search(cred->group_info, grp); return retval; } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ae34bf51682b..6db7a5ed52b5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, return 0; } +static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) +{ + ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + + return ktime_get_update_offsets(offs_real, offs_boot); +} + /* * Retrigger next event is called after clock was set * @@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); - struct timespec realtime_offset, xtim, wtm, sleep; if (!hrtimer_hres_active()) return; - /* Optimized out for !HIGH_RES */ - get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); - set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); - - /* Adjust CLOCK_REALTIME offset */ raw_spin_lock(&base->lock); - base->clock_base[HRTIMER_BASE_REALTIME].offset = - timespec_to_ktime(realtime_offset); - base->clock_base[HRTIMER_BASE_BOOTTIME].offset = - timespec_to_ktime(sleep); - + hrtimer_update_base(base); hrtimer_force_reprogram(base, 0); raw_spin_unlock(&base->lock); } @@ -710,13 +708,25 @@ static int hrtimer_switch_to_hres(void) base->clock_base[i].resolution = KTIME_HIGH_RES; tick_setup_sched_timer(); - /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); local_irq_restore(flags); return 1; } +/* + * Called from timekeeping code to reprogramm the hrtimer interrupt + * device. If called from the timer interrupt context we defer it to + * softirq context. + */ +void clock_was_set_delayed(void) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + cpu_base->clock_was_set = 1; + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); +} + #else static inline int hrtimer_hres_active(void) { return 0; } @@ -1250,11 +1260,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) cpu_base->nr_events++; dev->next_event.tv64 = KTIME_MAX; - entry_time = now = ktime_get(); + raw_spin_lock(&cpu_base->lock); + entry_time = now = hrtimer_update_base(cpu_base); retry: expires_next.tv64 = KTIME_MAX; - - raw_spin_lock(&cpu_base->lock); /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via @@ -1330,8 +1339,12 @@ retry: * We need to prevent that we loop forever in the hrtimer * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. + * + * Acquire base lock for updating the offsets and retrieving + * the current time. */ - now = ktime_get(); + raw_spin_lock(&cpu_base->lock); + now = hrtimer_update_base(cpu_base); cpu_base->nr_retries++; if (++retries < 3) goto retry; @@ -1343,6 +1356,7 @@ retry: */ cpu_base->nr_hangs++; cpu_base->hang_detected = 1; + raw_spin_unlock(&cpu_base->lock); delta = ktime_sub(now, entry_time); if (delta.tv64 > cpu_base->max_hang_time.tv64) cpu_base->max_hang_time = delta; @@ -1395,6 +1409,13 @@ void hrtimer_peek_ahead_timers(void) static void run_hrtimer_softirq(struct softirq_action *h) { + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + if (cpu_base->clock_was_set) { + cpu_base->clock_was_set = 0; + clock_was_set(); + } + hrtimer_peek_ahead_timers(); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index fc275e4f629b..eebd6d5cfb44 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -275,8 +275,10 @@ void handle_nested_irq(unsigned int irq) kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; - if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) + if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; goto out_unlock; + } irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); @@ -324,8 +326,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; goto out_unlock; + } handle_irq_event(desc); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 8e5c56b3b7d9..001fa5bab490 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -101,6 +101,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); +extern int irq_do_set_affinity(struct irq_data *data, + const struct cpumask *dest, bool force); + /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 0e0ba5f840b2..41c1564103f1 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) "irq: " fmt + #include <linux/debugfs.h> #include <linux/hardirq.h> #include <linux/interrupt.h> @@ -56,14 +58,73 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, return domain; } +static void irq_domain_free(struct irq_domain *domain) +{ + of_node_put(domain->of_node); + kfree(domain); +} + static void irq_domain_add(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); - pr_debug("irq: Allocated domain of type %d @0x%p\n", + pr_debug("Allocated domain of type %d @0x%p\n", + domain->revmap_type, domain); +} + +/** + * irq_domain_remove() - Remove an irq domain. + * @domain: domain to remove + * + * This routine is used to remove an irq domain. The caller must ensure + * that all mappings within the domain have been disposed of prior to + * use, depending on the revmap type. + */ +void irq_domain_remove(struct irq_domain *domain) +{ + mutex_lock(&irq_domain_mutex); + + switch (domain->revmap_type) { + case IRQ_DOMAIN_MAP_LEGACY: + /* + * Legacy domains don't manage their own irq_desc + * allocations, we expect the caller to handle irq_desc + * freeing on their own. + */ + break; + case IRQ_DOMAIN_MAP_TREE: + /* + * radix_tree_delete() takes care of destroying the root + * node when all entries are removed. Shout if there are + * any mappings left. + */ + WARN_ON(domain->revmap_data.tree.height); + break; + case IRQ_DOMAIN_MAP_LINEAR: + kfree(domain->revmap_data.linear.revmap); + domain->revmap_data.linear.size = 0; + break; + case IRQ_DOMAIN_MAP_NOMAP: + break; + } + + list_del(&domain->link); + + /* + * If the going away domain is the default one, reset it. + */ + if (unlikely(irq_default_domain == domain)) + irq_set_default_host(NULL); + + mutex_unlock(&irq_domain_mutex); + + pr_debug("Removed domain of type %d @0x%p\n", domain->revmap_type, domain); + + irq_domain_free(domain); } +EXPORT_SYMBOL_GPL(irq_domain_remove); static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, irq_hw_number_t hwirq) @@ -117,8 +178,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, if (WARN_ON(!irq_data || irq_data->domain)) { mutex_unlock(&irq_domain_mutex); - of_node_put(domain->of_node); - kfree(domain); + irq_domain_free(domain); return NULL; } } @@ -152,10 +212,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, irq_domain_add(domain); return domain; } +EXPORT_SYMBOL_GPL(irq_domain_add_legacy); /** * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. * @of_node: pointer to interrupt controller's device tree node. + * @size: Number of interrupts in the domain. * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer */ @@ -181,6 +243,7 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, irq_domain_add(domain); return domain; } +EXPORT_SYMBOL_GPL(irq_domain_add_linear); struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, unsigned int max_irq, @@ -195,6 +258,7 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, } return domain; } +EXPORT_SYMBOL_GPL(irq_domain_add_nomap); /** * irq_domain_add_tree() @@ -216,6 +280,7 @@ struct irq_domain *irq_domain_add_tree(struct device_node *of_node, } return domain; } +EXPORT_SYMBOL_GPL(irq_domain_add_tree); /** * irq_find_host() - Locates a domain for a given device node @@ -259,10 +324,11 @@ EXPORT_SYMBOL_GPL(irq_find_host); */ void irq_set_default_host(struct irq_domain *domain) { - pr_debug("irq: Default domain set to @0x%p\n", domain); + pr_debug("Default domain set to @0x%p\n", domain); irq_default_domain = domain; } +EXPORT_SYMBOL_GPL(irq_set_default_host); static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) @@ -272,7 +338,7 @@ static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, irq_data->hwirq = hwirq; irq_data->domain = domain; if (domain->ops->map(domain, virq, hwirq)) { - pr_debug("irq: -> mapping failed, freeing\n"); + pr_debug("irq-%i==>hwirq-0x%lx mapping failed\n", virq, hwirq); irq_data->domain = NULL; irq_data->hwirq = 0; return -1; @@ -303,7 +369,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) virq = irq_alloc_desc_from(1, 0); if (!virq) { - pr_debug("irq: create_direct virq allocation failed\n"); + pr_debug("create_direct virq allocation failed\n"); return 0; } if (virq >= domain->revmap_data.nomap.max_irq) { @@ -312,7 +378,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) irq_free_desc(virq); return 0; } - pr_debug("irq: create_direct obtained virq %d\n", virq); + pr_debug("create_direct obtained virq %d\n", virq); if (irq_setup_virq(domain, virq, virq)) { irq_free_desc(virq); @@ -321,6 +387,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) return virq; } +EXPORT_SYMBOL_GPL(irq_create_direct_mapping); /** * irq_create_mapping() - Map a hardware interrupt into linux irq space @@ -338,23 +405,23 @@ unsigned int irq_create_mapping(struct irq_domain *domain, unsigned int hint; int virq; - pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); + pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); /* Look for default domain if nececssary */ if (domain == NULL) domain = irq_default_domain; if (domain == NULL) { - printk(KERN_WARNING "irq_create_mapping called for" - " NULL domain, hwirq=%lx\n", hwirq); + pr_warning("irq_create_mapping called for" + " NULL domain, hwirq=%lx\n", hwirq); WARN_ON(1); return 0; } - pr_debug("irq: -> using domain @%p\n", domain); + pr_debug("-> using domain @%p\n", domain); /* Check if mapping already exists */ virq = irq_find_mapping(domain, hwirq); if (virq) { - pr_debug("irq: -> existing mapping on virq %d\n", virq); + pr_debug("-> existing mapping on virq %d\n", virq); return virq; } @@ -370,7 +437,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, if (virq <= 0) virq = irq_alloc_desc_from(1, 0); if (virq <= 0) { - pr_debug("irq: -> virq allocation failed\n"); + pr_debug("-> virq allocation failed\n"); return 0; } @@ -380,7 +447,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, return 0; } - pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n", + pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); return virq; @@ -409,8 +476,8 @@ unsigned int irq_create_of_mapping(struct device_node *controller, if (intsize > 0) return intspec[0]; #endif - printk(KERN_WARNING "irq: no irq domain found for %s !\n", - controller->full_name); + pr_warning("no irq domain found for %s !\n", + controller->full_name); return 0; } @@ -560,6 +627,7 @@ unsigned int irq_radix_revmap_lookup(struct irq_domain *domain, */ return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq); } +EXPORT_SYMBOL_GPL(irq_radix_revmap_lookup); /** * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. @@ -584,6 +652,7 @@ void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq, mutex_unlock(&revmap_trees_mutex); } } +EXPORT_SYMBOL_GPL(irq_radix_revmap_insert); /** * irq_linear_revmap() - Find a linux irq from a hw irq number. @@ -617,6 +686,7 @@ unsigned int irq_linear_revmap(struct irq_domain *domain, return revmap[hwirq]; } +EXPORT_SYMBOL_GPL(irq_linear_revmap); #ifdef CONFIG_IRQ_DOMAIN_DEBUG static int virq_debug_show(struct seq_file *m, void *private) @@ -691,8 +761,8 @@ static int __init irq_debugfs_init(void) __initcall(irq_debugfs_init); #endif /* CONFIG_IRQ_DOMAIN_DEBUG */ -int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, - irq_hw_number_t hwirq) +static int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, + irq_hw_number_t hwirq) { return 0; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 585f6381f8e4..8c548232ba39 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -7,6 +7,8 @@ * This file contains driver APIs to the irq subsystem. */ +#define pr_fmt(fmt) "genirq: " fmt + #include <linux/irq.h> #include <linux/kthread.h> #include <linux/module.h> @@ -14,6 +16,7 @@ #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/task_work.h> #include "internals.h" @@ -139,6 +142,25 @@ static inline void irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } #endif +int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_desc *desc = irq_data_to_desc(data); + struct irq_chip *chip = irq_data_get_irq_chip(data); + int ret; + + ret = chip->irq_set_affinity(data, mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: + cpumask_copy(data->affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: + irq_set_thread_affinity(desc); + ret = 0; + } + + return ret; +} + int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) { struct irq_chip *chip = irq_data_get_irq_chip(data); @@ -149,14 +171,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) return -EINVAL; if (irq_can_move_pcntxt(data)) { - ret = chip->irq_set_affinity(data, mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(data->affinity, mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - ret = 0; - } + ret = irq_do_set_affinity(data, mask, false); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask); @@ -280,9 +295,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); static int setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { - struct irq_chip *chip = irq_desc_get_chip(desc); struct cpumask *set = irq_default_affinity; - int ret, node = desc->irq_data.node; + int node = desc->irq_data.node; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) @@ -308,13 +322,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) if (cpumask_intersects(mask, nodemask)) cpumask_and(mask, mask, nodemask); } - ret = chip->irq_set_affinity(&desc->irq_data, mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(desc->irq_data.affinity, mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - } + irq_do_set_affinity(&desc->irq_data, mask, false); return 0; } #else @@ -565,7 +573,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? */ - pr_debug("genirq: No set_type function for IRQ %d (%s)\n", irq, + pr_debug("No set_type function for IRQ %d (%s)\n", irq, chip ? (chip->name ? : "unknown") : "unknown"); return 0; } @@ -600,7 +608,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, ret = 0; break; default: - pr_err("genirq: Setting trigger mode %lu for irq %u failed (%pF)\n", + pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", flags, irq, chip->irq_set_type); } if (unmask) @@ -773,11 +781,39 @@ static void wake_threads_waitq(struct irq_desc *desc) wake_up(&desc->wait_for_threads); } +static void irq_thread_dtor(struct task_work *unused) +{ + struct task_struct *tsk = current; + struct irq_desc *desc; + struct irqaction *action; + + if (WARN_ON_ONCE(!(current->flags & PF_EXITING))) + return; + + action = kthread_data(tsk); + + pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", + tsk->comm ? tsk->comm : "", tsk->pid, action->irq); + + + desc = irq_to_desc(action->irq); + /* + * If IRQTF_RUNTHREAD is set, we need to decrement + * desc->threads_active and wake possible waiters. + */ + if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + wake_threads_waitq(desc); + + /* Prevent a stale desc->threads_oneshot */ + irq_finalize_oneshot(desc, action); +} + /* * Interrupt handler thread */ static int irq_thread(void *data) { + struct task_work on_exit_work; static const struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; @@ -793,7 +829,9 @@ static int irq_thread(void *data) handler_fn = irq_thread_fn; sched_setscheduler(current, SCHED_FIFO, ¶m); - current->irq_thread = 1; + + init_task_work(&on_exit_work, irq_thread_dtor, NULL); + task_work_add(current, &on_exit_work, false); while (!irq_wait_for_interrupt(action)) { irqreturn_t action_ret; @@ -815,44 +853,11 @@ static int irq_thread(void *data) * cannot touch the oneshot mask at this point anymore as * __setup_irq() might have given out currents thread_mask * again. - * - * Clear irq_thread. Otherwise exit_irq_thread() would make - * fuzz about an active irq thread going into nirvana. */ - current->irq_thread = 0; + task_work_cancel(current, irq_thread_dtor); return 0; } -/* - * Called from do_exit() - */ -void exit_irq_thread(void) -{ - struct task_struct *tsk = current; - struct irq_desc *desc; - struct irqaction *action; - - if (!tsk->irq_thread) - return; - - action = kthread_data(tsk); - - pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", - tsk->comm ? tsk->comm : "", tsk->pid, action->irq); - - desc = irq_to_desc(action->irq); - - /* - * If IRQTF_RUNTHREAD is set, we need to decrement - * desc->threads_active and wake possible waiters. - */ - if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) - wake_threads_waitq(desc); - - /* Prevent a stale desc->threads_oneshot */ - irq_finalize_oneshot(desc, action); -} - static void irq_setup_forced_threading(struct irqaction *new) { if (!force_irqthreads) @@ -1044,7 +1049,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * has. The type flags are unreliable as the * underlying chip implementation can override them. */ - pr_err("genirq: Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", + pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", irq); ret = -EINVAL; goto out_mask; @@ -1095,7 +1100,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (nmsk != omsk) /* hope the handler works with current trigger mode */ - pr_warning("genirq: irq %d uses trigger mode %u; requested %u\n", + pr_warning("irq %d uses trigger mode %u; requested %u\n", irq, nmsk, omsk); } @@ -1133,7 +1138,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) mismatch: if (!(new->flags & IRQF_PROBE_SHARED)) { - pr_err("genirq: Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", + pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", irq, new->flags, new->name, old->flags, old->name); #ifdef CONFIG_DEBUG_SHIRQ dump_stack(); @@ -1220,12 +1225,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Found it - now remove it from the list of entries: */ *action_ptr = action->next; - /* Currently used only by UML, might disappear one day: */ -#ifdef CONFIG_IRQ_RELEASE_METHOD - if (desc->irq_data.chip->release) - desc->irq_data.chip->release(irq, dev_id); -#endif - /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) irq_shutdown(desc); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index c3c89751b327..ca3f4aaff707 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -42,17 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata) * For correct operation this depends on the caller * masking the irqs. */ - if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) - < nr_cpu_ids)) { - int ret = chip->irq_set_affinity(&desc->irq_data, - desc->pending_mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(desc->irq_data.affinity, desc->pending_mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - } - } + if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) + irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false); cpumask_clear(desc->pending_mask); } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 079f1d39a8b8..2169feeba529 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -343,7 +343,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, /* Look up a kernel symbol and return it in a text buffer. */ static int __sprint_symbol(char *buffer, unsigned long address, - int symbol_offset) + int symbol_offset, int add_offset) { char *modname; const char *name; @@ -358,13 +358,13 @@ static int __sprint_symbol(char *buffer, unsigned long address, if (name != buffer) strcpy(buffer, name); len = strlen(buffer); - buffer += len; offset -= symbol_offset; + if (add_offset) + len += sprintf(buffer + len, "+%#lx/%#lx", offset, size); + if (modname) - len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); - else - len += sprintf(buffer, "+%#lx/%#lx", offset, size); + len += sprintf(buffer + len, " [%s]", modname); return len; } @@ -382,12 +382,28 @@ static int __sprint_symbol(char *buffer, unsigned long address, */ int sprint_symbol(char *buffer, unsigned long address) { - return __sprint_symbol(buffer, address, 0); + return __sprint_symbol(buffer, address, 0, 1); } - EXPORT_SYMBOL_GPL(sprint_symbol); /** + * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function looks up a kernel symbol with @address and stores its name + * and module name to @buffer if possible. If no symbol was found, just saves + * its @address as is. + * + * This function returns the number of bytes stored in @buffer. + */ +int sprint_symbol_no_offset(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, 0, 0); +} +EXPORT_SYMBOL_GPL(sprint_symbol_no_offset); + +/** * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup @@ -403,7 +419,7 @@ EXPORT_SYMBOL_GPL(sprint_symbol); */ int sprint_backtrace(char *buffer, unsigned long address) { - return __sprint_symbol(buffer, address, -1); + return __sprint_symbol(buffer, address, -1, 1); } /* Look up a kernel symbol and print it to the kernel messages. */ diff --git a/kernel/kcmp.c b/kernel/kcmp.c new file mode 100644 index 000000000000..30b7b225306c --- /dev/null +++ b/kernel/kcmp.c @@ -0,0 +1,196 @@ +#include <linux/kernel.h> +#include <linux/syscalls.h> +#include <linux/fdtable.h> +#include <linux/string.h> +#include <linux/random.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/cache.h> +#include <linux/bug.h> +#include <linux/err.h> +#include <linux/kcmp.h> + +#include <asm/unistd.h> + +/* + * We don't expose the real in-memory order of objects for security reasons. + * But still the comparison results should be suitable for sorting. So we + * obfuscate kernel pointers values and compare the production instead. + * + * The obfuscation is done in two steps. First we xor the kernel pointer with + * a random value, which puts pointer into a new position in a reordered space. + * Secondly we multiply the xor production with a large odd random number to + * permute its bits even more (the odd multiplier guarantees that the product + * is unique ever after the high bits are truncated, since any odd number is + * relative prime to 2^n). + * + * Note also that the obfuscation itself is invisible to userspace and if needed + * it can be changed to an alternate scheme. + */ +static unsigned long cookies[KCMP_TYPES][2] __read_mostly; + +static long kptr_obfuscate(long v, int type) +{ + return (v ^ cookies[type][0]) * cookies[type][1]; +} + +/* + * 0 - equal, i.e. v1 = v2 + * 1 - less than, i.e. v1 < v2 + * 2 - greater than, i.e. v1 > v2 + * 3 - not equal but ordering unavailable (reserved for future) + */ +static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) +{ + long ret; + + ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type); + + return (ret < 0) | ((ret > 0) << 1); +} + +/* The caller must have pinned the task */ +static struct file * +get_file_raw_ptr(struct task_struct *task, unsigned int idx) +{ + struct file *file = NULL; + + task_lock(task); + rcu_read_lock(); + + if (task->files) + file = fcheck_files(task->files, idx); + + rcu_read_unlock(); + task_unlock(task); + + return file; +} + +static void kcmp_unlock(struct mutex *m1, struct mutex *m2) +{ + if (likely(m2 != m1)) + mutex_unlock(m2); + mutex_unlock(m1); +} + +static int kcmp_lock(struct mutex *m1, struct mutex *m2) +{ + int err; + + if (m2 > m1) + swap(m1, m2); + + err = mutex_lock_killable(m1); + if (!err && likely(m1 != m2)) { + err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING); + if (err) + mutex_unlock(m1); + } + + return err; +} + +SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, + unsigned long, idx1, unsigned long, idx2) +{ + struct task_struct *task1, *task2; + int ret; + + rcu_read_lock(); + + /* + * Tasks are looked up in caller's PID namespace only. + */ + task1 = find_task_by_vpid(pid1); + task2 = find_task_by_vpid(pid2); + if (!task1 || !task2) + goto err_no_task; + + get_task_struct(task1); + get_task_struct(task2); + + rcu_read_unlock(); + + /* + * One should have enough rights to inspect task details. + */ + ret = kcmp_lock(&task1->signal->cred_guard_mutex, + &task2->signal->cred_guard_mutex); + if (ret) + goto err; + if (!ptrace_may_access(task1, PTRACE_MODE_READ) || + !ptrace_may_access(task2, PTRACE_MODE_READ)) { + ret = -EPERM; + goto err_unlock; + } + + switch (type) { + case KCMP_FILE: { + struct file *filp1, *filp2; + + filp1 = get_file_raw_ptr(task1, idx1); + filp2 = get_file_raw_ptr(task2, idx2); + + if (filp1 && filp2) + ret = kcmp_ptr(filp1, filp2, KCMP_FILE); + else + ret = -EBADF; + break; + } + case KCMP_VM: + ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM); + break; + case KCMP_FILES: + ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES); + break; + case KCMP_FS: + ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS); + break; + case KCMP_SIGHAND: + ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND); + break; + case KCMP_IO: + ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO); + break; + case KCMP_SYSVSEM: +#ifdef CONFIG_SYSVIPC + ret = kcmp_ptr(task1->sysvsem.undo_list, + task2->sysvsem.undo_list, + KCMP_SYSVSEM); +#else + ret = -EOPNOTSUPP; +#endif + break; + default: + ret = -EINVAL; + break; + } + +err_unlock: + kcmp_unlock(&task1->signal->cred_guard_mutex, + &task2->signal->cred_guard_mutex); +err: + put_task_struct(task1); + put_task_struct(task2); + + return ret; + +err_no_task: + rcu_read_unlock(); + return -ESRCH; +} + +static __init int kcmp_cookies_init(void) +{ + int i; + + get_random_bytes(cookies, sizeof(cookies)); + + for (i = 0; i < KCMP_TYPES; i++) + cookies[i][1] |= (~(~0UL >> 1) | 1); + + return 0; +} +arch_initcall(kcmp_cookies_init); diff --git a/kernel/kfifo.c b/kernel/kfifo.c index c744b88c44e2..59dcf5b81d24 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -402,6 +402,7 @@ unsigned int __kfifo_max_r(unsigned int len, size_t recsize) return max; return len; } +EXPORT_SYMBOL(__kfifo_max_r); #define __KFIFO_PEEK(data, out, mask) \ ((data)[(out) & (mask)]) diff --git a/kernel/kmod.c b/kernel/kmod.c index 05698a7415fe..ff2c7cb86d77 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -221,13 +221,12 @@ fail: return 0; } -void call_usermodehelper_freeinfo(struct subprocess_info *info) +static void call_usermodehelper_freeinfo(struct subprocess_info *info) { if (info->cleanup) (*info->cleanup)(info); kfree(info); } -EXPORT_SYMBOL(call_usermodehelper_freeinfo); static void umh_complete(struct subprocess_info *sub_info) { @@ -410,7 +409,7 @@ EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); /** * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. - * depth: New value to assign to usermodehelper_disabled. + * @depth: New value to assign to usermodehelper_disabled. * * Change the value of usermodehelper_disabled (under umhelper_sem locked for * writing) and wakeup tasks waiting for it to change. @@ -479,6 +478,7 @@ static void helper_unlock(void) * structure. This should be passed to call_usermodehelper_exec to * exec the process and free the structure. */ +static struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, char **envp, gfp_t gfp_mask) { @@ -494,7 +494,6 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, out: return sub_info; } -EXPORT_SYMBOL(call_usermodehelper_setup); /** * call_usermodehelper_setfns - set a cleanup/init function @@ -512,6 +511,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. */ +static void call_usermodehelper_setfns(struct subprocess_info *info, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), @@ -521,7 +521,6 @@ void call_usermodehelper_setfns(struct subprocess_info *info, info->init = init; info->data = data; } -EXPORT_SYMBOL(call_usermodehelper_setfns); /** * call_usermodehelper_exec - start a usermode application @@ -535,6 +534,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns); * asynchronously if wait is not set, and runs as a child of keventd. * (ie. it runs with full root capabilities). */ +static int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { DECLARE_COMPLETION_ONSTACK(done); @@ -576,7 +576,25 @@ unlock: helper_unlock(); return retval; } -EXPORT_SYMBOL(call_usermodehelper_exec); + +int call_usermodehelper_fns( + char *path, char **argv, char **envp, int wait, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *), void *data) +{ + struct subprocess_info *info; + gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; + + info = call_usermodehelper_setup(path, argv, envp, gfp_mask); + + if (info == NULL) + return -ENOMEM; + + call_usermodehelper_setfns(info, init, cleanup, data); + + return call_usermodehelper_exec(info, wait); +} +EXPORT_SYMBOL(call_usermodehelper_fns); static int proc_cap_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) diff --git a/kernel/lglock.c b/kernel/lglock.c new file mode 100644 index 000000000000..6535a667a5a7 --- /dev/null +++ b/kernel/lglock.c @@ -0,0 +1,89 @@ +/* See include/linux/lglock.h for description */ +#include <linux/module.h> +#include <linux/lglock.h> +#include <linux/cpu.h> +#include <linux/string.h> + +/* + * Note there is no uninit, so lglocks cannot be defined in + * modules (but it's fine to use them from there) + * Could be added though, just undo lg_lock_init + */ + +void lg_lock_init(struct lglock *lg, char *name) +{ + LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); +} +EXPORT_SYMBOL(lg_lock_init); + +void lg_local_lock(struct lglock *lg) +{ + arch_spinlock_t *lock; + + preempt_disable(); + rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock = this_cpu_ptr(lg->lock); + arch_spin_lock(lock); +} +EXPORT_SYMBOL(lg_local_lock); + +void lg_local_unlock(struct lglock *lg) +{ + arch_spinlock_t *lock; + + rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock = this_cpu_ptr(lg->lock); + arch_spin_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(lg_local_unlock); + +void lg_local_lock_cpu(struct lglock *lg, int cpu) +{ + arch_spinlock_t *lock; + + preempt_disable(); + rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock = per_cpu_ptr(lg->lock, cpu); + arch_spin_lock(lock); +} +EXPORT_SYMBOL(lg_local_lock_cpu); + +void lg_local_unlock_cpu(struct lglock *lg, int cpu) +{ + arch_spinlock_t *lock; + + rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock = per_cpu_ptr(lg->lock, cpu); + arch_spin_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(lg_local_unlock_cpu); + +void lg_global_lock(struct lglock *lg) +{ + int i; + + preempt_disable(); + rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); + for_each_possible_cpu(i) { + arch_spinlock_t *lock; + lock = per_cpu_ptr(lg->lock, i); + arch_spin_lock(lock); + } +} +EXPORT_SYMBOL(lg_global_lock); + +void lg_global_unlock(struct lglock *lg) +{ + int i; + + rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + for_each_possible_cpu(i) { + arch_spinlock_t *lock; + lock = per_cpu_ptr(lg->lock, i); + arch_spin_unlock(lock); + } + preempt_enable(); +} +EXPORT_SYMBOL(lg_global_unlock); diff --git a/kernel/module.c b/kernel/module.c index a4e60973ca73..4edbd9c11aca 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2429,7 +2429,8 @@ static int copy_and_check(struct load_info *info, goto free_hdr; } - if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { + if (hdr->e_shoff >= len || + hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { err = -ENOEXEC; goto free_hdr; } diff --git a/kernel/panic.c b/kernel/panic.c index 8ed89a175d79..d2a5f4ecc6dd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -27,7 +27,7 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 -int panic_on_oops; +int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; @@ -108,8 +108,6 @@ void panic(const char *fmt, ...) */ crash_kexec(NULL); - kmsg_dump(KMSG_DUMP_PANIC); - /* * Note smp_send_stop is the usual smp shutdown function, which * unfortunately means it may not be hardened to work in a panic @@ -117,6 +115,8 @@ void panic(const char *fmt, ...) */ smp_send_stop(); + kmsg_dump(KMSG_DUMP_PANIC); + atomic_notifier_call_chain(&panic_notifier_list, 0, buf); bust_spinlocks(0); diff --git a/kernel/pid.c b/kernel/pid.c index 9f08dfabaf13..e86b291ad834 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -547,7 +547,8 @@ void __init pidhash_init(void) pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, HASH_EARLY | HASH_SMALL, - &pidhash_shift, NULL, 4096); + &pidhash_shift, NULL, + 0, 4096); pidhash_size = 1U << pidhash_shift; for (i = 0; i < pidhash_size; i++) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 57bc1fd35b3c..b3c7fd554250 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -149,7 +149,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) { int nr; int rc; - struct task_struct *task; + struct task_struct *task, *me = current; + + /* Ignore SIGCHLD causing any terminated children to autoreap */ + spin_lock_irq(&me->sighand->siglock); + me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; + spin_unlock_irq(&me->sighand->siglock); /* * The last thread in the cgroup-init thread group is terminating. @@ -179,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) } read_unlock(&tasklist_lock); + /* Firstly reap the EXIT_ZOMBIE children we may have. */ do { clear_thread_flag(TIF_SIGPENDING); rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); + /* + * sys_wait4() above can't reap the TASK_DEAD children. + * Make sure they all go away, see __unhash_process(). + */ + for (;;) { + bool need_wait = false; + + read_lock(&tasklist_lock); + if (!list_empty(¤t->children)) { + __set_current_state(TASK_UNINTERRUPTIBLE); + need_wait = true; + } + read_unlock(&tasklist_lock); + + if (!need_wait) + break; + schedule(); + } + if (pid_ns->reboot) current->signal->group_exit_code = pid_ns->reboot; @@ -191,6 +216,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) return; } +#ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -218,8 +244,8 @@ static struct ctl_table pid_ns_ctl_table[] = { }, { } }; - static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; +#endif /* CONFIG_CHECKPOINT_RESTORE */ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { @@ -253,7 +279,10 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + +#ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_paths(kern_path, pid_ns_ctl_table); +#endif return 0; } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index deb5461e3216..8f9b4eb974e0 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -103,6 +103,33 @@ config PM_SLEEP_SMP select HOTPLUG select HOTPLUG_CPU +config PM_AUTOSLEEP + bool "Opportunistic sleep" + depends on PM_SLEEP + default n + ---help--- + Allow the kernel to trigger a system transition into a global sleep + state automatically whenever there are no active wakeup sources. + +config PM_WAKELOCKS + bool "User space wakeup sources interface" + depends on PM_SLEEP + default n + ---help--- + Allow user space to create, activate and deactivate wakeup source + objects with the help of a sysfs-based interface. + +config PM_WAKELOCKS_LIMIT + int "Maximum number of user space wakeup sources (0 = no limit)" + range 0 100000 + default 100 + depends on PM_WAKELOCKS + +config PM_WAKELOCKS_GC + bool "Garbage collector for user space wakeup sources" + depends on PM_WAKELOCKS + default y + config PM_RUNTIME bool "Run-time PM core functionality" depends on !IA64_HP_SIM diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 66d808ec5252..29472bff11ef 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -9,5 +9,7 @@ obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o +obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o +obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c new file mode 100644 index 000000000000..ca304046d9e2 --- /dev/null +++ b/kernel/power/autosleep.c @@ -0,0 +1,127 @@ +/* + * kernel/power/autosleep.c + * + * Opportunistic sleep support. + * + * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> + */ + +#include <linux/device.h> +#include <linux/mutex.h> +#include <linux/pm_wakeup.h> + +#include "power.h" + +static suspend_state_t autosleep_state; +static struct workqueue_struct *autosleep_wq; +/* + * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source + * is active, otherwise a deadlock with try_to_suspend() is possible. + * Alternatively mutex_lock_interruptible() can be used. This will then fail + * if an auto_sleep cycle tries to freeze processes. + */ +static DEFINE_MUTEX(autosleep_lock); +static struct wakeup_source *autosleep_ws; + +static void try_to_suspend(struct work_struct *work) +{ + unsigned int initial_count, final_count; + + if (!pm_get_wakeup_count(&initial_count, true)) + goto out; + + mutex_lock(&autosleep_lock); + + if (!pm_save_wakeup_count(initial_count)) { + mutex_unlock(&autosleep_lock); + goto out; + } + + if (autosleep_state == PM_SUSPEND_ON) { + mutex_unlock(&autosleep_lock); + return; + } + if (autosleep_state >= PM_SUSPEND_MAX) + hibernate(); + else + pm_suspend(autosleep_state); + + mutex_unlock(&autosleep_lock); + + if (!pm_get_wakeup_count(&final_count, false)) + goto out; + + /* + * If the wakeup occured for an unknown reason, wait to prevent the + * system from trying to suspend and waking up in a tight loop. + */ + if (final_count == initial_count) + schedule_timeout_uninterruptible(HZ / 2); + + out: + queue_up_suspend_work(); +} + +static DECLARE_WORK(suspend_work, try_to_suspend); + +void queue_up_suspend_work(void) +{ + if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) + queue_work(autosleep_wq, &suspend_work); +} + +suspend_state_t pm_autosleep_state(void) +{ + return autosleep_state; +} + +int pm_autosleep_lock(void) +{ + return mutex_lock_interruptible(&autosleep_lock); +} + +void pm_autosleep_unlock(void) +{ + mutex_unlock(&autosleep_lock); +} + +int pm_autosleep_set_state(suspend_state_t state) +{ + +#ifndef CONFIG_HIBERNATION + if (state >= PM_SUSPEND_MAX) + return -EINVAL; +#endif + + __pm_stay_awake(autosleep_ws); + + mutex_lock(&autosleep_lock); + + autosleep_state = state; + + __pm_relax(autosleep_ws); + + if (state > PM_SUSPEND_ON) { + pm_wakep_autosleep_enabled(true); + queue_up_suspend_work(); + } else { + pm_wakep_autosleep_enabled(false); + } + + mutex_unlock(&autosleep_lock); + return 0; +} + +int __init pm_autosleep_init(void) +{ + autosleep_ws = wakeup_source_register("autosleep"); + if (!autosleep_ws) + return -ENOMEM; + + autosleep_wq = alloc_ordered_workqueue("autosleep", 0); + if (autosleep_wq) + return 0; + + wakeup_source_unregister(autosleep_ws); + return -ENOMEM; +} diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e09dfbfeecee..238025f5472e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -25,7 +25,8 @@ #include <linux/freezer.h> #include <linux/gfp.h> #include <linux/syscore_ops.h> -#include <scsi/scsi_scan.h> +#include <linux/ctype.h> +#include <linux/genhd.h> #include "power.h" @@ -722,6 +723,17 @@ static int software_resume(void) /* Check if the device is there */ swsusp_resume_device = name_to_dev_t(resume_file); + + /* + * name_to_dev_t is ineffective to verify parition if resume_file is in + * integer format. (e.g. major:minor) + */ + if (isdigit(resume_file[0]) && resume_wait) { + int partno; + while (!get_gendisk(swsusp_resume_device, &partno)) + msleep(10); + } + if (!swsusp_resume_device) { /* * Some device discovery might still be in progress; we need @@ -735,13 +747,6 @@ static int software_resume(void) async_synchronize_full(); } - /* - * We can't depend on SCSI devices being available after loading - * one of their modules until scsi_complete_async_scans() is - * called and the resume device usually is a SCSI one. - */ - scsi_complete_async_scans(); - swsusp_resume_device = name_to_dev_t(resume_file); if (!swsusp_resume_device) { error = -ENODEV; diff --git a/kernel/power/main.c b/kernel/power/main.c index 1c12581f1c62..428f8a034e96 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -269,8 +269,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, return (s - buf); } -static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) +static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND suspend_state_t state = PM_SUSPEND_STANDBY; @@ -278,27 +277,48 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, #endif char *p; int len; - int error = -EINVAL; p = memchr(buf, '\n', n); len = p ? p - buf : n; - /* First, check if we are requested to hibernate */ - if (len == 4 && !strncmp(buf, "disk", len)) { - error = hibernate(); - goto Exit; - } + /* Check hibernation first. */ + if (len == 4 && !strncmp(buf, "disk", len)) + return PM_SUSPEND_MAX; #ifdef CONFIG_SUSPEND - for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { - if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { - error = pm_suspend(state); - break; - } - } + for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) + if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) + return state; #endif - Exit: + return PM_SUSPEND_ON; +} + +static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + suspend_state_t state; + int error; + + error = pm_autosleep_lock(); + if (error) + return error; + + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } + + state = decode_state(buf, n); + if (state < PM_SUSPEND_MAX) + error = pm_suspend(state); + else if (state == PM_SUSPEND_MAX) + error = hibernate(); + else + error = -EINVAL; + + out: + pm_autosleep_unlock(); return error ? error : n; } @@ -339,7 +359,8 @@ static ssize_t wakeup_count_show(struct kobject *kobj, { unsigned int val; - return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR; + return pm_get_wakeup_count(&val, true) ? + sprintf(buf, "%u\n", val) : -EINTR; } static ssize_t wakeup_count_store(struct kobject *kobj, @@ -347,15 +368,106 @@ static ssize_t wakeup_count_store(struct kobject *kobj, const char *buf, size_t n) { unsigned int val; + int error; + + error = pm_autosleep_lock(); + if (error) + return error; + + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } + error = -EINVAL; if (sscanf(buf, "%u", &val) == 1) { if (pm_save_wakeup_count(val)) - return n; + error = n; } - return -EINVAL; + + out: + pm_autosleep_unlock(); + return error; } power_attr(wakeup_count); + +#ifdef CONFIG_PM_AUTOSLEEP +static ssize_t autosleep_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + suspend_state_t state = pm_autosleep_state(); + + if (state == PM_SUSPEND_ON) + return sprintf(buf, "off\n"); + +#ifdef CONFIG_SUSPEND + if (state < PM_SUSPEND_MAX) + return sprintf(buf, "%s\n", valid_state(state) ? + pm_states[state] : "error"); +#endif +#ifdef CONFIG_HIBERNATION + return sprintf(buf, "disk\n"); +#else + return sprintf(buf, "error"); +#endif +} + +static ssize_t autosleep_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + suspend_state_t state = decode_state(buf, n); + int error; + + if (state == PM_SUSPEND_ON + && strcmp(buf, "off") && strcmp(buf, "off\n")) + return -EINVAL; + + error = pm_autosleep_set_state(state); + return error ? error : n; +} + +power_attr(autosleep); +#endif /* CONFIG_PM_AUTOSLEEP */ + +#ifdef CONFIG_PM_WAKELOCKS +static ssize_t wake_lock_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return pm_show_wakelocks(buf, true); +} + +static ssize_t wake_lock_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int error = pm_wake_lock(buf); + return error ? error : n; +} + +power_attr(wake_lock); + +static ssize_t wake_unlock_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return pm_show_wakelocks(buf, false); +} + +static ssize_t wake_unlock_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int error = pm_wake_unlock(buf); + return error ? error : n; +} + +power_attr(wake_unlock); + +#endif /* CONFIG_PM_WAKELOCKS */ #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_TRACE @@ -409,6 +521,13 @@ static struct attribute * g[] = { #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, &wakeup_count_attr.attr, +#ifdef CONFIG_PM_AUTOSLEEP + &autosleep_attr.attr, +#endif +#ifdef CONFIG_PM_WAKELOCKS + &wake_lock_attr.attr, + &wake_unlock_attr.attr, +#endif #ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, #endif @@ -444,7 +563,10 @@ static int __init pm_init(void) power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; - return sysfs_create_group(power_kobj, &attr_group); + error = sysfs_create_group(power_kobj, &attr_group); + if (error) + return error; + return pm_autosleep_init(); } core_initcall(pm_init); diff --git a/kernel/power/power.h b/kernel/power/power.h index 98f3622d7407..b0bd4beaebfe 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -264,3 +264,30 @@ static inline void suspend_thaw_processes(void) { } #endif + +#ifdef CONFIG_PM_AUTOSLEEP + +/* kernel/power/autosleep.c */ +extern int pm_autosleep_init(void); +extern int pm_autosleep_lock(void); +extern void pm_autosleep_unlock(void); +extern suspend_state_t pm_autosleep_state(void); +extern int pm_autosleep_set_state(suspend_state_t state); + +#else /* !CONFIG_PM_AUTOSLEEP */ + +static inline int pm_autosleep_init(void) { return 0; } +static inline int pm_autosleep_lock(void) { return 0; } +static inline void pm_autosleep_unlock(void) {} +static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; } + +#endif /* !CONFIG_PM_AUTOSLEEP */ + +#ifdef CONFIG_PM_WAKELOCKS + +/* kernel/power/wakelock.c */ +extern ssize_t pm_show_wakelocks(char *buf, bool show_active); +extern int pm_wake_lock(const char *buf); +extern int pm_wake_unlock(const char *buf); + +#endif /* !CONFIG_PM_WAKELOCKS */ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index eef311a58a64..11e22c068e8b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -6,7 +6,7 @@ * * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> - * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com> + * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> * * This file is released under the GPLv2. * @@ -282,14 +282,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) return -ENOSPC; if (bio_chain) { - src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | + __GFP_NORETRY); if (src) { copy_page(src, buf); } else { ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ if (ret) return ret; - src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + src = (void *)__get_free_page(__GFP_WAIT | + __GFP_NOWARN | + __GFP_NORETRY); if (src) { copy_page(src, buf); } else { @@ -367,12 +370,17 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; - } - if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { - error = hib_wait_on_bio_chain(bio_chain); - if (error) - goto out; - handle->reqd_free_pages = reqd_free_pages(); + + if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { + error = hib_wait_on_bio_chain(bio_chain); + if (error) + goto out; + /* + * Recalculate the number of required free pages, to + * make sure we never take more than half. + */ + handle->reqd_free_pages = reqd_free_pages(); + } } out: return error; @@ -419,8 +427,9 @@ static int swap_writer_finish(struct swap_map_handle *handle, /* Maximum number of threads for compression/decompression. */ #define LZO_THREADS 3 -/* Maximum number of pages for read buffering. */ -#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) +/* Minimum/maximum number of pages for read buffering. */ +#define LZO_MIN_RD_PAGES 1024 +#define LZO_MAX_RD_PAGES 8192 /** @@ -631,12 +640,6 @@ static int save_image_lzo(struct swap_map_handle *handle, } /* - * Adjust number of free pages after all allocations have been done. - * We don't want to run out of pages when writing. - */ - handle->reqd_free_pages = reqd_free_pages(); - - /* * Start the CRC32 thread. */ init_waitqueue_head(&crc->go); @@ -657,6 +660,12 @@ static int save_image_lzo(struct swap_map_handle *handle, goto out_clean; } + /* + * Adjust the number of required free pages after all allocations have + * been done. We don't want to run out of pages when writing. + */ + handle->reqd_free_pages = reqd_free_pages(); + printk(KERN_INFO "PM: Using %u thread(s) for compression.\n" "PM: Compressing and saving image data (%u pages) ... ", @@ -1067,7 +1076,7 @@ static int load_image_lzo(struct swap_map_handle *handle, unsigned i, thr, run_threads, nr_threads; unsigned ring = 0, pg = 0, ring_size = 0, have = 0, want, need, asked = 0; - unsigned long read_pages; + unsigned long read_pages = 0; unsigned char **page = NULL; struct dec_data *data = NULL; struct crc_data *crc = NULL; @@ -1079,7 +1088,7 @@ static int load_image_lzo(struct swap_map_handle *handle, nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - page = vmalloc(sizeof(*page) * LZO_READ_PAGES); + page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES); if (!page) { printk(KERN_ERR "PM: Failed to allocate LZO page\n"); ret = -ENOMEM; @@ -1144,15 +1153,22 @@ static int load_image_lzo(struct swap_map_handle *handle, } /* - * Adjust number of pages for read buffering, in case we are short. + * Set the number of pages for read buffering. + * This is complete guesswork, because we'll only know the real + * picture once prepare_image() is called, which is much later on + * during the image load phase. We'll assume the worst case and + * say that none of the image pages are from high memory. */ - read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; - read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); + if (low_free_pages() > snapshot_get_image_size()) + read_pages = (low_free_pages() - snapshot_get_image_size()) / 2; + read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES); for (i = 0; i < read_pages; i++) { page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? __GFP_WAIT | __GFP_HIGH : - __GFP_WAIT); + __GFP_WAIT | __GFP_NOWARN | + __GFP_NORETRY); + if (!page[i]) { if (i < LZO_CMP_PAGES) { ring_size = i; diff --git a/kernel/power/user.c b/kernel/power/user.c index 91b0fd021a95..4ed81e74f86f 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -24,7 +24,6 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> -#include <scsi/scsi_scan.h> #include <asm/uaccess.h> @@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) * appear. */ wait_for_device_probe(); - scsi_complete_async_scans(); data->swap = -1; data->mode = O_WRONLY; diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c new file mode 100644 index 000000000000..c8fba3380076 --- /dev/null +++ b/kernel/power/wakelock.c @@ -0,0 +1,259 @@ +/* + * kernel/power/wakelock.c + * + * User space wakeup sources support. + * + * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> + * + * This code is based on the analogous interface allowing user space to + * manipulate wakelocks on Android. + */ + +#include <linux/ctype.h> +#include <linux/device.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/slab.h> + +static DEFINE_MUTEX(wakelocks_lock); + +struct wakelock { + char *name; + struct rb_node node; + struct wakeup_source ws; +#ifdef CONFIG_PM_WAKELOCKS_GC + struct list_head lru; +#endif +}; + +static struct rb_root wakelocks_tree = RB_ROOT; + +ssize_t pm_show_wakelocks(char *buf, bool show_active) +{ + struct rb_node *node; + struct wakelock *wl; + char *str = buf; + char *end = buf + PAGE_SIZE; + + mutex_lock(&wakelocks_lock); + + for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { + wl = rb_entry(node, struct wakelock, node); + if (wl->ws.active == show_active) + str += scnprintf(str, end - str, "%s ", wl->name); + } + if (str > buf) + str--; + + str += scnprintf(str, end - str, "\n"); + + mutex_unlock(&wakelocks_lock); + return (str - buf); +} + +#if CONFIG_PM_WAKELOCKS_LIMIT > 0 +static unsigned int number_of_wakelocks; + +static inline bool wakelocks_limit_exceeded(void) +{ + return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT; +} + +static inline void increment_wakelocks_number(void) +{ + number_of_wakelocks++; +} + +static inline void decrement_wakelocks_number(void) +{ + number_of_wakelocks--; +} +#else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */ +static inline bool wakelocks_limit_exceeded(void) { return false; } +static inline void increment_wakelocks_number(void) {} +static inline void decrement_wakelocks_number(void) {} +#endif /* CONFIG_PM_WAKELOCKS_LIMIT */ + +#ifdef CONFIG_PM_WAKELOCKS_GC +#define WL_GC_COUNT_MAX 100 +#define WL_GC_TIME_SEC 300 + +static LIST_HEAD(wakelocks_lru_list); +static unsigned int wakelocks_gc_count; + +static inline void wakelocks_lru_add(struct wakelock *wl) +{ + list_add(&wl->lru, &wakelocks_lru_list); +} + +static inline void wakelocks_lru_most_recent(struct wakelock *wl) +{ + list_move(&wl->lru, &wakelocks_lru_list); +} + +static void wakelocks_gc(void) +{ + struct wakelock *wl, *aux; + ktime_t now; + + if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) + return; + + now = ktime_get(); + list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { + u64 idle_time_ns; + bool active; + + spin_lock_irq(&wl->ws.lock); + idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time)); + active = wl->ws.active; + spin_unlock_irq(&wl->ws.lock); + + if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC)) + break; + + if (!active) { + wakeup_source_remove(&wl->ws); + rb_erase(&wl->node, &wakelocks_tree); + list_del(&wl->lru); + kfree(wl->name); + kfree(wl); + decrement_wakelocks_number(); + } + } + wakelocks_gc_count = 0; +} +#else /* !CONFIG_PM_WAKELOCKS_GC */ +static inline void wakelocks_lru_add(struct wakelock *wl) {} +static inline void wakelocks_lru_most_recent(struct wakelock *wl) {} +static inline void wakelocks_gc(void) {} +#endif /* !CONFIG_PM_WAKELOCKS_GC */ + +static struct wakelock *wakelock_lookup_add(const char *name, size_t len, + bool add_if_not_found) +{ + struct rb_node **node = &wakelocks_tree.rb_node; + struct rb_node *parent = *node; + struct wakelock *wl; + + while (*node) { + int diff; + + parent = *node; + wl = rb_entry(*node, struct wakelock, node); + diff = strncmp(name, wl->name, len); + if (diff == 0) { + if (wl->name[len]) + diff = -1; + else + return wl; + } + if (diff < 0) + node = &(*node)->rb_left; + else + node = &(*node)->rb_right; + } + if (!add_if_not_found) + return ERR_PTR(-EINVAL); + + if (wakelocks_limit_exceeded()) + return ERR_PTR(-ENOSPC); + + /* Not found, we have to add a new one. */ + wl = kzalloc(sizeof(*wl), GFP_KERNEL); + if (!wl) + return ERR_PTR(-ENOMEM); + + wl->name = kstrndup(name, len, GFP_KERNEL); + if (!wl->name) { + kfree(wl); + return ERR_PTR(-ENOMEM); + } + wl->ws.name = wl->name; + wakeup_source_add(&wl->ws); + rb_link_node(&wl->node, parent, node); + rb_insert_color(&wl->node, &wakelocks_tree); + wakelocks_lru_add(wl); + increment_wakelocks_number(); + return wl; +} + +int pm_wake_lock(const char *buf) +{ + const char *str = buf; + struct wakelock *wl; + u64 timeout_ns = 0; + size_t len; + int ret = 0; + + while (*str && !isspace(*str)) + str++; + + len = str - buf; + if (!len) + return -EINVAL; + + if (*str && *str != '\n') { + /* Find out if there's a valid timeout string appended. */ + ret = kstrtou64(skip_spaces(str), 10, &timeout_ns); + if (ret) + return -EINVAL; + } + + mutex_lock(&wakelocks_lock); + + wl = wakelock_lookup_add(buf, len, true); + if (IS_ERR(wl)) { + ret = PTR_ERR(wl); + goto out; + } + if (timeout_ns) { + u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1; + + do_div(timeout_ms, NSEC_PER_MSEC); + __pm_wakeup_event(&wl->ws, timeout_ms); + } else { + __pm_stay_awake(&wl->ws); + } + + wakelocks_lru_most_recent(wl); + + out: + mutex_unlock(&wakelocks_lock); + return ret; +} + +int pm_wake_unlock(const char *buf) +{ + struct wakelock *wl; + size_t len; + int ret = 0; + + len = strlen(buf); + if (!len) + return -EINVAL; + + if (buf[len-1] == '\n') + len--; + + if (!len) + return -EINVAL; + + mutex_lock(&wakelocks_lock); + + wl = wakelock_lookup_add(buf, len, false); + if (IS_ERR(wl)) { + ret = PTR_ERR(wl); + goto out; + } + __pm_relax(&wl->ws); + + wakelocks_lru_most_recent(wl); + wakelocks_gc(); + + out: + mutex_unlock(&wakelocks_lock); + return ret; +} diff --git a/kernel/printk.c b/kernel/printk.c index 32462d2b364a..177fa49357a5 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -193,12 +193,21 @@ static int console_may_schedule; * separated by ',', and find the message after the ';' character. */ +enum log_flags { + LOG_NOCONS = 1, /* already flushed, do not print to console */ + LOG_NEWLINE = 2, /* text ended with a newline */ + LOG_PREFIX = 4, /* text started with a prefix */ + LOG_CONT = 8, /* text is a fragment of a continuation line */ +}; + struct log { u64 ts_nsec; /* timestamp in nanoseconds */ u16 len; /* length of entire record */ u16 text_len; /* length of text buffer */ u16 dict_len; /* length of dictionary buffer */ - u16 level; /* syslog level + facility */ + u8 facility; /* syslog facility */ + u8 flags:5; /* internal record flags */ + u8 level:3; /* syslog level */ }; /* @@ -210,6 +219,8 @@ static DEFINE_RAW_SPINLOCK(logbuf_lock); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; +static enum log_flags syslog_prev; +static size_t syslog_partial; /* index and sequence number of the first record stored in the buffer */ static u64 log_first_seq; @@ -227,10 +238,10 @@ static u32 clear_idx; #define LOG_LINE_MAX 1024 /* record buffer */ -#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) #define LOG_ALIGN 4 #else -#define LOG_ALIGN 8 +#define LOG_ALIGN __alignof__(struct log) #endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); @@ -286,6 +297,7 @@ static u32 log_next(u32 idx) /* insert record into the buffer, discard old ones, update heads */ static void log_store(int facility, int level, + enum log_flags flags, u64 ts_nsec, const char *dict, u16 dict_len, const char *text, u16 text_len) { @@ -329,8 +341,13 @@ static void log_store(int facility, int level, msg->text_len = text_len; memcpy(log_dict(msg), dict, dict_len); msg->dict_len = dict_len; - msg->level = (facility << 3) | (level & 7); - msg->ts_nsec = local_clock(); + msg->facility = facility; + msg->level = level & 7; + msg->flags = flags & 0x1f; + if (ts_nsec > 0) + msg->ts_nsec = ts_nsec; + else + msg->ts_nsec = local_clock(); memset(log_dict(msg) + dict_len, 0, pad_len); msg->len = sizeof(struct log) + text_len + dict_len + pad_len; @@ -414,21 +431,23 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, if (!user) return -EBADF; - mutex_lock(&user->lock); - raw_spin_lock(&logbuf_lock); + ret = mutex_lock_interruptible(&user->lock); + if (ret) + return ret; + raw_spin_lock_irq(&logbuf_lock); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); goto out; } - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); ret = wait_event_interruptible(log_wait, user->seq != log_next_seq); if (ret) goto out; - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); } if (user->seq < log_first_seq) { @@ -436,7 +455,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_first_idx; user->seq = log_first_seq; ret = -EPIPE; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); goto out; } @@ -444,13 +463,13 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, ts_usec = msg->ts_nsec; do_div(ts_usec, 1000); len = sprintf(user->buf, "%u,%llu,%llu;", - msg->level, user->seq, ts_usec); + (msg->facility << 3) | msg->level, user->seq, ts_usec); /* escape non-printable characters */ for (i = 0; i < msg->text_len; i++) { unsigned char c = log_text(msg)[i]; - if (c < ' ' || c >= 128) + if (c < ' ' || c >= 127 || c == '\\') len += sprintf(user->buf + len, "\\x%02x", c); else user->buf[len++] = c; @@ -474,7 +493,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, continue; } - if (c < ' ' || c >= 128) { + if (c < ' ' || c >= 127 || c == '\\') { len += sprintf(user->buf + len, "\\x%02x", c); continue; } @@ -486,7 +505,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_next(user->idx); user->seq++; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); if (len > count) { ret = -EINVAL; @@ -513,7 +532,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) if (offset) return -ESPIPE; - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); switch (whence) { case SEEK_SET: /* the first record */ @@ -537,7 +556,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) default: ret = -EINVAL; } - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); return ret; } @@ -551,14 +570,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); if (user->seq < log_next_seq) { /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; ret = POLLIN|POLLRDNORM; } - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); return ret; } @@ -582,10 +601,10 @@ static int devkmsg_open(struct inode *inode, struct file *file) mutex_init(&user->lock); - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); user->idx = log_first_idx; user->seq = log_first_seq; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); file->private_data = user; return 0; @@ -785,44 +804,64 @@ static bool printk_time; #endif module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); +static size_t print_time(u64 ts, char *buf) +{ + unsigned long rem_nsec; + + if (!printk_time) + return 0; + + if (!buf) + return 15; + + rem_nsec = do_div(ts, 1000000000); + return sprintf(buf, "[%5lu.%06lu] ", + (unsigned long)ts, rem_nsec / 1000); +} + static size_t print_prefix(const struct log *msg, bool syslog, char *buf) { size_t len = 0; + unsigned int prefix = (msg->facility << 3) | msg->level; if (syslog) { if (buf) { - len += sprintf(buf, "<%u>", msg->level); + len += sprintf(buf, "<%u>", prefix); } else { len += 3; - if (msg->level > 9) - len++; - if (msg->level > 99) + if (prefix > 999) + len += 3; + else if (prefix > 99) + len += 2; + else if (prefix > 9) len++; } } - if (printk_time) { - if (buf) { - unsigned long long ts = msg->ts_nsec; - unsigned long rem_nsec = do_div(ts, 1000000000); - - len += sprintf(buf + len, "[%5lu.%06lu] ", - (unsigned long) ts, rem_nsec / 1000); - } else { - len += 15; - } - } - + len += print_time(msg->ts_nsec, buf ? buf + len : NULL); return len; } -static size_t msg_print_text(const struct log *msg, bool syslog, - char *buf, size_t size) +static size_t msg_print_text(const struct log *msg, enum log_flags prev, + bool syslog, char *buf, size_t size) { const char *text = log_text(msg); size_t text_size = msg->text_len; + bool prefix = true; + bool newline = true; size_t len = 0; + if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)) + prefix = false; + + if (msg->flags & LOG_CONT) { + if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE)) + prefix = false; + + if (!(msg->flags & LOG_NEWLINE)) + newline = false; + } + do { const char *next = memchr(text, '\n', text_size); size_t text_len; @@ -840,16 +879,22 @@ static size_t msg_print_text(const struct log *msg, bool syslog, text_len + 1>= size - len) break; - len += print_prefix(msg, syslog, buf + len); + if (prefix) + len += print_prefix(msg, syslog, buf + len); memcpy(buf + len, text, text_len); len += text_len; - buf[len++] = '\n'; + if (next || newline) + buf[len++] = '\n'; } else { /* SYSLOG_ACTION_* buffer size only calculation */ - len += print_prefix(msg, syslog, NULL); - len += text_len + 1; + if (prefix) + len += print_prefix(msg, syslog, NULL); + len += text_len; + if (next || newline) + len++; } + prefix = true; text = next; } while (text); @@ -860,26 +905,60 @@ static int syslog_print(char __user *buf, int size) { char *text; struct log *msg; - int len; + int len = 0; text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; - raw_spin_lock_irq(&logbuf_lock); - if (syslog_seq < log_first_seq) { - /* messages are gone, move to first one */ - syslog_seq = log_first_seq; - syslog_idx = log_first_idx; - } - msg = log_from_idx(syslog_idx); - len = msg_print_text(msg, true, text, LOG_LINE_MAX); - syslog_idx = log_next(syslog_idx); - syslog_seq++; - raw_spin_unlock_irq(&logbuf_lock); + while (size > 0) { + size_t n; + size_t skip; + + raw_spin_lock_irq(&logbuf_lock); + if (syslog_seq < log_first_seq) { + /* messages are gone, move to first one */ + syslog_seq = log_first_seq; + syslog_idx = log_first_idx; + syslog_prev = 0; + syslog_partial = 0; + } + if (syslog_seq == log_next_seq) { + raw_spin_unlock_irq(&logbuf_lock); + break; + } + + skip = syslog_partial; + msg = log_from_idx(syslog_idx); + n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); + if (n - syslog_partial <= size) { + /* message fits into buffer, move forward */ + syslog_idx = log_next(syslog_idx); + syslog_seq++; + syslog_prev = msg->flags; + n -= syslog_partial; + syslog_partial = 0; + } else if (!len){ + /* partial read(), remember position */ + n = size; + syslog_partial += n; + } else + n = 0; + raw_spin_unlock_irq(&logbuf_lock); + + if (!n) + break; + + if (copy_to_user(buf, text + skip, n)) { + if (!len) + len = -EFAULT; + break; + } - if (len > 0 && copy_to_user(buf, text, len)) - len = -EFAULT; + len += n; + size -= n; + buf += n; + } kfree(text); return len; @@ -899,6 +978,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u64 next_seq; u64 seq; u32 idx; + enum log_flags prev; if (clear_seq < log_first_seq) { /* messages are gone, move to first available one */ @@ -909,41 +989,47 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. - */ + */ seq = clear_seq; idx = clear_idx; + prev = 0; while (seq < log_next_seq) { struct log *msg = log_from_idx(idx); - len += msg_print_text(msg, true, NULL, 0); + len += msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; } + + /* move first record forward until length fits into the buffer */ seq = clear_seq; idx = clear_idx; + prev = 0; while (len > size && seq < log_next_seq) { struct log *msg = log_from_idx(idx); - len -= msg_print_text(msg, true, NULL, 0); + len -= msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; } - /* last message in this dump */ + /* last message fitting into this dump */ next_seq = log_next_seq; len = 0; + prev = 0; while (len >= 0 && seq < next_seq) { struct log *msg = log_from_idx(idx); int textlen; - textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); + textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); if (textlen < 0) { len = textlen; break; } idx = log_next(idx); seq++; + prev = msg->flags; raw_spin_unlock_irq(&logbuf_lock); if (copy_to_user(buf + len, text, textlen)) @@ -956,6 +1042,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* messages are gone, move to next one */ seq = log_first_seq; idx = log_first_idx; + prev = 0; } } } @@ -1027,6 +1114,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) /* Clear ring buffer */ case SYSLOG_ACTION_CLEAR: syslog_print_all(NULL, 0, true); + break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == -1) @@ -1059,6 +1147,8 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; + syslog_prev = 0; + syslog_partial = 0; } if (from_file) { /* @@ -1068,19 +1158,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) */ error = log_next_idx - syslog_idx; } else { - u64 seq; - u32 idx; + u64 seq = syslog_seq; + u32 idx = syslog_idx; + enum log_flags prev = syslog_prev; error = 0; - seq = syslog_seq; - idx = syslog_idx; while (seq < log_next_seq) { struct log *msg = log_from_idx(idx); - error += msg_print_text(msg, true, NULL, 0); + error += msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; + prev = msg->flags; } + error -= syslog_partial; } raw_spin_unlock_irq(&logbuf_lock); break; @@ -1259,22 +1350,98 @@ static inline void printk_delay(void) } } +/* + * Continuation lines are buffered, and not committed to the record buffer + * until the line is complete, or a race forces it. The line fragments + * though, are printed immediately to the consoles to ensure everything has + * reached the console in case of a kernel crash. + */ +static struct cont { + char buf[LOG_LINE_MAX]; + size_t len; /* length == 0 means unused buffer */ + size_t cons; /* bytes written to console */ + struct task_struct *owner; /* task of first print*/ + u64 ts_nsec; /* time of first print */ + u8 level; /* log level of first message */ + u8 facility; /* log level of first message */ + bool flushed:1; /* buffer sealed and committed */ +} cont; + +static void cont_flush(void) +{ + if (cont.flushed) + return; + if (cont.len == 0) + return; + + log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec, + NULL, 0, cont.buf, cont.len); + + cont.flushed = true; +} + +static bool cont_add(int facility, int level, const char *text, size_t len) +{ + if (cont.len && cont.flushed) + return false; + + if (cont.len + len > sizeof(cont.buf)) { + cont_flush(); + return false; + } + + if (!cont.len) { + cont.facility = facility; + cont.level = level; + cont.owner = current; + cont.ts_nsec = local_clock(); + cont.cons = 0; + cont.flushed = false; + } + + memcpy(cont.buf + cont.len, text, len); + cont.len += len; + return true; +} + +static size_t cont_print_text(char *text, size_t size) +{ + size_t textlen = 0; + size_t len; + + if (cont.cons == 0) { + textlen += print_time(cont.ts_nsec, text); + size -= textlen; + } + + len = cont.len - cont.cons; + if (len > 0) { + if (len+1 > size) + len = size-1; + memcpy(text + textlen, cont.buf + cont.cons, len); + textlen += len; + cont.cons = cont.len; + } + + if (cont.flushed) { + text[textlen++] = '\n'; + /* got everything, release buffer */ + cont.len = 0; + } + return textlen; +} + asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) { static int recursion_bug; - static char cont_buf[LOG_LINE_MAX]; - static size_t cont_len; - static int cont_level; - static struct task_struct *cont_task; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len; + enum log_flags lflags = 0; unsigned long flags; int this_cpu; - bool newline = false; - bool prefix = false; int printed_len = 0; boot_delay_msec(); @@ -1313,7 +1480,8 @@ asmlinkage int vprintk_emit(int facility, int level, recursion_bug = 0; printed_len += strlen(recursion_msg); /* emit KERN_CRIT message */ - log_store(0, 2, NULL, 0, recursion_msg, printed_len); + log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, + NULL, 0, recursion_msg, printed_len); } /* @@ -1325,7 +1493,7 @@ asmlinkage int vprintk_emit(int facility, int level, /* mark and strip a trailing newline */ if (text_len && text[text_len-1] == '\n') { text_len--; - newline = true; + lflags |= LOG_NEWLINE; } /* strip syslog prefix and extract log level or control flags */ @@ -1335,7 +1503,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (level == -1) level = text[1] - '0'; case 'd': /* KERN_DEFAULT */ - prefix = true; + lflags |= LOG_PREFIX; case 'c': /* KERN_CONT */ text += 3; text_len -= 3; @@ -1345,61 +1513,41 @@ asmlinkage int vprintk_emit(int facility, int level, if (level == -1) level = default_message_loglevel; - if (dict) { - prefix = true; - newline = true; - } + if (dict) + lflags |= LOG_PREFIX|LOG_NEWLINE; - if (!newline) { - if (cont_len && (prefix || cont_task != current)) { - /* - * Flush earlier buffer, which is either from a - * different thread, or when we got a new prefix. - */ - log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); - cont_len = 0; - } - - if (!cont_len) { - cont_level = level; - cont_task = current; - } + if (!(lflags & LOG_NEWLINE)) { + /* + * Flush the conflicting buffer. An earlier newline was missing, + * or another task also prints continuation lines. + */ + if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) + cont_flush(); - /* buffer or append to earlier buffer from the same thread */ - if (cont_len + text_len > sizeof(cont_buf)) - text_len = sizeof(cont_buf) - cont_len; - memcpy(cont_buf + cont_len, text, text_len); - cont_len += text_len; + /* buffer line if possible, otherwise store it right away */ + if (!cont_add(facility, level, text, text_len)) + log_store(facility, level, lflags | LOG_CONT, 0, + dict, dictlen, text, text_len); } else { - if (cont_len && cont_task == current) { - if (prefix) { - /* - * New prefix from the same thread; flush. We - * either got no earlier newline, or we race - * with an interrupt. - */ - log_store(facility, cont_level, - NULL, 0, cont_buf, cont_len); - cont_len = 0; - } + bool stored = false; - /* append to the earlier buffer and flush */ - if (cont_len + text_len > sizeof(cont_buf)) - text_len = sizeof(cont_buf) - cont_len; - memcpy(cont_buf + cont_len, text, text_len); - cont_len += text_len; - log_store(facility, cont_level, - NULL, 0, cont_buf, cont_len); - cont_len = 0; - cont_task = NULL; - printed_len = cont_len; - } else { - /* ordinary single and terminated line */ - log_store(facility, level, - dict, dictlen, text, text_len); - printed_len = text_len; + /* + * If an earlier newline was missing and it was the same task, + * either merge it with the current buffer and flush, or if + * there was a race with interrupts (prefix == true) then just + * flush it out and store this line separately. + */ + if (cont.len && cont.owner == current) { + if (!(lflags & LOG_PREFIX)) + stored = cont_add(facility, level, text, text_len); + cont_flush(); } + + if (!stored) + log_store(facility, level, lflags, 0, + dict, dictlen, text, text_len); } + printed_len += text_len; /* * Try to acquire and then immediately release the console semaphore. @@ -1486,11 +1634,18 @@ EXPORT_SYMBOL(printk); #else #define LOG_LINE_MAX 0 +static struct cont { + size_t len; + size_t cons; + u8 level; + bool flushed:1; +} cont; static struct log *log_from_idx(u32 idx) { return NULL; } static u32 log_next(u32 idx) { return 0; } static void call_console_drivers(int level, const char *text, size_t len) {} -static size_t msg_print_text(const struct log *msg, bool syslog, - char *buf, size_t size) { return 0; } +static size_t msg_print_text(const struct log *msg, enum log_flags prev, + bool syslog, char *buf, size_t size) { return 0; } +static size_t cont_print_text(char *text, size_t size) { return 0; } #endif /* CONFIG_PRINTK */ @@ -1765,6 +1920,7 @@ void wake_up_klogd(void) /* the next printk record to write to the console */ static u64 console_seq; static u32 console_idx; +static enum log_flags console_prev; /** * console_unlock - unlock the console system @@ -1782,6 +1938,7 @@ static u32 console_idx; */ void console_unlock(void) { + static char text[LOG_LINE_MAX]; static u64 seen_seq; unsigned long flags; bool wake_klogd = false; @@ -1794,10 +1951,23 @@ void console_unlock(void) console_may_schedule = 0; + /* flush buffered message fragment immediately to console */ + raw_spin_lock_irqsave(&logbuf_lock, flags); + if (cont.len && (cont.cons < cont.len || cont.flushed)) { + size_t len; + + len = cont_print_text(text, sizeof(text)); + raw_spin_unlock(&logbuf_lock); + stop_critical_timings(); + call_console_drivers(cont.level, text, len); + start_critical_timings(); + local_irq_restore(flags); + } else + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + again: for (;;) { struct log *msg; - static char text[LOG_LINE_MAX]; size_t len; int level; @@ -1811,18 +1981,35 @@ again: /* messages are gone, move to first one */ console_seq = log_first_seq; console_idx = log_first_idx; + console_prev = 0; } - +skip: if (console_seq == log_next_seq) break; msg = log_from_idx(console_idx); - level = msg->level & 7; - - len = msg_print_text(msg, false, text, sizeof(text)); + if (msg->flags & LOG_NOCONS) { + /* + * Skip record we have buffered and already printed + * directly to the console when we received it. + */ + console_idx = log_next(console_idx); + console_seq++; + /* + * We will get here again when we register a new + * CON_PRINTBUFFER console. Clear the flag so we + * will properly dump everything later. + */ + msg->flags &= ~LOG_NOCONS; + goto skip; + } + level = msg->level; + len = msg_print_text(msg, console_prev, false, + text, sizeof(text)); console_idx = log_next(console_idx); console_seq++; + console_prev = msg->flags; raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ @@ -2085,6 +2272,7 @@ void register_console(struct console *newcon) raw_spin_lock_irqsave(&logbuf_lock, flags); console_seq = syslog_seq; console_idx = syslog_idx; + console_prev = syslog_prev; raw_spin_unlock_irqrestore(&logbuf_lock, flags); /* * We're about to replay the log buffer. Only do this to the @@ -2300,48 +2488,214 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping * - * Iterate through each of the dump devices and call the oops/panic - * callbacks with the log buffer. + * Call each of the registered dumper's dump() callback, which can + * retrieve the kmsg records with kmsg_dump_get_line() or + * kmsg_dump_get_buffer(). */ void kmsg_dump(enum kmsg_dump_reason reason) { - u64 idx; struct kmsg_dumper *dumper; - const char *s1, *s2; - unsigned long l1, l2; unsigned long flags; if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) return; - /* Theoretically, the log could move on after we do this, but - there's not a lot we can do about that. The new messages - will overwrite the start of what we dump. */ + rcu_read_lock(); + list_for_each_entry_rcu(dumper, &dump_list, list) { + if (dumper->max_reason && reason > dumper->max_reason) + continue; + + /* initialize iterator with data about the stored records */ + dumper->active = true; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + dumper->cur_seq = clear_seq; + dumper->cur_idx = clear_idx; + dumper->next_seq = log_next_seq; + dumper->next_idx = log_next_idx; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + + /* invoke dumper which will iterate over records */ + dumper->dump(dumper, reason); + + /* reset iterator */ + dumper->active = false; + } + rcu_read_unlock(); +} + +/** + * kmsg_dump_get_line - retrieve one kmsg log line + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the beginning of the kmsg buffer, with the oldest kmsg + * record, and copy one record into the provided buffer. + * + * Consecutive calls will return the next available record moving + * towards the end of the buffer with the youngest messages. + * + * A return value of FALSE indicates that there are no more records to + * read. + */ +bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len) +{ + unsigned long flags; + struct log *msg; + size_t l = 0; + bool ret = false; + + if (!dumper->active) + goto out; raw_spin_lock_irqsave(&logbuf_lock, flags); - if (syslog_seq < log_first_seq) - idx = syslog_idx; - else - idx = log_first_idx; + if (dumper->cur_seq < log_first_seq) { + /* messages are gone, move to first available one */ + dumper->cur_seq = log_first_seq; + dumper->cur_idx = log_first_idx; + } - if (idx > log_next_idx) { - s1 = log_buf; - l1 = log_next_idx; + /* last entry */ + if (dumper->cur_seq >= log_next_seq) { + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + goto out; + } - s2 = log_buf + idx; - l2 = log_buf_len - idx; - } else { - s1 = ""; - l1 = 0; + msg = log_from_idx(dumper->cur_idx); + l = msg_print_text(msg, 0, syslog, line, size); + + dumper->cur_idx = log_next(dumper->cur_idx); + dumper->cur_seq++; + ret = true; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); +out: + if (len) + *len = l; + return ret; +} +EXPORT_SYMBOL_GPL(kmsg_dump_get_line); + +/** + * kmsg_dump_get_buffer - copy kmsg log lines + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @buf: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the end of the kmsg buffer and fill the provided buffer + * with as many of the the *youngest* kmsg records that fit into it. + * If the buffer is large enough, all available kmsg records will be + * copied with a single call. + * + * Consecutive calls will fill the buffer with the next block of + * available older records, not including the earlier retrieved ones. + * + * A return value of FALSE indicates that there are no more records to + * read. + */ +bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + char *buf, size_t size, size_t *len) +{ + unsigned long flags; + u64 seq; + u32 idx; + u64 next_seq; + u32 next_idx; + enum log_flags prev; + size_t l = 0; + bool ret = false; + + if (!dumper->active) + goto out; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + if (dumper->cur_seq < log_first_seq) { + /* messages are gone, move to first available one */ + dumper->cur_seq = log_first_seq; + dumper->cur_idx = log_first_idx; + } + + /* last entry */ + if (dumper->cur_seq >= dumper->next_seq) { + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + goto out; + } + + /* calculate length of entire buffer */ + seq = dumper->cur_seq; + idx = dumper->cur_idx; + prev = 0; + while (seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); + + l += msg_print_text(msg, prev, true, NULL, 0); + idx = log_next(idx); + seq++; + prev = msg->flags; + } - s2 = log_buf + idx; - l2 = log_next_idx - idx; + /* move first record forward until length fits into the buffer */ + seq = dumper->cur_seq; + idx = dumper->cur_idx; + prev = 0; + while (l > size && seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); + + l -= msg_print_text(msg, prev, true, NULL, 0); + idx = log_next(idx); + seq++; + prev = msg->flags; } + + /* last message in next interation */ + next_seq = seq; + next_idx = idx; + + l = 0; + prev = 0; + while (seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); + + l += msg_print_text(msg, prev, syslog, buf + l, size - l); + idx = log_next(idx); + seq++; + prev = msg->flags; + } + + dumper->next_seq = next_seq; + dumper->next_idx = next_idx; + ret = true; raw_spin_unlock_irqrestore(&logbuf_lock, flags); +out: + if (len) + *len = l; + return ret; +} +EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); - rcu_read_lock(); - list_for_each_entry_rcu(dumper, &dump_list, list) - dumper->dump(dumper, reason, s1, l1, s2, l2); - rcu_read_unlock(); +/** + * kmsg_dump_rewind - reset the interator + * @dumper: registered kmsg dumper + * + * Reset the dumper's iterator so that kmsg_dump_get_line() and + * kmsg_dump_get_buffer() can be called again and used multiple + * times within the same dumper.dump() callback. + */ +void kmsg_dump_rewind(struct kmsg_dumper *dumper) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + dumper->cur_seq = clear_seq; + dumper->cur_idx = clear_idx; + dumper->next_seq = log_next_seq; + dumper->next_idx = log_next_idx; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); } +EXPORT_SYMBOL_GPL(kmsg_dump_rewind); #endif diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ee8d49b9c309..a232bb59d93f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -198,15 +198,14 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) return 0; rcu_read_lock(); tcred = __task_cred(task); - if (cred->user->user_ns == tcred->user->user_ns && - (cred->uid == tcred->euid && - cred->uid == tcred->suid && - cred->uid == tcred->uid && - cred->gid == tcred->egid && - cred->gid == tcred->sgid && - cred->gid == tcred->gid)) + if (uid_eq(cred->uid, tcred->euid) && + uid_eq(cred->uid, tcred->suid) && + uid_eq(cred->uid, tcred->uid) && + gid_eq(cred->gid, tcred->egid) && + gid_eq(cred->gid, tcred->sgid) && + gid_eq(cred->gid, tcred->gid)) goto ok; - if (ptrace_has_cap(tcred->user->user_ns, mode)) + if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; rcu_read_unlock(); return -EPERM; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0da7b88d92d0..4b97bba7396e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -201,6 +201,7 @@ void rcu_note_context_switch(int cpu) { trace_rcu_utilization("Start context switch"); rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(cpu); trace_rcu_utilization("End context switch"); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -1397,6 +1398,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) rdp->qlen_lazy += rsp->qlen_lazy; rdp->qlen += rsp->qlen; rdp->n_cbs_adopted += rsp->qlen; + if (rsp->qlen_lazy != rsp->qlen) + rcu_idle_count_callbacks_posted(); rsp->qlen_lazy = 0; rsp->qlen = 0; @@ -1528,7 +1531,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; - int bl, count, count_lazy; + int bl, count, count_lazy, i; /* If no callbacks are ready, just return.*/ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { @@ -1551,9 +1554,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; *rdp->nxttail[RCU_DONE_TAIL] = NULL; tail = rdp->nxttail[RCU_DONE_TAIL]; - for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) - if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) - rdp->nxttail[count] = &rdp->nxtlist; + for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) + if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) + rdp->nxttail[i] = &rdp->nxtlist; local_irq_restore(flags); /* Invoke callbacks. */ @@ -1581,9 +1584,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) if (list != NULL) { *tail = rdp->nxtlist; rdp->nxtlist = list; - for (count = 0; count < RCU_NEXT_SIZE; count++) - if (&rdp->nxtlist == rdp->nxttail[count]) - rdp->nxttail[count] = tail; + for (i = 0; i < RCU_NEXT_SIZE; i++) + if (&rdp->nxtlist == rdp->nxttail[i]) + rdp->nxttail[i] = tail; else break; } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7f5d138dedf5..19b61ac1079f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -84,6 +84,20 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ +#ifdef CONFIG_RCU_FAST_NO_HZ + int dyntick_drain; /* Prepare-for-idle state variable. */ + unsigned long dyntick_holdoff; + /* No retries for the jiffy of failure. */ + struct timer_list idle_gp_timer; + /* Wake up CPU sleeping with callbacks. */ + unsigned long idle_gp_timer_expires; + /* When to wake up CPU (for repost). */ + bool idle_first_pass; /* First pass of attempt to go idle? */ + unsigned long nonlazy_posted; + /* # times non-lazy CBs posted to CPU. */ + unsigned long nonlazy_posted_snap; + /* idle-period nonlazy_posted snapshot. */ +#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; /* RCU's kthread states for tracing. */ @@ -430,6 +444,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); long rcu_batches_completed(void); +static void rcu_preempt_note_context_switch(int cpu); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 2411000d9869..3e4899459f3d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu) * * Caller must disable preemption. */ -void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(int cpu) { struct task_struct *t = current; unsigned long flags; @@ -164,7 +164,7 @@ void rcu_preempt_note_context_switch(void) (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = __this_cpu_ptr(rcu_preempt_state.rda); + rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -228,7 +228,7 @@ void rcu_preempt_note_context_switch(void) * means that we continue to block the current grace period. */ local_irq_save(flags); - rcu_preempt_qs(smp_processor_id()); + rcu_preempt_qs(cpu); local_irq_restore(flags); } @@ -1002,6 +1002,14 @@ void rcu_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* + * Because preemptible RCU does not exist, we never have to check for + * CPUs being in quiescent states. + */ +static void rcu_preempt_note_context_switch(int cpu) +{ +} + +/* * Because preemptible RCU does not exist, there are never any preempted * RCU readers. */ @@ -1886,8 +1894,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs * any flavor of RCU. */ -int rcu_needs_cpu(int cpu) +int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { + *delta_jiffies = ULONG_MAX; return rcu_cpu_has_callbacks(cpu); } @@ -1962,41 +1971,6 @@ static void rcu_idle_count_callbacks_posted(void) #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ -/* Loop counter for rcu_prepare_for_idle(). */ -static DEFINE_PER_CPU(int, rcu_dyntick_drain); -/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */ -static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); -/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */ -static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); -/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */ -static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); -/* Enable special processing on first attempt to enter dyntick-idle mode. */ -static DEFINE_PER_CPU(bool, rcu_idle_first_pass); -/* Running count of non-lazy callbacks posted, never decremented. */ -static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); -/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */ -static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); - -/* - * Allow the CPU to enter dyntick-idle mode if either: (1) There are no - * callbacks on this CPU, (2) this CPU has not yet attempted to enter - * dyntick-idle mode, or (3) this CPU is in the process of attempting to - * enter dyntick-idle mode. Otherwise, if we have recently tried and failed - * to enter dyntick-idle mode, we refuse to try to enter it. After all, - * it is better to incur scheduling-clock interrupts than to spin - * continuously for the same time duration! - */ -int rcu_needs_cpu(int cpu) -{ - /* Flag a new idle sojourn to the idle-entry state machine. */ - per_cpu(rcu_idle_first_pass, cpu) = 1; - /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) - return 0; - /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ - return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; -} - /* * Does the specified flavor of RCU have non-lazy callbacks pending on * the specified CPU? Both RCU flavor and CPU are specified by the @@ -2040,6 +2014,47 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) } /* + * Allow the CPU to enter dyntick-idle mode if either: (1) There are no + * callbacks on this CPU, (2) this CPU has not yet attempted to enter + * dyntick-idle mode, or (3) this CPU is in the process of attempting to + * enter dyntick-idle mode. Otherwise, if we have recently tried and failed + * to enter dyntick-idle mode, we refuse to try to enter it. After all, + * it is better to incur scheduling-clock interrupts than to spin + * continuously for the same time duration! + * + * The delta_jiffies argument is used to store the time when RCU is + * going to need the CPU again if it still has callbacks. The reason + * for this is that rcu_prepare_for_idle() might need to post a timer, + * but if so, it will do so after tick_nohz_stop_sched_tick() has set + * the wakeup time for this CPU. This means that RCU's timer can be + * delayed until the wakeup time, which defeats the purpose of posting + * a timer. + */ +int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +{ + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + /* Flag a new idle sojourn to the idle-entry state machine. */ + rdtp->idle_first_pass = 1; + /* If no callbacks, RCU doesn't need the CPU. */ + if (!rcu_cpu_has_callbacks(cpu)) { + *delta_jiffies = ULONG_MAX; + return 0; + } + if (rdtp->dyntick_holdoff == jiffies) { + /* RCU recently tried and failed, so don't try again. */ + *delta_jiffies = 1; + return 1; + } + /* Set up for the possibility that RCU will post a timer. */ + if (rcu_cpu_has_nonlazy_callbacks(cpu)) + *delta_jiffies = RCU_IDLE_GP_DELAY; + else + *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; + return 0; +} + +/* * Handler for smp_call_function_single(). The only point of this * handler is to wake the CPU up, so the handler does only tracing. */ @@ -2075,21 +2090,24 @@ static void rcu_idle_gp_timer_func(unsigned long cpu_in) */ static void rcu_prepare_for_idle_init(int cpu) { - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_gp_timer_func, cpu); - per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; - per_cpu(rcu_idle_first_pass, cpu) = 1; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + rdtp->dyntick_holdoff = jiffies - 1; + setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); + rdtp->idle_gp_timer_expires = jiffies - 1; + rdtp->idle_first_pass = 1; } /* * Clean up for exit from idle. Because we are exiting from idle, there - * is no longer any point to rcu_idle_gp_timer, so cancel it. This will + * is no longer any point to ->idle_gp_timer, so cancel it. This will * do nothing if this timer is not active, so just cancel it unconditionally. */ static void rcu_cleanup_after_idle(int cpu) { - del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + del_timer(&rdtp->idle_gp_timer); trace_rcu_prep_idle("Cleanup after idle"); } @@ -2108,42 +2126,41 @@ static void rcu_cleanup_after_idle(int cpu) * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked - * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. + * later. The ->dyntick_drain field controls the sequencing. * * The caller must have disabled interrupts. */ static void rcu_prepare_for_idle(int cpu) { struct timer_list *tp; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); /* * If this is an idle re-entry, for example, due to use of * RCU_NONIDLE() or the new idle-loop tracing API within the idle * loop, then don't take any state-machine actions, unless the * momentary exit from idle queued additional non-lazy callbacks. - * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks + * Instead, repost the ->idle_gp_timer if this CPU has callbacks * pending. */ - if (!per_cpu(rcu_idle_first_pass, cpu) && - (per_cpu(rcu_nonlazy_posted, cpu) == - per_cpu(rcu_nonlazy_posted_snap, cpu))) { + if (!rdtp->idle_first_pass && + (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { if (rcu_cpu_has_callbacks(cpu)) { - tp = &per_cpu(rcu_idle_gp_timer, cpu); - mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); + tp = &rdtp->idle_gp_timer; + mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); } return; } - per_cpu(rcu_idle_first_pass, cpu) = 0; - per_cpu(rcu_nonlazy_posted_snap, cpu) = - per_cpu(rcu_nonlazy_posted, cpu) - 1; + rdtp->idle_first_pass = 0; + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; /* * If there are no callbacks on this CPU, enter dyntick-idle mode. * Also reset state to avoid prejudicing later attempts. */ if (!rcu_cpu_has_callbacks(cpu)) { - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - per_cpu(rcu_dyntick_drain, cpu) = 0; + rdtp->dyntick_holdoff = jiffies - 1; + rdtp->dyntick_drain = 0; trace_rcu_prep_idle("No callbacks"); return; } @@ -2152,36 +2169,37 @@ static void rcu_prepare_for_idle(int cpu) * If in holdoff mode, just return. We will presumably have * refrained from disabling the scheduling-clock tick. */ - if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { + if (rdtp->dyntick_holdoff == jiffies) { trace_rcu_prep_idle("In holdoff"); return; } - /* Check and update the rcu_dyntick_drain sequencing. */ - if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { + /* Check and update the ->dyntick_drain sequencing. */ + if (rdtp->dyntick_drain <= 0) { /* First time through, initialize the counter. */ - per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; - } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && + rdtp->dyntick_drain = RCU_IDLE_FLUSHES; + } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && !rcu_pending(cpu) && !local_softirq_pending()) { /* Can we go dyntick-idle despite still having callbacks? */ - trace_rcu_prep_idle("Dyntick with callbacks"); - per_cpu(rcu_dyntick_drain, cpu) = 0; - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - if (rcu_cpu_has_nonlazy_callbacks(cpu)) - per_cpu(rcu_idle_gp_timer_expires, cpu) = + rdtp->dyntick_drain = 0; + rdtp->dyntick_holdoff = jiffies; + if (rcu_cpu_has_nonlazy_callbacks(cpu)) { + trace_rcu_prep_idle("Dyntick with callbacks"); + rdtp->idle_gp_timer_expires = jiffies + RCU_IDLE_GP_DELAY; - else - per_cpu(rcu_idle_gp_timer_expires, cpu) = + } else { + rdtp->idle_gp_timer_expires = jiffies + RCU_IDLE_LAZY_GP_DELAY; - tp = &per_cpu(rcu_idle_gp_timer, cpu); - mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); - per_cpu(rcu_nonlazy_posted_snap, cpu) = - per_cpu(rcu_nonlazy_posted, cpu); + trace_rcu_prep_idle("Dyntick with lazy callbacks"); + } + tp = &rdtp->idle_gp_timer; + mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; return; /* Nothing more to do immediately. */ - } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { + } else if (--(rdtp->dyntick_drain) <= 0) { /* We have hit the limit, so time to give up. */ - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; + rdtp->dyntick_holdoff = jiffies; trace_rcu_prep_idle("Begin holdoff"); invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ return; @@ -2227,7 +2245,7 @@ static void rcu_prepare_for_idle(int cpu) */ static void rcu_idle_count_callbacks_posted(void) { - __this_cpu_add(rcu_nonlazy_posted, 1); + __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); } #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ @@ -2238,11 +2256,12 @@ static void rcu_idle_count_callbacks_posted(void) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { - struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct timer_list *tltp = &rdtp->idle_gp_timer; sprintf(cp, "drain=%d %c timer=%lu", - per_cpu(rcu_dyntick_drain, cpu), - per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', + rdtp->dyntick_drain, + rdtp->dyntick_holdoff == jiffies ? 'H' : '.', timer_pending(tltp) ? tltp->expires - jiffies : -1); } diff --git a/kernel/relay.c b/kernel/relay.c index ab56a1764d4d..e8cd2027abbd 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1235,6 +1235,7 @@ static ssize_t subbuf_splice_actor(struct file *in, struct splice_pipe_desc spd = { .pages = pages, .nr_pages = 0, + .nr_pages_max = PIPE_DEF_BUFFERS, .partial = partial, .flags = flags, .ops = &relay_pipe_buf_ops, @@ -1302,8 +1303,8 @@ static ssize_t subbuf_splice_actor(struct file *in, ret += padding; out: - splice_shrink_spd(pipe, &spd); - return ret; + splice_shrink_spd(&spd); + return ret; } static ssize_t relay_file_splice_read(struct file *in, diff --git a/kernel/res_counter.c b/kernel/res_counter.c index bebe2b170d49..ad581aa2369a 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -94,13 +94,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) counter->usage -= val; } -void res_counter_uncharge(struct res_counter *counter, unsigned long val) +void res_counter_uncharge_until(struct res_counter *counter, + struct res_counter *top, + unsigned long val) { unsigned long flags; struct res_counter *c; local_irq_save(flags); - for (c = counter; c != NULL; c = c->parent) { + for (c = counter; c != top; c = c->parent) { spin_lock(&c->lock); res_counter_uncharge_locked(c, val); spin_unlock(&c->lock); @@ -108,6 +110,10 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val) local_irq_restore(flags); } +void res_counter_uncharge(struct res_counter *counter, unsigned long val) +{ + res_counter_uncharge_until(counter, NULL, val); +} static inline unsigned long long * res_counter_member(struct res_counter *counter, int member) diff --git a/kernel/resource.c b/kernel/resource.c index 7e8ea66a8c01..e1d2b8ee76d5 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -515,8 +515,8 @@ out: * @root: root resource descriptor * @new: resource descriptor desired by caller * @size: requested resource region size - * @min: minimum size to allocate - * @max: maximum size to allocate + * @min: minimum boundary to allocate + * @max: maximum boundary to allocate * @align: alignment requested, in bytes * @alignf: alignment function, optional, called if not NULL * @alignf_data: arbitrary data to pass to the @alignf function diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d833cc94eedc..468bdd44c1ba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features = #define SCHED_FEAT(name, enabled) \ #name , -static __read_mostly char *sched_feat_names[] = { +static const char * const sched_feat_names[] = { #include "features.h" - NULL }; #undef SCHED_FEAT @@ -1912,7 +1911,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { sched_info_switch(prev, next); - perf_event_task_sched(prev, next); + perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); @@ -1955,6 +1954,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_disable(); +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ + perf_event_task_sched_in(prev, current); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); @@ -2075,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev, #endif /* Here we just switch the register state and the stack. */ - rcu_switch_from(prev); switch_to(prev, next, prev); barrier(); @@ -2155,11 +2160,73 @@ unsigned long this_cpu_load(void) } +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + * nr_active = 0; + * for_each_possible_cpu(cpu) + * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + * - for_each_possible_cpu() is prohibitively expensive on machines with + * serious number of cpus, therefore we need to take a distributed approach + * to calculating nr_active. + * + * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + * So assuming nr_active := 0 when we start out -- true per definition, we + * can simply take per-cpu deltas and fold those into a global accumulate + * to obtain the same result. See calc_load_fold_active(). + * + * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * across the machine, we assume 10 ticks is sufficient time for every + * cpu to have completed this task. + * + * This places an upper-bound on the IRQ-off latency of the machine. Then + * again, being late doesn't loose the delta, just wrecks the sample. + * + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + * this would add another cross-cpu cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever cpu the task ran + * when it went into uninterruptible state and decrement on whatever cpu + * did the wakeup. This means that only the sum of nr_uninterruptible over + * all cpus yields the correct result. + * + * This covers the NO_HZ=n code, for extra head-aches, see the comment below. + */ + /* Variables and functions for calc_load */ static atomic_long_t calc_load_tasks; static unsigned long calc_load_update; unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} static long calc_load_fold_active(struct rq *this_rq) { @@ -2176,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq) return delta; } +/* + * a1 = a0 * e + a * (1 - e) + */ static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { @@ -2187,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) #ifdef CONFIG_NO_HZ /* - * For NO_HZ we delay the active fold to the next LOAD_FREQ update. + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + * - When we go NO_HZ idle during the window, we can negate our sample + * contribution, causing under-accounting. + * + * We avoid this by keeping two idle-delta counters and flipping them + * when the window starts, thus separating old and new NO_HZ load. + * + * The only trick is the slight shift in index flip for read vs write. + * + * 0s 5s 10s 15s + * +10 +10 +10 +10 + * |-|-----------|-|-----------|-|-----------|-| + * r:0 0 1 1 0 0 1 1 0 + * w:0 1 1 0 0 1 1 0 0 + * + * This ensures we'll fold the old idle contribution in this window while + * accumlating the new one. + * + * - When we wake up from NO_HZ idle during the window, we push up our + * contribution, since we effectively move our sample point to a known + * busy state. + * + * This is solved by pushing the window forward, and thus skipping the + * sample, for this cpu (effectively using the idle-delta for this cpu which + * was in effect at the time the window opened). This also solves the issue + * of having to deal with a cpu having been in NOHZ idle for multiple + * LOAD_FREQ intervals. * * When making the ILB scale, we should try to pull this in as well. */ -static atomic_long_t calc_load_tasks_idle; +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; -void calc_load_account_idle(struct rq *this_rq) +static inline int calc_load_write_idx(void) { + int idx = calc_load_idx; + + /* + * See calc_global_nohz(), if we observe the new index, we also + * need to observe the new update time. + */ + smp_rmb(); + + /* + * If the folding window started, make sure we start writing in the + * next idle-delta. + */ + if (!time_before(jiffies, calc_load_update)) + idx++; + + return idx & 1; +} + +static inline int calc_load_read_idx(void) +{ + return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ + struct rq *this_rq = this_rq(); long delta; + /* + * We're going into NOHZ mode, if there's any pending delta, fold it + * into the pending idle delta. + */ delta = calc_load_fold_active(this_rq); - if (delta) - atomic_long_add(delta, &calc_load_tasks_idle); + if (delta) { + int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); + } } -static long calc_load_fold_idle(void) +void calc_load_exit_idle(void) { - long delta = 0; + struct rq *this_rq = this_rq(); /* - * Its got a race, we don't care... + * If we're still before the sample window, we're done. */ - if (atomic_long_read(&calc_load_tasks_idle)) - delta = atomic_long_xchg(&calc_load_tasks_idle, 0); + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + /* + * We woke inside or after the sample window, this means we're already + * accounted through the nohz accounting, so skip the entire deal and + * sync up for the next window. + */ + this_rq->calc_load_update = calc_load_update; + if (time_before(jiffies, this_rq->calc_load_update + 10)) + this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ + int idx = calc_load_read_idx(); + long delta = 0; + + if (atomic_long_read(&calc_load_idle[idx])) + delta = atomic_long_xchg(&calc_load_idle[idx], 0); return delta; } @@ -2296,66 +2454,39 @@ static void calc_global_nohz(void) { long delta, active, n; - /* - * If we crossed a calc_load_update boundary, make sure to fold - * any pending idle changes, the respective CPUs might have - * missed the tick driven calc_load_account_active() update - * due to NO_HZ. - */ - delta = calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - /* - * It could be the one fold was all it took, we done! - */ - if (time_before(jiffies, calc_load_update + 10)) - return; - - /* - * Catch-up, fold however many we are behind still - */ - delta = jiffies - calc_load_update - 10; - n = 1 + (delta / LOAD_FREQ); + if (!time_before(jiffies, calc_load_update + 10)) { + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - calc_load_update += n * LOAD_FREQ; -} -#else -void calc_load_account_idle(struct rq *this_rq) -{ -} + calc_load_update += n * LOAD_FREQ; + } -static inline long calc_load_fold_idle(void) -{ - return 0; + /* + * Flip the idle index... + * + * Make sure we first write the new time then flip the index, so that + * calc_load_write_idx() will see the new time when it reads the new + * index, this avoids a double flip messing things up. + */ + smp_wmb(); + calc_load_idx++; } +#else /* !CONFIG_NO_HZ */ -static void calc_global_nohz(void) -{ -} -#endif +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} +#endif /* CONFIG_NO_HZ */ /* * calc_load - update the avenrun load estimates 10 ticks after the @@ -2363,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) */ void calc_global_load(unsigned long ticks) { - long active; + long active, delta; if (time_before(jiffies, calc_load_update + 10)) return; + /* + * Fold the 'old' idle-delta to include all NO_HZ cpus. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + active = atomic_long_read(&calc_load_tasks); active = active > 0 ? active * FIXED_1 : 0; @@ -2378,12 +2516,7 @@ void calc_global_load(unsigned long ticks) calc_load_update += LOAD_FREQ; /* - * Account one period with whatever state we found before - * folding in the nohz state and ageing the entire idle period. - * - * This avoids loosing a sample when we go idle between - * calc_load_account_active() (10 ticks ago) and now and thus - * under-accounting. + * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. */ calc_global_nohz(); } @@ -2400,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq) return; delta = calc_load_fold_active(this_rq); - delta += calc_load_fold_idle(); if (delta) atomic_long_add(delta, &calc_load_tasks); @@ -2408,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq) } /* + * End of global load-average stuff + */ + +/* * The exact cpuload at various idx values, calculated at every tick would be * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load * @@ -2510,25 +2646,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } +#ifdef CONFIG_NO_HZ +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + /* * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ void update_idle_cpu_load(struct rq *this_rq) { - unsigned long curr_jiffies = jiffies; + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); unsigned long load = this_rq->load.weight; unsigned long pending_updates; /* - * Bloody broken means of dealing with nohz, but better than nothing.. - * jiffies is updated by one cpu, another cpu can drift wrt the jiffy - * update and see 0 difference the one time and 2 the next, even though - * we ticked at roughtly the same rate. - * - * Hence we only use this from nohz_idle_balance() and skip this - * nonsense when called from the scheduler_tick() since that's - * guaranteed a stable rate. + * bail if there's load or we're actually up-to-date. */ if (load || curr_jiffies == this_rq->last_load_update_tick) return; @@ -2540,12 +2683,38 @@ void update_idle_cpu_load(struct rq *this_rq) } /* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + +/* * Called from scheduler_tick() */ static void update_cpu_load_active(struct rq *this_rq) { /* - * See the mess in update_idle_cpu_load(). + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; __update_cpu_load(this_rq, this_rq->load.weight, 1); @@ -4063,11 +4232,8 @@ static bool check_same_owner(struct task_struct *p) rcu_read_lock(); pcred = __task_cred(p); - if (cred->user->user_ns == pcred->user->user_ns) - match = (cred->euid == pcred->euid || - cred->euid == pcred->uid); - else - match = false; + match = (uid_eq(cred->euid, pcred->euid) || + uid_eq(cred->euid, pcred->uid)); rcu_read_unlock(); return match; } @@ -4978,7 +5144,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) p->sched_class->set_cpus_allowed(p, new_mask); cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = cpumask_weight(new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); } /* @@ -5520,15 +5686,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ #ifdef CONFIG_SCHED_DEBUG -static __read_mostly int sched_domain_debug_enabled; +static __read_mostly int sched_debug_enabled; -static int __init sched_domain_debug_setup(char *str) +static int __init sched_debug_setup(char *str) { - sched_domain_debug_enabled = 1; + sched_debug_enabled = 1; return 0; } -early_param("sched_debug", sched_domain_debug_setup); +early_param("sched_debug", sched_debug_setup); + +static inline bool sched_debug(void) +{ + return sched_debug_enabled; +} static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) @@ -5568,7 +5739,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!group->sgp->power) { + /* + * Even though we initialize ->power to something semi-sane, + * we leave power_orig unset. This allows us to detect if + * domain iteration is still funny without causing /0 traps. + */ + if (!group->sgp->power_orig) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -5616,7 +5792,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; - if (!sched_domain_debug_enabled) + if (!sched_debug_enabled) return; if (!sd) { @@ -5637,6 +5813,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) +static inline bool sched_debug(void) +{ + return false; +} #endif /* CONFIG_SCHED_DEBUG */ static int sd_degenerate(struct sched_domain *sd) @@ -5958,6 +6138,44 @@ struct sched_domain_topology_level { struct sd_data data; }; +/* + * Build an iteration mask that can exclude certain CPUs from the upwards + * domain traversal. + * + * Asymmetric node setups can result in situations where the domain tree is of + * unequal depth, make sure to skip domains that already cover the entire + * range. + * + * In that case build_sched_domains() will have terminated the iteration early + * and our sibling sd spans will be empty. Domains should always include the + * cpu they're built on, so check that. + * + */ +static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) +{ + const struct cpumask *span = sched_domain_span(sd); + struct sd_data *sdd = sd->private; + struct sched_domain *sibling; + int i; + + for_each_cpu(i, span) { + sibling = *per_cpu_ptr(sdd->sd, i); + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + continue; + + cpumask_set_cpu(i, sched_group_mask(sg)); + } +} + +/* + * Return the canonical balance cpu for this group, this is the first cpu + * of this group that's also in the iteration mask. + */ +int group_balance_cpu(struct sched_group *sg) +{ + return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); +} + static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { @@ -5976,6 +6194,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (cpumask_test_cpu(i, covered)) continue; + child = *per_cpu_ptr(sdd->sd, i); + + /* See the comment near build_group_mask(). */ + if (!cpumask_test_cpu(i, sched_domain_span(child))) + continue; + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(cpu)); @@ -5983,8 +6207,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) goto fail; sg_span = sched_group_cpus(sg); - - child = *per_cpu_ptr(sdd->sd, i); if (child->child) { child = child->child; cpumask_copy(sg_span, sched_domain_span(child)); @@ -5993,10 +6215,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); - sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); - atomic_inc(&sg->sgp->ref); + sg->sgp = *per_cpu_ptr(sdd->sgp, i); + if (atomic_inc_return(&sg->sgp->ref) == 1) + build_group_mask(sd, sg); + + /* + * Initialize sgp->power such that even if we mess up the + * domains and no possible iteration will get us here, we won't + * die on a /0 trap. + */ + sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); - if (cpumask_test_cpu(cpu, sg_span)) + /* + * Make sure the first group of this domain contains the + * canonical balance cpu. Otherwise the sched_domain iteration + * breaks. See update_sg_lb_stats(). + */ + if ((!groups && cpumask_test_cpu(cpu, sg_span)) || + group_balance_cpu(sg) == cpu) groups = sg; if (!first) @@ -6070,6 +6306,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) cpumask_clear(sched_group_cpus(sg)); sg->sgp->power = 0; + cpumask_setall(sched_group_mask(sg)); for_each_cpu(j, span) { if (get_group(j, sdd, NULL) != group) @@ -6111,7 +6348,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) sg = sg->next; } while (sg != sd->groups); - if (cpu != group_first_cpu(sg)) + if (cpu != group_balance_cpu(sg)) return; update_group_power(sd, cpu); @@ -6161,11 +6398,8 @@ int sched_domain_level_max; static int __init setup_relax_domain_level(char *str) { - unsigned long val; - - val = simple_strtoul(str, NULL, 0); - if (val < sched_domain_level_max) - default_relax_domain_level = val; + if (kstrtoint(str, 0, &default_relax_domain_level)) + pr_warn("Unable to set relax_domain_level\n"); return 1; } @@ -6275,14 +6509,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol #ifdef CONFIG_NUMA static int sched_domains_numa_levels; -static int sched_domains_numa_scale; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; static inline int sd_local_flags(int level) { - if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) + if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) return 0; return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; @@ -6340,6 +6573,42 @@ static const struct cpumask *sd_numa_mask(int cpu) return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; } +static void sched_numa_warn(const char *str) +{ + static int done = false; + int i,j; + + if (done) + return; + + done = true; + + printk(KERN_WARNING "ERROR: %s\n\n", str); + + for (i = 0; i < nr_node_ids; i++) { + printk(KERN_WARNING " "); + for (j = 0; j < nr_node_ids; j++) + printk(KERN_CONT "%02d ", node_distance(i,j)); + printk(KERN_CONT "\n"); + } + printk(KERN_WARNING "\n"); +} + +static bool find_numa_distance(int distance) +{ + int i; + + if (distance == node_distance(0, 0)) + return true; + + for (i = 0; i < sched_domains_numa_levels; i++) { + if (sched_domains_numa_distance[i] == distance) + return true; + } + + return false; +} + static void sched_init_numa(void) { int next_distance, curr_distance = node_distance(0, 0); @@ -6347,7 +6616,6 @@ static void sched_init_numa(void) int level = 0; int i, j, k; - sched_domains_numa_scale = curr_distance; sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); if (!sched_domains_numa_distance) return; @@ -6358,23 +6626,41 @@ static void sched_init_numa(void) * * Assumes node_distance(0,j) includes all distances in * node_distance(i,j) in order to avoid cubic time. - * - * XXX: could be optimized to O(n log n) by using sort() */ next_distance = curr_distance; for (i = 0; i < nr_node_ids; i++) { for (j = 0; j < nr_node_ids; j++) { - int distance = node_distance(0, j); - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; + for (k = 0; k < nr_node_ids; k++) { + int distance = node_distance(i, k); + + if (distance > curr_distance && + (distance < next_distance || + next_distance == curr_distance)) + next_distance = distance; + + /* + * While not a strong assumption it would be nice to know + * about cases where if node A is connected to B, B is not + * equally connected to A. + */ + if (sched_debug() && node_distance(k, i) != distance) + sched_numa_warn("Node-distance not symmetric"); + + if (sched_debug() && i && !find_numa_distance(distance)) + sched_numa_warn("Node-0 not representative"); + } + if (next_distance != curr_distance) { + sched_domains_numa_distance[level++] = next_distance; + sched_domains_numa_levels = level; + curr_distance = next_distance; + } else break; } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; + + /* + * In case of sched_debug() we verify the above assumption. + */ + if (!sched_debug()) + break; } /* * 'level' contains the number of unique distances, excluding the @@ -6399,7 +6685,7 @@ static void sched_init_numa(void) return; for (j = 0; j < nr_node_ids; j++) { - struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); + struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); if (!mask) return; @@ -6486,7 +6772,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sg, j) = sg; - sgp = kzalloc_node(sizeof(struct sched_group_power), + sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sgp) return -ENOMEM; @@ -6539,7 +6825,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, if (!sd) return child; - set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; @@ -6547,6 +6832,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, child->parent = sd; } sd->child = child; + set_domain_attribute(sd, attr); return sd; } @@ -6687,7 +6973,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) if (!doms_cur) doms_cur = &fallback_doms; cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); - dattr_cur = NULL; err = build_sched_domains(doms_cur[0], NULL); register_sched_domain_sysctl(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 940e6d17cf96..c099cc6eebe3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) int want_sd = 1; int sync = wake_flags & WF_SYNC; - if (p->rt.nr_cpus_allowed == 1) + if (p->nr_cpus_allowed == 1) return prev_cpu; if (sd_flag & SD_BALANCE_WAKE) { @@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); - u64 total, available; + u64 total, available, age_stamp, avg; - total = sched_avg_period() + (rq->clock - rq->age_stamp); + /* + * Since we're reading these variables without serialization make sure + * we read them once before doing sanity checks on them. + */ + age_stamp = ACCESS_ONCE(rq->age_stamp); + avg = ACCESS_ONCE(rq->rt_avg); + + total = sched_avg_period() + (rq->clock - age_stamp); - if (unlikely(total < rq->rt_avg)) { + if (unlikely(total < avg)) { /* Ensures that power won't end up being negative */ available = 0; } else { - available = total - rq->rt_avg; + available = total - avg; } if (unlikely((s64)total < SCHED_POWER_SCALE)) @@ -3574,13 +3581,28 @@ void update_group_power(struct sched_domain *sd, int cpu) power = 0; - group = child->groups; - do { - power += group->sgp->power; - group = group->next; - } while (group != child->groups); + if (child->flags & SD_OVERLAP) { + /* + * SD_OVERLAP domains cannot assume that child groups + * span the current group. + */ - sdg->sgp->power = power; + for_each_cpu(cpu, sched_group_cpus(sdg)) + power += power_of(cpu); + } else { + /* + * !SD_OVERLAP domains can assume that child groups + * span the current group. + */ + + group = child->groups; + do { + power += group->sgp->power; + group = group->next; + } while (group != child->groups); + } + + sdg->sgp->power_orig = sdg->sgp->power = power; } /* @@ -3610,7 +3632,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. - * @sd: The sched_domain whose statistics are to be updated. + * @env: The load balancing environment. * @group: sched_group whose statistics are to be updated. * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. @@ -3630,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, int i; if (local_group) - balance_cpu = group_first_cpu(group); + balance_cpu = group_balance_cpu(group); /* Tally up the load of all CPUs in the group */ max_cpu_load = 0; @@ -3645,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Bias balancing toward cpus of our domain */ if (local_group) { - if (idle_cpu(i) && !first_idle_cpu) { + if (idle_cpu(i) && !first_idle_cpu && + cpumask_test_cpu(i, sched_group_mask(group))) { first_idle_cpu = 1; balance_cpu = i; } @@ -3719,11 +3742,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, /** * update_sd_pick_busiest - return 1 on busiest group - * @sd: sched_domain whose statistics are to be checked + * @env: The load balancing environment. * @sds: sched_domain statistics * @sg: sched_group candidate to be checked for being the busiest * @sgs: sched_group statistics - * @this_cpu: the current cpu * * Determine if @sg is a busier group than the previously selected * busiest group. @@ -3761,9 +3783,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. - * @sd: sched_domain whose statistics are to be updated. - * @this_cpu: Cpu for which load balance is currently performed. - * @idle: Idle status of this_cpu + * @env: The load balancing environment. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. @@ -3852,10 +3872,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, * Returns 1 when packing is required and a task should be moved to * this CPU. The amount of the imbalance is returned in *imbalance. * - * @sd: The sched_domain whose packing is to be checked. + * @env: The load balancing environment. * @sds: Statistics of the sched_domain which is to be packed - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: returns amount of imbalanced due to packing. */ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) { @@ -3881,9 +3899,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) * fix_small_imbalance - Calculate the minor imbalance that exists * amongst the groups of a sched_domain, during * load balancing. + * @env: The load balancing environment. * @sds: Statistics of the sched_domain whose imbalance is to be calculated. - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: Variable to store the imbalance. */ static inline void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) @@ -4026,11 +4043,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * Also calculates the amount of weighted load which should be moved * to restore balance. * - * @sd: The sched_domain whose busiest group is to be returned. - * @this_cpu: The cpu for which load balancing is currently being performed. - * @imbalance: Variable which stores amount of weighted load which should - * be moved to restore balance/put a group to idle. - * @idle: The idle status of this_cpu. + * @env: The load balancing environment. * @cpus: The set of CPUs under consideration for load-balancing. * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b44d604b35d1..b6baf370cae9 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - calc_load_account_idle(rq); return rq->idle; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c5565c3c515f..573e1ca01102 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq) static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + struct task_struct *p; + if (!rt_entity_is_task(rt_se)) return; + p = rt_task_of(rt_se); rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total++; - if (rt_se->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory++; update_rt_migration(rt_rq); @@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + struct task_struct *p; + if (!rt_entity_is_task(rt_se)) return; + p = rt_task_of(rt_se); rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total--; - if (rt_se->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory--; update_rt_migration(rt_rq); @@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); - if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); inc_nr_running(rq); @@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) cpu = task_cpu(p); - if (p->rt.nr_cpus_allowed == 1) + if (p->nr_cpus_allowed == 1) goto out; /* For anything but wake ups, just return the task_cpu */ @@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) * will have to sort it out. */ if (curr && unlikely(rt_task(curr)) && - (curr->rt.nr_cpus_allowed < 2 || + (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio) && - (p->rt.nr_cpus_allowed > 1)) { + (p->nr_cpus_allowed > 1)) { int target = find_lowest_rq(p); if (target != -1) @@ -1276,10 +1282,10 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - if (rq->curr->rt.nr_cpus_allowed == 1) + if (rq->curr->nr_cpus_allowed == 1) return; - if (p->rt.nr_cpus_allowed != 1 + if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; @@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && - (p->rt.nr_cpus_allowed > 1)) + (p->nr_cpus_allowed > 1)) return 1; return 0; } @@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task) if (unlikely(!lowest_mask)) return -1; - if (task->rt.nr_cpus_allowed == 1) + if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) @@ -1556,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) task_running(rq, task) || !task->on_rq)) { - raw_spin_unlock(&lowest_rq->lock); + double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; break; } @@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->rt.nr_cpus_allowed <= 1); + BUG_ON(p->nr_cpus_allowed <= 1); BUG_ON(!p->on_rq); BUG_ON(!rt_task(p)); @@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && has_pushable_tasks(rq) && - p->rt.nr_cpus_allowed > 1 && + p->nr_cpus_allowed > 1 && rt_task(rq->curr) && - (rq->curr->rt.nr_cpus_allowed < 2 || + (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); } @@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, * Only update if the process changes its state from whether it * can migrate or not. */ - if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) + if ((p->nr_cpus_allowed > 1) == (weight > 1)) return; rq = task_rq(p); @@ -1979,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p) static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) { + struct sched_rt_entity *rt_se = &p->rt; + update_curr_rt(rq); watchdog(rq, p); @@ -1996,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) p->rt.time_slice = RR_TIMESLICE; /* - * Requeue to the end of queue if we are not the only element - * on the queue: + * Requeue to the end of queue if we (and all of our ancestors) are the + * only element on the queue */ - if (p->rt.run_list.prev != p->rt.run_list.next) { - requeue_task_rt(rq, p, 0); - set_tsk_need_resched(p); + for_each_sched_rt_entity(rt_se) { + if (rt_se->run_list.prev != rt_se->run_list.next) { + requeue_task_rt(rq, p, 0); + set_tsk_need_resched(p); + return; + } } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ba9dccfd24ce..55844f24435a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_id); +extern int group_balance_cpu(struct sched_group *sg); + #endif /* CONFIG_SMP */ #include "stats.h" @@ -940,8 +942,6 @@ static inline u64 sched_avg_period(void) return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; } -void calc_load_account_idle(struct rq *this_rq); - #ifdef CONFIG_SCHED_HRTICK /* diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 60636a4e25c3..4567fc020fe3 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -118,7 +118,7 @@ EXPORT_SYMBOL(down_killable); * down_trylock - try to acquire the semaphore, without waiting * @sem: the semaphore to be acquired * - * Try to acquire the semaphore atomically. Returns 0 if the mutex has + * Try to acquire the semaphore atomically. Returns 0 if the semaphore has * been acquired successfully or 1 if it it cannot be acquired. * * NOTE: This return value is inverted from both spin_trylock and diff --git a/kernel/signal.c b/kernel/signal.c index 1a006b5d9d9d..677102789cf2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -29,6 +29,7 @@ #include <linux/pid_namespace.h> #include <linux/nsproxy.h> #include <linux/user_namespace.h> +#include <linux/uprobes.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -767,14 +768,13 @@ static int kill_ok_by_cred(struct task_struct *t) const struct cred *cred = current_cred(); const struct cred *tcred = __task_cred(t); - if (cred->user->user_ns == tcred->user->user_ns && - (cred->euid == tcred->suid || - cred->euid == tcred->uid || - cred->uid == tcred->suid || - cred->uid == tcred->uid)) + if (uid_eq(cred->euid, tcred->suid) || + uid_eq(cred->euid, tcred->uid) || + uid_eq(cred->uid, tcred->suid) || + uid_eq(cred->uid, tcred->uid)) return 1; - if (ns_capable(tcred->user->user_ns, CAP_KILL)) + if (ns_capable(tcred->user_ns, CAP_KILL)) return 1; return 0; @@ -1020,15 +1020,6 @@ static inline int legacy_queue(struct sigpending *signals, int sig) return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } -/* - * map the uid in struct cred into user namespace *ns - */ -static inline uid_t map_cred_ns(const struct cred *cred, - struct user_namespace *ns) -{ - return user_ns_map_uid(ns, cred, cred->uid); -} - #ifdef CONFIG_USER_NS static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) { @@ -1038,8 +1029,10 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str if (SI_FROMKERNEL(info)) return; - info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), - current_cred(), info->si_uid); + rcu_read_lock(); + info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns), + make_kuid(current_user_ns(), info->si_uid)); + rcu_read_unlock(); } #else static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) @@ -1106,7 +1099,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_code = SI_USER; q->info.si_pid = task_tgid_nr_ns(current, task_active_pid_ns(t)); - q->info.si_uid = current_uid(); + q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); break; case (unsigned long) SEND_SIG_PRIV: q->info.si_signo = sig; @@ -1387,10 +1380,8 @@ static int kill_as_cred_perm(const struct cred *cred, struct task_struct *target) { const struct cred *pcred = __task_cred(target); - if (cred->user_ns != pcred->user_ns) - return 0; - if (cred->euid != pcred->suid && cred->euid != pcred->uid && - cred->uid != pcred->suid && cred->uid != pcred->uid) + if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) && + !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid)) return 0; return 1; } @@ -1665,21 +1656,20 @@ bool do_notify_parent(struct task_struct *tsk, int sig) info.si_signo = sig; info.si_errno = 0; /* - * we are under tasklist_lock here so our parent is tied to - * us and cannot exit and release its namespace. + * We are under tasklist_lock here so our parent is tied to + * us and cannot change. * - * the only it can is to switch its nsproxy with sys_unshare, - * bu uncharing pid namespaces is not allowed, so we'll always - * see relevant namespace + * task_active_pid_ns will always return the same pid namespace + * until a task passes through release_task. * * write_lock() currently calls preempt_disable() which is the * same as rcu_read_lock(), but according to Oleg, this is not * correct to rely on this */ rcu_read_lock(); - info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); - info.si_uid = map_cred_ns(__task_cred(tsk), - task_cred_xxx(tsk->parent, user_ns)); + info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent)); + info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns), + task_uid(tsk)); rcu_read_unlock(); info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); @@ -1762,8 +1752,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); - info.si_uid = map_cred_ns(__task_cred(tsk), - task_cred_xxx(parent, user_ns)); + info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); info.si_utime = cputime_to_clock_t(tsk->utime); @@ -1973,7 +1962,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why) info.si_signo = signr; info.si_code = exit_code; info.si_pid = task_pid_vnr(current); - info.si_uid = current_uid(); + info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); /* Let the debugger run. */ ptrace_stop(exit_code, why, 1, &info); @@ -2181,8 +2170,8 @@ static int ptrace_signal(int signr, siginfo_t *info, info->si_code = SI_USER; rcu_read_lock(); info->si_pid = task_pid_vnr(current->parent); - info->si_uid = map_cred_ns(__task_cred(current->parent), - current_user_ns()); + info->si_uid = from_kuid_munged(current_user_ns(), + task_uid(current->parent)); rcu_read_unlock(); } @@ -2202,6 +2191,9 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct signal_struct *signal = current->signal; int signr; + if (unlikely(uprobe_deny_signal())) + return 0; + relock: /* * We'll jump back here after any time we were stopped in TASK_STOPPED. @@ -2376,24 +2368,34 @@ relock: } /** - * block_sigmask - add @ka's signal mask to current->blocked - * @ka: action for @signr - * @signr: signal that has been successfully delivered + * signal_delivered - + * @sig: number of signal being delivered + * @info: siginfo_t of signal being delivered + * @ka: sigaction setting that chose the handler + * @regs: user register state + * @stepping: nonzero if debugger single-step or block-step in use * * This function should be called when a signal has succesfully been - * delivered. It adds the mask of signals for @ka to current->blocked - * so that they are blocked during the execution of the signal - * handler. In addition, @signr will be blocked unless %SA_NODEFER is - * set in @ka->sa.sa_flags. + * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask + * is always blocked, and the signal itself is blocked unless %SA_NODEFER + * is set in @ka->sa.sa_flags. Tracing is notified. */ -void block_sigmask(struct k_sigaction *ka, int signr) +void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, + struct pt_regs *regs, int stepping) { sigset_t blocked; + /* A signal was successfully delivered, and the + saved sigmask was stored on the signal frame, + and will be restored by sigreturn. So we can + simply clear the restore sigmask flag. */ + clear_restore_sigmask(); + sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, signr); + sigaddset(&blocked, sig); set_current_blocked(&blocked); + tracehook_signal_handler(sig, info, ka, regs, stepping); } /* @@ -2526,7 +2528,16 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) * It is wrong to change ->blocked directly, this helper should be used * to ensure the process can't miss a shared signal we are going to block. */ -void set_current_blocked(const sigset_t *newset) +void set_current_blocked(sigset_t *newset) +{ + struct task_struct *tsk = current; + sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); + spin_lock_irq(&tsk->sighand->siglock); + __set_task_blocked(tsk, newset); + spin_unlock_irq(&tsk->sighand->siglock); +} + +void __set_current_blocked(const sigset_t *newset) { struct task_struct *tsk = current; @@ -2566,7 +2577,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) return -EINVAL; } - set_current_blocked(&newset); + __set_current_blocked(&newset); return 0; } @@ -2835,7 +2846,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) info.si_errno = 0; info.si_code = SI_USER; info.si_pid = task_tgid_vnr(current); - info.si_uid = current_uid(); + info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); return kill_something_info(sig, &info, pid); } @@ -2878,7 +2889,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) info.si_errno = 0; info.si_code = SI_TKILL; info.si_pid = task_tgid_vnr(current); - info.si_uid = current_uid(); + info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); return do_send_specific(tgid, pid, sig, &info); } @@ -3140,7 +3151,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, return -EINVAL; } - set_current_blocked(&new_blocked); + __set_current_blocked(&new_blocked); } if (oset) { @@ -3204,7 +3215,6 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) int old = current->blocked.sig[0]; sigset_t newset; - siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP))); set_current_blocked(&newset); return old; @@ -3243,6 +3253,17 @@ SYSCALL_DEFINE0(pause) #endif +int sigsuspend(sigset_t *set) +{ + current->saved_sigmask = current->blocked; + set_current_blocked(set); + + current->state = TASK_INTERRUPTIBLE; + schedule(); + set_restore_sigmask(); + return -ERESTARTNOHAND; +} + #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND /** * sys_rt_sigsuspend - replace the signal mask for a value with the @@ -3260,15 +3281,7 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) if (copy_from_user(&newset, unewset, sizeof(newset))) return -EFAULT; - sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); - - current->saved_sigmask = current->blocked; - set_current_blocked(&newset); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_restore_sigmask(); - return -ERESTARTNOHAND; + return sigsuspend(&newset); } #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ diff --git a/kernel/smpboot.c b/kernel/smpboot.c index e1a797e028a3..98f60c5caa1b 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -31,6 +31,12 @@ void __init idle_thread_set_boot_cpu(void) per_cpu(idle_threads, smp_processor_id()) = current; } +/** + * idle_init - Initialize the idle thread for a cpu + * @cpu: The cpu for which the idle thread should be initialized + * + * Creates the thread if it does not exist. + */ static inline void idle_init(unsigned int cpu) { struct task_struct *tsk = per_cpu(idle_threads, cpu); @@ -45,17 +51,16 @@ static inline void idle_init(unsigned int cpu) } /** - * idle_thread_init - Initialize the idle thread for a cpu - * @cpu: The cpu for which the idle thread should be initialized - * - * Creates the thread if it does not exist. + * idle_threads_init - Initialize idle threads for all cpus */ void __init idle_threads_init(void) { - unsigned int cpu; + unsigned int cpu, boot_cpu; + + boot_cpu = smp_processor_id(); for_each_possible_cpu(cpu) { - if (cpu != smp_processor_id()) + if (cpu != boot_cpu) idle_init(cpu); } } diff --git a/kernel/sys.c b/kernel/sys.c index ba0ae8eea6fb..2d39a84cd857 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -36,6 +36,8 @@ #include <linux/personality.h> #include <linux/ptrace.h> #include <linux/fs_struct.h> +#include <linux/file.h> +#include <linux/mount.h> #include <linux/gfp.h> #include <linux/syscore_ops.h> #include <linux/version.h> @@ -93,10 +95,8 @@ int overflowuid = DEFAULT_OVERFLOWUID; int overflowgid = DEFAULT_OVERFLOWGID; -#ifdef CONFIG_UID16 EXPORT_SYMBOL(overflowuid); EXPORT_SYMBOL(overflowgid); -#endif /* * the same as above, but for filesystems which can only store a 16-bit @@ -133,11 +133,10 @@ static bool set_one_prio_perm(struct task_struct *p) { const struct cred *cred = current_cred(), *pcred = __task_cred(p); - if (pcred->user->user_ns == cred->user->user_ns && - (pcred->uid == cred->euid || - pcred->euid == cred->euid)) + if (uid_eq(pcred->uid, cred->euid) || + uid_eq(pcred->euid, cred->euid)) return true; - if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) + if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) return true; return false; } @@ -177,6 +176,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) const struct cred *cred = current_cred(); int error = -EINVAL; struct pid *pgrp; + kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) goto out; @@ -209,18 +209,19 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = (struct user_struct *) cred->user; + uid = make_kuid(cred->user_ns, who); + user = cred->user; if (!who) - who = cred->uid; - else if ((who != cred->uid) && - !(user = find_user(who))) + uid = cred->uid; + else if (!uid_eq(uid, cred->uid) && + !(user = find_user(uid))) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) { - if (__task_cred(p)->uid == who) + if (uid_eq(task_uid(p), uid)) error = set_one_prio(p, niceval, error); } while_each_thread(g, p); - if (who != cred->uid) + if (!uid_eq(uid, cred->uid)) free_uid(user); /* For find_user() */ break; } @@ -244,6 +245,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) const struct cred *cred = current_cred(); long niceval, retval = -ESRCH; struct pid *pgrp; + kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) return -EINVAL; @@ -274,21 +276,22 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = (struct user_struct *) cred->user; + uid = make_kuid(cred->user_ns, who); + user = cred->user; if (!who) - who = cred->uid; - else if ((who != cred->uid) && - !(user = find_user(who))) + uid = cred->uid; + else if (!uid_eq(uid, cred->uid) && + !(user = find_user(uid))) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) { - if (__task_cred(p)->uid == who) { + if (uid_eq(task_uid(p), uid)) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; } } while_each_thread(g, p); - if (who != cred->uid) + if (!uid_eq(uid, cred->uid)) free_uid(user); /* for find_user() */ break; } @@ -553,9 +556,19 @@ void ctrl_alt_del(void) */ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) { + struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; + kgid_t krgid, kegid; + + krgid = make_kgid(ns, rgid); + kegid = make_kgid(ns, egid); + + if ((rgid != (gid_t) -1) && !gid_valid(krgid)) + return -EINVAL; + if ((egid != (gid_t) -1) && !gid_valid(kegid)) + return -EINVAL; new = prepare_creds(); if (!new) @@ -564,25 +577,25 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) retval = -EPERM; if (rgid != (gid_t) -1) { - if (old->gid == rgid || - old->egid == rgid || + if (gid_eq(old->gid, krgid) || + gid_eq(old->egid, krgid) || nsown_capable(CAP_SETGID)) - new->gid = rgid; + new->gid = krgid; else goto error; } if (egid != (gid_t) -1) { - if (old->gid == egid || - old->egid == egid || - old->sgid == egid || + if (gid_eq(old->gid, kegid) || + gid_eq(old->egid, kegid) || + gid_eq(old->sgid, kegid) || nsown_capable(CAP_SETGID)) - new->egid = egid; + new->egid = kegid; else goto error; } if (rgid != (gid_t) -1 || - (egid != (gid_t) -1 && egid != old->gid)) + (egid != (gid_t) -1 && !gid_eq(kegid, old->gid))) new->sgid = new->egid; new->fsgid = new->egid; @@ -600,9 +613,15 @@ error: */ SYSCALL_DEFINE1(setgid, gid_t, gid) { + struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; + kgid_t kgid; + + kgid = make_kgid(ns, gid); + if (!gid_valid(kgid)) + return -EINVAL; new = prepare_creds(); if (!new) @@ -611,9 +630,9 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) retval = -EPERM; if (nsown_capable(CAP_SETGID)) - new->gid = new->egid = new->sgid = new->fsgid = gid; - else if (gid == old->gid || gid == old->sgid) - new->egid = new->fsgid = gid; + new->gid = new->egid = new->sgid = new->fsgid = kgid; + else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) + new->egid = new->fsgid = kgid; else goto error; @@ -631,7 +650,7 @@ static int set_user(struct cred *new) { struct user_struct *new_user; - new_user = alloc_uid(current_user_ns(), new->uid); + new_user = alloc_uid(new->uid); if (!new_user) return -EAGAIN; @@ -670,9 +689,19 @@ static int set_user(struct cred *new) */ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) { + struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; + kuid_t kruid, keuid; + + kruid = make_kuid(ns, ruid); + keuid = make_kuid(ns, euid); + + if ((ruid != (uid_t) -1) && !uid_valid(kruid)) + return -EINVAL; + if ((euid != (uid_t) -1) && !uid_valid(keuid)) + return -EINVAL; new = prepare_creds(); if (!new) @@ -681,29 +710,29 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) retval = -EPERM; if (ruid != (uid_t) -1) { - new->uid = ruid; - if (old->uid != ruid && - old->euid != ruid && + new->uid = kruid; + if (!uid_eq(old->uid, kruid) && + !uid_eq(old->euid, kruid) && !nsown_capable(CAP_SETUID)) goto error; } if (euid != (uid_t) -1) { - new->euid = euid; - if (old->uid != euid && - old->euid != euid && - old->suid != euid && + new->euid = keuid; + if (!uid_eq(old->uid, keuid) && + !uid_eq(old->euid, keuid) && + !uid_eq(old->suid, keuid) && !nsown_capable(CAP_SETUID)) goto error; } - if (new->uid != old->uid) { + if (!uid_eq(new->uid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } if (ruid != (uid_t) -1 || - (euid != (uid_t) -1 && euid != old->uid)) + (euid != (uid_t) -1 && !uid_eq(keuid, old->uid))) new->suid = new->euid; new->fsuid = new->euid; @@ -731,9 +760,15 @@ error: */ SYSCALL_DEFINE1(setuid, uid_t, uid) { + struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; + kuid_t kuid; + + kuid = make_kuid(ns, uid); + if (!uid_valid(kuid)) + return -EINVAL; new = prepare_creds(); if (!new) @@ -742,17 +777,17 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) retval = -EPERM; if (nsown_capable(CAP_SETUID)) { - new->suid = new->uid = uid; - if (uid != old->uid) { + new->suid = new->uid = kuid; + if (!uid_eq(kuid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } - } else if (uid != old->uid && uid != new->suid) { + } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) { goto error; } - new->fsuid = new->euid = uid; + new->fsuid = new->euid = kuid; retval = security_task_fix_setuid(new, old, LSM_SETID_ID); if (retval < 0) @@ -772,9 +807,24 @@ error: */ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) { + struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; + kuid_t kruid, keuid, ksuid; + + kruid = make_kuid(ns, ruid); + keuid = make_kuid(ns, euid); + ksuid = make_kuid(ns, suid); + + if ((ruid != (uid_t) -1) && !uid_valid(kruid)) + return -EINVAL; + + if ((euid != (uid_t) -1) && !uid_valid(keuid)) + return -EINVAL; + + if ((suid != (uid_t) -1) && !uid_valid(ksuid)) + return -EINVAL; new = prepare_creds(); if (!new) @@ -784,29 +834,29 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) retval = -EPERM; if (!nsown_capable(CAP_SETUID)) { - if (ruid != (uid_t) -1 && ruid != old->uid && - ruid != old->euid && ruid != old->suid) + if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && + !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) goto error; - if (euid != (uid_t) -1 && euid != old->uid && - euid != old->euid && euid != old->suid) + if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && + !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid)) goto error; - if (suid != (uid_t) -1 && suid != old->uid && - suid != old->euid && suid != old->suid) + if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && + !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid)) goto error; } if (ruid != (uid_t) -1) { - new->uid = ruid; - if (ruid != old->uid) { + new->uid = kruid; + if (!uid_eq(kruid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } } if (euid != (uid_t) -1) - new->euid = euid; + new->euid = keuid; if (suid != (uid_t) -1) - new->suid = suid; + new->suid = ksuid; new->fsuid = new->euid; retval = security_task_fix_setuid(new, old, LSM_SETID_RES); @@ -820,14 +870,19 @@ error: return retval; } -SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid) +SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) { const struct cred *cred = current_cred(); int retval; + uid_t ruid, euid, suid; - if (!(retval = put_user(cred->uid, ruid)) && - !(retval = put_user(cred->euid, euid))) - retval = put_user(cred->suid, suid); + ruid = from_kuid_munged(cred->user_ns, cred->uid); + euid = from_kuid_munged(cred->user_ns, cred->euid); + suid = from_kuid_munged(cred->user_ns, cred->suid); + + if (!(retval = put_user(ruid, ruidp)) && + !(retval = put_user(euid, euidp))) + retval = put_user(suid, suidp); return retval; } @@ -837,9 +892,22 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __u */ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) { + struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; + kgid_t krgid, kegid, ksgid; + + krgid = make_kgid(ns, rgid); + kegid = make_kgid(ns, egid); + ksgid = make_kgid(ns, sgid); + + if ((rgid != (gid_t) -1) && !gid_valid(krgid)) + return -EINVAL; + if ((egid != (gid_t) -1) && !gid_valid(kegid)) + return -EINVAL; + if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) + return -EINVAL; new = prepare_creds(); if (!new) @@ -848,23 +916,23 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) retval = -EPERM; if (!nsown_capable(CAP_SETGID)) { - if (rgid != (gid_t) -1 && rgid != old->gid && - rgid != old->egid && rgid != old->sgid) + if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && + !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) goto error; - if (egid != (gid_t) -1 && egid != old->gid && - egid != old->egid && egid != old->sgid) + if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && + !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid)) goto error; - if (sgid != (gid_t) -1 && sgid != old->gid && - sgid != old->egid && sgid != old->sgid) + if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && + !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid)) goto error; } if (rgid != (gid_t) -1) - new->gid = rgid; + new->gid = krgid; if (egid != (gid_t) -1) - new->egid = egid; + new->egid = kegid; if (sgid != (gid_t) -1) - new->sgid = sgid; + new->sgid = ksgid; new->fsgid = new->egid; return commit_creds(new); @@ -874,14 +942,19 @@ error: return retval; } -SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid) +SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) { const struct cred *cred = current_cred(); int retval; + gid_t rgid, egid, sgid; - if (!(retval = put_user(cred->gid, rgid)) && - !(retval = put_user(cred->egid, egid))) - retval = put_user(cred->sgid, sgid); + rgid = from_kgid_munged(cred->user_ns, cred->gid); + egid = from_kgid_munged(cred->user_ns, cred->egid); + sgid = from_kgid_munged(cred->user_ns, cred->sgid); + + if (!(retval = put_user(rgid, rgidp)) && + !(retval = put_user(egid, egidp))) + retval = put_user(sgid, sgidp); return retval; } @@ -898,18 +971,24 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) const struct cred *old; struct cred *new; uid_t old_fsuid; + kuid_t kuid; + + old = current_cred(); + old_fsuid = from_kuid_munged(old->user_ns, old->fsuid); + + kuid = make_kuid(old->user_ns, uid); + if (!uid_valid(kuid)) + return old_fsuid; new = prepare_creds(); if (!new) - return current_fsuid(); - old = current_cred(); - old_fsuid = old->fsuid; + return old_fsuid; - if (uid == old->uid || uid == old->euid || - uid == old->suid || uid == old->fsuid || + if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || + uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || nsown_capable(CAP_SETUID)) { - if (uid != old_fsuid) { - new->fsuid = uid; + if (!uid_eq(kuid, old->fsuid)) { + new->fsuid = kuid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) goto change_okay; } @@ -931,18 +1010,24 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) const struct cred *old; struct cred *new; gid_t old_fsgid; + kgid_t kgid; + + old = current_cred(); + old_fsgid = from_kgid_munged(old->user_ns, old->fsgid); + + kgid = make_kgid(old->user_ns, gid); + if (!gid_valid(kgid)) + return old_fsgid; new = prepare_creds(); if (!new) - return current_fsgid(); - old = current_cred(); - old_fsgid = old->fsgid; + return old_fsgid; - if (gid == old->gid || gid == old->egid || - gid == old->sgid || gid == old->fsgid || + if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || + gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || nsown_capable(CAP_SETGID)) { - if (gid != old_fsgid) { - new->fsgid = gid; + if (!gid_eq(kgid, old->fsgid)) { + new->fsgid = kgid; goto change_okay; } } @@ -1295,8 +1380,8 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) memcpy(u->nodename, tmp, len); memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; + uts_proc_notify(UTS_PROC_HOSTNAME); } - uts_proc_notify(UTS_PROC_HOSTNAME); up_write(&uts_sem); return errno; } @@ -1346,8 +1431,8 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) memcpy(u->domainname, tmp, len); memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; + uts_proc_notify(UTS_PROC_DOMAINNAME); } - uts_proc_notify(UTS_PROC_DOMAINNAME); up_write(&uts_sem); return errno; } @@ -1498,15 +1583,14 @@ static int check_prlimit_permission(struct task_struct *task) return 0; tcred = __task_cred(task); - if (cred->user->user_ns == tcred->user->user_ns && - (cred->uid == tcred->euid && - cred->uid == tcred->suid && - cred->uid == tcred->uid && - cred->gid == tcred->egid && - cred->gid == tcred->sgid && - cred->gid == tcred->gid)) + if (uid_eq(cred->uid, tcred->euid) && + uid_eq(cred->uid, tcred->suid) && + uid_eq(cred->uid, tcred->uid) && + gid_eq(cred->gid, tcred->egid) && + gid_eq(cred->gid, tcred->sgid) && + gid_eq(cred->gid, tcred->gid)) return 0; - if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) + if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) return 0; return -EPERM; @@ -1702,77 +1786,105 @@ SYSCALL_DEFINE1(umask, int, mask) } #ifdef CONFIG_CHECKPOINT_RESTORE +static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) +{ + struct file *exe_file; + struct dentry *dentry; + int err; + + exe_file = fget(fd); + if (!exe_file) + return -EBADF; + + dentry = exe_file->f_path.dentry; + + /* + * Because the original mm->exe_file points to executable file, make + * sure that this one is executable as well, to avoid breaking an + * overall picture. + */ + err = -EACCES; + if (!S_ISREG(dentry->d_inode->i_mode) || + exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC) + goto exit; + + err = inode_permission(dentry->d_inode, MAY_EXEC); + if (err) + goto exit; + + down_write(&mm->mmap_sem); + + /* + * Forbid mm->exe_file change if old file still mapped. + */ + err = -EBUSY; + if (mm->exe_file) { + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) + if (vma->vm_file && + path_equal(&vma->vm_file->f_path, + &mm->exe_file->f_path)) + goto exit_unlock; + } + + /* + * The symlink can be changed only once, just to disallow arbitrary + * transitions malicious software might bring in. This means one + * could make a snapshot over all processes running and monitor + * /proc/pid/exe changes to notice unusual activity if needed. + */ + err = -EPERM; + if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) + goto exit_unlock; + + err = 0; + set_mm_exe_file(mm, exe_file); +exit_unlock: + up_write(&mm->mmap_sem); + +exit: + fput(exe_file); + return err; +} + static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { unsigned long rlim = rlimit(RLIMIT_DATA); - unsigned long vm_req_flags; - unsigned long vm_bad_flags; - struct vm_area_struct *vma; - int error = 0; struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int error; - if (arg4 | arg5) + if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) return -EINVAL; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - if (addr >= TASK_SIZE) + if (opt == PR_SET_MM_EXE_FILE) + return prctl_set_mm_exe_file(mm, (unsigned int)addr); + + if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; + error = -EINVAL; + down_read(&mm->mmap_sem); vma = find_vma(mm, addr); - if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { - /* It must be existing VMA */ - if (!vma || vma->vm_start > addr) - goto out; - } - - error = -EINVAL; switch (opt) { case PR_SET_MM_START_CODE: + mm->start_code = addr; + break; case PR_SET_MM_END_CODE: - vm_req_flags = VM_READ | VM_EXEC; - vm_bad_flags = VM_WRITE | VM_MAYSHARE; - - if ((vma->vm_flags & vm_req_flags) != vm_req_flags || - (vma->vm_flags & vm_bad_flags)) - goto out; - - if (opt == PR_SET_MM_START_CODE) - mm->start_code = addr; - else - mm->end_code = addr; + mm->end_code = addr; break; - case PR_SET_MM_START_DATA: - case PR_SET_MM_END_DATA: - vm_req_flags = VM_READ | VM_WRITE; - vm_bad_flags = VM_EXEC | VM_MAYSHARE; - - if ((vma->vm_flags & vm_req_flags) != vm_req_flags || - (vma->vm_flags & vm_bad_flags)) - goto out; - - if (opt == PR_SET_MM_START_DATA) - mm->start_data = addr; - else - mm->end_data = addr; + mm->start_data = addr; break; - - case PR_SET_MM_START_STACK: - -#ifdef CONFIG_STACK_GROWSUP - vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; -#else - vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; -#endif - if ((vma->vm_flags & vm_req_flags) != vm_req_flags) - goto out; - - mm->start_stack = addr; + case PR_SET_MM_END_DATA: + mm->end_data = addr; break; case PR_SET_MM_START_BRK: @@ -1799,24 +1911,89 @@ static int prctl_set_mm(int opt, unsigned long addr, mm->brk = addr; break; + /* + * If command line arguments and environment + * are placed somewhere else on stack, we can + * set them up here, ARG_START/END to setup + * command line argumets and ENV_START/END + * for environment. + */ + case PR_SET_MM_START_STACK: + case PR_SET_MM_ARG_START: + case PR_SET_MM_ARG_END: + case PR_SET_MM_ENV_START: + case PR_SET_MM_ENV_END: + if (!vma) { + error = -EFAULT; + goto out; + } + if (opt == PR_SET_MM_START_STACK) + mm->start_stack = addr; + else if (opt == PR_SET_MM_ARG_START) + mm->arg_start = addr; + else if (opt == PR_SET_MM_ARG_END) + mm->arg_end = addr; + else if (opt == PR_SET_MM_ENV_START) + mm->env_start = addr; + else if (opt == PR_SET_MM_ENV_END) + mm->env_end = addr; + break; + + /* + * This doesn't move auxiliary vector itself + * since it's pinned to mm_struct, but allow + * to fill vector with new values. It's up + * to a caller to provide sane values here + * otherwise user space tools which use this + * vector might be unhappy. + */ + case PR_SET_MM_AUXV: { + unsigned long user_auxv[AT_VECTOR_SIZE]; + + if (arg4 > sizeof(user_auxv)) + goto out; + up_read(&mm->mmap_sem); + + if (copy_from_user(user_auxv, (const void __user *)addr, arg4)) + return -EFAULT; + + /* Make sure the last entry is always AT_NULL */ + user_auxv[AT_VECTOR_SIZE - 2] = 0; + user_auxv[AT_VECTOR_SIZE - 1] = 0; + + BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); + + task_lock(current); + memcpy(mm->saved_auxv, user_auxv, arg4); + task_unlock(current); + + return 0; + } default: - error = -EINVAL; goto out; } error = 0; - out: up_read(&mm->mmap_sem); - return error; } + +static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +{ + return put_user(me->clear_child_tid, tid_addr); +} + #else /* CONFIG_CHECKPOINT_RESTORE */ static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { return -EINVAL; } +static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +{ + return -EINVAL; +} #endif SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, @@ -1971,6 +2148,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_MM: error = prctl_set_mm(arg2, arg3, arg4, arg5); break; + case PR_GET_TID_ADDRESS: + error = prctl_get_tid_address(me, (int __user **)arg2); + break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; error = 0; @@ -2032,7 +2212,6 @@ int orderly_poweroff(bool force) NULL }; int ret = -ENOMEM; - struct subprocess_info *info; if (argv == NULL) { printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", @@ -2040,18 +2219,16 @@ int orderly_poweroff(bool force) goto out; } - info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC); - if (info == NULL) { - argv_free(argv); - goto out; - } - - call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL); + ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, + NULL, argv_cleanup, NULL); +out: + if (likely(!ret)) + return 0; - ret = call_usermodehelper_exec(info, UMH_NO_WAIT); + if (ret == -ENOMEM) + argv_free(argv); - out: - if (ret && force) { + if (force) { printk(KERN_WARNING "Failed to start orderly shutdown: " "forcing the issue\n"); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 47bfa16430d7..dbff751e4086 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -203,3 +203,6 @@ cond_syscall(sys_fanotify_mark); cond_syscall(sys_name_to_handle_at); cond_syscall(sys_open_by_handle_at); cond_syscall(compat_sys_open_by_handle_at); + +/* compare kernel pointers */ +cond_syscall(sys_kcmp); diff --git a/kernel/task_work.c b/kernel/task_work.c new file mode 100644 index 000000000000..82d1c794066d --- /dev/null +++ b/kernel/task_work.c @@ -0,0 +1,84 @@ +#include <linux/spinlock.h> +#include <linux/task_work.h> +#include <linux/tracehook.h> + +int +task_work_add(struct task_struct *task, struct task_work *twork, bool notify) +{ + unsigned long flags; + int err = -ESRCH; + +#ifndef TIF_NOTIFY_RESUME + if (notify) + return -ENOTSUPP; +#endif + /* + * We must not insert the new work if the task has already passed + * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() + * and check PF_EXITING under pi_lock. + */ + raw_spin_lock_irqsave(&task->pi_lock, flags); + if (likely(!(task->flags & PF_EXITING))) { + hlist_add_head(&twork->hlist, &task->task_works); + err = 0; + } + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ + if (likely(!err) && notify) + set_notify_resume(task); + return err; +} + +struct task_work * +task_work_cancel(struct task_struct *task, task_work_func_t func) +{ + unsigned long flags; + struct task_work *twork; + struct hlist_node *pos; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + hlist_for_each_entry(twork, pos, &task->task_works, hlist) { + if (twork->func == func) { + hlist_del(&twork->hlist); + goto found; + } + } + twork = NULL; + found: + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + return twork; +} + +void task_work_run(void) +{ + struct task_struct *task = current; + struct hlist_head task_works; + struct hlist_node *pos; + + raw_spin_lock_irq(&task->pi_lock); + hlist_move_list(&task->task_works, &task_works); + raw_spin_unlock_irq(&task->pi_lock); + + if (unlikely(hlist_empty(&task_works))) + return; + /* + * We use hlist to save the space in task_struct, but we want fifo. + * Find the last entry, the list should be short, then process them + * in reverse order. + */ + for (pos = task_works.first; pos->next; pos = pos->next) + ; + + for (;;) { + struct hlist_node **pprev = pos->pprev; + struct task_work *twork = container_of(pos, struct task_work, + hlist); + twork->func(twork); + + if (pprev == &task_works.first) + break; + pos = container_of(pprev, struct hlist_node, next); + } +} diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index a20dc8a3c949..fd42bd452b75 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -2,6 +2,55 @@ # Timer subsystem related configuration options # +# Options selectable by arch Kconfig + +# Watchdog function for clocksources to detect instabilities +config CLOCKSOURCE_WATCHDOG + bool + +# Architecture has extra clocksource data +config ARCH_CLOCKSOURCE_DATA + bool + +# Timekeeping vsyscall support +config GENERIC_TIME_VSYSCALL + bool + +# ktime_t scalar 64bit nsec representation +config KTIME_SCALAR + bool + +# Old style timekeeping +config ARCH_USES_GETTIMEOFFSET + bool + +# The generic clock events infrastructure +config GENERIC_CLOCKEVENTS + bool + +# Migration helper. Builds, but does not invoke +config GENERIC_CLOCKEVENTS_BUILD + bool + default y + depends on GENERIC_CLOCKEVENTS + +# Clockevents broadcasting infrastructure +config GENERIC_CLOCKEVENTS_BROADCAST + bool + depends on GENERIC_CLOCKEVENTS + +# Automatically adjust the min. reprogramming time for +# clock event device +config GENERIC_CLOCKEVENTS_MIN_ADJUST + bool + +# Generic update of CMOS clock +config GENERIC_CMOS_UPDATE + bool + +if GENERIC_CLOCKEVENTS +menu "Timers subsystem" + # Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is # only related to the tick functionality. Oneshot clockevent devices # are supported independ of this. @@ -26,10 +75,5 @@ config HIGH_RES_TIMERS hardware is not capable then this option only increases the size of the kernel image. -config GENERIC_CLOCKEVENTS_BUILD - bool - default y - depends on GENERIC_CLOCKEVENTS - -config GENERIC_CLOCKEVENTS_MIN_ADJUST - bool +endmenu +endif diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9cd928f7a7c6..7e1ce012a851 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev) } EXPORT_SYMBOL_GPL(clockevents_register_device); -static void clockevents_config(struct clock_event_device *dev, - u32 freq) +void clockevents_config(struct clock_event_device *dev, u32 freq) { u64 sec; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f03fd83b170b..b7fbadc5c973 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -409,15 +409,20 @@ int second_overflow(unsigned long secs) time_state = TIME_DEL; break; case TIME_INS: - if (secs % 86400 == 0) { + if (!(time_status & STA_INS)) + time_state = TIME_OK; + else if (secs % 86400 == 0) { leap = -1; time_state = TIME_OOP; + time_tai++; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); } break; case TIME_DEL: - if ((secs + 1) % 86400 == 0) { + if (!(time_status & STA_DEL)) + time_state = TIME_OK; + else if ((secs + 1) % 86400 == 0) { leap = 1; time_tai--; time_state = TIME_WAIT; @@ -426,7 +431,6 @@ int second_overflow(unsigned long secs) } break; case TIME_OOP: - time_tai++; time_state = TIME_WAIT; break; @@ -473,8 +477,6 @@ int second_overflow(unsigned long secs) << NTP_SCALE_SHIFT; time_adjust = 0; - - out: spin_unlock_irqrestore(&ntp_lock, flags); @@ -559,10 +561,10 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) /* only set allowed bits */ time_status &= STA_RONLY; time_status |= txc->status & ~STA_RONLY; - } + /* - * Called with the xtime lock held, so we can access and modify + * Called with ntp_lock held, so we can access and modify * all the global NTP state: */ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..4a08472c3ca7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -274,6 +274,7 @@ EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); static void tick_nohz_stop_sched_tick(struct tick_sched *ts) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; + unsigned long rcu_delta_jiffies; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; @@ -322,7 +323,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) time_delta = timekeeping_max_deferment(); } while (read_seqretry(&xtime_lock, seq)); - if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || + if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || arch_needs_cpu(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; @@ -330,6 +331,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) /* Get the next timer wheel timer */ next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; + if (rcu_delta_jiffies < delta_jiffies) { + next_jiffies = last_jiffies + rcu_delta_jiffies; + delta_jiffies = rcu_delta_jiffies; + } } /* * Do not stop the tick, if we are only one off @@ -401,6 +406,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) */ if (!ts->tick_stopped) { select_nohz_load_balancer(1); + calc_load_enter_idle(); ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; @@ -576,6 +582,7 @@ void tick_nohz_idle_exit(void) /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); + update_cpu_load_nohz(); #ifndef CONFIG_VIRT_CPU_ACCOUNTING /* @@ -591,6 +598,7 @@ void tick_nohz_idle_exit(void) account_idle_ticks(ticks); #endif + calc_load_exit_idle(); touch_softlockup_watchdog(); /* * Cancel the scheduled timer and restore the tick @@ -814,6 +822,16 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) return HRTIMER_RESTART; } +static int sched_skew_tick; + +static int __init skew_tick(char *str) +{ + get_option(&str, &sched_skew_tick); + + return 0; +} +early_param("skew_tick", skew_tick); + /** * tick_setup_sched_timer - setup the tick emulation timer */ @@ -831,6 +849,14 @@ void tick_setup_sched_timer(void) /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); + /* Offset the tick to avert xtime_lock contention. */ + if (sched_skew_tick) { + u64 offset = ktime_to_ns(tick_period) >> 1; + do_div(offset, num_possible_cpus()); + offset *= smp_processor_id(); + hrtimer_add_expires_ns(&ts->sched_timer, offset); + } + for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); hrtimer_start_expires(&ts->sched_timer, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d66b21308f7c..3447cfaf11e7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -70,6 +70,12 @@ struct timekeeper { /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ struct timespec raw_time; + /* Offset clock monotonic -> clock realtime */ + ktime_t offs_real; + + /* Offset clock monotonic -> clock boottime */ + ktime_t offs_boot; + /* Seqlock for all timekeeper values */ seqlock_t lock; }; @@ -172,6 +178,14 @@ static inline s64 timekeeping_get_ns_raw(void) return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); } +static void update_rt_offset(void) +{ + struct timespec tmp, *wtm = &timekeeper.wall_to_monotonic; + + set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); + timekeeper.offs_real = timespec_to_ktime(tmp); +} + /* must hold write on timekeeper.lock */ static void timekeeping_update(bool clearntp) { @@ -179,6 +193,7 @@ static void timekeeping_update(bool clearntp) timekeeper.ntp_error = 0; ntp_clear(); } + update_rt_offset(); update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); } @@ -240,7 +255,6 @@ void getnstimeofday(struct timespec *ts) timespec_add_ns(ts, nsecs); } - EXPORT_SYMBOL(getnstimeofday); ktime_t ktime_get(void) @@ -357,8 +371,8 @@ void do_gettimeofday(struct timeval *tv) tv->tv_sec = now.tv_sec; tv->tv_usec = now.tv_nsec/1000; } - EXPORT_SYMBOL(do_gettimeofday); + /** * do_settimeofday - Sets the time of day * @tv: pointer to the timespec variable containing the new time @@ -392,7 +406,6 @@ int do_settimeofday(const struct timespec *tv) return 0; } - EXPORT_SYMBOL(do_settimeofday); @@ -606,6 +619,7 @@ void __init timekeeping_init(void) } set_normalized_timespec(&timekeeper.wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); + update_rt_offset(); timekeeper.total_sleep_time.tv_sec = 0; timekeeper.total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -614,6 +628,12 @@ void __init timekeeping_init(void) /* time in seconds when suspend began */ static struct timespec timekeeping_suspend_time; +static void update_sleep_time(struct timespec t) +{ + timekeeper.total_sleep_time = t; + timekeeper.offs_boot = timespec_to_ktime(t); +} + /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval * @delta: pointer to a timespec delta value @@ -632,8 +652,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, *delta); - timekeeper.total_sleep_time = timespec_add( - timekeeper.total_sleep_time, *delta); + update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta)); } @@ -698,6 +717,7 @@ static void timekeeping_resume(void) timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; timekeeping_suspended = 0; + timekeeping_update(false); write_sequnlock_irqrestore(&timekeeper.lock, flags); touch_softlockup_watchdog(); @@ -964,6 +984,9 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) timekeeper.xtime.tv_sec++; leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; + timekeeper.wall_to_monotonic.tv_sec -= leap; + if (leap) + clock_was_set_delayed(); } /* Accumulate raw time */ @@ -1079,6 +1102,9 @@ static void update_wall_time(void) timekeeper.xtime.tv_sec++; leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; + timekeeper.wall_to_monotonic.tv_sec -= leap; + if (leap) + clock_was_set_delayed(); } timekeeping_update(false); @@ -1246,6 +1272,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, } while (read_seqretry(&timekeeper.lock, seq)); } +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * ktime_get_update_offsets - hrtimer helper + * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset + * + * Returns current monotonic time and updates the offsets + * Called from hrtimer_interupt() or retrigger_next_event() + */ +ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) +{ + ktime_t now; + unsigned int seq; + u64 secs, nsecs; + + do { + seq = read_seqbegin(&timekeeper.lock); + + secs = timekeeper.xtime.tv_sec; + nsecs = timekeeper.xtime.tv_nsec; + nsecs += timekeeping_get_ns(); + /* If arch requires, add in gettimeoffset() */ + nsecs += arch_gettimeoffset(); + + *offs_real = timekeeper.offs_real; + *offs_boot = timekeeper.offs_boot; + } while (read_seqretry(&timekeeper.lock, seq)); + + now = ktime_add_ns(ktime_set(secs, 0), nsecs); + now = ktime_sub(now, *offs_real); + return now; +} +#endif + /** * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format */ diff --git a/kernel/timer.c b/kernel/timer.c index 09de9a941cd7..6ec7e7e0db43 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1435,25 +1435,25 @@ SYSCALL_DEFINE0(getppid) SYSCALL_DEFINE0(getuid) { /* Only we change this so SMP safe */ - return current_uid(); + return from_kuid_munged(current_user_ns(), current_uid()); } SYSCALL_DEFINE0(geteuid) { /* Only we change this so SMP safe */ - return current_euid(); + return from_kuid_munged(current_user_ns(), current_euid()); } SYSCALL_DEFINE0(getgid) { /* Only we change this so SMP safe */ - return current_gid(); + return from_kgid_munged(current_user_ns(), current_gid()); } SYSCALL_DEFINE0(getegid) { /* Only we change this so SMP safe */ - return current_egid(); + return from_kgid_munged(current_user_ns(), current_egid()); } #endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d81a1a532994..8c4c07071cc5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -271,7 +271,7 @@ config PROFILE_ANNOTATED_BRANCHES bool "Trace likely/unlikely profiler" select TRACE_BRANCH_PROFILING help - This tracer profiles all the the likely and unlikely macros + This tracer profiles all likely and unlikely macros in the kernel. It will display the results in: /sys/kernel/debug/tracing/trace_stat/branch_annotated @@ -372,6 +372,7 @@ config KPROBE_EVENT depends on HAVE_REGS_AND_STACK_ACCESS_API bool "Enable kprobes-based dynamic events" select TRACING + select PROBE_EVENTS default y help This allows the user to add tracing events (similar to tracepoints) @@ -384,6 +385,25 @@ config KPROBE_EVENT This option is also required by perf-probe subcommand of perf tools. If you want to use perf tools, this option is strongly recommended. +config UPROBE_EVENT + bool "Enable uprobes-based dynamic events" + depends on ARCH_SUPPORTS_UPROBES + depends on MMU + select UPROBES + select PROBE_EVENTS + select TRACING + default n + help + This allows the user to add tracing events on top of userspace + dynamic events (similar to tracepoints) on the fly via the trace + events interface. Those events can be inserted wherever uprobes + can probe, and record various registers. + This option is required if you plan to use perf-probe subcommand + of perf tools on user space applications. + +config PROBE_EVENTS + def_bool n + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index b3afe0e76f79..b831087c8200 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -60,5 +60,7 @@ endif ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif +obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o +obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o libftrace-y := ftrace.o diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6420cda62336..f765465bffe4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + INIT_LIST_HEAD(&cpu_buffer->new_pages); ret = rb_allocate_pages(cpu_buffer, nr_pages); if (ret < 0) @@ -1346,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) * If something was added to this page, it was full * since it is not the tail page. So we deduct the * bytes consumed in ring buffer from here. - * No need to update overruns, since this page is - * deleted from ring buffer and its entries are - * already accounted for. + * Increment overrun to account for the lost events. */ + local_add(page_entries, &cpu_buffer->overrun); local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); } @@ -1486,6 +1486,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, if (!buffer) return size; + /* Make sure the requested buffer exists */ + if (cpu_id != RING_BUFFER_ALL_CPUS && + !cpumask_test_cpu(cpu_id, buffer->cpumask)) + return size; + size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); size *= BUF_PAGE_SIZE; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 68032c6177db..a7fa0702be1c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -371,7 +371,7 @@ EXPORT_SYMBOL_GPL(tracing_on); void tracing_off(void) { if (global_trace.buffer) - ring_buffer_record_on(global_trace.buffer); + ring_buffer_record_off(global_trace.buffer); /* * This flag is only looked at when buffers haven't been * allocated yet. We don't really care about the race @@ -3609,6 +3609,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .pages = pages_def, .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, @@ -3680,7 +3681,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, ret = splice_to_pipe(pipe, &spd); out: - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); return ret; out_err: @@ -4231,6 +4232,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, struct splice_pipe_desc spd = { .pages = pages_def, .partial = partial_def, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, @@ -4318,7 +4320,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, } ret = splice_to_pipe(pipe, &spd); - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); out: return ret; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6c6f7933eede..5aec220d2de0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -103,6 +103,11 @@ struct kretprobe_trace_entry_head { unsigned long ret_ip; }; +struct uprobe_trace_entry_head { + struct trace_entry ent; + unsigned long ip; +}; + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 580a05ec926b..b31d3d5699fe 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -19,547 +19,15 @@ #include <linux/module.h> #include <linux/uaccess.h> -#include <linux/kprobes.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include <linux/smp.h> -#include <linux/debugfs.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/ctype.h> -#include <linux/ptrace.h> -#include <linux/perf_event.h> -#include <linux/stringify.h> -#include <linux/limits.h> -#include <asm/bitsperlong.h> - -#include "trace.h" -#include "trace_output.h" - -#define MAX_TRACE_ARGS 128 -#define MAX_ARGSTR_LEN 63 -#define MAX_EVENT_NAME_LEN 64 -#define MAX_STRING_SIZE PATH_MAX -#define KPROBE_EVENT_SYSTEM "kprobes" - -/* Reserved field names */ -#define FIELD_STRING_IP "__probe_ip" -#define FIELD_STRING_RETIP "__probe_ret_ip" -#define FIELD_STRING_FUNC "__probe_func" - -const char *reserved_field_names[] = { - "common_type", - "common_flags", - "common_preempt_count", - "common_pid", - "common_tgid", - FIELD_STRING_IP, - FIELD_STRING_RETIP, - FIELD_STRING_FUNC, -}; - -/* Printing function type */ -typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, - void *); -#define PRINT_TYPE_FUNC_NAME(type) print_type_##type -#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type - -/* Printing in basic type function template */ -#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ -static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ - const char *name, \ - void *data, void *ent)\ -{ \ - return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ -} \ -static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; - -DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) -DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) -DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) -DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) -DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) -DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) -DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) -DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) - -/* data_rloc: data relative location, compatible with u32 */ -#define make_data_rloc(len, roffs) \ - (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) -#define get_rloc_len(dl) ((u32)(dl) >> 16) -#define get_rloc_offs(dl) ((u32)(dl) & 0xffff) - -static inline void *get_rloc_data(u32 *dl) -{ - return (u8 *)dl + get_rloc_offs(*dl); -} - -/* For data_loc conversion */ -static inline void *get_loc_data(u32 *dl, void *ent) -{ - return (u8 *)ent + get_rloc_offs(*dl); -} - -/* - * Convert data_rloc to data_loc: - * data_rloc stores the offset from data_rloc itself, but data_loc - * stores the offset from event entry. - */ -#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) - -/* For defining macros, define string/string_size types */ -typedef u32 string; -typedef u32 string_size; - -/* Print type function for string type */ -static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, - const char *name, - void *data, void *ent) -{ - int len = *(u32 *)data >> 16; - - if (!len) - return trace_seq_printf(s, " %s=(fault)", name); - else - return trace_seq_printf(s, " %s=\"%s\"", name, - (const char *)get_loc_data(data, ent)); -} -static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; - -/* Data fetch function type */ -typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); - -struct fetch_param { - fetch_func_t fn; - void *data; -}; - -static __kprobes void call_fetch(struct fetch_param *fprm, - struct pt_regs *regs, void *dest) -{ - return fprm->fn(regs, fprm->data, dest); -} - -#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type -/* - * Define macro for basic types - we don't need to define s* types, because - * we have to care only about bitwidth at recording time. - */ -#define DEFINE_BASIC_FETCH_FUNCS(method) \ -DEFINE_FETCH_##method(u8) \ -DEFINE_FETCH_##method(u16) \ -DEFINE_FETCH_##method(u32) \ -DEFINE_FETCH_##method(u64) - -#define CHECK_FETCH_FUNCS(method, fn) \ - (((FETCH_FUNC_NAME(method, u8) == fn) || \ - (FETCH_FUNC_NAME(method, u16) == fn) || \ - (FETCH_FUNC_NAME(method, u32) == fn) || \ - (FETCH_FUNC_NAME(method, u64) == fn) || \ - (FETCH_FUNC_NAME(method, string) == fn) || \ - (FETCH_FUNC_NAME(method, string_size) == fn)) \ - && (fn != NULL)) - -/* Data fetch function templates */ -#define DEFINE_FETCH_reg(type) \ -static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ - void *offset, void *dest) \ -{ \ - *(type *)dest = (type)regs_get_register(regs, \ - (unsigned int)((unsigned long)offset)); \ -} -DEFINE_BASIC_FETCH_FUNCS(reg) -/* No string on the register */ -#define fetch_reg_string NULL -#define fetch_reg_string_size NULL - -#define DEFINE_FETCH_stack(type) \ -static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ - void *offset, void *dest) \ -{ \ - *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ - (unsigned int)((unsigned long)offset)); \ -} -DEFINE_BASIC_FETCH_FUNCS(stack) -/* No string on the stack entry */ -#define fetch_stack_string NULL -#define fetch_stack_string_size NULL - -#define DEFINE_FETCH_retval(type) \ -static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ - void *dummy, void *dest) \ -{ \ - *(type *)dest = (type)regs_return_value(regs); \ -} -DEFINE_BASIC_FETCH_FUNCS(retval) -/* No string on the retval */ -#define fetch_retval_string NULL -#define fetch_retval_string_size NULL - -#define DEFINE_FETCH_memory(type) \ -static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ - void *addr, void *dest) \ -{ \ - type retval; \ - if (probe_kernel_address(addr, retval)) \ - *(type *)dest = 0; \ - else \ - *(type *)dest = retval; \ -} -DEFINE_BASIC_FETCH_FUNCS(memory) -/* - * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max - * length and relative data location. - */ -static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) -{ - long ret; - int maxlen = get_rloc_len(*(u32 *)dest); - u8 *dst = get_rloc_data(dest); - u8 *src = addr; - mm_segment_t old_fs = get_fs(); - if (!maxlen) - return; - /* - * Try to get string again, since the string can be changed while - * probing. - */ - set_fs(KERNEL_DS); - pagefault_disable(); - do - ret = __copy_from_user_inatomic(dst++, src++, 1); - while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); - dst[-1] = '\0'; - pagefault_enable(); - set_fs(old_fs); - - if (ret < 0) { /* Failed to fetch string */ - ((u8 *)get_rloc_data(dest))[0] = '\0'; - *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); - } else - *(u32 *)dest = make_data_rloc(src - (u8 *)addr, - get_rloc_offs(*(u32 *)dest)); -} -/* Return the length of string -- including null terminal byte */ -static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) -{ - int ret, len = 0; - u8 c; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - pagefault_disable(); - do { - ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); - len++; - } while (c && ret == 0 && len < MAX_STRING_SIZE); - pagefault_enable(); - set_fs(old_fs); - - if (ret < 0) /* Failed to check the length */ - *(u32 *)dest = 0; - else - *(u32 *)dest = len; -} - -/* Memory fetching by symbol */ -struct symbol_cache { - char *symbol; - long offset; - unsigned long addr; -}; - -static unsigned long update_symbol_cache(struct symbol_cache *sc) -{ - sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); - if (sc->addr) - sc->addr += sc->offset; - return sc->addr; -} - -static void free_symbol_cache(struct symbol_cache *sc) -{ - kfree(sc->symbol); - kfree(sc); -} - -static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) -{ - struct symbol_cache *sc; - - if (!sym || strlen(sym) == 0) - return NULL; - sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); - if (!sc) - return NULL; - - sc->symbol = kstrdup(sym, GFP_KERNEL); - if (!sc->symbol) { - kfree(sc); - return NULL; - } - sc->offset = offset; - update_symbol_cache(sc); - return sc; -} - -#define DEFINE_FETCH_symbol(type) \ -static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ - void *data, void *dest) \ -{ \ - struct symbol_cache *sc = data; \ - if (sc->addr) \ - fetch_memory_##type(regs, (void *)sc->addr, dest); \ - else \ - *(type *)dest = 0; \ -} -DEFINE_BASIC_FETCH_FUNCS(symbol) -DEFINE_FETCH_symbol(string) -DEFINE_FETCH_symbol(string_size) - -/* Dereference memory access function */ -struct deref_fetch_param { - struct fetch_param orig; - long offset; -}; - -#define DEFINE_FETCH_deref(type) \ -static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ - void *data, void *dest) \ -{ \ - struct deref_fetch_param *dprm = data; \ - unsigned long addr; \ - call_fetch(&dprm->orig, regs, &addr); \ - if (addr) { \ - addr += dprm->offset; \ - fetch_memory_##type(regs, (void *)addr, dest); \ - } else \ - *(type *)dest = 0; \ -} -DEFINE_BASIC_FETCH_FUNCS(deref) -DEFINE_FETCH_deref(string) -DEFINE_FETCH_deref(string_size) - -static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) -{ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - update_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - update_symbol_cache(data->orig.data); -} - -static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) -{ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - free_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - free_symbol_cache(data->orig.data); - kfree(data); -} - -/* Bitfield fetch function */ -struct bitfield_fetch_param { - struct fetch_param orig; - unsigned char hi_shift; - unsigned char low_shift; -}; +#include "trace_probe.h" -#define DEFINE_FETCH_bitfield(type) \ -static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ - void *data, void *dest) \ -{ \ - struct bitfield_fetch_param *bprm = data; \ - type buf = 0; \ - call_fetch(&bprm->orig, regs, &buf); \ - if (buf) { \ - buf <<= bprm->hi_shift; \ - buf >>= bprm->low_shift; \ - } \ - *(type *)dest = buf; \ -} -DEFINE_BASIC_FETCH_FUNCS(bitfield) -#define fetch_bitfield_string NULL -#define fetch_bitfield_string_size NULL - -static __kprobes void -update_bitfield_fetch_param(struct bitfield_fetch_param *data) -{ - /* - * Don't check the bitfield itself, because this must be the - * last fetch function. - */ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - update_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - update_symbol_cache(data->orig.data); -} - -static __kprobes void -free_bitfield_fetch_param(struct bitfield_fetch_param *data) -{ - /* - * Don't check the bitfield itself, because this must be the - * last fetch function. - */ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - free_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - free_symbol_cache(data->orig.data); - kfree(data); -} - -/* Default (unsigned long) fetch type */ -#define __DEFAULT_FETCH_TYPE(t) u##t -#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) -#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) -#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) - -/* Fetch types */ -enum { - FETCH_MTD_reg = 0, - FETCH_MTD_stack, - FETCH_MTD_retval, - FETCH_MTD_memory, - FETCH_MTD_symbol, - FETCH_MTD_deref, - FETCH_MTD_bitfield, - FETCH_MTD_END, -}; - -#define ASSIGN_FETCH_FUNC(method, type) \ - [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) - -#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ - {.name = _name, \ - .size = _size, \ - .is_signed = sign, \ - .print = PRINT_TYPE_FUNC_NAME(ptype), \ - .fmt = PRINT_TYPE_FMT_NAME(ptype), \ - .fmttype = _fmttype, \ - .fetch = { \ -ASSIGN_FETCH_FUNC(reg, ftype), \ -ASSIGN_FETCH_FUNC(stack, ftype), \ -ASSIGN_FETCH_FUNC(retval, ftype), \ -ASSIGN_FETCH_FUNC(memory, ftype), \ -ASSIGN_FETCH_FUNC(symbol, ftype), \ -ASSIGN_FETCH_FUNC(deref, ftype), \ -ASSIGN_FETCH_FUNC(bitfield, ftype), \ - } \ - } - -#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ - __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) - -#define FETCH_TYPE_STRING 0 -#define FETCH_TYPE_STRSIZE 1 - -/* Fetch type information table */ -static const struct fetch_type { - const char *name; /* Name of type */ - size_t size; /* Byte size of type */ - int is_signed; /* Signed flag */ - print_type_func_t print; /* Print functions */ - const char *fmt; /* Fromat string */ - const char *fmttype; /* Name in format file */ - /* Fetch functions */ - fetch_func_t fetch[FETCH_MTD_END]; -} fetch_type_table[] = { - /* Special types */ - [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, - sizeof(u32), 1, "__data_loc char[]"), - [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, - string_size, sizeof(u32), 0, "u32"), - /* Basic types */ - ASSIGN_FETCH_TYPE(u8, u8, 0), - ASSIGN_FETCH_TYPE(u16, u16, 0), - ASSIGN_FETCH_TYPE(u32, u32, 0), - ASSIGN_FETCH_TYPE(u64, u64, 0), - ASSIGN_FETCH_TYPE(s8, u8, 1), - ASSIGN_FETCH_TYPE(s16, u16, 1), - ASSIGN_FETCH_TYPE(s32, u32, 1), - ASSIGN_FETCH_TYPE(s64, u64, 1), -}; - -static const struct fetch_type *find_fetch_type(const char *type) -{ - int i; - - if (!type) - type = DEFAULT_FETCH_TYPE_STR; - - /* Special case: bitfield */ - if (*type == 'b') { - unsigned long bs; - type = strchr(type, '/'); - if (!type) - goto fail; - type++; - if (strict_strtoul(type, 0, &bs)) - goto fail; - switch (bs) { - case 8: - return find_fetch_type("u8"); - case 16: - return find_fetch_type("u16"); - case 32: - return find_fetch_type("u32"); - case 64: - return find_fetch_type("u64"); - default: - goto fail; - } - } - - for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) - if (strcmp(type, fetch_type_table[i].name) == 0) - return &fetch_type_table[i]; -fail: - return NULL; -} - -/* Special function : only accept unsigned long */ -static __kprobes void fetch_stack_address(struct pt_regs *regs, - void *dummy, void *dest) -{ - *(unsigned long *)dest = kernel_stack_pointer(regs); -} - -static fetch_func_t get_fetch_size_function(const struct fetch_type *type, - fetch_func_t orig_fn) -{ - int i; - - if (type != &fetch_type_table[FETCH_TYPE_STRING]) - return NULL; /* Only string type needs size function */ - for (i = 0; i < FETCH_MTD_END; i++) - if (type->fetch[i] == orig_fn) - return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; - - WARN_ON(1); /* This should not happen */ - return NULL; -} +#define KPROBE_EVENT_SYSTEM "kprobes" /** * Kprobe event core functions */ -struct probe_arg { - struct fetch_param fetch; - struct fetch_param fetch_size; - unsigned int offset; /* Offset from argument entry */ - const char *name; /* Name of this argument */ - const char *comm; /* Command of this argument */ - const struct fetch_type *type; /* Type of this argument */ -}; - -/* Flags for trace_probe */ -#define TP_FLAG_TRACE 1 -#define TP_FLAG_PROFILE 2 -#define TP_FLAG_REGISTERED 4 - struct trace_probe { struct list_head list; struct kretprobe rp; /* Use rp.kp for kprobe use */ @@ -631,18 +99,6 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); -/* Check the name is good for event/group/fields */ -static int is_good_name(const char *name) -{ - if (!isalpha(*name) && *name != '_') - return 0; - while (*++name != '\0') { - if (!isalpha(*name) && !isdigit(*name) && *name != '_') - return 0; - } - return 1; -} - /* * Allocate new trace_probe and initialize it (including kprobes). */ @@ -651,7 +107,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, void *addr, const char *symbol, unsigned long offs, - int nargs, int is_return) + int nargs, bool is_return) { struct trace_probe *tp; int ret = -ENOMEM; @@ -702,34 +158,12 @@ error: return ERR_PTR(ret); } -static void update_probe_arg(struct probe_arg *arg) -{ - if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) - update_bitfield_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) - update_deref_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) - update_symbol_cache(arg->fetch.data); -} - -static void free_probe_arg(struct probe_arg *arg) -{ - if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) - free_bitfield_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) - free_deref_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) - free_symbol_cache(arg->fetch.data); - kfree(arg->name); - kfree(arg->comm); -} - static void free_trace_probe(struct trace_probe *tp) { int i; for (i = 0; i < tp->nr_args; i++) - free_probe_arg(&tp->args[i]); + traceprobe_free_probe_arg(&tp->args[i]); kfree(tp->call.class->system); kfree(tp->call.name); @@ -787,7 +221,7 @@ static int __register_trace_probe(struct trace_probe *tp) return -EINVAL; for (i = 0; i < tp->nr_args; i++) - update_probe_arg(&tp->args[i]); + traceprobe_update_arg(&tp->args[i]); /* Set/clear disabled flag according to tp->flag */ if (trace_probe_is_enabled(tp)) @@ -919,227 +353,6 @@ static struct notifier_block trace_probe_module_nb = { .priority = 1 /* Invoked after kprobe module callback */ }; -/* Split symbol and offset. */ -static int split_symbol_offset(char *symbol, unsigned long *offset) -{ - char *tmp; - int ret; - - if (!offset) - return -EINVAL; - - tmp = strchr(symbol, '+'); - if (tmp) { - /* skip sign because strict_strtol doesn't accept '+' */ - ret = strict_strtoul(tmp + 1, 0, offset); - if (ret) - return ret; - *tmp = '\0'; - } else - *offset = 0; - return 0; -} - -#define PARAM_MAX_ARGS 16 -#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) - -static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_param *f, int is_return) -{ - int ret = 0; - unsigned long param; - - if (strcmp(arg, "retval") == 0) { - if (is_return) - f->fn = t->fetch[FETCH_MTD_retval]; - else - ret = -EINVAL; - } else if (strncmp(arg, "stack", 5) == 0) { - if (arg[5] == '\0') { - if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) - f->fn = fetch_stack_address; - else - ret = -EINVAL; - } else if (isdigit(arg[5])) { - ret = strict_strtoul(arg + 5, 10, ¶m); - if (ret || param > PARAM_MAX_STACK) - ret = -EINVAL; - else { - f->fn = t->fetch[FETCH_MTD_stack]; - f->data = (void *)param; - } - } else - ret = -EINVAL; - } else - ret = -EINVAL; - return ret; -} - -/* Recursive argument parser */ -static int __parse_probe_arg(char *arg, const struct fetch_type *t, - struct fetch_param *f, int is_return) -{ - int ret = 0; - unsigned long param; - long offset; - char *tmp; - - switch (arg[0]) { - case '$': - ret = parse_probe_vars(arg + 1, t, f, is_return); - break; - case '%': /* named register */ - ret = regs_query_register_offset(arg + 1); - if (ret >= 0) { - f->fn = t->fetch[FETCH_MTD_reg]; - f->data = (void *)(unsigned long)ret; - ret = 0; - } - break; - case '@': /* memory or symbol */ - if (isdigit(arg[1])) { - ret = strict_strtoul(arg + 1, 0, ¶m); - if (ret) - break; - f->fn = t->fetch[FETCH_MTD_memory]; - f->data = (void *)param; - } else { - ret = split_symbol_offset(arg + 1, &offset); - if (ret) - break; - f->data = alloc_symbol_cache(arg + 1, offset); - if (f->data) - f->fn = t->fetch[FETCH_MTD_symbol]; - } - break; - case '+': /* deref memory */ - arg++; /* Skip '+', because strict_strtol() rejects it. */ - case '-': - tmp = strchr(arg, '('); - if (!tmp) - break; - *tmp = '\0'; - ret = strict_strtol(arg, 0, &offset); - if (ret) - break; - arg = tmp + 1; - tmp = strrchr(arg, ')'); - if (tmp) { - struct deref_fetch_param *dprm; - const struct fetch_type *t2 = find_fetch_type(NULL); - *tmp = '\0'; - dprm = kzalloc(sizeof(struct deref_fetch_param), - GFP_KERNEL); - if (!dprm) - return -ENOMEM; - dprm->offset = offset; - ret = __parse_probe_arg(arg, t2, &dprm->orig, - is_return); - if (ret) - kfree(dprm); - else { - f->fn = t->fetch[FETCH_MTD_deref]; - f->data = (void *)dprm; - } - } - break; - } - if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ - pr_info("%s type has no corresponding fetch method.\n", - t->name); - ret = -EINVAL; - } - return ret; -} - -#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) - -/* Bitfield type needs to be parsed into a fetch function */ -static int __parse_bitfield_probe_arg(const char *bf, - const struct fetch_type *t, - struct fetch_param *f) -{ - struct bitfield_fetch_param *bprm; - unsigned long bw, bo; - char *tail; - - if (*bf != 'b') - return 0; - - bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); - if (!bprm) - return -ENOMEM; - bprm->orig = *f; - f->fn = t->fetch[FETCH_MTD_bitfield]; - f->data = (void *)bprm; - - bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ - if (bw == 0 || *tail != '@') - return -EINVAL; - - bf = tail + 1; - bo = simple_strtoul(bf, &tail, 0); - if (tail == bf || *tail != '/') - return -EINVAL; - - bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); - bprm->low_shift = bprm->hi_shift + bo; - return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; -} - -/* String length checking wrapper */ -static int parse_probe_arg(char *arg, struct trace_probe *tp, - struct probe_arg *parg, int is_return) -{ - const char *t; - int ret; - - if (strlen(arg) > MAX_ARGSTR_LEN) { - pr_info("Argument is too long.: %s\n", arg); - return -ENOSPC; - } - parg->comm = kstrdup(arg, GFP_KERNEL); - if (!parg->comm) { - pr_info("Failed to allocate memory for command '%s'.\n", arg); - return -ENOMEM; - } - t = strchr(parg->comm, ':'); - if (t) { - arg[t - parg->comm] = '\0'; - t++; - } - parg->type = find_fetch_type(t); - if (!parg->type) { - pr_info("Unsupported type: %s\n", t); - return -EINVAL; - } - parg->offset = tp->size; - tp->size += parg->type->size; - ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); - if (ret >= 0 && t != NULL) - ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); - if (ret >= 0) { - parg->fetch_size.fn = get_fetch_size_function(parg->type, - parg->fetch.fn); - parg->fetch_size.data = parg->fetch.data; - } - return ret; -} - -/* Return 1 if name is reserved or already used by another argument */ -static int conflict_field_name(const char *name, - struct probe_arg *args, int narg) -{ - int i; - for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) - if (strcmp(reserved_field_names[i], name) == 0) - return 1; - for (i = 0; i < narg; i++) - if (strcmp(args[i].name, name) == 0) - return 1; - return 0; -} - static int create_trace_probe(int argc, char **argv) { /* @@ -1162,7 +375,7 @@ static int create_trace_probe(int argc, char **argv) */ struct trace_probe *tp; int i, ret = 0; - int is_return = 0, is_delete = 0; + bool is_return = false, is_delete = false; char *symbol = NULL, *event = NULL, *group = NULL; char *arg; unsigned long offset = 0; @@ -1171,11 +384,11 @@ static int create_trace_probe(int argc, char **argv) /* argc must be >= 1 */ if (argv[0][0] == 'p') - is_return = 0; + is_return = false; else if (argv[0][0] == 'r') - is_return = 1; + is_return = true; else if (argv[0][0] == '-') - is_delete = 1; + is_delete = true; else { pr_info("Probe definition must be started with 'p', 'r' or" " '-'.\n"); @@ -1240,7 +453,7 @@ static int create_trace_probe(int argc, char **argv) /* a symbol specified */ symbol = argv[1]; /* TODO: support .init module functions */ - ret = split_symbol_offset(symbol, &offset); + ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret) { pr_info("Failed to parse symbol.\n"); return ret; @@ -1302,7 +515,8 @@ static int create_trace_probe(int argc, char **argv) goto error; } - if (conflict_field_name(tp->args[i].name, tp->args, i)) { + if (traceprobe_conflict_field_name(tp->args[i].name, + tp->args, i)) { pr_info("Argument[%d] name '%s' conflicts with " "another field.\n", i, argv[i]); ret = -EINVAL; @@ -1310,7 +524,8 @@ static int create_trace_probe(int argc, char **argv) } /* Parse fetch argument */ - ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); + ret = traceprobe_parse_probe_arg(arg, &tp->size, &tp->args[i], + is_return, true); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; @@ -1412,70 +627,11 @@ static int probes_open(struct inode *inode, struct file *file) return seq_open(file, &probes_seq_op); } -static int command_trace_probe(const char *buf) -{ - char **argv; - int argc = 0, ret = 0; - - argv = argv_split(GFP_KERNEL, buf, &argc); - if (!argv) - return -ENOMEM; - - if (argc) - ret = create_trace_probe(argc, argv); - - argv_free(argv); - return ret; -} - -#define WRITE_BUFSIZE 4096 - static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - char *kbuf, *tmp; - int ret; - size_t done; - size_t size; - - kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); - if (!kbuf) - return -ENOMEM; - - ret = done = 0; - while (done < count) { - size = count - done; - if (size >= WRITE_BUFSIZE) - size = WRITE_BUFSIZE - 1; - if (copy_from_user(kbuf, buffer + done, size)) { - ret = -EFAULT; - goto out; - } - kbuf[size] = '\0'; - tmp = strchr(kbuf, '\n'); - if (tmp) { - *tmp = '\0'; - size = tmp - kbuf + 1; - } else if (done + size < count) { - pr_warning("Line length is too long: " - "Should be less than %d.", WRITE_BUFSIZE); - ret = -EINVAL; - goto out; - } - done += size; - /* Remove comments */ - tmp = strchr(kbuf, '#'); - if (tmp) - *tmp = '\0'; - - ret = command_trace_probe(kbuf); - if (ret) - goto out; - } - ret = done; -out: - kfree(kbuf); - return ret; + return traceprobe_probes_write(file, buffer, count, ppos, + create_trace_probe); } static const struct file_operations kprobe_events_ops = { @@ -1711,16 +867,6 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } -#undef DEFINE_FIELD -#define DEFINE_FIELD(type, item, name, is_signed) \ - do { \ - ret = trace_define_field(event_call, #type, name, \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed, \ - FILTER_OTHER); \ - if (ret) \ - return ret; \ - } while (0) static int kprobe_event_define_fields(struct ftrace_event_call *event_call) { @@ -2051,8 +1197,9 @@ static __init int kprobe_trace_self_tests_init(void) pr_info("Testing kprobe tracing: "); - ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " - "$stack $stack0 +0($stack)"); + ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " + "$stack $stack0 +0($stack)", + create_trace_probe); if (WARN_ON_ONCE(ret)) { pr_warning("error on probing function entry.\n"); warn++; @@ -2066,8 +1213,8 @@ static __init int kprobe_trace_self_tests_init(void) enable_trace_probe(tp, TP_FLAG_TRACE); } - ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " - "$retval"); + ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " + "$retval", create_trace_probe); if (WARN_ON_ONCE(ret)) { pr_warning("error on probing function return.\n"); warn++; @@ -2101,13 +1248,13 @@ static __init int kprobe_trace_self_tests_init(void) } else disable_trace_probe(tp, TP_FLAG_TRACE); - ret = command_trace_probe("-:testprobe"); + ret = traceprobe_command("-:testprobe", create_trace_probe); if (WARN_ON_ONCE(ret)) { pr_warning("error on deleting a probe.\n"); warn++; } - ret = command_trace_probe("-:testprobe2"); + ret = traceprobe_command("-:testprobe2", create_trace_probe); if (WARN_ON_ONCE(ret)) { pr_warning("error on deleting a probe.\n"); warn++; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c new file mode 100644 index 000000000000..daa9980153af --- /dev/null +++ b/kernel/trace/trace_probe.c @@ -0,0 +1,839 @@ +/* + * Common code for probe-based Dynamic events. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This code was copied from kernel/trace/trace_kprobe.c written by + * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> + * + * Updates to make this generic: + * Copyright (C) IBM Corporation, 2010-2011 + * Author: Srikar Dronamraju + */ + +#include "trace_probe.h" + +const char *reserved_field_names[] = { + "common_type", + "common_flags", + "common_preempt_count", + "common_pid", + "common_tgid", + FIELD_STRING_IP, + FIELD_STRING_RETIP, + FIELD_STRING_FUNC, +}; + +/* Printing function type */ +#define PRINT_TYPE_FUNC_NAME(type) print_type_##type +#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type + +/* Printing in basic type function template */ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ +static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ + const char *name, \ + void *data, void *ent)\ +{ \ + return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ +} \ +static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; + +DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) +DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) + +static inline void *get_rloc_data(u32 *dl) +{ + return (u8 *)dl + get_rloc_offs(*dl); +} + +/* For data_loc conversion */ +static inline void *get_loc_data(u32 *dl, void *ent) +{ + return (u8 *)ent + get_rloc_offs(*dl); +} + +/* For defining macros, define string/string_size types */ +typedef u32 string; +typedef u32 string_size; + +/* Print type function for string type */ +static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, + const char *name, + void *data, void *ent) +{ + int len = *(u32 *)data >> 16; + + if (!len) + return trace_seq_printf(s, " %s=(fault)", name); + else + return trace_seq_printf(s, " %s=\"%s\"", name, + (const char *)get_loc_data(data, ent)); +} + +static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; + +#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type +/* + * Define macro for basic types - we don't need to define s* types, because + * we have to care only about bitwidth at recording time. + */ +#define DEFINE_BASIC_FETCH_FUNCS(method) \ +DEFINE_FETCH_##method(u8) \ +DEFINE_FETCH_##method(u16) \ +DEFINE_FETCH_##method(u32) \ +DEFINE_FETCH_##method(u64) + +#define CHECK_FETCH_FUNCS(method, fn) \ + (((FETCH_FUNC_NAME(method, u8) == fn) || \ + (FETCH_FUNC_NAME(method, u16) == fn) || \ + (FETCH_FUNC_NAME(method, u32) == fn) || \ + (FETCH_FUNC_NAME(method, u64) == fn) || \ + (FETCH_FUNC_NAME(method, string) == fn) || \ + (FETCH_FUNC_NAME(method, string_size) == fn)) \ + && (fn != NULL)) + +/* Data fetch function templates */ +#define DEFINE_FETCH_reg(type) \ +static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_register(regs, \ + (unsigned int)((unsigned long)offset)); \ +} +DEFINE_BASIC_FETCH_FUNCS(reg) +/* No string on the register */ +#define fetch_reg_string NULL +#define fetch_reg_string_size NULL + +#define DEFINE_FETCH_stack(type) \ +static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ + (unsigned int)((unsigned long)offset)); \ +} +DEFINE_BASIC_FETCH_FUNCS(stack) +/* No string on the stack entry */ +#define fetch_stack_string NULL +#define fetch_stack_string_size NULL + +#define DEFINE_FETCH_retval(type) \ +static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ + void *dummy, void *dest) \ +{ \ + *(type *)dest = (type)regs_return_value(regs); \ +} +DEFINE_BASIC_FETCH_FUNCS(retval) +/* No string on the retval */ +#define fetch_retval_string NULL +#define fetch_retval_string_size NULL + +#define DEFINE_FETCH_memory(type) \ +static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ + void *addr, void *dest) \ +{ \ + type retval; \ + if (probe_kernel_address(addr, retval)) \ + *(type *)dest = 0; \ + else \ + *(type *)dest = retval; \ +} +DEFINE_BASIC_FETCH_FUNCS(memory) +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max + * length and relative data location. + */ +static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) +{ + long ret; + int maxlen = get_rloc_len(*(u32 *)dest); + u8 *dst = get_rloc_data(dest); + u8 *src = addr; + mm_segment_t old_fs = get_fs(); + + if (!maxlen) + return; + + /* + * Try to get string again, since the string can be changed while + * probing. + */ + set_fs(KERNEL_DS); + pagefault_disable(); + + do + ret = __copy_from_user_inatomic(dst++, src++, 1); + while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); + + dst[-1] = '\0'; + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) { /* Failed to fetch string */ + ((u8 *)get_rloc_data(dest))[0] = '\0'; + *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); + } else { + *(u32 *)dest = make_data_rloc(src - (u8 *)addr, + get_rloc_offs(*(u32 *)dest)); + } +} + +/* Return the length of string -- including null terminal byte */ +static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) +{ + mm_segment_t old_fs; + int ret, len = 0; + u8 c; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + pagefault_disable(); + + do { + ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) /* Failed to check the length */ + *(u32 *)dest = 0; + else + *(u32 *)dest = len; +} + +/* Memory fetching by symbol */ +struct symbol_cache { + char *symbol; + long offset; + unsigned long addr; +}; + +static unsigned long update_symbol_cache(struct symbol_cache *sc) +{ + sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); + + if (sc->addr) + sc->addr += sc->offset; + + return sc->addr; +} + +static void free_symbol_cache(struct symbol_cache *sc) +{ + kfree(sc->symbol); + kfree(sc); +} + +static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) +{ + struct symbol_cache *sc; + + if (!sym || strlen(sym) == 0) + return NULL; + + sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); + if (!sc) + return NULL; + + sc->symbol = kstrdup(sym, GFP_KERNEL); + if (!sc->symbol) { + kfree(sc); + return NULL; + } + sc->offset = offset; + update_symbol_cache(sc); + + return sc; +} + +#define DEFINE_FETCH_symbol(type) \ +static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct symbol_cache *sc = data; \ + if (sc->addr) \ + fetch_memory_##type(regs, (void *)sc->addr, dest); \ + else \ + *(type *)dest = 0; \ +} +DEFINE_BASIC_FETCH_FUNCS(symbol) +DEFINE_FETCH_symbol(string) +DEFINE_FETCH_symbol(string_size) + +/* Dereference memory access function */ +struct deref_fetch_param { + struct fetch_param orig; + long offset; +}; + +#define DEFINE_FETCH_deref(type) \ +static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct deref_fetch_param *dprm = data; \ + unsigned long addr; \ + call_fetch(&dprm->orig, regs, &addr); \ + if (addr) { \ + addr += dprm->offset; \ + fetch_memory_##type(regs, (void *)addr, dest); \ + } else \ + *(type *)dest = 0; \ +} +DEFINE_BASIC_FETCH_FUNCS(deref) +DEFINE_FETCH_deref(string) +DEFINE_FETCH_deref(string_size) + +static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) +{ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + update_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + update_symbol_cache(data->orig.data); +} + +static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) +{ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + free_symbol_cache(data->orig.data); + kfree(data); +} + +/* Bitfield fetch function */ +struct bitfield_fetch_param { + struct fetch_param orig; + unsigned char hi_shift; + unsigned char low_shift; +}; + +#define DEFINE_FETCH_bitfield(type) \ +static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct bitfield_fetch_param *bprm = data; \ + type buf = 0; \ + call_fetch(&bprm->orig, regs, &buf); \ + if (buf) { \ + buf <<= bprm->hi_shift; \ + buf >>= bprm->low_shift; \ + } \ + *(type *)dest = buf; \ +} + +DEFINE_BASIC_FETCH_FUNCS(bitfield) +#define fetch_bitfield_string NULL +#define fetch_bitfield_string_size NULL + +static __kprobes void +update_bitfield_fetch_param(struct bitfield_fetch_param *data) +{ + /* + * Don't check the bitfield itself, because this must be the + * last fetch function. + */ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + update_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + update_symbol_cache(data->orig.data); +} + +static __kprobes void +free_bitfield_fetch_param(struct bitfield_fetch_param *data) +{ + /* + * Don't check the bitfield itself, because this must be the + * last fetch function. + */ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + free_symbol_cache(data->orig.data); + + kfree(data); +} + +/* Default (unsigned long) fetch type */ +#define __DEFAULT_FETCH_TYPE(t) u##t +#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) +#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) +#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) + +#define ASSIGN_FETCH_FUNC(method, type) \ + [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) + +#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ + {.name = _name, \ + .size = _size, \ + .is_signed = sign, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ + .fmttype = _fmttype, \ + .fetch = { \ +ASSIGN_FETCH_FUNC(reg, ftype), \ +ASSIGN_FETCH_FUNC(stack, ftype), \ +ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(memory, ftype), \ +ASSIGN_FETCH_FUNC(symbol, ftype), \ +ASSIGN_FETCH_FUNC(deref, ftype), \ +ASSIGN_FETCH_FUNC(bitfield, ftype), \ + } \ + } + +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) + +#define FETCH_TYPE_STRING 0 +#define FETCH_TYPE_STRSIZE 1 + +/* Fetch type information table */ +static const struct fetch_type fetch_type_table[] = { + /* Special types */ + [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, + sizeof(u32), 1, "__data_loc char[]"), + [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, + string_size, sizeof(u32), 0, "u32"), + /* Basic types */ + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), +}; + +static const struct fetch_type *find_fetch_type(const char *type) +{ + int i; + + if (!type) + type = DEFAULT_FETCH_TYPE_STR; + + /* Special case: bitfield */ + if (*type == 'b') { + unsigned long bs; + + type = strchr(type, '/'); + if (!type) + goto fail; + + type++; + if (strict_strtoul(type, 0, &bs)) + goto fail; + + switch (bs) { + case 8: + return find_fetch_type("u8"); + case 16: + return find_fetch_type("u16"); + case 32: + return find_fetch_type("u32"); + case 64: + return find_fetch_type("u64"); + default: + goto fail; + } + } + + for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) + if (strcmp(type, fetch_type_table[i].name) == 0) + return &fetch_type_table[i]; + +fail: + return NULL; +} + +/* Special function : only accept unsigned long */ +static __kprobes void fetch_stack_address(struct pt_regs *regs, + void *dummy, void *dest) +{ + *(unsigned long *)dest = kernel_stack_pointer(regs); +} + +static fetch_func_t get_fetch_size_function(const struct fetch_type *type, + fetch_func_t orig_fn) +{ + int i; + + if (type != &fetch_type_table[FETCH_TYPE_STRING]) + return NULL; /* Only string type needs size function */ + + for (i = 0; i < FETCH_MTD_END; i++) + if (type->fetch[i] == orig_fn) + return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; + + WARN_ON(1); /* This should not happen */ + + return NULL; +} + +/* Split symbol and offset. */ +int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) +{ + char *tmp; + int ret; + + if (!offset) + return -EINVAL; + + tmp = strchr(symbol, '+'); + if (tmp) { + /* skip sign because strict_strtol doesn't accept '+' */ + ret = strict_strtoul(tmp + 1, 0, offset); + if (ret) + return ret; + + *tmp = '\0'; + } else + *offset = 0; + + return 0; +} + +#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) + +static int parse_probe_vars(char *arg, const struct fetch_type *t, + struct fetch_param *f, bool is_return) +{ + int ret = 0; + unsigned long param; + + if (strcmp(arg, "retval") == 0) { + if (is_return) + f->fn = t->fetch[FETCH_MTD_retval]; + else + ret = -EINVAL; + } else if (strncmp(arg, "stack", 5) == 0) { + if (arg[5] == '\0') { + if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) + f->fn = fetch_stack_address; + else + ret = -EINVAL; + } else if (isdigit(arg[5])) { + ret = strict_strtoul(arg + 5, 10, ¶m); + if (ret || param > PARAM_MAX_STACK) + ret = -EINVAL; + else { + f->fn = t->fetch[FETCH_MTD_stack]; + f->data = (void *)param; + } + } else + ret = -EINVAL; + } else + ret = -EINVAL; + + return ret; +} + +/* Recursive argument parser */ +static int parse_probe_arg(char *arg, const struct fetch_type *t, + struct fetch_param *f, bool is_return, bool is_kprobe) +{ + unsigned long param; + long offset; + char *tmp; + int ret; + + ret = 0; + + /* Until uprobe_events supports only reg arguments */ + if (!is_kprobe && arg[0] != '%') + return -EINVAL; + + switch (arg[0]) { + case '$': + ret = parse_probe_vars(arg + 1, t, f, is_return); + break; + + case '%': /* named register */ + ret = regs_query_register_offset(arg + 1); + if (ret >= 0) { + f->fn = t->fetch[FETCH_MTD_reg]; + f->data = (void *)(unsigned long)ret; + ret = 0; + } + break; + + case '@': /* memory or symbol */ + if (isdigit(arg[1])) { + ret = strict_strtoul(arg + 1, 0, ¶m); + if (ret) + break; + + f->fn = t->fetch[FETCH_MTD_memory]; + f->data = (void *)param; + } else { + ret = traceprobe_split_symbol_offset(arg + 1, &offset); + if (ret) + break; + + f->data = alloc_symbol_cache(arg + 1, offset); + if (f->data) + f->fn = t->fetch[FETCH_MTD_symbol]; + } + break; + + case '+': /* deref memory */ + arg++; /* Skip '+', because strict_strtol() rejects it. */ + case '-': + tmp = strchr(arg, '('); + if (!tmp) + break; + + *tmp = '\0'; + ret = strict_strtol(arg, 0, &offset); + + if (ret) + break; + + arg = tmp + 1; + tmp = strrchr(arg, ')'); + + if (tmp) { + struct deref_fetch_param *dprm; + const struct fetch_type *t2; + + t2 = find_fetch_type(NULL); + *tmp = '\0'; + dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); + + if (!dprm) + return -ENOMEM; + + dprm->offset = offset; + ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, + is_kprobe); + if (ret) + kfree(dprm); + else { + f->fn = t->fetch[FETCH_MTD_deref]; + f->data = (void *)dprm; + } + } + break; + } + if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ + pr_info("%s type has no corresponding fetch method.\n", t->name); + ret = -EINVAL; + } + + return ret; +} + +#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) + +/* Bitfield type needs to be parsed into a fetch function */ +static int __parse_bitfield_probe_arg(const char *bf, + const struct fetch_type *t, + struct fetch_param *f) +{ + struct bitfield_fetch_param *bprm; + unsigned long bw, bo; + char *tail; + + if (*bf != 'b') + return 0; + + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + if (!bprm) + return -ENOMEM; + + bprm->orig = *f; + f->fn = t->fetch[FETCH_MTD_bitfield]; + f->data = (void *)bprm; + bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ + + if (bw == 0 || *tail != '@') + return -EINVAL; + + bf = tail + 1; + bo = simple_strtoul(bf, &tail, 0); + + if (tail == bf || *tail != '/') + return -EINVAL; + + bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); + bprm->low_shift = bprm->hi_shift + bo; + + return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; +} + +/* String length checking wrapper */ +int traceprobe_parse_probe_arg(char *arg, ssize_t *size, + struct probe_arg *parg, bool is_return, bool is_kprobe) +{ + const char *t; + int ret; + + if (strlen(arg) > MAX_ARGSTR_LEN) { + pr_info("Argument is too long.: %s\n", arg); + return -ENOSPC; + } + parg->comm = kstrdup(arg, GFP_KERNEL); + if (!parg->comm) { + pr_info("Failed to allocate memory for command '%s'.\n", arg); + return -ENOMEM; + } + t = strchr(parg->comm, ':'); + if (t) { + arg[t - parg->comm] = '\0'; + t++; + } + parg->type = find_fetch_type(t); + if (!parg->type) { + pr_info("Unsupported type: %s\n", t); + return -EINVAL; + } + parg->offset = *size; + *size += parg->type->size; + ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe); + + if (ret >= 0 && t != NULL) + ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); + + if (ret >= 0) { + parg->fetch_size.fn = get_fetch_size_function(parg->type, + parg->fetch.fn); + parg->fetch_size.data = parg->fetch.data; + } + + return ret; +} + +/* Return 1 if name is reserved or already used by another argument */ +int traceprobe_conflict_field_name(const char *name, + struct probe_arg *args, int narg) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) + if (strcmp(reserved_field_names[i], name) == 0) + return 1; + + for (i = 0; i < narg; i++) + if (strcmp(args[i].name, name) == 0) + return 1; + + return 0; +} + +void traceprobe_update_arg(struct probe_arg *arg) +{ + if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) + update_bitfield_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) + update_deref_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) + update_symbol_cache(arg->fetch.data); +} + +void traceprobe_free_probe_arg(struct probe_arg *arg) +{ + if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) + free_bitfield_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) + free_deref_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) + free_symbol_cache(arg->fetch.data); + + kfree(arg->name); + kfree(arg->comm); +} + +int traceprobe_command(const char *buf, int (*createfn)(int, char **)) +{ + char **argv; + int argc, ret; + + argc = 0; + ret = 0; + argv = argv_split(GFP_KERNEL, buf, &argc); + if (!argv) + return -ENOMEM; + + if (argc) + ret = createfn(argc, argv); + + argv_free(argv); + + return ret; +} + +#define WRITE_BUFSIZE 4096 + +ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos, + int (*createfn)(int, char **)) +{ + char *kbuf, *tmp; + int ret = 0; + size_t done = 0; + size_t size; + + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + while (done < count) { + size = count - done; + + if (size >= WRITE_BUFSIZE) + size = WRITE_BUFSIZE - 1; + + if (copy_from_user(kbuf, buffer + done, size)) { + ret = -EFAULT; + goto out; + } + kbuf[size] = '\0'; + tmp = strchr(kbuf, '\n'); + + if (tmp) { + *tmp = '\0'; + size = tmp - kbuf + 1; + } else if (done + size < count) { + pr_warning("Line length is too long: " + "Should be less than %d.", WRITE_BUFSIZE); + ret = -EINVAL; + goto out; + } + done += size; + /* Remove comments */ + tmp = strchr(kbuf, '#'); + + if (tmp) + *tmp = '\0'; + + ret = traceprobe_command(kbuf, createfn); + if (ret) + goto out; + } + ret = done; + +out: + kfree(kbuf); + + return ret; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h new file mode 100644 index 000000000000..933708677814 --- /dev/null +++ b/kernel/trace/trace_probe.h @@ -0,0 +1,161 @@ +/* + * Common header file for probe-based Dynamic events. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This code was copied from kernel/trace/trace_kprobe.h written by + * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> + * + * Updates to make this generic: + * Copyright (C) IBM Corporation, 2010-2011 + * Author: Srikar Dronamraju + */ + +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/debugfs.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/ptrace.h> +#include <linux/perf_event.h> +#include <linux/kprobes.h> +#include <linux/stringify.h> +#include <linux/limits.h> +#include <linux/uaccess.h> +#include <asm/bitsperlong.h> + +#include "trace.h" +#include "trace_output.h" + +#define MAX_TRACE_ARGS 128 +#define MAX_ARGSTR_LEN 63 +#define MAX_EVENT_NAME_LEN 64 +#define MAX_STRING_SIZE PATH_MAX + +/* Reserved field names */ +#define FIELD_STRING_IP "__probe_ip" +#define FIELD_STRING_RETIP "__probe_ret_ip" +#define FIELD_STRING_FUNC "__probe_func" + +#undef DEFINE_FIELD +#define DEFINE_FIELD(type, item, name, is_signed) \ + do { \ + ret = trace_define_field(event_call, #type, name, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed, \ + FILTER_OTHER); \ + if (ret) \ + return ret; \ + } while (0) + + +/* Flags for trace_probe */ +#define TP_FLAG_TRACE 1 +#define TP_FLAG_PROFILE 2 +#define TP_FLAG_REGISTERED 4 +#define TP_FLAG_UPROBE 8 + + +/* data_rloc: data relative location, compatible with u32 */ +#define make_data_rloc(len, roffs) \ + (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) +#define get_rloc_len(dl) ((u32)(dl) >> 16) +#define get_rloc_offs(dl) ((u32)(dl) & 0xffff) + +/* + * Convert data_rloc to data_loc: + * data_rloc stores the offset from data_rloc itself, but data_loc + * stores the offset from event entry. + */ +#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) + +/* Data fetch function type */ +typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); +/* Printing function type */ +typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *); + +/* Fetch types */ +enum { + FETCH_MTD_reg = 0, + FETCH_MTD_stack, + FETCH_MTD_retval, + FETCH_MTD_memory, + FETCH_MTD_symbol, + FETCH_MTD_deref, + FETCH_MTD_bitfield, + FETCH_MTD_END, +}; + +/* Fetch type information table */ +struct fetch_type { + const char *name; /* Name of type */ + size_t size; /* Byte size of type */ + int is_signed; /* Signed flag */ + print_type_func_t print; /* Print functions */ + const char *fmt; /* Fromat string */ + const char *fmttype; /* Name in format file */ + /* Fetch functions */ + fetch_func_t fetch[FETCH_MTD_END]; +}; + +struct fetch_param { + fetch_func_t fn; + void *data; +}; + +struct probe_arg { + struct fetch_param fetch; + struct fetch_param fetch_size; + unsigned int offset; /* Offset from argument entry */ + const char *name; /* Name of this argument */ + const char *comm; /* Command of this argument */ + const struct fetch_type *type; /* Type of this argument */ +}; + +static inline __kprobes void call_fetch(struct fetch_param *fprm, + struct pt_regs *regs, void *dest) +{ + return fprm->fn(regs, fprm->data, dest); +} + +/* Check the name is good for event/group/fields */ +static inline int is_good_name(const char *name) +{ + if (!isalpha(*name) && *name != '_') + return 0; + while (*++name != '\0') { + if (!isalpha(*name) && !isdigit(*name) && *name != '_') + return 0; + } + return 1; +} + +extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, + struct probe_arg *parg, bool is_return, bool is_kprobe); + +extern int traceprobe_conflict_field_name(const char *name, + struct probe_arg *args, int narg); + +extern void traceprobe_update_arg(struct probe_arg *arg); +extern void traceprobe_free_probe_arg(struct probe_arg *arg); + +extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset); + +extern ssize_t traceprobe_probes_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos, + int (*createfn)(int, char**)); + +extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c new file mode 100644 index 000000000000..2b36ac68549e --- /dev/null +++ b/kernel/trace/trace_uprobe.c @@ -0,0 +1,788 @@ +/* + * uprobes-based tracing events + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (C) IBM Corporation, 2010-2012 + * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com> + */ + +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/uprobes.h> +#include <linux/namei.h> + +#include "trace_probe.h" + +#define UPROBE_EVENT_SYSTEM "uprobes" + +/* + * uprobe event core functions + */ +struct trace_uprobe; +struct uprobe_trace_consumer { + struct uprobe_consumer cons; + struct trace_uprobe *tu; +}; + +struct trace_uprobe { + struct list_head list; + struct ftrace_event_class class; + struct ftrace_event_call call; + struct uprobe_trace_consumer *consumer; + struct inode *inode; + char *filename; + unsigned long offset; + unsigned long nhit; + unsigned int flags; /* For TP_FLAG_* */ + ssize_t size; /* trace entry size */ + unsigned int nr_args; + struct probe_arg args[]; +}; + +#define SIZEOF_TRACE_UPROBE(n) \ + (offsetof(struct trace_uprobe, args) + \ + (sizeof(struct probe_arg) * (n))) + +static int register_uprobe_event(struct trace_uprobe *tu); +static void unregister_uprobe_event(struct trace_uprobe *tu); + +static DEFINE_MUTEX(uprobe_lock); +static LIST_HEAD(uprobe_list); + +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); + +/* + * Allocate new trace_uprobe and initialize it (including uprobes). + */ +static struct trace_uprobe * +alloc_trace_uprobe(const char *group, const char *event, int nargs) +{ + struct trace_uprobe *tu; + + if (!event || !is_good_name(event)) + return ERR_PTR(-EINVAL); + + if (!group || !is_good_name(group)) + return ERR_PTR(-EINVAL); + + tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL); + if (!tu) + return ERR_PTR(-ENOMEM); + + tu->call.class = &tu->class; + tu->call.name = kstrdup(event, GFP_KERNEL); + if (!tu->call.name) + goto error; + + tu->class.system = kstrdup(group, GFP_KERNEL); + if (!tu->class.system) + goto error; + + INIT_LIST_HEAD(&tu->list); + return tu; + +error: + kfree(tu->call.name); + kfree(tu); + + return ERR_PTR(-ENOMEM); +} + +static void free_trace_uprobe(struct trace_uprobe *tu) +{ + int i; + + for (i = 0; i < tu->nr_args; i++) + traceprobe_free_probe_arg(&tu->args[i]); + + iput(tu->inode); + kfree(tu->call.class->system); + kfree(tu->call.name); + kfree(tu->filename); + kfree(tu); +} + +static struct trace_uprobe *find_probe_event(const char *event, const char *group) +{ + struct trace_uprobe *tu; + + list_for_each_entry(tu, &uprobe_list, list) + if (strcmp(tu->call.name, event) == 0 && + strcmp(tu->call.class->system, group) == 0) + return tu; + + return NULL; +} + +/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */ +static void unregister_trace_uprobe(struct trace_uprobe *tu) +{ + list_del(&tu->list); + unregister_uprobe_event(tu); + free_trace_uprobe(tu); +} + +/* Register a trace_uprobe and probe_event */ +static int register_trace_uprobe(struct trace_uprobe *tu) +{ + struct trace_uprobe *old_tp; + int ret; + + mutex_lock(&uprobe_lock); + + /* register as an event */ + old_tp = find_probe_event(tu->call.name, tu->call.class->system); + if (old_tp) + /* delete old event */ + unregister_trace_uprobe(old_tp); + + ret = register_uprobe_event(tu); + if (ret) { + pr_warning("Failed to register probe event(%d)\n", ret); + goto end; + } + + list_add_tail(&tu->list, &uprobe_list); + +end: + mutex_unlock(&uprobe_lock); + + return ret; +} + +/* + * Argument syntax: + * - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS] + * + * - Remove uprobe: -:[GRP/]EVENT + */ +static int create_trace_uprobe(int argc, char **argv) +{ + struct trace_uprobe *tu; + struct inode *inode; + char *arg, *event, *group, *filename; + char buf[MAX_EVENT_NAME_LEN]; + struct path path; + unsigned long offset; + bool is_delete; + int i, ret; + + inode = NULL; + ret = 0; + is_delete = false; + event = NULL; + group = NULL; + + /* argc must be >= 1 */ + if (argv[0][0] == '-') + is_delete = true; + else if (argv[0][0] != 'p') { + pr_info("Probe definition must be started with 'p', 'r' or" " '-'.\n"); + return -EINVAL; + } + + if (argv[0][1] == ':') { + event = &argv[0][2]; + arg = strchr(event, '/'); + + if (arg) { + group = event; + event = arg + 1; + event[-1] = '\0'; + + if (strlen(group) == 0) { + pr_info("Group name is not specified\n"); + return -EINVAL; + } + } + if (strlen(event) == 0) { + pr_info("Event name is not specified\n"); + return -EINVAL; + } + } + if (!group) + group = UPROBE_EVENT_SYSTEM; + + if (is_delete) { + if (!event) { + pr_info("Delete command needs an event name.\n"); + return -EINVAL; + } + mutex_lock(&uprobe_lock); + tu = find_probe_event(event, group); + + if (!tu) { + mutex_unlock(&uprobe_lock); + pr_info("Event %s/%s doesn't exist.\n", group, event); + return -ENOENT; + } + /* delete an event */ + unregister_trace_uprobe(tu); + mutex_unlock(&uprobe_lock); + return 0; + } + + if (argc < 2) { + pr_info("Probe point is not specified.\n"); + return -EINVAL; + } + if (isdigit(argv[1][0])) { + pr_info("probe point must be have a filename.\n"); + return -EINVAL; + } + arg = strchr(argv[1], ':'); + if (!arg) + goto fail_address_parse; + + *arg++ = '\0'; + filename = argv[1]; + ret = kern_path(filename, LOOKUP_FOLLOW, &path); + if (ret) + goto fail_address_parse; + + ret = strict_strtoul(arg, 0, &offset); + if (ret) + goto fail_address_parse; + + inode = igrab(path.dentry->d_inode); + + argc -= 2; + argv += 2; + + /* setup a probe */ + if (!event) { + char *tail = strrchr(filename, '/'); + char *ptr; + + ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL); + if (!ptr) { + ret = -ENOMEM; + goto fail_address_parse; + } + + tail = ptr; + ptr = strpbrk(tail, ".-_"); + if (ptr) + *ptr = '\0'; + + snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset); + event = buf; + kfree(tail); + } + + tu = alloc_trace_uprobe(group, event, argc); + if (IS_ERR(tu)) { + pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); + ret = PTR_ERR(tu); + goto fail_address_parse; + } + tu->offset = offset; + tu->inode = inode; + tu->filename = kstrdup(filename, GFP_KERNEL); + + if (!tu->filename) { + pr_info("Failed to allocate filename.\n"); + ret = -ENOMEM; + goto error; + } + + /* parse arguments */ + ret = 0; + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + /* Increment count for freeing args in error case */ + tu->nr_args++; + + /* Parse argument name */ + arg = strchr(argv[i], '='); + if (arg) { + *arg++ = '\0'; + tu->args[i].name = kstrdup(argv[i], GFP_KERNEL); + } else { + arg = argv[i]; + /* If argument name is omitted, set "argN" */ + snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); + tu->args[i].name = kstrdup(buf, GFP_KERNEL); + } + + if (!tu->args[i].name) { + pr_info("Failed to allocate argument[%d] name.\n", i); + ret = -ENOMEM; + goto error; + } + + if (!is_good_name(tu->args[i].name)) { + pr_info("Invalid argument[%d] name: %s\n", i, tu->args[i].name); + ret = -EINVAL; + goto error; + } + + if (traceprobe_conflict_field_name(tu->args[i].name, tu->args, i)) { + pr_info("Argument[%d] name '%s' conflicts with " + "another field.\n", i, argv[i]); + ret = -EINVAL; + goto error; + } + + /* Parse fetch argument */ + ret = traceprobe_parse_probe_arg(arg, &tu->size, &tu->args[i], false, false); + if (ret) { + pr_info("Parse error at argument[%d]. (%d)\n", i, ret); + goto error; + } + } + + ret = register_trace_uprobe(tu); + if (ret) + goto error; + return 0; + +error: + free_trace_uprobe(tu); + return ret; + +fail_address_parse: + if (inode) + iput(inode); + + pr_info("Failed to parse address.\n"); + + return ret; +} + +static void cleanup_all_probes(void) +{ + struct trace_uprobe *tu; + + mutex_lock(&uprobe_lock); + while (!list_empty(&uprobe_list)) { + tu = list_entry(uprobe_list.next, struct trace_uprobe, list); + unregister_trace_uprobe(tu); + } + mutex_unlock(&uprobe_lock); +} + +/* Probes listing interfaces */ +static void *probes_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&uprobe_lock); + return seq_list_start(&uprobe_list, *pos); +} + +static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &uprobe_list, pos); +} + +static void probes_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&uprobe_lock); +} + +static int probes_seq_show(struct seq_file *m, void *v) +{ + struct trace_uprobe *tu = v; + int i; + + seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name); + seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); + + for (i = 0; i < tu->nr_args; i++) + seq_printf(m, " %s=%s", tu->args[i].name, tu->args[i].comm); + + seq_printf(m, "\n"); + return 0; +} + +static const struct seq_operations probes_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_seq_show +}; + +static int probes_open(struct inode *inode, struct file *file) +{ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) + cleanup_all_probes(); + + return seq_open(file, &probes_seq_op); +} + +static ssize_t probes_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe); +} + +static const struct file_operations uprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, +}; + +/* Probes profiling interfaces */ +static int probes_profile_seq_show(struct seq_file *m, void *v) +{ + struct trace_uprobe *tu = v; + + seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->call.name, tu->nhit); + return 0; +} + +static const struct seq_operations profile_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_profile_seq_show +}; + +static int profile_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &profile_seq_op); +} + +static const struct file_operations uprobe_profile_ops = { + .owner = THIS_MODULE, + .open = profile_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* uprobe handler */ +static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +{ + struct uprobe_trace_entry_head *entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + u8 *data; + int size, i, pc; + unsigned long irq_flags; + struct ftrace_event_call *call = &tu->call; + + tu->nhit++; + + local_save_flags(irq_flags); + pc = preempt_count(); + + size = sizeof(*entry) + tu->size; + + event = trace_current_buffer_lock_reserve(&buffer, call->event.type, + size, irq_flags, pc); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); + data = (u8 *)&entry[1]; + for (i = 0; i < tu->nr_args; i++) + call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); + + if (!filter_current_check_discard(buffer, call, entry, event)) + trace_buffer_unlock_commit(buffer, event, irq_flags, pc); +} + +/* Event entry printers */ +static enum print_line_t +print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) +{ + struct uprobe_trace_entry_head *field; + struct trace_seq *s = &iter->seq; + struct trace_uprobe *tu; + u8 *data; + int i; + + field = (struct uprobe_trace_entry_head *)iter->ent; + tu = container_of(event, struct trace_uprobe, call.event); + + if (!trace_seq_printf(s, "%s: (", tu->call.name)) + goto partial; + + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, ")")) + goto partial; + + data = (u8 *)&field[1]; + for (i = 0; i < tu->nr_args; i++) { + if (!tu->args[i].type->print(s, tu->args[i].name, + data + tu->args[i].offset, field)) + goto partial; + } + + if (trace_seq_puts(s, "\n")) + return TRACE_TYPE_HANDLED; + +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static int probe_event_enable(struct trace_uprobe *tu, int flag) +{ + struct uprobe_trace_consumer *utc; + int ret = 0; + + if (!tu->inode || tu->consumer) + return -EINTR; + + utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); + if (!utc) + return -EINTR; + + utc->cons.handler = uprobe_dispatcher; + utc->cons.filter = NULL; + ret = uprobe_register(tu->inode, tu->offset, &utc->cons); + if (ret) { + kfree(utc); + return ret; + } + + tu->flags |= flag; + utc->tu = tu; + tu->consumer = utc; + + return 0; +} + +static void probe_event_disable(struct trace_uprobe *tu, int flag) +{ + if (!tu->inode || !tu->consumer) + return; + + uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); + tu->flags &= ~flag; + kfree(tu->consumer); + tu->consumer = NULL; +} + +static int uprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i; + struct uprobe_trace_entry_head field; + struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data; + + DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); + /* Set argument names as fields */ + for (i = 0; i < tu->nr_args; i++) { + ret = trace_define_field(event_call, tu->args[i].type->fmttype, + tu->args[i].name, + sizeof(field) + tu->args[i].offset, + tu->args[i].type->size, + tu->args[i].type->is_signed, + FILTER_OTHER); + + if (ret) + return ret; + } + return 0; +} + +#define LEN_OR_ZERO (len ? len - pos : 0) +static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len) +{ + const char *fmt, *arg; + int i; + int pos = 0; + + fmt = "(%lx)"; + arg = "REC->" FIELD_STRING_IP; + + /* When len=0, we just calculate the needed length */ + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); + + for (i = 0; i < tu->nr_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", + tu->args[i].name, tu->args[i].type->fmt); + } + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); + + for (i = 0; i < tu->nr_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", + tu->args[i].name); + } + + return pos; /* return the length of print_fmt */ +} +#undef LEN_OR_ZERO + +static int set_print_fmt(struct trace_uprobe *tu) +{ + char *print_fmt; + int len; + + /* First: called with 0 length to calculate the needed length */ + len = __set_print_fmt(tu, NULL, 0); + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_print_fmt(tu, print_fmt, len + 1); + tu->call.print_fmt = print_fmt; + + return 0; +} + +#ifdef CONFIG_PERF_EVENTS +/* uprobe profile handler */ +static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +{ + struct ftrace_event_call *call = &tu->call; + struct uprobe_trace_entry_head *entry; + struct hlist_head *head; + u8 *data; + int size, __size, i; + int rctx; + + __size = sizeof(*entry) + tu->size; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) + return; + + preempt_disable(); + + entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + if (!entry) + goto out; + + entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); + data = (u8 *)&entry[1]; + for (i = 0; i < tu->nr_args; i++) + call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); + + head = this_cpu_ptr(call->perf_events); + perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); + + out: + preempt_enable(); +} +#endif /* CONFIG_PERF_EVENTS */ + +static +int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) +{ + struct trace_uprobe *tu = (struct trace_uprobe *)event->data; + + switch (type) { + case TRACE_REG_REGISTER: + return probe_event_enable(tu, TP_FLAG_TRACE); + + case TRACE_REG_UNREGISTER: + probe_event_disable(tu, TP_FLAG_TRACE); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return probe_event_enable(tu, TP_FLAG_PROFILE); + + case TRACE_REG_PERF_UNREGISTER: + probe_event_disable(tu, TP_FLAG_PROFILE); + return 0; +#endif + default: + return 0; + } + return 0; +} + +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) +{ + struct uprobe_trace_consumer *utc; + struct trace_uprobe *tu; + + utc = container_of(con, struct uprobe_trace_consumer, cons); + tu = utc->tu; + if (!tu || tu->consumer != utc) + return 0; + + if (tu->flags & TP_FLAG_TRACE) + uprobe_trace_func(tu, regs); + +#ifdef CONFIG_PERF_EVENTS + if (tu->flags & TP_FLAG_PROFILE) + uprobe_perf_func(tu, regs); +#endif + return 0; +} + +static struct trace_event_functions uprobe_funcs = { + .trace = print_uprobe_event +}; + +static int register_uprobe_event(struct trace_uprobe *tu) +{ + struct ftrace_event_call *call = &tu->call; + int ret; + + /* Initialize ftrace_event_call */ + INIT_LIST_HEAD(&call->class->fields); + call->event.funcs = &uprobe_funcs; + call->class->define_fields = uprobe_event_define_fields; + + if (set_print_fmt(tu) < 0) + return -ENOMEM; + + ret = register_ftrace_event(&call->event); + if (!ret) { + kfree(call->print_fmt); + return -ENODEV; + } + call->flags = 0; + call->class->reg = trace_uprobe_register; + call->data = tu; + ret = trace_add_event_call(call); + + if (ret) { + pr_info("Failed to register uprobe event: %s\n", call->name); + kfree(call->print_fmt); + unregister_ftrace_event(&call->event); + } + + return ret; +} + +static void unregister_uprobe_event(struct trace_uprobe *tu) +{ + /* tu->event is unregistered in trace_remove_event_call() */ + trace_remove_event_call(&tu->call); + kfree(tu->call.print_fmt); + tu->call.print_fmt = NULL; +} + +/* Make a trace interface for controling probe points */ +static __init int init_uprobe_trace(void) +{ + struct dentry *d_tracer; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + trace_create_file("uprobe_events", 0644, d_tracer, + NULL, &uprobe_events_ops); + /* Profile interface */ + trace_create_file("uprobe_profile", 0444, d_tracer, + NULL, &uprobe_profile_ops); + return 0; +} + +fs_initcall(init_uprobe_trace); diff --git a/kernel/uid16.c b/kernel/uid16.c index 51c6e89e8619..d7948eb10225 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -81,14 +81,19 @@ SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) return ret; } -SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid) +SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp) { const struct cred *cred = current_cred(); int retval; + old_uid_t ruid, euid, suid; - if (!(retval = put_user(high2lowuid(cred->uid), ruid)) && - !(retval = put_user(high2lowuid(cred->euid), euid))) - retval = put_user(high2lowuid(cred->suid), suid); + ruid = high2lowuid(from_kuid_munged(cred->user_ns, cred->uid)); + euid = high2lowuid(from_kuid_munged(cred->user_ns, cred->euid)); + suid = high2lowuid(from_kuid_munged(cred->user_ns, cred->suid)); + + if (!(retval = put_user(ruid, ruidp)) && + !(retval = put_user(euid, euidp))) + retval = put_user(suid, suidp); return retval; } @@ -103,14 +108,19 @@ SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) } -SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid) +SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp) { const struct cred *cred = current_cred(); int retval; + old_gid_t rgid, egid, sgid; + + rgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->gid)); + egid = high2lowgid(from_kgid_munged(cred->user_ns, cred->egid)); + sgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->sgid)); - if (!(retval = put_user(high2lowgid(cred->gid), rgid)) && - !(retval = put_user(high2lowgid(cred->egid), egid))) - retval = put_user(high2lowgid(cred->sgid), sgid); + if (!(retval = put_user(rgid, rgidp)) && + !(retval = put_user(egid, egidp))) + retval = put_user(sgid, sgidp); return retval; } @@ -134,11 +144,14 @@ SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) static int groups16_to_user(old_gid_t __user *grouplist, struct group_info *group_info) { + struct user_namespace *user_ns = current_user_ns(); int i; old_gid_t group; + kgid_t kgid; for (i = 0; i < group_info->ngroups; i++) { - group = high2lowgid(GROUP_AT(group_info, i)); + kgid = GROUP_AT(group_info, i); + group = high2lowgid(from_kgid_munged(user_ns, kgid)); if (put_user(group, grouplist+i)) return -EFAULT; } @@ -149,13 +162,20 @@ static int groups16_to_user(old_gid_t __user *grouplist, static int groups16_from_user(struct group_info *group_info, old_gid_t __user *grouplist) { + struct user_namespace *user_ns = current_user_ns(); int i; old_gid_t group; + kgid_t kgid; for (i = 0; i < group_info->ngroups; i++) { if (get_user(group, grouplist+i)) return -EFAULT; - GROUP_AT(group_info, i) = low2highgid(group); + + kgid = make_kgid(user_ns, low2highgid(group)); + if (!gid_valid(kgid)) + return -EINVAL; + + GROUP_AT(group_info, i) = kgid; } return 0; @@ -211,20 +231,20 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) SYSCALL_DEFINE0(getuid16) { - return high2lowuid(current_uid()); + return high2lowuid(from_kuid_munged(current_user_ns(), current_uid())); } SYSCALL_DEFINE0(geteuid16) { - return high2lowuid(current_euid()); + return high2lowuid(from_kuid_munged(current_user_ns(), current_euid())); } SYSCALL_DEFINE0(getgid16) { - return high2lowgid(current_gid()); + return high2lowgid(from_kgid_munged(current_user_ns(), current_gid())); } SYSCALL_DEFINE0(getegid16) { - return high2lowgid(current_egid()); + return high2lowgid(from_kgid_munged(current_user_ns(), current_egid())); } diff --git a/kernel/user.c b/kernel/user.c index 71dd2363ab0f..b815fefbe76f 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -22,10 +22,27 @@ * and 1 for... ? */ struct user_namespace init_user_ns = { + .uid_map = { + .nr_extents = 1, + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, + }, + .gid_map = { + .nr_extents = 1, + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, + }, .kref = { .refcount = ATOMIC_INIT(3), }, - .creator = &root_user, + .owner = GLOBAL_ROOT_UID, + .group = GLOBAL_ROOT_GID, }; EXPORT_SYMBOL_GPL(init_user_ns); @@ -34,11 +51,14 @@ EXPORT_SYMBOL_GPL(init_user_ns); * when changing user ID's (ie setuid() and friends). */ +#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7) +#define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) -#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid))) +#define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) static struct kmem_cache *uid_cachep; +struct hlist_head uidhash_table[UIDHASH_SZ]; /* * The uidhash_lock is mostly taken from process context, but it is @@ -51,14 +71,14 @@ static struct kmem_cache *uid_cachep; */ static DEFINE_SPINLOCK(uidhash_lock); -/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */ +/* root_user.__count is 1, for init task cred */ struct user_struct root_user = { - .__count = ATOMIC_INIT(2), + .__count = ATOMIC_INIT(1), .processes = ATOMIC_INIT(1), .files = ATOMIC_INIT(0), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, - .user_ns = &init_user_ns, + .uid = GLOBAL_ROOT_UID, }; /* @@ -72,16 +92,15 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) static void uid_hash_remove(struct user_struct *up) { hlist_del_init(&up->uidhash_node); - put_user_ns(up->user_ns); } -static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) +static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) { struct user_struct *user; struct hlist_node *h; hlist_for_each_entry(user, h, hashent, uidhash_node) { - if (user->uid == uid) { + if (uid_eq(user->uid, uid)) { atomic_inc(&user->__count); return user; } @@ -110,14 +129,13 @@ static void free_user(struct user_struct *up, unsigned long flags) * * If the user_struct could not be found, return NULL. */ -struct user_struct *find_user(uid_t uid) +struct user_struct *find_user(kuid_t uid) { struct user_struct *ret; unsigned long flags; - struct user_namespace *ns = current_user_ns(); spin_lock_irqsave(&uidhash_lock, flags); - ret = uid_hash_find(uid, uidhashentry(ns, uid)); + ret = uid_hash_find(uid, uidhashentry(uid)); spin_unlock_irqrestore(&uidhash_lock, flags); return ret; } @@ -136,9 +154,9 @@ void free_uid(struct user_struct *up) local_irq_restore(flags); } -struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) +struct user_struct *alloc_uid(kuid_t uid) { - struct hlist_head *hashent = uidhashentry(ns, uid); + struct hlist_head *hashent = uidhashentry(uid); struct user_struct *up, *new; spin_lock_irq(&uidhash_lock); @@ -153,8 +171,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) new->uid = uid; atomic_set(&new->__count, 1); - new->user_ns = get_user_ns(ns); - /* * Before adding this, check whether we raced * on adding the same user already.. @@ -162,7 +178,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - put_user_ns(ns); key_put(new->uid_keyring); key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); @@ -187,11 +202,11 @@ static int __init uid_cache_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) - INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); + INIT_HLIST_HEAD(uidhash_table + n); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); - uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0)); + uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); spin_unlock_irq(&uidhash_lock); return 0; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 3b906e98b1db..86602316422d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -11,9 +11,20 @@ #include <linux/user_namespace.h> #include <linux/highuid.h> #include <linux/cred.h> +#include <linux/securebits.h> +#include <linux/keyctl.h> +#include <linux/key-type.h> +#include <keys/user-type.h> +#include <linux/seq_file.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/ctype.h> static struct kmem_cache *user_ns_cachep __read_mostly; +static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, + struct uid_gid_map *map); + /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -24,109 +35,565 @@ static struct kmem_cache *user_ns_cachep __read_mostly; */ int create_user_ns(struct cred *new) { - struct user_namespace *ns; - struct user_struct *root_user; - int n; + struct user_namespace *ns, *parent_ns = new->user_ns; + kuid_t owner = new->euid; + kgid_t group = new->egid; + + /* The creator needs a mapping in the parent user namespace + * or else we won't be able to reasonably tell userspace who + * created a user_namespace. + */ + if (!kuid_has_mapping(parent_ns, owner) || + !kgid_has_mapping(parent_ns, group)) + return -EPERM; - ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); + ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) return -ENOMEM; kref_init(&ns->kref); + ns->parent = parent_ns; + ns->owner = owner; + ns->group = group; - for (n = 0; n < UIDHASH_SZ; ++n) - INIT_HLIST_HEAD(ns->uidhash_table + n); - - /* Alloc new root user. */ - root_user = alloc_uid(ns, 0); - if (!root_user) { - kmem_cache_free(user_ns_cachep, ns); - return -ENOMEM; - } - - /* set the new root user in the credentials under preparation */ - ns->creator = new->user; - new->user = root_user; - new->uid = new->euid = new->suid = new->fsuid = 0; - new->gid = new->egid = new->sgid = new->fsgid = 0; - put_group_info(new->group_info); - new->group_info = get_group_info(&init_groups); + /* Start with the same capabilities as init but useless for doing + * anything as the capabilities are bound to the new user namespace. + */ + new->securebits = SECUREBITS_DEFAULT; + new->cap_inheritable = CAP_EMPTY_SET; + new->cap_permitted = CAP_FULL_SET; + new->cap_effective = CAP_FULL_SET; + new->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(new->request_key_auth); new->request_key_auth = NULL; #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ - /* root_user holds a reference to ns, our reference can be dropped */ - put_user_ns(ns); + /* Leave the new->user_ns reference with the new user namespace. */ + /* Leave the reference to our user_ns with the new cred. */ + new->user_ns = ns; return 0; } -/* - * Deferred destructor for a user namespace. This is required because - * free_user_ns() may be called with uidhash_lock held, but we need to call - * back to free_uid() which will want to take the lock again. - */ -static void free_user_ns_work(struct work_struct *work) +void free_user_ns(struct kref *kref) { - struct user_namespace *ns = - container_of(work, struct user_namespace, destroyer); - free_uid(ns->creator); + struct user_namespace *parent, *ns = + container_of(kref, struct user_namespace, kref); + + parent = ns->parent; kmem_cache_free(user_ns_cachep, ns); + put_user_ns(parent); } +EXPORT_SYMBOL(free_user_ns); -void free_user_ns(struct kref *kref) +static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { - struct user_namespace *ns = - container_of(kref, struct user_namespace, kref); + unsigned idx, extents; + u32 first, last, id2; + + id2 = id + count - 1; + + /* Find the matching extent */ + extents = map->nr_extents; + smp_read_barrier_depends(); + for (idx = 0; idx < extents; idx++) { + first = map->extent[idx].first; + last = first + map->extent[idx].count - 1; + if (id >= first && id <= last && + (id2 >= first && id2 <= last)) + break; + } + /* Map the id or note failure */ + if (idx < extents) + id = (id - first) + map->extent[idx].lower_first; + else + id = (u32) -1; - INIT_WORK(&ns->destroyer, free_user_ns_work); - schedule_work(&ns->destroyer); + return id; } -EXPORT_SYMBOL(free_user_ns); -uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) +static u32 map_id_down(struct uid_gid_map *map, u32 id) { - struct user_namespace *tmp; + unsigned idx, extents; + u32 first, last; - if (likely(to == cred->user->user_ns)) - return uid; + /* Find the matching extent */ + extents = map->nr_extents; + smp_read_barrier_depends(); + for (idx = 0; idx < extents; idx++) { + first = map->extent[idx].first; + last = first + map->extent[idx].count - 1; + if (id >= first && id <= last) + break; + } + /* Map the id or note failure */ + if (idx < extents) + id = (id - first) + map->extent[idx].lower_first; + else + id = (u32) -1; + return id; +} - /* Is cred->user the creator of the target user_ns - * or the creator of one of it's parents? - */ - for ( tmp = to; tmp != &init_user_ns; - tmp = tmp->creator->user_ns ) { - if (cred->user == tmp->creator) { - return (uid_t)0; - } +static u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + unsigned idx, extents; + u32 first, last; + + /* Find the matching extent */ + extents = map->nr_extents; + smp_read_barrier_depends(); + for (idx = 0; idx < extents; idx++) { + first = map->extent[idx].lower_first; + last = first + map->extent[idx].count - 1; + if (id >= first && id <= last) + break; } + /* Map the id or note failure */ + if (idx < extents) + id = (id - first) + map->extent[idx].first; + else + id = (u32) -1; + + return id; +} + +/** + * make_kuid - Map a user-namespace uid pair into a kuid. + * @ns: User namespace that the uid is in + * @uid: User identifier + * + * Maps a user-namespace uid pair into a kernel internal kuid, + * and returns that kuid. + * + * When there is no mapping defined for the user-namespace uid + * pair INVALID_UID is returned. Callers are expected to test + * for and handle handle INVALID_UID being returned. INVALID_UID + * may be tested for using uid_valid(). + */ +kuid_t make_kuid(struct user_namespace *ns, uid_t uid) +{ + /* Map the uid to a global kernel uid */ + return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); +} +EXPORT_SYMBOL(make_kuid); + +/** + * from_kuid - Create a uid from a kuid user-namespace pair. + * @targ: The user namespace we want a uid in. + * @kuid: The kernel internal uid to start with. + * + * Map @kuid into the user-namespace specified by @targ and + * return the resulting uid. + * + * There is always a mapping into the initial user_namespace. + * + * If @kuid has no mapping in @targ (uid_t)-1 is returned. + */ +uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) +{ + /* Map the uid from a global kernel uid */ + return map_id_up(&targ->uid_map, __kuid_val(kuid)); +} +EXPORT_SYMBOL(from_kuid); - /* No useful relationship so no mapping */ - return overflowuid; +/** + * from_kuid_munged - Create a uid from a kuid user-namespace pair. + * @targ: The user namespace we want a uid in. + * @kuid: The kernel internal uid to start with. + * + * Map @kuid into the user-namespace specified by @targ and + * return the resulting uid. + * + * There is always a mapping into the initial user_namespace. + * + * Unlike from_kuid from_kuid_munged never fails and always + * returns a valid uid. This makes from_kuid_munged appropriate + * for use in syscalls like stat and getuid where failing the + * system call and failing to provide a valid uid are not an + * options. + * + * If @kuid has no mapping in @targ overflowuid is returned. + */ +uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) +{ + uid_t uid; + uid = from_kuid(targ, kuid); + + if (uid == (uid_t) -1) + uid = overflowuid; + return uid; } +EXPORT_SYMBOL(from_kuid_munged); -gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) +/** + * make_kgid - Map a user-namespace gid pair into a kgid. + * @ns: User namespace that the gid is in + * @uid: group identifier + * + * Maps a user-namespace gid pair into a kernel internal kgid, + * and returns that kgid. + * + * When there is no mapping defined for the user-namespace gid + * pair INVALID_GID is returned. Callers are expected to test + * for and handle INVALID_GID being returned. INVALID_GID may be + * tested for using gid_valid(). + */ +kgid_t make_kgid(struct user_namespace *ns, gid_t gid) { - struct user_namespace *tmp; + /* Map the gid to a global kernel gid */ + return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); +} +EXPORT_SYMBOL(make_kgid); - if (likely(to == cred->user->user_ns)) - return gid; +/** + * from_kgid - Create a gid from a kgid user-namespace pair. + * @targ: The user namespace we want a gid in. + * @kgid: The kernel internal gid to start with. + * + * Map @kgid into the user-namespace specified by @targ and + * return the resulting gid. + * + * There is always a mapping into the initial user_namespace. + * + * If @kgid has no mapping in @targ (gid_t)-1 is returned. + */ +gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) +{ + /* Map the gid from a global kernel gid */ + return map_id_up(&targ->gid_map, __kgid_val(kgid)); +} +EXPORT_SYMBOL(from_kgid); + +/** + * from_kgid_munged - Create a gid from a kgid user-namespace pair. + * @targ: The user namespace we want a gid in. + * @kgid: The kernel internal gid to start with. + * + * Map @kgid into the user-namespace specified by @targ and + * return the resulting gid. + * + * There is always a mapping into the initial user_namespace. + * + * Unlike from_kgid from_kgid_munged never fails and always + * returns a valid gid. This makes from_kgid_munged appropriate + * for use in syscalls like stat and getgid where failing the + * system call and failing to provide a valid gid are not options. + * + * If @kgid has no mapping in @targ overflowgid is returned. + */ +gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) +{ + gid_t gid; + gid = from_kgid(targ, kgid); - /* Is cred->user the creator of the target user_ns - * or the creator of one of it's parents? + if (gid == (gid_t) -1) + gid = overflowgid; + return gid; +} +EXPORT_SYMBOL(from_kgid_munged); + +static int uid_m_show(struct seq_file *seq, void *v) +{ + struct user_namespace *ns = seq->private; + struct uid_gid_extent *extent = v; + struct user_namespace *lower_ns; + uid_t lower; + + lower_ns = current_user_ns(); + if ((lower_ns == ns) && lower_ns->parent) + lower_ns = lower_ns->parent; + + lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); + + seq_printf(seq, "%10u %10u %10u\n", + extent->first, + lower, + extent->count); + + return 0; +} + +static int gid_m_show(struct seq_file *seq, void *v) +{ + struct user_namespace *ns = seq->private; + struct uid_gid_extent *extent = v; + struct user_namespace *lower_ns; + gid_t lower; + + lower_ns = current_user_ns(); + if ((lower_ns == ns) && lower_ns->parent) + lower_ns = lower_ns->parent; + + lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); + + seq_printf(seq, "%10u %10u %10u\n", + extent->first, + lower, + extent->count); + + return 0; +} + +static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) +{ + struct uid_gid_extent *extent = NULL; + loff_t pos = *ppos; + + if (pos < map->nr_extents) + extent = &map->extent[pos]; + + return extent; +} + +static void *uid_m_start(struct seq_file *seq, loff_t *ppos) +{ + struct user_namespace *ns = seq->private; + + return m_start(seq, ppos, &ns->uid_map); +} + +static void *gid_m_start(struct seq_file *seq, loff_t *ppos) +{ + struct user_namespace *ns = seq->private; + + return m_start(seq, ppos, &ns->gid_map); +} + +static void *m_next(struct seq_file *seq, void *v, loff_t *pos) +{ + (*pos)++; + return seq->op->start(seq, pos); +} + +static void m_stop(struct seq_file *seq, void *v) +{ + return; +} + +struct seq_operations proc_uid_seq_operations = { + .start = uid_m_start, + .stop = m_stop, + .next = m_next, + .show = uid_m_show, +}; + +struct seq_operations proc_gid_seq_operations = { + .start = gid_m_start, + .stop = m_stop, + .next = m_next, + .show = gid_m_show, +}; + +static DEFINE_MUTEX(id_map_mutex); + +static ssize_t map_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, + int cap_setid, + struct uid_gid_map *map, + struct uid_gid_map *parent_map) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + struct uid_gid_map new_map; + unsigned idx; + struct uid_gid_extent *extent, *last = NULL; + unsigned long page = 0; + char *kbuf, *pos, *next_line; + ssize_t ret = -EINVAL; + + /* + * The id_map_mutex serializes all writes to any given map. + * + * Any map is only ever written once. + * + * An id map fits within 1 cache line on most architectures. + * + * On read nothing needs to be done unless you are on an + * architecture with a crazy cache coherency model like alpha. + * + * There is a one time data dependency between reading the + * count of the extents and the values of the extents. The + * desired behavior is to see the values of the extents that + * were written before the count of the extents. + * + * To achieve this smp_wmb() is used on guarantee the write + * order and smp_read_barrier_depends() is guaranteed that we + * don't have crazy architectures returning stale data. + * */ - for ( tmp = to; tmp != &init_user_ns; - tmp = tmp->creator->user_ns ) { - if (cred->user == tmp->creator) { - return (gid_t)0; + mutex_lock(&id_map_mutex); + + ret = -EPERM; + /* Only allow one successful write to the map */ + if (map->nr_extents != 0) + goto out; + + /* Require the appropriate privilege CAP_SETUID or CAP_SETGID + * over the user namespace in order to set the id mapping. + */ + if (!ns_capable(ns, cap_setid)) + goto out; + + /* Get a buffer */ + ret = -ENOMEM; + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!page) + goto out; + + /* Only allow <= page size writes at the beginning of the file */ + ret = -EINVAL; + if ((*ppos != 0) || (count >= PAGE_SIZE)) + goto out; + + /* Slurp in the user data */ + ret = -EFAULT; + if (copy_from_user(kbuf, buf, count)) + goto out; + kbuf[count] = '\0'; + + /* Parse the user data */ + ret = -EINVAL; + pos = kbuf; + new_map.nr_extents = 0; + for (;pos; pos = next_line) { + extent = &new_map.extent[new_map.nr_extents]; + + /* Find the end of line and ensure I don't look past it */ + next_line = strchr(pos, '\n'); + if (next_line) { + *next_line = '\0'; + next_line++; + if (*next_line == '\0') + next_line = NULL; } + + pos = skip_spaces(pos); + extent->first = simple_strtoul(pos, &pos, 10); + if (!isspace(*pos)) + goto out; + + pos = skip_spaces(pos); + extent->lower_first = simple_strtoul(pos, &pos, 10); + if (!isspace(*pos)) + goto out; + + pos = skip_spaces(pos); + extent->count = simple_strtoul(pos, &pos, 10); + if (*pos && !isspace(*pos)) + goto out; + + /* Verify there is not trailing junk on the line */ + pos = skip_spaces(pos); + if (*pos != '\0') + goto out; + + /* Verify we have been given valid starting values */ + if ((extent->first == (u32) -1) || + (extent->lower_first == (u32) -1 )) + goto out; + + /* Verify count is not zero and does not cause the extent to wrap */ + if ((extent->first + extent->count) <= extent->first) + goto out; + if ((extent->lower_first + extent->count) <= extent->lower_first) + goto out; + + /* For now only accept extents that are strictly in order */ + if (last && + (((last->first + last->count) > extent->first) || + ((last->lower_first + last->count) > extent->lower_first))) + goto out; + + new_map.nr_extents++; + last = extent; + + /* Fail if the file contains too many extents */ + if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && + (next_line != NULL)) + goto out; + } + /* Be very certaint the new map actually exists */ + if (new_map.nr_extents == 0) + goto out; + + ret = -EPERM; + /* Validate the user is allowed to use user id's mapped to. */ + if (!new_idmap_permitted(ns, cap_setid, &new_map)) + goto out; + + /* Map the lower ids from the parent user namespace to the + * kernel global id space. + */ + for (idx = 0; idx < new_map.nr_extents; idx++) { + u32 lower_first; + extent = &new_map.extent[idx]; + + lower_first = map_id_range_down(parent_map, + extent->lower_first, + extent->count); + + /* Fail if we can not map the specified extent to + * the kernel global id space. + */ + if (lower_first == (u32) -1) + goto out; + + extent->lower_first = lower_first; } - /* No useful relationship so no mapping */ - return overflowgid; + /* Install the map */ + memcpy(map->extent, new_map.extent, + new_map.nr_extents*sizeof(new_map.extent[0])); + smp_wmb(); + map->nr_extents = new_map.nr_extents; + + *ppos = count; + ret = count; +out: + mutex_unlock(&id_map_mutex); + if (page) + free_page(page); + return ret; +} + +ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + + if (!ns->parent) + return -EPERM; + + return map_write(file, buf, size, ppos, CAP_SETUID, + &ns->uid_map, &ns->parent->uid_map); +} + +ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + + if (!ns->parent) + return -EPERM; + + return map_write(file, buf, size, ppos, CAP_SETGID, + &ns->gid_map, &ns->parent->gid_map); +} + +static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, + struct uid_gid_map *new_map) +{ + /* Allow the specified ids if we have the appropriate capability + * (CAP_SETUID or CAP_SETGID) over the parent user namespace. + */ + if (ns_capable(ns->parent, cap_setid)) + return true; + + return false; } static __init int user_namespaces_init(void) diff --git a/kernel/utsname.c b/kernel/utsname.c index 405caf91aad5..679d97a5d3fd 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -43,7 +43,7 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); - ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); + ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); up_read(&uts_sem); return ns; } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index df30ee08bdd4..4b1dfba70f7c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,6 +24,7 @@ #include <linux/sysctl.h> #include <asm/irq_regs.h> +#include <linux/kvm_para.h> #include <linux/perf_event.h> int watchdog_enabled = 1; @@ -280,6 +281,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) __this_cpu_write(softlockup_touch_sync, false); sched_clock_tick(); } + + /* Clear the guest paused flag on watchdog reset */ + kvm_check_and_clear_guest_paused(); __touch_watchdog(); return HRTIMER_RESTART; } @@ -292,6 +296,14 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) */ duration = is_softlockup(touch_ts); if (unlikely(duration)) { + /* + * If a virtual machine is stopped by the host it can look to + * the watchdog like a soft lockup, check to see if the host + * stopped the vm before we issue the warning + */ + if (kvm_check_and_clear_guest_paused()) + return HRTIMER_RESTART; + /* only warn once */ if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; @@ -360,6 +372,13 @@ static int watchdog(void *unused) #ifdef CONFIG_HARDLOCKUP_DETECTOR +/* + * People like the simple clean cpu node info on boot. + * Reduce the watchdog noise by only printing messages + * that are different from what cpu0 displayed. + */ +static unsigned long cpu0_err; + static int watchdog_nmi_enable(int cpu) { struct perf_event_attr *wd_attr; @@ -378,11 +397,21 @@ static int watchdog_nmi_enable(int cpu) /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); + + /* save cpu0 error for future comparision */ + if (cpu == 0 && IS_ERR(event)) + cpu0_err = PTR_ERR(event); + if (!IS_ERR(event)) { - pr_info("enabled, takes one hw-pmu counter.\n"); + /* only print for cpu0 or different than cpu0 */ + if (cpu == 0 || cpu0_err) + pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); goto out_save; } + /* skip displaying the same error again */ + if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) + return PTR_ERR(event); /* vary the KERN level based on the returned errno */ if (PTR_ERR(event) == -EOPNOTSUPP) |