diff options
Diffstat (limited to 'kernel')
60 files changed, 2271 insertions, 899 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 1408b3353a3c..60c302cfb4d3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,9 @@ obj-y = fork.o exec_domain.o panic.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o groups.o smpboot.o + async.o range.o smpboot.o + +obj-$(CONFIG_MULTIUSER) += groups.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files @@ -195,9 +197,9 @@ x509.genkey: @echo >>x509.genkey "x509_extensions = myexts" @echo >>x509.genkey @echo >>x509.genkey "[ req_distinguished_name ]" - @echo >>x509.genkey "O = Magrathea" - @echo >>x509.genkey "CN = Glacier signing key" - @echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2" + @echo >>x509.genkey "#O = Unspecified company" + @echo >>x509.genkey "CN = Build time autogenerated kernel key" + @echo >>x509.genkey "#emailAddress = unspecified.user@unspecified.company" @echo >>x509.genkey @echo >>x509.genkey "[ myexts ]" @echo >>x509.genkey "basicConstraints=critical,CA:FALSE" diff --git a/kernel/acct.c b/kernel/acct.c index e6c10d1a4058..74963d192c5d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -213,7 +213,7 @@ static int acct_on(struct filename *pathname) return -EACCES; } - if (!file->f_op->write) { + if (!(file->f_mode & FMODE_CAN_WRITE)) { kfree(acct); filp_close(file, NULL); return -EIO; diff --git a/kernel/audit.c b/kernel/audit.c index 72ab759a0b43..1c13e4267de6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -43,6 +43,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/file.h> #include <linux/init.h> #include <linux/types.h> #include <linux/atomic.h> @@ -107,6 +108,7 @@ static u32 audit_rate_limit; * When set to zero, this means unlimited. */ static u32 audit_backlog_limit = 64; #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) +static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME; static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; static u32 audit_backlog_wait_overflow = 0; @@ -338,13 +340,13 @@ static int audit_set_backlog_limit(u32 limit) static int audit_set_backlog_wait_time(u32 timeout) { return audit_do_config_change("audit_backlog_wait_time", - &audit_backlog_wait_time, timeout); + &audit_backlog_wait_time_master, timeout); } static int audit_set_enabled(u32 state) { int rc; - if (state < AUDIT_OFF || state > AUDIT_LOCKED) + if (state > AUDIT_LOCKED) return -EINVAL; rc = audit_do_config_change("audit_enabled", &audit_enabled, state); @@ -663,7 +665,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) case AUDIT_MAKE_EQUIV: /* Only support auditd and auditctl in initial pid namespace * for now. */ - if ((task_active_pid_ns(current) != &init_pid_ns)) + if (task_active_pid_ns(current) != &init_pid_ns) return -EPERM; if (!netlink_capable(skb, CAP_AUDIT_CONTROL)) @@ -834,7 +836,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.lost = atomic_read(&audit_lost); s.backlog = skb_queue_len(&audit_skb_queue); s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; - s.backlog_wait_time = audit_backlog_wait_time; + s.backlog_wait_time = audit_backlog_wait_time_master; audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } @@ -877,8 +879,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) { if (sizeof(s) > (size_t)nlh->nlmsg_len) return -EINVAL; - if (s.backlog_wait_time < 0 || - s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME) + if (s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME) return -EINVAL; err = audit_set_backlog_wait_time(s.backlog_wait_time); if (err < 0) @@ -1385,7 +1386,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; } - audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; + if (!reserve) + audit_backlog_wait_time = audit_backlog_wait_time_master; ab = audit_buffer_alloc(ctx, gfp_mask, type); if (!ab) { @@ -1759,7 +1761,7 @@ void audit_log_name(struct audit_context *context, struct audit_names *n, } else audit_log_format(ab, " name=(null)"); - if (n->ino != (unsigned long)-1) { + if (n->ino != (unsigned long)-1) audit_log_format(ab, " inode=%lu" " dev=%02x:%02x mode=%#ho" " ouid=%u ogid=%u rdev=%02x:%02x", @@ -1771,7 +1773,6 @@ void audit_log_name(struct audit_context *context, struct audit_names *n, from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); - } if (n->osid != 0) { char *ctx = NULL; u32 len; @@ -1838,11 +1839,29 @@ error_path: } EXPORT_SYMBOL(audit_log_task_context); +void audit_log_d_path_exe(struct audit_buffer *ab, + struct mm_struct *mm) +{ + struct file *exe_file; + + if (!mm) + goto out_null; + + exe_file = get_mm_exe_file(mm); + if (!exe_file) + goto out_null; + + audit_log_d_path(ab, " exe=", &exe_file->f_path); + fput(exe_file); + return; +out_null: + audit_log_format(ab, " exe=(null)"); +} + void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { const struct cred *cred; char comm[sizeof(tsk->comm)]; - struct mm_struct *mm = tsk->mm; char *tty; if (!ab) @@ -1878,13 +1897,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, tsk)); - if (mm) { - down_read(&mm->mmap_sem); - if (mm->exe_file) - audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); - up_read(&mm->mmap_sem); - } else - audit_log_format(ab, " exe=(null)"); + audit_log_d_path_exe(ab, tsk->mm); audit_log_task_context(ab); } EXPORT_SYMBOL(audit_log_task_info); @@ -1915,7 +1928,7 @@ void audit_log_link_denied(const char *operation, struct path *link) /* Generate AUDIT_PATH record with object. */ name->type = AUDIT_TYPE_NORMAL; - audit_copy_inode(name, link->dentry, link->dentry->d_inode); + audit_copy_inode(name, link->dentry, d_backing_inode(link->dentry)); audit_log_name(current->audit_context, name, link, 0, NULL); out: kfree(name); diff --git a/kernel/audit.h b/kernel/audit.h index 1caa0d345d90..d641f9bb3ed0 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -257,6 +257,9 @@ extern struct list_head audit_filter_list[]; extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); +extern void audit_log_d_path_exe(struct audit_buffer *ab, + struct mm_struct *mm); + /* audit watch functions */ #ifdef CONFIG_AUDIT_WATCH extern void audit_put_watch(struct audit_watch *watch); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 2e0c97427b33..b0f9877273fc 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -37,6 +37,7 @@ struct audit_chunk { static LIST_HEAD(tree_list); static LIST_HEAD(prune_list); +static struct task_struct *prune_thread; /* * One struct chunk is attached to each inode of interest. @@ -576,7 +577,7 @@ int audit_remove_tree_rule(struct audit_krule *rule) static int compare_root(struct vfsmount *mnt, void *arg) { - return mnt->mnt_root->d_inode == arg; + return d_backing_inode(mnt->mnt_root) == arg; } void audit_trim_trees(void) @@ -648,7 +649,58 @@ void audit_put_tree(struct audit_tree *tree) static int tag_mount(struct vfsmount *mnt, void *arg) { - return tag_chunk(mnt->mnt_root->d_inode, arg); + return tag_chunk(d_backing_inode(mnt->mnt_root), arg); +} + +/* + * That gets run when evict_chunk() ends up needing to kill audit_tree. + * Runs from a separate thread. + */ +static int prune_tree_thread(void *unused) +{ + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (list_empty(&prune_list)) + schedule(); + __set_current_state(TASK_RUNNING); + + mutex_lock(&audit_cmd_mutex); + mutex_lock(&audit_filter_mutex); + + while (!list_empty(&prune_list)) { + struct audit_tree *victim; + + victim = list_entry(prune_list.next, + struct audit_tree, list); + list_del_init(&victim->list); + + mutex_unlock(&audit_filter_mutex); + + prune_one(victim); + + mutex_lock(&audit_filter_mutex); + } + + mutex_unlock(&audit_filter_mutex); + mutex_unlock(&audit_cmd_mutex); + } + return 0; +} + +static int audit_launch_prune(void) +{ + if (prune_thread) + return 0; + prune_thread = kthread_create(prune_tree_thread, NULL, + "audit_prune_tree"); + if (IS_ERR(prune_thread)) { + pr_err("cannot start thread audit_prune_tree"); + prune_thread = NULL; + return -ENOMEM; + } else { + wake_up_process(prune_thread); + return 0; + } } /* called with audit_filter_mutex */ @@ -674,6 +726,12 @@ int audit_add_tree_rule(struct audit_krule *rule) /* do not set rule->tree yet */ mutex_unlock(&audit_filter_mutex); + if (unlikely(!prune_thread)) { + err = audit_launch_prune(); + if (err) + goto Err; + } + err = kern_path(tree->pathname, 0, &path); if (err) goto Err; @@ -811,36 +869,10 @@ int audit_tag_tree(char *old, char *new) return failed; } -/* - * That gets run when evict_chunk() ends up needing to kill audit_tree. - * Runs from a separate thread. - */ -static int prune_tree_thread(void *unused) -{ - mutex_lock(&audit_cmd_mutex); - mutex_lock(&audit_filter_mutex); - - while (!list_empty(&prune_list)) { - struct audit_tree *victim; - - victim = list_entry(prune_list.next, struct audit_tree, list); - list_del_init(&victim->list); - - mutex_unlock(&audit_filter_mutex); - - prune_one(victim); - - mutex_lock(&audit_filter_mutex); - } - - mutex_unlock(&audit_filter_mutex); - mutex_unlock(&audit_cmd_mutex); - return 0; -} static void audit_schedule_prune(void) { - kthread_run(prune_tree_thread, NULL, "audit_prune_tree"); + wake_up_process(prune_thread); } /* @@ -907,9 +939,9 @@ static void evict_chunk(struct audit_chunk *chunk) for (n = 0; n < chunk->count; n++) list_del_init(&chunk->owners[n].list); spin_unlock(&hash_lock); + mutex_unlock(&audit_filter_mutex); if (need_prune) audit_schedule_prune(); - mutex_unlock(&audit_filter_mutex); } static int audit_tree_handle_event(struct fsnotify_group *group, diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index ad9c1682f616..6e30024d9aac 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -146,7 +146,7 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) /* Initialize a parent watch entry. */ static struct audit_parent *audit_init_parent(struct path *path) { - struct inode *inode = path->dentry->d_inode; + struct inode *inode = d_backing_inode(path->dentry); struct audit_parent *parent; int ret; @@ -361,11 +361,11 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) struct dentry *d = kern_path_locked(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); - mutex_unlock(&parent->dentry->d_inode->i_mutex); - if (d->d_inode) { + mutex_unlock(&d_backing_inode(parent->dentry)->i_mutex); + if (d_is_positive(d)) { /* update watch filter fields */ - watch->dev = d->d_inode->i_sb->s_dev; - watch->ino = d->d_inode->i_ino; + watch->dev = d_backing_inode(d)->i_sb->s_dev; + watch->ino = d_backing_inode(d)->i_ino; } dput(d); return 0; @@ -426,7 +426,7 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) return ret; /* either find an old parent or attach a new one */ - parent = audit_find_parent(parent_path.dentry->d_inode); + parent = audit_find_parent(d_backing_inode(parent_path.dentry)); if (!parent) { parent = audit_init_parent(&parent_path); if (IS_ERR(parent)) { @@ -482,7 +482,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, switch (data_type) { case (FSNOTIFY_EVENT_PATH): - inode = ((struct path *)data)->dentry->d_inode; + inode = d_backing_inode(((struct path *)data)->dentry); break; case (FSNOTIFY_EVENT_INODE): inode = (struct inode *)data; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index dc4ae70a7413..9fb9d1cb83ce 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1629,7 +1629,7 @@ retry: rcu_read_lock(); seq = read_seqbegin(&rename_lock); for(;;) { - struct inode *inode = d->d_inode; + struct inode *inode = d_backing_inode(d); if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) { struct audit_chunk *chunk; chunk = audit_tree_lookup(inode); @@ -1754,7 +1754,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags) { struct audit_context *context = current->audit_context; - const struct inode *inode = dentry->d_inode; + const struct inode *inode = d_backing_inode(dentry); struct audit_names *n; bool parent = flags & AUDIT_INODE_PARENT; @@ -1853,7 +1853,7 @@ void __audit_inode_child(const struct inode *parent, const unsigned char type) { struct audit_context *context = current->audit_context; - const struct inode *inode = dentry->d_inode; + const struct inode *inode = d_backing_inode(dentry); const char *dname = dentry->d_name.name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; @@ -2361,7 +2361,6 @@ static void audit_log_task(struct audit_buffer *ab) kuid_t auid, uid; kgid_t gid; unsigned int sessionid; - struct mm_struct *mm = current->mm; char comm[sizeof(current->comm)]; auid = audit_get_loginuid(current); @@ -2376,13 +2375,7 @@ static void audit_log_task(struct audit_buffer *ab) audit_log_task_context(ab); audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); audit_log_untrustedstring(ab, get_task_comm(comm, current)); - if (mm) { - down_read(&mm->mmap_sem); - if (mm->exe_file) - audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); - up_read(&mm->mmap_sem); - } else - audit_log_format(ab, " exe=(null)"); + audit_log_d_path_exe(ab, current->mm); } /** diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index a5ae60f0b0a2..e6983be12bd3 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,2 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o -ifdef CONFIG_TEST_BPF -obj-$(CONFIG_BPF_SYSCALL) += test_stub.o -endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 9eb4d8a7cd87..8a6616583f38 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -134,7 +134,7 @@ static void array_map_free(struct bpf_map *map) kvfree(array); } -static struct bpf_map_ops array_ops = { +static const struct bpf_map_ops array_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -143,14 +143,14 @@ static struct bpf_map_ops array_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map_type_list tl = { +static struct bpf_map_type_list array_type __read_mostly = { .ops = &array_ops, .type = BPF_MAP_TYPE_ARRAY, }; static int __init register_array_map(void) { - bpf_register_map_type(&tl); + bpf_register_map_type(&array_type); return 0; } late_initcall(register_array_map); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a64e7a207d2b..54f0e7fcd0e2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -357,8 +357,8 @@ select_insn: ALU64_MOD_X: if (unlikely(SRC == 0)) return 0; - tmp = DST; - DST = do_div(tmp, SRC); + div64_u64_rem(DST, SRC, &tmp); + DST = tmp; CONT; ALU_MOD_X: if (unlikely(SRC == 0)) @@ -367,8 +367,8 @@ select_insn: DST = do_div(tmp, (u32) SRC); CONT; ALU64_MOD_K: - tmp = DST; - DST = do_div(tmp, IMM); + div64_u64_rem(DST, IMM, &tmp); + DST = tmp; CONT; ALU_MOD_K: tmp = (u32) DST; @@ -377,7 +377,7 @@ select_insn: ALU64_DIV_X: if (unlikely(SRC == 0)) return 0; - do_div(DST, SRC); + DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: if (unlikely(SRC == 0)) @@ -387,7 +387,7 @@ select_insn: DST = (u32) tmp; CONT; ALU64_DIV_K: - do_div(DST, IMM); + DST = div64_u64(DST, IMM); CONT; ALU_DIV_K: tmp = (u32) DST; @@ -656,6 +656,14 @@ void bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(bpf_prog_free); +/* Weak definitions of helper functions in case we don't have bpf syscall. */ +const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; +const struct bpf_func_proto bpf_map_update_elem_proto __weak; +const struct bpf_func_proto bpf_map_delete_elem_proto __weak; + +const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; +const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b3ba43674310..83c209d9b17a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -345,7 +345,7 @@ static void htab_map_free(struct bpf_map *map) kfree(htab); } -static struct bpf_map_ops htab_ops = { +static const struct bpf_map_ops htab_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -354,14 +354,14 @@ static struct bpf_map_ops htab_ops = { .map_delete_elem = htab_map_delete_elem, }; -static struct bpf_map_type_list tl = { +static struct bpf_map_type_list htab_type __read_mostly = { .ops = &htab_ops, .type = BPF_MAP_TYPE_HASH, }; static int __init register_htab_map(void) { - bpf_register_map_type(&tl); + bpf_register_map_type(&htab_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9e3414d85459..bd7f5988ed9c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -11,6 +11,8 @@ */ #include <linux/bpf.h> #include <linux/rcupdate.h> +#include <linux/random.h> +#include <linux/smp.h> /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -41,7 +43,7 @@ static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return (unsigned long) value; } -struct bpf_func_proto bpf_map_lookup_elem_proto = { +const struct bpf_func_proto bpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, @@ -60,7 +62,7 @@ static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return map->ops->map_update_elem(map, key, value, r4); } -struct bpf_func_proto bpf_map_update_elem_proto = { +const struct bpf_func_proto bpf_map_update_elem_proto = { .func = bpf_map_update_elem, .gpl_only = false, .ret_type = RET_INTEGER, @@ -80,10 +82,32 @@ static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return map->ops->map_delete_elem(map, key); } -struct bpf_func_proto bpf_map_delete_elem_proto = { +const struct bpf_func_proto bpf_map_delete_elem_proto = { .func = bpf_map_delete_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; + +static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return prandom_u32(); +} + +const struct bpf_func_proto bpf_get_prandom_u32_proto = { + .func = bpf_get_prandom_u32, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return raw_smp_processor_id(); +} + +const struct bpf_func_proto bpf_get_smp_processor_id_proto = { + .func = bpf_get_smp_processor_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 536edc2be307..3bae6c591914 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -16,6 +16,7 @@ #include <linux/file.h> #include <linux/license.h> #include <linux/filter.h> +#include <linux/version.h> static LIST_HEAD(bpf_map_types); @@ -354,10 +355,11 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) list_for_each_entry(tl, &bpf_prog_types, list_node) { if (tl->type == type) { prog->aux->ops = tl->ops; - prog->aux->prog_type = type; + prog->type = type; return 0; } } + return -EINVAL; } @@ -418,6 +420,7 @@ void bpf_prog_put(struct bpf_prog *prog) bpf_prog_free(prog); } } +EXPORT_SYMBOL_GPL(bpf_prog_put); static int bpf_prog_release(struct inode *inode, struct file *filp) { @@ -465,9 +468,10 @@ struct bpf_prog *bpf_prog_get(u32 ufd) fdput(f); return prog; } +EXPORT_SYMBOL_GPL(bpf_prog_get); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD log_buf +#define BPF_PROG_LOAD_LAST_FIELD kern_version static int bpf_prog_load(union bpf_attr *attr) { @@ -492,6 +496,10 @@ static int bpf_prog_load(union bpf_attr *attr) if (attr->insn_cnt >= BPF_MAXINSNS) return -EINVAL; + if (type == BPF_PROG_TYPE_KPROBE && + attr->kern_version != LINUX_VERSION_CODE) + return -EINVAL; + /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) @@ -508,7 +516,7 @@ static int bpf_prog_load(union bpf_attr *attr) prog->jited = false; atomic_set(&prog->aux->refcnt, 1); - prog->aux->is_gpl_compatible = is_gpl; + prog->gpl_compatible = is_gpl; /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); @@ -516,8 +524,7 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; /* run eBPF verifier */ - err = bpf_check(prog, attr); - + err = bpf_check(&prog, attr); if (err < 0) goto free_used_maps; @@ -528,7 +535,6 @@ static int bpf_prog_load(union bpf_attr *attr) bpf_prog_select_runtime(prog); err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); - if (err < 0) /* failed to allocate fd */ goto free_used_maps; diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c deleted file mode 100644 index 0ceae1e6e8b5..000000000000 --- a/kernel/bpf/test_stub.c +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/err.h> -#include <linux/bpf.h> - -/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC - * to be used by user space verifier testsuite - */ -struct bpf_context { - u64 arg1; - u64 arg2; -}; - -static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) -{ - switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - default: - return NULL; - } -} - -static const struct bpf_context_access { - int size; - enum bpf_access_type type; -} test_ctx_access[] = { - [offsetof(struct bpf_context, arg1)] = { - FIELD_SIZEOF(struct bpf_context, arg1), - BPF_READ - }, - [offsetof(struct bpf_context, arg2)] = { - FIELD_SIZEOF(struct bpf_context, arg2), - BPF_READ - }, -}; - -static bool test_is_valid_access(int off, int size, enum bpf_access_type type) -{ - const struct bpf_context_access *access; - - if (off < 0 || off >= ARRAY_SIZE(test_ctx_access)) - return false; - - access = &test_ctx_access[off]; - if (access->size == size && (access->type & type)) - return true; - - return false; -} - -static struct bpf_verifier_ops test_ops = { - .get_func_proto = test_func_proto, - .is_valid_access = test_is_valid_access, -}; - -static struct bpf_prog_type_list tl_prog = { - .ops = &test_ops, - .type = BPF_PROG_TYPE_UNSPEC, -}; - -static int __init register_test_ops(void) -{ - bpf_register_prog_type(&tl_prog); - return 0; -} -late_initcall(register_test_ops); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a28e09c7825d..47dcd3aa6e23 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -755,7 +755,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno, enum bpf_reg_type expected_type; int err = 0; - if (arg_type == ARG_ANYTHING) + if (arg_type == ARG_DONTCARE) return 0; if (reg->type == NOT_INIT) { @@ -763,6 +763,9 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return -EACCES; } + if (arg_type == ARG_ANYTHING) + return 0; + if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; @@ -770,6 +773,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, expected_type = CONST_IMM; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; + } else if (arg_type == ARG_PTR_TO_CTX) { + expected_type = PTR_TO_CTX; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -852,7 +857,7 @@ static int check_call(struct verifier_env *env, int func_id) } /* eBPF programs must be GPL compatible to use GPL-ed functions */ - if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) { + if (!env->prog->gpl_compatible && fn->gpl_only) { verbose("cannot call GPL only function from proprietary program\n"); return -EINVAL; } @@ -1172,6 +1177,18 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) return 0; } +static bool may_access_skb(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_SOCKET_FILTER: + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + return true; + default: + return false; + } +} + /* verify safety of LD_ABS|LD_IND instructions: * - they can only appear in the programs where ctx == skb * - since they are wrappers of function calls, they scratch R1-R5 registers, @@ -1194,8 +1211,8 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) struct reg_state *reg; int i, err; - if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { - verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n"); + if (!may_access_skb(env->prog->type)) { + verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n"); return -EINVAL; } @@ -1380,7 +1397,8 @@ peek_stack: /* tell verifier to check for equivalent states * after every call and jump */ - env->explored_states[t + 1] = STATE_LIST_MARK; + if (t + 1 < insn_cnt) + env->explored_states[t + 1] = STATE_LIST_MARK; } else { /* conditional jump with two edges */ ret = push_insn(t, t + 1, FALLTHROUGH, env); @@ -1606,11 +1624,10 @@ static int do_check(struct verifier_env *env) return err; } else if (class == BPF_LDX) { - if (BPF_MODE(insn->code) != BPF_MEM || - insn->imm != 0) { - verbose("BPF_LDX uses reserved fields\n"); - return -EINVAL; - } + enum bpf_reg_type src_reg_type; + + /* check for reserved fields is already done */ + /* check src operand */ err = check_reg_arg(regs, insn->src_reg, SRC_OP); if (err) @@ -1620,6 +1637,8 @@ static int do_check(struct verifier_env *env) if (err) return err; + src_reg_type = regs[insn->src_reg].type; + /* check that memory (src_reg + off) is readable, * the state of dst_reg will be updated by this func */ @@ -1629,6 +1648,32 @@ static int do_check(struct verifier_env *env) if (err) return err; + if (BPF_SIZE(insn->code) != BPF_W) { + insn_idx++; + continue; + } + + if (insn->imm == 0) { + /* saw a valid insn + * dst_reg = *(u32 *)(src_reg + off) + * use reserved 'imm' field to mark this insn + */ + insn->imm = src_reg_type; + + } else if (src_reg_type != insn->imm && + (src_reg_type == PTR_TO_CTX || + insn->imm == PTR_TO_CTX)) { + /* ABuser program is trying to use the same insn + * dst_reg = *(u32*) (src_reg + off) + * with different pointer types: + * src_reg == ctx in one branch and + * src_reg == stack|map in some other branch. + * Reject it. + */ + verbose("same insn cannot be used with different pointers\n"); + return -EINVAL; + } + } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_XADD) { err = check_xadd(env, insn); @@ -1776,6 +1821,13 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) int i, j; for (i = 0; i < insn_cnt; i++, insn++) { + if (BPF_CLASS(insn->code) == BPF_LDX && + (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0)) { + verbose("BPF_LDX uses reserved fields\n"); + return -EINVAL; + } + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_map *map; struct fd f; @@ -1867,6 +1919,92 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) insn->src_reg = 0; } +static void adjust_branches(struct bpf_prog *prog, int pos, int delta) +{ + struct bpf_insn *insn = prog->insnsi; + int insn_cnt = prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (BPF_CLASS(insn->code) != BPF_JMP || + BPF_OP(insn->code) == BPF_CALL || + BPF_OP(insn->code) == BPF_EXIT) + continue; + + /* adjust offset of jmps if necessary */ + if (i < pos && i + insn->off + 1 > pos) + insn->off += delta; + else if (i > pos && i + insn->off + 1 < pos) + insn->off -= delta; + } +} + +/* convert load instructions that access fields of 'struct __sk_buff' + * into sequence of instructions that access fields of 'struct sk_buff' + */ +static int convert_ctx_accesses(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + struct bpf_insn insn_buf[16]; + struct bpf_prog *new_prog; + u32 cnt; + int i; + + if (!env->prog->aux->ops->convert_ctx_access) + return 0; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_LDX | BPF_MEM | BPF_W)) + continue; + + if (insn->imm != PTR_TO_CTX) { + /* clear internal mark */ + insn->imm = 0; + continue; + } + + cnt = env->prog->aux->ops-> + convert_ctx_access(insn->dst_reg, insn->src_reg, + insn->off, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } + + if (cnt == 1) { + memcpy(insn, insn_buf, sizeof(*insn)); + continue; + } + + /* several new insns need to be inserted. Make room for them */ + insn_cnt += cnt - 1; + new_prog = bpf_prog_realloc(env->prog, + bpf_prog_size(insn_cnt), + GFP_USER); + if (!new_prog) + return -ENOMEM; + + new_prog->len = insn_cnt; + + memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1, + sizeof(*insn) * (insn_cnt - i - cnt)); + + /* copy substitute insns in place of load instruction */ + memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); + + /* adjust branches in the whole program */ + adjust_branches(new_prog, i, cnt - 1); + + /* keep walking new program and skip insns we just inserted */ + env->prog = new_prog; + insn = new_prog->insnsi + i + cnt - 1; + i += cnt - 1; + } + + return 0; +} + static void free_states(struct verifier_env *env) { struct verifier_state_list *sl, *sln; @@ -1889,13 +2027,13 @@ static void free_states(struct verifier_env *env) kfree(env->explored_states); } -int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; struct verifier_env *env; int ret = -EINVAL; - if (prog->len <= 0 || prog->len > BPF_MAXINSNS) + if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) return -E2BIG; /* 'struct verifier_env' can be global, but since it's not small, @@ -1905,7 +2043,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (!env) return -ENOMEM; - env->prog = prog; + env->prog = *prog; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -1937,7 +2075,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (ret < 0) goto skip_full_check; - env->explored_states = kcalloc(prog->len, + env->explored_states = kcalloc(env->prog->len, sizeof(struct verifier_state_list *), GFP_USER); ret = -ENOMEM; @@ -1954,6 +2092,10 @@ skip_full_check: while (pop_stack(env, NULL) >= 0); free_states(env); + if (ret == 0) + /* program is valid, convert *(u32*)(ctx + off) accesses */ + ret = convert_ctx_accesses(env); + if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); /* verifier log exceeded user supplied buffer */ @@ -1969,18 +2111,18 @@ skip_full_check: if (ret == 0 && env->used_map_cnt) { /* if program passed verifier, update used_maps in bpf_prog_info */ - prog->aux->used_maps = kmalloc_array(env->used_map_cnt, - sizeof(env->used_maps[0]), - GFP_KERNEL); + env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, + sizeof(env->used_maps[0]), + GFP_KERNEL); - if (!prog->aux->used_maps) { + if (!env->prog->aux->used_maps) { ret = -ENOMEM; goto free_log_buf; } - memcpy(prog->aux->used_maps, env->used_maps, + memcpy(env->prog->aux->used_maps, env->used_maps, sizeof(env->used_maps[0]) * env->used_map_cnt); - prog->aux->used_map_cnt = env->used_map_cnt; + env->prog->aux->used_map_cnt = env->used_map_cnt; /* program is valid. Convert pseudo bpf_ld_imm64 into generic * bpf_ld_imm64 instructions @@ -1992,11 +2134,12 @@ free_log_buf: if (log_level) vfree(log_buf); free_env: - if (!prog->aux->used_maps) + if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. */ release_maps(env); + *prog = env->prog; kfree(env); mutex_unlock(&bpf_verifier_lock); return ret; diff --git a/kernel/capability.c b/kernel/capability.c index 989f5bfc57dc..45432b54d5c6 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -35,6 +35,7 @@ static int __init file_caps_disable(char *str) } __setup("no_file_caps", file_caps_disable); +#ifdef CONFIG_MULTIUSER /* * More recent versions of libcap are available from: * @@ -386,6 +387,24 @@ bool ns_capable(struct user_namespace *ns, int cap) } EXPORT_SYMBOL(ns_capable); + +/** + * capable - Determine if the current task has a superior capability in effect + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool capable(int cap) +{ + return ns_capable(&init_user_ns, cap); +} +EXPORT_SYMBOL(capable); +#endif /* CONFIG_MULTIUSER */ + /** * file_ns_capable - Determine if the file's opener had a capability in effect * @file: The file we want to check @@ -412,22 +431,6 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns, EXPORT_SYMBOL(file_ns_capable); /** - * capable - Determine if the current task has a superior capability in effect - * @cap: The capability to be tested for - * - * Return true if the current task has the given superior capability currently - * available for use, false if not. - * - * This sets PF_SUPERPRIV on the task if the capability is available on the - * assumption that it's about to be used. - */ -bool capable(int cap) -{ - return ns_capable(&init_user_ns, cap); -} -EXPORT_SYMBOL(capable); - -/** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @inode: The inode in question * @cap: The capability in question diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a220fdb66568..469dd547770c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4196,7 +4196,9 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) static int cgroup_pidlist_show(struct seq_file *s, void *v) { - return seq_printf(s, "%d\n", *(int *)v); + seq_printf(s, "%d\n", *(int *)v); + + return 0; } static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, @@ -5451,7 +5453,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { WARN_ON_ONCE(!rcu_read_lock_held()); - return idr_find(&ss->css_idr, id); + return id > 0 ? idr_find(&ss->css_idr, id) : NULL; } #ifdef CONFIG_CGROUP_DEBUG diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 937ecdfdf258..72d59a1a6eb6 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -39,15 +39,15 @@ void context_tracking_cpu_set(int cpu) } /** - * context_tracking_user_enter - Inform the context tracking that the CPU is going to - * enter userspace mode. + * context_tracking_enter - Inform the context tracking that the CPU is going + * enter user or guest space mode. * * This function must be called right before we switch from the kernel - * to userspace, when it's guaranteed the remaining kernel instructions - * to execute won't use any RCU read side critical section because this - * function sets RCU in extended quiescent state. + * to user or guest space, when it's guaranteed the remaining kernel + * instructions to execute won't use any RCU read side critical section + * because this function sets RCU in extended quiescent state. */ -void context_tracking_user_enter(void) +void context_tracking_enter(enum ctx_state state) { unsigned long flags; @@ -75,9 +75,8 @@ void context_tracking_user_enter(void) WARN_ON_ONCE(!current->mm); local_irq_save(flags); - if ( __this_cpu_read(context_tracking.state) != IN_USER) { + if ( __this_cpu_read(context_tracking.state) != state) { if (__this_cpu_read(context_tracking.active)) { - trace_user_enter(0); /* * At this stage, only low level arch entry code remains and * then we'll run in userspace. We can assume there won't be @@ -85,7 +84,10 @@ void context_tracking_user_enter(void) * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency * on the tick. */ - vtime_user_enter(current); + if (state == CONTEXT_USER) { + trace_user_enter(0); + vtime_user_enter(current); + } rcu_user_enter(); } /* @@ -101,24 +103,32 @@ void context_tracking_user_enter(void) * OTOH we can spare the calls to vtime and RCU when context_tracking.active * is false because we know that CPU is not tickless. */ - __this_cpu_write(context_tracking.state, IN_USER); + __this_cpu_write(context_tracking.state, state); } local_irq_restore(flags); } +NOKPROBE_SYMBOL(context_tracking_enter); +EXPORT_SYMBOL_GPL(context_tracking_enter); + +void context_tracking_user_enter(void) +{ + context_tracking_enter(CONTEXT_USER); +} NOKPROBE_SYMBOL(context_tracking_user_enter); /** - * context_tracking_user_exit - Inform the context tracking that the CPU is - * exiting userspace mode and entering the kernel. + * context_tracking_exit - Inform the context tracking that the CPU is + * exiting user or guest mode and entering the kernel. * - * This function must be called after we entered the kernel from userspace - * before any use of RCU read side critical section. This potentially include - * any high level kernel code like syscalls, exceptions, signal handling, etc... + * This function must be called after we entered the kernel from user or + * guest space before any use of RCU read side critical section. This + * potentially include any high level kernel code like syscalls, exceptions, + * signal handling, etc... * * This call supports re-entrancy. This way it can be called from any exception * handler without needing to know if we came from userspace or not. */ -void context_tracking_user_exit(void) +void context_tracking_exit(enum ctx_state state) { unsigned long flags; @@ -129,20 +139,29 @@ void context_tracking_user_exit(void) return; local_irq_save(flags); - if (__this_cpu_read(context_tracking.state) == IN_USER) { + if (__this_cpu_read(context_tracking.state) == state) { if (__this_cpu_read(context_tracking.active)) { /* * We are going to run code that may use RCU. Inform * RCU core about that (ie: we may need the tick again). */ rcu_user_exit(); - vtime_user_exit(current); - trace_user_exit(0); + if (state == CONTEXT_USER) { + vtime_user_exit(current); + trace_user_exit(0); + } } - __this_cpu_write(context_tracking.state, IN_KERNEL); + __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } local_irq_restore(flags); } +NOKPROBE_SYMBOL(context_tracking_exit); +EXPORT_SYMBOL_GPL(context_tracking_exit); + +void context_tracking_user_exit(void) +{ + context_tracking_exit(CONTEXT_USER); +} NOKPROBE_SYMBOL(context_tracking_user_exit); /** diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c68f0721df10..ee14e3a35a29 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2453,20 +2453,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * @node: is this an allowed node? * @gfp_mask: memory allocation flags * - * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is - * set, yes, we can always allocate. If node is in our task's mems_allowed, - * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest - * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been - * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE - * flag, yes. + * If we're in interrupt, yes, we can always allocate. If @node is set in + * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this + * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, + * yes. If current has access to memory reserves due to TIF_MEMDIE, yes. * Otherwise, no. * - * The __GFP_THISNODE placement logic is really handled elsewhere, - * by forcibly using a zonelist starting at a specified node, and by - * (in get_page_from_freelist()) refusing to consider the zones for - * any node on the zonelist except the first. By the time any such - * calls get to this routine, we should just shut up and say 'yes'. - * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset * unless the task has been OOM killed as is marked TIF_MEMDIE. @@ -2502,7 +2494,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask) int allowed; /* is allocation in zone z allowed? */ unsigned long flags; - if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) + if (in_interrupt()) return 1; if (node_isset(node, current->mems_allowed)) return 1; diff --git a/kernel/cred.c b/kernel/cred.c index e0573a43c7df..ec1c07667ec1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -29,6 +29,9 @@ static struct kmem_cache *cred_jar; +/* init to 2 - one for init_task, one to ensure it is never freed */ +struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; + /* * The initial credentials for the initial task */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 2fabc0627165..81aa3a4ece9f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -34,14 +34,16 @@ #include <linux/syscalls.h> #include <linux/anon_inodes.h> #include <linux/kernel_stat.h> +#include <linux/cgroup.h> #include <linux/perf_event.h> #include <linux/ftrace_event.h> #include <linux/hw_breakpoint.h> #include <linux/mm_types.h> -#include <linux/cgroup.h> #include <linux/module.h> #include <linux/mman.h> #include <linux/compat.h> +#include <linux/bpf.h> +#include <linux/filter.h> #include "internal.h" @@ -153,7 +155,7 @@ enum event_type_t { */ struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); -static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); +static DEFINE_PER_CPU(int, perf_sched_cb_usages); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -327,6 +329,11 @@ static inline u64 perf_clock(void) return local_clock(); } +static inline u64 perf_event_clock(struct perf_event *event) +{ + return event->clock(); +} + static inline struct perf_cpu_context * __get_cpu_context(struct perf_event_context *ctx) { @@ -351,32 +358,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, #ifdef CONFIG_CGROUP_PERF -/* - * perf_cgroup_info keeps track of time_enabled for a cgroup. - * This is a per-cpu dynamically allocated data structure. - */ -struct perf_cgroup_info { - u64 time; - u64 timestamp; -}; - -struct perf_cgroup { - struct cgroup_subsys_state css; - struct perf_cgroup_info __percpu *info; -}; - -/* - * Must ensure cgroup is pinned (css_get) before calling - * this function. In other words, we cannot call this function - * if there is no cgroup event for the current CPU context. - */ -static inline struct perf_cgroup * -perf_cgroup_from_task(struct task_struct *task) -{ - return container_of(task_css(task, perf_event_cgrp_id), - struct perf_cgroup, css); -} - static inline bool perf_cgroup_match(struct perf_event *event) { @@ -905,6 +886,15 @@ static void get_ctx(struct perf_event_context *ctx) WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); } +static void free_ctx(struct rcu_head *head) +{ + struct perf_event_context *ctx; + + ctx = container_of(head, struct perf_event_context, rcu_head); + kfree(ctx->task_ctx_data); + kfree(ctx); +} + static void put_ctx(struct perf_event_context *ctx) { if (atomic_dec_and_test(&ctx->refcount)) { @@ -912,7 +902,7 @@ static void put_ctx(struct perf_event_context *ctx) put_ctx(ctx->parent_ctx); if (ctx->task) put_task_struct(ctx->task); - kfree_rcu(ctx, rcu_head); + call_rcu(&ctx->rcu_head, free_ctx); } } @@ -1239,9 +1229,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) ctx->nr_cgroups++; - if (has_branch_stack(event)) - ctx->nr_branch_stack++; - list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; if (event->attr.inherit_stat) @@ -1408,9 +1395,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) cpuctx->cgrp = NULL; } - if (has_branch_stack(event)) - ctx->nr_branch_stack--; - ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -1847,6 +1831,7 @@ static void perf_set_shadow_time(struct perf_event *event, #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); +static void perf_log_itrace_start(struct perf_event *event); static int event_sched_in(struct perf_event *event, @@ -1881,6 +1866,12 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); + event->tstamp_running += tstamp - event->tstamp_stopped; + + perf_set_shadow_time(event, ctx, tstamp); + + perf_log_itrace_start(event); + if (event->pmu->add(event, PERF_EF_START)) { event->state = PERF_EVENT_STATE_INACTIVE; event->oncpu = -1; @@ -1888,10 +1879,6 @@ event_sched_in(struct perf_event *event, goto out; } - event->tstamp_running += tstamp - event->tstamp_stopped; - - perf_set_shadow_time(event, ctx, tstamp); - if (!is_software_event(event)) cpuctx->active_oncpu++; if (!ctx->nr_active++) @@ -2559,6 +2546,9 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, next->perf_event_ctxp[ctxn] = ctx; ctx->task = next; next_ctx->task = task; + + swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + do_switch = 0; perf_event_sync_stat(ctx, next_ctx); @@ -2577,6 +2567,56 @@ unlock: } } +void perf_sched_cb_dec(struct pmu *pmu) +{ + this_cpu_dec(perf_sched_cb_usages); +} + +void perf_sched_cb_inc(struct pmu *pmu) +{ + this_cpu_inc(perf_sched_cb_usages); +} + +/* + * This function provides the context switch callback to the lower code + * layer. It is invoked ONLY when the context switch callback is enabled. + */ +static void perf_pmu_sched_task(struct task_struct *prev, + struct task_struct *next, + bool sched_in) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + if (prev == next) + return; + + local_irq_save(flags); + + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + if (pmu->sched_task) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + + perf_pmu_disable(pmu); + + pmu->sched_task(cpuctx->task_ctx, sched_in); + + perf_pmu_enable(pmu); + + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); + } + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + #define for_each_task_context_nr(ctxn) \ for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) @@ -2596,6 +2636,9 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(task, next, false); + for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); @@ -2755,64 +2798,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, } /* - * When sampling the branck stack in system-wide, it may be necessary - * to flush the stack on context switch. This happens when the branch - * stack does not tag its entries with the pid of the current task. - * Otherwise it becomes impossible to associate a branch entry with a - * task. This ambiguity is more likely to appear when the branch stack - * supports priv level filtering and the user sets it to monitor only - * at the user level (which could be a useful measurement in system-wide - * mode). In that case, the risk is high of having a branch stack with - * branch from multiple tasks. Flushing may mean dropping the existing - * entries or stashing them somewhere in the PMU specific code layer. - * - * This function provides the context switch callback to the lower code - * layer. It is invoked ONLY when there is at least one system-wide context - * with at least one active event using taken branch sampling. - */ -static void perf_branch_stack_sched_in(struct task_struct *prev, - struct task_struct *task) -{ - struct perf_cpu_context *cpuctx; - struct pmu *pmu; - unsigned long flags; - - /* no need to flush branch stack if not changing task */ - if (prev == task) - return; - - local_irq_save(flags); - - rcu_read_lock(); - - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - /* - * check if the context has at least one - * event using PERF_SAMPLE_BRANCH_STACK - */ - if (cpuctx->ctx.nr_branch_stack > 0 - && pmu->flush_branch_stack) { - - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - - perf_pmu_disable(pmu); - - pmu->flush_branch_stack(); - - perf_pmu_enable(pmu); - - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } - } - - rcu_read_unlock(); - - local_irq_restore(flags); -} - -/* * Called from scheduler to add the events of the current task * with interrupts disabled. * @@ -2844,9 +2829,8 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_in(prev, task); - /* check for system-wide branch_stack events */ - if (atomic_read(this_cpu_ptr(&perf_branch_stack_events))) - perf_branch_stack_sched_in(prev, task); + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -3220,7 +3204,10 @@ static void __perf_event_read(void *info) static inline u64 perf_event_count(struct perf_event *event) { - return local64_read(&event->count) + atomic64_read(&event->child_count); + if (event->pmu->count) + return event->pmu->count(event); + + return __perf_event_count(event); } static u64 perf_event_read(struct perf_event *event) @@ -3321,12 +3308,15 @@ errout: * Returns a matching context with refcount and pincount. */ static struct perf_event_context * -find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) +find_get_context(struct pmu *pmu, struct task_struct *task, + struct perf_event *event) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_cpu_context *cpuctx; + void *task_ctx_data = NULL; unsigned long flags; int ctxn, err; + int cpu = event->cpu; if (!task) { /* Must be root to operate on a CPU event: */ @@ -3354,11 +3344,24 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) if (ctxn < 0) goto errout; + if (event->attach_state & PERF_ATTACH_TASK_DATA) { + task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); + if (!task_ctx_data) { + err = -ENOMEM; + goto errout; + } + } + retry: ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { clone_ctx = unclone_ctx(ctx); ++ctx->pin_count; + + if (task_ctx_data && !ctx->task_ctx_data) { + ctx->task_ctx_data = task_ctx_data; + task_ctx_data = NULL; + } raw_spin_unlock_irqrestore(&ctx->lock, flags); if (clone_ctx) @@ -3369,6 +3372,11 @@ retry: if (!ctx) goto errout; + if (task_ctx_data) { + ctx->task_ctx_data = task_ctx_data; + task_ctx_data = NULL; + } + err = 0; mutex_lock(&task->perf_event_mutex); /* @@ -3395,13 +3403,16 @@ retry: } } + kfree(task_ctx_data); return ctx; errout: + kfree(task_ctx_data); return ERR_PTR(err); } static void perf_event_free_filter(struct perf_event *event); +static void perf_event_free_bpf_prog(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { @@ -3411,10 +3422,10 @@ static void free_event_rcu(struct rcu_head *head) if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); + perf_event_free_bpf_prog(event); kfree(event); } -static void ring_buffer_put(struct ring_buffer *rb); static void ring_buffer_attach(struct perf_event *event, struct ring_buffer *rb); @@ -3423,10 +3434,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) if (event->parent) return; - if (has_branch_stack(event)) { - if (!(event->attach_state & PERF_ATTACH_TASK)) - atomic_dec(&per_cpu(perf_branch_stack_events, cpu)); - } if (is_cgroup_event(event)) atomic_dec(&per_cpu(perf_cgroup_events, cpu)); } @@ -3454,6 +3461,91 @@ static void unaccount_event(struct perf_event *event) unaccount_event_cpu(event, event->cpu); } +/* + * The following implement mutual exclusion of events on "exclusive" pmus + * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled + * at a time, so we disallow creating events that might conflict, namely: + * + * 1) cpu-wide events in the presence of per-task events, + * 2) per-task events in the presence of cpu-wide events, + * 3) two matching events on the same context. + * + * The former two cases are handled in the allocation path (perf_event_alloc(), + * __free_event()), the latter -- before the first perf_install_in_context(). + */ +static int exclusive_event_init(struct perf_event *event) +{ + struct pmu *pmu = event->pmu; + + if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + return 0; + + /* + * Prevent co-existence of per-task and cpu-wide events on the + * same exclusive pmu. + * + * Negative pmu::exclusive_cnt means there are cpu-wide + * events on this "exclusive" pmu, positive means there are + * per-task events. + * + * Since this is called in perf_event_alloc() path, event::ctx + * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK + * to mean "per-task event", because unlike other attach states it + * never gets cleared. + */ + if (event->attach_state & PERF_ATTACH_TASK) { + if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) + return -EBUSY; + } else { + if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) + return -EBUSY; + } + + return 0; +} + +static void exclusive_event_destroy(struct perf_event *event) +{ + struct pmu *pmu = event->pmu; + + if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + return; + + /* see comment in exclusive_event_init() */ + if (event->attach_state & PERF_ATTACH_TASK) + atomic_dec(&pmu->exclusive_cnt); + else + atomic_inc(&pmu->exclusive_cnt); +} + +static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) +{ + if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && + (e1->cpu == e2->cpu || + e1->cpu == -1 || + e2->cpu == -1)) + return true; + return false; +} + +/* Called under the same ctx::mutex as perf_install_in_context() */ +static bool exclusive_event_installable(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *iter_event; + struct pmu *pmu = event->pmu; + + if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + return true; + + list_for_each_entry(iter_event, &ctx->event_list, event_entry) { + if (exclusive_event_match(iter_event, event)) + return false; + } + + return true; +} + static void __free_event(struct perf_event *event) { if (!event->parent) { @@ -3467,8 +3559,10 @@ static void __free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); - if (event->pmu) + if (event->pmu) { + exclusive_event_destroy(event); module_put(event->pmu->module); + } call_rcu(&event->rcu_head, free_event_rcu); } @@ -3927,6 +4021,7 @@ static inline int perf_fget_light(int fd, struct fd *p) static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { @@ -3980,6 +4075,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); + case PERF_EVENT_IOC_SET_BPF: + return perf_event_set_bpf_prog(event, arg); + default: return -ENOTTY; } @@ -4096,6 +4194,8 @@ static void perf_event_init_userpage(struct perf_event *event) /* Allow new userspace to detect that bit 0 is deprecated */ userpg->cap_bit0_is_deprecated = 1; userpg->size = offsetof(struct perf_event_mmap_page, __reserved); + userpg->data_offset = PAGE_SIZE; + userpg->data_size = perf_data_size(rb); unlock: rcu_read_unlock(); @@ -4263,7 +4363,7 @@ static void rb_free_rcu(struct rcu_head *rcu_head) rb_free(rb); } -static struct ring_buffer *ring_buffer_get(struct perf_event *event) +struct ring_buffer *ring_buffer_get(struct perf_event *event) { struct ring_buffer *rb; @@ -4278,7 +4378,7 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) return rb; } -static void ring_buffer_put(struct ring_buffer *rb) +void ring_buffer_put(struct ring_buffer *rb) { if (!atomic_dec_and_test(&rb->refcount)) return; @@ -4295,6 +4395,9 @@ static void perf_mmap_open(struct vm_area_struct *vma) atomic_inc(&event->mmap_count); atomic_inc(&event->rb->mmap_count); + if (vma->vm_pgoff) + atomic_inc(&event->rb->aux_mmap_count); + if (event->pmu->event_mapped) event->pmu->event_mapped(event); } @@ -4319,6 +4422,20 @@ static void perf_mmap_close(struct vm_area_struct *vma) if (event->pmu->event_unmapped) event->pmu->event_unmapped(event); + /* + * rb->aux_mmap_count will always drop before rb->mmap_count and + * event->mmap_count, so it is ok to use event->mmap_mutex to + * serialize with perf_mmap here. + */ + if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && + atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); + vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + + rb_free_aux(rb); + mutex_unlock(&event->mmap_mutex); + } + atomic_dec(&rb->mmap_count); if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) @@ -4392,7 +4509,7 @@ out_put: static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, - .close = perf_mmap_close, + .close = perf_mmap_close, /* non mergable */ .fault = perf_mmap_fault, .page_mkwrite = perf_mmap_fault, }; @@ -4403,10 +4520,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; - struct ring_buffer *rb; + struct ring_buffer *rb = NULL; unsigned long vma_size; unsigned long nr_pages; - long user_extra, extra; + long user_extra = 0, extra = 0; int ret = 0, flags = 0; /* @@ -4421,7 +4538,66 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; vma_size = vma->vm_end - vma->vm_start; - nr_pages = (vma_size / PAGE_SIZE) - 1; + + if (vma->vm_pgoff == 0) { + nr_pages = (vma_size / PAGE_SIZE) - 1; + } else { + /* + * AUX area mapping: if rb->aux_nr_pages != 0, it's already + * mapped, all subsequent mappings should have the same size + * and offset. Must be above the normal perf buffer. + */ + u64 aux_offset, aux_size; + + if (!event->rb) + return -EINVAL; + + nr_pages = vma_size / PAGE_SIZE; + + mutex_lock(&event->mmap_mutex); + ret = -EINVAL; + + rb = event->rb; + if (!rb) + goto aux_unlock; + + aux_offset = ACCESS_ONCE(rb->user_page->aux_offset); + aux_size = ACCESS_ONCE(rb->user_page->aux_size); + + if (aux_offset < perf_data_size(rb) + PAGE_SIZE) + goto aux_unlock; + + if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) + goto aux_unlock; + + /* already mapped with a different offset */ + if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) + goto aux_unlock; + + if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) + goto aux_unlock; + + /* already mapped with a different size */ + if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) + goto aux_unlock; + + if (!is_power_of_2(nr_pages)) + goto aux_unlock; + + if (!atomic_inc_not_zero(&rb->mmap_count)) + goto aux_unlock; + + if (rb_has_aux(rb)) { + atomic_inc(&rb->aux_mmap_count); + ret = 0; + goto unlock; + } + + atomic_set(&rb->aux_mmap_count, 1); + user_extra = nr_pages; + + goto accounting; + } /* * If we have rb pages ensure they're a power-of-two number, so we @@ -4433,9 +4609,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma_size != PAGE_SIZE * (1 + nr_pages)) return -EINVAL; - if (vma->vm_pgoff != 0) - return -EINVAL; - WARN_ON_ONCE(event->ctx->parent_ctx); again: mutex_lock(&event->mmap_mutex); @@ -4459,6 +4632,8 @@ again: } user_extra = nr_pages + 1; + +accounting: user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* @@ -4468,7 +4643,6 @@ again: user_locked = atomic_long_read(&user->locked_vm) + user_extra; - extra = 0; if (user_locked > user_lock_limit) extra = user_locked - user_lock_limit; @@ -4482,35 +4656,46 @@ again: goto unlock; } - WARN_ON(event->rb); + WARN_ON(!rb && event->rb); if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; - rb = rb_alloc(nr_pages, - event->attr.watermark ? event->attr.wakeup_watermark : 0, - event->cpu, flags); - if (!rb) { - ret = -ENOMEM; - goto unlock; - } + rb = rb_alloc(nr_pages, + event->attr.watermark ? event->attr.wakeup_watermark : 0, + event->cpu, flags); - atomic_set(&rb->mmap_count, 1); - rb->mmap_locked = extra; - rb->mmap_user = get_current_user(); + if (!rb) { + ret = -ENOMEM; + goto unlock; + } - atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->pinned_vm += extra; + atomic_set(&rb->mmap_count, 1); + rb->mmap_user = get_current_user(); + rb->mmap_locked = extra; - ring_buffer_attach(event, rb); + ring_buffer_attach(event, rb); - perf_event_init_userpage(event); - perf_event_update_userpage(event); + perf_event_init_userpage(event); + perf_event_update_userpage(event); + } else { + ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, + event->attr.aux_watermark, flags); + if (!ret) + rb->aux_mmap_locked = extra; + } unlock: - if (!ret) + if (!ret) { + atomic_long_add(user_extra, &user->locked_vm); + vma->vm_mm->pinned_vm += extra; + atomic_inc(&event->mmap_count); + } else if (rb) { + atomic_dec(&rb->mmap_count); + } +aux_unlock: mutex_unlock(&event->mmap_mutex); /* @@ -4766,7 +4951,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header, } if (sample_type & PERF_SAMPLE_TIME) - data->time = perf_clock(); + data->time = perf_event_clock(event); if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) data->id = primary_event_id(event); @@ -5344,6 +5529,8 @@ static void perf_event_task_output(struct perf_event *event, task_event->event_id.tid = perf_event_tid(event, task); task_event->event_id.ptid = perf_event_tid(event, current); + task_event->event_id.time = perf_event_clock(event); + perf_output_put(&handle, task_event->event_id); perf_event__output_id_sample(event, &handle, &sample); @@ -5377,7 +5564,7 @@ static void perf_event_task(struct task_struct *task, /* .ppid */ /* .tid */ /* .ptid */ - .time = perf_clock(), + /* .time */ }, }; @@ -5732,6 +5919,40 @@ void perf_event_mmap(struct vm_area_struct *vma) perf_event_mmap_event(&mmap_event); } +void perf_event_aux_event(struct perf_event *event, unsigned long head, + unsigned long size, u64 flags) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct perf_aux_event { + struct perf_event_header header; + u64 offset; + u64 size; + u64 flags; + } rec = { + .header = { + .type = PERF_RECORD_AUX, + .misc = 0, + .size = sizeof(rec), + }, + .offset = head, + .size = size, + .flags = flags, + }; + int ret; + + perf_event_header__init_id(&rec.header, &sample, event); + ret = perf_output_begin(&handle, event, rec.header.size); + + if (ret) + return; + + perf_output_put(&handle, rec); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + /* * IRQ throttle logging */ @@ -5753,7 +5974,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) .misc = 0, .size = sizeof(throttle_event), }, - .time = perf_clock(), + .time = perf_event_clock(event), .id = primary_event_id(event), .stream_id = event->id, }; @@ -5773,6 +5994,44 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +static void perf_log_itrace_start(struct perf_event *event) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct perf_aux_event { + struct perf_event_header header; + u32 pid; + u32 tid; + } rec; + int ret; + + if (event->parent) + event = event->parent; + + if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || + event->hw.itrace_started) + return; + + event->hw.itrace_started = 1; + + rec.header.type = PERF_RECORD_ITRACE_START; + rec.header.misc = 0; + rec.header.size = sizeof(rec); + rec.pid = perf_event_pid(event, current); + rec.tid = perf_event_tid(event, current); + + perf_event_header__init_id(&rec.header, &sample, event); + ret = perf_output_begin(&handle, event, rec.header.size); + + if (ret) + return; + + perf_output_put(&handle, rec); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + /* * Generic event overflow handling, sampling. */ @@ -6133,6 +6392,7 @@ static int perf_swevent_add(struct perf_event *event, int flags) } hlist_add_head_rcu(&event->hlist_entry, head); + perf_event_update_userpage(event); return 0; } @@ -6296,6 +6556,8 @@ static int perf_swevent_init(struct perf_event *event) static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = perf_swevent_init, .add = perf_swevent_add, .del = perf_swevent_del, @@ -6449,6 +6711,49 @@ static void perf_event_free_filter(struct perf_event *event) ftrace_profile_free_filter(event); } +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +{ + struct bpf_prog *prog; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + + if (event->tp_event->prog) + return -EEXIST; + + if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) + /* bpf programs can only be attached to kprobes */ + return -EINVAL; + + prog = bpf_prog_get(prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->type != BPF_PROG_TYPE_KPROBE) { + /* valid fd, but invalid bpf program type */ + bpf_prog_put(prog); + return -EINVAL; + } + + event->tp_event->prog = prog; + + return 0; +} + +static void perf_event_free_bpf_prog(struct perf_event *event) +{ + struct bpf_prog *prog; + + if (!event->tp_event) + return; + + prog = event->tp_event->prog; + if (prog) { + event->tp_event->prog = NULL; + bpf_prog_put(prog); + } +} + #else static inline void perf_tp_register(void) @@ -6464,6 +6769,14 @@ static void perf_event_free_filter(struct perf_event *event) { } +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +{ + return -ENOENT; +} + +static void perf_event_free_bpf_prog(struct perf_event *event) +{ +} #endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT @@ -6602,6 +6915,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) cpu_clock_event_start(event, flags); + perf_event_update_userpage(event); return 0; } @@ -6638,6 +6952,8 @@ static int cpu_clock_event_init(struct perf_event *event) static struct pmu perf_cpu_clock = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = cpu_clock_event_init, .add = cpu_clock_event_add, .del = cpu_clock_event_del, @@ -6676,6 +6992,7 @@ static int task_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) task_clock_event_start(event, flags); + perf_event_update_userpage(event); return 0; } @@ -6716,6 +7033,8 @@ static int task_clock_event_init(struct perf_event *event) static struct pmu perf_task_clock = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = task_clock_event_init, .add = task_clock_event_add, .del = task_clock_event_del, @@ -6993,6 +7312,7 @@ got_cpu_context: pmu->event_idx = perf_event_idx_default; list_add_rcu(&pmu->entry, &pmus); + atomic_set(&pmu->exclusive_cnt, 0); ret = 0; unlock: mutex_unlock(&pmus_lock); @@ -7037,12 +7357,23 @@ EXPORT_SYMBOL_GPL(perf_pmu_unregister); static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) { + struct perf_event_context *ctx = NULL; int ret; if (!try_module_get(pmu->module)) return -ENODEV; + + if (event->group_leader != event) { + ctx = perf_event_ctx_lock(event->group_leader); + BUG_ON(!ctx); + } + event->pmu = pmu; ret = pmu->event_init(event); + + if (ctx) + perf_event_ctx_unlock(event->group_leader, ctx); + if (ret) module_put(pmu->module); @@ -7089,10 +7420,6 @@ static void account_event_cpu(struct perf_event *event, int cpu) if (event->parent) return; - if (has_branch_stack(event)) { - if (!(event->attach_state & PERF_ATTACH_TASK)) - atomic_inc(&per_cpu(perf_branch_stack_events, cpu)); - } if (is_cgroup_event(event)) atomic_inc(&per_cpu(perf_cgroup_events, cpu)); } @@ -7131,7 +7458,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, struct perf_event *group_leader, struct perf_event *parent_event, perf_overflow_handler_t overflow_handler, - void *context) + void *context, int cgroup_fd) { struct pmu *pmu; struct perf_event *event; @@ -7186,18 +7513,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (task) { event->attach_state = PERF_ATTACH_TASK; - - if (attr->type == PERF_TYPE_TRACEPOINT) - event->hw.tp_target = task; -#ifdef CONFIG_HAVE_HW_BREAKPOINT /* - * hw_breakpoint is a bit difficult here.. + * XXX pmu::event_init needs to know what task to account to + * and we cannot use the ctx information because we need the + * pmu before we get a ctx. */ - else if (attr->type == PERF_TYPE_BREAKPOINT) - event->hw.bp_target = task; -#endif + event->hw.target = task; } + event->clock = &local_clock; + if (parent_event) + event->clock = parent_event->clock; + if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; @@ -7224,6 +7551,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto err_ns; + if (!has_branch_stack(event)) + event->attr.branch_sample_type = 0; + + if (cgroup_fd != -1) { + err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); + if (err) + goto err_ns; + } + pmu = perf_init_event(event); if (!pmu) goto err_ns; @@ -7232,21 +7568,30 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_ns; } + err = exclusive_event_init(event); + if (err) + goto err_pmu; + if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(); if (err) - goto err_pmu; + goto err_per_task; } } return event; +err_per_task: + exclusive_event_destroy(event); + err_pmu: if (event->destroy) event->destroy(event); module_put(pmu->module); err_ns: + if (is_cgroup_event(event)) + perf_detach_cgroup(event); if (event->ns) put_pid_ns(event->ns); kfree(event); @@ -7409,6 +7754,19 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) if (output_event->cpu == -1 && output_event->ctx != event->ctx) goto out; + /* + * Mixing clocks in the same buffer is trouble you don't need. + */ + if (output_event->clock != event->clock) + goto out; + + /* + * If both events generate aux data, they must be on the same PMU + */ + if (has_aux(event) && has_aux(output_event) && + event->pmu != output_event->pmu) + goto out; + set: mutex_lock(&event->mmap_mutex); /* Can't redirect output if we've got an active mmap() */ @@ -7441,6 +7799,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b) mutex_lock_nested(b, SINGLE_DEPTH_NESTING); } +static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) +{ + bool nmi_safe = false; + + switch (clk_id) { + case CLOCK_MONOTONIC: + event->clock = &ktime_get_mono_fast_ns; + nmi_safe = true; + break; + + case CLOCK_MONOTONIC_RAW: + event->clock = &ktime_get_raw_fast_ns; + nmi_safe = true; + break; + + case CLOCK_REALTIME: + event->clock = &ktime_get_real_ns; + break; + + case CLOCK_BOOTTIME: + event->clock = &ktime_get_boot_ns; + break; + + case CLOCK_TAI: + event->clock = &ktime_get_tai_ns; + break; + + default: + return -EINVAL; + } + + if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) + return -EINVAL; + + return 0; +} + /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * @@ -7465,6 +7860,7 @@ SYSCALL_DEFINE5(perf_event_open, int move_group = 0; int err; int f_flags = O_RDWR; + int cgroup_fd = -1; /* for future expandability... */ if (flags & ~PERF_FLAG_ALL) @@ -7530,21 +7926,16 @@ SYSCALL_DEFINE5(perf_event_open, get_online_cpus(); + if (flags & PERF_FLAG_PID_CGROUP) + cgroup_fd = pid; + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, - NULL, NULL); + NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); goto err_cpus; } - if (flags & PERF_FLAG_PID_CGROUP) { - err = perf_cgroup_connect(pid, event, &attr, group_leader); - if (err) { - __free_event(event); - goto err_cpus; - } - } - if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { err = -ENOTSUPP; @@ -7560,6 +7951,12 @@ SYSCALL_DEFINE5(perf_event_open, */ pmu = event->pmu; + if (attr.use_clockid) { + err = perf_event_set_clock(event, attr.clockid); + if (err) + goto err_alloc; + } + if (group_leader && (is_software_event(event) != is_software_event(group_leader))) { if (is_software_event(event)) { @@ -7586,12 +7983,17 @@ SYSCALL_DEFINE5(perf_event_open, /* * Get the target context (task or percpu): */ - ctx = find_get_context(pmu, task, event->cpu); + ctx = find_get_context(pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; } + if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) { + err = -EBUSY; + goto err_context; + } + if (task) { put_task_struct(task); task = NULL; @@ -7609,6 +8011,11 @@ SYSCALL_DEFINE5(perf_event_open, */ if (group_leader->group_leader != group_leader) goto err_context; + + /* All events in a group should have the same clock */ + if (group_leader->clock != event->clock) + goto err_context; + /* * Do not allow to attach to a group in a different * task or CPU context: @@ -7709,6 +8116,13 @@ SYSCALL_DEFINE5(perf_event_open, get_ctx(ctx); } + if (!exclusive_event_installable(event, ctx)) { + err = -EBUSY; + mutex_unlock(&ctx->mutex); + fput(event_file); + goto err_context; + } + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); @@ -7781,7 +8195,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, */ event = perf_event_alloc(attr, cpu, task, NULL, NULL, - overflow_handler, context); + overflow_handler, context, -1); if (IS_ERR(event)) { err = PTR_ERR(event); goto err; @@ -7792,7 +8206,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, account_event(event); - ctx = find_get_context(event->pmu, task, cpu); + ctx = find_get_context(event->pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_free; @@ -7800,6 +8214,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); + if (!exclusive_event_installable(event, ctx)) { + mutex_unlock(&ctx->mutex); + perf_unpin_context(ctx); + put_ctx(ctx); + err = -EBUSY; + goto err_free; + } + perf_install_in_context(ctx, event, cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); @@ -8142,7 +8564,7 @@ inherit_event(struct perf_event *parent_event, parent_event->cpu, child, group_leader, parent_event, - NULL, NULL); + NULL, NULL, -1); if (IS_ERR(child_event)) return child_event; diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9803a6600d49..92ce5f4ccc26 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -116,12 +116,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) */ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { - struct task_struct *tsk = bp->hw.bp_target; + struct task_struct *tsk = bp->hw.target; struct perf_event *iter; int count = 0; list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.bp_target == tsk && + if (iter->hw.target == tsk && find_slot_idx(iter) == type && (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); @@ -153,7 +153,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, int nr; nr = info->cpu_pinned; - if (!bp->hw.bp_target) + if (!bp->hw.target) nr += max_task_bp_pinned(cpu, type); else nr += task_bp_pinned(cpu, bp, type); @@ -210,7 +210,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, weight = -weight; /* Pinned counter cpu profiling */ - if (!bp->hw.bp_target) { + if (!bp->hw.target) { get_bp_info(bp->cpu, type)->cpu_pinned += weight; return; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 569b218782ad..9f6ce9ba4a04 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -27,6 +27,7 @@ struct ring_buffer { local_t lost; /* nr records lost */ long watermark; /* wakeup watermark */ + long aux_watermark; /* poll crap */ spinlock_t event_lock; struct list_head event_list; @@ -35,6 +36,20 @@ struct ring_buffer { unsigned long mmap_locked; struct user_struct *mmap_user; + /* AUX area */ + local_t aux_head; + local_t aux_nest; + local_t aux_wakeup; + unsigned long aux_pgoff; + int aux_nr_pages; + int aux_overwrite; + atomic_t aux_mmap_count; + unsigned long aux_mmap_locked; + void (*free_aux)(void *); + atomic_t aux_refcount; + void **aux_pages; + void *aux_priv; + struct perf_event_mmap_page *user_page; void *data_pages[0]; }; @@ -43,6 +58,19 @@ extern void rb_free(struct ring_buffer *rb); extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); +extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, + pgoff_t pgoff, int nr_pages, long watermark, int flags); +extern void rb_free_aux(struct ring_buffer *rb); +extern struct ring_buffer *ring_buffer_get(struct perf_event *event); +extern void ring_buffer_put(struct ring_buffer *rb); + +static inline bool rb_has_aux(struct ring_buffer *rb) +{ + return !!rb->aux_nr_pages; +} + +void perf_event_aux_event(struct perf_event *event, unsigned long head, + unsigned long size, u64 flags); extern void perf_event_header__init_id(struct perf_event_header *header, @@ -81,6 +109,11 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } +static inline unsigned long perf_aux_size(struct ring_buffer *rb) +{ + return rb->aux_nr_pages << PAGE_SHIFT; +} + #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ static inline unsigned long \ func_name(struct perf_output_handle *handle, \ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index eadb95ce7aac..232f00f273cb 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -243,14 +243,317 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) spin_lock_init(&rb->event_lock); } +/* + * This is called before hardware starts writing to the AUX area to + * obtain an output handle and make sure there's room in the buffer. + * When the capture completes, call perf_aux_output_end() to commit + * the recorded data to the buffer. + * + * The ordering is similar to that of perf_output_{begin,end}, with + * the exception of (B), which should be taken care of by the pmu + * driver, since ordering rules will differ depending on hardware. + */ +void *perf_aux_output_begin(struct perf_output_handle *handle, + struct perf_event *event) +{ + struct perf_event *output_event = event; + unsigned long aux_head, aux_tail; + struct ring_buffer *rb; + + if (output_event->parent) + output_event = output_event->parent; + + /* + * Since this will typically be open across pmu::add/pmu::del, we + * grab ring_buffer's refcount instead of holding rcu read lock + * to make sure it doesn't disappear under us. + */ + rb = ring_buffer_get(output_event); + if (!rb) + return NULL; + + if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount)) + goto err; + + /* + * Nesting is not supported for AUX area, make sure nested + * writers are caught early + */ + if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) + goto err_put; + + aux_head = local_read(&rb->aux_head); + + handle->rb = rb; + handle->event = event; + handle->head = aux_head; + handle->size = 0; + + /* + * In overwrite mode, AUX data stores do not depend on aux_tail, + * therefore (A) control dependency barrier does not exist. The + * (B) <-> (C) ordering is still observed by the pmu driver. + */ + if (!rb->aux_overwrite) { + aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); + handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark; + if (aux_head - aux_tail < perf_aux_size(rb)) + handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); + + /* + * handle->size computation depends on aux_tail load; this forms a + * control dependency barrier separating aux_tail load from aux data + * store that will be enabled on successful return + */ + if (!handle->size) { /* A, matches D */ + event->pending_disable = 1; + perf_output_wakeup(handle); + local_set(&rb->aux_nest, 0); + goto err_put; + } + } + + return handle->rb->aux_priv; + +err_put: + rb_free_aux(rb); + +err: + ring_buffer_put(rb); + handle->event = NULL; + + return NULL; +} + +/* + * Commit the data written by hardware into the ring buffer by adjusting + * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the + * pmu driver's responsibility to observe ordering rules of the hardware, + * so that all the data is externally visible before this is called. + */ +void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, + bool truncated) +{ + struct ring_buffer *rb = handle->rb; + unsigned long aux_head; + u64 flags = 0; + + if (truncated) + flags |= PERF_AUX_FLAG_TRUNCATED; + + /* in overwrite mode, driver provides aux_head via handle */ + if (rb->aux_overwrite) { + flags |= PERF_AUX_FLAG_OVERWRITE; + + aux_head = handle->head; + local_set(&rb->aux_head, aux_head); + } else { + aux_head = local_read(&rb->aux_head); + local_add(size, &rb->aux_head); + } + + if (size || flags) { + /* + * Only send RECORD_AUX if we have something useful to communicate + */ + + perf_event_aux_event(handle->event, aux_head, size, flags); + } + + aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); + + if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + perf_output_wakeup(handle); + local_add(rb->aux_watermark, &rb->aux_wakeup); + } + handle->event = NULL; + + local_set(&rb->aux_nest, 0); + rb_free_aux(rb); + ring_buffer_put(rb); +} + +/* + * Skip over a given number of bytes in the AUX buffer, due to, for example, + * hardware's alignment constraints. + */ +int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) +{ + struct ring_buffer *rb = handle->rb; + unsigned long aux_head; + + if (size > handle->size) + return -ENOSPC; + + local_add(size, &rb->aux_head); + + aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); + if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + perf_output_wakeup(handle); + local_add(rb->aux_watermark, &rb->aux_wakeup); + handle->wakeup = local_read(&rb->aux_wakeup) + + rb->aux_watermark; + } + + handle->head = aux_head; + handle->size -= size; + + return 0; +} + +void *perf_get_aux(struct perf_output_handle *handle) +{ + /* this is only valid between perf_aux_output_begin and *_end */ + if (!handle->event) + return NULL; + + return handle->rb->aux_priv; +} + +#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) + +static struct page *rb_alloc_aux_page(int node, int order) +{ + struct page *page; + + if (order > MAX_ORDER) + order = MAX_ORDER; + + do { + page = alloc_pages_node(node, PERF_AUX_GFP, order); + } while (!page && order--); + + if (page && order) { + /* + * Communicate the allocation size to the driver + */ + split_page(page, order); + SetPagePrivate(page); + set_page_private(page, order); + } + + return page; +} + +static void rb_free_aux_page(struct ring_buffer *rb, int idx) +{ + struct page *page = virt_to_page(rb->aux_pages[idx]); + + ClearPagePrivate(page); + page->mapping = NULL; + __free_page(page); +} + +int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, + pgoff_t pgoff, int nr_pages, long watermark, int flags) +{ + bool overwrite = !(flags & RING_BUFFER_WRITABLE); + int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); + int ret = -ENOMEM, max_order = 0; + + if (!has_aux(event)) + return -ENOTSUPP; + + if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) { + /* + * We need to start with the max_order that fits in nr_pages, + * not the other way around, hence ilog2() and not get_order. + */ + max_order = ilog2(nr_pages); + + /* + * PMU requests more than one contiguous chunks of memory + * for SW double buffering + */ + if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) && + !overwrite) { + if (!max_order) + return -EINVAL; + + max_order--; + } + } + + rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node); + if (!rb->aux_pages) + return -ENOMEM; + + rb->free_aux = event->pmu->free_aux; + for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) { + struct page *page; + int last, order; + + order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages)); + page = rb_alloc_aux_page(node, order); + if (!page) + goto out; + + for (last = rb->aux_nr_pages + (1 << page_private(page)); + last > rb->aux_nr_pages; rb->aux_nr_pages++) + rb->aux_pages[rb->aux_nr_pages] = page_address(page++); + } + + rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, + overwrite); + if (!rb->aux_priv) + goto out; + + ret = 0; + + /* + * aux_pages (and pmu driver's private data, aux_priv) will be + * referenced in both producer's and consumer's contexts, thus + * we keep a refcount here to make sure either of the two can + * reference them safely. + */ + atomic_set(&rb->aux_refcount, 1); + + rb->aux_overwrite = overwrite; + rb->aux_watermark = watermark; + + if (!rb->aux_watermark && !rb->aux_overwrite) + rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1); + +out: + if (!ret) + rb->aux_pgoff = pgoff; + else + rb_free_aux(rb); + + return ret; +} + +static void __rb_free_aux(struct ring_buffer *rb) +{ + int pg; + + if (rb->aux_priv) { + rb->free_aux(rb->aux_priv); + rb->free_aux = NULL; + rb->aux_priv = NULL; + } + + for (pg = 0; pg < rb->aux_nr_pages; pg++) + rb_free_aux_page(rb, pg); + + kfree(rb->aux_pages); + rb->aux_nr_pages = 0; +} + +void rb_free_aux(struct ring_buffer *rb) +{ + if (atomic_dec_and_test(&rb->aux_refcount)) + __rb_free_aux(rb); +} + #ifndef CONFIG_PERF_USE_VMALLOC /* * Back perf_mmap() with regular GFP_KERNEL-0 pages. */ -struct page * -perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +static struct page * +__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { if (pgoff > rb->nr_pages) return NULL; @@ -340,8 +643,8 @@ static int data_page_nr(struct ring_buffer *rb) return rb->nr_pages << page_order(rb); } -struct page * -perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +static struct page * +__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { /* The '>' counts in the user page. */ if (pgoff > data_page_nr(rb)) @@ -416,3 +719,19 @@ fail: } #endif + +struct page * +perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +{ + if (rb->aux_nr_pages) { + /* above AUX space */ + if (pgoff > rb->aux_pgoff + rb->aux_nr_pages) + return NULL; + + /* AUX space */ + if (pgoff >= rb->aux_pgoff) + return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]); + } + + return __perf_mmap_to_page(rb, pgoff); +} diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 83d4382f5699..6873bb3e6b7e 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -20,145 +20,10 @@ #include <linux/types.h> #include <linux/fs_struct.h> - -static void default_handler(int, struct pt_regs *); - -static struct exec_domain *exec_domains = &default_exec_domain; -static DEFINE_RWLOCK(exec_domains_lock); - - -static unsigned long ident_map[32] = { - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31 -}; - -struct exec_domain default_exec_domain = { - .name = "Linux", /* name */ - .handler = default_handler, /* lcall7 causes a seg fault. */ - .pers_low = 0, /* PER_LINUX personality. */ - .pers_high = 0, /* PER_LINUX personality. */ - .signal_map = ident_map, /* Identity map signals. */ - .signal_invmap = ident_map, /* - both ways. */ -}; - - -static void -default_handler(int segment, struct pt_regs *regp) -{ - set_personality(0); - - if (current_thread_info()->exec_domain->handler != default_handler) - current_thread_info()->exec_domain->handler(segment, regp); - else - send_sig(SIGSEGV, current, 1); -} - -static struct exec_domain * -lookup_exec_domain(unsigned int personality) -{ - unsigned int pers = personality(personality); - struct exec_domain *ep; - - read_lock(&exec_domains_lock); - for (ep = exec_domains; ep; ep = ep->next) { - if (pers >= ep->pers_low && pers <= ep->pers_high) - if (try_module_get(ep->module)) - goto out; - } - -#ifdef CONFIG_MODULES - read_unlock(&exec_domains_lock); - request_module("personality-%d", pers); - read_lock(&exec_domains_lock); - - for (ep = exec_domains; ep; ep = ep->next) { - if (pers >= ep->pers_low && pers <= ep->pers_high) - if (try_module_get(ep->module)) - goto out; - } -#endif - - ep = &default_exec_domain; -out: - read_unlock(&exec_domains_lock); - return ep; -} - -int -register_exec_domain(struct exec_domain *ep) -{ - struct exec_domain *tmp; - int err = -EBUSY; - - if (ep == NULL) - return -EINVAL; - - if (ep->next != NULL) - return -EBUSY; - - write_lock(&exec_domains_lock); - for (tmp = exec_domains; tmp; tmp = tmp->next) { - if (tmp == ep) - goto out; - } - - ep->next = exec_domains; - exec_domains = ep; - err = 0; - -out: - write_unlock(&exec_domains_lock); - return err; -} -EXPORT_SYMBOL(register_exec_domain); - -int -unregister_exec_domain(struct exec_domain *ep) -{ - struct exec_domain **epp; - - epp = &exec_domains; - write_lock(&exec_domains_lock); - for (epp = &exec_domains; *epp; epp = &(*epp)->next) { - if (ep == *epp) - goto unregister; - } - write_unlock(&exec_domains_lock); - return -EINVAL; - -unregister: - *epp = ep->next; - ep->next = NULL; - write_unlock(&exec_domains_lock); - return 0; -} -EXPORT_SYMBOL(unregister_exec_domain); - -int __set_personality(unsigned int personality) -{ - struct exec_domain *oep = current_thread_info()->exec_domain; - - current_thread_info()->exec_domain = lookup_exec_domain(personality); - current->personality = personality; - module_put(oep->module); - - return 0; -} -EXPORT_SYMBOL(__set_personality); - #ifdef CONFIG_PROC_FS static int execdomains_proc_show(struct seq_file *m, void *v) { - struct exec_domain *ep; - - read_lock(&exec_domains_lock); - for (ep = exec_domains; ep; ep = ep->next) - seq_printf(m, "%d-%d\t%-16s\t[%s]\n", - ep->pers_low, ep->pers_high, ep->name, - module_name(ep->module)); - read_unlock(&exec_domains_lock); + seq_puts(m, "0-0\tLinux \t[kernel]\n"); return 0; } diff --git a/kernel/exit.c b/kernel/exit.c index feff10bbb307..22fcc05dec40 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -756,8 +756,6 @@ void do_exit(long code) cgroup_exit(tsk); - module_put(task_thread_info(tsk)->exec_domain->module); - /* * FIXME: do that only when needed, using sched_exit tracepoint */ diff --git a/kernel/fork.c b/kernel/fork.c index cf65139615a0..03c1eaaa6ef5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -74,6 +74,7 @@ #include <linux/uprobes.h> #include <linux/aio.h> #include <linux/compiler.h> +#include <linux/sysctl.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -88,6 +89,16 @@ #include <trace/events/task.h> /* + * Minimum number of threads to boot the kernel + */ +#define MIN_THREADS 20 + +/* + * Maximum number of threads + */ +#define MAX_THREADS FUTEX_TID_MASK + +/* * Protected counters by write_lock_irq(&tasklist_lock) */ unsigned long total_forks; /* Handle normal Linux uptimes. */ @@ -253,7 +264,30 @@ EXPORT_SYMBOL_GPL(__put_task_struct); void __init __weak arch_task_cache_init(void) { } -void __init fork_init(unsigned long mempages) +/* + * set_max_threads + */ +static void set_max_threads(unsigned int max_threads_suggested) +{ + u64 threads; + + /* + * The number of threads shall be limited such that the thread + * structures may only consume a small part of the available memory. + */ + if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64) + threads = MAX_THREADS; + else + threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, + (u64) THREAD_SIZE * 8UL); + + if (threads > max_threads_suggested) + threads = max_threads_suggested; + + max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); +} + +void __init fork_init(void) { #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN @@ -268,18 +302,7 @@ void __init fork_init(unsigned long mempages) /* do the arch specific task caches init */ arch_task_cache_init(); - /* - * The default maximum number of threads is set to a safe - * value: the thread structures can take up at most half - * of memory. - */ - max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE); - - /* - * we need to allow at least 20 threads to boot a system - */ - if (max_threads < 20) - max_threads = 20; + set_max_threads(MAX_THREADS); init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; @@ -380,6 +403,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + /* No ordering required: file already has been exposed. */ + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + mm->total_vm = oldmm->total_vm; mm->shared_vm = oldmm->shared_vm; mm->exec_vm = oldmm->exec_vm; @@ -505,7 +531,13 @@ static inline void mm_free_pgd(struct mm_struct *mm) pgd_free(mm, mm->pgd); } #else -#define dup_mmap(mm, oldmm) (0) +static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +{ + down_write(&oldmm->mmap_sem); + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + up_write(&oldmm->mmap_sem); + return 0; +} #define mm_alloc_pgd(mm) (0) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ @@ -674,34 +706,53 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); +/** + * set_mm_exe_file - change a reference to the mm's executable file + * + * This changes mm's executable file (shown as symlink /proc/[pid]/exe). + * + * Main users are mmput() and sys_execve(). Callers prevent concurrent + * invocations: in mmput() nobody alive left, in execve task is single + * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the + * mm->exe_file, but does so without using set_mm_exe_file() in order + * to do avoid the need for any locks. + */ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { + struct file *old_exe_file; + + /* + * It is safe to dereference the exe_file without RCU as + * this function is only called if nobody else can access + * this mm -- see comment above for justification. + */ + old_exe_file = rcu_dereference_raw(mm->exe_file); + if (new_exe_file) get_file(new_exe_file); - if (mm->exe_file) - fput(mm->exe_file); - mm->exe_file = new_exe_file; + rcu_assign_pointer(mm->exe_file, new_exe_file); + if (old_exe_file) + fput(old_exe_file); } +/** + * get_mm_exe_file - acquire a reference to the mm's executable file + * + * Returns %NULL if mm has no associated executable file. + * User must release file via fput(). + */ struct file *get_mm_exe_file(struct mm_struct *mm) { struct file *exe_file; - /* We need mmap_sem to protect against races with removal of exe_file */ - down_read(&mm->mmap_sem); - exe_file = mm->exe_file; - if (exe_file) - get_file(exe_file); - up_read(&mm->mmap_sem); + rcu_read_lock(); + exe_file = rcu_dereference(mm->exe_file); + if (exe_file && !get_file_rcu(exe_file)) + exe_file = NULL; + rcu_read_unlock(); return exe_file; } - -static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) -{ - /* It's safe to write the exe_file pointer without exe_file_lock because - * this is called during fork when the task is not yet in /proc */ - newmm->exe_file = get_mm_exe_file(oldmm); -} +EXPORT_SYMBOL(get_mm_exe_file); /** * get_task_mm - acquire a reference to the task's mm @@ -864,8 +915,6 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) if (!mm_init(mm, tsk)) goto fail_nomem; - dup_mm_exe_file(oldmm, mm); - err = dup_mmap(mm, oldmm); if (err) goto free_pt; @@ -1279,9 +1328,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (nr_threads >= max_threads) goto bad_fork_cleanup_count; - if (!try_module_get(task_thread_info(p)->exec_domain->module)) - goto bad_fork_cleanup_count; - delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); p->flags |= PF_FORKNOEXEC; @@ -1406,10 +1452,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_io; if (pid != &init_struct_pid) { - retval = -ENOMEM; pid = alloc_pid(p->nsproxy->pid_ns_for_children); - if (!pid) + if (IS_ERR(pid)) { + retval = PTR_ERR(pid); goto bad_fork_cleanup_io; + } } p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; @@ -1590,7 +1637,6 @@ bad_fork_cleanup_threadgroup_lock: if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); delayacct_tsk_free(p); - module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); exit_creds(p); @@ -2004,3 +2050,26 @@ int unshare_files(struct files_struct **displaced) task_unlock(task); return 0; } + +int sysctl_max_threads(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int ret; + int threads = max_threads; + int min = MIN_THREADS; + int max = MAX_THREADS; + + t = *table; + t.data = &threads; + t.extra1 = &min; + t.extra2 = &max; + + ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + set_max_threads(threads); + + return 0; +} diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index b358a802fd18..a744098e4eb7 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -18,6 +18,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/sched.h> #include "gcov.h" static int gcov_events_enabled; @@ -107,8 +108,10 @@ void gcov_enable_events(void) gcov_events_enabled = 1; /* Perform event callback for previously registered entries. */ - while ((info = gcov_info_next(info))) + while ((info = gcov_info_next(info))) { gcov_event(GCOV_ADD, info); + cond_resched(); + } mutex_unlock(&gcov_lock); } diff --git a/kernel/groups.c b/kernel/groups.c index 664411f171b5..74d431d25251 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -9,9 +9,6 @@ #include <linux/user_namespace.h> #include <asm/uaccess.h> -/* init to 2 - one for init_task, one to ensure it is never freed */ -struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; - struct group_info *groups_alloc(int gidsetsize) { struct group_info *group_info; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 06db12434d72..e0f90c2b57aa 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -169,7 +169,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) return; rcu_read_lock(); - do_each_thread(g, t) { + for_each_process_thread(g, t) { if (!max_count--) goto unlock; if (!--batch_count) { @@ -180,7 +180,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) check_hung_task(t, timeout); - } while_each_thread(g, t); + } unlock: rcu_read_unlock(); } diff --git a/kernel/kexec.c b/kernel/kexec.c index 38c25b1f2fd5..7a36fdcca5bf 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -707,7 +707,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, do { unsigned long pfn, epfn, addr, eaddr; - pages = kimage_alloc_pages(GFP_KERNEL, order); + pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); if (!pages) break; pfn = page_to_pfn(pages); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ba77ab5f64dd..a0831e1b99f4 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -551,7 +551,21 @@ static void print_lockdep_cache(struct lockdep_map *lock) static void print_lock(struct held_lock *hlock) { - print_lock_name(hlock_class(hlock)); + /* + * We can be called locklessly through debug_show_all_locks() so be + * extra careful, the hlock might have been released and cleared. + */ + unsigned int class_idx = hlock->class_idx; + + /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */ + barrier(); + + if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) { + printk("<RELEASED>\n"); + return; + } + + print_lock_name(lock_classes + class_idx - 1); printk(", at: "); print_ip_sym(hlock->acquire_ip); } diff --git a/kernel/module.c b/kernel/module.c index 650b038ae520..42a1d2afb217 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -387,9 +387,9 @@ static bool check_symbol(const struct symsearch *syms, pr_warn("Symbol %s is marked as UNUSED, however this module is " "using it.\n", fsa->name); pr_warn("This symbol will go away in the future.\n"); - pr_warn("Please evalute if this is the right api to use and if " - "it really is, submit a report the linux kernel " - "mailinglist together with submitting your code for " + pr_warn("Please evaluate if this is the right api to use and " + "if it really is, submit a report to the linux kernel " + "mailing list together with submitting your code for " "inclusion.\n"); } #endif @@ -2511,7 +2511,8 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return err; /* Suck in entire file: we'll want most of it. */ - info->hdr = vmalloc(info->len); + info->hdr = __vmalloc(info->len, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, PAGE_KERNEL); if (!info->hdr) return -ENOMEM; diff --git a/kernel/params.c b/kernel/params.c index 728e05b167de..a22d6a759b1a 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -173,9 +173,9 @@ static char *next_arg(char *args, char **param, char **val) if (args[i-1] == '"') args[i-1] = '\0'; } - if (quoted && args[i-1] == '"') - args[i-1] = '\0'; } + if (quoted && args[i-1] == '"') + args[i-1] = '\0'; if (args[i]) { args[i] = '\0'; diff --git a/kernel/pid.c b/kernel/pid.c index cd36a5e0d173..4fd07d5b7baf 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -182,7 +182,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) spin_unlock_irq(&pidmap_lock); kfree(page); if (unlikely(!map->page)) - break; + return -ENOMEM; } if (likely(atomic_read(&map->nr_free))) { for ( ; ; ) { @@ -210,7 +210,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) } pid = mk_pid(pid_ns, map, offset); } - return -1; + return -EAGAIN; } int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) @@ -301,17 +301,20 @@ struct pid *alloc_pid(struct pid_namespace *ns) int i, nr; struct pid_namespace *tmp; struct upid *upid; + int retval = -ENOMEM; pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) - goto out; + return ERR_PTR(retval); tmp = ns; pid->level = ns->level; for (i = ns->level; i >= 0; i--) { nr = alloc_pidmap(tmp); - if (nr < 0) + if (IS_ERR_VALUE(nr)) { + retval = nr; goto out_free; + } pid->numbers[i].nr = nr; pid->numbers[i].ns = tmp; @@ -339,7 +342,6 @@ struct pid *alloc_pid(struct pid_namespace *ns) } spin_unlock_irq(&pidmap_lock); -out: return pid; out_unlock: @@ -351,8 +353,7 @@ out_free: free_pidmap(pid->numbers + i); kmem_cache_free(ns->pid_cachep, pid); - pid = NULL; - goto out; + return ERR_PTR(retval); } void disable_pid_allocation(struct pid_namespace *ns) diff --git a/kernel/power/main.c b/kernel/power/main.c index 9a59d042ea84..86e8157a450f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -11,7 +11,7 @@ #include <linux/export.h> #include <linux/kobject.h> #include <linux/string.h> -#include <linux/resume-trace.h> +#include <linux/pm-trace.h> #include <linux/workqueue.h> #include <linux/debugfs.h> #include <linux/seq_file.h> diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index b7d6b3a721b1..8d7a1ef72758 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -28,6 +28,7 @@ #include <linux/ftrace.h> #include <trace/events/power.h> #include <linux/compiler.h> +#include <linux/moduleparam.h> #include "power.h" @@ -233,12 +234,20 @@ static bool platform_suspend_again(suspend_state_t state) suspend_ops->suspend_again() : false; } +#ifdef CONFIG_PM_DEBUG +static unsigned int pm_test_delay = 5; +module_param(pm_test_delay, uint, 0644); +MODULE_PARM_DESC(pm_test_delay, + "Number of seconds to wait before resuming from suspend test"); +#endif + static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG if (pm_test_level == level) { - printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); - mdelay(5000); + printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n", + pm_test_delay); + mdelay(pm_test_delay * 1000); return 1; } #endif /* !CONFIG_PM_DEBUG */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index bb0635bd74f2..c099b082cd02 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -32,7 +32,6 @@ #include <linux/security.h> #include <linux/bootmem.h> #include <linux/memblock.h> -#include <linux/aio.h> #include <linux/syscalls.h> #include <linux/kexec.h> #include <linux/kdb.h> @@ -46,6 +45,7 @@ #include <linux/irq_work.h> #include <linux/utsname.h> #include <linux/ctype.h> +#include <linux/uio.h> #include <asm/uaccess.h> @@ -521,7 +521,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) int i; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ - size_t len = iocb->ki_nbytes; + size_t len = iov_iter_count(from); ssize_t ret = len; if (len > LOG_LINE_MAX) @@ -2017,24 +2017,6 @@ int add_preferred_console(char *name, int idx, char *options) return __add_preferred_console(name, idx, options, NULL); } -int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) -{ - struct console_cmdline *c; - int i; - - for (i = 0, c = console_cmdline; - i < MAX_CMDLINECONSOLES && c->name[0]; - i++, c++) - if (strcmp(c->name, name) == 0 && c->index == idx) { - strlcpy(c->name, name_new, sizeof(c->name)); - c->options = options; - c->index = idx_new; - return i; - } - /* not found */ - return -1; -} - bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); @@ -2436,9 +2418,6 @@ void register_console(struct console *newcon) if (preferred_console < 0 || bcon || !console_drivers) preferred_console = selected_console; - if (newcon->early_setup) - newcon->early_setup(); - /* * See if we want to use this console driver. If we * didn't select a console we take the first one @@ -2464,23 +2443,27 @@ void register_console(struct console *newcon) for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && c->name[0]; i++, c++) { - BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); - if (strcmp(c->name, newcon->name) != 0) - continue; - if (newcon->index >= 0 && - newcon->index != c->index) - continue; - if (newcon->index < 0) - newcon->index = c->index; + if (!newcon->match || + newcon->match(newcon, c->name, c->index, c->options) != 0) { + /* default matching */ + BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); + if (strcmp(c->name, newcon->name) != 0) + continue; + if (newcon->index >= 0 && + newcon->index != c->index) + continue; + if (newcon->index < 0) + newcon->index = c->index; - if (_braille_register_console(newcon, c)) - return; + if (_braille_register_console(newcon, c)) + return; + + if (newcon->setup && + newcon->setup(newcon, c->options) != 0) + break; + } - if (newcon->setup && - newcon->setup(newcon, console_cmdline[i].options) != 0) - break; newcon->flags |= CON_ENABLED; - newcon->index = c->index; if (i == selected_console) { newcon->flags |= CON_CONSDEV; preferred_console = selected_console; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 227fec36b12a..c8e0e050a36a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -456,8 +456,6 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) static int ptrace_detach(struct task_struct *child, unsigned int data) { - bool dead = false; - if (!valid_signal(data)) return -EIO; @@ -467,18 +465,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) write_lock_irq(&tasklist_lock); /* - * This child can be already killed. Make sure de_thread() or - * our sub-thread doing do_wait() didn't do release_task() yet. + * We rely on ptrace_freeze_traced(). It can't be killed and + * untraced by another thread, it can't be a zombie. */ - if (child->ptrace) { - child->exit_code = data; - dead = __ptrace_detach(current, child); - } + WARN_ON(!child->ptrace || child->exit_state); + /* + * tasklist_lock avoids the race with wait_task_stopped(), see + * the comment in ptrace_resume(). + */ + child->exit_code = data; + __ptrace_detach(current, child); write_unlock_irq(&tasklist_lock); proc_ptrace_connector(child, PTRACE_DETACH); - if (unlikely(dead)) - release_task(child); return 0; } @@ -697,6 +696,8 @@ static int ptrace_peek_siginfo(struct task_struct *child, static int ptrace_resume(struct task_struct *child, long request, unsigned long data) { + bool need_siglock; + if (!valid_signal(data)) return -EIO; @@ -724,8 +725,26 @@ static int ptrace_resume(struct task_struct *child, long request, user_disable_single_step(child); } + /* + * Change ->exit_code and ->state under siglock to avoid the race + * with wait_task_stopped() in between; a non-zero ->exit_code will + * wrongly look like another report from tracee. + * + * Note that we need siglock even if ->exit_code == data and/or this + * status was not reported yet, the new status must not be cleared by + * wait_task_stopped() after resume. + * + * If data == 0 we do not care if wait_task_stopped() reports the old + * status and clears the code too; this can't race with the tracee, it + * takes siglock after resume. + */ + need_siglock = data && !thread_group_empty(current); + if (need_siglock) + spin_lock_irq(&child->sighand->siglock); child->exit_code = data; wake_up_state(child, __TASK_TRACED); + if (need_siglock) + spin_unlock_irq(&child->sighand->siglock); return 0; } diff --git a/kernel/reboot.c b/kernel/reboot.c index 5925f5ae8dff..d20c85d9f8c0 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -387,8 +387,9 @@ void ctrl_alt_del(void) } char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; +static const char reboot_cmd[] = "/sbin/reboot"; -static int __orderly_poweroff(bool force) +static int run_cmd(const char *cmd) { char **argv; static char *envp[] = { @@ -397,8 +398,7 @@ static int __orderly_poweroff(bool force) NULL }; int ret; - - argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); + argv = argv_split(GFP_KERNEL, cmd, NULL); if (argv) { ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); argv_free(argv); @@ -406,8 +406,33 @@ static int __orderly_poweroff(bool force) ret = -ENOMEM; } + return ret; +} + +static int __orderly_reboot(void) +{ + int ret; + + ret = run_cmd(reboot_cmd); + + if (ret) { + pr_warn("Failed to start orderly reboot: forcing the issue\n"); + emergency_sync(); + kernel_restart(NULL); + } + + return ret; +} + +static int __orderly_poweroff(bool force) +{ + int ret; + + ret = run_cmd(poweroff_cmd); + if (ret && force) { pr_warn("Failed to start orderly shutdown: forcing the issue\n"); + /* * I guess this should try to kick off some daemon to sync and * poweroff asap. Or not even bother syncing if we're doing an @@ -436,15 +461,33 @@ static DECLARE_WORK(poweroff_work, poweroff_work_func); * This may be called from any context to trigger a system shutdown. * If the orderly shutdown fails, it will force an immediate shutdown. */ -int orderly_poweroff(bool force) +void orderly_poweroff(bool force) { if (force) /* do not override the pending "true" */ poweroff_force = true; schedule_work(&poweroff_work); - return 0; } EXPORT_SYMBOL_GPL(orderly_poweroff); +static void reboot_work_func(struct work_struct *work) +{ + __orderly_reboot(); +} + +static DECLARE_WORK(reboot_work, reboot_work_func); + +/** + * orderly_reboot - Trigger an orderly system reboot + * + * This may be called from any context to trigger a system reboot. + * If the orderly reboot fails, it will force an immediate reboot. + */ +void orderly_reboot(void) +{ + schedule_work(&reboot_work); +} +EXPORT_SYMBOL_GPL(orderly_reboot); + static int __init reboot_setup(char *str) { for (;;) { diff --git a/kernel/relay.c b/kernel/relay.c index 5a56d3c8dc03..e9dbaeb8fd65 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -407,7 +407,7 @@ static inline void relay_set_buf_dentry(struct rchan_buf *buf, struct dentry *dentry) { buf->dentry = dentry; - buf->dentry->d_inode->i_size = buf->early_bytes; + d_inode(buf->dentry)->i_size = buf->early_bytes; } static struct dentry *relay_create_buf_file(struct rchan *chan, @@ -733,7 +733,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) buf->padding[old_subbuf] = buf->prev_padding; buf->subbufs_produced++; if (buf->dentry) - buf->dentry->d_inode->i_size += + d_inode(buf->dentry)->i_size += buf->chan->subbuf_size - buf->padding[old_subbuf]; else diff --git a/kernel/resource.c b/kernel/resource.c index 19f2357dfda3..90552aab5f2d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1034,8 +1034,6 @@ resource_size_t resource_alignment(struct resource *res) * * request_region creates a new busy region. * - * check_region returns non-zero if the area is already busy. - * * release_region releases a matching busy region. */ @@ -1098,36 +1096,6 @@ struct resource * __request_region(struct resource *parent, EXPORT_SYMBOL(__request_region); /** - * __check_region - check if a resource region is busy or free - * @parent: parent resource descriptor - * @start: resource start address - * @n: resource region size - * - * Returns 0 if the region is free at the moment it is checked, - * returns %-EBUSY if the region is busy. - * - * NOTE: - * This function is deprecated because its use is racy. - * Even if it returns 0, a subsequent call to request_region() - * may fail because another driver etc. just allocated the region. - * Do NOT use it. It will be removed from the kernel. - */ -int __check_region(struct resource *parent, resource_size_t start, - resource_size_t n) -{ - struct resource * res; - - res = __request_region(parent, start, n, "check-region", 0); - if (!res) - return -EBUSY; - - release_resource(res); - free_resource(res); - return 0; -} -EXPORT_SYMBOL(__check_region); - -/** * __release_region - release a previously reserved resource region * @parent: parent resource descriptor * @start: resource start address diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2f7937ee9e3a..fe22f7510bce 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1016,13 +1016,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) rq_clock_skip_update(rq, true); } -static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); - -void register_task_migration_notifier(struct notifier_block *n) -{ - atomic_notifier_chain_register(&task_migration_notifier, n); -} - #ifdef CONFIG_SMP void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { @@ -1053,18 +1046,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { - struct task_migration_notifier tmn; - if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); - - tmn.task = p; - tmn.from_cpu = task_cpu(p); - tmn.to_cpu = new_cpu; - - atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); } __set_task_cpu(p, new_cpu); @@ -2853,7 +2838,7 @@ asmlinkage __visible void __sched schedule_user(void) * we find a better solution. * * NB: There are buggy callers of this function. Ideally we - * should warn if prev_state != IN_USER, but that will trigger + * should warn if prev_state != CONTEXT_USER, but that will trigger * too frequently to make sense yet. */ enum ctx_state prev_state = exception_enter(); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index deef1caa94c6..fefcb1fa5160 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -81,7 +81,6 @@ static void cpuidle_idle_call(void) struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; - unsigned int broadcast; bool reflect; /* @@ -150,17 +149,6 @@ static void cpuidle_idle_call(void) goto exit_idle; } - broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP; - - /* - * Tell the time framework to switch to a broadcast timer - * because our local timer will be shutdown. If a local timer - * is used from another cpu as a broadcast timer, this call may - * fail if it is not available - */ - if (broadcast && tick_broadcast_enter()) - goto use_default; - /* Take note of the planned idle state. */ idle_set_state(this_rq(), &drv->states[next_state]); @@ -174,8 +162,8 @@ static void cpuidle_idle_call(void) /* The cpu is no longer idle or about to enter idle. */ idle_set_state(this_rq(), NULL); - if (broadcast) - tick_broadcast_exit(); + if (entered_state == -EBUSY) + goto use_default; /* * Give the governor an opportunity to reflect on the outcome diff --git a/kernel/signal.c b/kernel/signal.c index a390499943e4..d51c5ddd855c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2992,11 +2992,9 @@ static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info) * Nor can they impersonate a kill()/tgkill(), which adds source info. */ if ((info->si_code >= 0 || info->si_code == SI_TKILL) && - (task_pid_vnr(current) != pid)) { - /* We used to allow any < 0 si_code */ - WARN_ON_ONCE(info->si_code < 0); + (task_pid_vnr(current) != pid)) return -EPERM; - } + info->si_signo = sig; /* POSIX.1b doesn't mention process groups. */ @@ -3041,12 +3039,10 @@ static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. */ - if (((info->si_code >= 0 || info->si_code == SI_TKILL)) && - (task_pid_vnr(current) != pid)) { - /* We used to allow any < 0 si_code */ - WARN_ON_ONCE(info->si_code < 0); + if ((info->si_code >= 0 || info->si_code == SI_TKILL) && + (task_pid_vnr(current) != pid)) return -EPERM; - } + info->si_signo = sig; return do_send_specific(tgid, pid, sig, info); diff --git a/kernel/smp.c b/kernel/smp.c index f38a1e692259..07854477c164 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -19,7 +19,7 @@ enum { CSD_FLAG_LOCK = 0x01, - CSD_FLAG_WAIT = 0x02, + CSD_FLAG_SYNCHRONOUS = 0x02, }; struct call_function_data { @@ -107,7 +107,7 @@ void __init call_function_init(void) */ static void csd_lock_wait(struct call_single_data *csd) { - while (csd->flags & CSD_FLAG_LOCK) + while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK) cpu_relax(); } @@ -121,19 +121,17 @@ static void csd_lock(struct call_single_data *csd) * to ->flags with any subsequent assignments to other * fields of the specified call_single_data structure: */ - smp_mb(); + smp_wmb(); } static void csd_unlock(struct call_single_data *csd) { - WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK)); + WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: */ - smp_mb(); - - csd->flags &= ~CSD_FLAG_LOCK; + smp_store_release(&csd->flags, 0); } static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); @@ -144,13 +142,16 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); * ->func, ->info, and ->flags set. */ static int generic_exec_single(int cpu, struct call_single_data *csd, - smp_call_func_t func, void *info, int wait) + smp_call_func_t func, void *info) { - struct call_single_data csd_stack = { .flags = 0 }; - unsigned long flags; - - if (cpu == smp_processor_id()) { + unsigned long flags; + + /* + * We can unlock early even for the synchronous on-stack case, + * since we're doing this from the same CPU.. + */ + csd_unlock(csd); local_irq_save(flags); func(info); local_irq_restore(flags); @@ -158,24 +159,14 @@ static int generic_exec_single(int cpu, struct call_single_data *csd, } - if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) + if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) { + csd_unlock(csd); return -ENXIO; - - - if (!csd) { - csd = &csd_stack; - if (!wait) - csd = this_cpu_ptr(&csd_data); } - csd_lock(csd); - csd->func = func; csd->info = info; - if (wait) - csd->flags |= CSD_FLAG_WAIT; - /* * The list addition should be visible before sending the IPI * handler locks the list to pull the entry off it because of @@ -190,9 +181,6 @@ static int generic_exec_single(int cpu, struct call_single_data *csd, if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) arch_send_call_function_single_ipi(cpu); - if (wait) - csd_lock_wait(csd); - return 0; } @@ -250,8 +238,17 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) } llist_for_each_entry_safe(csd, csd_next, entry, llist) { - csd->func(csd->info); - csd_unlock(csd); + smp_call_func_t func = csd->func; + void *info = csd->info; + + /* Do we wait until *after* callback? */ + if (csd->flags & CSD_FLAG_SYNCHRONOUS) { + func(info); + csd_unlock(csd); + } else { + csd_unlock(csd); + func(info); + } } /* @@ -274,6 +271,8 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { + struct call_single_data *csd; + struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS }; int this_cpu; int err; @@ -292,7 +291,16 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress); - err = generic_exec_single(cpu, NULL, func, info, wait); + csd = &csd_stack; + if (!wait) { + csd = this_cpu_ptr(&csd_data); + csd_lock(csd); + } + + err = generic_exec_single(cpu, csd, func, info); + + if (wait) + csd_lock_wait(csd); put_cpu(); @@ -321,7 +329,15 @@ int smp_call_function_single_async(int cpu, struct call_single_data *csd) int err = 0; preempt_disable(); - err = generic_exec_single(cpu, csd, csd->func, csd->info, 0); + + /* We could deadlock if we have to wait here with interrupts disabled! */ + if (WARN_ON_ONCE(csd->flags & CSD_FLAG_LOCK)) + csd_lock_wait(csd); + + csd->flags = CSD_FLAG_LOCK; + smp_wmb(); + + err = generic_exec_single(cpu, csd, csd->func, csd->info); preempt_enable(); return err; @@ -433,6 +449,8 @@ void smp_call_function_many(const struct cpumask *mask, struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); csd_lock(csd); + if (wait) + csd->flags |= CSD_FLAG_SYNCHRONOUS; csd->func = func; csd->info = info; llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)); diff --git a/kernel/sys.c b/kernel/sys.c index a03d9cd23ed7..a4e372b798a5 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -325,6 +325,7 @@ out_unlock: * SMP: There are not races, the GIDs are checked only by filesystem * operations (as far as semantic preservation is concerned). */ +#ifdef CONFIG_MULTIUSER SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) { struct user_namespace *ns = current_user_ns(); @@ -815,6 +816,7 @@ change_okay: commit_creds(new); return old_fsgid; } +#endif /* CONFIG_MULTIUSER */ /** * sys_getpid - return the thread group id of the current process @@ -1647,14 +1649,13 @@ SYSCALL_DEFINE1(umask, int, mask) return mask; } -static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) +static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct fd exe; + struct file *old_exe, *exe_file; struct inode *inode; int err; - VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); - exe = fdget(fd); if (!exe.file) return -EBADF; @@ -1678,15 +1679,22 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) /* * Forbid mm->exe_file change if old file still mapped. */ + exe_file = get_mm_exe_file(mm); err = -EBUSY; - if (mm->exe_file) { + if (exe_file) { struct vm_area_struct *vma; - for (vma = mm->mmap; vma; vma = vma->vm_next) - if (vma->vm_file && - path_equal(&vma->vm_file->f_path, - &mm->exe_file->f_path)) - goto exit; + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (path_equal(&vma->vm_file->f_path, + &exe_file->f_path)) + goto exit_err; + } + + up_read(&mm->mmap_sem); + fput(exe_file); } /* @@ -1700,10 +1708,18 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) goto exit; err = 0; - set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ + /* set the new file, lockless */ + get_file(exe.file); + old_exe = xchg(&mm->exe_file, exe.file); + if (old_exe) + fput(old_exe); exit: fdput(exe); return err; +exit_err: + up_read(&mm->mmap_sem); + fput(exe_file); + goto exit; } #ifdef CONFIG_CHECKPOINT_RESTORE @@ -1838,10 +1854,9 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; } - down_write(&mm->mmap_sem); if (prctl_map.exe_fd != (u32)-1) - error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd); - downgrade_write(&mm->mmap_sem); + error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); + down_read(&mm->mmap_sem); if (error) goto out; @@ -1907,12 +1922,8 @@ static int prctl_set_mm(int opt, unsigned long addr, if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - if (opt == PR_SET_MM_EXE_FILE) { - down_write(&mm->mmap_sem); - error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr); - up_write(&mm->mmap_sem); - return error; - } + if (opt == PR_SET_MM_EXE_FILE) + return prctl_set_mm_exe_file(mm, (unsigned int)addr); if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5adcb0ae3a58..7995ef5868d8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -159,6 +159,20 @@ cond_syscall(sys_uselib); cond_syscall(sys_fadvise64); cond_syscall(sys_fadvise64_64); cond_syscall(sys_madvise); +cond_syscall(sys_setuid); +cond_syscall(sys_setregid); +cond_syscall(sys_setgid); +cond_syscall(sys_setreuid); +cond_syscall(sys_setresuid); +cond_syscall(sys_getresuid); +cond_syscall(sys_setresgid); +cond_syscall(sys_getresgid); +cond_syscall(sys_setgroups); +cond_syscall(sys_getgroups); +cond_syscall(sys_setfsuid); +cond_syscall(sys_setfsgid); +cond_syscall(sys_capget); +cond_syscall(sys_capset); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ce410bb9f2e1..2082b1a88fb9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -19,6 +19,7 @@ */ #include <linux/module.h> +#include <linux/aio.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/slab.h> @@ -92,11 +93,9 @@ #include <linux/nmi.h> #endif - #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ -extern int max_threads; extern int suid_dumpable; #ifdef CONFIG_COREDUMP extern int core_uses_pid; @@ -709,10 +708,10 @@ static struct ctl_table kern_table[] = { #endif { .procname = "threads-max", - .data = &max_threads, + .data = NULL, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = sysctl_max_threads, }, { .procname = "random", @@ -846,7 +845,7 @@ static struct ctl_table kern_table[] = { .data = &watchdog_user_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_dowatchdog, + .proc_handler = proc_watchdog, .extra1 = &zero, .extra2 = &one, }, @@ -855,11 +854,33 @@ static struct ctl_table kern_table[] = { .data = &watchdog_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dowatchdog, + .proc_handler = proc_watchdog_thresh, .extra1 = &zero, .extra2 = &sixty, }, { + .procname = "nmi_watchdog", + .data = &nmi_watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_nmi_watchdog, + .extra1 = &zero, +#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) + .extra2 = &one, +#else + .extra2 = &zero, +#endif + }, + { + .procname = "soft_watchdog", + .data = &soft_watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_soft_watchdog, + .extra1 = &zero, + .extra2 = &one, + }, + { .procname = "softlockup_panic", .data = &softlockup_panic, .maxlen = sizeof(int), @@ -879,15 +900,6 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif /* CONFIG_SMP */ - { - .procname = "nmi_watchdog", - .data = &watchdog_user_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dowatchdog, - .extra1 = &zero, - .extra2 = &one, - }, #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { @@ -1321,6 +1333,15 @@ static struct ctl_table vm_table[] = { .extra1 = &min_extfrag_threshold, .extra2 = &max_extfrag_threshold, }, + { + .procname = "compact_unevictable_allowed", + .data = &sysctl_compact_unevictable_allowed, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + .extra2 = &one, + }, #endif /* CONFIG_COMPACTION */ { @@ -1960,7 +1981,15 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int write, void *data) { if (write) { - *valp = *negp ? -*lvalp : *lvalp; + if (*negp) { + if (*lvalp > (unsigned long) INT_MAX + 1) + return -EINVAL; + *valp = -*lvalp; + } else { + if (*lvalp > (unsigned long) INT_MAX) + return -EINVAL; + *valp = *lvalp; + } } else { int val = *valp; if (val < 0) { diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 25d942d1da27..11dc22a6983b 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -440,7 +440,7 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) mutex_unlock(&clockevents_mutex); return ret; } -EXPORT_SYMBOL_GPL(clockevents_unbind); +EXPORT_SYMBOL_GPL(clockevents_unbind_device); /* Sanity check of state transition callbacks */ static int clockevents_sanity_check(struct clock_event_device *dev) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fedbdd7d5d1e..3b9a48ae153a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -432,6 +432,14 @@ config UPROBE_EVENT This option is required if you plan to use perf-probe subcommand of perf tools on user space applications. +config BPF_EVENTS + depends on BPF_SYSCALL + depends on KPROBE_EVENT + bool + default y + help + This allows the user to attach BPF programs to kprobe events. + config PROBE_EVENTS def_bool n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 98f26588255e..9b1044e936a6 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM),y) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c new file mode 100644 index 000000000000..2d56ce501632 --- /dev/null +++ b/kernel/trace/bpf_trace.c @@ -0,0 +1,222 @@ +/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/uaccess.h> +#include <linux/ctype.h> +#include "trace.h" + +static DEFINE_PER_CPU(int, bpf_prog_active); + +/** + * trace_call_bpf - invoke BPF program + * @prog: BPF program + * @ctx: opaque context pointer + * + * kprobe handlers execute BPF programs via this helper. + * Can be used from static tracepoints in the future. + * + * Return: BPF programs always return an integer which is interpreted by + * kprobe handler as: + * 0 - return from kprobe (event is filtered out) + * 1 - store kprobe event into ring buffer + * Other values are reserved and currently alias to 1 + */ +unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +{ + unsigned int ret; + + if (in_nmi()) /* not supported yet */ + return 1; + + preempt_disable(); + + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { + /* + * since some bpf program is already running on this cpu, + * don't call into another bpf program (same or different) + * and don't send kprobe event into ring-buffer, + * so return zero here + */ + ret = 0; + goto out; + } + + rcu_read_lock(); + ret = BPF_PROG_RUN(prog, ctx); + rcu_read_unlock(); + + out: + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_call_bpf); + +static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + void *dst = (void *) (long) r1; + int size = (int) r2; + void *unsafe_ptr = (void *) (long) r3; + + return probe_kernel_read(dst, unsafe_ptr, size); +} + +static const struct bpf_func_proto bpf_probe_read_proto = { + .func = bpf_probe_read, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, + .arg3_type = ARG_ANYTHING, +}; + +static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* NMI safe access to clock monotonic */ + return ktime_get_mono_fast_ns(); +} + +static const struct bpf_func_proto bpf_ktime_get_ns_proto = { + .func = bpf_ktime_get_ns, + .gpl_only = true, + .ret_type = RET_INTEGER, +}; + +/* + * limited trace_printk() + * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed + */ +static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) +{ + char *fmt = (char *) (long) r1; + int mod[3] = {}; + int fmt_cnt = 0; + int i; + + /* + * bpf_check()->check_func_arg()->check_stack_boundary() + * guarantees that fmt points to bpf program stack, + * fmt_size bytes of it were initialized and fmt_size > 0 + */ + if (fmt[--fmt_size] != 0) + return -EINVAL; + + /* check format string for allowed specifiers */ + for (i = 0; i < fmt_size; i++) { + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) + return -EINVAL; + + if (fmt[i] != '%') + continue; + + if (fmt_cnt >= 3) + return -EINVAL; + + /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ + i++; + if (fmt[i] == 'l') { + mod[fmt_cnt]++; + i++; + } else if (fmt[i] == 'p') { + mod[fmt_cnt]++; + i++; + if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) + return -EINVAL; + fmt_cnt++; + continue; + } + + if (fmt[i] == 'l') { + mod[fmt_cnt]++; + i++; + } + + if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') + return -EINVAL; + fmt_cnt++; + } + + return __trace_printk(1/* fake ip will not be printed */, fmt, + mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, + mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, + mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); +} + +static const struct bpf_func_proto bpf_trace_printk_proto = { + .func = bpf_trace_printk, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, +}; + +static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_probe_read: + return &bpf_probe_read_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; + + case BPF_FUNC_trace_printk: + /* + * this program might be calling bpf_trace_printk, + * so allocate per-cpu printk buffers + */ + trace_printk_init_buffers(); + + return &bpf_trace_printk_proto; + default: + return NULL; + } +} + +/* bpf+kprobe programs can access fields of 'struct pt_regs' */ +static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) +{ + /* check bounds */ + if (off < 0 || off >= sizeof(struct pt_regs)) + return false; + + /* only read is allowed */ + if (type != BPF_READ) + return false; + + /* disallow misaligned access */ + if (off % size != 0) + return false; + + return true; +} + +static struct bpf_verifier_ops kprobe_prog_ops = { + .get_func_proto = kprobe_prog_func_proto, + .is_valid_access = kprobe_prog_is_valid_access, +}; + +static struct bpf_prog_type_list kprobe_tl = { + .ops = &kprobe_prog_ops, + .type = BPF_PROG_TYPE_KPROBE, +}; + +static int __init register_kprobe_prog_ops(void) +{ + bpf_register_prog_type(&kprobe_tl); + return 0; +} +late_initcall(register_kprobe_prog_ops); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 91eecaaa43e0..05330494a0df 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6079,7 +6079,7 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, struct dentry *ret = trace_create_file(name, mode, parent, data, fops); if (ret) /* See tracing_get_cpu() */ - ret->d_inode->i_cdev = (void *)(cpu + 1); + d_inode(ret)->i_cdev = (void *)(cpu + 1); return ret; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7da1dfeb322e..c4de47fc5cca 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -494,8 +494,8 @@ static void remove_event_file_dir(struct ftrace_event_file *file) if (dir) { spin_lock(&dir->d_lock); /* probably unneeded */ list_for_each_entry(child, &dir->d_subdirs, d_child) { - if (child->d_inode) /* probably unneeded */ - child->d_inode->i_private = NULL; + if (d_really_is_positive(child)) /* probably unneeded */ + d_inode(child)->i_private = NULL; } spin_unlock(&dir->d_lock); @@ -565,6 +565,7 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { char *event = NULL, *sub = NULL, *match; + int ret; /* * The buf format can be <subsystem>:<event-name> @@ -590,7 +591,13 @@ static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) event = NULL; } - return __ftrace_set_clr_event(tr, match, sub, event, set); + ret = __ftrace_set_clr_event(tr, match, sub, event, set); + + /* Put back the colon to allow this to be called again */ + if (buf) + *(buf - 1) = ':'; + + return ret; } /** @@ -1753,6 +1760,8 @@ static void update_event_printk(struct ftrace_event_call *call, ptr++; /* Check for alpha chars like ULL */ } while (isalnum(*ptr)); + if (!*ptr) + break; /* * A number must have some kind of delimiter after * it, and we can ignore that too. @@ -1779,12 +1788,16 @@ static void update_event_printk(struct ftrace_event_call *call, do { ptr++; } while (isalnum(*ptr) || *ptr == '_'); + if (!*ptr) + break; /* * If what comes after this variable is a '.' or * '->' then we can continue to ignore that string. */ if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) { ptr += *ptr == '.' ? 1 : 2; + if (!*ptr) + break; goto skip_more; } /* diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 9cfea4c6d314..a51e79688455 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1308,15 +1308,19 @@ void graph_trace_open(struct trace_iterator *iter) { /* pid and depth on the last trace processed */ struct fgraph_data *data; + gfp_t gfpflags; int cpu; iter->private = NULL; - data = kzalloc(sizeof(*data), GFP_KERNEL); + /* We can be called in atomic context via ftrace_dump() */ + gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL; + + data = kzalloc(sizeof(*data), gfpflags); if (!data) goto out_err; - data->cpu_data = alloc_percpu(struct fgraph_cpu_data); + data->cpu_data = alloc_percpu_gfp(struct fgraph_cpu_data, gfpflags); if (!data->cpu_data) goto out_err_free; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9ba3f43f580e..d0ce590f06e1 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1135,11 +1135,15 @@ static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct ftrace_event_call *call = &tk->tp.call; + struct bpf_prog *prog = call->prog; struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return; @@ -1166,11 +1170,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { struct ftrace_event_call *call = &tk->tp.call; + struct bpf_prog *prog = call->prog; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return; @@ -1287,7 +1295,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) kfree(call->print_fmt); return -ENODEV; } - call->flags = 0; + call->flags = TRACE_EVENT_FL_KPROBE; call->class->reg = kprobe_register; call->data = tk; ret = trace_add_event_call(call); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c3e4fcfddd45..3f34496244e9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -327,11 +327,11 @@ static void t_stop(struct seq_file *m, void *p) local_irq_enable(); } -static int trace_lookup_stack(struct seq_file *m, long i) +static void trace_lookup_stack(struct seq_file *m, long i) { unsigned long addr = stack_dump_trace[i]; - return seq_printf(m, "%pS\n", (void *)addr); + seq_printf(m, "%pS\n", (void *)addr); } static void print_disabled(struct seq_file *m) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 74865465e0b7..6dd022c7b5bc 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -443,7 +443,7 @@ static int create_trace_uprobe(int argc, char **argv) if (ret) goto fail_address_parse; - inode = igrab(path.dentry->d_inode); + inode = igrab(d_inode(path.dentry)); path_put(&path); if (!inode || !S_ISREG(inode->i_mode)) { @@ -1006,7 +1006,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) return true; list_for_each_entry(event, &filter->perf_events, hw.tp_list) { - if (event->hw.tp_target->mm == mm) + if (event->hw.target->mm == mm) return true; } @@ -1016,7 +1016,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) static inline bool uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) { - return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); + return __uprobe_perf_filter(&tu->filter, event->hw.target->mm); } static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) @@ -1024,10 +1024,10 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) bool done; write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) { + if (event->hw.target) { list_del(&event->hw.tp_list); done = tu->filter.nr_systemwide || - (event->hw.tp_target->flags & PF_EXITING) || + (event->hw.target->flags & PF_EXITING) || uprobe_filter_event(tu, event); } else { tu->filter.nr_systemwide--; @@ -1047,7 +1047,7 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) int err; write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) { + if (event->hw.target) { /* * event->parent != NULL means copy_process(), we can avoid * uprobe_apply(). current->mm must be probed and we can rely diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 3174bf8e3538..2316f50b07a4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,8 +24,33 @@ #include <linux/kvm_para.h> #include <linux/perf_event.h> -int watchdog_user_enabled = 1; +/* + * The run state of the lockup detectors is controlled by the content of the + * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - + * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. + * + * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' + * are variables that are only used as an 'interface' between the parameters + * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The + * 'watchdog_thresh' variable is handled differently because its value is not + * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' + * is equal zero. + */ +#define NMI_WATCHDOG_ENABLED_BIT 0 +#define SOFT_WATCHDOG_ENABLED_BIT 1 +#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) +#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; +#else +static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; +#endif +int __read_mostly nmi_watchdog_enabled; +int __read_mostly soft_watchdog_enabled; +int __read_mostly watchdog_user_enabled; int __read_mostly watchdog_thresh = 10; + #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; #else @@ -58,8 +83,6 @@ static unsigned long soft_lockup_nmi_warn; #ifdef CONFIG_HARDLOCKUP_DETECTOR static int hardlockup_panic = CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; - -static bool hardlockup_detector_enabled = true; /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these @@ -68,14 +91,9 @@ static bool hardlockup_detector_enabled = true; * kernel command line parameters are parsed, because otherwise it is not * possible to override this in hardlockup_panic_setup(). */ -void watchdog_enable_hardlockup_detector(bool val) -{ - hardlockup_detector_enabled = val; -} - -bool watchdog_hardlockup_detector_is_enabled(void) +void hardlockup_detector_disable(void) { - return hardlockup_detector_enabled; + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; } static int __init hardlockup_panic_setup(char *str) @@ -85,15 +103,9 @@ static int __init hardlockup_panic_setup(char *str) else if (!strncmp(str, "nopanic", 7)) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) - watchdog_user_enabled = 0; - else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) { - /* - * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option) - * has the same effect. - */ - watchdog_user_enabled = 1; - watchdog_enable_hardlockup_detector(true); - } + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + else if (!strncmp(str, "1", 1)) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); @@ -112,19 +124,18 @@ __setup("softlockup_panic=", softlockup_panic_setup); static int __init nowatchdog_setup(char *str) { - watchdog_user_enabled = 0; + watchdog_enabled = 0; return 1; } __setup("nowatchdog", nowatchdog_setup); -/* deprecated */ static int __init nosoftlockup_setup(char *str) { - watchdog_user_enabled = 0; + watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED; return 1; } __setup("nosoftlockup", nosoftlockup_setup); -/* */ + #ifdef CONFIG_SMP static int __init softlockup_all_cpu_backtrace_setup(char *str) { @@ -239,10 +250,11 @@ static int is_softlockup(unsigned long touch_ts) { unsigned long now = get_timestamp(); - /* Warn about unreasonable delays: */ - if (time_after(now, touch_ts + get_softlockup_thresh())) - return now - touch_ts; - + if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) { + /* Warn about unreasonable delays. */ + if (time_after(now, touch_ts + get_softlockup_thresh())) + return now - touch_ts; + } return 0; } @@ -477,6 +489,21 @@ static void watchdog(unsigned int cpu) __this_cpu_write(soft_lockup_hrtimer_cnt, __this_cpu_read(hrtimer_interrupts)); __touch_watchdog(); + + /* + * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the + * failure path. Check for failures that can occur asynchronously - + * for example, when CPUs are on-lined - and shut down the hardware + * perf event on each CPU accordingly. + * + * The only non-obvious place this bit can be cleared is through + * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a + * pr_info here would be too noisy as it would result in a message + * every few seconds if the hardlockup was disabled but the softlockup + * enabled. + */ + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + watchdog_nmi_disable(cpu); } #ifdef CONFIG_HARDLOCKUP_DETECTOR @@ -492,14 +519,9 @@ static int watchdog_nmi_enable(unsigned int cpu) struct perf_event_attr *wd_attr; struct perf_event *event = per_cpu(watchdog_ev, cpu); - /* - * Some kernels need to default hard lockup detection to - * 'disabled', for example a guest on a hypervisor. - */ - if (!watchdog_hardlockup_detector_is_enabled()) { - event = ERR_PTR(-ENOENT); - goto handle_err; - } + /* nothing to do if the hard lockup detector is disabled */ + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + goto out; /* is it already setup and enabled? */ if (event && event->state > PERF_EVENT_STATE_OFF) @@ -515,7 +537,6 @@ static int watchdog_nmi_enable(unsigned int cpu) /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); -handle_err: /* save cpu0 error for future comparision */ if (cpu == 0 && IS_ERR(event)) cpu0_err = PTR_ERR(event); @@ -527,6 +548,18 @@ handle_err: goto out_save; } + /* + * Disable the hard lockup detector if _any_ CPU fails to set up + * set up the hardware perf event. The watchdog() function checks + * the NMI_WATCHDOG_ENABLED bit periodically. + * + * The barriers are for syncing up watchdog_enabled across all the + * cpus, as clear_bit() does not use barriers. + */ + smp_mb__before_atomic(); + clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); + smp_mb__after_atomic(); + /* skip displaying the same error again */ if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) return PTR_ERR(event); @@ -540,6 +573,9 @@ handle_err: else pr_err("disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); + + pr_info("Shutting down hard lockup detector on all cpus\n"); + return PTR_ERR(event); /* success path */ @@ -567,9 +603,37 @@ static void watchdog_nmi_disable(unsigned int cpu) cpu0_err = 0; } } + +void watchdog_nmi_enable_all(void) +{ + int cpu; + + if (!watchdog_user_enabled) + return; + + get_online_cpus(); + for_each_online_cpu(cpu) + watchdog_nmi_enable(cpu); + put_online_cpus(); +} + +void watchdog_nmi_disable_all(void) +{ + int cpu; + + if (!watchdog_running) + return; + + get_online_cpus(); + for_each_online_cpu(cpu) + watchdog_nmi_disable(cpu); + put_online_cpus(); +} #else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } +void watchdog_nmi_enable_all(void) {} +void watchdog_nmi_disable_all(void) {} #endif /* CONFIG_HARDLOCKUP_DETECTOR */ static struct smp_hotplug_thread watchdog_threads = { @@ -600,7 +664,7 @@ static void restart_watchdog_hrtimer(void *info) HRTIMER_MODE_REL_PINNED); } -static void update_timers(int cpu) +static void update_watchdog(int cpu) { /* * Make sure that perf event counter will adopt to a new @@ -615,17 +679,17 @@ static void update_timers(int cpu) watchdog_nmi_enable(cpu); } -static void update_timers_all_cpus(void) +static void update_watchdog_all_cpus(void) { int cpu; get_online_cpus(); for_each_online_cpu(cpu) - update_timers(cpu); + update_watchdog(cpu); put_online_cpus(); } -static int watchdog_enable_all_cpus(bool sample_period_changed) +static int watchdog_enable_all_cpus(void) { int err = 0; @@ -635,8 +699,12 @@ static int watchdog_enable_all_cpus(bool sample_period_changed) pr_err("Failed to create watchdog threads, disabled\n"); else watchdog_running = 1; - } else if (sample_period_changed) { - update_timers_all_cpus(); + } else { + /* + * Enable/disable the lockup detectors or + * change the sample period 'on the fly'. + */ + update_watchdog_all_cpus(); } return err; @@ -654,58 +722,159 @@ static void watchdog_disable_all_cpus(void) } /* - * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh + * Update the run state of the lockup detectors. */ +static int proc_watchdog_update(void) +{ + int err = 0; -int proc_dowatchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + /* + * Watchdog threads won't be started if they are already active. + * The 'watchdog_running' variable in watchdog_*_all_cpus() takes + * care of this. If those threads are already active, the sample + * period will be updated and the lockup detectors will be enabled + * or disabled 'on the fly'. + */ + if (watchdog_enabled && watchdog_thresh) + err = watchdog_enable_all_cpus(); + else + watchdog_disable_all_cpus(); + + return err; + +} + +static DEFINE_MUTEX(watchdog_proc_mutex); + +/* + * common function for watchdog, nmi_watchdog and soft_watchdog parameter + * + * caller | table->data points to | 'which' contains the flag(s) + * -------------------|-----------------------|----------------------------- + * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed + * | | with SOFT_WATCHDOG_ENABLED + * -------------------|-----------------------|----------------------------- + * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED + * -------------------|-----------------------|----------------------------- + * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED + */ +static int proc_watchdog_common(int which, struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old_thresh, old_enabled; - bool old_hardlockup; - static DEFINE_MUTEX(watchdog_proc_mutex); + int err, old, new; + int *watchdog_param = (int *)table->data; mutex_lock(&watchdog_proc_mutex); - old_thresh = ACCESS_ONCE(watchdog_thresh); - old_enabled = ACCESS_ONCE(watchdog_user_enabled); - old_hardlockup = watchdog_hardlockup_detector_is_enabled(); - err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (err || !write) - goto out; - - set_sample_period(); /* - * Watchdog threads shouldn't be enabled if they are - * disabled. The 'watchdog_running' variable check in - * watchdog_*_all_cpus() function takes care of this. + * If the parameter is being read return the state of the corresponding + * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the + * run state of the lockup detectors. */ - if (watchdog_user_enabled && watchdog_thresh) { + if (!write) { + *watchdog_param = (watchdog_enabled & which) != 0; + err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + } else { + err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (err) + goto out; + /* - * Prevent a change in watchdog_thresh accidentally overriding - * the enablement of the hardlockup detector. + * There is a race window between fetching the current value + * from 'watchdog_enabled' and storing the new value. During + * this race window, watchdog_nmi_enable() can sneak in and + * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'. + * The 'cmpxchg' detects this race and the loop retries. */ - if (watchdog_user_enabled != old_enabled) - watchdog_enable_hardlockup_detector(true); - err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); - } else - watchdog_disable_all_cpus(); + do { + old = watchdog_enabled; + /* + * If the parameter value is not zero set the + * corresponding bit(s), else clear it(them). + */ + if (*watchdog_param) + new = old | which; + else + new = old & ~which; + } while (cmpxchg(&watchdog_enabled, old, new) != old); - /* Restore old values on failure */ - if (err) { - watchdog_thresh = old_thresh; - watchdog_user_enabled = old_enabled; - watchdog_enable_hardlockup_detector(old_hardlockup); + /* + * Update the run state of the lockup detectors. + * Restore 'watchdog_enabled' on failure. + */ + err = proc_watchdog_update(); + if (err) + watchdog_enabled = old; } out: mutex_unlock(&watchdog_proc_mutex); return err; } + +/* + * /proc/sys/kernel/watchdog + */ +int proc_watchdog(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, + table, write, buffer, lenp, ppos); +} + +/* + * /proc/sys/kernel/nmi_watchdog + */ +int proc_nmi_watchdog(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_watchdog_common(NMI_WATCHDOG_ENABLED, + table, write, buffer, lenp, ppos); +} + +/* + * /proc/sys/kernel/soft_watchdog + */ +int proc_soft_watchdog(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, + table, write, buffer, lenp, ppos); +} + +/* + * /proc/sys/kernel/watchdog_thresh + */ +int proc_watchdog_thresh(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err, old; + + mutex_lock(&watchdog_proc_mutex); + + old = ACCESS_ONCE(watchdog_thresh); + err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (err || !write) + goto out; + + /* + * Update the sample period. + * Restore 'watchdog_thresh' on failure. + */ + set_sample_period(); + err = proc_watchdog_update(); + if (err) + watchdog_thresh = old; +out: + mutex_unlock(&watchdog_proc_mutex); + return err; +} #endif /* CONFIG_SYSCTL */ void __init lockup_detector_init(void) { set_sample_period(); - if (watchdog_user_enabled) - watchdog_enable_all_cpus(false); + if (watchdog_enabled) + watchdog_enable_all_cpus(); } |