From e7fd37ba12170cc414be8b639dfc2c5f7172fac2 Mon Sep 17 00:00:00 2001 From: Ma Shimiao Date: Tue, 12 Dec 2017 09:43:49 +0800 Subject: cgroup: avoid copying strings longer than the buffers cgroup root name and file name have max length limit, we should avoid copying longer name than that to the name. tj: minor update to $SUBJ. Signed-off-by: Ma Shimiao Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0b1ffe147f24..18d71fbd3923 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, cft->name); else - strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); return buf; } @@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) root->flags = opts->flags; if (opts->release_agent) - strcpy(root->release_agent_path, opts->release_agent); + strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); if (opts->name) - strcpy(root->name, opts->name); + strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); if (opts->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -- cgit v1.2.3 From 50034ed49645463a16327cad05694e201e6b4126 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 15 Dec 2017 05:09:47 -0800 Subject: cgroup: use strlcpy() instead of strscpy() to avoid spurious warning As long as cft->name is guaranteed to be NUL-terminated, using strlcpy() would work just as well and avoid that warning, so the change below could be folded into that commit. Signed-off-by: Arnd Bergmann Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 18d71fbd3923..f4c2f8cb5748 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, cft->name); else - strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX); return buf; } @@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) root->flags = opts->flags; if (opts->release_agent) - strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); + strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX); if (opts->name) - strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); + strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); if (opts->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -- cgit v1.2.3 From 116d2f7496c51b2e02e8e4ecdd2bdf5fb9d5a641 Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Tue, 19 Dec 2017 12:56:57 +0530 Subject: cgroup: Fix deadlock in cpu hotplug path Deadlock during cgroup migration from cpu hotplug path when a task T is being moved from source to destination cgroup. kworker/0:0 cpuset_hotplug_workfn() cpuset_hotplug_update_tasks() hotplug_update_tasks_legacy() remove_tasks_in_empty_cpuset() cgroup_transfer_tasks() // stuck in iterator loop cgroup_migrate() cgroup_migrate_add_task() In cgroup_migrate_add_task() it checks for PF_EXITING flag of task T. Task T will not migrate to destination cgroup. css_task_iter_start() will keep pointing to task T in loop waiting for task T cg_list node to be removed. Task T do_exit() exit_signals() // sets PF_EXITING exit_task_namespaces() switch_task_namespaces() free_nsproxy() put_mnt_ns() drop_collected_mounts() namespace_unlock() synchronize_rcu() _synchronize_rcu_expedited() schedule_work() // on cpu0 low priority worker pool wait_event() // waiting for work item to execute Task T inserted a work item in the worklist of cpu0 low priority worker pool. It is waiting for expedited grace period work item to execute. This work item will only be executed once kworker/0:0 complete execution of cpuset_hotplug_workfn(). kworker/0:0 ==> Task T ==>kworker/0:0 In case of PF_EXITING task being migrated from source to destination cgroup, migrate next available task in source cgroup. Signed-off-by: Prateek Sood Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 024085daab1a..a2c05d2476ac 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) */ do { css_task_iter_start(&from->self, 0, &it); - task = css_task_iter_next(&it); + + do { + task = css_task_iter_next(&it); + } while (task && (task->flags & PF_EXITING)); + if (task) get_task_struct(task); css_task_iter_end(&it); -- cgit v1.2.3 From 74d0833c659a8a54735e5efdd44f4b225af68586 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 20 Dec 2017 07:09:19 -0800 Subject: cgroup: fix css_task_iter crash on CSS_TASK_ITER_PROC While teaching css_task_iter to handle skipping over tasks which aren't group leaders, bc2fb7ed089f ("cgroup: add @flags to css_task_iter_start() and implement CSS_TASK_ITER_PROCS") introduced a silly bug. CSS_TASK_ITER_PROCS is implemented by repeating css_task_iter_advance() while the advanced cursor is pointing to a non-leader thread. However, the cursor variable, @l, wasn't updated when the iteration has to advance to the next css_set and the following repetition would operate on the terminal @l from the previous iteration which isn't pointing to a valid task leading to oopses like the following or infinite looping. BUG: unable to handle kernel NULL pointer dereference at 0000000000000254 IP: __task_pid_nr_ns+0xc7/0xf0 PGD 0 P4D 0 Oops: 0000 [#1] SMP ... CPU: 2 PID: 1 Comm: systemd Not tainted 4.14.4-200.fc26.x86_64 #1 Hardware name: System manufacturer System Product Name/PRIME B350M-A, BIOS 3203 11/09/2017 task: ffff88c4baee8000 task.stack: ffff96d5c3158000 RIP: 0010:__task_pid_nr_ns+0xc7/0xf0 RSP: 0018:ffff96d5c315bd50 EFLAGS: 00010206 RAX: 0000000000000000 RBX: ffff88c4b68c6000 RCX: 0000000000000250 RDX: ffffffffa5e47960 RSI: 0000000000000000 RDI: ffff88c490f6ab00 RBP: ffff96d5c315bd50 R08: 0000000000001000 R09: 0000000000000005 R10: ffff88c4be006b80 R11: ffff88c42f1b8004 R12: ffff96d5c315bf18 R13: ffff88c42d7dd200 R14: ffff88c490f6a510 R15: ffff88c4b68c6000 FS: 00007f9446f8ea00(0000) GS:ffff88c4be680000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000254 CR3: 00000007f956f000 CR4: 00000000003406e0 Call Trace: cgroup_procs_show+0x19/0x30 cgroup_seqfile_show+0x4c/0xb0 kernfs_seq_show+0x21/0x30 seq_read+0x2ec/0x3f0 kernfs_fop_read+0x134/0x180 __vfs_read+0x37/0x160 ? security_file_permission+0x9b/0xc0 vfs_read+0x8e/0x130 SyS_read+0x55/0xc0 entry_SYSCALL_64_fastpath+0x1a/0xa5 RIP: 0033:0x7f94455f942d RSP: 002b:00007ffe81ba2d00 EFLAGS: 00000293 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 00005574e2233f00 RCX: 00007f94455f942d RDX: 0000000000001000 RSI: 00005574e2321a90 RDI: 000000000000002b RBP: 0000000000000000 R08: 00005574e2321a90 R09: 00005574e231de60 R10: 00007f94458c8b38 R11: 0000000000000293 R12: 00007f94458c8ae0 R13: 00007ffe81ba3800 R14: 0000000000000000 R15: 00005574e2116560 Code: 04 74 0e 89 f6 48 8d 04 76 48 8d 04 c5 f0 05 00 00 48 8b bf b8 05 00 00 48 01 c7 31 c0 48 8b 0f 48 85 c9 74 18 8b b2 30 08 00 00 <3b> 71 04 77 0d 48 c1 e6 05 48 01 f1 48 3b 51 38 74 09 5d c3 8b RIP: __task_pid_nr_ns+0xc7/0xf0 RSP: ffff96d5c315bd50 Fix it by moving the initialization of the cursor below the repeat label. While at it, rename it to @next for readability. Signed-off-by: Tejun Heo Fixes: bc2fb7ed089f ("cgroup: add @flags to css_task_iter_start() and implement CSS_TASK_ITER_PROCS") Cc: stable@vger.kernel.org # v4.14+ Reported-by: Laura Abbott Reported-by: Bronek Kozicki Reported-by: George Amanakis Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f4c2f8cb5748..2cf06c274e4c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4125,26 +4125,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) static void css_task_iter_advance(struct css_task_iter *it) { - struct list_head *l = it->task_pos; + struct list_head *next; lockdep_assert_held(&css_set_lock); - WARN_ON_ONCE(!l); - repeat: /* * Advance iterator to find next entry. cset->tasks is consumed * first and then ->mg_tasks. After ->mg_tasks, we move onto the * next cset. */ - l = l->next; + next = it->task_pos->next; - if (l == it->tasks_head) - l = it->mg_tasks_head->next; + if (next == it->tasks_head) + next = it->mg_tasks_head->next; - if (l == it->mg_tasks_head) + if (next == it->mg_tasks_head) css_task_iter_advance_css_set(it); else - it->task_pos = l; + it->task_pos = next; /* if PROCS, skip over tasks which aren't group leaders */ if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && -- cgit v1.2.3 From 040ee69226f8a96b7943645d68f41d5d44b5ff7d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 Dec 2017 20:20:38 -0500 Subject: fix "netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'" Descriptor table is a shared object; it's not a place where you can stick temporary references to files, especially when we don't need an opened file at all. Cc: stable@vger.kernel.org # v4.14 Fixes: 98589a0998b8 ("netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'") Signed-off-by: Al Viro --- include/linux/bpf.h | 10 ++++++++++ kernel/bpf/inode.c | 40 +++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 2 +- net/netfilter/xt_bpf.c | 14 ++------------ 4 files changed, 52 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e55e4255a210..b63a592ad29d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -419,6 +419,8 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) attr->numa_node : NUMA_NO_NODE; } +struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -506,6 +508,12 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, { return 0; } + +static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, + enum bpf_prog_type type) +{ + return ERR_PTR(-EOPNOTSUPP); +} #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, @@ -514,6 +522,8 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, return bpf_prog_get_type_dev(ufd, type, false); } +bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool); + int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 01aaef1a77c5..5bb5e49ef4c3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -368,7 +368,45 @@ out: putname(pname); return ret; } -EXPORT_SYMBOL_GPL(bpf_obj_get_user); + +static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) +{ + struct bpf_prog *prog; + int ret = inode_permission(inode, MAY_READ | MAY_WRITE); + if (ret) + return ERR_PTR(ret); + + if (inode->i_op == &bpf_map_iops) + return ERR_PTR(-EINVAL); + if (inode->i_op != &bpf_prog_iops) + return ERR_PTR(-EACCES); + + prog = inode->i_private; + + ret = security_bpf_prog(prog); + if (ret < 0) + return ERR_PTR(ret); + + if (!bpf_prog_get_ok(prog, &type, false)) + return ERR_PTR(-EINVAL); + + return bpf_prog_inc(prog); +} + +struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) +{ + struct bpf_prog *prog; + struct path path; + int ret = kern_path(name, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + prog = __get_prog_inode(d_backing_inode(path.dentry), type); + if (!IS_ERR(prog)) + touch_atime(&path); + path_put(&path); + return prog; +} +EXPORT_SYMBOL(bpf_prog_get_type_path); static void bpf_evict_inode(struct inode *inode) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c4cfeaa8d5e..5cb783fc8224 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1057,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static bool bpf_prog_get_ok(struct bpf_prog *prog, +bool bpf_prog_get_ok(struct bpf_prog *prog, enum bpf_prog_type *attach_type, bool attach_drv) { /* not an attachment, just a refcount inc, always allow */ diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 041da0d9c06f..fa2ca0a13619 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -52,18 +52,8 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret) static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) { - mm_segment_t oldfs = get_fs(); - int retval, fd; - - set_fs(KERNEL_DS); - fd = bpf_obj_get_user(path, 0); - set_fs(oldfs); - if (fd < 0) - return fd; - - retval = __bpf_mt_check_fd(fd, ret); - sys_close(fd); - return retval; + *ret = bpf_prog_get_type_path(path, BPF_PROG_TYPE_SOCKET_FILTER); + return PTR_ERR_OR_ZERO(*ret); } static int bpf_mt_check(const struct xt_mtchk_param *par) -- cgit v1.2.3 From 5731a879d03bdaa00265f8ebc32dfd0e65d25276 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 4 Jan 2018 20:02:09 -0800 Subject: bpf: sockmap missing NULL psock check Add psock NULL check to handle a racing sock event that can get the sk_callback_lock before this case but after xchg happens causing the refcnt to hit zero and sock user data (psock) to be null and queued for garbage collection. Also add a comment in the code because this is a bit subtle and not obvious in my opinion. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 5ee2e41893d9..1712d319c2d8 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -591,8 +591,15 @@ static void sock_map_free(struct bpf_map *map) write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); - smap_list_remove(psock, &stab->sock_map[i]); - smap_release_sock(psock, sock); + /* This check handles a racing sock event that can get the + * sk_callback_lock before this case but after xchg happens + * causing the refcnt to hit zero and sock user data (psock) + * to be null and queued for garbage collection. + */ + if (likely(psock)) { + smap_list_remove(psock, &stab->sock_map[i]); + smap_release_sock(psock, sock); + } write_unlock_bh(&sock->sk_callback_lock); } rcu_read_unlock(); -- cgit v1.2.3 From 527187d28569e39c5d489d6306d3b79605cf85a6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jan 2018 17:27:19 +0100 Subject: locking/lockdep: Remove cross-release leftovers There's two cross-release leftover facilities: - the crossrelease_hist_*() irq-tracing callbacks (NOPs currently) - the complete_release_commit() callback (NOP as well) Remove them. Cc: David Sterba Cc: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/completion.h | 1 - include/linux/irqflags.h | 4 ---- include/linux/lockdep.h | 2 -- kernel/sched/completion.c | 5 ----- 4 files changed, 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/completion.h b/include/linux/completion.h index 94a59ba7d422..519e94915d18 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -32,7 +32,6 @@ struct completion { #define init_completion(x) __init_completion(x) static inline void complete_acquire(struct completion *x) {} static inline void complete_release(struct completion *x) {} -static inline void complete_release_commit(struct completion *x) {} #define COMPLETION_INITIALIZER(work) \ { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 46cb57d5eb13..1b3996ff3f16 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -27,22 +27,18 @@ # define trace_hardirq_enter() \ do { \ current->hardirq_context++; \ - crossrelease_hist_start(XHLOCK_HARD); \ } while (0) # define trace_hardirq_exit() \ do { \ current->hardirq_context--; \ - crossrelease_hist_end(XHLOCK_HARD); \ } while (0) # define lockdep_softirq_enter() \ do { \ current->softirq_context++; \ - crossrelease_hist_start(XHLOCK_SOFT); \ } while (0) # define lockdep_softirq_exit() \ do { \ current->softirq_context--; \ - crossrelease_hist_end(XHLOCK_SOFT); \ } while (0) # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, #else diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 2e75dc34bff5..3251d9c0d313 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -475,8 +475,6 @@ enum xhlock_context_t { #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ { .name = (_name), .key = (void *)(_key), } -static inline void crossrelease_hist_start(enum xhlock_context_t c) {} -static inline void crossrelease_hist_end(enum xhlock_context_t c) {} static inline void lockdep_invariant_state(bool force) {} static inline void lockdep_init_task(struct task_struct *task) {} static inline void lockdep_free_task(struct task_struct *task) {} diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 2ddaec40956f..0926aef10dad 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -34,11 +34,6 @@ void complete(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); - /* - * Perform commit of crossrelease here. - */ - complete_release_commit(x); - if (x->done != UINT_MAX) x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); -- cgit v1.2.3 From b2157399cc9898260d6031c5bfe45fe137c1fbe7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 7 Jan 2018 17:33:02 -0800 Subject: bpf: prevent out-of-bounds speculation Under speculation, CPUs may mis-predict branches in bounds checks. Thus, memory accesses under a bounds check may be speculated even if the bounds check fails, providing a primitive for building a side channel. To avoid leaking kernel data round up array-based maps and mask the index after bounds check, so speculated load with out of bounds index will load either valid value from the array or zero from the padded area. Unconditionally mask index for all array types even when max_entries are not rounded to power of 2 for root user. When map is created by unpriv user generate a sequence of bpf insns that includes AND operation to make sure that JITed code includes the same 'index & index_mask' operation. If prog_array map is created by unpriv user replace bpf_tail_call(ctx, map, index); with if (index >= max_entries) { index &= map->index_mask; bpf_tail_call(ctx, map, index); } (along with roundup to power 2) to prevent out-of-bounds speculation. There is secondary redundant 'if (index >= max_entries)' in the interpreter and in all JITs, but they can be optimized later if necessary. Other array-like maps (cpumap, devmap, sockmap, perf_event_array, cgroup_array) cannot be used by unpriv, so no changes there. That fixes bpf side of "Variant 1: bounds check bypass (CVE-2017-5753)" on all architectures with and without JIT. v2->v3: Daniel noticed that attack potentially can be crafted via syscall commands without loading the program, so add masking to those paths as well. Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ kernel/bpf/arraymap.c | 47 ++++++++++++++++++++++++++++++++++++----------- kernel/bpf/verifier.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e55e4255a210..1b985ca4ffbe 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -52,6 +52,7 @@ struct bpf_map { u32 pages; u32 id; int numa_node; + bool unpriv_array; struct user_struct *user; const struct bpf_map_ops *ops; struct work_struct work; @@ -221,6 +222,7 @@ struct bpf_prog_aux { struct bpf_array { struct bpf_map map; u32 elem_size; + u32 index_mask; /* 'ownership' of prog_array is claimed by the first program that * is going to use this map or by the first program which FD is stored * in the map to make sure that all callers and callees have the same diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 7c25426d3cf5..aaa319848e7d 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -53,9 +53,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); + u32 elem_size, index_mask, max_entries; + bool unpriv = !capable(CAP_SYS_ADMIN); struct bpf_array *array; u64 array_size; - u32 elem_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -72,11 +73,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); + max_entries = attr->max_entries; + index_mask = roundup_pow_of_two(max_entries) - 1; + + if (unpriv) + /* round up array size to nearest power of 2, + * since cpu will speculate within index_mask limits + */ + max_entries = index_mask + 1; + array_size = sizeof(*array); if (percpu) - array_size += (u64) attr->max_entries * sizeof(void *); + array_size += (u64) max_entries * sizeof(void *); else - array_size += (u64) attr->max_entries * elem_size; + array_size += (u64) max_entries * elem_size; /* make sure there is no u32 overflow later in round_up() */ if (array_size >= U32_MAX - PAGE_SIZE) @@ -86,6 +96,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array = bpf_map_area_alloc(array_size, numa_node); if (!array) return ERR_PTR(-ENOMEM); + array->index_mask = index_mask; + array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ array->map.map_type = attr->map_type; @@ -121,12 +133,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return array->value + array->elem_size * index; + return array->value + array->elem_size * (index & array->index_mask); } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { + struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; u32 elem_size = round_up(map->value_size, 8); const int ret = BPF_REG_0; @@ -135,7 +148,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + if (map->unpriv_array) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); + *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + } if (is_power_of_2(elem_size)) { *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); @@ -157,7 +175,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return this_cpu_ptr(array->pptrs[index]); + return this_cpu_ptr(array->pptrs[index & array->index_mask]); } int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) @@ -177,7 +195,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) */ size = round_up(map->value_size, 8); rcu_read_lock(); - pptr = array->pptrs[index]; + pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); off += size; @@ -225,10 +243,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -EEXIST; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) - memcpy(this_cpu_ptr(array->pptrs[index]), + memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size); else - memcpy(array->value + array->elem_size * index, + memcpy(array->value + + array->elem_size * (index & array->index_mask), value, map->value_size); return 0; } @@ -262,7 +281,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, */ size = round_up(map->value_size, 8); rcu_read_lock(); - pptr = array->pptrs[index]; + pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); off += size; @@ -613,6 +632,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) static u32 array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { + struct bpf_array *array = container_of(map, struct bpf_array, map); u32 elem_size = round_up(map->value_size, 8); struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; @@ -621,7 +641,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + if (map->unpriv_array) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); + *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + } if (is_power_of_2(elem_size)) *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); else diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 04b24876cd23..b414d6b2d470 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1729,6 +1729,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; + if (func_id == BPF_FUNC_tail_call) { + if (meta.map_ptr == NULL) { + verbose(env, "verifier bug\n"); + return -EINVAL; + } + env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; + } err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; @@ -4456,6 +4463,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ insn->imm = 0; insn->code = BPF_JMP | BPF_TAIL_CALL; + + /* instead of changing every JIT dealing with tail_call + * emit two extra insns: + * if (index >= max_entries) goto out; + * index &= array->index_mask; + * to avoid out-of-bounds cpu speculation + */ + map_ptr = env->insn_aux_data[i + delta].map_ptr; + if (map_ptr == BPF_MAP_PTR_POISON) { + verbose(env, "tail_call obusing map_ptr\n"); + return -EINVAL; + } + if (!map_ptr->unpriv_array) + continue; + insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, + map_ptr->max_entries, 2); + insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, + container_of(map_ptr, + struct bpf_array, + map)->index_mask); + insn_buf[2] = *insn; + cnt = 3; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; continue; } -- cgit v1.2.3 From 290af86629b25ffd1ed6232c4e9107da031705cb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Jan 2018 10:04:29 -0800 Subject: bpf: introduce BPF_JIT_ALWAYS_ON config The BPF interpreter has been used as part of the spectre 2 attack CVE-2017-5715. A quote from goolge project zero blog: "At this point, it would normally be necessary to locate gadgets in the host kernel code that can be used to actually leak data by reading from an attacker-controlled location, shifting and masking the result appropriately and then using the result of that as offset to an attacker-controlled address for a load. But piecing gadgets together and figuring out which ones work in a speculation context seems annoying. So instead, we decided to use the eBPF interpreter, which is built into the host kernel - while there is no legitimate way to invoke it from inside a VM, the presence of the code in the host kernel's text section is sufficient to make it usable for the attack, just like with ordinary ROP gadgets." To make attacker job harder introduce BPF_JIT_ALWAYS_ON config option that removes interpreter from the kernel in favor of JIT-only mode. So far eBPF JIT is supported by: x64, arm64, arm32, sparc64, s390, powerpc64, mips64 The start of JITed program is randomized and code page is marked as read-only. In addition "constant blinding" can be turned on with net.core.bpf_jit_harden v2->v3: - move __bpf_prog_ret0 under ifdef (Daniel) v1->v2: - fix init order, test_bpf and cBPF (Daniel's feedback) - fix offloaded bpf (Jakub's feedback) - add 'return 0' dummy in case something can invoke prog->bpf_func - retarget bpf tree. For bpf-next the patch would need one extra hunk. It will be sent when the trees are merged back to net-next Considered doing: int bpf_jit_enable __read_mostly = BPF_EBPF_JIT_DEFAULT; but it seems better to land the patch as-is and in bpf-next remove bpf_jit_enable global variable from all JITs, consolidate in one place and remove this jit_init() function. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- init/Kconfig | 7 +++++++ kernel/bpf/core.c | 19 +++++++++++++++++++ lib/test_bpf.c | 11 +++++++---- net/core/filter.c | 6 ++---- net/core/sysctl_net_core.c | 6 ++++++ net/socket.c | 9 +++++++++ 6 files changed, 50 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 2934249fba46..5e2a4a391ba9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1392,6 +1392,13 @@ config BPF_SYSCALL Enable the bpf() system call that allows to manipulate eBPF programs and maps via file descriptors. +config BPF_JIT_ALWAYS_ON + bool "Permanently enable BPF JIT and remove BPF interpreter" + depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT + help + Enables BPF JIT and removes BPF interpreter to avoid + speculative execution of BPF instructions by the interpreter + config USERFAULTFD bool "Enable userfaultfd() system call" select ANON_INODES diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 86b50aa26ee8..51ec2dda7f08 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -767,6 +767,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +#ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context * @ctx: is the data we are operating on @@ -1317,6 +1318,14 @@ EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; +#else +static unsigned int __bpf_prog_ret0(const void *ctx, + const struct bpf_insn *insn) +{ + return 0; +} +#endif + bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { @@ -1364,9 +1373,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { +#ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; +#else + fp->bpf_func = __bpf_prog_ret0; +#endif /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -1376,6 +1389,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) */ if (!bpf_prog_is_dev_bound(fp->aux)) { fp = bpf_int_jit_compile(fp); +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + if (!fp->jited) { + *err = -ENOTSUPP; + return fp; + } +#endif } else { *err = bpf_prog_offload_compile(fp); if (*err) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 9e9748089270..f369889e521d 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -6250,9 +6250,8 @@ static struct bpf_prog *generate_filter(int which, int *err) return NULL; } } - /* We don't expect to fail. */ if (*err) { - pr_cont("FAIL to attach err=%d len=%d\n", + pr_cont("FAIL to prog_create err=%d len=%d\n", *err, fprog.len); return NULL; } @@ -6276,6 +6275,10 @@ static struct bpf_prog *generate_filter(int which, int *err) * checks. */ fp = bpf_prog_select_runtime(fp, err); + if (*err) { + pr_cont("FAIL to select_runtime err=%d\n", *err); + return NULL; + } break; } @@ -6461,8 +6464,8 @@ static __init int test_bpf(void) pass_cnt++; continue; } - - return err; + err_cnt++; + continue; } pr_cont("jited:%u ", fp->jited); diff --git a/net/core/filter.c b/net/core/filter.c index 6a85e67fafce..d339ef170df6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1054,11 +1054,9 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) */ goto out_err_free; - /* We are guaranteed to never error here with cBPF to eBPF - * transitions, since there's no issue with type compatibility - * checks on program arrays. - */ fp = bpf_prog_select_runtime(fp, &err); + if (err) + goto out_err_free; kfree(old_prog); return fp; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index cbc3dde4cfcc..a47ad6cd41c0 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -325,7 +325,13 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, +#ifndef CONFIG_BPF_JIT_ALWAYS_ON .proc_handler = proc_dointvec +#else + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &one, +#endif }, # ifdef CONFIG_HAVE_EBPF_JIT { diff --git a/net/socket.c b/net/socket.c index 05f361faec45..78acd6ce74c7 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2619,6 +2619,15 @@ out_fs: core_initcall(sock_init); /* early initcall */ +static int __init jit_init(void) +{ +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + bpf_jit_enable = 1; +#endif + return 0; +} +pure_initcall(jit_init); + #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { -- cgit v1.2.3 From 541676078b52f365f53d46ee5517d305cd1b6350 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 15 Dec 2017 14:23:10 -0500 Subject: membarrier: Disable preemption when calling smp_call_function_many() smp_call_function_many() requires disabling preemption around the call. Signed-off-by: Mathieu Desnoyers Cc: # v4.14+ Cc: Andrea Parri Cc: Andrew Hunter Cc: Avi Kivity Cc: Benjamin Herrenschmidt Cc: Boqun Feng Cc: Dave Watson Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Maged Michael Cc: Michael Ellerman Cc: Paul E . McKenney Cc: Paul E. McKenney Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171215192310.25293-1-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index dd7908743dab..9bcbacba82a8 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -89,7 +89,9 @@ static int membarrier_private_expedited(void) rcu_read_unlock(); } if (!fallback) { + preempt_disable(); smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); free_cpumask_var(tmpmask); } cpus_read_unlock(); -- cgit v1.2.3 From 4f58424da3deead2605e39a9df65f5f06107a3cb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 10 Jan 2018 04:35:12 -0800 Subject: cgroup: make cgroup.threads delegatable Make cgroup.threads file delegatable. The behavior of cgroup.threads should follow the behavior of cgroup.procs. Signed-off-by: Roman Gushchin Discovered-by: Michael Kerrisk Cc: Tejun Heo Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 2cf06c274e4c..7e4c44538119 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4447,6 +4447,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cgroup.threads", + .flags = CFTYPE_NS_DELEGATABLE, .release = cgroup_procs_release, .seq_start = cgroup_threads_start, .seq_next = cgroup_procs_next, -- cgit v1.2.3 From 40950343932879247861ae152dcb55e4555afdff Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 10 Jan 2018 09:20:54 +0000 Subject: bpf: fix spelling mistake: "obusing" -> "abusing" Trivial fix to spelling mistake in error message text. Signed-off-by: Colin Ian King Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b414d6b2d470..96ab165c873c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4472,7 +4472,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ map_ptr = env->insn_aux_data[i + delta].map_ptr; if (map_ptr == BPF_MAP_PTR_POISON) { - verbose(env, "tail_call obusing map_ptr\n"); + verbose(env, "tail_call abusing map_ptr\n"); return -EINVAL; } if (!map_ptr->unpriv_array) -- cgit v1.2.3 From 7891a87efc7116590eaba57acc3c422487802c6f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 10 Jan 2018 20:04:37 +0100 Subject: bpf: arsh is not supported in 32 bit alu thus reject it The following snippet was throwing an 'unknown opcode cc' warning in BPF interpreter: 0: (18) r0 = 0x0 2: (7b) *(u64 *)(r10 -16) = r0 3: (cc) (u32) r0 s>>= (u32) r0 4: (95) exit Although a number of JITs do support BPF_ALU | BPF_ARSH | BPF_{K,X} generation, not all of them do and interpreter does neither. We can leave existing ones and implement it later in bpf-next for the remaining ones, but reject this properly in verifier for the time being. Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") Reported-by: syzbot+93c4904c5c70348a6890@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 ++++ tools/testing/selftests/bpf/test_verifier.c | 40 +++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 96ab165c873c..20eb04fd155e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2493,6 +2493,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) { + verbose(env, "BPF_ARSH not supported for 32 bit ALU\n"); + return -EINVAL; + } + if ((opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index b51017404c62..6bafa5456568 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -272,6 +272,46 @@ static struct bpf_test tests[] = { .errstr = "invalid bpf_ld_imm64 insn", .result = REJECT, }, + { + "arsh32 on imm", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_ALU32_IMM(BPF_ARSH, BPF_REG_0, 5), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "BPF_ARSH not supported for 32 bit ALU", + }, + { + "arsh32 on reg", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_MOV64_IMM(BPF_REG_1, 5), + BPF_ALU32_REG(BPF_ARSH, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "BPF_ARSH not supported for 32 bit ALU", + }, + { + "arsh64 on imm", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_0, 5), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "arsh64 on reg", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_MOV64_IMM(BPF_REG_1, 5), + BPF_ALU64_REG(BPF_ARSH, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, { "no bpf_exit", .insns = { -- cgit v1.2.3 From bbeb6e4323dad9b5e0ee9f60c223dd532e2403b1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 10 Jan 2018 23:25:05 +0100 Subject: bpf, array: fix overflow in max_entries and undefined behavior in index_mask syzkaller tried to alloc a map with 0xfffffffd entries out of a userns, and thus unprivileged. With the recently added logic in b2157399cc98 ("bpf: prevent out-of-bounds speculation") we round this up to the next power of two value for max_entries for unprivileged such that we can apply proper masking into potentially zeroed out map slots. However, this will generate an index_mask of 0xffffffff, and therefore a + 1 will let this overflow into new max_entries of 0. This will pass allocation, etc, and later on map access we still enforce on the original attr->max_entries value which was 0xfffffffd, therefore triggering GPF all over the place. Thus bail out on overflow in such case. Moreover, on 32 bit archs roundup_pow_of_two() can also not be used, since fls_long(max_entries - 1) can result in 32 and 1UL << 32 in 32 bit space is undefined. Therefore, do this by hand in a 64 bit variable. This fixes all the issues triggered by syzkaller's reproducers. Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Reported-by: syzbot+b0efb8e572d01bce1ae0@syzkaller.appspotmail.com Reported-by: syzbot+6c15e9744f75f2364773@syzkaller.appspotmail.com Reported-by: syzbot+d2f5524fb46fd3b312ee@syzkaller.appspotmail.com Reported-by: syzbot+61d23c95395cc90dbc2b@syzkaller.appspotmail.com Reported-by: syzbot+0d363c942452cca68c01@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index aaa319848e7d..ab94d304a634 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -56,7 +56,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) u32 elem_size, index_mask, max_entries; bool unpriv = !capable(CAP_SYS_ADMIN); struct bpf_array *array; - u64 array_size; + u64 array_size, mask64; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -74,13 +74,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); max_entries = attr->max_entries; - index_mask = roundup_pow_of_two(max_entries) - 1; - if (unpriv) + /* On 32 bit archs roundup_pow_of_two() with max_entries that has + * upper most bit set in u32 space is undefined behavior due to + * resulting 1U << 32, so do it manually here in u64 space. + */ + mask64 = fls_long(max_entries - 1); + mask64 = 1ULL << mask64; + mask64 -= 1; + + index_mask = mask64; + if (unpriv) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ max_entries = index_mask + 1; + /* Check for overflows. */ + if (max_entries < attr->max_entries) + return ERR_PTR(-E2BIG); + } array_size = sizeof(*array); if (percpu) -- cgit v1.2.3 From 62635ea8c18f0f62df4cc58379e4f1d33afd5801 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 11 Jan 2018 09:53:35 +0900 Subject: workqueue: avoid hard lockups in show_workqueue_state() show_workqueue_state() can print out a lot of messages while being in atomic context, e.g. sysrq-t -> show_workqueue_state(). If the console device is slow it may end up triggering NMI hard lockup watchdog. Signed-off-by: Sergey Senozhatsky Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v4.5+ --- kernel/workqueue.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 43d18cb46308..f699122dab32 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "workqueue_internal.h" @@ -4463,6 +4464,12 @@ void show_workqueue_state(void) if (pwq->nr_active || !list_empty(&pwq->delayed_works)) show_pwq(pwq); spin_unlock_irqrestore(&pwq->pool->lock, flags); + /* + * We could be printing a lot from atomic context, e.g. + * sysrq-t -> show_workqueue_state(). Avoid triggering + * hard lockup. + */ + touch_nmi_watchdog(); } } @@ -4490,6 +4497,12 @@ void show_workqueue_state(void) pr_cont("\n"); next_pool: spin_unlock_irqrestore(&pool->lock, flags); + /* + * We could be printing a lot from atomic context, e.g. + * sysrq-t -> show_workqueue_state(). Avoid triggering + * hard lockup. + */ + touch_nmi_watchdog(); } rcu_read_unlock_sched(); -- cgit v1.2.3 From a0b1280368d1e91ab72f849ef095b4f07a39bbf1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 12 Jan 2018 16:53:14 -0800 Subject: kdump: write correct address of mem_section into vmcoreinfo Depending on configuration mem_section can now be an array or a pointer to an array allocated dynamically. In most cases, we can continue to refer to it as 'mem_section' regardless of what it is. But there's one exception: '&mem_section' means "address of the array" if mem_section is an array, but if mem_section is a pointer, it would mean "address of the pointer". We've stepped onto this in kdump code. VMCOREINFO_SYMBOL(mem_section) writes down address of pointer into vmcoreinfo, not array as we wanted. Let's introduce VMCOREINFO_SYMBOL_ARRAY() that would handle the situation correctly for both cases. Link: http://lkml.kernel.org/r/20180112162532.35896-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Fixes: 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y") Acked-by: Baoquan He Acked-by: Dave Young Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Greg Kroah-Hartman Cc: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crash_core.h | 2 ++ kernel/crash_core.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 06097ef30449..b511f6d24b42 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -42,6 +42,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("PAGESIZE=%ld\n", value) #define VMCOREINFO_SYMBOL(name) \ vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) +#define VMCOREINFO_SYMBOL_ARRAY(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name) #define VMCOREINFO_SIZE(name) \ vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ (unsigned long)sizeof(name)) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index b3663896278e..4f63597c824d 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -410,7 +410,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(contig_page_data); #endif #ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); + VMCOREINFO_SYMBOL_ARRAY(mem_section); VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); -- cgit v1.2.3 From c366287ebd698ef5e3de300d90cd62ee9ee7373e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Jan 2018 17:43:23 -0800 Subject: bpf: fix divides by zero Divides by zero are not nice, lets avoid them if possible. Also do_div() seems not needed when dealing with 32bit operands, but this seems a minor detail. Fixes: bd4cf0ed331a ("net: filter: rework/optimize internal BPF interpreter's instruction set") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 51ec2dda7f08..7949e8b8f94e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -956,7 +956,7 @@ select_insn: DST = tmp; CONT; ALU_MOD_X: - if (unlikely(SRC == 0)) + if (unlikely((u32)SRC == 0)) return 0; tmp = (u32) DST; DST = do_div(tmp, (u32) SRC); @@ -975,7 +975,7 @@ select_insn: DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - if (unlikely(SRC == 0)) + if (unlikely((u32)SRC == 0)) return 0; tmp = (u32) DST; do_div(tmp, (u32) SRC); -- cgit v1.2.3 From c1e2f0eaf015fb7076d51a339011f2383e6dd389 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Dec 2017 13:49:39 +0100 Subject: futex: Avoid violating the 10th rule of futex Julia reported futex state corruption in the following scenario: waiter waker stealer (prio > waiter) futex(WAIT_REQUEUE_PI, uaddr, uaddr2, timeout=[N ms]) futex_wait_requeue_pi() futex_wait_queue_me() freezable_schedule() futex(LOCK_PI, uaddr2) futex(CMP_REQUEUE_PI, uaddr, uaddr2, 1, 0) /* requeues waiter to uaddr2 */ futex(UNLOCK_PI, uaddr2) wake_futex_pi() cmp_futex_value_locked(uaddr2, waiter) wake_up_q() task> futex(LOCK_PI, uaddr2) __rt_mutex_start_proxy_lock() try_to_take_rt_mutex() /* steals lock */ rt_mutex_set_owner(lock, stealer) rt_mutex_wait_proxy_lock() __rt_mutex_slowlock() try_to_take_rt_mutex() /* fails, lock held by stealer */ if (timeout && !timeout->task) return -ETIMEDOUT; fixup_owner() /* lock wasn't acquired, so, fixup_pi_state_owner skipped */ return -ETIMEDOUT; /* At this point, we've returned -ETIMEDOUT to userspace, but the * futex word shows waiter to be the owner, and the pi_mutex has * stealer as the owner */ futex_lock(LOCK_PI, uaddr2) -> bails with EDEADLK, futex word says we're owner. And suggested that what commit: 73d786bd043e ("futex: Rework inconsistent rt_mutex/futex_q state") removes from fixup_owner() looks to be just what is needed. And indeed it is -- I completely missed that requeue_pi could also result in this case. So we need to restore that, except that subsequent patches, like commit: 16ffa12d7425 ("futex: Pull rt_mutex_futex_unlock() out from under hb->lock") changed all the locking rules. Even without that, the sequence: - if (rt_mutex_futex_trylock(&q->pi_state->pi_mutex)) { - locked = 1; - goto out; - } - raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock); - owner = rt_mutex_owner(&q->pi_state->pi_mutex); - if (!owner) - owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); - raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock); - ret = fixup_pi_state_owner(uaddr, q, owner); already suggests there were races; otherwise we'd never have to look at next_owner. So instead of doing 3 consecutive wait_lock sections with who knows what races, we do it all in a single section. Additionally, the usage of pi_state->owner in fixup_owner() was only safe because only the rt_mutex owner would modify it, which this additional case wrecks. Luckily the values can only change away and not to the value we're testing, this means we can do a speculative test and double check once we have the wait_lock. Fixes: 73d786bd043e ("futex: Rework inconsistent rt_mutex/futex_q state") Reported-by: Julia Cartwright Reported-by: Gratian Crisan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Tested-by: Julia Cartwright Tested-by: Gratian Crisan Cc: Darren Hart Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20171208124939.7livp7no2ov65rrc@hirez.programming.kicks-ass.net --- kernel/futex.c | 83 +++++++++++++++++++++++++++++++++-------- kernel/locking/rtmutex.c | 26 +++++++++---- kernel/locking/rtmutex_common.h | 1 + 3 files changed, 87 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 57d0b3657e16..9e69589b9248 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2294,21 +2294,17 @@ static void unqueue_me_pi(struct futex_q *q) spin_unlock(q->lock_ptr); } -/* - * Fixup the pi_state owner with the new owner. - * - * Must be called with hash bucket lock held and mm->sem held for non - * private futexes. - */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner) + struct task_struct *argowner) { - u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; u32 uval, uninitialized_var(curval), newval; - struct task_struct *oldowner; + struct task_struct *oldowner, *newowner; + u32 newtid; int ret; + lockdep_assert_held(q->lock_ptr); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); oldowner = pi_state->owner; @@ -2317,11 +2313,17 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, newtid |= FUTEX_OWNER_DIED; /* - * We are here either because we stole the rtmutex from the - * previous highest priority waiter or we are the highest priority - * waiter but have failed to get the rtmutex the first time. + * We are here because either: + * + * - we stole the lock and pi_state->owner needs updating to reflect + * that (@argowner == current), * - * We have to replace the newowner TID in the user space variable. + * or: + * + * - someone stole our lock and we need to fix things to point to the + * new owner (@argowner == NULL). + * + * Either way, we have to replace the TID in the user space variable. * This must be atomic as we have to preserve the owner died bit here. * * Note: We write the user space value _before_ changing the pi_state @@ -2334,6 +2336,42 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * in the PID check in lookup_pi_state. */ retry: + if (!argowner) { + if (oldowner != current) { + /* + * We raced against a concurrent self; things are + * already fixed up. Nothing to do. + */ + ret = 0; + goto out_unlock; + } + + if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { + /* We got the lock after all, nothing to fix. */ + ret = 0; + goto out_unlock; + } + + /* + * Since we just failed the trylock; there must be an owner. + */ + newowner = rt_mutex_owner(&pi_state->pi_mutex); + BUG_ON(!newowner); + } else { + WARN_ON_ONCE(argowner != current); + if (oldowner == current) { + /* + * We raced against a concurrent self; things are + * already fixed up. Nothing to do. + */ + ret = 0; + goto out_unlock; + } + newowner = argowner; + } + + newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; + if (get_futex_value_locked(&uval, uaddr)) goto handle_fault; @@ -2434,15 +2472,28 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * Got the lock. We might not be the anticipated owner if we * did a lock-steal - fix up the PI-state in that case: * - * We can safely read pi_state->owner without holding wait_lock - * because we now own the rt_mutex, only the owner will attempt - * to change it. + * Speculative pi_state->owner read (we don't hold wait_lock); + * since we own the lock pi_state->owner == current is the + * stable state, anything else needs more attention. */ if (q->pi_state->owner != current) ret = fixup_pi_state_owner(uaddr, q, current); goto out; } + /* + * If we didn't get the lock; check if anybody stole it from us. In + * that case, we need to fix up the uval to point to them instead of + * us, otherwise bad things happen. [10] + * + * Another speculative read; pi_state->owner == current is unstable + * but needs our attention. + */ + if (q->pi_state->owner == current) { + ret = fixup_pi_state_owner(uaddr, q, NULL); + goto out; + } + /* * Paranoia check. If we did not take the lock, then we should not be * the owner of the rt_mutex. diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6f3dba6e4e9e..65cc0cb984e6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1290,6 +1290,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, return ret; } +static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock) +{ + int ret = try_to_take_rt_mutex(lock, current, NULL); + + /* + * try_to_take_rt_mutex() sets the lock waiters bit + * unconditionally. Clean this up. + */ + fixup_rt_mutex_waiters(lock); + + return ret; +} + /* * Slow path try-lock function: */ @@ -1312,13 +1325,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) */ raw_spin_lock_irqsave(&lock->wait_lock, flags); - ret = try_to_take_rt_mutex(lock, current, NULL); - - /* - * try_to_take_rt_mutex() sets the lock waiters bit - * unconditionally. Clean this up. - */ - fixup_rt_mutex_waiters(lock); + ret = __rt_mutex_slowtrylock(lock); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); @@ -1505,6 +1512,11 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) return rt_mutex_slowtrylock(lock); } +int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) +{ + return __rt_mutex_slowtrylock(lock); +} + /** * rt_mutex_timed_lock - lock a rt_mutex interruptible * the timeout structure is provided diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 124e98ca0b17..68686b3ec3c1 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -148,6 +148,7 @@ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter); extern int rt_mutex_futex_trylock(struct rt_mutex *l); +extern int __rt_mutex_futex_trylock(struct rt_mutex *l); extern void rt_mutex_futex_unlock(struct rt_mutex *lock); extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, -- cgit v1.2.3 From fbe0e839d1e22d88810f3ee3e2f1479be4c0aa4a Mon Sep 17 00:00:00 2001 From: Li Jinyue Date: Thu, 14 Dec 2017 17:04:54 +0800 Subject: futex: Prevent overflow by strengthen input validation UBSAN reports signed integer overflow in kernel/futex.c: UBSAN: Undefined behaviour in kernel/futex.c:2041:18 signed integer overflow: 0 - -2147483648 cannot be represented in type 'int' Add a sanity check to catch negative values of nr_wake and nr_requeue. Signed-off-by: Li Jinyue Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Cc: dvhart@infradead.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1513242294-31786-1-git-send-email-lijinyue@huawei.com --- kernel/futex.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 9e69589b9248..8c5424dd5924 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1878,6 +1878,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_q *this, *next; DEFINE_WAKE_Q(wake_q); + if (nr_wake < 0 || nr_requeue < 0) + return -EINVAL; + /* * When PI not supported: return -ENOSYS if requeue_pi is true, * consequently the compiler knows requeue_pi is always false past -- cgit v1.2.3 From 68fda450a7df51cff9e5a4d4a4d9d0d5f2589153 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 12 Jan 2018 18:59:52 -0800 Subject: bpf: fix 32-bit divide by zero due to some JITs doing if (src_reg == 0) check in 64-bit mode for div/mod operations mask upper 32-bits of src register before doing the check Fixes: 622582786c9e ("net: filter: x86: internal BPF JIT") Fixes: 7a12b5031c6b ("sparc64: Add eBPF JIT.") Reported-by: syzbot+48340bb518e88849e2e3@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 18 ++++++++++++++++++ net/core/filter.c | 4 ++++ 2 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 20eb04fd155e..b7448347e6b6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4445,6 +4445,24 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) int i, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + /* due to JIT bugs clear upper 32-bits of src register + * before div/mod operation + */ + insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); + insn_buf[1] = *insn; + cnt = 2; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; diff --git a/net/core/filter.c b/net/core/filter.c index d339ef170df6..1c0eb436671f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -458,6 +458,10 @@ do_pass: convert_bpf_extensions(fp, &insn)) break; + if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || + fp->code == (BPF_ALU | BPF_MOD | BPF_X)) + *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); + *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; -- cgit v1.2.3 From a0e3a18f4baf8e3754ac1e56f0ade924d0c0c721 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 15 Jan 2018 10:47:09 -0500 Subject: ring-buffer: Bring back context level recursive checks Commit 1a149d7d3f45 ("ring-buffer: Rewrite trace_recursive_(un)lock() to be simpler") replaced the context level recursion checks with a simple counter. This would prevent the ring buffer code from recursively calling itself more than the max number of contexts that exist (Normal, softirq, irq, nmi). But this change caused a lockup in a specific case, which was during suspend and resume using a global clock. Adding a stack dump to see where this occurred, the issue was in the trace global clock itself: trace_buffer_lock_reserve+0x1c/0x50 __trace_graph_entry+0x2d/0x90 trace_graph_entry+0xe8/0x200 prepare_ftrace_return+0x69/0xc0 ftrace_graph_caller+0x78/0xa8 queued_spin_lock_slowpath+0x5/0x1d0 trace_clock_global+0xb0/0xc0 ring_buffer_lock_reserve+0xf9/0x390 The function graph tracer traced queued_spin_lock_slowpath that was called by trace_clock_global. This pointed out that the trace_clock_global() is not reentrant, as it takes a spin lock. It depended on the ring buffer recursive lock from letting that happen. By removing the context detection and adding just a max number of allowable recursions, it allowed the trace_clock_global() to be entered again and try to retake the spinlock it already held, causing a deadlock. Fixes: 1a149d7d3f45 ("ring-buffer: Rewrite trace_recursive_(un)lock() to be simpler") Reported-by: David Weinehall Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 62 +++++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9ab18995ff1e..0cddf60186da 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2534,29 +2534,59 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * The lock and unlock are done within a preempt disable section. * The current_context per_cpu variable can only be modified * by the current task between lock and unlock. But it can - * be modified more than once via an interrupt. There are four - * different contexts that we need to consider. + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. * - * Normal context. - * SoftIRQ context - * IRQ context - * NMI context + * bit 0 = NMI context + * bit 1 = IRQ context + * bit 2 = SoftIRQ context + * bit 3 = normal context. * - * If for some reason the ring buffer starts to recurse, we - * only allow that to happen at most 4 times (one for each - * context). If it happens 5 times, then we consider this a - * recusive loop and do not let it go further. + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. + * + * (binary) + * 101 - 1 = 100 + * 101 & 100 = 100 (clearing bit zero) + * + * 1010 - 1 = 1001 + * 1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. */ static __always_inline int trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { - if (cpu_buffer->current_context >= 4) + unsigned int val = cpu_buffer->current_context; + unsigned long pc = preempt_count(); + int bit; + + if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) + bit = RB_CTX_NORMAL; + else + bit = pc & NMI_MASK ? RB_CTX_NMI : + pc & HARDIRQ_MASK ? RB_CTX_IRQ : + pc & SOFTIRQ_OFFSET ? 2 : RB_CTX_SOFTIRQ; + + if (unlikely(val & (1 << bit))) return 1; - cpu_buffer->current_context++; - /* Interrupts must see this update */ - barrier(); + val |= (1 << bit); + cpu_buffer->current_context = val; return 0; } @@ -2564,9 +2594,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) static __always_inline void trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { - /* Don't let the dec leak out */ - barrier(); - cpu_buffer->current_context--; + cpu_buffer->current_context &= cpu_buffer->current_context - 1; } /** -- cgit v1.2.3 From 68e76e034b6b1c1ce2eece1ab8ae4008e14be470 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 15 Jan 2018 11:07:27 -0800 Subject: tracing: Prevent PROFILE_ALL_BRANCHES when FORTIFY_SOURCE=y I regularly get 50 MB - 60 MB files during kernel randconfig builds. These large files mostly contain (many repeats of; e.g., 124,594): In file included from ../include/linux/string.h:6:0, from ../include/linux/uuid.h:20, from ../include/linux/mod_devicetable.h:13, from ../scripts/mod/devicetable-offsets.c:3: ../include/linux/compiler.h:64:4: warning: '______f' is static but declared in inline function 'strcpy' which is not static [enabled by default] ______f = { \ ^ ../include/linux/compiler.h:56:23: note: in expansion of macro '__trace_if' ^ ../include/linux/string.h:425:2: note: in expansion of macro 'if' if (p_size == (size_t)-1 && q_size == (size_t)-1) ^ This only happens when CONFIG_FORTIFY_SOURCE=y and CONFIG_PROFILE_ALL_BRANCHES=y, so prevent PROFILE_ALL_BRANCHES if FORTIFY_SOURCE=y. Link: http://lkml.kernel.org/r/9199446b-a141-c0c3-9678-a3f9107f2750@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 904c952ac383..f54dc62b599c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -355,7 +355,7 @@ config PROFILE_ANNOTATED_BRANCHES on if you need to profile the system's use of these macros. config PROFILE_ALL_BRANCHES - bool "Profile all if conditionals" + bool "Profile all if conditionals" if !FORTIFY_SOURCE select TRACE_BRANCH_PROFILING help This tracer profiles all branch conditions. Every if () -- cgit v1.2.3 From c96f5471ce7d2aefd0dda560cc23f08ab00bc65d Mon Sep 17 00:00:00 2001 From: Josh Snyder Date: Mon, 18 Dec 2017 16:15:10 +0000 Subject: delayacct: Account blkio completion on the correct task Before commit: e33a9bba85a8 ("sched/core: move IO scheduling accounting from io_schedule_timeout() into scheduler") delayacct_blkio_end() was called after context-switching into the task which completed I/O. This resulted in double counting: the task would account a delay both waiting for I/O and for time spent in the runqueue. With e33a9bba85a8, delayacct_blkio_end() is called by try_to_wake_up(). In ttwu, we have not yet context-switched. This is more correct, in that the delay accounting ends when the I/O is complete. But delayacct_blkio_end() relies on 'get_current()', and we have not yet context-switched into the task whose I/O completed. This results in the wrong task having its delay accounting statistics updated. Instead of doing that, pass the task_struct being woken to delayacct_blkio_end(), so that it can update the statistics of the correct task. Signed-off-by: Josh Snyder Acked-by: Tejun Heo Acked-by: Balbir Singh Cc: Cc: Brendan Gregg Cc: Jens Axboe Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-block@vger.kernel.org Fixes: e33a9bba85a8 ("sched/core: move IO scheduling accounting from io_schedule_timeout() into scheduler") Link: http://lkml.kernel.org/r/1513613712-571-1-git-send-email-joshs@netflix.com Signed-off-by: Ingo Molnar --- include/linux/delayacct.h | 8 ++++---- kernel/delayacct.c | 42 ++++++++++++++++++++++++++---------------- kernel/sched/core.c | 6 +++--- 3 files changed, 33 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 4178d2493547..5e335b6203f4 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -71,7 +71,7 @@ extern void delayacct_init(void); extern void __delayacct_tsk_init(struct task_struct *); extern void __delayacct_tsk_exit(struct task_struct *); extern void __delayacct_blkio_start(void); -extern void __delayacct_blkio_end(void); +extern void __delayacct_blkio_end(struct task_struct *); extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); extern __u64 __delayacct_blkio_ticks(struct task_struct *); extern void __delayacct_freepages_start(void); @@ -122,10 +122,10 @@ static inline void delayacct_blkio_start(void) __delayacct_blkio_start(); } -static inline void delayacct_blkio_end(void) +static inline void delayacct_blkio_end(struct task_struct *p) { if (current->delays) - __delayacct_blkio_end(); + __delayacct_blkio_end(p); delayacct_clear_flag(DELAYACCT_PF_BLKIO); } @@ -169,7 +169,7 @@ static inline void delayacct_tsk_free(struct task_struct *tsk) {} static inline void delayacct_blkio_start(void) {} -static inline void delayacct_blkio_end(void) +static inline void delayacct_blkio_end(struct task_struct *p) {} static inline int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 4a1c33416b6a..e2764d767f18 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk) * Finish delay accounting for a statistic using its timestamps (@start), * accumalator (@total) and @count */ -static void delayacct_end(u64 *start, u64 *total, u32 *count) +static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count) { s64 ns = ktime_get_ns() - *start; unsigned long flags; if (ns > 0) { - spin_lock_irqsave(¤t->delays->lock, flags); + spin_lock_irqsave(lock, flags); *total += ns; (*count)++; - spin_unlock_irqrestore(¤t->delays->lock, flags); + spin_unlock_irqrestore(lock, flags); } } @@ -69,17 +69,25 @@ void __delayacct_blkio_start(void) current->delays->blkio_start = ktime_get_ns(); } -void __delayacct_blkio_end(void) +/* + * We cannot rely on the `current` macro, as we haven't yet switched back to + * the process being woken. + */ +void __delayacct_blkio_end(struct task_struct *p) { - if (current->delays->flags & DELAYACCT_PF_SWAPIN) - /* Swapin block I/O */ - delayacct_end(¤t->delays->blkio_start, - ¤t->delays->swapin_delay, - ¤t->delays->swapin_count); - else /* Other block I/O */ - delayacct_end(¤t->delays->blkio_start, - ¤t->delays->blkio_delay, - ¤t->delays->blkio_count); + struct task_delay_info *delays = p->delays; + u64 *total; + u32 *count; + + if (p->delays->flags & DELAYACCT_PF_SWAPIN) { + total = &delays->swapin_delay; + count = &delays->swapin_count; + } else { + total = &delays->blkio_delay; + count = &delays->blkio_count; + } + + delayacct_end(&delays->lock, &delays->blkio_start, total, count); } int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -153,8 +161,10 @@ void __delayacct_freepages_start(void) void __delayacct_freepages_end(void) { - delayacct_end(¤t->delays->freepages_start, - ¤t->delays->freepages_delay, - ¤t->delays->freepages_count); + delayacct_end( + ¤t->delays->lock, + ¤t->delays->freepages_start, + ¤t->delays->freepages_delay, + ¤t->delays->freepages_count); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 644fa2e3d993..a7bf32aabfda 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2056,7 +2056,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->state = TASK_WAKING; if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -2069,7 +2069,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) #else /* CONFIG_SMP */ if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -2122,7 +2122,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) if (!task_on_rq_queued(p)) { if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&rq->nr_iowait); } ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); -- cgit v1.2.3 From f37a8cb84cce18762e8f86a70bd6a49a66ab964c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 16 Jan 2018 23:30:10 +0100 Subject: bpf: reject stores into ctx via st and xadd Alexei found that verifier does not reject stores into context via BPF_ST instead of BPF_STX. And while looking at it, we also should not allow XADD variant of BPF_STX. The context rewriter is only assuming either BPF_LDX_MEM- or BPF_STX_MEM-type operations, thus reject anything other than that so that assumptions in the rewriter properly hold. Add test cases as well for BPF selftests. Fixes: d691f9e8d440 ("bpf: allow programs to write to certain skb fields") Reported-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 19 +++++++++++++++++++ tools/testing/selftests/bpf/test_verifier.c | 29 +++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b7448347e6b6..eb062b0fbf27 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -978,6 +978,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); } +static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = cur_regs(env) + regno; + + return reg->type == PTR_TO_CTX; +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -1258,6 +1265,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins return -EACCES; } + if (is_ctx_reg(env, insn->dst_reg)) { + verbose(env, "BPF_XADD stores into R%d context is not allowed\n", + insn->dst_reg); + return -EACCES; + } + /* check whether atomic_add can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1); @@ -3993,6 +4006,12 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; + if (is_ctx_reg(env, insn->dst_reg)) { + verbose(env, "BPF_ST stores into R%d context is not allowed\n", + insn->dst_reg); + return -EACCES; + } + /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 6bafa5456568..67e7c41674d2 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -2592,6 +2592,29 @@ static struct bpf_test tests[] = { .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, + { + "context stores via ST", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_ST_MEM(BPF_DW, BPF_REG_1, offsetof(struct __sk_buff, mark), 0), + BPF_EXIT_INSN(), + }, + .errstr = "BPF_ST stores into R1 context is not allowed", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "context stores via XADD", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_1, + BPF_REG_0, offsetof(struct __sk_buff, mark), 0), + BPF_EXIT_INSN(), + }, + .errstr = "BPF_XADD stores into R1 context is not allowed", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, { "direct packet access: test1", .insns = { @@ -4312,7 +4335,8 @@ static struct bpf_test tests[] = { .fixup_map1 = { 2 }, .errstr_unpriv = "R2 leaks addr into mem", .result_unpriv = REJECT, - .result = ACCEPT, + .result = REJECT, + .errstr = "BPF_XADD stores into R1 context is not allowed", }, { "leak pointer into ctx 2", @@ -4326,7 +4350,8 @@ static struct bpf_test tests[] = { }, .errstr_unpriv = "R10 leaks addr into mem", .result_unpriv = REJECT, - .result = ACCEPT, + .result = REJECT, + .errstr = "BPF_XADD stores into R1 context is not allowed", }, { "leak pointer into ctx 3", -- cgit v1.2.3 From 6f16101e6a8b4324c36e58a29d9e0dbb287cdedb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 18 Jan 2018 01:15:21 +0100 Subject: bpf: mark dst unknown on inconsistent {s, u}bounds adjustments syzkaller generated a BPF proglet and triggered a warning with the following: 0: (b7) r0 = 0 1: (d5) if r0 s<= 0x0 goto pc+0 R0=inv0 R1=ctx(id=0,off=0,imm=0) R10=fp0 2: (1f) r0 -= r1 R0=inv0 R1=ctx(id=0,off=0,imm=0) R10=fp0 verifier internal error: known but bad sbounds What happens is that in the first insn, r0's min/max value are both 0 due to the immediate assignment, later in the jsle test the bounds are updated for the min value in the false path, meaning, they yield smin_val = 1, smax_val = 0, and when ctx pointer is subtracted from r0, verifier bails out with the internal error and throwing a WARN since smin_val != smax_val for the known constant. For min_val > max_val scenario it means that reg_set_min_max() and reg_set_min_max_inv() (which both refine existing bounds) demonstrated that such branch cannot be taken at runtime. In above scenario for the case where it will be taken, the existing [0, 0] bounds are kept intact. Meaning, the rejection is not due to a verifier internal error, and therefore the WARN() is not necessary either. We could just reject such cases in adjust_{ptr,scalar}_min_max_vals() when either known scalars have smin_val != smax_val or umin_val != umax_val or any scalar reg with bounds smin_val > smax_val or umin_val > umax_val. However, there may be a small risk of breakage of buggy programs, so handle this more gracefully and in adjust_{ptr,scalar}_min_max_vals() just taint the dst reg as unknown scalar when we see ops with such kind of src reg. Reported-by: syzbot+6d362cadd45dc0a12ba4@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 27 +++--- tools/testing/selftests/bpf/test_verifier.c | 123 +++++++++++++++++++++++++++- 2 files changed, 138 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eb062b0fbf27..13551e623501 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1895,17 +1895,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[dst]; - if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(env, env->cur_state); - verbose(env, - "verifier internal error: known but bad sbounds\n"); - return -EINVAL; - } - if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(env, env->cur_state); - verbose(env, - "verifier internal error: known but bad ubounds\n"); - return -EINVAL; + if ((known && (smin_val != smax_val || umin_val != umax_val)) || + smin_val > smax_val || umin_val > umax_val) { + /* Taint dst register if offset had invalid bounds derived from + * e.g. dead branches. + */ + __mark_reg_unknown(dst_reg); + return 0; } if (BPF_CLASS(insn->code) != BPF_ALU64) { @@ -2097,6 +2093,15 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, src_known = tnum_is_const(src_reg.var_off); dst_known = tnum_is_const(dst_reg->var_off); + if ((src_known && (smin_val != smax_val || umin_val != umax_val)) || + smin_val > smax_val || umin_val > umax_val) { + /* Taint dst register if offset had invalid bounds derived from + * e.g. dead branches. + */ + __mark_reg_unknown(dst_reg); + return 0; + } + if (!src_known && opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { __mark_reg_unknown(dst_reg); diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 67e7c41674d2..5ed4175c4ff8 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -6732,7 +6732,7 @@ static struct bpf_test tests[] = { BPF_JMP_IMM(BPF_JA, 0, 0, -7), }, .fixup_map1 = { 4 }, - .errstr = "unbounded min value", + .errstr = "R0 invalid mem access 'inv'", .result = REJECT, }, { @@ -8633,6 +8633,127 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_XDP, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, + { + "check deducing bounds from const, 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 1, 0), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R0 tried to subtract pointer from scalar", + }, + { + "check deducing bounds from const, 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 1, 1), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JSLE, BPF_REG_0, 1, 1), + BPF_EXIT_INSN(), + BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "check deducing bounds from const, 3", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSLE, BPF_REG_0, 0, 0), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R0 tried to subtract pointer from scalar", + }, + { + "check deducing bounds from const, 4", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSLE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "check deducing bounds from const, 5", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R0 tried to subtract pointer from scalar", + }, + { + "check deducing bounds from const, 6", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R0 tried to subtract pointer from scalar", + }, + { + "check deducing bounds from const, 7", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, ~0), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 0), + BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "dereference of modified ctx ptr", + }, + { + "check deducing bounds from const, 8", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, ~0), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "dereference of modified ctx ptr", + }, + { + "check deducing bounds from const, 9", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 0), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R0 tried to subtract pointer from scalar", + }, + { + "check deducing bounds from const, 10", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSLE, BPF_REG_0, 0, 0), + /* Marks reg as unknown. */ + BPF_ALU64_IMM(BPF_NEG, BPF_REG_0, 0), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "math between ctx pointer and register with unbounded min value is not allowed", + }, { "bpf_exit with invalid return code. test1", .insns = { -- cgit v1.2.3 From a0c9259dc4e1923a98356967ce8b732da1979df8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 17 Jan 2018 16:01:47 +0100 Subject: irq/matrix: Spread interrupts on allocation Keith reported an issue with vector space exhaustion on a server machine which is caused by the i40e driver allocating 168 MSI interrupts when the driver is initialized, even when most of these interrupts are not used at all. The x86 vector allocation code tries to avoid the immediate allocation with the reservation mode, but the card uses MSI and does not support MSI entry masking, which prevents reservation mode and requires immediate vector allocation. The matrix allocator is a bit naive and prefers the first CPU in the cpumask which describes the possible target CPUs for an allocation. That results in allocating all 168 vectors on CPU0 which later causes vector space exhaustion when the NVMe driver tries to allocate managed interrupts on each CPU for the per CPU queues. Avoid this by finding the CPU which has the lowest vector allocation count to spread out the non managed interrupt accross the possible target CPUs. Fixes: 2f75d9e1c905 ("genirq: Implement bitmap matrix allocator") Reported-by: Keith Busch Signed-off-by: Thomas Gleixner Tested-by: Keith Busch Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801171557330.1777@nanos --- kernel/irq/matrix.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 0ba0dd8863a7..5187dfe809ac 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -321,15 +321,23 @@ void irq_matrix_remove_reserved(struct irq_matrix *m) int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, bool reserved, unsigned int *mapped_cpu) { - unsigned int cpu; + unsigned int cpu, best_cpu, maxavl = 0; + struct cpumap *cm; + unsigned int bit; + best_cpu = UINT_MAX; for_each_cpu(cpu, msk) { - struct cpumap *cm = per_cpu_ptr(m->maps, cpu); - unsigned int bit; + cm = per_cpu_ptr(m->maps, cpu); - if (!cm->online) + if (!cm->online || cm->available <= maxavl) continue; + best_cpu = cpu; + maxavl = cm->available; + } + + if (maxavl) { + cm = per_cpu_ptr(m->maps, best_cpu); bit = matrix_alloc_area(m, cm, 1, false); if (bit < m->alloc_end) { cm->allocated++; @@ -338,8 +346,8 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, m->global_available--; if (reserved) m->global_reserved--; - *mapped_cpu = cpu; - trace_irq_matrix_alloc(bit, cpu, m, cm); + *mapped_cpu = best_cpu; + trace_irq_matrix_alloc(bit, best_cpu, m, cm); return bit; } } -- cgit v1.2.3 From 0164e0d7e803af3ee1c63770978c728f8778ad01 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 18 Jan 2018 15:42:09 -0500 Subject: ring-buffer: Fix duplicate results in mapping context to bits in recursive lock In bringing back the context checks, the code checks first if its normal (non-interrupt) context, and then for NMI then IRQ then softirq. The final check is redundant. Since the if branch is only hit if the context is one of NMI, IRQ, or SOFTIRQ, if it's not NMI or IRQ there's no reason to check if it is SOFTIRQ. The current code returns the same result even if its not a SOFTIRQ. Which is confusing. pc & SOFTIRQ_OFFSET ? 2 : RB_CTX_SOFTIRQ Is redundant as RB_CTX_SOFTIRQ *is* 2! Fixes: a0e3a18f4baf ("ring-buffer: Bring back context level recursive checks") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0cddf60186da..5af2842dea96 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2579,8 +2579,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) bit = RB_CTX_NORMAL; else bit = pc & NMI_MASK ? RB_CTX_NMI : - pc & HARDIRQ_MASK ? RB_CTX_IRQ : - pc & SOFTIRQ_OFFSET ? 2 : RB_CTX_SOFTIRQ; + pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; if (unlikely(val & (1 << bit))) return 1; -- cgit v1.2.3 From 1ebe1eaf2f02784921759992ae1fde1a9bec8fd0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 18 Jan 2018 15:53:10 -0500 Subject: tracing: Fix converting enum's from the map in trace_event_eval_update() Since enums do not get converted by the TRACE_EVENT macro into their values, the event format displaces the enum name and not the value. This breaks tools like perf and trace-cmd that need to interpret the raw binary data. To solve this, an enum map was created to convert these enums into their actual numbers on boot up. This is done by TRACE_EVENTS() adding a TRACE_DEFINE_ENUM() macro. Some enums were not being converted. This was caused by an optization that had a bug in it. All calls get checked against this enum map to see if it should be converted or not, and it compares the call's system to the system that the enum map was created under. If they match, then they call is processed. To cut down on the number of iterations needed to find the maps with a matching system, since calls and maps are grouped by system, when a match is made, the index into the map array is saved, so that the next call, if it belongs to the same system as the previous call, could start right at that array index and not have to scan all the previous arrays. The problem was, the saved index was used as the variable to know if this is a call in a new system or not. If the index was zero, it was assumed that the call is in a new system and would keep incrementing the saved index until it found a matching system. The issue arises when the first matching system was at index zero. The next map, if it belonged to the same system, would then think it was the first match and increment the index to one. If the next call belong to the same system, it would begin its search of the maps off by one, and miss the first enum that should be converted. This left a single enum not converted properly. Also add a comment to describe exactly what that index was for. It took me a bit too long to figure out what I was thinking when debugging this issue. Link: http://lkml.kernel.org/r/717BE572-2070-4C1E-9902-9F2E0FEDA4F8@oracle.com Cc: stable@vger.kernel.org Fixes: 0c564a538aa93 ("tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values") Reported-by: Chuck Lever Teste-by: Chuck Lever Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ec0f9aa4e151..1b87157edbff 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2213,6 +2213,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; + bool first = false; int last_i; int i; @@ -2220,15 +2221,28 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) list_for_each_entry_safe(call, p, &ftrace_events, list) { /* events are usually grouped together with systems */ if (!last_system || call->class->system != last_system) { + first = true; last_i = 0; last_system = call->class->system; } + /* + * Since calls are grouped by systems, the likelyhood that the + * next call in the iteration belongs to the same system as the + * previous call is high. As an optimization, we skip seaching + * for a map[] that matches the call's system if the last call + * was from the same system. That's what last_i is for. If the + * call has the same system as the previous call, then last_i + * will be the index of the first map[] that has a matching + * system. + */ for (i = last_i; i < len; i++) { if (call->class->system == map[i]->system) { /* Save the first system if need be */ - if (!last_i) + if (first) { last_i = i; + first = false; + } update_event_printk(call, map[i]); } } -- cgit v1.2.3 From 6be7fa3c74d1e0cd50f2157b5c1524f152bf641e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 22 Jan 2018 22:32:51 -0500 Subject: ftrace, orc, x86: Handle ftrace dynamically allocated trampolines The function tracer can create a dynamically allocated trampoline that is called by the function mcount or fentry hook that is used to call the function callback that is registered. The problem is that the orc undwinder will bail if it encounters one of these trampolines. This breaks the stack trace of function callbacks, which include the stack tracer and setting the stack trace for individual functions. Since these dynamic trampolines are basically copies of the static ftrace trampolines defined in ftrace_*.S, we do not need to create new orc entries for the dynamic trampolines. Finding the return address on the stack will be identical as the functions that were copied to create the dynamic trampolines. When encountering a ftrace dynamic trampoline, we can just use the orc entry of the ftrace static function that was copied for that trampoline. Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/unwind_orc.c | 48 +++++++++++++++++++++++++++++++++++++++++++- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 29 +++++++++++++++----------- 3 files changed, 66 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index be86a865087a..1f9188f5357c 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -74,8 +74,50 @@ static struct orc_entry *orc_module_find(unsigned long ip) } #endif +#ifdef CONFIG_DYNAMIC_FTRACE +static struct orc_entry *orc_find(unsigned long ip); + +/* + * Ftrace dynamic trampolines do not have orc entries of their own. + * But they are copies of the ftrace entries that are static and + * defined in ftrace_*.S, which do have orc entries. + * + * If the undwinder comes across a ftrace trampoline, then find the + * ftrace function that was used to create it, and use that ftrace + * function's orc entrie, as the placement of the return code in + * the stack will be identical. + */ +static struct orc_entry *orc_ftrace_find(unsigned long ip) +{ + struct ftrace_ops *ops; + unsigned long caller; + + ops = ftrace_ops_trampoline(ip); + if (!ops) + return NULL; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + caller = (unsigned long)ftrace_regs_call; + else + caller = (unsigned long)ftrace_call; + + /* Prevent unlikely recursion */ + if (ip == caller) + return NULL; + + return orc_find(caller); +} +#else +static struct orc_entry *orc_ftrace_find(unsigned long ip) +{ + return NULL; +} +#endif + static struct orc_entry *orc_find(unsigned long ip) { + static struct orc_entry *orc; + if (!orc_init) return NULL; @@ -111,7 +153,11 @@ static struct orc_entry *orc_find(unsigned long ip) __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); /* Module lookup: */ - return orc_module_find(ip); + orc = orc_module_find(ip); + if (orc) + return orc; + + return orc_ftrace_find(ip); } static void orc_sort_swap(void *_a, void *_b, int size) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2bab81951ced..3319df9727aa 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -332,6 +332,8 @@ extern int ftrace_text_reserved(const void *start, const void *end); extern int ftrace_nr_registered_ops(void); +struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr); + bool is_ftrace_trampoline(unsigned long addr); /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ccdf3664e4a9..554b517c61a0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1119,15 +1119,11 @@ static struct ftrace_ops global_ops = { }; /* - * This is used by __kernel_text_address() to return true if the - * address is on a dynamically allocated trampoline that would - * not return true for either core_kernel_text() or - * is_module_text_address(). + * Used by the stack undwinder to know about dynamic ftrace trampolines. */ -bool is_ftrace_trampoline(unsigned long addr) +struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr) { - struct ftrace_ops *op; - bool ret = false; + struct ftrace_ops *op = NULL; /* * Some of the ops may be dynamically allocated, @@ -1144,15 +1140,24 @@ bool is_ftrace_trampoline(unsigned long addr) if (op->trampoline && op->trampoline_size) if (addr >= op->trampoline && addr < op->trampoline + op->trampoline_size) { - ret = true; - goto out; + preempt_enable_notrace(); + return op; } } while_for_each_ftrace_op(op); - - out: preempt_enable_notrace(); - return ret; + return NULL; +} + +/* + * This is used by __kernel_text_address() to return true if the + * address is on a dynamically allocated trampoline that would + * not return true for either core_kernel_text() or + * is_module_text_address(). + */ +bool is_ftrace_trampoline(unsigned long addr) +{ + return ftrace_ops_trampoline(addr) != NULL; } struct ftrace_page { -- cgit v1.2.3 From 2ee5b92a2598d9e403337185fdf88f661dee8616 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 23 Jan 2018 13:25:04 -0500 Subject: tracing: Update stack trace skipping for ORC unwinder With the addition of ORC unwinder and FRAME POINTER unwinder, the stack trace skipping requirements have changed. I went through the tracing stack trace dumps with ORC and with frame pointers and recalculated the proper values. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 34 ++++++++++++++----------- kernel/trace/trace_events_trigger.c | 13 ++++++++-- kernel/trace/trace_functions.c | 49 +++++++++++++++++++++++++++---------- 3 files changed, 67 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2a8d8a294345..8e3f20a18a06 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2374,6 +2374,15 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) } EXPORT_SYMBOL_GPL(trace_event_buffer_commit); +/* + * Skip 3: + * + * trace_buffer_unlock_commit_regs() + * trace_event_buffer_commit() + * trace_event_raw_event_xxx() +*/ +# define STACK_SKIP 3 + void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, struct ring_buffer_event *event, @@ -2383,16 +2392,12 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, __buffer_unlock_commit(buffer, event); /* - * If regs is not set, then skip the following callers: - * trace_buffer_unlock_commit_regs - * event_trigger_unlock_commit - * trace_event_buffer_commit - * trace_event_raw_event_sched_switch + * If regs is not set, then skip the necessary functions. * Note, we can still get here via blktrace, wakeup tracer * and mmiotrace, but that's ok if they lose a function or - * two. They are that meaningful. + * two. They are not that meaningful. */ - ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs); + ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } @@ -2579,11 +2584,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, trace.skip = skip; /* - * Add two, for this function and the call to save_stack_trace() + * Add one, for this function and the call to save_stack_trace() * If regs is set, then these functions will not be in the way. */ +#ifndef CONFIG_UNWINDER_ORC if (!regs) - trace.skip += 2; + trace.skip++; +#endif /* * Since events can happen in NMIs there's no safe way to @@ -2711,11 +2718,10 @@ void trace_dump_stack(int skip) local_save_flags(flags); - /* - * Skip 3 more, seems to get us at the caller of - * this function. - */ - skip += 3; +#ifndef CONFIG_UNWINDER_ORC + /* Skip 1 to skip this function. */ + skip++; +#endif __ftrace_trace_stack(global_trace.trace_buffer.buffer, flags, skip, preempt_count(), NULL); } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index f2ac9d44f6c4..87411482a46f 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1123,13 +1123,22 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } #endif /* CONFIG_TRACER_SNAPSHOT */ #ifdef CONFIG_STACKTRACE +#ifdef CONFIG_UNWINDER_ORC +/* Skip 2: + * event_triggers_post_call() + * trace_event_raw_event_xxx() + */ +# define STACK_SKIP 2 +#else /* - * Skip 3: + * Skip 4: * stacktrace_trigger() * event_triggers_post_call() + * trace_event_buffer_commit() * trace_event_raw_event_xxx() */ -#define STACK_SKIP 3 +#define STACK_SKIP 4 +#endif static void stacktrace_trigger(struct event_trigger_data *data, void *rec) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 27f7ad12c4b1..b611cd36e22d 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -154,6 +154,24 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, preempt_enable_notrace(); } +#ifdef CONFIG_UNWINDER_ORC +/* + * Skip 2: + * + * function_stack_trace_call() + * ftrace_call() + */ +#define STACK_SKIP 2 +#else +/* + * Skip 3: + * __trace_stack() + * function_stack_trace_call() + * ftrace_call() + */ +#define STACK_SKIP 3 +#endif + static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) @@ -180,15 +198,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, if (likely(disabled == 1)) { pc = preempt_count(); trace_function(tr, ip, parent_ip, flags, pc); - /* - * skip over 5 funcs: - * __ftrace_trace_stack, - * __trace_stack, - * function_stack_trace_call - * ftrace_list_func - * ftrace_call - */ - __trace_stack(tr, flags, 5, pc); + __trace_stack(tr, flags, STACK_SKIP, pc); } atomic_dec(&data->disabled); @@ -367,14 +377,27 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, tracer_tracing_off(tr); } +#ifdef CONFIG_UNWINDER_ORC /* - * Skip 4: + * Skip 3: + * + * function_trace_probe_call() + * ftrace_ops_assist_func() + * ftrace_call() + */ +#define FTRACE_STACK_SKIP 3 +#else +/* + * Skip 5: + * + * __trace_stack() * ftrace_stacktrace() * function_trace_probe_call() - * ftrace_ops_list_func() + * ftrace_ops_assist_func() * ftrace_call() */ -#define STACK_SKIP 4 +#define FTRACE_STACK_SKIP 5 +#endif static __always_inline void trace_stack(struct trace_array *tr) { @@ -384,7 +407,7 @@ static __always_inline void trace_stack(struct trace_array *tr) local_save_flags(flags); pc = preempt_count(); - __trace_stack(tr, flags, STACK_SKIP, pc); + __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc); } static void -- cgit v1.2.3 From d5421ea43d30701e03cadc56a38854c36a8b4433 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jan 2018 14:54:32 +0100 Subject: hrtimer: Reset hrtimer cpu base proper on CPU hotplug The hrtimer interrupt code contains a hang detection and mitigation mechanism, which prevents that a long delayed hrtimer interrupt causes a continous retriggering of interrupts which prevent the system from making progress. If a hang is detected then the timer hardware is programmed with a certain delay into the future and a flag is set in the hrtimer cpu base which prevents newly enqueued timers from reprogramming the timer hardware prior to the chosen delay. The subsequent hrtimer interrupt after the delay clears the flag and resumes normal operation. If such a hang happens in the last hrtimer interrupt before a CPU is unplugged then the hang_detected flag is set and stays that way when the CPU is plugged in again. At that point the timer hardware is not armed and it cannot be armed because the hang_detected flag is still active, so nothing clears that flag. As a consequence the CPU does not receive hrtimer interrupts and no timers expire on that CPU which results in RCU stalls and other malfunctions. Clear the flag along with some other less critical members of the hrtimer cpu base to ensure starting from a clean state when a CPU is plugged in. Thanks to Paul, Sebastian and Anna-Maria for their help to get down to the root cause of that hard to reproduce heisenbug. Once understood it's trivial and certainly justifies a brown paperbag. Fixes: 41d2e4949377 ("hrtimer: Tune hrtimer_interrupt hang logic") Reported-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Sebastian Sewior Cc: Anna-Maria Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801261447590.2067@nanos --- kernel/time/hrtimer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d32520840fde..aa9d2a2b1210 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -655,7 +655,9 @@ static void hrtimer_reprogram(struct hrtimer *timer, static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { base->expires_next = KTIME_MAX; + base->hang_detected = 0; base->hres_active = 0; + base->next_timer = NULL; } /* @@ -1589,6 +1591,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) timerqueue_init_head(&cpu_base->clock_base[i].active); } + cpu_base->active_bases = 0; cpu_base->cpu = cpu; hrtimer_init_hres(cpu_base); return 0; -- cgit v1.2.3