From 9ec23531fd48031d1b6ca5366f5f967d17a8bc28 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:07 +0200 Subject: sched/preempt, mm/fault: Trigger might_sleep() in might_fault() with disabled pagefaults Commit 662bbcb2747c ("mm, sched: Allow uaccess in atomic with pagefault_disable()") removed might_sleep() checks for all user access code (that uses might_fault()). The reason was to disable wrong "sleep in atomic" warnings in the following scenario: pagefault_disable() rc = copy_to_user(...) pagefault_enable() Which is valid, as pagefault_disable() increments the preempt counter and therefore disables the pagefault handler. copy_to_user() will not sleep and return an error code if a page is not available. However, as all might_sleep() checks are removed, CONFIG_DEBUG_ATOMIC_SLEEP would no longer detect the following scenario: spin_lock(&lock); rc = copy_to_user(...) spin_unlock(&lock) If the kernel is compiled with preemption turned on, preempt_disable() will make in_atomic() detect disabled preemption. The fault handler would correctly never sleep on user access. However, with preemption turned off, preempt_disable() is usually a NOP (with !CONFIG_PREEMPT_COUNT), therefore in_atomic() will not be able to detect disabled preemption nor disabled pagefaults. The fault handler could sleep. We really want to enable CONFIG_DEBUG_ATOMIC_SLEEP checks for user access functions again, otherwise we can end up with horrible deadlocks. Root of all evil is that pagefault_disable() acts almost as preempt_disable(), depending on preemption being turned on/off. As we now have pagefault_disabled(), we can use it to distinguish whether user acces functions might sleep. Convert might_fault() into a makro that calls __might_fault(), to allow proper file + line messages in case of a might_sleep() warning. Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-3-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/kernel.h') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3a5b48e52a9e..060dd7b61c6d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -244,7 +244,8 @@ static inline u32 reciprocal_scale(u32 val, u32 ep_ro) #if defined(CONFIG_MMU) && \ (defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)) -void might_fault(void); +#define might_fault() __might_fault(__FILE__, __LINE__) +void __might_fault(const char *file, int line); #else static inline void might_fault(void) { } #endif -- cgit v1.2.3 From 28b8d0c8f560300836dff352348e513cdf328e50 Mon Sep 17 00:00:00 2001 From: Gobinda Charan Maji Date: Wed, 27 May 2015 11:09:38 +0930 Subject: sysfs: tightened sysfs permission checks There were some inconsistency in restriction to VERIFY_OCTAL_PERMISSIONS(). Previously the test was "User perms >= group perms >= other perms". The permission field of User, Group or Other consists of three bits. LSB is EXECUTE permission, MSB is READ permission and the middle bit is WRITE permission. But logically WRITE is "more privileged" than READ. Say for example, permission value is "0430". Here User has only READ permission whereas Group has both WRITE and EXECUTE permission. So, the checks could be tightened and the tests are separated to USER_READABLE >= GROUP_READABLE >= OTHER_READABLE, USER_WRITABLE >= GROUP_WRITABLE and OTHER_WRITABLE is not permitted. Signed-off-by: Gobinda Charan Maji Signed-off-by: Rusty Russell --- include/linux/kernel.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux/kernel.h') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3a5b48e52a9e..cd54b357c934 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -818,13 +818,15 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } #endif /* Permissions on a sysfs file: you didn't miss the 0 prefix did you? */ -#define VERIFY_OCTAL_PERMISSIONS(perms) \ - (BUILD_BUG_ON_ZERO((perms) < 0) + \ - BUILD_BUG_ON_ZERO((perms) > 0777) + \ - /* User perms >= group perms >= other perms */ \ - BUILD_BUG_ON_ZERO(((perms) >> 6) < (((perms) >> 3) & 7)) + \ - BUILD_BUG_ON_ZERO((((perms) >> 3) & 7) < ((perms) & 7)) + \ - /* Other writable? Generally considered a bad idea. */ \ - BUILD_BUG_ON_ZERO((perms) & 2) + \ +#define VERIFY_OCTAL_PERMISSIONS(perms) \ + (BUILD_BUG_ON_ZERO((perms) < 0) + \ + BUILD_BUG_ON_ZERO((perms) > 0777) + \ + /* USER_READABLE >= GROUP_READABLE >= OTHER_READABLE */ \ + BUILD_BUG_ON_ZERO((((perms) >> 6) & 4) < (((perms) >> 3) & 4)) + \ + BUILD_BUG_ON_ZERO((((perms) >> 3) & 4) < ((perms) & 4)) + \ + /* USER_WRITABLE >= GROUP_WRITABLE */ \ + BUILD_BUG_ON_ZERO((((perms) >> 6) & 2) < (((perms) >> 3) & 2)) + \ + /* OTHER_WRITABLE? Generally considered a bad idea. */ \ + BUILD_BUG_ON_ZERO((perms) & 2) + \ (perms)) #endif -- cgit v1.2.3 From 3c6296f716ebef704b76070d90567ab4faa8462c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 28 May 2015 13:21:34 -0400 Subject: ring-buffer: Remove useless unused tracing_off_permanent() The tracing_off_permanent() call is a way to disable all ring_buffers. Nothing uses it and nothing should use it, as tracing_off() and friends are better, as they disable the ring buffers related to tracing. The tracing_off_permanent() even disabled non tracing ring buffers. This is a bit drastic, and was added to handle NMIs doing outputs that could corrupt the ring buffer when only tracing used them. It is now obsolete and adds a little overhead, it should be removed. Signed-off-by: Steven Rostedt --- include/linux/kernel.h | 6 ----- kernel/trace/ring_buffer.c | 61 ---------------------------------------------- 2 files changed, 67 deletions(-) (limited to 'include/linux/kernel.h') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3a5b48e52a9e..d948718a83d7 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -532,12 +532,6 @@ bool mac_pton(const char *s, u8 *mac); * * Most likely, you want to use tracing_on/tracing_off. */ -#ifdef CONFIG_RING_BUFFER -/* trace_off_permanent stops recording with no way to bring it back */ -void tracing_off_permanent(void); -#else -static inline void tracing_off_permanent(void) { } -#endif enum ftrace_dump_mode { DUMP_NONE, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e9420fdc7409..0fc5add6423b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -115,63 +115,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s) * */ -/* - * A fast way to enable or disable all ring buffers is to - * call tracing_on or tracing_off. Turning off the ring buffers - * prevents all ring buffers from being recorded to. - * Turning this switch on, makes it OK to write to the - * ring buffer, if the ring buffer is enabled itself. - * - * There's three layers that must be on in order to write - * to the ring buffer. - * - * 1) This global flag must be set. - * 2) The ring buffer must be enabled for recording. - * 3) The per cpu buffer must be enabled for recording. - * - * In case of an anomaly, this global flag has a bit set that - * will permantly disable all ring buffers. - */ - -/* - * Global flag to disable all recording to ring buffers - * This has two bits: ON, DISABLED - * - * ON DISABLED - * ---- ---------- - * 0 0 : ring buffers are off - * 1 0 : ring buffers are on - * X 1 : ring buffers are permanently disabled - */ - -enum { - RB_BUFFERS_ON_BIT = 0, - RB_BUFFERS_DISABLED_BIT = 1, -}; - -enum { - RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT, - RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT, -}; - -static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; - /* Used for individual buffers (after the counter) */ #define RB_BUFFER_OFF (1 << 20) #define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) -/** - * tracing_off_permanent - permanently disable ring buffers - * - * This function, once called, will disable all ring buffers - * permanently. - */ -void tracing_off_permanent(void) -{ - set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); -} - #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) @@ -2728,9 +2676,6 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) struct ring_buffer_event *event; int cpu; - if (ring_buffer_flags != RB_BUFFERS_ON) - return NULL; - /* If we are tracing schedule, we don't want to recurse */ preempt_disable_notrace(); @@ -2992,9 +2937,6 @@ int ring_buffer_write(struct ring_buffer *buffer, int ret = -EBUSY; int cpu; - if (ring_buffer_flags != RB_BUFFERS_ON) - return -EBUSY; - preempt_disable_notrace(); if (atomic_read(&buffer->record_disabled)) @@ -4350,9 +4292,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, ret = -EAGAIN; - if (ring_buffer_flags != RB_BUFFERS_ON) - goto out; - if (atomic_read(&buffer_a->record_disabled)) goto out; -- cgit v1.2.3 From 5375b708f2547f70cd2bee2fd8663ab7035f9551 Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Tue, 30 Jun 2015 14:57:46 -0700 Subject: kernel/panic/kexec: fix "crash_kexec_post_notifiers" option issue in oops path Commit f06e5153f4ae2e ("kernel/panic.c: add "crash_kexec_post_notifiers" option for kdump after panic_notifers") introduced "crash_kexec_post_notifiers" kernel boot option, which toggles wheather panic() calls crash_kexec() before panic_notifiers and dump kmsg or after. The problem is that the commit overlooks panic_on_oops kernel boot option. If it is enabled, crash_kexec() is called directly without going through panic() in oops path. To fix this issue, this patch adds a check to "crash_kexec_post_notifiers" in the condition of kexec_should_crash(). Also, put a comment in kexec_should_crash() to explain not obvious things on this patch. Signed-off-by: HATAYAMA Daisuke Acked-by: Baoquan He Tested-by: Hidehiro Kawai Reviewed-by: Masami Hiramatsu Cc: Vivek Goyal Cc: Ingo Molnar Cc: Hidehiro Kawai Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 3 +++ kernel/kexec.c | 11 +++++++++++ kernel/panic.c | 2 +- 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux/kernel.h') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5acf5b70866d..0dfa4e31563d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -439,6 +439,9 @@ extern int panic_on_unrecovered_nmi; extern int panic_on_io_nmi; extern int panic_on_warn; extern int sysctl_panic_on_stackoverflow; + +extern bool crash_kexec_post_notifiers; + /* * Only to be used by arch init code. If the user over-wrote the default * CONFIG_PANIC_TIMEOUT, honor it. diff --git a/kernel/kexec.c b/kernel/kexec.c index 7a36fdcca5bf..a785c1015e25 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -84,6 +84,17 @@ struct resource crashk_low_res = { int kexec_should_crash(struct task_struct *p) { + /* + * If crash_kexec_post_notifiers is enabled, don't run + * crash_kexec() here yet, which must be run after panic + * notifiers in panic(). + */ + if (crash_kexec_post_notifiers) + return 0; + /* + * There are 4 panic() calls in do_exit() path, each of which + * corresponds to each of these 4 conditions. + */ if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) return 1; return 0; diff --git a/kernel/panic.c b/kernel/panic.c index 774614f72cbd..04e91ff7560b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -32,7 +32,7 @@ static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); -static bool crash_kexec_post_notifiers; +bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; int panic_timeout = CONFIG_PANIC_TIMEOUT; -- cgit v1.2.3