summaryrefslogtreecommitdiff
path: root/kernel/trace/ring_buffer.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-12-18 00:22:17 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2020-12-18 00:22:17 +0300
commit09c0796adf0c793462fda1d7c8c43324551405c7 (patch)
tree90893d337b215482f059dd7e522279b23ffa0961 /kernel/trace/ring_buffer.c
parent312dcaf967219effe0483785f24e4072a5bed9a5 (diff)
parentf6a694665f132cbf6e2222dd2f173dc35330a8aa (diff)
downloadlinux-09c0796adf0c793462fda1d7c8c43324551405c7.tar.xz
Merge tag 'trace-v5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace
Pull tracing updates from Steven Rostedt: "The major update to this release is that there's a new arch config option called CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS. Currently, only x86_64 enables it. All the ftrace callbacks now take a struct ftrace_regs instead of a struct pt_regs. If the architecture has HAVE_DYNAMIC_FTRACE_WITH_ARGS enabled, then the ftrace_regs will have enough information to read the arguments of the function being traced, as well as access to the stack pointer. This way, if a user (like live kernel patching) only cares about the arguments, then it can avoid using the heavier weight "regs" callback, that puts in enough information in the struct ftrace_regs to simulate a breakpoint exception (needed for kprobes). A new config option that audits the timestamps of the ftrace ring buffer at most every event recorded. Ftrace recursion protection has been cleaned up to move the protection to the callback itself (this saves on an extra function call for those callbacks). Perf now handles its own RCU protection and does not depend on ftrace to do it for it (saving on that extra function call). New debug option to add "recursed_functions" file to tracefs that lists all the places that triggered the recursion protection of the function tracer. This will show where things need to be fixed as recursion slows down the function tracer. The eval enum mapping updates done at boot up are now offloaded to a work queue, as it caused a noticeable pause on slow embedded boards. Various clean ups and last minute fixes" * tag 'trace-v5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace: (33 commits) tracing: Offload eval map updates to a work queue Revert: "ring-buffer: Remove HAVE_64BIT_ALIGNED_ACCESS" ring-buffer: Add rb_check_bpage in __rb_allocate_pages ring-buffer: Fix two typos in comments tracing: Drop unneeded assignment in ring_buffer_resize() tracing: Disable ftrace selftests when any tracer is running seq_buf: Avoid type mismatch for seq_buf_init ring-buffer: Fix a typo in function description ring-buffer: Remove obsolete rb_event_is_commit() ring-buffer: Add test to validate the time stamp deltas ftrace/documentation: Fix RST C code blocks tracing: Clean up after filter logic rewriting tracing: Remove the useless value assignment in test_create_synth_event() livepatch: Use the default ftrace_ops instead of REGS when ARGS is available ftrace/x86: Allow for arguments to be passed in to ftrace_regs by default ftrace: Have the callbacks receive a struct ftrace_regs instead of pt_regs MAINTAINERS: assign ./fs/tracefs to TRACING tracing: Fix some typos in comments ftrace: Remove unused varible 'ret' ring-buffer: Add recording of ring buffer recursion into recursed_functions ...
Diffstat (limited to 'kernel/trace/ring_buffer.c')
-rw-r--r--kernel/trace/ring_buffer.c223
1 files changed, 188 insertions, 35 deletions
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a6268e09160a..ec08f948dd80 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,6 +4,7 @@
*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
+#include <linux/trace_recursion.h>
#include <linux/trace_events.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
@@ -129,7 +130,16 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
#define RB_ALIGNMENT 4U
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
-#define RB_ALIGN_DATA __aligned(RB_ALIGNMENT)
+
+#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
+# define RB_FORCE_8BYTE_ALIGNMENT 0
+# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
+#else
+# define RB_FORCE_8BYTE_ALIGNMENT 1
+# define RB_ARCH_ALIGNMENT 8U
+#endif
+
+#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -1422,7 +1432,8 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
return 0;
}
-static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
+static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ long nr_pages, struct list_head *pages)
{
struct buffer_page *bpage, *tmp;
bool user_thread = current->mm != NULL;
@@ -1462,13 +1473,15 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
struct page *page;
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
- mflags, cpu_to_node(cpu));
+ mflags, cpu_to_node(cpu_buffer->cpu));
if (!bpage)
goto free_pages;
+ rb_check_bpage(cpu_buffer, bpage);
+
list_add(&bpage->list, pages);
- page = alloc_pages_node(cpu_to_node(cpu), mflags, 0);
+ page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0);
if (!page)
goto free_pages;
bpage->page = page_address(page);
@@ -1500,7 +1513,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
WARN_ON(!nr_pages);
- if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu))
+ if (__rb_allocate_pages(cpu_buffer, nr_pages, &pages))
return -ENOMEM;
/*
@@ -1973,8 +1986,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
if (nr_pages < 2)
nr_pages = 2;
- size = nr_pages * BUF_PAGE_SIZE;
-
/* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
@@ -2009,8 +2020,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
* allocated without receiving ENOMEM
*/
INIT_LIST_HEAD(&cpu_buffer->new_pages);
- if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update,
- &cpu_buffer->new_pages, cpu)) {
+ if (__rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update,
+ &cpu_buffer->new_pages)) {
/* not enough memory for new pages */
err = -ENOMEM;
goto out_err;
@@ -2075,8 +2086,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
INIT_LIST_HEAD(&cpu_buffer->new_pages);
if (cpu_buffer->nr_pages_to_update > 0 &&
- __rb_allocate_pages(cpu_buffer->nr_pages_to_update,
- &cpu_buffer->new_pages, cpu_id)) {
+ __rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update,
+ &cpu_buffer->new_pages)) {
err = -ENOMEM;
goto out_err;
}
@@ -2628,9 +2639,6 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
return skip_time_extend(event);
}
-static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event);
-
#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
static inline bool sched_clock_stable(void)
{
@@ -2719,7 +2727,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
event->time_delta = delta;
length -= RB_EVNT_HDR_SIZE;
- if (length > RB_MAX_SMALL_DATA) {
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
event->type_len = 0;
event->array[0] = length;
} else
@@ -2734,11 +2742,11 @@ static unsigned rb_calculate_event_length(unsigned length)
if (!length)
length++;
- if (length > RB_MAX_SMALL_DATA)
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
length += sizeof(event.array[0]);
length += RB_EVNT_HDR_SIZE;
- length = ALIGN(length, RB_ALIGNMENT);
+ length = ALIGN(length, RB_ARCH_ALIGNMENT);
/*
* In case the time delta is larger than the 27 bits for it
@@ -2758,20 +2766,6 @@ static unsigned rb_calculate_event_length(unsigned length)
return length;
}
-static __always_inline bool
-rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- unsigned long addr = (unsigned long)event;
- unsigned long index;
-
- index = rb_event_index(event);
- addr &= PAGE_MASK;
-
- return cpu_buffer->commit_page->page == (void *)addr &&
- rb_commit_index(cpu_buffer) == index;
-}
-
static u64 rb_time_delta(struct ring_buffer_event *event)
{
switch (event->type_len) {
@@ -3006,6 +3000,13 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
irq_work_queue(&cpu_buffer->irq_work.work);
}
+#ifdef CONFIG_RING_BUFFER_RECORD_RECURSION
+# define do_ring_buffer_record_recursion() \
+ do_ftrace_record_recursion(_THIS_IP_, _RET_IP_)
+#else
+# define do_ring_buffer_record_recursion() do { } while (0)
+#endif
+
/*
* The lock and unlock are done within a preempt disable section.
* The current_context per_cpu variable can only be modified
@@ -3088,8 +3089,10 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
* been updated yet. In this case, use the TRANSITION bit.
*/
bit = RB_CTX_TRANSITION;
- if (val & (1 << (bit + cpu_buffer->nest)))
+ if (val & (1 << (bit + cpu_buffer->nest))) {
+ do_ring_buffer_record_recursion();
return 1;
+ }
}
val |= (1 << (bit + cpu_buffer->nest));
@@ -3183,6 +3186,153 @@ int ring_buffer_unlock_commit(struct trace_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
+/* Special value to validate all deltas on a page. */
+#define CHECK_FULL_PAGE 1L
+
+#ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS
+static void dump_buffer_page(struct buffer_data_page *bpage,
+ struct rb_event_info *info,
+ unsigned long tail)
+{
+ struct ring_buffer_event *event;
+ u64 ts, delta;
+ int e;
+
+ ts = bpage->time_stamp;
+ pr_warn(" [%lld] PAGE TIME STAMP\n", ts);
+
+ for (e = 0; e < tail; e += rb_event_length(event)) {
+
+ event = (struct ring_buffer_event *)(bpage->data + e);
+
+ switch (event->type_len) {
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = ring_buffer_event_time_stamp(event);
+ ts += delta;
+ pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta);
+ break;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ delta = ring_buffer_event_time_stamp(event);
+ ts = delta;
+ pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta);
+ break;
+
+ case RINGBUF_TYPE_PADDING:
+ ts += event->time_delta;
+ pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta);
+ break;
+
+ case RINGBUF_TYPE_DATA:
+ ts += event->time_delta;
+ pr_warn(" [%lld] delta:%d\n", ts, event->time_delta);
+ break;
+
+ default:
+ break;
+ }
+ }
+}
+
+static DEFINE_PER_CPU(atomic_t, checking);
+static atomic_t ts_dump;
+
+/*
+ * Check if the current event time stamp matches the deltas on
+ * the buffer page.
+ */
+static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info,
+ unsigned long tail)
+{
+ struct ring_buffer_event *event;
+ struct buffer_data_page *bpage;
+ u64 ts, delta;
+ bool full = false;
+ int e;
+
+ bpage = info->tail_page->page;
+
+ if (tail == CHECK_FULL_PAGE) {
+ full = true;
+ tail = local_read(&bpage->commit);
+ } else if (info->add_timestamp &
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)) {
+ /* Ignore events with absolute time stamps */
+ return;
+ }
+
+ /*
+ * Do not check the first event (skip possible extends too).
+ * Also do not check if previous events have not been committed.
+ */
+ if (tail <= 8 || tail > local_read(&bpage->commit))
+ return;
+
+ /*
+ * If this interrupted another event,
+ */
+ if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
+ goto out;
+
+ ts = bpage->time_stamp;
+
+ for (e = 0; e < tail; e += rb_event_length(event)) {
+
+ event = (struct ring_buffer_event *)(bpage->data + e);
+
+ switch (event->type_len) {
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = ring_buffer_event_time_stamp(event);
+ ts += delta;
+ break;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ delta = ring_buffer_event_time_stamp(event);
+ ts = delta;
+ break;
+
+ case RINGBUF_TYPE_PADDING:
+ if (event->time_delta == 1)
+ break;
+ /* fall through */
+ case RINGBUF_TYPE_DATA:
+ ts += event->time_delta;
+ break;
+
+ default:
+ RB_WARN_ON(cpu_buffer, 1);
+ }
+ }
+ if ((full && ts > info->ts) ||
+ (!full && ts + info->delta != info->ts)) {
+ /* If another report is happening, ignore this one */
+ if (atomic_inc_return(&ts_dump) != 1) {
+ atomic_dec(&ts_dump);
+ goto out;
+ }
+ atomic_inc(&cpu_buffer->record_disabled);
+ pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld after:%lld\n",
+ cpu_buffer->cpu,
+ ts + info->delta, info->ts, info->delta, info->after);
+ dump_buffer_page(bpage, info, tail);
+ atomic_dec(&ts_dump);
+ /* Do not re-enable checking */
+ return;
+ }
+out:
+ atomic_dec(this_cpu_ptr(&checking));
+}
+#else
+static inline void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info,
+ unsigned long tail)
+{
+}
+#endif /* CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS */
+
static struct ring_buffer_event *
__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
struct rb_event_info *info)
@@ -3240,6 +3390,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
if (a_ok && b_ok && info->before != info->after)
(void)rb_time_cmpxchg(&cpu_buffer->before_stamp,
info->before, info->after);
+ if (a_ok && b_ok)
+ check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
return rb_move_tail(cpu_buffer, tail, info);
}
@@ -3257,9 +3409,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
/* This did not interrupt any time update */
info->delta = info->ts - info->after;
else
- /* Just use full timestamp for inerrupting event */
+ /* Just use full timestamp for interrupting event */
info->delta = info->ts;
barrier();
+ check_buffer(cpu_buffer, info, tail);
if (unlikely(info->ts != save_before)) {
/* SLOW PATH - Interrupted between C and E */
@@ -3293,7 +3446,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
info->ts = ts;
} else {
/*
- * Interrupted beween C and E:
+ * Interrupted between C and E:
* Lost the previous events time stamp. Just set the
* delta to zero, and this will be the same time as
* the event this event interrupted. And the events that
@@ -3500,7 +3653,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
}
/**
- * ring_buffer_commit_discard - discard an event that has not been committed
+ * ring_buffer_discard_commit - discard an event that has not been committed
* @buffer: the ring buffer
* @event: non committed event to discard
*