diff options
Diffstat (limited to 'kernel/trace/ring_buffer.c')
-rw-r--r-- | kernel/trace/ring_buffer.c | 226 |
1 files changed, 185 insertions, 41 deletions
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index dcf1c4dd3efe..c9cb9767d49b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -22,6 +22,7 @@ #include <linux/hash.h> #include <linux/list.h> #include <linux/cpu.h> +#include <linux/oom.h> #include <asm/local.h> @@ -41,6 +42,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s) RINGBUF_TYPE_PADDING); trace_seq_printf(s, "\ttime_extend : type == %d\n", RINGBUF_TYPE_TIME_EXTEND); + trace_seq_printf(s, "\ttime_stamp : type == %d\n", + RINGBUF_TYPE_TIME_STAMP); trace_seq_printf(s, "\tdata max type_len == %d\n", RINGBUF_TYPE_DATA_TYPE_LEN_MAX); @@ -140,12 +143,15 @@ int ring_buffer_print_entry_header(struct trace_seq *s) enum { RB_LEN_TIME_EXTEND = 8, - RB_LEN_TIME_STAMP = 16, + RB_LEN_TIME_STAMP = 8, }; #define skip_time_extend(event) \ ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) +#define extended_time(event) \ + (event->type_len >= RINGBUF_TYPE_TIME_EXTEND) + static inline int rb_null_event(struct ring_buffer_event *event) { return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; @@ -209,7 +215,7 @@ rb_event_ts_length(struct ring_buffer_event *event) { unsigned len = 0; - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + if (extended_time(event)) { /* time extends include the data event after it */ len = RB_LEN_TIME_EXTEND; event = skip_time_extend(event); @@ -231,7 +237,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { unsigned length; - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + if (extended_time(event)) event = skip_time_extend(event); length = rb_event_length(event); @@ -248,7 +254,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); static __always_inline void * rb_event_data(struct ring_buffer_event *event) { - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + if (extended_time(event)) event = skip_time_extend(event); BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ @@ -275,6 +281,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) +/** + * ring_buffer_event_time_stamp - return the event's extended timestamp + * @event: the event to get the timestamp of + * + * Returns the extended timestamp associated with a data event. + * An extended time_stamp is a 64-bit timestamp represented + * internally in a special way that makes the best use of space + * contained within a ring buffer event. This function decodes + * it and maps it to a straight u64 value. + */ +u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event) +{ + u64 ts; + + ts = event->array[0]; + ts <<= TS_SHIFT; + ts += event->time_delta; + + return ts; +} + /* Flag when events were overwritten */ #define RB_MISSED_EVENTS (1 << 31) /* Missed count stored at end */ @@ -451,6 +478,7 @@ struct ring_buffer_per_cpu { struct buffer_page *reader_page; unsigned long lost_events; unsigned long last_overrun; + unsigned long nest; local_t entries_bytes; local_t entries; local_t overrun; @@ -488,6 +516,7 @@ struct ring_buffer { u64 (*clock)(void); struct rb_irq_work irq_work; + bool time_stamp_abs; }; struct ring_buffer_iter { @@ -1134,30 +1163,60 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) { struct buffer_page *bpage, *tmp; + bool user_thread = current->mm != NULL; + gfp_t mflags; long i; + /* + * Check if the available memory is there first. + * Note, si_mem_available() only gives us a rough estimate of available + * memory. It may not be accurate. But we don't care, we just want + * to prevent doing any allocation when it is obvious that it is + * not going to succeed. + */ + i = si_mem_available(); + if (i < nr_pages) + return -ENOMEM; + + /* + * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is not + * destabilized. + */ + mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL; + + /* + * If a user thread allocates too much, and si_mem_available() + * reports there's enough memory, even though there is not. + * Make sure the OOM killer kills this thread. This can happen + * even with RETRY_MAYFAIL because another task may be doing + * an allocation after this task has taken all memory. + * This is the task the OOM killer needs to take out during this + * loop, even if it was triggered by an allocation somewhere else. + */ + if (user_thread) + set_current_oom_origin(); for (i = 0; i < nr_pages; i++) { struct page *page; - /* - * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails - * gracefully without invoking oom-killer and the system is not - * destabilized. - */ + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL | __GFP_RETRY_MAYFAIL, - cpu_to_node(cpu)); + mflags, cpu_to_node(cpu)); if (!bpage) goto free_pages; list_add(&bpage->list, pages); - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0); + page = alloc_pages_node(cpu_to_node(cpu), mflags, 0); if (!page) goto free_pages; bpage->page = page_address(page); rb_init_page(bpage->page); + + if (user_thread && fatal_signal_pending(current)) + goto free_pages; } + if (user_thread) + clear_current_oom_origin(); return 0; @@ -1166,6 +1225,8 @@ free_pages: list_del_init(&bpage->list); free_buffer_page(bpage); } + if (user_thread) + clear_current_oom_origin(); return -ENOMEM; } @@ -1382,6 +1443,16 @@ void ring_buffer_set_clock(struct ring_buffer *buffer, buffer->clock = clock; } +void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs) +{ + buffer->time_stamp_abs = abs; +} + +bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer) +{ + return buffer->time_stamp_abs; +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); static inline unsigned long rb_page_entries(struct buffer_page *bpage) @@ -2206,12 +2277,15 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, /* Slow path, do not inline */ static noinline struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) { - event->type_len = RINGBUF_TYPE_TIME_EXTEND; + if (abs) + event->type_len = RINGBUF_TYPE_TIME_STAMP; + else + event->type_len = RINGBUF_TYPE_TIME_EXTEND; - /* Not the first event on the page? */ - if (rb_event_index(event)) { + /* Not the first event on the page, or not delta? */ + if (abs || rb_event_index(event)) { event->time_delta = delta & TS_MASK; event->array[0] = delta >> TS_SHIFT; } else { @@ -2254,7 +2328,9 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, * add it to the start of the resevered space. */ if (unlikely(info->add_timestamp)) { - event = rb_add_time_stamp(event, delta); + bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer); + + event = rb_add_time_stamp(event, info->delta, abs); length -= RB_LEN_TIME_EXTEND; delta = 0; } @@ -2442,7 +2518,7 @@ static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer static inline void rb_event_discard(struct ring_buffer_event *event) { - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + if (extended_time(event)) event = skip_time_extend(event); /* array[0] holds the actual length for the discarded event */ @@ -2486,10 +2562,11 @@ rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->write_stamp = cpu_buffer->commit_page->page->time_stamp; else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; + delta = ring_buffer_event_time_stamp(event); cpu_buffer->write_stamp += delta; + } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) { + delta = ring_buffer_event_time_stamp(event); + cpu_buffer->write_stamp = delta; } else cpu_buffer->write_stamp += event->time_delta; } @@ -2581,10 +2658,10 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) bit = pc & NMI_MASK ? RB_CTX_NMI : pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; - if (unlikely(val & (1 << bit))) + if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) return 1; - val |= (1 << bit); + val |= (1 << (bit + cpu_buffer->nest)); cpu_buffer->current_context = val; return 0; @@ -2593,7 +2670,57 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) static __always_inline void trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { - cpu_buffer->current_context &= cpu_buffer->current_context - 1; + cpu_buffer->current_context &= + cpu_buffer->current_context - (1 << cpu_buffer->nest); +} + +/* The recursive locking above uses 4 bits */ +#define NESTED_BITS 4 + +/** + * ring_buffer_nest_start - Allow to trace while nested + * @buffer: The ring buffer to modify + * + * The ring buffer has a safty mechanism to prevent recursion. + * But there may be a case where a trace needs to be done while + * tracing something else. In this case, calling this function + * will allow this function to nest within a currently active + * ring_buffer_lock_reserve(). + * + * Call this function before calling another ring_buffer_lock_reserve() and + * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit(). + */ +void ring_buffer_nest_start(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* Enabled by ring_buffer_nest_end() */ + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + /* This is the shift value for the above recusive locking */ + cpu_buffer->nest += NESTED_BITS; +} + +/** + * ring_buffer_nest_end - Allow to trace while nested + * @buffer: The ring buffer to modify + * + * Must be called after ring_buffer_nest_start() and after the + * ring_buffer_unlock_commit(). + */ +void ring_buffer_nest_end(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* disabled by ring_buffer_nest_start() */ + cpu = raw_smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + /* This is the shift value for the above recusive locking */ + cpu_buffer->nest -= NESTED_BITS; + preempt_enable_notrace(); } /** @@ -2637,7 +2764,8 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + " echo global > /sys/kernel/debug/tracing/trace_clock\n" + "or add trace_clock=global to the kernel command line\n"); info->add_timestamp = 1; } @@ -2669,7 +2797,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * If this is the first commit on the page, then it has the same * timestamp as the page itself. */ - if (!tail) + if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer)) info->delta = 0; /* See if we shot pass the end of this buffer page */ @@ -2746,8 +2874,11 @@ rb_reserve_next_event(struct ring_buffer *buffer, /* make sure this diff is calculated here */ barrier(); - /* Did the write stamp get updated already? */ - if (likely(info.ts >= cpu_buffer->write_stamp)) { + if (ring_buffer_time_stamp_abs(buffer)) { + info.delta = info.ts; + rb_handle_timestamp(cpu_buffer, &info); + } else /* Did the write stamp get updated already? */ + if (likely(info.ts >= cpu_buffer->write_stamp)) { info.delta = diff; if (unlikely(test_time_stamp(info.delta))) rb_handle_timestamp(cpu_buffer, &info); @@ -3429,14 +3560,13 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, return; case RINGBUF_TYPE_TIME_EXTEND: - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; + delta = ring_buffer_event_time_stamp(event); cpu_buffer->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ + delta = ring_buffer_event_time_stamp(event); + cpu_buffer->read_stamp = delta; return; case RINGBUF_TYPE_DATA: @@ -3460,14 +3590,13 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, return; case RINGBUF_TYPE_TIME_EXTEND: - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; + delta = ring_buffer_event_time_stamp(event); iter->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ + delta = ring_buffer_event_time_stamp(event); + iter->read_stamp = delta; return; case RINGBUF_TYPE_DATA: @@ -3691,6 +3820,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, struct buffer_page *reader; int nr_loops = 0; + if (ts) + *ts = 0; again: /* * We repeat when a time extend is encountered. @@ -3727,12 +3858,17 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, goto again; case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ + if (ts) { + *ts = ring_buffer_event_time_stamp(event); + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, + cpu_buffer->cpu, ts); + } + /* Internal data, OK to advance */ rb_advance_reader(cpu_buffer); goto again; case RINGBUF_TYPE_DATA: - if (ts) { + if (ts && !(*ts)) { *ts = cpu_buffer->read_stamp + event->time_delta; ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); @@ -3757,6 +3893,9 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; int nr_loops = 0; + if (ts) + *ts = 0; + cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; @@ -3809,12 +3948,17 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) goto again; case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ + if (ts) { + *ts = ring_buffer_event_time_stamp(event); + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, + cpu_buffer->cpu, ts); + } + /* Internal data, OK to advance */ rb_advance_iter(iter); goto again; case RINGBUF_TYPE_DATA: - if (ts) { + if (ts && !(*ts)) { *ts = iter->read_stamp + event->time_delta; ring_buffer_normalize_time_stamp(buffer, cpu_buffer->cpu, ts); |