summaryrefslogtreecommitdiff
path: root/kernel/trace
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/bpf_trace.c146
-rw-r--r--kernel/trace/ring_buffer.c949
-rw-r--r--kernel/trace/trace.c376
-rw-r--r--kernel/trace/trace.h14
-rw-r--r--kernel/trace/trace_functions_graph.c23
-rw-r--r--kernel/trace/trace_osnoise.c14
-rw-r--r--kernel/trace/trace_output.c17
-rw-r--r--kernel/trace/trace_sched_wakeup.c2
-rw-r--r--kernel/trace/trace_syscalls.c12
-rw-r--r--kernel/trace/trace_uprobe.c44
10 files changed, 1290 insertions, 307 deletions
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index cd098846e251..a582cd25ca87 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -24,7 +24,6 @@
#include <linux/key.h>
#include <linux/verification.h>
#include <linux/namei.h>
-#include <linux/fileattr.h>
#include <net/bpf_sk_storage.h>
@@ -798,29 +797,6 @@ const struct bpf_func_proto bpf_task_pt_regs_proto = {
.ret_btf_id = &bpf_task_pt_regs_ids[0],
};
-BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
-{
- struct bpf_array *array = container_of(map, struct bpf_array, map);
- struct cgroup *cgrp;
-
- if (unlikely(idx >= array->map.max_entries))
- return -E2BIG;
-
- cgrp = READ_ONCE(array->ptrs[idx]);
- if (unlikely(!cgrp))
- return -EAGAIN;
-
- return task_under_cgroup_hierarchy(current, cgrp);
-}
-
-static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
- .func = bpf_current_task_under_cgroup,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_ANYTHING,
-};
-
struct send_signal_irq_work {
struct irq_work irq_work;
struct task_struct *task;
@@ -1226,7 +1202,8 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_LONG,
+ .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED,
+ .arg3_size = sizeof(u64),
};
BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value)
@@ -1242,7 +1219,8 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = {
.func = get_func_ret,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_LONG,
+ .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED,
+ .arg2_size = sizeof(u64),
};
BPF_CALL_1(get_func_arg_cnt, void *, ctx)
@@ -1439,73 +1417,6 @@ static int __init bpf_key_sig_kfuncs_init(void)
late_initcall(bpf_key_sig_kfuncs_init);
#endif /* CONFIG_KEYS */
-/* filesystem kfuncs */
-__bpf_kfunc_start_defs();
-
-/**
- * bpf_get_file_xattr - get xattr of a file
- * @file: file to get xattr from
- * @name__str: name of the xattr
- * @value_p: output buffer of the xattr value
- *
- * Get xattr *name__str* of *file* and store the output in *value_ptr*.
- *
- * For security reasons, only *name__str* with prefix "user." is allowed.
- *
- * Return: 0 on success, a negative value on error.
- */
-__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
- struct bpf_dynptr *value_p)
-{
- struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p;
- struct dentry *dentry;
- u32 value_len;
- void *value;
- int ret;
-
- if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
- return -EPERM;
-
- value_len = __bpf_dynptr_size(value_ptr);
- value = __bpf_dynptr_data_rw(value_ptr, value_len);
- if (!value)
- return -EINVAL;
-
- dentry = file_dentry(file);
- ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ);
- if (ret)
- return ret;
- return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len);
-}
-
-__bpf_kfunc_end_defs();
-
-BTF_KFUNCS_START(fs_kfunc_set_ids)
-BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
-BTF_KFUNCS_END(fs_kfunc_set_ids)
-
-static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id)
-{
- if (!btf_id_set8_contains(&fs_kfunc_set_ids, kfunc_id))
- return 0;
-
- /* Only allow to attach from LSM hooks, to avoid recursion */
- return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0;
-}
-
-static const struct btf_kfunc_id_set bpf_fs_kfunc_set = {
- .owner = THIS_MODULE,
- .set = &fs_kfunc_set_ids,
- .filter = bpf_get_file_xattr_filter,
-};
-
-static int __init bpf_fs_kfuncs_init(void)
-{
- return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set);
-}
-
-late_initcall(bpf_fs_kfuncs_init);
-
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1548,8 +1459,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_numa_node_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
- case BPF_FUNC_current_task_under_cgroup:
- return &bpf_current_task_under_cgroup_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_probe_write_user:
@@ -1578,6 +1487,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_cgrp_storage_get_proto;
case BPF_FUNC_cgrp_storage_delete:
return &bpf_cgrp_storage_delete_proto;
+ case BPF_FUNC_current_task_under_cgroup:
+ return &bpf_current_task_under_cgroup_proto;
#endif
case BPF_FUNC_send_signal:
return &bpf_send_signal_proto;
@@ -1598,7 +1509,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_jiffies64:
return &bpf_jiffies64_proto;
case BPF_FUNC_get_task_stack:
- return &bpf_get_task_stack_proto;
+ return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
+ : &bpf_get_task_stack_proto;
case BPF_FUNC_copy_from_user:
return &bpf_copy_from_user_proto;
case BPF_FUNC_copy_from_user_task:
@@ -1654,7 +1566,7 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto;
case BPF_FUNC_get_stack:
- return &bpf_get_stack_proto;
+ return prog->sleepable ? &bpf_get_stack_sleepable_proto : &bpf_get_stack_proto;
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
case BPF_FUNC_override_return:
return &bpf_override_return_proto;
@@ -3160,6 +3072,7 @@ struct bpf_uprobe {
loff_t offset;
unsigned long ref_ctr_offset;
u64 cookie;
+ struct uprobe *uprobe;
struct uprobe_consumer consumer;
};
@@ -3178,15 +3091,15 @@ struct bpf_uprobe_multi_run_ctx {
struct bpf_uprobe *uprobe;
};
-static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes,
- u32 cnt)
+static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt)
{
u32 i;
- for (i = 0; i < cnt; i++) {
- uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset,
- &uprobes[i].consumer);
- }
+ for (i = 0; i < cnt; i++)
+ uprobe_unregister_nosync(uprobes[i].uprobe, &uprobes[i].consumer);
+
+ if (cnt)
+ uprobe_unregister_sync();
}
static void bpf_uprobe_multi_link_release(struct bpf_link *link)
@@ -3194,7 +3107,7 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link)
struct bpf_uprobe_multi_link *umulti_link;
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
- bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt);
+ bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt);
if (umulti_link->task)
put_task_struct(umulti_link->task);
path_put(&umulti_link->path);
@@ -3298,7 +3211,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
struct bpf_run_ctx *old_run_ctx;
int err = 0;
- if (link->task && current->mm != link->task->mm)
+ if (link->task && !same_thread_group(current, link->task))
return 0;
if (sleepable)
@@ -3322,8 +3235,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
}
static bool
-uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx,
- struct mm_struct *mm)
+uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm)
{
struct bpf_uprobe *uprobe;
@@ -3480,22 +3392,26 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
&bpf_uprobe_multi_link_lops, prog);
for (i = 0; i < cnt; i++) {
- err = uprobe_register_refctr(d_real_inode(link->path.dentry),
- uprobes[i].offset,
- uprobes[i].ref_ctr_offset,
- &uprobes[i].consumer);
- if (err) {
- bpf_uprobe_unregister(&path, uprobes, i);
- goto error_free;
+ uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry),
+ uprobes[i].offset,
+ uprobes[i].ref_ctr_offset,
+ &uprobes[i].consumer);
+ if (IS_ERR(uprobes[i].uprobe)) {
+ err = PTR_ERR(uprobes[i].uprobe);
+ link->cnt = i;
+ goto error_unregister;
}
}
err = bpf_link_prime(&link->link, &link_primer);
if (err)
- goto error_free;
+ goto error_unregister;
return bpf_link_settle(&link_primer);
+error_unregister:
+ bpf_uprobe_unregister(uprobes, link->cnt);
+
error_free:
kvfree(uprobes);
kfree(link);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index cebd879a30cb..77dc0b25140e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -32,6 +32,8 @@
#include <asm/local64.h>
#include <asm/local.h>
+#include "trace.h"
+
/*
* The "absolute" timestamp in the buffer is only 59 bits.
* If a clock has the 5 MSBs set, it needs to be saved and
@@ -42,6 +44,21 @@
static void update_pages_handler(struct work_struct *work);
+#define RING_BUFFER_META_MAGIC 0xBADFEED
+
+struct ring_buffer_meta {
+ int magic;
+ int struct_size;
+ unsigned long text_addr;
+ unsigned long data_addr;
+ unsigned long first_buffer;
+ unsigned long head_buffer;
+ unsigned long commit_buffer;
+ __u32 subbuf_size;
+ __u32 nr_subbufs;
+ int buffers[];
+};
+
/*
* The ring buffer header is special. We must manually up keep it.
*/
@@ -342,7 +359,8 @@ struct buffer_page {
local_t entries; /* entries on this page */
unsigned long real_end; /* real end of data */
unsigned order; /* order of the page */
- u32 id; /* ID for external mapping */
+ u32 id:30; /* ID for external mapping */
+ u32 range:1; /* Mapped via a range */
struct buffer_data_page *page; /* Actual data page */
};
@@ -373,7 +391,9 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
static void free_buffer_page(struct buffer_page *bpage)
{
- free_pages((unsigned long)bpage->page, bpage->order);
+ /* Range pages are not to be freed */
+ if (!bpage->range)
+ free_pages((unsigned long)bpage->page, bpage->order);
kfree(bpage);
}
@@ -491,9 +511,11 @@ struct ring_buffer_per_cpu {
unsigned long pages_removed;
unsigned int mapped;
+ unsigned int user_mapped; /* user space mapping */
struct mutex mapping_lock;
unsigned long *subbuf_ids; /* ID to subbuf VA */
struct trace_buffer_meta *meta_page;
+ struct ring_buffer_meta *ring_meta;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
@@ -523,6 +545,12 @@ struct trace_buffer {
struct rb_irq_work irq_work;
bool time_stamp_abs;
+ unsigned long range_addr_start;
+ unsigned long range_addr_end;
+
+ long last_text_delta;
+ long last_data_delta;
+
unsigned int subbuf_size;
unsigned int subbuf_order;
unsigned int max_data_size;
@@ -1239,6 +1267,11 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
* Set the previous list pointer to have the HEAD flag.
*/
rb_set_list_to_head(head->list.prev);
+
+ if (cpu_buffer->ring_meta) {
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ meta->head_buffer = (unsigned long)head->page;
+ }
}
static void rb_list_head_clear(struct list_head *list)
@@ -1478,9 +1511,484 @@ static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
}
}
+/*
+ * Take an address, add the meta data size as well as the array of
+ * array subbuffer indexes, then align it to a subbuffer size.
+ *
+ * This is used to help find the next per cpu subbuffer within a mapped range.
+ */
+static unsigned long
+rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs)
+{
+ addr += sizeof(struct ring_buffer_meta) +
+ sizeof(int) * nr_subbufs;
+ return ALIGN(addr, subbuf_size);
+}
+
+/*
+ * Return the ring_buffer_meta for a given @cpu.
+ */
+static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu)
+{
+ int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+ unsigned long ptr = buffer->range_addr_start;
+ struct ring_buffer_meta *meta;
+ int nr_subbufs;
+
+ if (!ptr)
+ return NULL;
+
+ /* When nr_pages passed in is zero, the first meta has already been initialized */
+ if (!nr_pages) {
+ meta = (struct ring_buffer_meta *)ptr;
+ nr_subbufs = meta->nr_subbufs;
+ } else {
+ meta = NULL;
+ /* Include the reader page */
+ nr_subbufs = nr_pages + 1;
+ }
+
+ /*
+ * The first chunk may not be subbuffer aligned, where as
+ * the rest of the chunks are.
+ */
+ if (cpu) {
+ ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
+ ptr += subbuf_size * nr_subbufs;
+
+ /* We can use multiplication to find chunks greater than 1 */
+ if (cpu > 1) {
+ unsigned long size;
+ unsigned long p;
+
+ /* Save the beginning of this CPU chunk */
+ p = ptr;
+ ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
+ ptr += subbuf_size * nr_subbufs;
+
+ /* Now all chunks after this are the same size */
+ size = ptr - p;
+ ptr += size * (cpu - 2);
+ }
+ }
+ return (void *)ptr;
+}
+
+/* Return the start of subbufs given the meta pointer */
+static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta)
+{
+ int subbuf_size = meta->subbuf_size;
+ unsigned long ptr;
+
+ ptr = (unsigned long)meta;
+ ptr = rb_range_align_subbuf(ptr, subbuf_size, meta->nr_subbufs);
+
+ return (void *)ptr;
+}
+
+/*
+ * Return a specific sub-buffer for a given @cpu defined by @idx.
+ */
+static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
+{
+ struct ring_buffer_meta *meta;
+ unsigned long ptr;
+ int subbuf_size;
+
+ meta = rb_range_meta(cpu_buffer->buffer, 0, cpu_buffer->cpu);
+ if (!meta)
+ return NULL;
+
+ if (WARN_ON_ONCE(idx >= meta->nr_subbufs))
+ return NULL;
+
+ subbuf_size = meta->subbuf_size;
+
+ /* Map this buffer to the order that's in meta->buffers[] */
+ idx = meta->buffers[idx];
+
+ ptr = (unsigned long)rb_subbufs_from_meta(meta);
+
+ ptr += subbuf_size * idx;
+ if (ptr + subbuf_size > cpu_buffer->buffer->range_addr_end)
+ return NULL;
+
+ return (void *)ptr;
+}
+
+/*
+ * See if the existing memory contains valid ring buffer data.
+ * As the previous kernel must be the same as this kernel, all
+ * the calculations (size of buffers and number of buffers)
+ * must be the same.
+ */
+static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
+ struct trace_buffer *buffer, int nr_pages)
+{
+ int subbuf_size = PAGE_SIZE;
+ struct buffer_data_page *subbuf;
+ unsigned long buffers_start;
+ unsigned long buffers_end;
+ int i;
+
+ /* Check the meta magic and meta struct size */
+ if (meta->magic != RING_BUFFER_META_MAGIC ||
+ meta->struct_size != sizeof(*meta)) {
+ pr_info("Ring buffer boot meta[%d] mismatch of magic or struct size\n", cpu);
+ return false;
+ }
+
+ /* The subbuffer's size and number of subbuffers must match */
+ if (meta->subbuf_size != subbuf_size ||
+ meta->nr_subbufs != nr_pages + 1) {
+ pr_info("Ring buffer boot meta [%d] mismatch of subbuf_size/nr_pages\n", cpu);
+ return false;
+ }
+
+ buffers_start = meta->first_buffer;
+ buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs);
+
+ /* Is the head and commit buffers within the range of buffers? */
+ if (meta->head_buffer < buffers_start ||
+ meta->head_buffer >= buffers_end) {
+ pr_info("Ring buffer boot meta [%d] head buffer out of range\n", cpu);
+ return false;
+ }
+
+ if (meta->commit_buffer < buffers_start ||
+ meta->commit_buffer >= buffers_end) {
+ pr_info("Ring buffer boot meta [%d] commit buffer out of range\n", cpu);
+ return false;
+ }
+
+ subbuf = rb_subbufs_from_meta(meta);
+
+ /* Is the meta buffers and the subbufs themselves have correct data? */
+ for (i = 0; i < meta->nr_subbufs; i++) {
+ if (meta->buffers[i] < 0 ||
+ meta->buffers[i] >= meta->nr_subbufs) {
+ pr_info("Ring buffer boot meta [%d] array out of range\n", cpu);
+ return false;
+ }
+
+ if ((unsigned)local_read(&subbuf->commit) > subbuf_size) {
+ pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu);
+ return false;
+ }
+
+ subbuf = (void *)subbuf + subbuf_size;
+ }
+
+ return true;
+}
+
+static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf);
+
+static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu,
+ unsigned long long *timestamp, u64 *delta_ptr)
+{
+ struct ring_buffer_event *event;
+ u64 ts, delta;
+ int events = 0;
+ int e;
+
+ *delta_ptr = 0;
+ *timestamp = 0;
+
+ ts = dpage->time_stamp;
+
+ for (e = 0; e < tail; e += rb_event_length(event)) {
+
+ event = (struct ring_buffer_event *)(dpage->data + e);
+
+ switch (event->type_len) {
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = rb_event_time_stamp(event);
+ ts += delta;
+ break;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ delta = rb_event_time_stamp(event);
+ delta = rb_fix_abs_ts(delta, ts);
+ if (delta < ts) {
+ *delta_ptr = delta;
+ *timestamp = ts;
+ return -1;
+ }
+ ts = delta;
+ break;
+
+ case RINGBUF_TYPE_PADDING:
+ if (event->time_delta == 1)
+ break;
+ fallthrough;
+ case RINGBUF_TYPE_DATA:
+ events++;
+ ts += event->time_delta;
+ break;
+
+ default:
+ return -1;
+ }
+ }
+ *timestamp = ts;
+ return events;
+}
+
+static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
+{
+ unsigned long long ts;
+ u64 delta;
+ int tail;
+
+ tail = local_read(&dpage->commit);
+ return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
+}
+
+/* If the meta data has been validated, now validate the events */
+static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct buffer_page *head_page;
+ unsigned long entry_bytes = 0;
+ unsigned long entries = 0;
+ int ret;
+ int i;
+
+ if (!meta || !meta->head_buffer)
+ return;
+
+ /* Do the reader page first */
+ ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu);
+ if (ret < 0) {
+ pr_info("Ring buffer reader page is invalid\n");
+ goto invalid;
+ }
+ entries += ret;
+ entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
+ local_set(&cpu_buffer->reader_page->entries, ret);
+
+ head_page = cpu_buffer->head_page;
+
+ /* If both the head and commit are on the reader_page then we are done. */
+ if (head_page == cpu_buffer->reader_page &&
+ head_page == cpu_buffer->commit_page)
+ goto done;
+
+ /* Iterate until finding the commit page */
+ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
+
+ /* Reader page has already been done */
+ if (head_page == cpu_buffer->reader_page)
+ continue;
+
+ ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+ if (ret < 0) {
+ pr_info("Ring buffer meta [%d] invalid buffer page\n",
+ cpu_buffer->cpu);
+ goto invalid;
+ }
+ entries += ret;
+ entry_bytes += local_read(&head_page->page->commit);
+ local_set(&cpu_buffer->head_page->entries, ret);
+
+ if (head_page == cpu_buffer->commit_page)
+ break;
+ }
+
+ if (head_page != cpu_buffer->commit_page) {
+ pr_info("Ring buffer meta [%d] commit page not found\n",
+ cpu_buffer->cpu);
+ goto invalid;
+ }
+ done:
+ local_set(&cpu_buffer->entries, entries);
+ local_set(&cpu_buffer->entries_bytes, entry_bytes);
+
+ pr_info("Ring buffer meta [%d] is from previous boot!\n", cpu_buffer->cpu);
+ return;
+
+ invalid:
+ /* The content of the buffers are invalid, reset the meta data */
+ meta->head_buffer = 0;
+ meta->commit_buffer = 0;
+
+ /* Reset the reader page */
+ local_set(&cpu_buffer->reader_page->entries, 0);
+ local_set(&cpu_buffer->reader_page->page->commit, 0);
+
+ /* Reset all the subbuffers */
+ for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) {
+ local_set(&head_page->entries, 0);
+ local_set(&head_page->page->commit, 0);
+ }
+}
+
+/* Used to calculate data delta */
+static char rb_data_ptr[] = "";
+
+#define THIS_TEXT_PTR ((unsigned long)rb_meta_init_text_addr)
+#define THIS_DATA_PTR ((unsigned long)rb_data_ptr)
+
+static void rb_meta_init_text_addr(struct ring_buffer_meta *meta)
+{
+ meta->text_addr = THIS_TEXT_PTR;
+ meta->data_addr = THIS_DATA_PTR;
+}
+
+static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
+{
+ struct ring_buffer_meta *meta;
+ unsigned long delta;
+ void *subbuf;
+ int cpu;
+ int i;
+
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ void *next_meta;
+
+ meta = rb_range_meta(buffer, nr_pages, cpu);
+
+ if (rb_meta_valid(meta, cpu, buffer, nr_pages)) {
+ /* Make the mappings match the current address */
+ subbuf = rb_subbufs_from_meta(meta);
+ delta = (unsigned long)subbuf - meta->first_buffer;
+ meta->first_buffer += delta;
+ meta->head_buffer += delta;
+ meta->commit_buffer += delta;
+ buffer->last_text_delta = THIS_TEXT_PTR - meta->text_addr;
+ buffer->last_data_delta = THIS_DATA_PTR - meta->data_addr;
+ continue;
+ }
+
+ if (cpu < nr_cpu_ids - 1)
+ next_meta = rb_range_meta(buffer, nr_pages, cpu + 1);
+ else
+ next_meta = (void *)buffer->range_addr_end;
+
+ memset(meta, 0, next_meta - (void *)meta);
+
+ meta->magic = RING_BUFFER_META_MAGIC;
+ meta->struct_size = sizeof(*meta);
+
+ meta->nr_subbufs = nr_pages + 1;
+ meta->subbuf_size = PAGE_SIZE;
+
+ subbuf = rb_subbufs_from_meta(meta);
+
+ meta->first_buffer = (unsigned long)subbuf;
+ rb_meta_init_text_addr(meta);
+
+ /*
+ * The buffers[] array holds the order of the sub-buffers
+ * that are after the meta data. The sub-buffers may
+ * be swapped out when read and inserted into a different
+ * location of the ring buffer. Although their addresses
+ * remain the same, the buffers[] array contains the
+ * index into the sub-buffers holding their actual order.
+ */
+ for (i = 0; i < meta->nr_subbufs; i++) {
+ meta->buffers[i] = i;
+ rb_init_page(subbuf);
+ subbuf += meta->subbuf_size;
+ }
+ }
+}
+
+static void *rbm_start(struct seq_file *m, loff_t *pos)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = m->private;
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ unsigned long val;
+
+ if (!meta)
+ return NULL;
+
+ if (*pos > meta->nr_subbufs)
+ return NULL;
+
+ val = *pos;
+ val++;
+
+ return (void *)val;
+}
+
+static void *rbm_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+
+ return rbm_start(m, pos);
+}
+
+static int rbm_show(struct seq_file *m, void *v)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = m->private;
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ unsigned long val = (unsigned long)v;
+
+ if (val == 1) {
+ seq_printf(m, "head_buffer: %d\n",
+ rb_meta_subbuf_idx(meta, (void *)meta->head_buffer));
+ seq_printf(m, "commit_buffer: %d\n",
+ rb_meta_subbuf_idx(meta, (void *)meta->commit_buffer));
+ seq_printf(m, "subbuf_size: %d\n", meta->subbuf_size);
+ seq_printf(m, "nr_subbufs: %d\n", meta->nr_subbufs);
+ return 0;
+ }
+
+ val -= 2;
+ seq_printf(m, "buffer[%ld]: %d\n", val, meta->buffers[val]);
+
+ return 0;
+}
+
+static void rbm_stop(struct seq_file *m, void *p)
+{
+}
+
+static const struct seq_operations rb_meta_seq_ops = {
+ .start = rbm_start,
+ .next = rbm_next,
+ .show = rbm_show,
+ .stop = rbm_stop,
+};
+
+int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu)
+{
+ struct seq_file *m;
+ int ret;
+
+ ret = seq_open(file, &rb_meta_seq_ops);
+ if (ret)
+ return ret;
+
+ m = file->private_data;
+ m->private = buffer->buffers[cpu];
+
+ return 0;
+}
+
+/* Map the buffer_pages to the previous head and commit pages */
+static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *bpage)
+{
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+
+ if (meta->head_buffer == (unsigned long)bpage->page)
+ cpu_buffer->head_page = bpage;
+
+ if (meta->commit_buffer == (unsigned long)bpage->page) {
+ cpu_buffer->commit_page = bpage;
+ cpu_buffer->tail_page = bpage;
+ }
+}
+
static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
long nr_pages, struct list_head *pages)
{
+ struct trace_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_meta *meta = NULL;
struct buffer_page *bpage, *tmp;
bool user_thread = current->mm != NULL;
gfp_t mflags;
@@ -1515,6 +2023,10 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
*/
if (user_thread)
set_current_oom_origin();
+
+ if (buffer->range_addr_start)
+ meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu);
+
for (i = 0; i < nr_pages; i++) {
struct page *page;
@@ -1525,16 +2037,32 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_check_bpage(cpu_buffer, bpage);
- list_add(&bpage->list, pages);
-
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
- mflags | __GFP_COMP | __GFP_ZERO,
- cpu_buffer->buffer->subbuf_order);
- if (!page)
- goto free_pages;
- bpage->page = page_address(page);
+ /*
+ * Append the pages as for mapped buffers we want to keep
+ * the order
+ */
+ list_add_tail(&bpage->list, pages);
+
+ if (meta) {
+ /* A range was given. Use that for the buffer page */
+ bpage->page = rb_range_buffer(cpu_buffer, i + 1);
+ if (!bpage->page)
+ goto free_pages;
+ /* If this is valid from a previous boot */
+ if (meta->head_buffer)
+ rb_meta_buffer_update(cpu_buffer, bpage);
+ bpage->range = 1;
+ bpage->id = i + 1;
+ } else {
+ page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
+ mflags | __GFP_COMP | __GFP_ZERO,
+ cpu_buffer->buffer->subbuf_order);
+ if (!page)
+ goto free_pages;
+ bpage->page = page_address(page);
+ rb_init_page(bpage->page);
+ }
bpage->order = cpu_buffer->buffer->subbuf_order;
- rb_init_page(bpage->page);
if (user_thread && fatal_signal_pending(current))
goto free_pages;
@@ -1584,6 +2112,7 @@ static struct ring_buffer_per_cpu *
rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_meta *meta;
struct buffer_page *bpage;
struct page *page;
int ret;
@@ -1614,12 +2143,28 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
cpu_buffer->reader_page = bpage;
- page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
- cpu_buffer->buffer->subbuf_order);
- if (!page)
- goto fail_free_reader;
- bpage->page = page_address(page);
- rb_init_page(bpage->page);
+ if (buffer->range_addr_start) {
+ /*
+ * Range mapped buffers have the same restrictions as memory
+ * mapped ones do.
+ */
+ cpu_buffer->mapped = 1;
+ cpu_buffer->ring_meta = rb_range_meta(buffer, nr_pages, cpu);
+ bpage->page = rb_range_buffer(cpu_buffer, 0);
+ if (!bpage->page)
+ goto fail_free_reader;
+ if (cpu_buffer->ring_meta->head_buffer)
+ rb_meta_buffer_update(cpu_buffer, bpage);
+ bpage->range = 1;
+ } else {
+ page = alloc_pages_node(cpu_to_node(cpu),
+ GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
+ cpu_buffer->buffer->subbuf_order);
+ if (!page)
+ goto fail_free_reader;
+ bpage->page = page_address(page);
+ rb_init_page(bpage->page);
+ }
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
INIT_LIST_HEAD(&cpu_buffer->new_pages);
@@ -1628,11 +2173,35 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
if (ret < 0)
goto fail_free_reader;
- cpu_buffer->head_page
- = list_entry(cpu_buffer->pages, struct buffer_page, list);
- cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+ rb_meta_validate_events(cpu_buffer);
+
+ /* If the boot meta was valid then this has already been updated */
+ meta = cpu_buffer->ring_meta;
+ if (!meta || !meta->head_buffer ||
+ !cpu_buffer->head_page || !cpu_buffer->commit_page || !cpu_buffer->tail_page) {
+ if (meta && meta->head_buffer &&
+ (cpu_buffer->head_page || cpu_buffer->commit_page || cpu_buffer->tail_page)) {
+ pr_warn("Ring buffer meta buffers not all mapped\n");
+ if (!cpu_buffer->head_page)
+ pr_warn(" Missing head_page\n");
+ if (!cpu_buffer->commit_page)
+ pr_warn(" Missing commit_page\n");
+ if (!cpu_buffer->tail_page)
+ pr_warn(" Missing tail_page\n");
+ }
- rb_head_page_activate(cpu_buffer);
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+
+ rb_head_page_activate(cpu_buffer);
+
+ if (cpu_buffer->ring_meta)
+ meta->commit_buffer = meta->head_buffer;
+ } else {
+ /* The valid meta buffer still needs to activate the head page */
+ rb_head_page_activate(cpu_buffer);
+ }
return cpu_buffer;
@@ -1669,22 +2238,14 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
kfree(cpu_buffer);
}
-/**
- * __ring_buffer_alloc - allocate a new ring_buffer
- * @size: the size in bytes per cpu that is needed.
- * @flags: attributes to set for the ring buffer.
- * @key: ring buffer reader_lock_key.
- *
- * Currently the only flag that is available is the RB_FL_OVERWRITE
- * flag. This flag means that the buffer will overwrite old data
- * when the buffer wraps. If this flag is not set, the buffer will
- * drop data when the tail hits the head.
- */
-struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
- struct lock_class_key *key)
+static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
+ int order, unsigned long start,
+ unsigned long end,
+ struct lock_class_key *key)
{
struct trace_buffer *buffer;
long nr_pages;
+ int subbuf_size;
int bsize;
int cpu;
int ret;
@@ -1698,14 +2259,13 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
goto fail_free_buffer;
- /* Default buffer page size - one system page */
- buffer->subbuf_order = 0;
- buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
+ buffer->subbuf_order = order;
+ subbuf_size = (PAGE_SIZE << order);
+ buffer->subbuf_size = subbuf_size - BUF_PAGE_HDR_SIZE;
/* Max payload is buffer page size - header (8bytes) */
buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
- nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
buffer->flags = flags;
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;
@@ -1713,10 +2273,6 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
init_waitqueue_head(&buffer->irq_work.waiters);
- /* need at least two pages */
- if (nr_pages < 2)
- nr_pages = 2;
-
buffer->cpus = nr_cpu_ids;
bsize = sizeof(void *) * nr_cpu_ids;
@@ -1725,6 +2281,56 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!buffer->buffers)
goto fail_free_cpumask;
+ /* If start/end are specified, then that overrides size */
+ if (start && end) {
+ unsigned long ptr;
+ int n;
+
+ size = end - start;
+ size = size / nr_cpu_ids;
+
+ /*
+ * The number of sub-buffers (nr_pages) is determined by the
+ * total size allocated minus the meta data size.
+ * Then that is divided by the number of per CPU buffers
+ * needed, plus account for the integer array index that
+ * will be appended to the meta data.
+ */
+ nr_pages = (size - sizeof(struct ring_buffer_meta)) /
+ (subbuf_size + sizeof(int));
+ /* Need at least two pages plus the reader page */
+ if (nr_pages < 3)
+ goto fail_free_buffers;
+
+ again:
+ /* Make sure that the size fits aligned */
+ for (n = 0, ptr = start; n < nr_cpu_ids; n++) {
+ ptr += sizeof(struct ring_buffer_meta) +
+ sizeof(int) * nr_pages;
+ ptr = ALIGN(ptr, subbuf_size);
+ ptr += subbuf_size * nr_pages;
+ }
+ if (ptr > end) {
+ if (nr_pages <= 3)
+ goto fail_free_buffers;
+ nr_pages--;
+ goto again;
+ }
+
+ /* nr_pages should not count the reader page */
+ nr_pages--;
+ buffer->range_addr_start = start;
+ buffer->range_addr_end = end;
+
+ rb_range_meta_init(buffer, nr_pages);
+ } else {
+
+ /* need at least two pages */
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
+ if (nr_pages < 2)
+ nr_pages = 2;
+ }
+
cpu = raw_smp_processor_id();
cpumask_set_cpu(cpu, buffer->cpumask);
buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
@@ -1753,9 +2359,73 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
kfree(buffer);
return NULL;
}
+
+/**
+ * __ring_buffer_alloc - allocate a new ring_buffer
+ * @size: the size in bytes per cpu that is needed.
+ * @flags: attributes to set for the ring buffer.
+ * @key: ring buffer reader_lock_key.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
+ struct lock_class_key *key)
+{
+ /* Default buffer page size - one system page */
+ return alloc_buffer(size, flags, 0, 0, 0,key);
+
+}
EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
/**
+ * __ring_buffer_alloc_range - allocate a new ring_buffer from existing memory
+ * @size: the size in bytes per cpu that is needed.
+ * @flags: attributes to set for the ring buffer.
+ * @start: start of allocated range
+ * @range_size: size of allocated range
+ * @order: sub-buffer order
+ * @key: ring buffer reader_lock_key.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags,
+ int order, unsigned long start,
+ unsigned long range_size,
+ struct lock_class_key *key)
+{
+ return alloc_buffer(size, flags, order, start, start + range_size, key);
+}
+
+/**
+ * ring_buffer_last_boot_delta - return the delta offset from last boot
+ * @buffer: The buffer to return the delta from
+ * @text: Return text delta
+ * @data: Return data delta
+ *
+ * Returns: The true if the delta is non zero
+ */
+bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text,
+ long *data)
+{
+ if (!buffer)
+ return false;
+
+ if (!buffer->last_text_delta)
+ return false;
+
+ *text = buffer->last_text_delta;
+ *data = buffer->last_data_delta;
+
+ return true;
+}
+
+/**
* ring_buffer_free - free a ring buffer.
* @buffer: the buffer to free.
*/
@@ -2364,6 +3034,52 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
iter->next_event = 0;
}
+/* Return the index into the sub-buffers for a given sub-buffer */
+static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf)
+{
+ void *subbuf_array;
+
+ subbuf_array = (void *)meta + sizeof(int) * meta->nr_subbufs;
+ subbuf_array = (void *)ALIGN((unsigned long)subbuf_array, meta->subbuf_size);
+ return (subbuf - subbuf_array) / meta->subbuf_size;
+}
+
+static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *next_page)
+{
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ unsigned long old_head = (unsigned long)next_page->page;
+ unsigned long new_head;
+
+ rb_inc_page(&next_page);
+ new_head = (unsigned long)next_page->page;
+
+ /*
+ * Only move it forward once, if something else came in and
+ * moved it forward, then we don't want to touch it.
+ */
+ (void)cmpxchg(&meta->head_buffer, old_head, new_head);
+}
+
+static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *reader)
+{
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ void *old_reader = cpu_buffer->reader_page->page;
+ void *new_reader = reader->page;
+ int id;
+
+ id = reader->id;
+ cpu_buffer->reader_page->id = id;
+ reader->id = 0;
+
+ meta->buffers[0] = rb_meta_subbuf_idx(meta, new_reader);
+ meta->buffers[id] = rb_meta_subbuf_idx(meta, old_reader);
+
+ /* The head pointer is the one after the reader */
+ rb_update_meta_head(cpu_buffer, reader);
+}
+
/*
* rb_handle_head_page - writer hit the head page
*
@@ -2413,6 +3129,8 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes);
local_inc(&cpu_buffer->pages_lost);
+ if (cpu_buffer->ring_meta)
+ rb_update_meta_head(cpu_buffer, next_page);
/*
* The entries will be zeroed out when we move the
* tail page.
@@ -2974,6 +3692,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(&cpu_buffer->commit_page);
+ if (cpu_buffer->ring_meta) {
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page;
+ }
/* add barrier to keep gcc from optimizing too much */
barrier();
}
@@ -3420,11 +4142,10 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
struct rb_event_info *info,
unsigned long tail)
{
- struct ring_buffer_event *event;
struct buffer_data_page *bpage;
u64 ts, delta;
bool full = false;
- int e;
+ int ret;
bpage = info->tail_page->page;
@@ -3450,39 +4171,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
goto out;
- ts = bpage->time_stamp;
-
- for (e = 0; e < tail; e += rb_event_length(event)) {
-
- event = (struct ring_buffer_event *)(bpage->data + e);
-
- switch (event->type_len) {
-
- case RINGBUF_TYPE_TIME_EXTEND:
- delta = rb_event_time_stamp(event);
- ts += delta;
- break;
-
- case RINGBUF_TYPE_TIME_STAMP:
- delta = rb_event_time_stamp(event);
- delta = rb_fix_abs_ts(delta, ts);
- if (delta < ts) {
- buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
- cpu_buffer->cpu, ts, delta);
- }
- ts = delta;
- break;
-
- case RINGBUF_TYPE_PADDING:
- if (event->time_delta == 1)
- break;
- fallthrough;
- case RINGBUF_TYPE_DATA:
- ts += event->time_delta;
- break;
-
- default:
- RB_WARN_ON(cpu_buffer, 1);
+ ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta);
+ if (ret < 0) {
+ if (delta < ts) {
+ buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
+ cpu_buffer->cpu, ts, delta);
+ goto out;
}
}
if ((full && ts > info->ts) ||
@@ -4591,6 +5285,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
if (!ret)
goto spin;
+ if (cpu_buffer->ring_meta)
+ rb_update_meta_reader(cpu_buffer, reader);
+
/*
* Yay! We succeeded in replacing the page.
*
@@ -5212,6 +5909,9 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
+ if (!meta)
+ return;
+
meta->reader.read = cpu_buffer->reader_page->read;
meta->reader.id = cpu_buffer->reader_page->id;
meta->reader.lost_events = cpu_buffer->lost_events;
@@ -5268,11 +5968,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->lost_events = 0;
cpu_buffer->last_overrun = 0;
- if (cpu_buffer->mapped)
- rb_update_meta_page(cpu_buffer);
-
rb_head_page_activate(cpu_buffer);
cpu_buffer->pages_removed = 0;
+
+ if (cpu_buffer->mapped) {
+ rb_update_meta_page(cpu_buffer);
+ if (cpu_buffer->ring_meta) {
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ meta->commit_buffer = meta->head_buffer;
+ }
+ }
}
/* Must have disabled the cpu buffer then done a synchronize_rcu */
@@ -5303,6 +6008,7 @@ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ struct ring_buffer_meta *meta;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
@@ -5321,6 +6027,11 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
atomic_dec(&cpu_buffer->record_disabled);
atomic_dec(&cpu_buffer->resize_disabled);
+ /* Make sure persistent meta now uses this buffer's addresses */
+ meta = rb_range_meta(buffer, 0, cpu_buffer->cpu);
+ if (meta)
+ rb_meta_init_text_addr(meta);
+
mutex_unlock(&buffer->mutex);
}
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
@@ -5335,6 +6046,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_meta *meta;
int cpu;
/* prevent another thread from changing buffer sizes */
@@ -5362,6 +6074,11 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
reset_disabled_cpu_buffer(cpu_buffer);
+ /* Make sure persistent meta now uses this buffer's addresses */
+ meta = rb_range_meta(buffer, 0, cpu_buffer->cpu);
+ if (meta)
+ rb_meta_init_text_addr(meta);
+
atomic_dec(&cpu_buffer->record_disabled);
atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled);
}
@@ -6135,10 +6852,10 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
/* install subbuf ID to kern VA translation */
cpu_buffer->subbuf_ids = subbuf_ids;
- meta->meta_page_size = PAGE_SIZE;
meta->meta_struct_len = sizeof(*meta);
meta->nr_subbufs = nr_subbufs;
meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+ meta->meta_page_size = meta->subbuf_size;
rb_update_meta_page(cpu_buffer);
}
@@ -6155,7 +6872,7 @@ rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu)
mutex_lock(&cpu_buffer->mapping_lock);
- if (!cpu_buffer->mapped) {
+ if (!cpu_buffer->user_mapped) {
mutex_unlock(&cpu_buffer->mapping_lock);
return ERR_PTR(-ENODEV);
}
@@ -6179,19 +6896,26 @@ static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer,
lockdep_assert_held(&cpu_buffer->mapping_lock);
+ /* mapped is always greater or equal to user_mapped */
+ if (WARN_ON(cpu_buffer->mapped < cpu_buffer->user_mapped))
+ return -EINVAL;
+
if (inc && cpu_buffer->mapped == UINT_MAX)
return -EBUSY;
- if (WARN_ON(!inc && cpu_buffer->mapped == 0))
+ if (WARN_ON(!inc && cpu_buffer->user_mapped == 0))
return -EINVAL;
mutex_lock(&cpu_buffer->buffer->mutex);
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- if (inc)
+ if (inc) {
+ cpu_buffer->user_mapped++;
cpu_buffer->mapped++;
- else
+ } else {
+ cpu_buffer->user_mapped--;
cpu_buffer->mapped--;
+ }
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
mutex_unlock(&cpu_buffer->buffer->mutex);
@@ -6214,7 +6938,7 @@ static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer,
static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
struct vm_area_struct *vma)
{
- unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff;
+ unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff;
unsigned int subbuf_pages, subbuf_order;
struct page **pages;
int p = 0, s = 0;
@@ -6225,6 +6949,12 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
!(vma->vm_flags & VM_MAYSHARE))
return -EPERM;
+ subbuf_order = cpu_buffer->buffer->subbuf_order;
+ subbuf_pages = 1 << subbuf_order;
+
+ if (subbuf_order && pgoff % subbuf_pages)
+ return -EINVAL;
+
/*
* Make sure the mapping cannot become writable later. Also tell the VM
* to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND).
@@ -6234,37 +6964,38 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
lockdep_assert_held(&cpu_buffer->mapping_lock);
- subbuf_order = cpu_buffer->buffer->subbuf_order;
- subbuf_pages = 1 << subbuf_order;
-
nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */
- nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */
+ nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; /* + meta-page */
- vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
- if (!vma_pages || vma_pages > nr_pages)
+ nr_vma_pages = vma_pages(vma);
+ if (!nr_vma_pages || nr_vma_pages > nr_pages)
return -EINVAL;
- nr_pages = vma_pages;
+ nr_pages = nr_vma_pages;
pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
if (!pages)
return -ENOMEM;
if (!pgoff) {
+ unsigned long meta_page_padding;
+
pages[p++] = virt_to_page(cpu_buffer->meta_page);
/*
- * TODO: Align sub-buffers on their size, once
- * vm_insert_pages() supports the zero-page.
+ * Pad with the zero-page to align the meta-page with the
+ * sub-buffers.
*/
- } else {
- /* Skip the meta-page */
- pgoff--;
+ meta_page_padding = subbuf_pages - 1;
+ while (meta_page_padding-- && p < nr_pages) {
+ unsigned long __maybe_unused zero_addr =
+ vma->vm_start + (PAGE_SIZE * p);
- if (pgoff % subbuf_pages) {
- err = -EINVAL;
- goto out;
+ pages[p++] = ZERO_PAGE(zero_addr);
}
+ } else {
+ /* Skip the meta-page */
+ pgoff -= subbuf_pages;
s += pgoff / subbuf_pages;
}
@@ -6316,7 +7047,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
mutex_lock(&cpu_buffer->mapping_lock);
- if (cpu_buffer->mapped) {
+ if (cpu_buffer->user_mapped) {
err = __rb_map_vma(cpu_buffer, vma);
if (!err)
err = __rb_inc_dec_mapped(cpu_buffer, true);
@@ -6347,12 +7078,15 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
*/
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
rb_setup_ids_meta_page(cpu_buffer, subbuf_ids);
+
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
err = __rb_map_vma(cpu_buffer, vma);
if (!err) {
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- cpu_buffer->mapped = 1;
+ /* This is the first time it is mapped by user */
+ cpu_buffer->mapped++;
+ cpu_buffer->user_mapped = 1;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
} else {
kfree(cpu_buffer->subbuf_ids);
@@ -6380,10 +7114,10 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
mutex_lock(&cpu_buffer->mapping_lock);
- if (!cpu_buffer->mapped) {
+ if (!cpu_buffer->user_mapped) {
err = -ENODEV;
goto out;
- } else if (cpu_buffer->mapped > 1) {
+ } else if (cpu_buffer->user_mapped > 1) {
__rb_inc_dec_mapped(cpu_buffer, false);
goto out;
}
@@ -6391,7 +7125,10 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
mutex_lock(&buffer->mutex);
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- cpu_buffer->mapped = 0;
+ /* This is the last user space mapping */
+ if (!WARN_ON_ONCE(cpu_buffer->mapped < cpu_buffer->user_mapped))
+ cpu_buffer->mapped--;
+ cpu_buffer->user_mapped = 0;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index edf6bc817aa1..b4f348b4653f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -482,7 +482,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export);
TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \
TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \
TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \
- TRACE_ITER_HASH_PTR)
+ TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK)
/* trace_options that are only supported by global_trace */
#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \
@@ -490,7 +490,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export);
/* trace_flags that are default zero for instances */
#define ZEROED_TRACE_FLAGS \
- (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK)
+ (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK)
/*
* The global_trace is the descriptor that holds the top-level tracing
@@ -500,6 +500,29 @@ static struct trace_array global_trace = {
.trace_flags = TRACE_DEFAULT_FLAGS,
};
+static struct trace_array *printk_trace = &global_trace;
+
+static __always_inline bool printk_binsafe(struct trace_array *tr)
+{
+ /*
+ * The binary format of traceprintk can cause a crash if used
+ * by a buffer from another boot. Force the use of the
+ * non binary version of trace_printk if the trace_printk
+ * buffer is a boot mapped ring buffer.
+ */
+ return !(tr->flags & TRACE_ARRAY_FL_BOOT);
+}
+
+static void update_printk_trace(struct trace_array *tr)
+{
+ if (printk_trace == tr)
+ return;
+
+ printk_trace->trace_flags &= ~TRACE_ITER_TRACE_PRINTK;
+ printk_trace = tr;
+ tr->trace_flags |= TRACE_ITER_TRACE_PRINTK;
+}
+
void trace_set_ring_buffer_expanded(struct trace_array *tr)
{
if (!tr)
@@ -1117,7 +1140,7 @@ EXPORT_SYMBOL_GPL(__trace_array_puts);
*/
int __trace_puts(unsigned long ip, const char *str, int size)
{
- return __trace_array_puts(&global_trace, ip, str, size);
+ return __trace_array_puts(printk_trace, ip, str, size);
}
EXPORT_SYMBOL_GPL(__trace_puts);
@@ -1128,6 +1151,7 @@ EXPORT_SYMBOL_GPL(__trace_puts);
*/
int __trace_bputs(unsigned long ip, const char *str)
{
+ struct trace_array *tr = READ_ONCE(printk_trace);
struct ring_buffer_event *event;
struct trace_buffer *buffer;
struct bputs_entry *entry;
@@ -1135,14 +1159,17 @@ int __trace_bputs(unsigned long ip, const char *str)
int size = sizeof(struct bputs_entry);
int ret = 0;
- if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
+ if (!printk_binsafe(tr))
+ return __trace_puts(ip, str, strlen(str));
+
+ if (!(tr->trace_flags & TRACE_ITER_PRINTK))
return 0;
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
trace_ctx = tracing_gen_ctx();
- buffer = global_trace.array_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
@@ -1155,7 +1182,7 @@ int __trace_bputs(unsigned long ip, const char *str)
entry->str = str;
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
ret = 1;
out:
@@ -2226,10 +2253,6 @@ static __init int init_trace_selftests(void)
}
core_initcall(init_trace_selftests);
#else
-static inline int run_tracer_selftest(struct tracer *type)
-{
- return 0;
-}
static inline int do_run_tracer_selftest(struct tracer *type)
{
return 0;
@@ -3025,7 +3048,7 @@ void trace_dump_stack(int skip)
/* Skip 1 to skip this function. */
skip++;
#endif
- __ftrace_trace_stack(global_trace.array_buffer.buffer,
+ __ftrace_trace_stack(printk_trace->array_buffer.buffer,
tracing_gen_ctx(), skip, NULL);
}
EXPORT_SYMBOL_GPL(trace_dump_stack);
@@ -3244,12 +3267,15 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
struct trace_event_call *call = &event_bprint;
struct ring_buffer_event *event;
struct trace_buffer *buffer;
- struct trace_array *tr = &global_trace;
+ struct trace_array *tr = READ_ONCE(printk_trace);
struct bprint_entry *entry;
unsigned int trace_ctx;
char *tbuffer;
int len = 0, size;
+ if (!printk_binsafe(tr))
+ return trace_vprintk(ip, fmt, args);
+
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
@@ -3342,7 +3368,7 @@ __trace_array_vprintk(struct trace_buffer *buffer,
memcpy(&entry->buf, tbuffer, len + 1);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL);
+ ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL);
}
out:
@@ -3438,7 +3464,7 @@ int trace_array_printk_buf(struct trace_buffer *buffer,
int ret;
va_list ap;
- if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
+ if (!(printk_trace->trace_flags & TRACE_ITER_PRINTK))
return 0;
va_start(ap, fmt);
@@ -3450,7 +3476,7 @@ int trace_array_printk_buf(struct trace_buffer *buffer,
__printf(2, 0)
int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
{
- return trace_array_vprintk(&global_trace, ip, fmt, args);
+ return trace_array_vprintk(printk_trace, ip, fmt, args);
}
EXPORT_SYMBOL_GPL(trace_vprintk);
@@ -3671,8 +3697,11 @@ static void test_can_verify(void)
void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
va_list ap)
{
+ long text_delta = iter->tr->text_delta;
+ long data_delta = iter->tr->data_delta;
const char *p = fmt;
const char *str;
+ bool good;
int i, j;
if (WARN_ON_ONCE(!fmt))
@@ -3691,7 +3720,10 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
j = 0;
- /* We only care about %s and variants */
+ /*
+ * We only care about %s and variants
+ * as well as %p[sS] if delta is non-zero
+ */
for (i = 0; p[i]; i++) {
if (i + 1 >= iter->fmt_size) {
/*
@@ -3720,6 +3752,11 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
}
if (p[i+j] == 's')
break;
+
+ if (text_delta && p[i+1] == 'p' &&
+ ((p[i+2] == 's' || p[i+2] == 'S')))
+ break;
+
star = false;
}
j = 0;
@@ -3733,6 +3770,24 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
iter->fmt[i] = '\0';
trace_seq_vprintf(&iter->seq, iter->fmt, ap);
+ /* Add delta to %pS pointers */
+ if (p[i+1] == 'p') {
+ unsigned long addr;
+ char fmt[4];
+
+ fmt[0] = '%';
+ fmt[1] = 'p';
+ fmt[2] = p[i+2]; /* Either %ps or %pS */
+ fmt[3] = '\0';
+
+ addr = va_arg(ap, unsigned long);
+ addr += text_delta;
+ trace_seq_printf(&iter->seq, fmt, (void *)addr);
+
+ p += i + 3;
+ continue;
+ }
+
/*
* If iter->seq is full, the above call no longer guarantees
* that ap is in sync with fmt processing, and further calls
@@ -3751,6 +3806,14 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
/* The ap now points to the string data of the %s */
str = va_arg(ap, const char *);
+ good = trace_safe_str(iter, str, star, len);
+
+ /* Could be from the last boot */
+ if (data_delta && !good) {
+ str += data_delta;
+ good = trace_safe_str(iter, str, star, len);
+ }
+
/*
* If you hit this warning, it is likely that the
* trace event in question used %s on a string that
@@ -3760,8 +3823,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
* instead. See samples/trace_events/trace-events-sample.h
* for reference.
*/
- if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
- "fmt: '%s' current_buffer: '%s'",
+ if (WARN_ONCE(!good, "fmt: '%s' current_buffer: '%s'",
fmt, seq_buf_str(&iter->seq.seq))) {
int ret;
@@ -4923,6 +4985,11 @@ static int tracing_open(struct inode *inode, struct file *file)
static bool
trace_ok_for_array(struct tracer *t, struct trace_array *tr)
{
+#ifdef CONFIG_TRACER_SNAPSHOT
+ /* arrays with mapped buffer range do not have snapshots */
+ if (tr->range_addr_start && t->use_max_tr)
+ return false;
+#endif
return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances;
}
@@ -5015,7 +5082,7 @@ static int show_traces_open(struct inode *inode, struct file *file)
return 0;
}
-static int show_traces_release(struct inode *inode, struct file *file)
+static int tracing_seq_release(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
@@ -5056,7 +5123,7 @@ static const struct file_operations show_traces_fops = {
.open = show_traces_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = show_traces_release,
+ .release = tracing_seq_release,
};
static ssize_t
@@ -5241,7 +5308,8 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
if ((mask == TRACE_ITER_RECORD_TGID) ||
- (mask == TRACE_ITER_RECORD_CMD))
+ (mask == TRACE_ITER_RECORD_CMD) ||
+ (mask == TRACE_ITER_TRACE_PRINTK))
lockdep_assert_held(&event_mutex);
/* do nothing if flag is already set */
@@ -5253,6 +5321,25 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (tr->current_trace->flag_changed(tr, mask, !!enabled))
return -EINVAL;
+ if (mask == TRACE_ITER_TRACE_PRINTK) {
+ if (enabled) {
+ update_printk_trace(tr);
+ } else {
+ /*
+ * The global_trace cannot clear this.
+ * It's flag only gets cleared if another instance sets it.
+ */
+ if (printk_trace == &global_trace)
+ return -EINVAL;
+ /*
+ * An instance must always have it set.
+ * by default, that's the global_trace instane.
+ */
+ if (printk_trace == tr)
+ update_printk_trace(&global_trace);
+ }
+ }
+
if (enabled)
tr->trace_flags |= mask;
else
@@ -6038,6 +6125,18 @@ out:
return ret;
}
+static void update_last_data(struct trace_array *tr)
+{
+ if (!tr->text_delta && !tr->data_delta)
+ return;
+
+ /* Clear old data */
+ tracing_reset_online_cpus(&tr->array_buffer);
+
+ /* Using current data now */
+ tr->text_delta = 0;
+ tr->data_delta = 0;
+}
/**
* tracing_update_buffers - used by tracing facility to expand ring buffers
@@ -6055,6 +6154,9 @@ int tracing_update_buffers(struct trace_array *tr)
int ret = 0;
mutex_lock(&trace_types_lock);
+
+ update_last_data(tr);
+
if (!tr->ring_buffer_expanded)
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
@@ -6110,6 +6212,8 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
mutex_lock(&trace_types_lock);
+ update_last_data(tr);
+
if (!tr->ring_buffer_expanded) {
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
@@ -6858,6 +6962,37 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
}
static ssize_t
+tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ struct seq_buf seq;
+ char buf[64];
+
+ seq_buf_init(&seq, buf, 64);
+
+ seq_buf_printf(&seq, "text delta:\t%ld\n", tr->text_delta);
+ seq_buf_printf(&seq, "data delta:\t%ld\n", tr->data_delta);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, seq_buf_used(&seq));
+}
+
+static int tracing_buffer_meta_open(struct inode *inode, struct file *filp)
+{
+ struct trace_array *tr = inode->i_private;
+ int cpu = tracing_get_cpu(inode);
+ int ret;
+
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
+
+ ret = ring_buffer_meta_seq_init(filp, tr->array_buffer.buffer, cpu);
+ if (ret < 0)
+ __trace_array_put(tr);
+ return ret;
+}
+
+static ssize_t
tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -7433,6 +7568,13 @@ static const struct file_operations tracing_entries_fops = {
.release = tracing_release_generic_tr,
};
+static const struct file_operations tracing_buffer_meta_fops = {
+ .open = tracing_buffer_meta_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = tracing_seq_release,
+};
+
static const struct file_operations tracing_total_entries_fops = {
.open = tracing_open_generic_tr,
.read = tracing_total_entries_read,
@@ -7473,6 +7615,13 @@ static const struct file_operations trace_time_stamp_mode_fops = {
.release = tracing_single_release_tr,
};
+static const struct file_operations last_boot_fops = {
+ .open = tracing_open_generic_tr,
+ .read = tracing_last_boot_read,
+ .llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
+};
+
#ifdef CONFIG_TRACER_SNAPSHOT
static const struct file_operations snapshot_fops = {
.open = tracing_snapshot_open,
@@ -8665,12 +8814,17 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_entries_fops);
+ if (tr->range_addr_start)
+ trace_create_cpu_file("buffer_meta", TRACE_MODE_READ, d_cpu,
+ tr, cpu, &tracing_buffer_meta_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
- trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
- tr, cpu, &snapshot_fops);
+ if (!tr->range_addr_start) {
+ trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
+ tr, cpu, &snapshot_fops);
- trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
- tr, cpu, &snapshot_raw_fops);
+ trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
+ tr, cpu, &snapshot_raw_fops);
+ }
#endif
}
@@ -9207,7 +9361,21 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size
buf->tr = tr;
- buf->buffer = ring_buffer_alloc(size, rb_flags);
+ if (tr->range_addr_start && tr->range_addr_size) {
+ buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0,
+ tr->range_addr_start,
+ tr->range_addr_size);
+
+ ring_buffer_last_boot_delta(buf->buffer,
+ &tr->text_delta, &tr->data_delta);
+ /*
+ * This is basically the same as a mapped buffer,
+ * with the same restrictions.
+ */
+ tr->mapped++;
+ } else {
+ buf->buffer = ring_buffer_alloc(size, rb_flags);
+ }
if (!buf->buffer)
return -ENOMEM;
@@ -9244,6 +9412,10 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
return ret;
#ifdef CONFIG_TRACER_MAX_TRACE
+ /* Fix mapped buffer trace arrays do not have snapshot buffers */
+ if (tr->range_addr_start)
+ return 0;
+
ret = allocate_trace_buffer(tr, &tr->max_buffer,
allocate_snapshot ? size : 1);
if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) {
@@ -9344,7 +9516,9 @@ static int trace_array_create_dir(struct trace_array *tr)
}
static struct trace_array *
-trace_array_create_systems(const char *name, const char *systems)
+trace_array_create_systems(const char *name, const char *systems,
+ unsigned long range_addr_start,
+ unsigned long range_addr_size)
{
struct trace_array *tr;
int ret;
@@ -9370,6 +9544,10 @@ trace_array_create_systems(const char *name, const char *systems)
goto out_free_tr;
}
+ /* Only for boot up memory mapped ring buffers */
+ tr->range_addr_start = range_addr_start;
+ tr->range_addr_size = range_addr_size;
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9427,7 +9605,7 @@ trace_array_create_systems(const char *name, const char *systems)
static struct trace_array *trace_array_create(const char *name)
{
- return trace_array_create_systems(name, NULL);
+ return trace_array_create_systems(name, NULL, 0, 0);
}
static int instance_mkdir(const char *name)
@@ -9452,6 +9630,31 @@ out_unlock:
return ret;
}
+static u64 map_pages(u64 start, u64 size)
+{
+ struct page **pages;
+ phys_addr_t page_start;
+ unsigned int page_count;
+ unsigned int i;
+ void *vaddr;
+
+ page_count = DIV_ROUND_UP(size, PAGE_SIZE);
+
+ page_start = start;
+ pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ return 0;
+
+ for (i = 0; i < page_count; i++) {
+ phys_addr_t addr = page_start + i * PAGE_SIZE;
+ pages[i] = pfn_to_page(addr >> PAGE_SHIFT);
+ }
+ vaddr = vmap(pages, page_count, VM_MAP, PAGE_KERNEL);
+ kfree(pages);
+
+ return (u64)(unsigned long)vaddr;
+}
+
/**
* trace_array_get_by_name - Create/Lookup a trace array, given its name.
* @name: The name of the trace array to be looked up/created.
@@ -9481,7 +9684,7 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system
goto out_unlock;
}
- tr = trace_array_create_systems(name, systems);
+ tr = trace_array_create_systems(name, systems, 0, 0);
if (IS_ERR(tr))
tr = NULL;
@@ -9511,6 +9714,9 @@ static int __remove_instance(struct trace_array *tr)
set_tracer_flag(tr, 1 << i, 0);
}
+ if (printk_trace == tr)
+ update_printk_trace(&global_trace);
+
tracing_set_nop(tr);
clear_ftrace_function_probes(tr);
event_trace_del_tracer(tr);
@@ -9673,10 +9879,15 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
if (ftrace_create_function_files(tr, d_tracer))
MEM_FAIL(1, "Could not allocate function filter files");
+ if (tr->range_addr_start) {
+ trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer,
+ tr, &last_boot_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
- trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
- tr, &snapshot_fops);
+ } else {
+ trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
+ tr, &snapshot_fops);
#endif
+ }
trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_err_log_fops);
@@ -10296,6 +10507,7 @@ __init static void enable_instances(void)
{
struct trace_array *tr;
char *curr_str;
+ char *name;
char *str;
char *tok;
@@ -10304,19 +10516,107 @@ __init static void enable_instances(void)
str = boot_instance_info;
while ((curr_str = strsep(&str, "\t"))) {
+ phys_addr_t start = 0;
+ phys_addr_t size = 0;
+ unsigned long addr = 0;
+ bool traceprintk = false;
+ bool traceoff = false;
+ char *flag_delim;
+ char *addr_delim;
tok = strsep(&curr_str, ",");
- if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
- do_allocate_snapshot(tok);
+ flag_delim = strchr(tok, '^');
+ addr_delim = strchr(tok, '@');
- tr = trace_array_get_by_name(tok, NULL);
- if (!tr) {
- pr_warn("Failed to create instance buffer %s\n", curr_str);
+ if (addr_delim)
+ *addr_delim++ = '\0';
+
+ if (flag_delim)
+ *flag_delim++ = '\0';
+
+ name = tok;
+
+ if (flag_delim) {
+ char *flag;
+
+ while ((flag = strsep(&flag_delim, "^"))) {
+ if (strcmp(flag, "traceoff") == 0) {
+ traceoff = true;
+ } else if ((strcmp(flag, "printk") == 0) ||
+ (strcmp(flag, "traceprintk") == 0) ||
+ (strcmp(flag, "trace_printk") == 0)) {
+ traceprintk = true;
+ } else {
+ pr_info("Tracing: Invalid instance flag '%s' for %s\n",
+ flag, name);
+ }
+ }
+ }
+
+ tok = addr_delim;
+ if (tok && isdigit(*tok)) {
+ start = memparse(tok, &tok);
+ if (!start) {
+ pr_warn("Tracing: Invalid boot instance address for %s\n",
+ name);
+ continue;
+ }
+ if (*tok != ':') {
+ pr_warn("Tracing: No size specified for instance %s\n", name);
+ continue;
+ }
+ tok++;
+ size = memparse(tok, &tok);
+ if (!size) {
+ pr_warn("Tracing: Invalid boot instance size for %s\n",
+ name);
+ continue;
+ }
+ } else if (tok) {
+ if (!reserve_mem_find_by_name(tok, &start, &size)) {
+ start = 0;
+ pr_warn("Failed to map boot instance %s to %s\n", name, tok);
+ continue;
+ }
+ }
+
+ if (start) {
+ addr = map_pages(start, size);
+ if (addr) {
+ pr_info("Tracing: mapped boot instance %s at physical memory %pa of size 0x%lx\n",
+ name, &start, (unsigned long)size);
+ } else {
+ pr_warn("Tracing: Failed to map boot instance %s\n", name);
+ continue;
+ }
+ } else {
+ /* Only non mapped buffers have snapshot buffers */
+ if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
+ do_allocate_snapshot(name);
+ }
+
+ tr = trace_array_create_systems(name, NULL, addr, size);
+ if (IS_ERR(tr)) {
+ pr_warn("Tracing: Failed to create instance buffer %s\n", curr_str);
continue;
}
- /* Allow user space to delete it */
- trace_array_put(tr);
+
+ if (traceoff)
+ tracer_tracing_off(tr);
+
+ if (traceprintk)
+ update_printk_trace(tr);
+
+ /*
+ * If start is set, then this is a mapped buffer, and
+ * cannot be deleted by user space, so keep the reference
+ * to it.
+ */
+ if (start)
+ tr->flags |= TRACE_ARRAY_FL_BOOT;
+ else
+ trace_array_put(tr);
while ((tok = strsep(&curr_str, ","))) {
early_enable_events(tr, tok, true);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index bd3e3069300e..c866991b9c78 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -336,7 +336,6 @@ struct trace_array {
bool allocated_snapshot;
spinlock_t snapshot_trigger_lock;
unsigned int snapshot;
- unsigned int mapped;
unsigned long max_latency;
#ifdef CONFIG_FSNOTIFY
struct dentry *d_max_latency;
@@ -344,6 +343,13 @@ struct trace_array {
struct irq_work fsnotify_irqwork;
#endif
#endif
+ /* The below is for memory mapped ring buffer */
+ unsigned int mapped;
+ unsigned long range_addr_start;
+ unsigned long range_addr_size;
+ long text_delta;
+ long data_delta;
+
struct trace_pid_list __rcu *filtered_pids;
struct trace_pid_list __rcu *filtered_no_pids;
/*
@@ -423,7 +429,8 @@ struct trace_array {
};
enum {
- TRACE_ARRAY_FL_GLOBAL = (1 << 0)
+ TRACE_ARRAY_FL_GLOBAL = BIT(0),
+ TRACE_ARRAY_FL_BOOT = BIT(1),
};
extern struct list_head ftrace_trace_arrays;
@@ -644,6 +651,8 @@ trace_buffer_lock_reserve(struct trace_buffer *buffer,
unsigned long len,
unsigned int trace_ctx);
+int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu);
+
struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_array_cpu *data);
@@ -1312,6 +1321,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(IRQ_INFO, "irq-info"), \
C(MARKERS, "markers"), \
C(EVENT_FORK, "event-fork"), \
+ C(TRACE_PRINTK, "trace_printk_dest"), \
C(PAUSE_ON_TRACE, "pause-on-trace"), \
C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \
FUNCTION_FLAGS \
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 13d0387ac6a6..a569daaac4c4 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -544,6 +544,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
+ addr += iter->tr->text_delta;
+
if (addr < (unsigned long)__irqentry_text_start ||
addr >= (unsigned long)__irqentry_text_end)
return;
@@ -710,6 +712,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
struct ftrace_graph_ret *graph_ret;
struct ftrace_graph_ent *call;
unsigned long long duration;
+ unsigned long func;
int cpu = iter->cpu;
int i;
@@ -717,6 +720,8 @@ print_graph_entry_leaf(struct trace_iterator *iter,
call = &entry->graph_ent;
duration = graph_ret->rettime - graph_ret->calltime;
+ func = call->func + iter->tr->text_delta;
+
if (data) {
struct fgraph_cpu_data *cpu_data;
@@ -747,10 +752,10 @@ print_graph_entry_leaf(struct trace_iterator *iter,
* enabled.
*/
if (flags & __TRACE_GRAPH_PRINT_RETVAL)
- print_graph_retval(s, graph_ret->retval, true, (void *)call->func,
+ print_graph_retval(s, graph_ret->retval, true, (void *)func,
!!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
else
- trace_seq_printf(s, "%ps();\n", (void *)call->func);
+ trace_seq_printf(s, "%ps();\n", (void *)func);
print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET,
cpu, iter->ent->pid, flags);
@@ -766,6 +771,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
struct ftrace_graph_ent *call = &entry->graph_ent;
struct fgraph_data *data = iter->private;
struct trace_array *tr = iter->tr;
+ unsigned long func;
int i;
if (data) {
@@ -788,7 +794,9 @@ print_graph_entry_nested(struct trace_iterator *iter,
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
trace_seq_putc(s, ' ');
- trace_seq_printf(s, "%ps() {\n", (void *)call->func);
+ func = call->func + iter->tr->text_delta;
+
+ trace_seq_printf(s, "%ps() {\n", (void *)func);
if (trace_seq_has_overflowed(s))
return TRACE_TYPE_PARTIAL_LINE;
@@ -863,6 +871,8 @@ check_irq_entry(struct trace_iterator *iter, u32 flags,
int *depth_irq;
struct fgraph_data *data = iter->private;
+ addr += iter->tr->text_delta;
+
/*
* If we are either displaying irqs, or we got called as
* a graph event and private data does not exist,
@@ -990,11 +1000,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
unsigned long long duration = trace->rettime - trace->calltime;
struct fgraph_data *data = iter->private;
struct trace_array *tr = iter->tr;
+ unsigned long func;
pid_t pid = ent->pid;
int cpu = iter->cpu;
int func_match = 1;
int i;
+ func = trace->func + iter->tr->text_delta;
+
if (check_irq_return(iter, flags, trace->depth))
return TRACE_TYPE_HANDLED;
@@ -1033,7 +1046,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
* function-retval option is enabled.
*/
if (flags & __TRACE_GRAPH_PRINT_RETVAL) {
- print_graph_retval(s, trace->retval, false, (void *)trace->func,
+ print_graph_retval(s, trace->retval, false, (void *)func,
!!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
} else {
/*
@@ -1046,7 +1059,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
trace_seq_puts(s, "}\n");
else
- trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
+ trace_seq_printf(s, "} /* %ps */\n", (void *)func);
}
/* Overrun */
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index bbe47781617e..1439064f65d6 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -228,6 +228,11 @@ static inline struct osnoise_variables *this_cpu_osn_var(void)
return this_cpu_ptr(&per_cpu_osnoise_var);
}
+/*
+ * Protect the interface.
+ */
+static struct mutex interface_lock;
+
#ifdef CONFIG_TIMERLAT_TRACER
/*
* Runtime information for the timer mode.
@@ -253,11 +258,6 @@ static inline struct timerlat_variables *this_cpu_tmr_var(void)
}
/*
- * Protect the interface.
- */
-static struct mutex interface_lock;
-
-/*
* tlat_var_reset - Reset the values of the given timerlat_variables
*/
static inline void tlat_var_reset(void)
@@ -1541,7 +1541,7 @@ static int run_osnoise(void)
* This will eventually cause unwarranted noise as PREEMPT_RCU
* will force preemption as the means of ending the current
* grace period. We avoid this problem by calling
- * rcu_momentary_dyntick_idle(), which performs a zero duration
+ * rcu_momentary_eqs(), which performs a zero duration
* EQS allowing PREEMPT_RCU to end the current grace period.
* This call shouldn't be wrapped inside an RCU critical
* section.
@@ -1553,7 +1553,7 @@ static int run_osnoise(void)
if (!disable_irq)
local_irq_disable();
- rcu_momentary_dyntick_idle();
+ rcu_momentary_eqs();
if (!disable_irq)
local_irq_enable();
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d8b302d01083..868f2f912f28 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -990,8 +990,11 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
}
static void print_fn_trace(struct trace_seq *s, unsigned long ip,
- unsigned long parent_ip, int flags)
+ unsigned long parent_ip, long delta, int flags)
{
+ ip += delta;
+ parent_ip += delta;
+
seq_print_ip_sym(s, ip, flags);
if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) {
@@ -1009,7 +1012,7 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- print_fn_trace(s, field->ip, field->parent_ip, flags);
+ print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags);
trace_seq_putc(s, '\n');
return trace_handle_return(s);
@@ -1230,6 +1233,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
unsigned long *p;
unsigned long *end;
+ long delta = iter->tr->text_delta;
trace_assign_type(field, iter->ent);
end = (unsigned long *)((long)iter->ent + iter->ent_size);
@@ -1242,7 +1246,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
break;
trace_seq_puts(s, " => ");
- seq_print_ip_sym(s, *p, flags);
+ seq_print_ip_sym(s, (*p) + delta, flags);
trace_seq_putc(s, '\n');
}
@@ -1587,10 +1591,13 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
{
struct print_entry *field;
struct trace_seq *s = &iter->seq;
+ unsigned long ip;
trace_assign_type(field, iter->ent);
- seq_print_ip_sym(s, field->ip, flags);
+ ip = field->ip + iter->tr->text_delta;
+
+ seq_print_ip_sym(s, ip, flags);
trace_seq_printf(s, ": %s", field->buf);
return trace_handle_return(s);
@@ -1674,7 +1681,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- print_fn_trace(s, field->ip, field->parent_ip, flags);
+ print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags);
trace_seq_printf(s, " (repeats: %u, last_ts:", field->count);
trace_print_time(s, iter,
iter->ts - FUNC_REPEATS_GET_DELTA_TS(field));
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 130ca7e7787e..ae2ace5e515a 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -547,7 +547,7 @@ probe_wakeup(void *ignore, struct task_struct *p)
* - wakeup_dl handles tasks belonging to sched_dl class only.
*/
if (tracing_dl || (wakeup_dl && !dl_task(p)) ||
- (wakeup_rt && !dl_task(p) && !rt_task(p)) ||
+ (wakeup_rt && !rt_or_dl_task(p)) ||
(!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
return;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 9c581d6da843..785733245ead 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -564,6 +564,7 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re
BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
+ perf_fetch_caller_regs(regs);
*(struct pt_regs **)&param = regs;
param.syscall_nr = rec->nr;
for (i = 0; i < sys_data->nb_args; i++)
@@ -575,6 +576,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
+ struct pt_regs *fake_regs;
struct hlist_head *head;
unsigned long args[6];
bool valid_prog_array;
@@ -602,7 +604,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
size = ALIGN(size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = perf_trace_buf_alloc(size, NULL, &rctx);
+ rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
if (!rec)
return;
@@ -611,7 +613,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
if ((valid_prog_array &&
- !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
+ !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
@@ -666,6 +668,7 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg
} __aligned(8) param;
/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
+ perf_fetch_caller_regs(regs);
*(struct pt_regs **)&param = regs;
param.syscall_nr = rec->nr;
param.ret = rec->ret;
@@ -676,6 +679,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
+ struct pt_regs *fake_regs;
struct hlist_head *head;
bool valid_prog_array;
int syscall_nr;
@@ -701,7 +705,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = perf_trace_buf_alloc(size, NULL, &rctx);
+ rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
if (!rec)
return;
@@ -709,7 +713,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->ret = syscall_get_return_value(current, regs);
if ((valid_prog_array &&
- !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) ||
+ !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c3df411a2684..c40531d2cbad 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -59,8 +59,8 @@ struct trace_uprobe {
struct dyn_event devent;
struct uprobe_consumer consumer;
struct path path;
- struct inode *inode;
char *filename;
+ struct uprobe *uprobe;
unsigned long offset;
unsigned long ref_ctr_offset;
unsigned long __percpu *nhits;
@@ -1095,43 +1095,40 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
return trace_handle_return(s);
}
-typedef bool (*filter_func_t)(struct uprobe_consumer *self,
- enum uprobe_filter_ctx ctx,
- struct mm_struct *mm);
+typedef bool (*filter_func_t)(struct uprobe_consumer *self, struct mm_struct *mm);
static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter)
{
- int ret;
+ struct inode *inode = d_real_inode(tu->path.dentry);
+ struct uprobe *uprobe;
tu->consumer.filter = filter;
- tu->inode = d_real_inode(tu->path.dentry);
-
- if (tu->ref_ctr_offset)
- ret = uprobe_register_refctr(tu->inode, tu->offset,
- tu->ref_ctr_offset, &tu->consumer);
- else
- ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+ uprobe = uprobe_register(inode, tu->offset, tu->ref_ctr_offset, &tu->consumer);
+ if (IS_ERR(uprobe))
+ return PTR_ERR(uprobe);
- if (ret)
- tu->inode = NULL;
-
- return ret;
+ tu->uprobe = uprobe;
+ return 0;
}
static void __probe_event_disable(struct trace_probe *tp)
{
struct trace_uprobe *tu;
+ bool sync = false;
tu = container_of(tp, struct trace_uprobe, tp);
WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
- if (!tu->inode)
+ if (!tu->uprobe)
continue;
- uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
- tu->inode = NULL;
+ uprobe_unregister_nosync(tu->uprobe, &tu->consumer);
+ sync = true;
+ tu->uprobe = NULL;
}
+ if (sync)
+ uprobe_unregister_sync();
}
static int probe_event_enable(struct trace_event_call *call,
@@ -1327,7 +1324,7 @@ static int uprobe_perf_close(struct trace_event_call *call,
return 0;
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
- ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
+ ret = uprobe_apply(tu->uprobe, &tu->consumer, false);
if (ret)
break;
}
@@ -1351,7 +1348,7 @@ static int uprobe_perf_open(struct trace_event_call *call,
return 0;
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
- err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+ err = uprobe_apply(tu->uprobe, &tu->consumer, true);
if (err) {
uprobe_perf_close(call, event);
break;
@@ -1361,8 +1358,7 @@ static int uprobe_perf_open(struct trace_event_call *call,
return err;
}
-static bool uprobe_perf_filter(struct uprobe_consumer *uc,
- enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+static bool uprobe_perf_filter(struct uprobe_consumer *uc, struct mm_struct *mm)
{
struct trace_uprobe_filter *filter;
struct trace_uprobe *tu;
@@ -1448,7 +1444,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs,
struct uprobe_cpu_buffer **ucbp)
{
- if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
+ if (!uprobe_perf_filter(&tu->consumer, current->mm))
return UPROBE_HANDLER_REMOVE;
if (!is_ret_probe(tu))