From 7d776a36277ff2685ffc3dc7eff32002d0333ac9 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:47 +0000 Subject: ring-buffer: Add page statistics to the meta-page Add two fields pages_touched and pages_lost to the ring-buffer meta-page. Those fields are useful to get the number of used pages in the ring-buffer. Link: https://patch.msgid.link/20260309162516.2623589-2-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/uapi/linux/trace_mmap.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h index c102ef35d11e..e8185889a1c8 100644 --- a/include/uapi/linux/trace_mmap.h +++ b/include/uapi/linux/trace_mmap.h @@ -17,8 +17,8 @@ * @entries: Number of entries in the ring-buffer. * @overrun: Number of entries lost in the ring-buffer. * @read: Number of entries that have been read. - * @Reserved1: Internal use only. - * @Reserved2: Internal use only. + * @pages_lost: Number of pages overwritten by the writer. + * @pages_touched: Number of pages written by the writer. */ struct trace_buffer_meta { __u32 meta_page_size; @@ -39,8 +39,8 @@ struct trace_buffer_meta { __u64 overrun; __u64 read; - __u64 Reserved1; - __u64 Reserved2; + __u64 pages_lost; + __u64 pages_touched; }; #define TRACE_MMAP_IOCTL_GET_READER _IO('R', 0x20) -- cgit v1.2.3 From 2e67fabd8b77c4f482df9b211bca1b495c6c2c24 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:49 +0000 Subject: ring-buffer: Introduce ring-buffer remotes Add ring-buffer remotes to support entities outside of the kernel (such as firmware or a hypervisor) that writes events into a ring-buffer using the tracefs format Require a description of the ring-buffer pages (struct trace_buffer_desc) and callbacks (swap_reader_page and reset) to set up the ring-buffer on the kernel side. 
Expect the remote entity to maintain and update the meta-page. Link: https://patch.msgid.link/20260309162516.2623589-4-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 58 +++++++++++ kernel/trace/ring_buffer.c | 233 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 283 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index d862fa610270..994f52b34344 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -251,4 +251,62 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu); int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); + +struct ring_buffer_desc { + int cpu; + unsigned int nr_page_va; /* excludes the meta page */ + unsigned long meta_va; + unsigned long page_va[] __counted_by(nr_page_va); +}; + +struct trace_buffer_desc { + int nr_cpus; + size_t struct_len; + char __data[]; /* list of ring_buffer_desc */ +}; + +static inline struct ring_buffer_desc *__next_ring_buffer_desc(struct ring_buffer_desc *desc) +{ + size_t len = struct_size(desc, page_va, desc->nr_page_va); + + return (struct ring_buffer_desc *)((void *)desc + len); +} + +static inline struct ring_buffer_desc *__first_ring_buffer_desc(struct trace_buffer_desc *desc) +{ + return (struct ring_buffer_desc *)(&desc->__data[0]); +} + +static inline size_t trace_buffer_desc_size(size_t buffer_size, unsigned int nr_cpus) +{ + unsigned int nr_pages = max(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 2UL) + 1; + struct ring_buffer_desc *rbdesc; + + return size_add(offsetof(struct trace_buffer_desc, __data), + size_mul(nr_cpus, struct_size(rbdesc, page_va, nr_pages))); +} + +#define for_each_ring_buffer_desc(__pdesc, __cpu, __trace_pdesc) \ + for 
(__pdesc = __first_ring_buffer_desc(__trace_pdesc), __cpu = 0; \ + (__cpu) < (__trace_pdesc)->nr_cpus; \ + (__cpu)++, __pdesc = __next_ring_buffer_desc(__pdesc)) + +struct ring_buffer_remote { + struct trace_buffer_desc *desc; + int (*swap_reader_page)(unsigned int cpu, void *priv); + int (*reset)(unsigned int cpu, void *priv); + void *priv; +}; + +int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu); + +struct trace_buffer * +__ring_buffer_alloc_remote(struct ring_buffer_remote *remote, + struct lock_class_key *key); + +#define ring_buffer_alloc_remote(remote) \ +({ \ + static struct lock_class_key __key; \ + __ring_buffer_alloc_remote(remote, &__key); \ +}) #endif /* _LINUX_RING_BUFFER_H */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3d2804a7e8ab..88218377fa29 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -559,6 +559,8 @@ struct ring_buffer_per_cpu { struct trace_buffer_meta *meta_page; struct ring_buffer_cpu_meta *ring_meta; + struct ring_buffer_remote *remote; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ @@ -581,6 +583,8 @@ struct trace_buffer { struct ring_buffer_per_cpu **buffers; + struct ring_buffer_remote *remote; + struct hlist_node node; u64 (*clock)(void); @@ -2238,6 +2242,40 @@ static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, } } +static struct ring_buffer_desc *ring_buffer_desc(struct trace_buffer_desc *trace_desc, int cpu) +{ + struct ring_buffer_desc *desc, *end; + size_t len; + int i; + + if (!trace_desc) + return NULL; + + if (cpu >= trace_desc->nr_cpus) + return NULL; + + end = (struct ring_buffer_desc *)((void *)trace_desc + trace_desc->struct_len); + desc = __first_ring_buffer_desc(trace_desc); + len = struct_size(desc, page_va, desc->nr_page_va); + desc = (struct ring_buffer_desc *)((void *)desc + (len * cpu)); + + if (desc < end && desc->cpu == cpu) + 
return desc; + + /* Missing CPUs, need to linear search */ + for_each_ring_buffer_desc(desc, i, trace_desc) { + if (desc->cpu == cpu) + return desc; + } + + return NULL; +} + +static void *ring_buffer_desc_page(struct ring_buffer_desc *desc, int page_id) +{ + return page_id > desc->nr_page_va ? NULL : (void *)desc->page_va[page_id]; +} + static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { @@ -2245,6 +2283,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_cpu_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; + struct ring_buffer_desc *desc = NULL; long i; /* @@ -2273,6 +2312,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, if (buffer->range_addr_start) meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu); + if (buffer->remote) { + desc = ring_buffer_desc(buffer->remote->desc, cpu_buffer->cpu); + if (!desc || WARN_ON(desc->nr_page_va != (nr_pages + 1))) + return -EINVAL; + } + for (i = 0; i < nr_pages; i++) { bpage = alloc_cpu_page(cpu_buffer->cpu); @@ -2297,6 +2342,16 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; bpage->id = i + 1; + } else if (desc) { + void *p = ring_buffer_desc_page(desc, i + 1); + + if (WARN_ON(!p)) + goto free_pages; + + bpage->page = p; + bpage->range = 1; /* bpage->page can't be freed */ + bpage->id = i + 1; + cpu_buffer->subbuf_ids[i + 1] = bpage; } else { int order = cpu_buffer->buffer->subbuf_order; bpage->page = alloc_cpu_data(cpu_buffer->cpu, order); @@ -2394,6 +2449,30 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) if (cpu_buffer->ring_meta->head_buffer) rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; + } else if (buffer->remote) { + struct ring_buffer_desc *desc = ring_buffer_desc(buffer->remote->desc, cpu); + + if (!desc) + goto 
fail_free_reader; + + cpu_buffer->remote = buffer->remote; + cpu_buffer->meta_page = (struct trace_buffer_meta *)(void *)desc->meta_va; + cpu_buffer->nr_pages = nr_pages; + cpu_buffer->subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, + sizeof(*cpu_buffer->subbuf_ids), GFP_KERNEL); + if (!cpu_buffer->subbuf_ids) + goto fail_free_reader; + + /* Remote buffers are read-only and immutable */ + atomic_inc(&cpu_buffer->record_disabled); + atomic_inc(&cpu_buffer->resize_disabled); + + bpage->page = ring_buffer_desc_page(desc, cpu_buffer->meta_page->reader.id); + if (!bpage->page) + goto fail_free_reader; + + bpage->range = 1; + cpu_buffer->subbuf_ids[0] = bpage; } else { int order = cpu_buffer->buffer->subbuf_order; bpage->page = alloc_cpu_data(cpu, order); @@ -2453,6 +2532,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) irq_work_sync(&cpu_buffer->irq_work.work); + if (cpu_buffer->remote) + kfree(cpu_buffer->subbuf_ids); + free_buffer_page(cpu_buffer->reader_page); if (head) { @@ -2475,7 +2557,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, unsigned long scratch_size, - struct lock_class_key *key) + struct lock_class_key *key, + struct ring_buffer_remote *remote) { struct trace_buffer *buffer __free(kfree) = NULL; long nr_pages; @@ -2515,6 +2598,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, if (!buffer->buffers) goto fail_free_cpumask; + cpu = raw_smp_processor_id(); + /* If start/end are specified, then that overrides size */ if (start && end) { unsigned long buffers_start; @@ -2570,6 +2655,15 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, buffer->range_addr_end = end; rb_range_meta_init(buffer, nr_pages, scratch_size); + } else if (remote) { + struct ring_buffer_desc *desc = ring_buffer_desc(remote->desc, cpu); + + buffer->remote = remote; + /* The writer is remote. 
This ring-buffer is read-only */ + atomic_inc(&buffer->record_disabled); + nr_pages = desc->nr_page_va - 1; + if (nr_pages < 2) + goto fail_free_buffers; } else { /* need at least two pages */ @@ -2578,7 +2672,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, nr_pages = 2; } - cpu = raw_smp_processor_id(); cpumask_set_cpu(cpu, buffer->cpumask); buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) @@ -2620,7 +2713,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { /* Default buffer page size - one system page */ - return alloc_buffer(size, flags, 0, 0, 0, 0, key); + return alloc_buffer(size, flags, 0, 0, 0, 0, key, NULL); } EXPORT_SYMBOL_GPL(__ring_buffer_alloc); @@ -2647,7 +2740,18 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag struct lock_class_key *key) { return alloc_buffer(size, flags, order, start, start + range_size, - scratch_size, key); + scratch_size, key, NULL); +} + +/** + * __ring_buffer_alloc_remote - allocate a new ring_buffer from a remote + * @remote: Contains a description of the ring-buffer pages and remote callbacks. + * @key: ring buffer reader_lock_key. 
+ */ +struct trace_buffer *__ring_buffer_alloc_remote(struct ring_buffer_remote *remote, + struct lock_class_key *key) +{ + return alloc_buffer(0, 0, 0, 0, 0, 0, key, remote); } void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size) @@ -5274,6 +5378,16 @@ unsigned long ring_buffer_overruns(struct trace_buffer *buffer) } EXPORT_SYMBOL_GPL(ring_buffer_overruns); +static bool rb_read_remote_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_set(&cpu_buffer->entries, READ_ONCE(cpu_buffer->meta_page->entries)); + local_set(&cpu_buffer->overrun, READ_ONCE(cpu_buffer->meta_page->overrun)); + local_set(&cpu_buffer->pages_touched, READ_ONCE(cpu_buffer->meta_page->pages_touched)); + local_set(&cpu_buffer->pages_lost, READ_ONCE(cpu_buffer->meta_page->pages_lost)); + + return rb_num_of_entries(cpu_buffer); +} + static void rb_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; @@ -5428,7 +5542,43 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, } static struct buffer_page * -rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +__rb_get_reader_page_from_remote(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *new_reader, *prev_reader; + + if (!rb_read_remote_meta_page(cpu_buffer)) + return NULL; + + /* More to read on the reader page */ + if (cpu_buffer->reader_page->read < rb_page_size(cpu_buffer->reader_page)) { + if (!cpu_buffer->reader_page->read) + cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; + return cpu_buffer->reader_page; + } + + prev_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id]; + + WARN_ON_ONCE(cpu_buffer->remote->swap_reader_page(cpu_buffer->cpu, + cpu_buffer->remote->priv)); + /* nr_pages doesn't include the reader page */ + if (WARN_ON_ONCE(cpu_buffer->meta_page->reader.id > cpu_buffer->nr_pages)) + return NULL; + + new_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id]; + + 
WARN_ON_ONCE(prev_reader == new_reader); + + cpu_buffer->reader_page->page = new_reader->page; + cpu_buffer->reader_page->id = new_reader->id; + cpu_buffer->reader_page->read = 0; + cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; + cpu_buffer->lost_events = cpu_buffer->meta_page->reader.lost_events; + + return rb_page_size(cpu_buffer->reader_page) ? cpu_buffer->reader_page : NULL; +} + +static struct buffer_page * +__rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); @@ -5598,6 +5748,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) return reader; } +static struct buffer_page * +rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + return cpu_buffer->remote ? __rb_get_reader_page_from_remote(cpu_buffer) : + __rb_get_reader_page(cpu_buffer); +} + static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_event *event; @@ -5998,7 +6155,7 @@ ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags) struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; - if (!cpumask_test_cpu(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->remote) return NULL; iter = kzalloc_obj(*iter, flags); @@ -6166,6 +6323,23 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *page; + if (cpu_buffer->remote) { + if (!cpu_buffer->remote->reset) + return; + + cpu_buffer->remote->reset(cpu_buffer->cpu, cpu_buffer->remote->priv); + rb_read_remote_meta_page(cpu_buffer); + + /* Read related values, not covered by the meta-page */ + local_set(&cpu_buffer->pages_read, 0); + cpu_buffer->read = 0; + cpu_buffer->read_bytes = 0; + cpu_buffer->last_overrun = 0; + cpu_buffer->reader_page->read = 0; + + return; + } + rb_head_page_deactivate(cpu_buffer); cpu_buffer->head_page @@ -6396,6 +6570,46 @@ bool ring_buffer_empty_cpu(struct 
trace_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); +int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (cpu != RING_BUFFER_ALL_CPUS) { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + guard(raw_spinlock)(&cpu_buffer->reader_lock); + if (rb_read_remote_meta_page(cpu_buffer)) + rb_wakeups(buffer, cpu_buffer); + + return 0; + } + + guard(cpus_read_lock)(); + + /* + * Make sure all the ring buffers are up to date before we start reading + * them. + */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + guard(raw_spinlock)(&cpu_buffer->reader_lock); + rb_read_remote_meta_page(cpu_buffer); + } + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + if (rb_num_of_entries(cpu_buffer)) + rb_wakeups(buffer, cpu_buffer); + } + + return 0; +} + #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /** * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers @@ -6634,6 +6848,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, unsigned int commit; unsigned int read; u64 save_timestamp; + bool force_memcpy; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -1; @@ -6671,6 +6886,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; + force_memcpy = cpu_buffer->mapped || cpu_buffer->remote; + /* * If this page has been partially read or * if len is not big enough to read the rest of the page or @@ -6680,7 +6897,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, */ if (read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page || - cpu_buffer->mapped) { + force_memcpy) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; @@ -7259,7 +7476,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, 
unsigned long flags; int err; - if (!cpumask_test_cpu(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->remote) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; -- cgit v1.2.3 From 96e43537af5461b26f50904c6055046ba65d742f Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:51 +0000 Subject: tracing: Introduce trace remotes A trace remote relies on ring-buffer remotes to read and control compatible tracing buffers, written by entity such as firmware or hypervisor. Add a Tracefs directory remotes/ that contains all instances of trace remotes. Each instance follows the same hierarchy as any other to ease the support by existing user-space tools. This currently does not provide any event support, which will come later. Link: https://patch.msgid.link/20260309162516.2623589-6-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote.h | 36 +++ kernel/trace/Kconfig | 3 + kernel/trace/Makefile | 1 + kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 6 + kernel/trace/trace_remote.c | 619 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 666 insertions(+), 1 deletion(-) create mode 100644 include/linux/trace_remote.h create mode 100644 kernel/trace/trace_remote.c (limited to 'include') diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h new file mode 100644 index 000000000000..65b7e7b8267c --- /dev/null +++ b/include/linux/trace_remote.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_TRACE_REMOTE_H +#define _LINUX_TRACE_REMOTE_H + +#include + +/** + * struct trace_remote_callbacks - Callbacks used by Tracefs to control the remote + * @load_trace_buffer: Called before Tracefs accesses the trace buffer for the first + * time. 
Must return a &trace_buffer_desc + * (most likely filled with trace_remote_alloc_buffer()) + * @unload_trace_buffer: + * Called once Tracefs has no use for the trace buffer + * (most likely call trace_remote_free_buffer()) + * @enable_tracing: Called on Tracefs tracing_on. It is expected from the + * remote to allow writing. + * @swap_reader_page: Called when Tracefs consumes a new page from a + * ring-buffer. It is expected from the remote to isolate a + * new reader-page from the @cpu ring-buffer. + */ +struct trace_remote_callbacks { + struct trace_buffer_desc *(*load_trace_buffer)(unsigned long size, void *priv); + void (*unload_trace_buffer)(struct trace_buffer_desc *desc, void *priv); + int (*enable_tracing)(bool enable, void *priv); + int (*swap_reader_page)(unsigned int cpu, void *priv); +}; + +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv); + +int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size, + const struct cpumask *cpumask); + +void trace_remote_free_buffer(struct trace_buffer_desc *desc); + +#endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 49de13cae428..384dd36c8e29 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1281,4 +1281,7 @@ config HIST_TRIGGERS_DEBUG source "kernel/trace/rv/Kconfig" +config TRACE_REMOTE + bool + endif # FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 04096c21d06b..318923ce39f5 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -128,4 +128,5 @@ obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o obj-$(CONFIG_RV) += rv/ +obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ebd996f8710e..e33cb3c39f6e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8589,7 +8589,7 @@ static struct dentry *tracing_dentry_percpu(struct 
trace_array *tr, int cpu) return tr->percpu_dir; } -static struct dentry * +struct dentry * trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, void *data, long cpu, const struct file_operations *fops) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b8f3804586a0..d6af4405b341 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -689,6 +689,12 @@ struct dentry *trace_create_file(const char *name, struct dentry *parent, void *data, const struct file_operations *fops); +struct dentry *trace_create_cpu_file(const char *name, + umode_t mode, + struct dentry *parent, + void *data, + long cpu, + const struct file_operations *fops); /** diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c new file mode 100644 index 000000000000..8b06f730376e --- /dev/null +++ b/kernel/trace/trace_remote.c @@ -0,0 +1,619 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 - Google LLC + * Author: Vincent Donnefort + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +#define TRACEFS_DIR "remotes" +#define TRACEFS_MODE_WRITE 0640 +#define TRACEFS_MODE_READ 0440 + +struct trace_remote_iterator { + struct trace_remote *remote; + struct trace_seq seq; + struct delayed_work poll_work; + unsigned long lost_events; + u64 ts; + int cpu; + int evt_cpu; +}; + +struct trace_remote { + struct trace_remote_callbacks *cbs; + void *priv; + struct trace_buffer *trace_buffer; + struct trace_buffer_desc *trace_buffer_desc; + unsigned long trace_buffer_size; + struct ring_buffer_remote rb_remote; + struct mutex lock; + unsigned int nr_readers; + unsigned int poll_ms; + bool tracing_on; +}; + +static bool trace_remote_loaded(struct trace_remote *remote) +{ + return !!remote->trace_buffer; +} + +static int trace_remote_load(struct trace_remote *remote) +{ + struct ring_buffer_remote *rb_remote = &remote->rb_remote; + struct trace_buffer_desc *desc; + + 
lockdep_assert_held(&remote->lock); + + if (trace_remote_loaded(remote)) + return 0; + + desc = remote->cbs->load_trace_buffer(remote->trace_buffer_size, remote->priv); + if (IS_ERR(desc)) + return PTR_ERR(desc); + + rb_remote->desc = desc; + rb_remote->swap_reader_page = remote->cbs->swap_reader_page; + rb_remote->priv = remote->priv; + remote->trace_buffer = ring_buffer_alloc_remote(rb_remote); + if (!remote->trace_buffer) { + remote->cbs->unload_trace_buffer(desc, remote->priv); + return -ENOMEM; + } + + remote->trace_buffer_desc = desc; + + return 0; +} + +static void trace_remote_try_unload(struct trace_remote *remote) +{ + lockdep_assert_held(&remote->lock); + + if (!trace_remote_loaded(remote)) + return; + + /* The buffer is being read or writable */ + if (remote->nr_readers || remote->tracing_on) + return; + + /* The buffer has readable data */ + if (!ring_buffer_empty(remote->trace_buffer)) + return; + + ring_buffer_free(remote->trace_buffer); + remote->trace_buffer = NULL; + remote->cbs->unload_trace_buffer(remote->trace_buffer_desc, remote->priv); +} + +static int trace_remote_enable_tracing(struct trace_remote *remote) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (remote->tracing_on) + return 0; + + ret = trace_remote_load(remote); + if (ret) + return ret; + + ret = remote->cbs->enable_tracing(true, remote->priv); + if (ret) { + trace_remote_try_unload(remote); + return ret; + } + + remote->tracing_on = true; + + return 0; +} + +static int trace_remote_disable_tracing(struct trace_remote *remote) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (!remote->tracing_on) + return 0; + + ret = remote->cbs->enable_tracing(false, remote->priv); + if (ret) + return ret; + + ring_buffer_poll_remote(remote->trace_buffer, RING_BUFFER_ALL_CPUS); + remote->tracing_on = false; + trace_remote_try_unload(remote); + + return 0; +} + +static ssize_t +tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + 
struct trace_remote *remote = filp->private_data; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + guard(mutex)(&remote->lock); + + ret = val ? trace_remote_enable_tracing(remote) : trace_remote_disable_tracing(remote); + if (ret) + return ret; + + return cnt; +} +static int tracing_on_show(struct seq_file *s, void *unused) +{ + struct trace_remote *remote = s->private; + + seq_printf(s, "%d\n", remote->tracing_on); + + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(tracing_on); + +static ssize_t buffer_size_kb_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_remote *remote = filp->private_data; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* KiB to Bytes */ + if (!val || check_shl_overflow(val, 10, &val)) + return -EINVAL; + + guard(mutex)(&remote->lock); + + if (trace_remote_loaded(remote)) + return -EBUSY; + + remote->trace_buffer_size = val; + + return cnt; +} + +static int buffer_size_kb_show(struct seq_file *s, void *unused) +{ + struct trace_remote *remote = s->private; + + seq_printf(s, "%lu (%s)\n", remote->trace_buffer_size >> 10, + trace_remote_loaded(remote) ? 
"loaded" : "unloaded"); + + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(buffer_size_kb); + +static int trace_remote_get(struct trace_remote *remote, int cpu) +{ + int ret; + + if (remote->nr_readers == UINT_MAX) + return -EBUSY; + + ret = trace_remote_load(remote); + if (ret) + return ret; + + remote->nr_readers++; + + return 0; +} + +static void trace_remote_put(struct trace_remote *remote) +{ + if (WARN_ON(!remote->nr_readers)) + return; + + remote->nr_readers--; + if (remote->nr_readers) + return; + + trace_remote_try_unload(remote); +} + +static void __poll_remote(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct trace_remote_iterator *iter; + + iter = container_of(dwork, struct trace_remote_iterator, poll_work); + ring_buffer_poll_remote(iter->remote->trace_buffer, iter->cpu); + schedule_delayed_work((struct delayed_work *)work, + msecs_to_jiffies(iter->remote->poll_ms)); +} + +static struct trace_remote_iterator *trace_remote_iter(struct trace_remote *remote, int cpu) +{ + struct trace_remote_iterator *iter = NULL; + int ret; + + lockdep_assert_held(&remote->lock); + + + ret = trace_remote_get(remote, cpu); + if (ret) + return ERR_PTR(ret); + + /* Test the CPU */ + ret = ring_buffer_poll_remote(remote->trace_buffer, cpu); + if (ret) + goto err; + + iter = kzalloc_obj(*iter); + if (iter) { + iter->remote = remote; + iter->cpu = cpu; + trace_seq_init(&iter->seq); + INIT_DELAYED_WORK(&iter->poll_work, __poll_remote); + schedule_delayed_work(&iter->poll_work, msecs_to_jiffies(remote->poll_ms)); + + return iter; + } + ret = -ENOMEM; + +err: + kfree(iter); + trace_remote_put(remote); + + return ERR_PTR(ret); +} + +static void trace_remote_iter_free(struct trace_remote_iterator *iter) +{ + struct trace_remote *remote; + + if (!iter) + return; + + remote = iter->remote; + + lockdep_assert_held(&remote->lock); + + kfree(iter); + trace_remote_put(remote); +} + +static bool trace_remote_iter_read_event(struct trace_remote_iterator 
*iter) +{ + struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + int cpu = iter->cpu; + + if (cpu != RING_BUFFER_ALL_CPUS) { + if (ring_buffer_empty_cpu(trace_buffer, cpu)) + return false; + + if (!ring_buffer_peek(trace_buffer, cpu, &iter->ts, &iter->lost_events)) + return false; + + iter->evt_cpu = cpu; + return true; + } + + iter->ts = U64_MAX; + for_each_possible_cpu(cpu) { + unsigned long lost_events; + u64 ts; + + if (ring_buffer_empty_cpu(trace_buffer, cpu)) + continue; + + if (!ring_buffer_peek(trace_buffer, cpu, &ts, &lost_events)) + continue; + + if (ts >= iter->ts) + continue; + + iter->ts = ts; + iter->evt_cpu = cpu; + iter->lost_events = lost_events; + } + + return iter->ts != U64_MAX; +} + +static int trace_remote_iter_print_event(struct trace_remote_iterator *iter) +{ + unsigned long usecs_rem; + u64 ts = iter->ts; + + if (iter->lost_events) + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->evt_cpu, iter->lost_events); + + do_div(ts, 1000); + usecs_rem = do_div(ts, USEC_PER_SEC); + + trace_seq_printf(&iter->seq, "[%03d]\t%5llu.%06lu: ", iter->evt_cpu, + ts, usecs_rem); + + return trace_seq_has_overflowed(&iter->seq) ? -EOVERFLOW : 0; +} + +static int trace_pipe_open(struct inode *inode, struct file *filp) +{ + struct trace_remote *remote = inode->i_private; + struct trace_remote_iterator *iter; + int cpu = RING_BUFFER_ALL_CPUS; + + if (inode->i_cdev) + cpu = (long)inode->i_cdev - 1; + + guard(mutex)(&remote->lock); + iter = trace_remote_iter(remote, cpu); + filp->private_data = iter; + + return IS_ERR(iter) ? 
PTR_ERR(iter) : 0; +} + +static int trace_pipe_release(struct inode *inode, struct file *filp) +{ + struct trace_remote_iterator *iter = filp->private_data; + struct trace_remote *remote = iter->remote; + + guard(mutex)(&remote->lock); + + trace_remote_iter_free(iter); + + return 0; +} + +static ssize_t trace_pipe_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_remote_iterator *iter = filp->private_data; + struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + int ret; + +copy_to_user: + ret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (ret != -EBUSY) + return ret; + + trace_seq_init(&iter->seq); + + ret = ring_buffer_wait(trace_buffer, iter->cpu, 0, NULL, NULL); + if (ret < 0) + return ret; + + while (trace_remote_iter_read_event(iter)) { + int prev_len = iter->seq.seq.len; + + if (trace_remote_iter_print_event(iter)) { + iter->seq.seq.len = prev_len; + break; + } + + ring_buffer_consume(trace_buffer, iter->evt_cpu, NULL, NULL); + } + + goto copy_to_user; +} + +static const struct file_operations trace_pipe_fops = { + .open = trace_pipe_open, + .read = trace_pipe_read, + .release = trace_pipe_release, +}; + +static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote) +{ + struct dentry *remote_d, *percpu_d, *d; + static struct dentry *root; + static DEFINE_MUTEX(lock); + bool root_inited = false; + int cpu; + + guard(mutex)(&lock); + + if (!root) { + root = tracefs_create_dir(TRACEFS_DIR, NULL); + if (!root) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"\n"); + return -ENOMEM; + } + root_inited = true; + } + + remote_d = tracefs_create_dir(name, root); + if (!remote_d) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/\n", name); + goto err; + } + + d = trace_create_file("tracing_on", TRACEFS_MODE_WRITE, remote_d, remote, &tracing_on_fops); + if (!d) + goto err; + + d = trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote, + &buffer_size_kb_fops); + if 
(!d) + goto err; + + d = trace_create_file("trace_pipe", TRACEFS_MODE_READ, remote_d, remote, &trace_pipe_fops); + if (!d) + goto err; + + percpu_d = tracefs_create_dir("per_cpu", remote_d); + if (!percpu_d) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/per_cpu/\n", name); + goto err; + } + + for_each_possible_cpu(cpu) { + struct dentry *cpu_d; + char cpu_name[16]; + + snprintf(cpu_name, sizeof(cpu_name), "cpu%d", cpu); + cpu_d = tracefs_create_dir(cpu_name, percpu_d); + if (!cpu_d) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/percpu/cpu%d\n", + name, cpu); + goto err; + } + + d = trace_create_cpu_file("trace_pipe", TRACEFS_MODE_READ, cpu_d, remote, cpu, + &trace_pipe_fops); + if (!d) + goto err; + } + + return 0; + +err: + if (root_inited) { + tracefs_remove(root); + root = NULL; + } else { + tracefs_remove(remote_d); + } + + return -ENOMEM; +} + +/** + * trace_remote_register() - Register a Tracefs remote + * @name: Name of the remote, used for the Tracefs remotes/ directory. + * @cbs: Set of callbacks used to control the remote. + * @priv: Private data, passed to each callback from @cbs. + * @events: Array of events. &remote_event.name and &remote_event.id must be + * filled by the caller. + * @nr_events: Number of events in the @events array. + * + * A trace remote is an entity, outside of the kernel (most likely firmware or + * hypervisor) capable of writing events into a Tracefs compatible ring-buffer. + * The kernel would then act as a reader. + * + * The registered remote will be found under the Tracefs directory + * remotes/. + * + * Return: 0 on success, negative error code on failure. 
+ */ +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv) +{ + struct trace_remote *remote; + + remote = kzalloc_obj(*remote); + if (!remote) + return -ENOMEM; + + remote->cbs = cbs; + remote->priv = priv; + remote->trace_buffer_size = 7 << 10; + remote->poll_ms = 100; + mutex_init(&remote->lock); + + if (trace_remote_init_tracefs(name, remote)) { + kfree(remote); + return -ENOMEM; + } + + return 0; +} +EXPORT_SYMBOL_GPL(trace_remote_register); + +/** + * trace_remote_free_buffer() - Free trace buffer allocated with trace_remote_alloc_buffer() + * @desc: Descriptor of the per-CPU ring-buffers, originally filled by + * trace_remote_alloc_buffer() + * + * Most likely called from &trace_remote_callbacks.unload_trace_buffer. + */ +void trace_remote_free_buffer(struct trace_buffer_desc *desc) +{ + struct ring_buffer_desc *rb_desc; + int cpu; + + for_each_ring_buffer_desc(rb_desc, cpu, desc) { + unsigned int id; + + free_page(rb_desc->meta_va); + + for (id = 0; id < rb_desc->nr_page_va; id++) + free_page(rb_desc->page_va[id]); + } +} +EXPORT_SYMBOL_GPL(trace_remote_free_buffer); + +/** + * trace_remote_alloc_buffer() - Dynamically allocate a trace buffer + * @desc: Uninitialized trace_buffer_desc + * @desc_size: Size of the trace_buffer_desc. Must be at least equal to + * trace_buffer_desc_size() + * @buffer_size: Size in bytes of each per-CPU ring-buffer + * @cpumask: CPUs to allocate a ring-buffer for + * + * Helper to dynamically allocate a set of pages (enough to cover @buffer_size) + * for each CPU from @cpumask and fill @desc. Most likely called from + * &trace_remote_callbacks.load_trace_buffer. + * + * Return: 0 on success, negative error code on failure. 
+ */ +int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size, + const struct cpumask *cpumask) +{ + unsigned int nr_pages = max(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 2UL) + 1; + void *desc_end = desc + desc_size; + struct ring_buffer_desc *rb_desc; + int cpu, ret = -ENOMEM; + + if (desc_size < struct_size(desc, __data, 0)) + return -EINVAL; + + desc->nr_cpus = 0; + desc->struct_len = struct_size(desc, __data, 0); + + rb_desc = (struct ring_buffer_desc *)&desc->__data[0]; + + for_each_cpu(cpu, cpumask) { + unsigned int id; + + if ((void *)rb_desc + struct_size(rb_desc, page_va, nr_pages) > desc_end) { + ret = -EINVAL; + goto err; + } + + rb_desc->cpu = cpu; + rb_desc->nr_page_va = 0; + rb_desc->meta_va = (unsigned long)__get_free_page(GFP_KERNEL); + if (!rb_desc->meta_va) + goto err; + + for (id = 0; id < nr_pages; id++) { + rb_desc->page_va[id] = (unsigned long)__get_free_page(GFP_KERNEL); + if (!rb_desc->page_va[id]) + goto err; + + rb_desc->nr_page_va++; + } + desc->nr_cpus++; + desc->struct_len += offsetof(struct ring_buffer_desc, page_va); + desc->struct_len += struct_size(rb_desc, page_va, rb_desc->nr_page_va); + rb_desc = __next_ring_buffer_desc(rb_desc); + } + + return 0; + +err: + trace_remote_free_buffer(desc); + return ret; +} +EXPORT_SYMBOL_GPL(trace_remote_alloc_buffer); -- cgit v1.2.3 From 9af4ab0e11e336e2671d303ffcc6578e3546d9fc Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:52 +0000 Subject: tracing: Add reset to trace remotes Allow to reset the trace remote buffer by writing to the Tracefs "trace" file. This is similar to the regular Tracefs interface. 
Link: https://patch.msgid.link/20260309162516.2623589-7-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote.h | 3 +++ kernel/trace/trace_remote.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'include') diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h index 65b7e7b8267c..10ca03dc192b 100644 --- a/include/linux/trace_remote.h +++ b/include/linux/trace_remote.h @@ -17,6 +17,8 @@ * remote to allow writing. * @swap_reader_page: Called when Tracefs consumes a new page from a * ring-buffer. It is expected from the remote to isolate a + * @reset: Called on `echo 0 > trace`. It is expected from the + * remote to reset all ring-buffer pages. * new reader-page from the @cpu ring-buffer. */ struct trace_remote_callbacks { @@ -24,6 +26,7 @@ struct trace_remote_callbacks { void (*unload_trace_buffer)(struct trace_buffer_desc *desc, void *priv); int (*enable_tracing)(bool enable, void *priv); int (*swap_reader_page)(unsigned int cpu, void *priv); + int (*reset)(unsigned int cpu, void *priv); }; int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv); diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c index 8b06f730376e..a7b94736dd38 100644 --- a/kernel/trace/trace_remote.c +++ b/kernel/trace/trace_remote.c @@ -63,6 +63,7 @@ static int trace_remote_load(struct trace_remote *remote) rb_remote->desc = desc; rb_remote->swap_reader_page = remote->cbs->swap_reader_page; rb_remote->priv = remote->priv; + rb_remote->reset = remote->cbs->reset; remote->trace_buffer = ring_buffer_alloc_remote(rb_remote); if (!remote->trace_buffer) { remote->cbs->unload_trace_buffer(desc, remote->priv); @@ -138,6 +139,21 @@ static int trace_remote_disable_tracing(struct trace_remote *remote) return 0; } +static void trace_remote_reset(struct trace_remote *remote, int 
cpu) +{ + lockdep_assert_held(&remote->lock); + + if (!trace_remote_loaded(remote)) + return; + + if (cpu == RING_BUFFER_ALL_CPUS) + ring_buffer_reset(remote->trace_buffer); + else + ring_buffer_reset_cpu(remote->trace_buffer, cpu); + + trace_remote_try_unload(remote); +} + static ssize_t tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -414,6 +430,26 @@ static const struct file_operations trace_pipe_fops = { .release = trace_pipe_release, }; +static ssize_t trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_remote *remote = inode->i_private; + int cpu = RING_BUFFER_ALL_CPUS; + + if (inode->i_cdev) + cpu = (long)inode->i_cdev - 1; + + guard(mutex)(&remote->lock); + + trace_remote_reset(remote, cpu); + + return cnt; +} + +static const struct file_operations trace_fops = { + .write = trace_write, +}; + static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote) { struct dentry *remote_d, *percpu_d, *d; @@ -452,6 +488,10 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo if (!d) goto err; + d = trace_create_file("trace", TRACEFS_MODE_WRITE, remote_d, remote, &trace_fops); + if (!d) + goto err; + percpu_d = tracefs_create_dir("per_cpu", remote_d); if (!percpu_d) { pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/per_cpu/\n", name); @@ -474,6 +514,11 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo &trace_pipe_fops); if (!d) goto err; + + d = trace_create_cpu_file("trace", TRACEFS_MODE_WRITE, cpu_d, remote, cpu, + &trace_fops); + if (!d) + goto err; } return 0; -- cgit v1.2.3 From bf2ba0f8ca1af14aaaa765cbb93caf564d383aad Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:54 +0000 Subject: tracing: Add init callback to trace remotes Add a .init call back so the trace remote callers can add entries to the tracefs 
directory. Link: https://patch.msgid.link/20260309162516.2623589-9-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote.h | 4 ++++ kernel/trace/trace_remote.c | 7 ++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h index 10ca03dc192b..090c58b7d92b 100644 --- a/include/linux/trace_remote.h +++ b/include/linux/trace_remote.h @@ -3,10 +3,13 @@ #ifndef _LINUX_TRACE_REMOTE_H #define _LINUX_TRACE_REMOTE_H +#include #include /** * struct trace_remote_callbacks - Callbacks used by Tracefs to control the remote + * @init: Called once the remote has been registered. Allows the + * caller to extend the Tracefs remote directory * @load_trace_buffer: Called before Tracefs accesses the trace buffer for the first * time. Must return a &trace_buffer_desc * (most likely filled with trace_remote_alloc_buffer()) @@ -22,6 +25,7 @@ * new reader-page from the @cpu ring-buffer. */ struct trace_remote_callbacks { + int (*init)(struct dentry *d, void *priv); struct trace_buffer_desc *(*load_trace_buffer)(unsigned long size, void *priv); void (*unload_trace_buffer)(struct trace_buffer_desc *desc, void *priv); int (*enable_tracing)(bool enable, void *priv); diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c index 039ba71c3b3e..294d051dcef1 100644 --- a/kernel/trace/trace_remote.c +++ b/kernel/trace/trace_remote.c @@ -863,6 +863,7 @@ err: int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv) { struct trace_remote *remote; + int ret; remote = kzalloc_obj(*remote); if (!remote) @@ -880,7 +881,11 @@ int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, return -ENOMEM; } - return 0; + ret = cbs->init ? 
cbs->init(remote->dentry, priv) : 0; + if (ret) + pr_err("Init failed for trace remote '%s' (%d)\n", name, ret); + + return ret; } EXPORT_SYMBOL_GPL(trace_remote_register); -- cgit v1.2.3 From 072529158e604cc964feb78dcf094c6975828146 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:55 +0000 Subject: tracing: Add events to trace remotes An event is predefined point in the writer code that allows to log data. Following the same scheme as kernel events, add remote events, described to user-space within the events/ tracefs directory found in the corresponding trace remote. Remote events are expected to be described during the trace remote registration. Add also a .enable_event callback for trace_remote to toggle the event logging, if supported. Link: https://patch.msgid.link/20260309162516.2623589-10-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote.h | 7 +- include/linux/trace_remote_event.h | 23 ++++ kernel/trace/trace_remote.c | 264 ++++++++++++++++++++++++++++++++++++- 3 files changed, 288 insertions(+), 6 deletions(-) create mode 100644 include/linux/trace_remote_event.h (limited to 'include') diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h index 090c58b7d92b..fcd1d46ea466 100644 --- a/include/linux/trace_remote.h +++ b/include/linux/trace_remote.h @@ -5,6 +5,7 @@ #include #include +#include /** * struct trace_remote_callbacks - Callbacks used by Tracefs to control the remote @@ -23,6 +24,8 @@ * @reset: Called on `echo 0 > trace`. It is expected from the * remote to reset all ring-buffer pages. * new reader-page from the @cpu ring-buffer. + * @enable_event: Called on events/event_name/enable. It is expected from + * the remote to allow the writing event @id. 
*/ struct trace_remote_callbacks { int (*init)(struct dentry *d, void *priv); @@ -31,9 +34,11 @@ struct trace_remote_callbacks { int (*enable_tracing)(bool enable, void *priv); int (*swap_reader_page)(unsigned int cpu, void *priv); int (*reset)(unsigned int cpu, void *priv); + int (*enable_event)(unsigned short id, bool enable, void *priv); }; -int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv); +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv, + struct remote_event *events, size_t nr_events); int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size, const struct cpumask *cpumask); diff --git a/include/linux/trace_remote_event.h b/include/linux/trace_remote_event.h new file mode 100644 index 000000000000..a4449008a075 --- /dev/null +++ b/include/linux/trace_remote_event.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_TRACE_REMOTE_EVENTS_H +#define _LINUX_TRACE_REMOTE_EVENTS_H + +struct trace_remote; +struct trace_event_fields; + +struct remote_event_hdr { + unsigned short id; +}; + +#define REMOTE_EVENT_NAME_MAX 30 +struct remote_event { + char name[REMOTE_EVENT_NAME_MAX]; + unsigned short id; + bool enabled; + struct trace_remote *remote; + struct trace_event_fields *fields; + char *print_fmt; + void (*print)(void *evt, struct trace_seq *seq); +}; +#endif diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c index 294d051dcef1..0d0af53c0ce9 100644 --- a/kernel/trace/trace_remote.c +++ b/kernel/trace/trace_remote.c @@ -31,6 +31,7 @@ struct trace_remote_iterator { u64 ts; struct ring_buffer_iter *rb_iter; struct ring_buffer_iter **rb_iters; + struct remote_event_hdr *evt; int cpu; int evt_cpu; loff_t pos; @@ -42,6 +43,10 @@ struct trace_remote { void *priv; struct trace_buffer *trace_buffer; struct trace_buffer_desc *trace_buffer_desc; + struct dentry *dentry; + struct eventfs_inode *eventfs; + 
struct remote_event *events; + unsigned long nr_events; unsigned long trace_buffer_size; struct ring_buffer_remote rb_remote; struct mutex lock; @@ -168,7 +173,8 @@ static void trace_remote_reset(struct trace_remote *remote, int cpu) static ssize_t tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_remote *remote = filp->private_data; + struct seq_file *seq = filp->private_data; + struct trace_remote *remote = seq->private; unsigned long val; int ret; @@ -197,7 +203,8 @@ DEFINE_SHOW_STORE_ATTRIBUTE(tracing_on); static ssize_t buffer_size_kb_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_remote *remote = filp->private_data; + struct seq_file *seq = filp->private_data; + struct trace_remote *remote = seq->private; unsigned long val; int ret; @@ -484,16 +491,19 @@ __peek_event(struct trace_remote_iterator *iter, int cpu, u64 *ts, unsigned long static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter) { struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + struct ring_buffer_event *rb_evt; int cpu = iter->cpu; if (cpu != RING_BUFFER_ALL_CPUS) { if (ring_buffer_empty_cpu(trace_buffer, cpu)) return false; - if (!__peek_event(iter, cpu, &iter->ts, &iter->lost_events)) + rb_evt = __peek_event(iter, cpu, &iter->ts, &iter->lost_events); + if (!rb_evt) return false; iter->evt_cpu = cpu; + iter->evt = ring_buffer_event_data(rb_evt); return true; } @@ -505,7 +515,8 @@ static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter) if (ring_buffer_empty_cpu(trace_buffer, cpu)) continue; - if (!__peek_event(iter, cpu, &ts, &lost_events)) + rb_evt = __peek_event(iter, cpu, &ts, &lost_events); + if (!rb_evt) continue; if (ts >= iter->ts) @@ -513,6 +524,7 @@ static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter) iter->ts = ts; iter->evt_cpu = cpu; + iter->evt = ring_buffer_event_data(rb_evt); iter->lost_events = 
lost_events; } @@ -533,8 +545,11 @@ static void trace_remote_iter_move(struct trace_remote_iterator *iter) } } +static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id); + static int trace_remote_iter_print_event(struct trace_remote_iterator *iter) { + struct remote_event *evt; unsigned long usecs_rem; u64 ts = iter->ts; @@ -548,6 +563,12 @@ static int trace_remote_iter_print_event(struct trace_remote_iterator *iter) trace_seq_printf(&iter->seq, "[%03d]\t%5llu.%06lu: ", iter->evt_cpu, ts, usecs_rem); + evt = trace_remote_find_event(iter->remote, iter->evt->id); + if (!evt) + trace_seq_printf(&iter->seq, "UNKNOWN id=%d\n", iter->evt->id); + else + evt->print(iter->evt, &iter->seq); + return trace_seq_has_overflowed(&iter->seq) ? -EOVERFLOW : 0; } @@ -829,6 +850,8 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo goto err; } + remote->dentry = remote_d; + return 0; err: @@ -842,6 +865,9 @@ err: return -ENOMEM; } +static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote, + struct remote_event *events, size_t nr_events); + /** * trace_remote_register() - Register a Tracefs remote * @name: Name of the remote, used for the Tracefs remotes/ directory. @@ -860,7 +886,8 @@ err: * * Return: 0 on success, negative error code on failure. */ -int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv) +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv, + struct remote_event *events, size_t nr_events) { struct trace_remote *remote; int ret; @@ -881,6 +908,13 @@ int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, return -ENOMEM; } + ret = trace_remote_register_events(name, remote, events, nr_events); + if (ret) { + pr_err("Failed to register events for trace remote '%s' (%d)\n", + name, ret); + return ret; + } + ret = cbs->init ? 
cbs->init(remote->dentry, priv) : 0; if (ret) pr_err("Init failed for trace remote '%s' (%d)\n", name, ret); @@ -976,3 +1010,223 @@ err: return ret; } EXPORT_SYMBOL_GPL(trace_remote_alloc_buffer); + +static int +trace_remote_enable_event(struct trace_remote *remote, struct remote_event *evt, bool enable) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (evt->enabled == enable) + return 0; + + ret = remote->cbs->enable_event(evt->id, enable, remote->priv); + if (ret) + return ret; + + evt->enabled = enable; + + return 0; +} + +static int remote_event_enable_show(struct seq_file *s, void *unused) +{ + struct remote_event *evt = s->private; + + seq_printf(s, "%d\n", evt->enabled); + + return 0; +} + +static ssize_t remote_event_enable_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct seq_file *seq = filp->private_data; + struct remote_event *evt = seq->private; + struct trace_remote *remote = evt->remote; + u8 enable; + int ret; + + ret = kstrtou8_from_user(ubuf, count, 10, &enable); + if (ret) + return ret; + + guard(mutex)(&remote->lock); + + ret = trace_remote_enable_event(remote, evt, enable); + if (ret) + return ret; + + return count; +} +DEFINE_SHOW_STORE_ATTRIBUTE(remote_event_enable); + +static int remote_event_id_show(struct seq_file *s, void *unused) +{ + struct remote_event *evt = s->private; + + seq_printf(s, "%d\n", evt->id); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(remote_event_id); + +static int remote_event_format_show(struct seq_file *s, void *unused) +{ + size_t offset = sizeof(struct remote_event_hdr); + struct remote_event *evt = s->private; + struct trace_event_fields *field; + + seq_printf(s, "name: %s\n", evt->name); + seq_printf(s, "ID: %d\n", evt->id); + seq_puts(s, + "format:\n\tfield:unsigned short common_type;\toffset:0;\tsize:2;\tsigned:0;\n\n"); + + field = &evt->fields[0]; + while (field->name) { + seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%u;\tsigned:%d;\n", + field->type, 
field->name, offset, field->size, + field->is_signed); + offset += field->size; + field++; + } + + if (field != &evt->fields[0]) + seq_puts(s, "\n"); + + seq_printf(s, "print fmt: %s\n", evt->print_fmt); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(remote_event_format); + +static int remote_event_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + if (!strcmp(name, "enable")) { + *mode = TRACEFS_MODE_WRITE; + *fops = &remote_event_enable_fops; + return 1; + } + + if (!strcmp(name, "id")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_event_id_fops; + return 1; + } + + if (!strcmp(name, "format")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_event_format_fops; + return 1; + } + + return 0; +} + +static int trace_remote_init_eventfs(const char *remote_name, struct trace_remote *remote, + struct remote_event *evt) +{ + struct eventfs_inode *eventfs = remote->eventfs; + static struct eventfs_entry entries[] = { + { + .name = "enable", + .callback = remote_event_callback, + }, { + .name = "id", + .callback = remote_event_callback, + }, { + .name = "format", + .callback = remote_event_callback, + } + }; + bool eventfs_create = false; + + if (!eventfs) { + eventfs = eventfs_create_events_dir("events", remote->dentry, NULL, 0, NULL); + if (IS_ERR(eventfs)) + return PTR_ERR(eventfs); + + /* + * Create similar hierarchy as local events even if a single system is supported at + * the moment + */ + eventfs = eventfs_create_dir(remote_name, eventfs, NULL, 0, NULL); + if (IS_ERR(eventfs)) + return PTR_ERR(eventfs); + + remote->eventfs = eventfs; + eventfs_create = true; + } + + eventfs = eventfs_create_dir(evt->name, eventfs, entries, ARRAY_SIZE(entries), evt); + if (IS_ERR(eventfs)) { + if (eventfs_create) { + eventfs_remove_events_dir(remote->eventfs); + remote->eventfs = NULL; + } + return PTR_ERR(eventfs); + } + + return 0; +} + +static int trace_remote_attach_events(struct trace_remote *remote, struct remote_event *events, + 
size_t nr_events) +{ + int i; + + for (i = 0; i < nr_events; i++) { + struct remote_event *evt = &events[i]; + + if (evt->remote) + return -EEXIST; + + evt->remote = remote; + + /* We need events to be sorted for efficient lookup */ + if (i && evt->id <= events[i - 1].id) + return -EINVAL; + } + + remote->events = events; + remote->nr_events = nr_events; + + return 0; +} + +static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote, + struct remote_event *events, size_t nr_events) +{ + int i, ret; + + ret = trace_remote_attach_events(remote, events, nr_events); + if (ret) + return ret; + + for (i = 0; i < nr_events; i++) { + struct remote_event *evt = &events[i]; + + ret = trace_remote_init_eventfs(remote_name, remote, evt); + if (ret) + pr_warn("Failed to init eventfs for event '%s' (%d)", + evt->name, ret); + } + + return 0; +} + +static int __cmp_events(const void *key, const void *data) +{ + const struct remote_event *evt = data; + int id = (int)((long)key); + + return id - (int)evt->id; +} + +static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id) +{ + return bsearch((const void *)(unsigned long)id, remote->events, remote->nr_events, + sizeof(*remote->events), __cmp_events); +} -- cgit v1.2.3 From 5f3efd1dcebc35d44cce39630ae00980a45d9247 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:57 +0000 Subject: tracing: Add helpers to create trace remote events Declaring remote events can be cumbersome let's add a set of macros to simplify developers life. 
The declaration of a remote event is very similar to kernel's events: REMOTE_EVENT(name, id, RE_STRUCT( re_field(u64 foo) ), RE_PRINTK("foo=%llu", __entry->foo) ) Link: https://patch.msgid.link/20260309162516.2623589-12-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote_event.h | 10 +++++ include/trace/define_remote_events.h | 73 ++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 include/trace/define_remote_events.h (limited to 'include') diff --git a/include/linux/trace_remote_event.h b/include/linux/trace_remote_event.h index a4449008a075..c8ae1e1f5e72 100644 --- a/include/linux/trace_remote_event.h +++ b/include/linux/trace_remote_event.h @@ -5,6 +5,7 @@ struct trace_remote; struct trace_event_fields; +struct trace_seq; struct remote_event_hdr { unsigned short id; @@ -20,4 +21,13 @@ struct remote_event { char *print_fmt; void (*print)(void *evt, struct trace_seq *seq); }; + +#define RE_STRUCT(__args...) __args +#define re_field(__type, __field) __type __field; + +#define REMOTE_EVENT_FORMAT(__name, __struct) \ + struct remote_event_format_##__name { \ + struct remote_event_hdr hdr; \ + __struct \ + } #endif diff --git a/include/trace/define_remote_events.h b/include/trace/define_remote_events.h new file mode 100644 index 000000000000..676e803dc144 --- /dev/null +++ b/include/trace/define_remote_events.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include +#include + +#define REMOTE_EVENT_INCLUDE(__file) __stringify(../../__file) + +#ifdef REMOTE_EVENT_SECTION +# define __REMOTE_EVENT_SECTION(__name) __used __section(REMOTE_EVENT_SECTION"."#__name) +#else +# define __REMOTE_EVENT_SECTION(__name) +#endif + +#define REMOTE_PRINTK_COUNT_ARGS(__args...) 
\ + __COUNT_ARGS(, ##__args, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0) + +#define __remote_printk0() \ + trace_seq_putc(seq, '\n') + +#define __remote_printk1(__fmt) \ + trace_seq_puts(seq, " " __fmt "\n") \ + +#define __remote_printk2(__fmt, __args...) \ +do { \ + trace_seq_putc(seq, ' '); \ + trace_seq_printf(seq, __fmt, __args); \ + trace_seq_putc(seq, '\n'); \ +} while (0) + +/* Apply the appropriate trace_seq sequence according to the number of arguments */ +#define remote_printk(__args...) \ + CONCATENATE(__remote_printk, REMOTE_PRINTK_COUNT_ARGS(__args))(__args) + +#define RE_PRINTK(__args...) __args + +#define REMOTE_EVENT(__name, __id, __struct, __printk) \ + REMOTE_EVENT_FORMAT(__name, __struct); \ + static void remote_event_print_##__name(void *evt, struct trace_seq *seq) \ + { \ + struct remote_event_format_##__name __maybe_unused *__entry = evt; \ + trace_seq_puts(seq, #__name); \ + remote_printk(__printk); \ + } +#include REMOTE_EVENT_INCLUDE(REMOTE_EVENT_INCLUDE_FILE) + +#undef REMOTE_EVENT +#undef RE_PRINTK +#undef re_field +#define re_field(__type, __field) \ + { \ + .type = #__type, .name = #__field, \ + .size = sizeof(__type), .align = __alignof__(__type), \ + .is_signed = is_signed_type(__type), \ + }, +#define __entry REC +#define RE_PRINTK(__fmt, __args...) 
"\"" __fmt "\", " __stringify(__args) +#define REMOTE_EVENT(__name, __id, __struct, __printk) \ + static struct trace_event_fields remote_event_fields_##__name[] = { \ + __struct \ + {} \ + }; \ + static char remote_event_print_fmt_##__name[] = __printk; \ + static struct remote_event __REMOTE_EVENT_SECTION(__name) \ + remote_event_##__name = { \ + .name = #__name, \ + .id = __id, \ + .fields = remote_event_fields_##__name, \ + .print_fmt = remote_event_print_fmt_##__name, \ + .print = remote_event_print_##__name, \ + } +#include REMOTE_EVENT_INCLUDE(REMOTE_EVENT_INCLUDE_FILE) -- cgit v1.2.3 From 93ae1b76fff9e745f870a2f2cd32f472328c4a8f Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:58 +0000 Subject: ring-buffer: Export buffer_data_page and macros In preparation for allowing the writing of ring-buffer compliant pages outside of ring_buffer.c, move buffer_data_page and timestamps encoding macros into the publicly available ring_buffer_types.h. Link: https://patch.msgid.link/20260309162516.2623589-13-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer_types.h | 41 +++++++++++++++++++++++++++++++++++++++ kernel/trace/ring_buffer.c | 36 +--------------------------------- 2 files changed, 42 insertions(+), 35 deletions(-) create mode 100644 include/linux/ring_buffer_types.h (limited to 'include') diff --git a/include/linux/ring_buffer_types.h b/include/linux/ring_buffer_types.h new file mode 100644 index 000000000000..54577021a49d --- /dev/null +++ b/include/linux/ring_buffer_types.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RING_BUFFER_TYPES_H +#define _LINUX_RING_BUFFER_TYPES_H + +#include + +#define TS_SHIFT 27 +#define TS_MASK ((1ULL << TS_SHIFT) - 1) +#define TS_DELTA_TEST (~TS_MASK) + +/* + * We need to fit the time_stamp delta into 27 bits. 
+ */ +static inline bool test_time_stamp(u64 delta) +{ + return !!(delta & TS_DELTA_TEST); +} + +#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) + +#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) +#define RB_ALIGNMENT 4U +#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) +#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ + +#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS +# define RB_FORCE_8BYTE_ALIGNMENT 0 +# define RB_ARCH_ALIGNMENT RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT 1 +# define RB_ARCH_ALIGNMENT 8U +#endif + +#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) + +struct buffer_data_page { + u64 time_stamp; /* page time stamp */ + local_t commit; /* write committed index */ + unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ +}; +#endif diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 605142e06863..96e0d80d492b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * * Copyright (C) 2008 Steven Rostedt */ +#include #include #include #include @@ -157,23 +158,6 @@ int ring_buffer_print_entry_header(struct trace_seq *s) /* Used for individual buffers (after the counter) */ #define RB_BUFFER_OFF (1 << 20) -#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) - -#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) -#define RB_ALIGNMENT 4U -#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) -#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ - -#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS -# define RB_FORCE_8BYTE_ALIGNMENT 0 -# define RB_ARCH_ALIGNMENT RB_ALIGNMENT -#else -# define RB_FORCE_8BYTE_ALIGNMENT 1 -# define RB_ARCH_ALIGNMENT 8U -#endif - -#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) - /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... 
RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -316,10 +300,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_online_buffer_cpu(buffer, cpu) \ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask) -#define TS_SHIFT 27 -#define TS_MASK ((1ULL << TS_SHIFT) - 1) -#define TS_DELTA_TEST (~TS_MASK) - static u64 rb_event_time_stamp(struct ring_buffer_event *event) { u64 ts; @@ -338,12 +318,6 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event) #define RB_MISSED_MASK (3 << 30) -struct buffer_data_page { - u64 time_stamp; /* page time stamp */ - local_t commit; /* write committed index */ - unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ -}; - struct buffer_data_read_page { unsigned order; /* order of the page */ struct buffer_data_page *data; /* actual data, stored in this page */ @@ -437,14 +411,6 @@ static struct buffer_data_page *alloc_cpu_data(int cpu, int order) return dpage; } -/* - * We need to fit the time_stamp delta into 27 bits. - */ -static inline bool test_time_stamp(u64 delta) -{ - return !!(delta & TS_DELTA_TEST); -} - struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; -- cgit v1.2.3 From 34e5b958bdad0f9cf16306368bbc2dc5b2a50143 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:59 +0000 Subject: tracing: Introduce simple_ring_buffer Add a simple implementation of the kernel ring-buffer. This intends to be used later by ring-buffer remotes such as the pKVM hypervisor, hence the need for a cut down version (write only) without any dependency. 
Link: https://patch.msgid.link/20260309162516.2623589-14-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/simple_ring_buffer.h | 57 +++++ kernel/trace/Kconfig | 3 + kernel/trace/Makefile | 1 + kernel/trace/simple_ring_buffer.c | 464 +++++++++++++++++++++++++++++++++++++ 4 files changed, 525 insertions(+) create mode 100644 include/linux/simple_ring_buffer.h create mode 100644 kernel/trace/simple_ring_buffer.c (limited to 'include') diff --git a/include/linux/simple_ring_buffer.h b/include/linux/simple_ring_buffer.h new file mode 100644 index 000000000000..2c4c0ae336bc --- /dev/null +++ b/include/linux/simple_ring_buffer.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SIMPLE_RING_BUFFER_H +#define _LINUX_SIMPLE_RING_BUFFER_H + +#include +#include +#include +#include + +/* + * Ideally those struct would stay private but the caller needs to know + * the allocation size for simple_ring_buffer_init(). 
+ */ +struct simple_buffer_page { + struct list_head link; + struct buffer_data_page *page; + u64 entries; + u32 write; + u32 id; +}; + +struct simple_rb_per_cpu { + struct simple_buffer_page *tail_page; + struct simple_buffer_page *reader_page; + struct simple_buffer_page *head_page; + struct simple_buffer_page *bpages; + struct trace_buffer_meta *meta; + u32 nr_pages; + +#define SIMPLE_RB_UNAVAILABLE 0 +#define SIMPLE_RB_READY 1 +#define SIMPLE_RB_WRITING 2 + u32 status; + + u64 last_overrun; + u64 write_stamp; + + struct simple_rb_cbs *cbs; +}; + +int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc); + +void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer); + +void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, + u64 timestamp); + +void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer); + +int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable); + +int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer); + +int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer); + +#endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 384dd36c8e29..edbdd7b38f61 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1284,4 +1284,7 @@ source "kernel/trace/rv/Kconfig" config TRACE_REMOTE bool +config SIMPLE_RING_BUFFER + bool + endif # FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 318923ce39f5..2e39b09398b3 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -129,4 +129,5 @@ obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o obj-$(CONFIG_RV) += rv/ obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o +obj-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o libftrace-y := ftrace.o diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c new file mode 100644 index 
000000000000..15df9781411b --- /dev/null +++ b/kernel/trace/simple_ring_buffer.c @@ -0,0 +1,464 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 - Google LLC + * Author: Vincent Donnefort + */ + +#include +#include + +#include +#include + +enum simple_rb_link_type { + SIMPLE_RB_LINK_NORMAL = 0, + SIMPLE_RB_LINK_HEAD = 1, + SIMPLE_RB_LINK_HEAD_MOVING +}; + +#define SIMPLE_RB_LINK_MASK ~(SIMPLE_RB_LINK_HEAD | SIMPLE_RB_LINK_HEAD_MOVING) + +static void simple_bpage_set_head_link(struct simple_buffer_page *bpage) +{ + unsigned long link = (unsigned long)bpage->link.next; + + link &= SIMPLE_RB_LINK_MASK; + link |= SIMPLE_RB_LINK_HEAD; + + /* + * Paired with simple_rb_find_head() to order access between the head + * link and overrun. It ensures we always report an up-to-date value + * after swapping the reader page. + */ + smp_store_release(&bpage->link.next, (struct list_head *)link); +} + +static bool simple_bpage_unset_head_link(struct simple_buffer_page *bpage, + struct simple_buffer_page *dst, + enum simple_rb_link_type new_type) +{ + unsigned long *link = (unsigned long *)(&bpage->link.next); + unsigned long old = (*link & SIMPLE_RB_LINK_MASK) | SIMPLE_RB_LINK_HEAD; + unsigned long new = (unsigned long)(&dst->link) | new_type; + + return try_cmpxchg(link, &old, new); +} + +static void simple_bpage_set_normal_link(struct simple_buffer_page *bpage) +{ + unsigned long link = (unsigned long)bpage->link.next; + + WRITE_ONCE(bpage->link.next, (struct list_head *)(link & SIMPLE_RB_LINK_MASK)); +} + +static struct simple_buffer_page *simple_bpage_from_link(struct list_head *link) +{ + unsigned long ptr = (unsigned long)link & SIMPLE_RB_LINK_MASK; + + return container_of((struct list_head *)ptr, struct simple_buffer_page, link); +} + +static struct simple_buffer_page *simple_bpage_next_page(struct simple_buffer_page *bpage) +{ + return simple_bpage_from_link(bpage->link.next); +} + +static void simple_bpage_reset(struct simple_buffer_page *bpage) +{ + 
bpage->write = 0; + bpage->entries = 0; + + local_set(&bpage->page->commit, 0); +} + +static void simple_bpage_init(struct simple_buffer_page *bpage, unsigned long page) +{ + INIT_LIST_HEAD(&bpage->link); + bpage->page = (struct buffer_data_page *)page; + + simple_bpage_reset(bpage); +} + +#define simple_rb_meta_inc(__meta, __inc) \ + WRITE_ONCE((__meta), (__meta + __inc)) + +static bool simple_rb_loaded(struct simple_rb_per_cpu *cpu_buffer) +{ + return !!cpu_buffer->bpages; +} + +static int simple_rb_find_head(struct simple_rb_per_cpu *cpu_buffer) +{ + int retry = cpu_buffer->nr_pages * 2; + struct simple_buffer_page *head; + + head = cpu_buffer->head_page; + + while (retry--) { + unsigned long link; + +spin: + /* See smp_store_release in simple_bpage_set_head_link() */ + link = (unsigned long)smp_load_acquire(&head->link.prev->next); + + switch (link & ~SIMPLE_RB_LINK_MASK) { + /* Found the head */ + case SIMPLE_RB_LINK_HEAD: + cpu_buffer->head_page = head; + return 0; + /* The writer caught the head, we can spin, that won't be long */ + case SIMPLE_RB_LINK_HEAD_MOVING: + goto spin; + } + + head = simple_bpage_next_page(head); + } + + return -EBUSY; +} + +/** + * simple_ring_buffer_swap_reader_page - Swap ring-buffer head with the reader + * @cpu_buffer: A simple_rb_per_cpu + * + * This function enables consuming reading. It ensures the current head page will not be overwritten + * and can be safely read. + * + * Returns 0 on success, -ENODEV if @cpu_buffer was unloaded or -EBUSY if we failed to catch the + * head page. 
+ */ +int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *last, *head, *reader; + unsigned long overrun; + int retry = 8; + int ret; + + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + reader = cpu_buffer->reader_page; + + do { + /* Run after the writer to find the head */ + ret = simple_rb_find_head(cpu_buffer); + if (ret) + return ret; + + head = cpu_buffer->head_page; + + /* Connect the reader page around the header page */ + reader->link.next = head->link.next; + reader->link.prev = head->link.prev; + + /* The last page before the head */ + last = simple_bpage_from_link(head->link.prev); + + /* The reader page points to the new header page */ + simple_bpage_set_head_link(reader); + + overrun = cpu_buffer->meta->overrun; + } while (!simple_bpage_unset_head_link(last, reader, SIMPLE_RB_LINK_NORMAL) && retry--); + + if (!retry) + return -EINVAL; + + cpu_buffer->head_page = simple_bpage_from_link(reader->link.next); + cpu_buffer->head_page->link.prev = &reader->link; + cpu_buffer->reader_page = head; + cpu_buffer->meta->reader.lost_events = overrun - cpu_buffer->last_overrun; + cpu_buffer->meta->reader.id = cpu_buffer->reader_page->id; + cpu_buffer->last_overrun = overrun; + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_swap_reader_page); + +static struct simple_buffer_page *simple_rb_move_tail(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *tail, *new_tail; + + tail = cpu_buffer->tail_page; + new_tail = simple_bpage_next_page(tail); + + if (simple_bpage_unset_head_link(tail, new_tail, SIMPLE_RB_LINK_HEAD_MOVING)) { + /* + * Oh no! we've caught the head. There is none anymore and + * swap_reader will spin until we set the new one. Overrun must + * be written first, to make sure we report the correct number + * of lost events. 
+ */ + simple_rb_meta_inc(cpu_buffer->meta->overrun, new_tail->entries); + simple_rb_meta_inc(cpu_buffer->meta->pages_lost, 1); + + simple_bpage_set_head_link(new_tail); + simple_bpage_set_normal_link(tail); + } + + simple_bpage_reset(new_tail); + cpu_buffer->tail_page = new_tail; + + simple_rb_meta_inc(cpu_buffer->meta->pages_touched, 1); + + return new_tail; +} + +static unsigned long rb_event_size(unsigned long length) +{ + struct ring_buffer_event *event; + + return length + RB_EVNT_HDR_SIZE + sizeof(event->array[0]); +} + +static struct ring_buffer_event * +rb_event_add_ts_extend(struct ring_buffer_event *event, u64 delta) +{ + event->type_len = RINGBUF_TYPE_TIME_EXTEND; + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + + return (struct ring_buffer_event *)((unsigned long)event + 8); +} + +static struct ring_buffer_event * +simple_rb_reserve_next(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, u64 timestamp) +{ + unsigned long ts_ext_size = 0, event_size = rb_event_size(length); + struct simple_buffer_page *tail = cpu_buffer->tail_page; + struct ring_buffer_event *event; + u32 write, prev_write; + u64 time_delta; + + time_delta = timestamp - cpu_buffer->write_stamp; + + if (test_time_stamp(time_delta)) + ts_ext_size = 8; + + prev_write = tail->write; + write = prev_write + event_size + ts_ext_size; + + if (unlikely(write > (PAGE_SIZE - BUF_PAGE_HDR_SIZE))) + tail = simple_rb_move_tail(cpu_buffer); + + if (!tail->entries) { + tail->page->time_stamp = timestamp; + time_delta = 0; + ts_ext_size = 0; + write = event_size; + prev_write = 0; + } + + tail->write = write; + tail->entries++; + + cpu_buffer->write_stamp = timestamp; + + event = (struct ring_buffer_event *)(tail->page->data + prev_write); + if (ts_ext_size) { + event = rb_event_add_ts_extend(event, time_delta); + time_delta = 0; + } + + event->type_len = 0; + event->time_delta = time_delta; + event->array[0] = event_size - RB_EVNT_HDR_SIZE; + + return event; +} + 
+/** + * simple_ring_buffer_reserve - Reserve an entry in @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * @length: Size of the entry in bytes + * @timestamp: Timestamp of the entry + * + * Returns the address of the entry where to write data or NULL + */ +void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, + u64 timestamp) +{ + struct ring_buffer_event *rb_event; + + if (cmpxchg(&cpu_buffer->status, SIMPLE_RB_READY, SIMPLE_RB_WRITING) != SIMPLE_RB_READY) + return NULL; + + rb_event = simple_rb_reserve_next(cpu_buffer, length, timestamp); + + return &rb_event->array[1]; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_reserve); + +/** + * simple_ring_buffer_commit - Commit the entry reserved with simple_ring_buffer_reserve() + * @cpu_buffer: The simple_rb_per_cpu where the entry has been reserved + */ +void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer) +{ + local_set(&cpu_buffer->tail_page->page->commit, + cpu_buffer->tail_page->write); + simple_rb_meta_inc(cpu_buffer->meta->entries, 1); + + /* + * Paired with simple_rb_enable_tracing() to ensure data is + * written to the ring-buffer before teardown. + */ + smp_store_release(&cpu_buffer->status, SIMPLE_RB_READY); +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_commit); + +static u32 simple_rb_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable) +{ + u32 prev_status; + + if (enable) + return cmpxchg(&cpu_buffer->status, SIMPLE_RB_UNAVAILABLE, SIMPLE_RB_READY); + + /* Wait for the buffer to be released */ + do { + prev_status = cmpxchg_acquire(&cpu_buffer->status, + SIMPLE_RB_READY, + SIMPLE_RB_UNAVAILABLE); + } while (prev_status == SIMPLE_RB_WRITING); + + return prev_status; +} + +/** + * simple_ring_buffer_reset - Reset @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * + * This will not clear the content of the data, only reset counters and pointers + * + * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded. 
+ */ +int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *bpage; + u32 prev_status; + int ret; + + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + prev_status = simple_rb_enable_tracing(cpu_buffer, false); + + ret = simple_rb_find_head(cpu_buffer); + if (ret) + return ret; + + bpage = cpu_buffer->tail_page = cpu_buffer->head_page; + do { + simple_bpage_reset(bpage); + bpage = simple_bpage_next_page(bpage); + } while (bpage != cpu_buffer->head_page); + + simple_bpage_reset(cpu_buffer->reader_page); + + cpu_buffer->last_overrun = 0; + cpu_buffer->write_stamp = 0; + + cpu_buffer->meta->reader.read = 0; + cpu_buffer->meta->reader.lost_events = 0; + cpu_buffer->meta->entries = 0; + cpu_buffer->meta->overrun = 0; + cpu_buffer->meta->read = 0; + cpu_buffer->meta->pages_lost = 0; + cpu_buffer->meta->pages_touched = 0; + + if (prev_status == SIMPLE_RB_READY) + simple_rb_enable_tracing(cpu_buffer, true); + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_reset); + +/** + * simple_ring_buffer_init - Init @cpu_buffer based on @desc + * @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller. 
+ * @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va + * @desc: A ring_buffer_desc + * + * Returns 0 on success or -EINVAL if the content of @desc is invalid + */ +int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc) +{ + struct simple_buffer_page *bpage = bpages; + int i; + + /* At least 1 reader page and two pages in the ring-buffer */ + if (desc->nr_page_va < 3) + return -EINVAL; + + memset(cpu_buffer, 0, sizeof(*cpu_buffer)); + + cpu_buffer->bpages = bpages; + + cpu_buffer->meta = (void *)desc->meta_va; + memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta)); + cpu_buffer->meta->meta_page_size = PAGE_SIZE; + cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages; + + /* The reader page is not part of the ring initially */ + simple_bpage_init(bpage, desc->page_va[0]); + bpage->id = 0; + + cpu_buffer->nr_pages = 1; + + cpu_buffer->reader_page = bpage; + cpu_buffer->tail_page = bpage + 1; + cpu_buffer->head_page = bpage + 1; + + for (i = 1; i < desc->nr_page_va; i++) { + simple_bpage_init(++bpage, desc->page_va[i]); + + bpage->link.next = &(bpage + 1)->link; + bpage->link.prev = &(bpage - 1)->link; + bpage->id = i; + + cpu_buffer->nr_pages = i + 1; + } + + /* Close the ring */ + bpage->link.next = &cpu_buffer->tail_page->link; + cpu_buffer->tail_page->link.prev = &bpage->link; + + /* The last init'ed page points to the head page */ + simple_bpage_set_head_link(bpage); + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_init); + +/** + * simple_ring_buffer_unload - Prepare @cpu_buffer for deletion + * @cpu_buffer: A simple_rb_per_cpu that will be deleted. 
+ */ +void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer) +{ + if (!simple_rb_loaded(cpu_buffer)) + return; + + simple_rb_enable_tracing(cpu_buffer, false); + + cpu_buffer->bpages = NULL; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_unload); + +/** + * simple_ring_buffer_enable_tracing - Enable or disable writing to @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * @enable: True to enable tracing, False to disable it + * + * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded + */ +int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable) +{ + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + simple_rb_enable_tracing(cpu_buffer, enable); + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_enable_tracing); -- cgit v1.2.3 From 635923081c792c830fb87e680d6dd5f348926b3f Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:25:03 +0000 Subject: tracing: load/unload page callbacks for simple_ring_buffer Add load/unload callbacks used for each admitted page in the ring-buffer. This will later be useful for the pKVM hypervisor, which uses a different VA space and needs to dynamically map/unmap the ring-buffer pages. 
Link: https://patch.msgid.link/20260309162516.2623589-18-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/simple_ring_buffer.h | 8 ++++ kernel/trace/simple_ring_buffer.c | 91 ++++++++++++++++++++++++++++++-------- 2 files changed, 80 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/simple_ring_buffer.h b/include/linux/simple_ring_buffer.h index 2c4c0ae336bc..21aec556293e 100644 --- a/include/linux/simple_ring_buffer.h +++ b/include/linux/simple_ring_buffer.h @@ -54,4 +54,12 @@ int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer); int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer); +int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer, + struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc, + void *(*load_page)(unsigned long va), + void (*unload_page)(void *va)); + +void simple_ring_buffer_unload_mm(struct simple_rb_per_cpu *cpu_buffer, + void (*unload_page)(void *)); #endif diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c index 15df9781411b..02af2297ae5a 100644 --- a/kernel/trace/simple_ring_buffer.c +++ b/kernel/trace/simple_ring_buffer.c @@ -71,7 +71,7 @@ static void simple_bpage_reset(struct simple_buffer_page *bpage) local_set(&bpage->page->commit, 0); } -static void simple_bpage_init(struct simple_buffer_page *bpage, unsigned long page) +static void simple_bpage_init(struct simple_buffer_page *bpage, void *page) { INIT_LIST_HEAD(&bpage->link); bpage->page = (struct buffer_data_page *)page; @@ -372,18 +372,15 @@ int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer) } EXPORT_SYMBOL_GPL(simple_ring_buffer_reset); -/** - * simple_ring_buffer_init - Init @cpu_buffer based on @desc - * @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller. 
- * @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va - * @desc: A ring_buffer_desc - * - * Returns 0 on success or -EINVAL if the content of @desc is invalid - */ -int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, - const struct ring_buffer_desc *desc) +int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer, + struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc, + void *(*load_page)(unsigned long va), + void (*unload_page)(void *va)) { struct simple_buffer_page *bpage = bpages; + int ret = 0; + void *page; int i; /* At least 1 reader page and two pages in the ring-buffer */ @@ -392,15 +389,22 @@ int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_ memset(cpu_buffer, 0, sizeof(*cpu_buffer)); - cpu_buffer->bpages = bpages; + cpu_buffer->meta = load_page(desc->meta_va); + if (!cpu_buffer->meta) + return -EINVAL; - cpu_buffer->meta = (void *)desc->meta_va; memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta)); cpu_buffer->meta->meta_page_size = PAGE_SIZE; cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages; /* The reader page is not part of the ring initially */ - simple_bpage_init(bpage, desc->page_va[0]); + page = load_page(desc->page_va[0]); + if (!page) { + unload_page(cpu_buffer->meta); + return -EINVAL; + } + + simple_bpage_init(bpage, page); bpage->id = 0; cpu_buffer->nr_pages = 1; @@ -410,7 +414,13 @@ int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_ cpu_buffer->head_page = bpage + 1; for (i = 1; i < desc->nr_page_va; i++) { - simple_bpage_init(++bpage, desc->page_va[i]); + page = load_page(desc->page_va[i]); + if (!page) { + ret = -EINVAL; + break; + } + + simple_bpage_init(++bpage, page); bpage->link.next = &(bpage + 1)->link; bpage->link.prev = &(bpage - 1)->link; @@ -419,6 +429,14 @@ int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_ 
cpu_buffer->nr_pages = i + 1; } + if (ret) { + for (i--; i >= 0; i--) + unload_page((void *)desc->page_va[i]); + unload_page(cpu_buffer->meta); + + return ret; + } + /* Close the ring */ bpage->link.next = &cpu_buffer->tail_page->link; cpu_buffer->tail_page->link.prev = &bpage->link; @@ -426,23 +444,58 @@ int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_ /* The last init'ed page points to the head page */ simple_bpage_set_head_link(bpage); + cpu_buffer->bpages = bpages; + return 0; } -EXPORT_SYMBOL_GPL(simple_ring_buffer_init); + +static void *__load_page(unsigned long page) +{ + return (void *)page; +} + +static void __unload_page(void *page) { } /** - * simple_ring_buffer_unload - Prepare @cpu_buffer for deletion - * @cpu_buffer: A simple_rb_per_cpu that will be deleted. + * simple_ring_buffer_init - Init @cpu_buffer based on @desc + * @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller. + * @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va + * @desc: A ring_buffer_desc + * + * Returns 0 on success or -EINVAL if the content of @desc is invalid */ -void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer) +int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc) +{ + return simple_ring_buffer_init_mm(cpu_buffer, bpages, desc, __load_page, __unload_page); +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_init); + +void simple_ring_buffer_unload_mm(struct simple_rb_per_cpu *cpu_buffer, + void (*unload_page)(void *)) { + int p; + if (!simple_rb_loaded(cpu_buffer)) return; simple_rb_enable_tracing(cpu_buffer, false); + unload_page(cpu_buffer->meta); + for (p = 0; p < cpu_buffer->nr_pages; p++) + unload_page(cpu_buffer->bpages[p].page); + cpu_buffer->bpages = NULL; } + +/** + * simple_ring_buffer_unload - Prepare @cpu_buffer for deletion + * @cpu_buffer: A simple_rb_per_cpu that will be deleted. 
+ */ +void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer) +{ + return simple_ring_buffer_unload_mm(cpu_buffer, __unload_page); +} EXPORT_SYMBOL_GPL(simple_ring_buffer_unload); /** -- cgit v1.2.3