From ffe3986fece696cf65e0ef99e74c75f848be8e30 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 9 Apr 2024 15:13:09 -0400 Subject: ring-buffer: Only update pages_touched when a new page is touched The "buffer_percent" logic that is used by the ring buffer splice code to only wake up the tasks when there's no data after the buffer is filled to the percentage of the "buffer_percent" file is dependent on three variables that determine the amount of data that is in the ring buffer: 1) pages_read - incremented whenever a new sub-buffer is consumed 2) pages_lost - incremented every time a writer overwrites a sub-buffer 3) pages_touched - incremented when a write goes to a new sub-buffer The percentage is the calculation of: (pages_touched - (pages_lost + pages_read)) / nr_pages Basically, the amount of data is the total number of sub-bufs that have been touched, minus the number of sub-bufs lost and sub-bufs consumed. This is divided by the total count to give the buffer percentage. When the percentage is greater than the value in the "buffer_percent" file, it wakes up splice readers waiting for that amount. It was observed that over time, the amount read from the splice was constantly decreasing the longer the trace was running. That is, if one asked for 60%, it would read over 60% when it first starts tracing, but then it would be woken up at under 60% and would slowly decrease the amount of data read after being woken up, where the amount becomes much less than the buffer percent. This was due to an accounting of the pages_touched incrementation. This value is incremented whenever a writer transfers to a new sub-buffer. But the place where it was incremented was incorrect. If a writer overflowed the current sub-buffer it would go to the next one. If it gets preempted by an interrupt at that time, and the interrupt performs a trace, it too will end up going to the next sub-buffer. But only one should increment the counter. Unfortunately, that was not the case. Change the cmpxchg() that does the real switch of the tail-page into a try_cmpxchg(), and on success, perform the increment of pages_touched. This will only increment the counter once for when the writer moves to a new sub-buffer, and not when there's a race and is incremented for when a writer and its preempting writer both move to the same new sub-buffer. Link: https://lore.kernel.org/linux-trace-kernel/20240409151309.0d0e5056@gandalf.local.home Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Fixes: 2c2b0a78b3739 ("ring-buffer: Add percentage of ring buffer full to wake up reader") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 25476ead681b..6511dc3a00da 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1393,7 +1393,6 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); - local_inc(&cpu_buffer->pages_touched); /* * Just make sure we have seen our old_write and synchronize * with any interrupts that come in. @@ -1430,8 +1429,9 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, */ local_set(&next_page->page->commit, 0); - /* Again, either we update tail_page or an interrupt does */ - (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page); + /* Either we update tail_page or an interrupt does */ + if (try_cmpxchg(&cpu_buffer->tail_page, &tail_page, next_page)) + local_inc(&cpu_buffer->pages_touched); } } -- cgit v1.2.3 From c09d4167b550f91ecf5c3db883eea314edc7f532 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Fri, 10 May 2024 15:04:30 +0100 Subject: ring-buffer: Allocate sub-buffers with __GFP_COMP In preparation for the ring-buffer memory mapping, allocate compound pages for the ring-buffer sub-buffers to enable us to map them to user-space with vm_insert_pages(). Link: https://lore.kernel.org/linux-trace-kernel/20240510140435.3550353-2-vdonnefort@google.com Acked-by: David Hildenbrand Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6511dc3a00da..a1e011733f6a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1524,7 +1524,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, pages); page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), - mflags | __GFP_ZERO, + mflags | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) goto free_pages; @@ -1609,7 +1609,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) cpu_buffer->reader_page = bpage; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_ZERO, + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) goto fail_free_reader; @@ -5579,7 +5579,7 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) goto out; page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY | __GFP_ZERO, + GFP_KERNEL | __GFP_NORETRY | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) { kfree(bpage); -- cgit v1.2.3 From 117c39200d9d760cbd5944bb89efb7b9c51965aa Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Fri, 10 May 2024 15:04:31 +0100 Subject: ring-buffer: Introducing ring-buffer mapping functions In preparation for allowing the user-space to map a ring-buffer, add a set of mapping functions: ring_buffer_{map,unmap}() And controls on the ring-buffer: ring_buffer_map_get_reader() /* swap reader and head */ Mapping the ring-buffer also involves: A unique ID for each subbuf of the ring-buffer, currently they are only identified through their in-kernel VA. A meta-page, where are stored ring-buffer statistics and a description for the current reader The linear mapping exposes the meta-page, and each subbuf of the ring-buffer, ordered following their unique ID, assigned during the first mapping. Once mapped, no subbuf can get in or out of the ring-buffer: the buffer size will remain unmodified and the splice enabling functions will in reality simply memcpy the data instead of swapping subbufs. Link: https://lore.kernel.org/linux-trace-kernel/20240510140435.3550353-3-vdonnefort@google.com CC: Signed-off-by: Vincent Donnefort Acked-by: David Hildenbrand Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 6 + include/uapi/linux/trace_mmap.h | 46 +++++ kernel/trace/ring_buffer.c | 414 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 463 insertions(+), 3 deletions(-) create mode 100644 include/uapi/linux/trace_mmap.h (limited to 'kernel/trace/ring_buffer.c') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index dc5ae4e96aee..96d2140b471e 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -6,6 +6,8 @@ #include #include +#include + struct trace_buffer; struct ring_buffer_iter; @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); #define trace_rb_cpu_prepare NULL #endif +int ring_buffer_map(struct trace_buffer *buffer, int cpu, + struct vm_area_struct *vma); +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); #endif /* _LINUX_RING_BUFFER_H */ diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h new file mode 100644 index 000000000000..b682e9925539 --- /dev/null +++ b/include/uapi/linux/trace_mmap.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _TRACE_MMAP_H_ +#define _TRACE_MMAP_H_ + +#include + +/** + * struct trace_buffer_meta - Ring-buffer Meta-page description + * @meta_page_size: Size of this meta-page. + * @meta_struct_len: Size of this structure. + * @subbuf_size: Size of each sub-buffer. + * @nr_subbufs: Number of subbfs in the ring-buffer, including the reader. + * @reader.lost_events: Number of events lost at the time of the reader swap. + * @reader.id: subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1] + * @reader.read: Number of bytes read on the reader subbuf. + * @flags: Placeholder for now, 0 until new features are supported. + * @entries: Number of entries in the ring-buffer. + * @overrun: Number of entries lost in the ring-buffer. + * @read: Number of entries that have been read. + * @Reserved1: Internal use only. + * @Reserved2: Internal use only. + */ +struct trace_buffer_meta { + __u32 meta_page_size; + __u32 meta_struct_len; + + __u32 subbuf_size; + __u32 nr_subbufs; + + struct { + __u64 lost_events; + __u32 id; + __u32 read; + } reader; + + __u64 flags; + + __u64 entries; + __u64 overrun; + __u64 read; + + __u64 Reserved1; + __u64 Reserved2; +}; + +#endif /* _TRACE_MMAP_H_ */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a1e011733f6a..cb303052bff2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,7 @@ #include #include #include +#include #include #include @@ -338,6 +340,7 @@ struct buffer_page { local_t entries; /* entries on this page */ unsigned long real_end; /* real end of data */ unsigned order; /* order of the page */ + u32 id; /* ID for external mapping */ struct buffer_data_page *page; /* Actual data page */ }; @@ -484,6 +487,12 @@ struct ring_buffer_per_cpu { u64 read_stamp; /* pages removed since last reset */ unsigned long pages_removed; + + unsigned int mapped; + struct mutex mapping_lock; + unsigned long *subbuf_ids; /* ID to subbuf VA */ + struct trace_buffer_meta *meta_page; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ @@ -1599,6 +1608,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&cpu_buffer->irq_work.waiters); init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); + mutex_init(&cpu_buffer->mapping_lock); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -1789,8 +1799,6 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer) return buffer->time_stamp_abs; } -static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); - static inline unsigned long rb_page_entries(struct buffer_page *bpage) { return local_read(&bpage->entries) & RB_WRITE_MASK; @@ -5211,6 +5219,22 @@ static void rb_clear_buffer_page(struct buffer_page *page) page->read = 0; } +static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct trace_buffer_meta *meta = cpu_buffer->meta_page; + + meta->reader.read = cpu_buffer->reader_page->read; + meta->reader.id = cpu_buffer->reader_page->id; + meta->reader.lost_events = cpu_buffer->lost_events; + + meta->entries = local_read(&cpu_buffer->entries); + meta->overrun = local_read(&cpu_buffer->overrun); + meta->read = cpu_buffer->read; + + /* Some archs do not have data cache coherency between kernel and user-space */ + flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { @@ -5255,6 +5279,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; + if (cpu_buffer->mapped) + rb_update_meta_page(cpu_buffer); + rb_head_page_activate(cpu_buffer); cpu_buffer->pages_removed = 0; } @@ -5469,6 +5496,12 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; + /* It's up to the callers to not try to swap mapped buffers */ + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { + ret = -EBUSY; + goto out; + } + /* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) goto out; @@ -5733,7 +5766,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, * Otherwise, we can simply swap the page with the one passed in. */ if (read || (len < (commit - read)) || - cpu_buffer->reader_page == cpu_buffer->commit_page) { + cpu_buffer->reader_page == cpu_buffer->commit_page || + cpu_buffer->mapped) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; @@ -5956,6 +5990,11 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) cpu_buffer = buffer->buffers[cpu]; + if (cpu_buffer->mapped) { + err = -EBUSY; + goto error; + } + /* Update the number of pages to match the new size */ nr_pages = old_size * buffer->buffers[cpu]->nr_pages; nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); @@ -6057,6 +6096,375 @@ error: } EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); +static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct page *page; + + if (cpu_buffer->meta_page) + return 0; + + page = alloc_page(GFP_USER | __GFP_ZERO); + if (!page) + return -ENOMEM; + + cpu_buffer->meta_page = page_to_virt(page); + + return 0; +} + +static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long addr = (unsigned long)cpu_buffer->meta_page; + + free_page(addr); + cpu_buffer->meta_page = NULL; +} + +static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long *subbuf_ids) +{ + struct trace_buffer_meta *meta = cpu_buffer->meta_page; + unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; + struct buffer_page *first_subbuf, *subbuf; + int id = 0; + + subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; + cpu_buffer->reader_page->id = id++; + + first_subbuf = subbuf = rb_set_head_page(cpu_buffer); + do { + if (WARN_ON(id >= nr_subbufs)) + break; + + subbuf_ids[id] = (unsigned long)subbuf->page; + subbuf->id = id; + + rb_inc_page(&subbuf); + id++; + } while (subbuf != first_subbuf); + + /* install subbuf ID to kern VA translation */ + cpu_buffer->subbuf_ids = subbuf_ids; + + meta->meta_page_size = PAGE_SIZE; + meta->meta_struct_len = sizeof(*meta); + meta->nr_subbufs = nr_subbufs; + meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; + + rb_update_meta_page(cpu_buffer); +} + +static struct ring_buffer_per_cpu * +rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return ERR_PTR(-EINVAL); + + cpu_buffer = buffer->buffers[cpu]; + + mutex_lock(&cpu_buffer->mapping_lock); + + if (!cpu_buffer->mapped) { + mutex_unlock(&cpu_buffer->mapping_lock); + return ERR_PTR(-ENODEV); + } + + return cpu_buffer; +} + +static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer) +{ + mutex_unlock(&cpu_buffer->mapping_lock); +} + +/* + * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need + * to be set-up or torn-down. + */ +static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer, + bool inc) +{ + unsigned long flags; + + lockdep_assert_held(&cpu_buffer->mapping_lock); + + if (inc && cpu_buffer->mapped == UINT_MAX) + return -EBUSY; + + if (WARN_ON(!inc && cpu_buffer->mapped == 0)) + return -EINVAL; + + mutex_lock(&cpu_buffer->buffer->mutex); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + if (inc) + cpu_buffer->mapped++; + else + cpu_buffer->mapped--; + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + mutex_unlock(&cpu_buffer->buffer->mutex); + + return 0; +} + +/* + * +--------------+ pgoff == 0 + * | meta page | + * +--------------+ pgoff == 1 + * | subbuffer 0 | + * | | + * +--------------+ pgoff == (1 + (1 << subbuf_order)) + * | subbuffer 1 | + * | | + * ... + */ +#ifdef CONFIG_MMU +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, + struct vm_area_struct *vma) +{ + unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff; + unsigned int subbuf_pages, subbuf_order; + struct page **pages; + int p = 0, s = 0; + int err; + + /* Refuse MP_PRIVATE or writable mappings */ + if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC || + !(vma->vm_flags & VM_MAYSHARE)) + return -EPERM; + + /* + * Make sure the mapping cannot become writable later. Also tell the VM + * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). + */ + vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP, + VM_MAYWRITE); + + lockdep_assert_held(&cpu_buffer->mapping_lock); + + subbuf_order = cpu_buffer->buffer->subbuf_order; + subbuf_pages = 1 << subbuf_order; + + nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */ + nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */ + + vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + if (!vma_pages || vma_pages > nr_pages) + return -EINVAL; + + nr_pages = vma_pages; + + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + if (!pgoff) { + pages[p++] = virt_to_page(cpu_buffer->meta_page); + + /* + * TODO: Align sub-buffers on their size, once + * vm_insert_pages() supports the zero-page. + */ + } else { + /* Skip the meta-page */ + pgoff--; + + if (pgoff % subbuf_pages) { + err = -EINVAL; + goto out; + } + + s += pgoff / subbuf_pages; + } + + while (p < nr_pages) { + struct page *page = virt_to_page(cpu_buffer->subbuf_ids[s]); + int off = 0; + + if (WARN_ON_ONCE(s >= nr_subbufs)) { + err = -EINVAL; + goto out; + } + + for (; off < (1 << (subbuf_order)); off++, page++) { + if (p >= nr_pages) + break; + + pages[p++] = page; + } + s++; + } + + err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); + +out: + kfree(pages); + + return err; +} +#else +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, + struct vm_area_struct *vma) +{ + return -EOPNOTSUPP; +} +#endif + +int ring_buffer_map(struct trace_buffer *buffer, int cpu, + struct vm_area_struct *vma) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags, *subbuf_ids; + int err = 0; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + mutex_lock(&cpu_buffer->mapping_lock); + + if (cpu_buffer->mapped) { + err = __rb_map_vma(cpu_buffer, vma); + if (!err) + err = __rb_inc_dec_mapped(cpu_buffer, true); + mutex_unlock(&cpu_buffer->mapping_lock); + return err; + } + + /* prevent another thread from changing buffer/sub-buffer sizes */ + mutex_lock(&buffer->mutex); + + err = rb_alloc_meta_page(cpu_buffer); + if (err) + goto unlock; + + /* subbuf_ids include the reader while nr_pages does not */ + subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); + if (!subbuf_ids) { + rb_free_meta_page(cpu_buffer); + err = -ENOMEM; + goto unlock; + } + + atomic_inc(&cpu_buffer->resize_disabled); + + /* + * Lock all readers to block any subbuf swap until the subbuf IDs are + * assigned. + */ + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + rb_setup_ids_meta_page(cpu_buffer, subbuf_ids); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + err = __rb_map_vma(cpu_buffer, vma); + if (!err) { + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + cpu_buffer->mapped = 1; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + } else { + kfree(cpu_buffer->subbuf_ids); + cpu_buffer->subbuf_ids = NULL; + rb_free_meta_page(cpu_buffer); + } + +unlock: + mutex_unlock(&buffer->mutex); + mutex_unlock(&cpu_buffer->mapping_lock); + + return err; +} + +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + int err = 0; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + mutex_lock(&cpu_buffer->mapping_lock); + + if (!cpu_buffer->mapped) { + err = -ENODEV; + goto out; + } else if (cpu_buffer->mapped > 1) { + __rb_inc_dec_mapped(cpu_buffer, false); + goto out; + } + + mutex_lock(&buffer->mutex); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + cpu_buffer->mapped = 0; + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + kfree(cpu_buffer->subbuf_ids); + cpu_buffer->subbuf_ids = NULL; + rb_free_meta_page(cpu_buffer); + atomic_dec(&cpu_buffer->resize_disabled); + + mutex_unlock(&buffer->mutex); + +out: + mutex_unlock(&cpu_buffer->mapping_lock); + + return err; +} + +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long reader_size; + unsigned long flags; + + cpu_buffer = rb_get_mapped_buffer(buffer, cpu); + if (IS_ERR(cpu_buffer)) + return (int)PTR_ERR(cpu_buffer); + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + +consume: + if (rb_per_cpu_empty(cpu_buffer)) + goto out; + + reader_size = rb_page_size(cpu_buffer->reader_page); + + /* + * There are data to be read on the current reader page, we can + * return to the caller. But before that, we assume the latter will read + * everything. Let's update the kernel reader accordingly. + */ + if (cpu_buffer->reader_page->read < reader_size) { + while (cpu_buffer->reader_page->read < reader_size) + rb_advance_reader(cpu_buffer); + goto out; + } + + if (WARN_ON(!rb_get_reader_page(cpu_buffer))) + goto out; + + goto consume; + +out: + /* Some archs do not have data cache coherency between kernel and user-space */ + flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page)); + + rb_update_meta_page(cpu_buffer); + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + rb_put_mapped_buffer(cpu_buffer); + + return 0; +} + /* * We only allocate new buffers, never free them if the CPU goes down. * If we were to free the buffer, then the user would lose any trace that was in -- cgit v1.2.3 From fe832be05a8eee5f1488cbcc2c562dd82d079fd6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 12 Mar 2024 17:54:05 -0400 Subject: ring-buffer: Have mmapped ring buffer keep track of missed events While testing libtracefs on the mmapped ring buffer, the test that checks if missed events are accounted for failed when using the mapped buffer. This is because the mapped page does not update the missed events that were dropped because the writer filled up the ring buffer before the reader could catch it. Add the missed events to the reader page/sub-buffer when the IOCTL is done and a new reader page is acquired. Note that all accesses to the reader_page via rb_page_commit() had to be switched to rb_page_size(), and rb_page_size() which was just a copy of rb_page_commit() but now it masks out the RB_MISSED bits. This is needed as the mapped reader page is still active in the ring buffer code and where it reads the commit field of the bpage for the size, it now must mask it otherwise the missed bits that are now set will corrupt the size returned. Link: https://lore.kernel.org/linux-trace-kernel/20240312175405.12fb6726@gandalf.local.home Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 53 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 6 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cb303052bff2..a02c7a52a0f5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -314,6 +314,8 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event) /* Missed count stored at end */ #define RB_MISSED_STORED (1 << 30) +#define RB_MISSED_MASK (3 << 30) + struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ @@ -2326,7 +2328,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter) /* Size is determined by what has been committed */ static __always_inline unsigned rb_page_size(struct buffer_page *bpage) { - return rb_page_commit(bpage); + return rb_page_commit(bpage) & ~RB_MISSED_MASK; } static __always_inline unsigned @@ -3953,7 +3955,7 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) return true; /* Reader should exhaust content in reader page */ - if (reader->read != rb_page_commit(reader)) + if (reader->read != rb_page_size(reader)) return false; /* @@ -4424,7 +4426,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) return ((iter->head_page == commit_page && iter->head >= commit) || (iter->head_page == reader && commit_page == head_page && head_page->read == commit && - iter->head == rb_page_commit(cpu_buffer->reader_page))); + iter->head == rb_page_size(cpu_buffer->reader_page))); } EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); @@ -5753,7 +5755,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, event = rb_reader_event(cpu_buffer); read = reader->read; - commit = rb_page_commit(reader); + commit = rb_page_size(reader); /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; @@ -5830,7 +5832,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); - cpu_buffer->read_bytes += rb_page_commit(reader); + cpu_buffer->read_bytes += rb_page_size(reader); /* swap the pages */ rb_init_page(bpage); @@ -6422,6 +6424,8 @@ out: int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *reader; + unsigned long missed_events; unsigned long reader_size; unsigned long flags; @@ -6448,9 +6452,46 @@ consume: goto out; } - if (WARN_ON(!rb_get_reader_page(cpu_buffer))) + reader = rb_get_reader_page(cpu_buffer); + if (WARN_ON(!reader)) goto out; + /* Check if any events were dropped */ + missed_events = cpu_buffer->lost_events; + + if (cpu_buffer->reader_page != cpu_buffer->commit_page) { + if (missed_events) { + struct buffer_data_page *bpage = reader->page; + unsigned int commit; + /* + * Use the real_end for the data size, + * This gives us a chance to store the lost events + * on the page. + */ + if (reader->real_end) + local_set(&bpage->commit, reader->real_end); + /* + * If there is room at the end of the page to save the + * missed events, then record it there. + */ + commit = rb_page_size(reader); + if (buffer->subbuf_size - commit >= sizeof(missed_events)) { + memcpy(&bpage->data[commit], &missed_events, + sizeof(missed_events)); + local_add(RB_MISSED_STORED, &bpage->commit); + } + local_add(RB_MISSED_EVENTS, &bpage->commit); + } + } else { + /* + * There really shouldn't be any missed events if the commit + * is on the reader page. + */ + WARN_ON_ONCE(missed_events); + } + + cpu_buffer->lost_events = 0; + goto consume; out: -- cgit v1.2.3 From b9c6820f029abaabbc37646093866aa730ca0928 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 15 May 2024 01:05:58 -0400 Subject: ring-buffer: Add cast to unsigned long addr passed to virt_to_page() The sub-buffer pages are held in an unsigned long array, and when it is passed to virt_to_page() a cast is needed. Link: https://lore.kernel.org/all/20240515124808.06279d04@canb.auug.org.au/ Link: https://lore.kernel.org/linux-trace-kernel/20240515010558.4abaefdd@rorschach.local.home Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Fixes: 117c39200d9d ("ring-buffer: Introducing ring-buffer mapping functions") Reported-by: Stephen Rothwell Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a02c7a52a0f5..7345a8b625fb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6283,7 +6283,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, } while (p < nr_pages) { - struct page *page = virt_to_page(cpu_buffer->subbuf_ids[s]); + struct page *page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); int off = 0; if (WARN_ON_ONCE(s >= nr_subbufs)) { -- cgit v1.2.3 From ea70a9628e93168bb5a07b35f7b9a162a65afe9e Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Fri, 17 May 2024 15:40:07 +0200 Subject: ring-buffer: Correct stale comments related to non-consuming readers Adjust the following code documentation: * Kernel-doc comments for ring_buffer_read_prepare() and ring_buffer_read_finish() mention that recording to the ring buffer is disabled when the read is active. Remove mention of this restriction because it was already lifted in commit 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator"). * Function ring_buffer_read_finish() performs a self-check of the ring-buffer by locking cpu_buffer->reader_lock and then calling rb_check_pages(). The preceding comment explains that the lock is needed because rb_check_pages() clears the HEAD flag required by readers which might be running in parallel. Remove this explanation because commit 8843e06f67b1 ("ring-buffer: Handle race between rb_move_tail and rb_check_pages") simplified the function so it no longer resets the mentioned flag. Nonetheless, the lock is still needed because a reader swapping a page into the ring buffer can make the underlying doubly-linked list temporarily inconsistent. This is a non-functional change. Link: https://lore.kernel.org/linux-trace-kernel/20240517134008.24529-2-petr.pavlu@suse.com Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Signed-off-by: Petr Pavlu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7345a8b625fb..42227727a49d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5046,13 +5046,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * @flags: gfp flags to use for memory allocation * * This performs the initial preparations necessary to iterate - * through the buffer. Memory is allocated, buffer recording + * through the buffer. Memory is allocated, buffer resizing * is disabled, and the iterator pointer is returned to the caller. * - * Disabling buffer recording prevents the reading from being - * corrupted. This is not a consuming read, so a producer is not - * expected. - * * After a sequence of ring_buffer_read_prepare calls, the user is * expected to make at least one call to ring_buffer_read_prepare_sync. * Afterwards, ring_buffer_read_start is invoked to get things going @@ -5139,8 +5135,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_start); * ring_buffer_read_finish - finish reading the iterator of the buffer * @iter: The iterator retrieved by ring_buffer_start * - * This re-enables the recording to the buffer, and frees the - * iterator. + * This re-enables resizing of the buffer, and frees the iterator. */ void ring_buffer_read_finish(struct ring_buffer_iter *iter) @@ -5148,12 +5143,7 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; - /* - * Ring buffer is disabled from recording, here's a good place - * to check the integrity of the ring buffer. - * Must prevent readers from trying to read, as the check - * clears the HEAD page and readers require it. - */ + /* Use this opportunity to check the integrity of the ring buffer. */ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_check_pages(cpu_buffer); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -- cgit v1.2.3 From c2274b908db05529980ec056359fae916939fdaa Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Fri, 17 May 2024 15:40:08 +0200 Subject: ring-buffer: Fix a race between readers and resize checks The reader code in rb_get_reader_page() swaps a new reader page into the ring buffer by doing cmpxchg on old->list.prev->next to point it to the new page. Following that, if the operation is successful, old->list.next->prev gets updated too. This means the underlying doubly-linked list is temporarily inconsistent, page->prev->next or page->next->prev might not be equal back to page for some page in the ring buffer. The resize operation in ring_buffer_resize() can be invoked in parallel. It calls rb_check_pages() which can detect the described inconsistency and stop further tracing: [ 190.271762] ------------[ cut here ]------------ [ 190.271771] WARNING: CPU: 1 PID: 6186 at kernel/trace/ring_buffer.c:1467 rb_check_pages.isra.0+0x6a/0xa0 [ 190.271789] Modules linked in: [...] [ 190.271991] Unloaded tainted modules: intel_uncore_frequency(E):1 skx_edac(E):1 [ 190.272002] CPU: 1 PID: 6186 Comm: cmd.sh Kdump: loaded Tainted: G E 6.9.0-rc6-default #5 158d3e1e6d0b091c34c3b96bfd99a1c58306d79f [ 190.272011] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552c-rebuilt.opensuse.org 04/01/2014 [ 190.272015] RIP: 0010:rb_check_pages.isra.0+0x6a/0xa0 [ 190.272023] Code: [...] [ 190.272028] RSP: 0018:ffff9c37463abb70 EFLAGS: 00010206 [ 190.272034] RAX: ffff8eba04b6cb80 RBX: 0000000000000007 RCX: ffff8eba01f13d80 [ 190.272038] RDX: ffff8eba01f130c0 RSI: ffff8eba04b6cd00 RDI: ffff8eba0004c700 [ 190.272042] RBP: ffff8eba0004c700 R08: 0000000000010002 R09: 0000000000000000 [ 190.272045] R10: 00000000ffff7f52 R11: ffff8eba7f600000 R12: ffff8eba0004c720 [ 190.272049] R13: ffff8eba00223a00 R14: 0000000000000008 R15: ffff8eba067a8000 [ 190.272053] FS: 00007f1bd64752c0(0000) GS:ffff8eba7f680000(0000) knlGS:0000000000000000 [ 190.272057] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 190.272061] CR2: 00007f1bd6662590 CR3: 000000010291e001 CR4: 0000000000370ef0 [ 190.272070] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 190.272073] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 190.272077] Call Trace: [ 190.272098] [ 190.272189] ring_buffer_resize+0x2ab/0x460 [ 190.272199] __tracing_resize_ring_buffer.part.0+0x23/0xa0 [ 190.272206] tracing_resize_ring_buffer+0x65/0x90 [ 190.272216] tracing_entries_write+0x74/0xc0 [ 190.272225] vfs_write+0xf5/0x420 [ 190.272248] ksys_write+0x67/0xe0 [ 190.272256] do_syscall_64+0x82/0x170 [ 190.272363] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 190.272373] RIP: 0033:0x7f1bd657d263 [ 190.272381] Code: [...] [ 190.272385] RSP: 002b:00007ffe72b643f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 190.272391] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f1bd657d263 [ 190.272395] RDX: 0000000000000002 RSI: 0000555a6eb538e0 RDI: 0000000000000001 [ 190.272398] RBP: 0000555a6eb538e0 R08: 000000000000000a R09: 0000000000000000 [ 190.272401] R10: 0000555a6eb55190 R11: 0000000000000246 R12: 00007f1bd6662500 [ 190.272404] R13: 0000000000000002 R14: 00007f1bd6667c00 R15: 0000000000000002 [ 190.272412] [ 190.272414] ---[ end trace 0000000000000000 ]--- Note that ring_buffer_resize() calls rb_check_pages() only if the parent trace_buffer has recording disabled. Recent commit d78ab792705c ("tracing: Stop current tracer when resizing buffer") causes that it is now always the case which makes it more likely to experience this issue. The window to hit this race is nonetheless very small. To help reproducing it, one can add a delay loop in rb_get_reader_page(): ret = rb_head_page_replace(reader, cpu_buffer->reader_page); if (!ret) goto spin; for (unsigned i = 0; i < 1U << 26; i++) /* inserted delay loop */ __asm__ __volatile__ ("" : : : "memory"); rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; .. and then run the following commands on the target system: echo 1 > /sys/kernel/tracing/events/sched/sched_switch/enable while true; do echo 16 > /sys/kernel/tracing/buffer_size_kb; sleep 0.1 echo 8 > /sys/kernel/tracing/buffer_size_kb; sleep 0.1 done & while true; do for i in /sys/kernel/tracing/per_cpu/*; do timeout 0.1 cat $i/trace_pipe; sleep 0.2 done done To fix the problem, make sure ring_buffer_resize() doesn't invoke rb_check_pages() concurrently with a reader operating on the same ring_buffer_per_cpu by taking its cpu_buffer->reader_lock. Link: https://lore.kernel.org/linux-trace-kernel/20240517134008.24529-3-petr.pavlu@suse.com Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Fixes: 659f451ff213 ("ring-buffer: Add integrity check at end of iter read") Signed-off-by: Petr Pavlu [ Fixed whitespace ] Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 42227727a49d..28853966aa9a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1460,6 +1460,11 @@ static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, * * As a safety measure we check to make sure the data pages have not * been corrupted. + * + * Callers of this function need to guarantee that the list of pages doesn't get + * modified during the check. In particular, if it's possible that the function + * is invoked with concurrent readers which can swap in a new reader page then + * the caller should take cpu_buffer->reader_lock. */ static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { @@ -2210,8 +2215,12 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, */ synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { + unsigned long flags; + cpu_buffer = buffer->buffers[cpu]; + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_check_pages(cpu_buffer); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } atomic_dec(&buffer->record_disabled); } -- cgit v1.2.3