From 318580ad7f2828f6269e0b8819e943ddedda3375 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Thu, 29 Aug 2024 14:41:09 +0800 Subject: hugetlbfs: support tracepoint Add basic tracepoints for {alloc, evict, free}_inode, setattr and fallocate. These can help users to debug hugetlbfs more conveniently. Signed-off-by: Hongbo Li Link: https://lore.kernel.org/r/20240829064110.67884-2-lihongbo22@huawei.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Christian Brauner --- include/trace/events/hugetlbfs.h | 156 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 include/trace/events/hugetlbfs.h (limited to 'include/trace') diff --git a/include/trace/events/hugetlbfs.h b/include/trace/events/hugetlbfs.h new file mode 100644 index 000000000000..8331c904a9ba --- /dev/null +++ b/include/trace/events/hugetlbfs.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hugetlbfs + +#if !defined(_TRACE_HUGETLBFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HUGETLBFS_H + +#include + +TRACE_EVENT(hugetlbfs_alloc_inode, + + TP_PROTO(struct inode *inode, struct inode *dir, int mode), + + TP_ARGS(inode, dir, mode), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(ino_t, dir) + __field(__u16, mode) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->dir = dir->i_ino; + __entry->mode = mode; + ), + + TP_printk("dev %d,%d ino %lu dir %lu mode 0%o", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long) __entry->dir, __entry->mode) +); + +DECLARE_EVENT_CLASS(hugetlbfs__inode, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(__u16, mode) + __field(loff_t, size) + __field(unsigned int, nlink) + __field(unsigned int, seals) + __field(blkcnt_t, blocks) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; + __entry->size = inode->i_size; + __entry->nlink = inode->i_nlink; + __entry->seals = HUGETLBFS_I(inode)->seals; + __entry->blocks = inode->i_blocks; + ), + + TP_printk("dev %d,%d ino %lu mode 0%o size %lld nlink %u seals %u blocks %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, + __entry->mode, __entry->size, __entry->nlink, __entry->seals, + (unsigned long long)__entry->blocks) +); + +DEFINE_EVENT(hugetlbfs__inode, hugetlbfs_evict_inode, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +DEFINE_EVENT(hugetlbfs__inode, hugetlbfs_free_inode, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +TRACE_EVENT(hugetlbfs_setattr, + + TP_PROTO(struct inode *inode, struct dentry *dentry, + struct iattr *attr), + + TP_ARGS(inode, dentry, attr), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(unsigned int, d_len) + __string(d_name, dentry->d_name.name) + __field(unsigned int, ia_valid) + __field(unsigned int, ia_mode) + __field(loff_t, old_size) + __field(loff_t, ia_size) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->d_len = dentry->d_name.len; + __assign_str(d_name); + __entry->ia_valid = attr->ia_valid; + __entry->ia_mode = attr->ia_mode; + __entry->old_size = inode->i_size; + __entry->ia_size = attr->ia_size; + ), + + TP_printk("dev %d,%d ino %lu name %.*s valid %#x mode 0%o old_size %lld size %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long)__entry->ino, + __entry->d_len, __get_str(d_name), __entry->ia_valid, __entry->ia_mode, + __entry->old_size, __entry->ia_size) +); + +TRACE_EVENT(hugetlbfs_fallocate, + + TP_PROTO(struct inode *inode, int mode, + loff_t offset, loff_t len, int ret), + + TP_ARGS(inode, mode, offset, len, ret), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(int, mode) + __field(loff_t, offset) + __field(loff_t, len) + __field(loff_t, size) + __field(int, ret) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = mode; + __entry->offset = offset; + __entry->len = len; + __entry->size = inode->i_size; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d ino %lu mode 0%o offset %lld len %lld size %lld ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long)__entry->ino, __entry->mode, + (unsigned long long)__entry->offset, + (unsigned long long)__entry->len, + (unsigned long long)__entry->size, + __entry->ret) +); + +#endif /* _TRACE_HUGETLBFS_H */ + + /* This part must be outside protection */ +#include -- cgit v1.2.3 From 1afd01db1a76cdd1d96696e3790d66c79621784c Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 20 Sep 2024 10:58:00 +0200 Subject: pwm: Add tracing for waveform callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds trace events for the recently introduced waveform callbacks. With the introduction of some helper macros consistency among the different events is ensured. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/1d71879b0de3bf01459c7a9d0f040d43eb5ace56.1726819463.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/core.c | 24 ++++++-- include/trace/events/pwm.h | 134 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 146 insertions(+), 12 deletions(-) (limited to 'include/trace') diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 038f17dd2757..c3bdebf4cf6e 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -164,30 +164,46 @@ static int __pwm_round_waveform_tohw(struct pwm_chip *chip, struct pwm_device *p const struct pwm_waveform *wf, void *wfhw) { const struct pwm_ops *ops = chip->ops; + int ret; + + ret = ops->round_waveform_tohw(chip, pwm, wf, wfhw); + trace_pwm_round_waveform_tohw(pwm, wf, wfhw, ret); - return ops->round_waveform_tohw(chip, pwm, wf, wfhw); + return ret; } static int __pwm_round_waveform_fromhw(struct pwm_chip *chip, struct pwm_device *pwm, const void *wfhw, struct pwm_waveform *wf) { const struct pwm_ops *ops = chip->ops; + int ret; + + ret = ops->round_waveform_fromhw(chip, pwm, wfhw, wf); + trace_pwm_round_waveform_fromhw(pwm, wfhw, wf, ret); - return ops->round_waveform_fromhw(chip, pwm, wfhw, wf); + return ret; } static int __pwm_read_waveform(struct pwm_chip *chip, struct pwm_device *pwm, void *wfhw) { const struct pwm_ops *ops = chip->ops; + int ret; + + ret = ops->read_waveform(chip, pwm, wfhw); + trace_pwm_read_waveform(pwm, wfhw, ret); - return ops->read_waveform(chip, pwm, wfhw); + return ret; } static int __pwm_write_waveform(struct pwm_chip *chip, struct pwm_device *pwm, const void *wfhw) { const struct pwm_ops *ops = chip->ops; + int ret; + + ret = ops->write_waveform(chip, pwm, wfhw); + trace_pwm_write_waveform(pwm, wfhw, ret); - return ops->write_waveform(chip, pwm, wfhw); + return ret; } #define WFHWSIZE 20 diff --git a/include/trace/events/pwm.h b/include/trace/events/pwm.h index 8022701c446d..8ba898fd335c 100644 --- a/include/trace/events/pwm.h +++ b/include/trace/events/pwm.h @@ -8,15 +8,135 @@ #include #include +#define TP_PROTO_pwm(args...) \ + TP_PROTO(struct pwm_device *pwm, args) + +#define TP_ARGS_pwm(args...) \ + TP_ARGS(pwm, args) + +#define TP_STRUCT__entry_pwm(args...) \ + TP_STRUCT__entry( \ + __field(unsigned int, chipid) \ + __field(unsigned int, hwpwm) \ + args) + +#define TP_fast_assign_pwm(args...) \ + TP_fast_assign( \ + __entry->chipid = pwm->chip->id; \ + __entry->hwpwm = pwm->hwpwm; \ + args) + +#define TP_printk_pwm(fmt, args...) \ + TP_printk("pwmchip%u.%u: " fmt, __entry->chipid, __entry->hwpwm, args) + +#define __field_pwmwf(wf) \ + __field(u64, wf ## _period_length_ns) \ + __field(u64, wf ## _duty_length_ns) \ + __field(u64, wf ## _duty_offset_ns) \ + +#define fast_assign_pwmwf(wf) \ + __entry->wf ## _period_length_ns = wf->period_length_ns; \ + __entry->wf ## _duty_length_ns = wf->duty_length_ns; \ + __entry->wf ## _duty_offset_ns = wf->duty_offset_ns + +#define printk_pwmwf_format(wf) \ + "%lld/%lld [+%lld]" + +#define printk_pwmwf_formatargs(wf) \ + __entry->wf ## _duty_length_ns, __entry->wf ## _period_length_ns, __entry->wf ## _duty_offset_ns + +TRACE_EVENT(pwm_round_waveform_tohw, + + TP_PROTO_pwm(const struct pwm_waveform *wf, void *wfhw, int err), + + TP_ARGS_pwm(wf, wfhw, err), + + TP_STRUCT__entry_pwm( + __field_pwmwf(wf) + __field(void *, wfhw) + __field(int, err) + ), + + TP_fast_assign_pwm( + fast_assign_pwmwf(wf); + __entry->wfhw = wfhw; + __entry->err = err; + ), + + TP_printk_pwm(printk_pwmwf_format(wf) " > %p err=%d", + printk_pwmwf_formatargs(wf), __entry->wfhw, __entry->err) +); + +TRACE_EVENT(pwm_round_waveform_fromhw, + + TP_PROTO_pwm(const void *wfhw, struct pwm_waveform *wf, int err), + + TP_ARGS_pwm(wfhw, wf, err), + + TP_STRUCT__entry_pwm( + __field(const void *, wfhw) + __field_pwmwf(wf) + __field(int, err) + ), + + TP_fast_assign_pwm( + __entry->wfhw = wfhw; + fast_assign_pwmwf(wf); + __entry->err = err; + ), + + TP_printk_pwm("%p > " printk_pwmwf_format(wf) " err=%d", + __entry->wfhw, printk_pwmwf_formatargs(wf), __entry->err) +); + +TRACE_EVENT(pwm_read_waveform, + + TP_PROTO_pwm(void *wfhw, int err), + + TP_ARGS_pwm(wfhw, err), + + TP_STRUCT__entry_pwm( + __field(void *, wfhw) + __field(int, err) + ), + + TP_fast_assign_pwm( + __entry->wfhw = wfhw; + __entry->err = err; + ), + + TP_printk_pwm("%p err=%d", + __entry->wfhw, __entry->err) +); + +TRACE_EVENT(pwm_write_waveform, + + TP_PROTO_pwm(const void *wfhw, int err), + + TP_ARGS_pwm(wfhw, err), + + TP_STRUCT__entry_pwm( + __field(const void *, wfhw) + __field(int, err) + ), + + TP_fast_assign_pwm( + __entry->wfhw = wfhw; + __entry->err = err; + ), + + TP_printk_pwm("%p err=%d", + __entry->wfhw, __entry->err) +); + + DECLARE_EVENT_CLASS(pwm, TP_PROTO(struct pwm_device *pwm, const struct pwm_state *state, int err), TP_ARGS(pwm, state, err), - TP_STRUCT__entry( - __field(unsigned int, chipid) - __field(unsigned int, hwpwm) + TP_STRUCT__entry_pwm( __field(u64, period) __field(u64, duty_cycle) __field(enum pwm_polarity, polarity) @@ -24,9 +144,7 @@ DECLARE_EVENT_CLASS(pwm, __field(int, err) ), - TP_fast_assign( - __entry->chipid = pwm->chip->id; - __entry->hwpwm = pwm->hwpwm; + TP_fast_assign_pwm( __entry->period = state->period; __entry->duty_cycle = state->duty_cycle; __entry->polarity = state->polarity; @@ -34,8 +152,8 @@ DECLARE_EVENT_CLASS(pwm, __entry->err = err; ), - TP_printk("pwmchip%u.%u: period=%llu duty_cycle=%llu polarity=%d enabled=%d err=%d", - __entry->chipid, __entry->hwpwm, __entry->period, __entry->duty_cycle, + TP_printk_pwm("period=%llu duty_cycle=%llu polarity=%d enabled=%d err=%d", + __entry->period, __entry->duty_cycle, __entry->polarity, __entry->enabled, __entry->err) ); -- cgit v1.2.3 From fcd4904e2f6908d5c255fa5818bcf8ad32a6f0e8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 19:23:03 +0100 Subject: netfs: Remove call to folio_index() Calling folio_index() is pointless overhead; directly dereferencing folio->index is fine. Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20241005182307.3190401-2-willy@infradead.org Signed-off-by: Christian Brauner --- include/trace/events/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/trace') diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 1d7c52821e55..72a208fd4496 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -451,7 +451,7 @@ TRACE_EVENT(netfs_folio, struct address_space *__m = READ_ONCE(folio->mapping); __entry->ino = __m ? __m->host->i_ino : 0; __entry->why = why; - __entry->index = folio_index(folio); + __entry->index = folio->index; __entry->nr = folio_nr_pages(folio); ), -- cgit v1.2.3 From 48bcda6848232667f13b4e97588de488c83c37d4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Oct 2024 18:16:29 -0400 Subject: tracing: Remove definition of trace_*_rcuidle() The trace_*_rcuidle() variant of a tracepoint was to handle places where a tracepoint was located but RCU was not "watching". All those locations have been removed, and RCU should be watching where all tracepoints are located. We can now remove the trace_*_rcuidle() variant. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Cc: Joel Fernandes Link: https://lore.kernel.org/20241003181629.36209057@gandalf.local.home Acked-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 50 ++------------------------------------- include/trace/events/preemptirq.h | 8 ------- kernel/trace/trace_preemptirq.c | 26 +++++--------------- scripts/tags.sh | 2 -- 4 files changed, 8 insertions(+), 78 deletions(-) (limited to 'include/trace') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 2a29334bbc02..7e4af7b3633c 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -196,67 +196,25 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DO_TRACE_CALL(name, args) __traceiter_##name(NULL, args) #endif /* CONFIG_HAVE_STATIC_CALL */ -/* - * ARCH_WANTS_NO_INSTR archs are expected to have sanitized entry and idle - * code that disallow any/all tracing/instrumentation when RCU isn't watching. - */ -#ifdef CONFIG_ARCH_WANTS_NO_INSTR -#define RCUIDLE_COND(rcuidle) (rcuidle) -#else -/* srcu can't be used from NMI */ -#define RCUIDLE_COND(rcuidle) (rcuidle && in_nmi()) -#endif - /* * it_func[0] is never NULL because there is at least one element in the array * when the array itself is non NULL. */ -#define __DO_TRACE(name, args, cond, rcuidle) \ +#define __DO_TRACE(name, args, cond) \ do { \ int __maybe_unused __idx = 0; \ \ if (!(cond)) \ return; \ \ - if (WARN_ONCE(RCUIDLE_COND(rcuidle), \ - "Bad RCU usage for tracepoint")) \ - return; \ - \ /* keep srcu and sched-rcu usage consistent */ \ preempt_disable_notrace(); \ \ - /* \ - * For rcuidle callers, use srcu since sched-rcu \ - * doesn't work from the idle path. \ - */ \ - if (rcuidle) { \ - __idx = srcu_read_lock_notrace(&tracepoint_srcu);\ - ct_irq_enter_irqson(); \ - } \ - \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ \ - if (rcuidle) { \ - ct_irq_exit_irqson(); \ - srcu_read_unlock_notrace(&tracepoint_srcu, __idx);\ - } \ - \ preempt_enable_notrace(); \ } while (0) -#ifndef MODULE -#define __DECLARE_TRACE_RCU(name, proto, args, cond) \ - static inline void trace_##name##_rcuidle(proto) \ - { \ - if (static_branch_unlikely(&__tracepoint_##name.key)) \ - __DO_TRACE(name, \ - TP_ARGS(args), \ - TP_CONDITION(cond), 1); \ - } -#else -#define __DECLARE_TRACE_RCU(name, proto, args, cond) -#endif - /* * Make sure the alignment of the structure in the __tracepoints section will * not add unwanted padding between the beginning of the section and the @@ -277,14 +235,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) if (static_branch_unlikely(&__tracepoint_##name.key)) \ __DO_TRACE(name, \ TP_ARGS(args), \ - TP_CONDITION(cond), 0); \ + TP_CONDITION(cond)); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ } \ } \ - __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \ - PARAMS(cond)) \ static inline int \ register_trace_##name(void (*probe)(data_proto), void *data) \ { \ @@ -375,8 +331,6 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ static inline void trace_##name(proto) \ { } \ - static inline void trace_##name##_rcuidle(proto) \ - { } \ static inline int \ register_trace_##name(void (*probe)(data_proto), \ void *data) \ diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h index 3f249e150c0c..f99562d2b496 100644 --- a/include/trace/events/preemptirq.h +++ b/include/trace/events/preemptirq.h @@ -43,8 +43,6 @@ DEFINE_EVENT(preemptirq_template, irq_enable, #else #define trace_irq_enable(...) #define trace_irq_disable(...) -#define trace_irq_enable_rcuidle(...) -#define trace_irq_disable_rcuidle(...) #endif #ifdef CONFIG_TRACE_PREEMPT_TOGGLE @@ -58,8 +56,6 @@ DEFINE_EVENT(preemptirq_template, preempt_enable, #else #define trace_preempt_enable(...) #define trace_preempt_disable(...) -#define trace_preempt_enable_rcuidle(...) -#define trace_preempt_disable_rcuidle(...) #endif #endif /* _TRACE_PREEMPTIRQ_H */ @@ -69,10 +65,6 @@ DEFINE_EVENT(preemptirq_template, preempt_enable, #else /* !CONFIG_PREEMPTIRQ_TRACEPOINTS */ #define trace_irq_enable(...) #define trace_irq_disable(...) -#define trace_irq_enable_rcuidle(...) -#define trace_irq_disable_rcuidle(...) #define trace_preempt_enable(...) #define trace_preempt_disable(...) -#define trace_preempt_enable_rcuidle(...) -#define trace_preempt_disable_rcuidle(...) #endif diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index e37446f7916e..5c03633316a6 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -15,20 +15,6 @@ #define CREATE_TRACE_POINTS #include -/* - * Use regular trace points on architectures that implement noinstr - * tooling: these calls will only happen with RCU enabled, which can - * use a regular tracepoint. - * - * On older architectures, use the rcuidle tracing methods (which - * aren't NMI-safe - so exclude NMI contexts): - */ -#ifdef CONFIG_ARCH_WANTS_NO_INSTR -#define trace(point) trace_##point -#else -#define trace(point) if (!in_nmi()) trace_##point##_rcuidle -#endif - #ifdef CONFIG_TRACE_IRQFLAGS /* Per-cpu variable to prevent redundant calls when IRQs already off */ static DEFINE_PER_CPU(int, tracing_irq_cpu); @@ -42,7 +28,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu); void trace_hardirqs_on_prepare(void) { if (this_cpu_read(tracing_irq_cpu)) { - trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1); + trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -53,7 +39,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare); void trace_hardirqs_on(void) { if (this_cpu_read(tracing_irq_cpu)) { - trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1); + trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -75,7 +61,7 @@ void trace_hardirqs_off_finish(void) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); - trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1); + trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); } } @@ -89,7 +75,7 @@ void trace_hardirqs_off(void) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); - trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1); + trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); } } EXPORT_SYMBOL(trace_hardirqs_off); @@ -100,13 +86,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off); void trace_preempt_on(unsigned long a0, unsigned long a1) { - trace(preempt_enable)(a0, a1); + trace_preempt_enable(a0, a1); tracer_preempt_on(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - trace(preempt_disable)(a0, a1); + trace_preempt_disable(a0, a1); tracer_preempt_off(a0, a1); } #endif diff --git a/scripts/tags.sh b/scripts/tags.sh index 191e0461d6d5..0d01c1cafb70 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -152,9 +152,7 @@ regex_c=( '/^BPF_CALL_[0-9]([[:space:]]*\([[:alnum:]_]*\).*/\1/' '/^COMPAT_SYSCALL_DEFINE[0-9]([[:space:]]*\([[:alnum:]_]*\).*/compat_sys_\1/' '/^TRACE_EVENT([[:space:]]*\([[:alnum:]_]*\).*/trace_\1/' - '/^TRACE_EVENT([[:space:]]*\([[:alnum:]_]*\).*/trace_\1_rcuidle/' '/^DEFINE_EVENT([^,)]*,[[:space:]]*\([[:alnum:]_]*\).*/trace_\1/' - '/^DEFINE_EVENT([^,)]*,[[:space:]]*\([[:alnum:]_]*\).*/trace_\1_rcuidle/' '/^DEFINE_INSN_CACHE_OPS([[:space:]]*\([[:alnum:]_]*\).*/get_\1_slot/' '/^DEFINE_INSN_CACHE_OPS([[:space:]]*\([[:alnum:]_]*\).*/free_\1_slot/' '/^PAGEFLAG([[:space:]]*\([[:alnum:]_]*\).*/Page\1/' -- cgit v1.2.3 From 0e6caab8db8bc9359d1a8363e1ee9b2205ed704d Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:07:11 -0400 Subject: tracing: Declare system call tracepoints with TRACE_EVENT_SYSCALL In preparation for allowing system call tracepoints to handle page faults, introduce TRACE_EVENT_SYSCALL to declare the sys_enter/sys_exit tracepoints. Move the common code between __DECLARE_TRACE and __DECLARE_TRACE_SYSCALL into __DECLARE_TRACE_COMMON. This change is not meant to alter the generated code, and only prepares the following modifications. Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Link: https://lore.kernel.org/20241009010718.2050182-2-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 53 +++++++++++++++++++++++++++++++---------- include/trace/bpf_probe.h | 3 +++ include/trace/define_trace.h | 5 ++++ include/trace/events/syscalls.h | 4 ++-- include/trace/perf.h | 3 +++ include/trace/trace_events.h | 28 ++++++++++++++++++++++ 6 files changed, 81 insertions(+), 15 deletions(-) (limited to 'include/trace') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 3d33b9872cec..76e441b39a96 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -197,7 +197,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * it_func[0] is never NULL because there is at least one element in the array * when the array itself is non NULL. */ -#define __DO_TRACE(name, args, cond) \ +#define __DO_TRACE(name, args, cond, syscall) \ do { \ int __maybe_unused __idx = 0; \ \ @@ -222,21 +222,10 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * site if it is not watching, as it will need to be active when the * tracepoint is enabled. */ -#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ +#define __DECLARE_TRACE_COMMON(name, proto, args, cond, data_proto) \ extern int __traceiter_##name(data_proto); \ DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \ extern struct tracepoint __tracepoint_##name; \ - static inline void trace_##name(proto) \ - { \ - if (static_branch_unlikely(&__tracepoint_##name.key)) \ - __DO_TRACE(name, \ - TP_ARGS(args), \ - TP_CONDITION(cond)); \ - if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ - WARN_ONCE(!rcu_is_watching(), \ - "RCU not watching for tracepoint"); \ - } \ - } \ static inline int \ register_trace_##name(void (*probe)(data_proto), void *data) \ { \ @@ -266,6 +255,34 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) return static_branch_unlikely(&__tracepoint_##name.key);\ } +#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ + __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \ + static inline void trace_##name(proto) \ + { \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ + __DO_TRACE(name, \ + TP_ARGS(args), \ + TP_CONDITION(cond), 0); \ + if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ + WARN_ONCE(!rcu_is_watching(), \ + "RCU not watching for tracepoint"); \ + } \ + } + +#define __DECLARE_TRACE_SYSCALL(name, proto, args, cond, data_proto) \ + __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \ + static inline void trace_##name(proto) \ + { \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ + __DO_TRACE(name, \ + TP_ARGS(args), \ + TP_CONDITION(cond), 1); \ + if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ + WARN_ONCE(!rcu_is_watching(), \ + "RCU not watching for tracepoint"); \ + } \ + } + /* * We have no guarantee that gcc and the linker won't up-align the tracepoint * structures, so we create an array of pointers that will be used for iteration @@ -348,6 +365,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) return false; \ } +#define __DECLARE_TRACE_SYSCALL __DECLARE_TRACE + #define DEFINE_TRACE_FN(name, reg, unreg, proto, args) #define DEFINE_TRACE(name, proto, args) #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) @@ -409,6 +428,11 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \ PARAMS(void *__data, proto)) +#define DECLARE_TRACE_SYSCALL(name, proto, args) \ + __DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args), \ + cpu_online(raw_smp_processor_id()), \ + PARAMS(void *__data, proto)) + #define TRACE_EVENT_FLAGS(event, flag) #define TRACE_EVENT_PERF_PERM(event, expr...) @@ -546,6 +570,9 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) struct, assign, print) \ DECLARE_TRACE_CONDITION(name, PARAMS(proto), \ PARAMS(args), PARAMS(cond)) +#define TRACE_EVENT_SYSCALL(name, proto, args, struct, assign, \ + print, reg, unreg) \ + DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args)) #define TRACE_EVENT_FLAGS(event, flag) diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index a2ea11cc912e..c85bbce5aaa5 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -53,6 +53,9 @@ __bpf_trace_##call(void *__data, proto) \ #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ __BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS + /* * This part is compiled out, it is only here as a build time check * to make sure that if the tracepoint handling changes, the diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index 00723935dcc7..ff5fa17a6259 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -46,6 +46,10 @@ assign, print, reg, unreg) \ DEFINE_TRACE_FN(name, reg, unreg, PARAMS(proto), PARAMS(args)) +#undef TRACE_EVENT_SYSCALL +#define TRACE_EVENT_SYSCALL(name, proto, args, struct, assign, print, reg, unreg) \ + DEFINE_TRACE_FN(name, reg, unreg, PARAMS(proto), PARAMS(args)) + #undef TRACE_EVENT_NOP #define TRACE_EVENT_NOP(name, proto, args, struct, assign, print) @@ -107,6 +111,7 @@ #undef TRACE_EVENT #undef TRACE_EVENT_FN #undef TRACE_EVENT_FN_COND +#undef TRACE_EVENT_SYSCALL #undef TRACE_EVENT_CONDITION #undef TRACE_EVENT_NOP #undef DEFINE_EVENT_NOP diff --git a/include/trace/events/syscalls.h b/include/trace/events/syscalls.h index b6e0cbc2c71f..f31ff446b468 100644 --- a/include/trace/events/syscalls.h +++ b/include/trace/events/syscalls.h @@ -15,7 +15,7 @@ #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS -TRACE_EVENT_FN(sys_enter, +TRACE_EVENT_SYSCALL(sys_enter, TP_PROTO(struct pt_regs *regs, long id), @@ -41,7 +41,7 @@ TRACE_EVENT_FN(sys_enter, TRACE_EVENT_FLAGS(sys_enter, TRACE_EVENT_FL_CAP_ANY) -TRACE_EVENT_FN(sys_exit, +TRACE_EVENT_SYSCALL(sys_exit, TP_PROTO(struct pt_regs *regs, long ret), diff --git a/include/trace/perf.h b/include/trace/perf.h index 2c11181c82e0..ded997af481e 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -55,6 +55,9 @@ perf_trace_##call(void *__data, proto) \ head, __task); \ } +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS + /* * This part is compiled out, it is only here as a build time check * to make sure that if the tracepoint handling changes, the diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index c2f9cabf154d..8bcbb9ee44de 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -45,6 +45,16 @@ PARAMS(print)); \ DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args)); +#undef TRACE_EVENT_SYSCALL +#define TRACE_EVENT_SYSCALL(name, proto, args, tstruct, assign, print, reg, unreg) \ + DECLARE_EVENT_SYSCALL_CLASS(name, \ + PARAMS(proto), \ + PARAMS(args), \ + PARAMS(tstruct), \ + PARAMS(assign), \ + PARAMS(print)); \ + DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args)); + #include "stages/stage1_struct_define.h" #undef DECLARE_EVENT_CLASS @@ -57,6 +67,9 @@ \ static struct trace_event_class event_class_##name; +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS + #undef DEFINE_EVENT #define DEFINE_EVENT(template, name, proto, args) \ static struct trace_event_call __used \ @@ -117,6 +130,9 @@ tstruct; \ }; +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS + #undef DEFINE_EVENT #define DEFINE_EVENT(template, name, proto, args) @@ -208,6 +224,9 @@ static struct trace_event_functions trace_event_type_funcs_##call = { \ .trace = trace_raw_output_##call, \ }; +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS + #undef DEFINE_EVENT_PRINT #define DEFINE_EVENT_PRINT(template, call, proto, args, print) \ static notrace enum print_line_t \ @@ -265,6 +284,9 @@ static inline notrace int trace_event_get_offsets_##call( \ return __data_size; \ } +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS + #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) /* @@ -409,6 +431,9 @@ trace_event_raw_event_##call(void *__data, proto) \ * fail to compile unless it too is updated. */ +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS + #undef DEFINE_EVENT #define DEFINE_EVENT(template, call, proto, args) \ static inline void ftrace_test_probe_##call(void) \ @@ -434,6 +459,9 @@ static struct trace_event_class __used __refdata event_class_##call = { \ _TRACE_PERF_INIT(call) \ }; +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS + #undef DEFINE_EVENT #define DEFINE_EVENT(template, call, proto, args) \ \ -- cgit v1.2.3 From 13d750c2c03e9861e15268574ed2c239cca9c9d5 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:07:12 -0400 Subject: tracing/ftrace: disable preemption in syscall probe In preparation for allowing system call enter/exit instrumentation to handle page faults, make sure that ftrace can handle this change by explicitly disabling preemption within the ftrace system call tracepoint probes to respect the current expectations within ftrace ring buffer code. This change does not yet allow ftrace to take page faults per se within its probe, but allows its existing probes to adapt to the upcoming change. Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Link: https://lore.kernel.org/20241009010718.2050182-3-mathieu.desnoyers@efficios.com Acked-by: Masami Hiramatsu (Google) Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/trace/trace_events.h | 39 ++++++++++++++++++++++++++++++++------- kernel/trace/trace_syscalls.c | 12 ++++++++++++ 2 files changed, 44 insertions(+), 7 deletions(-) (limited to 'include/trace') diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 8bcbb9ee44de..63071aa5923d 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -263,6 +263,9 @@ static struct trace_event_fields trace_event_fields_##call[] = { \ tstruct \ {} }; +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS + #undef DEFINE_EVENT_PRINT #define DEFINE_EVENT_PRINT(template, name, proto, args, print) @@ -396,11 +399,11 @@ static inline notrace int trace_event_get_offsets_##call( \ #include "stages/stage6_event_callback.h" -#undef DECLARE_EVENT_CLASS -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ - \ + +#undef __DECLARE_EVENT_CLASS +#define __DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static notrace void \ -trace_event_raw_event_##call(void *__data, proto) \ +do_trace_event_raw_event_##call(void *__data, proto) \ { \ struct trace_event_file *trace_file = __data; \ struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\ @@ -425,15 +428,35 @@ trace_event_raw_event_##call(void *__data, proto) \ \ trace_event_buffer_commit(&fbuffer); \ } + +#undef DECLARE_EVENT_CLASS +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ + PARAMS(assign), PARAMS(print)) \ +static notrace void \ +trace_event_raw_event_##call(void *__data, proto) \ +{ \ + do_trace_event_raw_event_##call(__data, args); \ +} + +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS(call, proto, args, tstruct, assign, print) \ +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ + PARAMS(assign), PARAMS(print)) \ +static notrace void \ +trace_event_raw_event_##call(void *__data, proto) \ +{ \ + preempt_disable_notrace(); \ + do_trace_event_raw_event_##call(__data, args); \ + preempt_enable_notrace(); \ +} + /* * The ftrace_test_probe is compiled out, it is only here as a build time check * to make sure that if the tracepoint handling changes, the ftrace probe will * fail to compile unless it too is updated. */ -#undef DECLARE_EVENT_SYSCALL_CLASS -#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS - #undef DEFINE_EVENT #define DEFINE_EVENT(template, call, proto, args) \ static inline void ftrace_test_probe_##call(void) \ @@ -443,6 +466,8 @@ static inline void ftrace_test_probe_##call(void) \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#undef __DECLARE_EVENT_CLASS + #include "stages/stage7_class_define.h" #undef DECLARE_EVENT_CLASS diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 785733245ead..f9b21bac9d45 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -299,6 +299,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) int syscall_nr; int size; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; @@ -338,6 +344,12 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) struct trace_event_buffer fbuffer; int syscall_nr; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; -- cgit v1.2.3 From 65e7462a16cea593025ca3b34c5d74e69b027ee0 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:07:13 -0400 Subject: tracing/perf: disable preemption in syscall probe In preparation for allowing system call enter/exit instrumentation to handle page faults, make sure that perf can handle this change by explicitly disabling preemption within the perf system call tracepoint probes to respect the current expectations within perf ring buffer code. This change does not yet allow perf to take page faults per se within its probe, but allows its existing probes to adapt to the upcoming change. Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Link: https://lore.kernel.org/20241009010718.2050182-4-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/trace/perf.h | 42 ++++++++++++++++++++++++++++++++++++++---- kernel/trace/trace_syscalls.c | 12 ++++++++++++ 2 files changed, 50 insertions(+), 4 deletions(-) (limited to 'include/trace') diff --git a/include/trace/perf.h b/include/trace/perf.h index ded997af481e..15cde7eac8b4 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -12,10 +12,10 @@ #undef __perf_task #define __perf_task(t) (__task = (t)) -#undef DECLARE_EVENT_CLASS -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +#undef __DECLARE_EVENT_CLASS +#define __DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static notrace void \ -perf_trace_##call(void *__data, proto) \ +do_perf_trace_##call(void *__data, proto) \ { \ struct trace_event_call *event_call = __data; \ struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\ @@ -55,8 +55,39 @@ perf_trace_##call(void *__data, proto) \ head, __task); \ } +/* + * Define unused __count and __task variables to use @args to pass + * arguments to do_perf_trace_##call. This is needed because the + * macros __perf_count and __perf_task introduce the side-effect to + * store copies into those local variables. + */ +#undef DECLARE_EVENT_CLASS +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ + PARAMS(assign), PARAMS(print)) \ +static notrace void \ +perf_trace_##call(void *__data, proto) \ +{ \ + u64 __count __attribute__((unused)); \ + struct task_struct *__task __attribute__((unused)); \ + \ + do_perf_trace_##call(__data, args); \ +} + #undef DECLARE_EVENT_SYSCALL_CLASS -#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS(call, proto, args, tstruct, assign, print) \ +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ + PARAMS(assign), PARAMS(print)) \ +static notrace void \ +perf_trace_##call(void *__data, proto) \ +{ \ + u64 __count __attribute__((unused)); \ + struct task_struct *__task __attribute__((unused)); \ + \ + preempt_disable_notrace(); \ + do_perf_trace_##call(__data, args); \ + preempt_enable_notrace(); \ +} /* * This part is compiled out, it is only here as a build time check @@ -76,4 +107,7 @@ static inline void perf_test_probe_##call(void) \ DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef __DECLARE_EVENT_CLASS + #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f9b21bac9d45..b1cc19806f3d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -596,6 +596,12 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int rctx; int size; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; @@ -698,6 +704,12 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int rctx; int size; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; -- cgit v1.2.3 From 4aadde89d81fbf4cf3105a61dbc48888b819ecfb Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:07:14 -0400 Subject: tracing/bpf: disable preemption in syscall probe In preparation for allowing system call enter/exit instrumentation to handle page faults, make sure that bpf can handle this change by explicitly disabling preemption within the bpf system call tracepoint probes to respect the current expectations within bpf tracing code. This change does not yet allow bpf to take page faults per se within its probe, but allows its existing probes to adapt to the upcoming change. Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Link: https://lore.kernel.org/20241009010718.2050182-5-mathieu.desnoyers@efficios.com Acked-by: Andrii Nakryiko Tested-by: Andrii Nakryiko # BPF parts Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/trace/bpf_probe.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/trace') diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index c85bbce5aaa5..fec97c93e1c9 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -53,8 +53,18 @@ __bpf_trace_##call(void *__data, proto) \ #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ __BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) +#define __BPF_DECLARE_TRACE_SYSCALL(call, proto, args) \ +static notrace void \ +__bpf_trace_##call(void *__data, proto) \ +{ \ + preempt_disable_notrace(); \ + CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(__data, CAST_TO_U64(args)); \ + preempt_enable_notrace(); \ +} + #undef DECLARE_EVENT_SYSCALL_CLASS -#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS(call, proto, args, tstruct, assign, print) \ + __BPF_DECLARE_TRACE_SYSCALL(call, PARAMS(proto), PARAMS(args)) /* * This part is compiled out, it is only here as a build time check -- cgit v1.2.3 From a3204c740a59bebb3b37a294d83f4e353303a52c Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:07:16 -0400 Subject: tracing/ftrace: Add might_fault check to syscall probes Add a might_fault() check to validate that the ftrace sys_enter/sys_exit probe callbacks are indeed called from a context where page faults can be handled. Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Link: https://lore.kernel.org/20241009010718.2050182-7-mathieu.desnoyers@efficios.com Acked-by: Masami Hiramatsu (Google) Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/trace/trace_events.h | 1 + kernel/trace/trace_syscalls.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/trace') diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 63071aa5923d..4f22136fd465 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -446,6 +446,7 @@ __DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ static notrace void \ trace_event_raw_event_##call(void *__data, proto) \ { \ + might_fault(); \ preempt_disable_notrace(); \ do_trace_event_raw_event_##call(__data, args); \ preempt_enable_notrace(); \ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index b1cc19806f3d..6d6bbd56ed92 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -303,6 +303,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. */ + might_fault(); guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); @@ -348,6 +349,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. */ + might_fault(); guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); -- cgit v1.2.3 From cdb537ac417938408ee819992f432c410f2d01a2 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:07:17 -0400 Subject: tracing/perf: Add might_fault check to syscall probes Add a might_fault() check to validate that the perf sys_enter/sys_exit probe callbacks are indeed called from a context where page faults can be handled. Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Link: https://lore.kernel.org/20241009010718.2050182-8-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/trace/perf.h | 1 + kernel/trace/trace_syscalls.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/trace') diff --git a/include/trace/perf.h b/include/trace/perf.h index 15cde7eac8b4..a1754b73a8f5 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -84,6 +84,7 @@ perf_trace_##call(void *__data, proto) \ u64 __count __attribute__((unused)); \ struct task_struct *__task __attribute__((unused)); \ \ + might_fault(); \ preempt_disable_notrace(); \ do_perf_trace_##call(__data, args); \ preempt_enable_notrace(); \ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 6d6bbd56ed92..46aab0ab9350 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -602,6 +602,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. */ + might_fault(); guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); @@ -710,6 +711,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. */ + might_fault(); guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); -- cgit v1.2.3 From 0850e1bc88b1bdc30f7f0b223a92eb22e5f06be0 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:07:18 -0400 Subject: tracing/bpf: Add might_fault check to syscall probes Add a might_fault() check to validate that the bpf sys_enter/sys_exit probe callbacks are indeed called from a context where page faults can be handled. Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Link: https://lore.kernel.org/20241009010718.2050182-9-mathieu.desnoyers@efficios.com Acked-by: Andrii Nakryiko Tested-by: Andrii Nakryiko # BPF parts Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/trace/bpf_probe.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/trace') diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index fec97c93e1c9..183fa2aa2935 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -57,6 +57,7 @@ __bpf_trace_##call(void *__data, proto) \ static notrace void \ __bpf_trace_##call(void *__data, proto) \ { \ + might_fault(); \ preempt_disable_notrace(); \ CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(__data, CAST_TO_U64(args)); \ preempt_enable_notrace(); \ -- cgit v1.2.3 From c86e3c47187ad7fa17f8533a4142453991684b46 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Oct 2024 17:27:21 -0400 Subject: fs: tracepoints around multigrain timestamp events Add some tracepoints around various multigrain timestamp events. Reviewed-by: Josef Bacik Reviewed-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: Steven Rostedt (Google) Tested-by: Randy Dunlap # documentation bits Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20241002-mgtime-v10-6-d1c4717f5284@kernel.org Signed-off-by: Christian Brauner --- fs/inode.c | 9 ++- fs/stat.c | 3 + include/trace/events/timestamp.h | 124 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 include/trace/events/timestamp.h (limited to 'include/trace') diff --git a/fs/inode.c b/fs/inode.c index 7d1ede60e549..f7a25c511d6b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -22,6 +22,9 @@ #include #include #include +#define CREATE_TRACE_POINTS +#include + #include "internal.h" /* @@ -2603,6 +2606,7 @@ EXPORT_SYMBOL(inode_nohighmem); struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts) { + trace_inode_set_ctime_to_ts(inode, &ts); set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec); inode->i_ctime_sec = ts.tv_sec; inode->i_ctime_nsec = ts.tv_nsec; @@ -2689,14 +2693,17 @@ struct timespec64 inode_set_ctime_current(struct inode *inode) } /* No need to cmpxchg if it's exactly the same */ - if (cns == now.tv_nsec && inode->i_ctime_sec == now.tv_sec) + if (cns == now.tv_nsec && inode->i_ctime_sec == now.tv_sec) { + trace_ctime_xchg_skip(inode, &now); goto out; + } cur = cns; retry: /* Try to swap the nsec value into place. */ if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) { /* If swap occurred, then we're (mostly) done */ inode->i_ctime_sec = now.tv_sec; + trace_ctime_ns_xchg(inode, cns, now.tv_nsec, cur); } else { /* * Was the change due to someone marking the old ctime QUERIED? diff --git a/fs/stat.c b/fs/stat.c index dd480bf51a2a..6eb6c39d0037 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -23,6 +23,8 @@ #include #include +#include + #include "internal.h" #include "mount.h" @@ -56,6 +58,7 @@ void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode) if (!(stat->ctime.tv_nsec & I_CTIME_QUERIED)) stat->ctime.tv_nsec = ((u32)atomic_fetch_or(I_CTIME_QUERIED, pcn)); stat->ctime.tv_nsec &= ~I_CTIME_QUERIED; + trace_fill_mg_cmtime(inode, &stat->ctime, &stat->mtime); } EXPORT_SYMBOL(fill_mg_cmtime); diff --git a/include/trace/events/timestamp.h b/include/trace/events/timestamp.h new file mode 100644 index 000000000000..c9e5ec930054 --- /dev/null +++ b/include/trace/events/timestamp.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM timestamp + +#if !defined(_TRACE_TIMESTAMP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TIMESTAMP_H + +#include +#include + +#define CTIME_QUERIED_FLAGS \ + { I_CTIME_QUERIED, "Q" } + +DECLARE_EVENT_CLASS(ctime, + TP_PROTO(struct inode *inode, + struct timespec64 *ctime), + + TP_ARGS(inode, ctime), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(time64_t, ctime_s) + __field(u32, ctime_ns) + __field(u32, gen) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->gen = inode->i_generation; + __entry->ctime_s = ctime->tv_sec; + __entry->ctime_ns = ctime->tv_nsec; + ), + + TP_printk("ino=%d:%d:%ld:%u ctime=%lld.%u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, + __entry->ctime_s, __entry->ctime_ns + ) +); + +DEFINE_EVENT(ctime, inode_set_ctime_to_ts, + TP_PROTO(struct inode *inode, + struct timespec64 *ctime), + TP_ARGS(inode, ctime)); + +DEFINE_EVENT(ctime, ctime_xchg_skip, + TP_PROTO(struct inode *inode, + struct timespec64 *ctime), + TP_ARGS(inode, ctime)); + +TRACE_EVENT(ctime_ns_xchg, + TP_PROTO(struct inode *inode, + u32 old, + u32 new, + u32 cur), + + TP_ARGS(inode, old, new, cur), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(u32, gen) + __field(u32, old) + __field(u32, new) + __field(u32, cur) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->gen = inode->i_generation; + __entry->old = old; + __entry->new = new; + __entry->cur = cur; + ), + + TP_printk("ino=%d:%d:%ld:%u old=%u:%s new=%u cur=%u:%s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, + __entry->old & ~I_CTIME_QUERIED, + __print_flags(__entry->old & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS), + __entry->new, + __entry->cur & ~I_CTIME_QUERIED, + __print_flags(__entry->cur & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS) + ) +); + +TRACE_EVENT(fill_mg_cmtime, + TP_PROTO(struct inode *inode, + struct timespec64 *ctime, + struct timespec64 *mtime), + + TP_ARGS(inode, ctime, mtime), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(time64_t, ctime_s) + __field(time64_t, mtime_s) + __field(u32, ctime_ns) + __field(u32, mtime_ns) + __field(u32, gen) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->gen = inode->i_generation; + __entry->ctime_s = ctime->tv_sec; + __entry->mtime_s = mtime->tv_sec; + __entry->ctime_ns = ctime->tv_nsec; + __entry->mtime_ns = mtime->tv_nsec; + ), + + TP_printk("ino=%d:%d:%ld:%u ctime=%lld.%u mtime=%lld.%u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, + __entry->ctime_s, __entry->ctime_ns, + __entry->mtime_s, __entry->mtime_ns + ) +); +#endif /* _TRACE_TIMESTAMP_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 5af5fc895fb9deec3d7b225f5bc2bd4949f8bbd5 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Fri, 18 Oct 2024 11:00:34 -0400 Subject: dma-mapping: use macros to define events in a class Use a macro to avoid repeating the parameters and arguments for each event in a class. Signed-off-by: Sean Anderson Reviewed-by: Steven Rostedt (Google) Signed-off-by: Christoph Hellwig --- include/trace/events/dma.h | 60 ++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 32 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h index b0f41265191c..3d348cea4d7c 100644 --- a/include/trace/events/dma.h +++ b/include/trace/events/dma.h @@ -65,15 +65,14 @@ DECLARE_EVENT_CLASS(dma_map, decode_dma_attrs(__entry->attrs)) ); -DEFINE_EVENT(dma_map, dma_map_page, - TP_PROTO(struct device *dev, phys_addr_t phys_addr, dma_addr_t dma_addr, - size_t size, enum dma_data_direction dir, unsigned long attrs), - TP_ARGS(dev, phys_addr, dma_addr, size, dir, attrs)); +#define DEFINE_MAP_EVENT(name) \ +DEFINE_EVENT(dma_map, name, \ + TP_PROTO(struct device *dev, phys_addr_t phys_addr, dma_addr_t dma_addr, \ + size_t size, enum dma_data_direction dir, unsigned long attrs), \ + TP_ARGS(dev, phys_addr, dma_addr, size, dir, attrs)) -DEFINE_EVENT(dma_map, dma_map_resource, - TP_PROTO(struct device *dev, phys_addr_t phys_addr, dma_addr_t dma_addr, - size_t size, enum dma_data_direction dir, unsigned long attrs), - TP_ARGS(dev, phys_addr, dma_addr, size, dir, attrs)); +DEFINE_MAP_EVENT(dma_map_page); +DEFINE_MAP_EVENT(dma_map_resource); DECLARE_EVENT_CLASS(dma_unmap, TP_PROTO(struct device *dev, dma_addr_t addr, size_t size, @@ -104,15 +103,14 @@ DECLARE_EVENT_CLASS(dma_unmap, decode_dma_attrs(__entry->attrs)) ); -DEFINE_EVENT(dma_unmap, dma_unmap_page, - TP_PROTO(struct device *dev, dma_addr_t addr, size_t size, - enum dma_data_direction dir, unsigned long attrs), - TP_ARGS(dev, addr, size, dir, attrs)); +#define DEFINE_UNMAP_EVENT(name) \ +DEFINE_EVENT(dma_unmap, name, \ + TP_PROTO(struct device *dev, dma_addr_t addr, size_t size, \ + enum dma_data_direction dir, unsigned long attrs), \ + TP_ARGS(dev, addr, size, dir, attrs)) -DEFINE_EVENT(dma_unmap, dma_unmap_resource, - TP_PROTO(struct device *dev, dma_addr_t addr, size_t size, - enum dma_data_direction dir, unsigned long attrs), - TP_ARGS(dev, addr, size, dir, attrs)); +DEFINE_UNMAP_EVENT(dma_unmap_page); +DEFINE_UNMAP_EVENT(dma_unmap_resource); TRACE_EVENT(dma_alloc, TP_PROTO(struct device *dev, void *virt_addr, dma_addr_t dma_addr, @@ -279,15 +277,14 @@ DECLARE_EVENT_CLASS(dma_sync_single, __entry->size) ); -DEFINE_EVENT(dma_sync_single, dma_sync_single_for_cpu, - TP_PROTO(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction dir), - TP_ARGS(dev, dma_addr, size, dir)); +#define DEFINE_SYNC_SINGLE_EVENT(name) \ +DEFINE_EVENT(dma_sync_single, name, \ + TP_PROTO(struct device *dev, dma_addr_t dma_addr, size_t size, \ + enum dma_data_direction dir), \ + TP_ARGS(dev, dma_addr, size, dir)) -DEFINE_EVENT(dma_sync_single, dma_sync_single_for_device, - TP_PROTO(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction dir), - TP_ARGS(dev, dma_addr, size, dir)); +DEFINE_SYNC_SINGLE_EVENT(dma_sync_single_for_cpu); +DEFINE_SYNC_SINGLE_EVENT(dma_sync_single_for_device); DECLARE_EVENT_CLASS(dma_sync_sg, TP_PROTO(struct device *dev, struct scatterlist *sgl, int nents, @@ -326,15 +323,14 @@ DECLARE_EVENT_CLASS(dma_sync_sg, sizeof(unsigned int), sizeof(unsigned int))) ); -DEFINE_EVENT(dma_sync_sg, dma_sync_sg_for_cpu, - TP_PROTO(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir), - TP_ARGS(dev, sg, nents, dir)); +#define DEFINE_SYNC_SG_EVENT(name) \ +DEFINE_EVENT(dma_sync_sg, name, \ + TP_PROTO(struct device *dev, struct scatterlist *sg, int nents, \ + enum dma_data_direction dir), \ + TP_ARGS(dev, sg, nents, dir)) -DEFINE_EVENT(dma_sync_sg, dma_sync_sg_for_device, - TP_PROTO(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir), - TP_ARGS(dev, sg, nents, dir)); +DEFINE_SYNC_SG_EVENT(dma_sync_sg_for_cpu); +DEFINE_SYNC_SG_EVENT(dma_sync_sg_for_device); #endif /* _TRACE_DMA_H */ -- cgit v1.2.3 From 3afff779a725cba914e6caba360b696ae6f90249 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Fri, 18 Oct 2024 11:00:35 -0400 Subject: dma-mapping: trace dma_alloc/free direction In preparation for using these tracepoints in a few more places, trace the DMA direction as well. For coherent allocations this is always bidirectional. Signed-off-by: Sean Anderson Reviewed-by: Steven Rostedt (Google) Signed-off-by: Christoph Hellwig --- include/trace/events/dma.h | 18 ++++++++++++------ kernel/dma/mapping.c | 6 ++++-- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h index 3d348cea4d7c..267cfa49d9d5 100644 --- a/include/trace/events/dma.h +++ b/include/trace/events/dma.h @@ -114,8 +114,9 @@ DEFINE_UNMAP_EVENT(dma_unmap_resource); TRACE_EVENT(dma_alloc, TP_PROTO(struct device *dev, void *virt_addr, dma_addr_t dma_addr, - size_t size, gfp_t flags, unsigned long attrs), - TP_ARGS(dev, virt_addr, dma_addr, size, flags, attrs), + size_t size, enum dma_data_direction dir, gfp_t flags, + unsigned long attrs), + TP_ARGS(dev, virt_addr, dma_addr, size, dir, flags, attrs), TP_STRUCT__entry( __string(device, dev_name(dev)) @@ -123,6 +124,7 @@ TRACE_EVENT(dma_alloc, __field(u64, dma_addr) __field(size_t, size) __field(gfp_t, flags) + __field(enum dma_data_direction, dir) __field(unsigned long, attrs) ), @@ -135,8 +137,9 @@ TRACE_EVENT(dma_alloc, __entry->attrs = attrs; ), - TP_printk("%s dma_addr=%llx size=%zu virt_addr=%p flags=%s attrs=%s", + TP_printk("%s dir=%s dma_addr=%llx size=%zu virt_addr=%p flags=%s attrs=%s", __get_str(device), + decode_dma_data_direction(__entry->dir), __entry->dma_addr, __entry->size, __entry->virt_addr, @@ -146,14 +149,15 @@ TRACE_EVENT(dma_alloc, TRACE_EVENT(dma_free, TP_PROTO(struct device *dev, void *virt_addr, dma_addr_t dma_addr, - size_t size, unsigned long attrs), - TP_ARGS(dev, virt_addr, dma_addr, size, attrs), + size_t size, enum dma_data_direction dir, unsigned long attrs), + TP_ARGS(dev, virt_addr, dma_addr, size, dir, attrs), TP_STRUCT__entry( __string(device, dev_name(dev)) __field(void *, virt_addr) __field(u64, dma_addr) __field(size_t, size) + __field(enum dma_data_direction, dir) __field(unsigned long, attrs) ), @@ -162,11 +166,13 @@ TRACE_EVENT(dma_free, __entry->virt_addr = virt_addr; __entry->dma_addr = dma_addr; __entry->size = size; + __entry->dir = dir; __entry->attrs = attrs; ), - TP_printk("%s dma_addr=%llx size=%zu virt_addr=%p attrs=%s", + TP_printk("%s dir=%s dma_addr=%llx size=%zu virt_addr=%p attrs=%s", __get_str(device), + decode_dma_data_direction(__entry->dir), __entry->dma_addr, __entry->size, __entry->virt_addr, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 864a1121bf08..944ac835030a 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -619,7 +619,8 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, else return NULL; - trace_dma_alloc(dev, cpu_addr, *dma_handle, size, flag, attrs); + trace_dma_alloc(dev, cpu_addr, *dma_handle, size, DMA_BIDIRECTIONAL, + flag, attrs); debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, attrs); return cpu_addr; } @@ -644,7 +645,8 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, if (!cpu_addr) return; - trace_dma_free(dev, cpu_addr, dma_handle, size, attrs); + trace_dma_free(dev, cpu_addr, dma_handle, size, DMA_BIDIRECTIONAL, + attrs); debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); if (dma_alloc_direct(dev, ops)) dma_direct_free(dev, size, cpu_addr, dma_handle, attrs); -- cgit v1.2.3 From c4484ab86ee00f2d9236e2851621ea02c105f4cc Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Fri, 18 Oct 2024 11:00:36 -0400 Subject: dma-mapping: use trace_dma_alloc for dma_alloc* instead of using trace_dma_map In some cases, we use trace_dma_map to trace dma_alloc* functions. This generally follows dma_debug. However, this does not record all of the relevant information for allocations, such as GFP flags. Create new dma_alloc tracepoints for these functions. Note that while dma_alloc_noncontiguous may allocate discontiguous pages (from the CPU's point of view), the device will only see one contiguous mapping. Therefore, we just need to trace dma_addr and size. Signed-off-by: Sean Anderson Reviewed-by: Steven Rostedt (Google) Signed-off-by: Christoph Hellwig --- include/trace/events/dma.h | 99 +++++++++++++++++++++++++++++++++++++++++++++- kernel/dma/mapping.c | 10 ++--- 2 files changed, 102 insertions(+), 7 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h index 267cfa49d9d5..7a9606b8934e 100644 --- a/include/trace/events/dma.h +++ b/include/trace/events/dma.h @@ -112,7 +112,7 @@ DEFINE_EVENT(dma_unmap, name, \ DEFINE_UNMAP_EVENT(dma_unmap_page); DEFINE_UNMAP_EVENT(dma_unmap_resource); -TRACE_EVENT(dma_alloc, +DECLARE_EVENT_CLASS(dma_alloc_class, TP_PROTO(struct device *dev, void *virt_addr, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, gfp_t flags, unsigned long attrs), @@ -147,7 +147,58 @@ TRACE_EVENT(dma_alloc, decode_dma_attrs(__entry->attrs)) ); -TRACE_EVENT(dma_free, +#define DEFINE_ALLOC_EVENT(name) \ +DEFINE_EVENT(dma_alloc_class, name, \ + TP_PROTO(struct device *dev, void *virt_addr, dma_addr_t dma_addr, \ + size_t size, enum dma_data_direction dir, gfp_t flags, \ + unsigned long attrs), \ + TP_ARGS(dev, virt_addr, dma_addr, size, dir, flags, attrs)) + +DEFINE_ALLOC_EVENT(dma_alloc); +DEFINE_ALLOC_EVENT(dma_alloc_pages); + +TRACE_EVENT(dma_alloc_sgt, + TP_PROTO(struct device *dev, struct sg_table *sgt, size_t size, + enum dma_data_direction dir, gfp_t flags, unsigned long attrs), + TP_ARGS(dev, sgt, size, dir, flags, attrs), + + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __dynamic_array(u64, phys_addrs, sgt->orig_nents) + __field(u64, dma_addr) + __field(size_t, size) + __field(enum dma_data_direction, dir) + __field(gfp_t, flags) + __field(unsigned long, attrs) + ), + + TP_fast_assign( + struct scatterlist *sg; + int i; + + __assign_str(device); + for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) + ((u64 *)__get_dynamic_array(phys_addrs))[i] = sg_phys(sg); + __entry->dma_addr = sg_dma_address(sgt->sgl); + __entry->size = size; + __entry->dir = dir; + __entry->flags = flags; + __entry->attrs = attrs; + ), + + TP_printk("%s dir=%s dma_addr=%llx size=%zu phys_addrs=%s flags=%s attrs=%s", + __get_str(device), + decode_dma_data_direction(__entry->dir), + __entry->dma_addr, + __entry->size, + __print_array(__get_dynamic_array(phys_addrs), + __get_dynamic_array_len(phys_addrs) / + sizeof(u64), sizeof(u64)), + show_gfp_flags(__entry->flags), + decode_dma_attrs(__entry->attrs)) +); + +DECLARE_EVENT_CLASS(dma_free_class, TP_PROTO(struct device *dev, void *virt_addr, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, unsigned long attrs), TP_ARGS(dev, virt_addr, dma_addr, size, dir, attrs), @@ -179,6 +230,50 @@ TRACE_EVENT(dma_free, decode_dma_attrs(__entry->attrs)) ); +#define DEFINE_FREE_EVENT(name) \ +DEFINE_EVENT(dma_free_class, name, \ + TP_PROTO(struct device *dev, void *virt_addr, dma_addr_t dma_addr, \ + size_t size, enum dma_data_direction dir, unsigned long attrs), \ + TP_ARGS(dev, virt_addr, dma_addr, size, dir, attrs)) + +DEFINE_FREE_EVENT(dma_free); +DEFINE_FREE_EVENT(dma_free_pages); + +TRACE_EVENT(dma_free_sgt, + TP_PROTO(struct device *dev, struct sg_table *sgt, size_t size, + enum dma_data_direction dir), + TP_ARGS(dev, sgt, size, dir), + + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __dynamic_array(u64, phys_addrs, sgt->orig_nents) + __field(u64, dma_addr) + __field(size_t, size) + __field(enum dma_data_direction, dir) + ), + + TP_fast_assign( + struct scatterlist *sg; + int i; + + __assign_str(device); + for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) + ((u64 *)__get_dynamic_array(phys_addrs))[i] = sg_phys(sg); + __entry->dma_addr = sg_dma_address(sgt->sgl); + __entry->size = size; + __entry->dir = dir; + ), + + TP_printk("%s dir=%s dma_addr=%llx size=%zu phys_addrs=%s", + __get_str(device), + decode_dma_data_direction(__entry->dir), + __entry->dma_addr, + __entry->size, + __print_array(__get_dynamic_array(phys_addrs), + __get_dynamic_array_len(phys_addrs) / + sizeof(u64), sizeof(u64))) +); + TRACE_EVENT(dma_map_sg, TP_PROTO(struct device *dev, struct scatterlist *sgl, int nents, int ents, enum dma_data_direction dir, unsigned long attrs), diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 944ac835030a..b8a6bc492fae 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -685,8 +685,8 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp); if (page) { - trace_dma_map_page(dev, page_to_phys(page), *dma_handle, size, - dir, 0); + trace_dma_alloc_pages(dev, page_to_virt(page), *dma_handle, + size, dir, gfp, 0); debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0); } return page; @@ -710,7 +710,7 @@ static void __dma_free_pages(struct device *dev, size_t size, struct page *page, void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) { - trace_dma_unmap_page(dev, dma_handle, size, dir, 0); + trace_dma_free_pages(dev, page_to_virt(page), dma_handle, size, dir, 0); debug_dma_unmap_page(dev, dma_handle, size, dir); __dma_free_pages(dev, size, page, dma_handle, dir); } @@ -770,7 +770,7 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, if (sgt) { sgt->nents = 1; - trace_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs); + trace_dma_alloc_sgt(dev, sgt, size, dir, gfp, attrs); debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs); } return sgt; @@ -789,7 +789,7 @@ static void free_single_sgt(struct device *dev, size_t size, void dma_free_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir) { - trace_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir, 0); + trace_dma_free_sgt(dev, sgt, size, dir); debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir); if (use_dma_iommu(dev)) -- cgit v1.2.3 From 68b6dbf1f441c4eba3b8511728a41cf9b01dca35 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Fri, 18 Oct 2024 11:00:37 -0400 Subject: dma-mapping: trace more error paths It can be surprising to the user if DMA functions are only traced on success. On failure, it can be unclear what the source of the problem is. Fix this by tracing all functions even when they fail. Cases where we BUG/WARN are skipped, since those should be sufficiently noisy already. Signed-off-by: Sean Anderson Reviewed-by: Steven Rostedt (Google) Signed-off-by: Christoph Hellwig --- include/trace/events/dma.h | 36 ++++++++++++++++++++++++++++++++++++ kernel/dma/mapping.c | 25 ++++++++++++++++++------- 2 files changed, 54 insertions(+), 7 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h index 7a9606b8934e..d8ddc27b6a7c 100644 --- a/include/trace/events/dma.h +++ b/include/trace/events/dma.h @@ -156,6 +156,7 @@ DEFINE_EVENT(dma_alloc_class, name, \ DEFINE_ALLOC_EVENT(dma_alloc); DEFINE_ALLOC_EVENT(dma_alloc_pages); +DEFINE_ALLOC_EVENT(dma_alloc_sgt_err); TRACE_EVENT(dma_alloc_sgt, TP_PROTO(struct device *dev, struct sg_table *sgt, size_t size, @@ -320,6 +321,41 @@ TRACE_EVENT(dma_map_sg, decode_dma_attrs(__entry->attrs)) ); +TRACE_EVENT(dma_map_sg_err, + TP_PROTO(struct device *dev, struct scatterlist *sgl, int nents, + int err, enum dma_data_direction dir, unsigned long attrs), + TP_ARGS(dev, sgl, nents, err, dir, attrs), + + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __dynamic_array(u64, phys_addrs, nents) + __field(int, err) + __field(enum dma_data_direction, dir) + __field(unsigned long, attrs) + ), + + TP_fast_assign( + struct scatterlist *sg; + int i; + + __assign_str(device); + for_each_sg(sgl, sg, nents, i) + ((u64 *)__get_dynamic_array(phys_addrs))[i] = sg_phys(sg); + __entry->err = err; + __entry->dir = dir; + __entry->attrs = attrs; + ), + + TP_printk("%s dir=%s dma_addrs=%s err=%d attrs=%s", + __get_str(device), + decode_dma_data_direction(__entry->dir), + __print_array(__get_dynamic_array(phys_addrs), + __get_dynamic_array_len(phys_addrs) / + sizeof(u64), sizeof(u64)), + __entry->err, + decode_dma_attrs(__entry->attrs)) +); + TRACE_EVENT(dma_unmap_sg, TP_PROTO(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs), diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index b8a6bc492fae..636dbb0629a4 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -223,6 +223,7 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, debug_dma_map_sg(dev, sg, nents, ents, dir, attrs); } else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && ents != -EIO && ents != -EREMOTEIO)) { + trace_dma_map_sg_err(dev, sg, nents, ents, dir, attrs); return -EIO; } @@ -604,20 +605,26 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, if (WARN_ON_ONCE(flag & __GFP_COMP)) return NULL; - if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) + if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) { + trace_dma_alloc(dev, cpu_addr, *dma_handle, size, + DMA_BIDIRECTIONAL, flag, attrs); return cpu_addr; + } /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); - if (dma_alloc_direct(dev, ops)) + if (dma_alloc_direct(dev, ops)) { cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); - else if (use_dma_iommu(dev)) + } else if (use_dma_iommu(dev)) { cpu_addr = iommu_dma_alloc(dev, size, dma_handle, flag, attrs); - else if (ops->alloc) + } else if (ops->alloc) { cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); - else + } else { + trace_dma_alloc(dev, NULL, 0, size, DMA_BIDIRECTIONAL, flag, + attrs); return NULL; + } trace_dma_alloc(dev, cpu_addr, *dma_handle, size, DMA_BIDIRECTIONAL, flag, attrs); @@ -642,11 +649,11 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, */ WARN_ON(irqs_disabled()); + trace_dma_free(dev, cpu_addr, dma_handle, size, DMA_BIDIRECTIONAL, + attrs); if (!cpu_addr) return; - trace_dma_free(dev, cpu_addr, dma_handle, size, DMA_BIDIRECTIONAL, - attrs); debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); if (dma_alloc_direct(dev, ops)) dma_direct_free(dev, size, cpu_addr, dma_handle, attrs); @@ -688,6 +695,8 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, trace_dma_alloc_pages(dev, page_to_virt(page), *dma_handle, size, dir, gfp, 0); debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0); + } else { + trace_dma_alloc_pages(dev, NULL, 0, size, dir, gfp, 0); } return page; } @@ -772,6 +781,8 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, sgt->nents = 1; trace_dma_alloc_sgt(dev, sgt, size, dir, gfp, attrs); debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs); + } else { + trace_dma_alloc_sgt_err(dev, NULL, 0, size, gfp, dir, attrs); } return sgt; } -- cgit v1.2.3 From 2946f08ae9ed650b94e0ffebcdfdda8de76bd926 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 18 Oct 2024 17:14:00 +0100 Subject: io_uring: clean up cqe trace points We have too many helpers posting CQEs, instead of tracing completion events before filling in a CQE and thus having to pass all the data, set the CQE first, pass it to the tracing helper and let it extract everything it needs. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b83c1ca9ee5aed2df0f3bb743bf5ed699cce4c86.1729267437.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +++++ include/trace/events/io_uring.h | 24 +++++++++--------------- io_uring/io_uring.c | 4 ++-- io_uring/io_uring.h | 7 +++---- 4 files changed, 19 insertions(+), 21 deletions(-) (limited to 'include/trace') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 9c7e1d3f06e5..391087144666 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -662,4 +662,9 @@ struct io_overflow_cqe { struct io_uring_cqe cqe; }; +static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx) +{ + return ctx->flags & IORING_SETUP_CQE32; +} + #endif diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 412c9c210a32..fb81c533b310 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -315,20 +315,14 @@ TRACE_EVENT(io_uring_fail_link, * io_uring_complete - called when completing an SQE * * @ctx: pointer to a ring context structure - * @req: pointer to a submitted request - * @user_data: user data associated with the request - * @res: result of the request - * @cflags: completion flags - * @extra1: extra 64-bit data for CQE32 - * @extra2: extra 64-bit data for CQE32 - * + * @req: (optional) pointer to a submitted request + * @cqe: pointer to the filled in CQE being posted */ TRACE_EVENT(io_uring_complete, - TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags, - u64 extra1, u64 extra2), +TP_PROTO(struct io_ring_ctx *ctx, void *req, struct io_uring_cqe *cqe), - TP_ARGS(ctx, req, user_data, res, cflags, extra1, extra2), + TP_ARGS(ctx, req, cqe), TP_STRUCT__entry ( __field( void *, ctx ) @@ -343,11 +337,11 @@ TRACE_EVENT(io_uring_complete, TP_fast_assign( __entry->ctx = ctx; __entry->req = req; - __entry->user_data = user_data; - __entry->res = res; - __entry->cflags = cflags; - __entry->extra1 = extra1; - __entry->extra2 = extra2; + __entry->user_data = cqe->user_data; + __entry->res = cqe->res; + __entry->cflags = cqe->flags; + __entry->extra1 = io_ctx_cqe32(ctx) ? cqe->big_cqe[0] : 0; + __entry->extra2 = io_ctx_cqe32(ctx) ? cqe->big_cqe[1] : 0; ), TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x " diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fa9d31034c62..58b401900b41 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -828,8 +828,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, * the ring. */ if (likely(io_get_cqe(ctx, &cqe))) { - trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); - WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); @@ -838,6 +836,8 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, WRITE_ONCE(cqe->big_cqe[0], 0); WRITE_ONCE(cqe->big_cqe[1], 0); } + + trace_io_uring_complete(ctx, NULL, cqe); return true; } return false; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 70b6675941ff..9cd9a127e9ed 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -189,16 +189,15 @@ static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, if (unlikely(!io_get_cqe(ctx, &cqe))) return false; - if (trace_io_uring_complete_enabled()) - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->big_cqe.extra1, req->big_cqe.extra2); memcpy(cqe, &req->cqe, sizeof(*cqe)); if (ctx->flags & IORING_SETUP_CQE32) { memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } + + if (trace_io_uring_complete_enabled()) + trace_io_uring_complete(req->ctx, req, cqe); return true; } -- cgit v1.2.3 From 750fd23926f1507cc826b5a4fdd4bfc7283e7723 Mon Sep 17 00:00:00 2001 From: Avadhut Naik Date: Tue, 22 Oct 2024 19:36:27 +0000 Subject: x86/mce: Add wrapper for struct mce to export vendor specific info Currently, exporting new additional machine check error information involves adding new fields for the same at the end of the struct mce. This additional information can then be consumed through mcelog or tracepoint. However, as new MSRs are being added (and will be added in the future) by CPU vendors on their newer CPUs with additional machine check error information to be exported, the size of struct mce will balloon on some CPUs, unnecessarily, since those fields are vendor-specific. Moreover, different CPU vendors may export the additional information in varying sizes. The problem particularly intensifies since struct mce is exposed to userspace as part of UAPI. It's bloating through vendor-specific data should be avoided to limit the information being sent out to userspace. Add a new structure mce_hw_err to wrap the existing struct mce. The same will prevent its ballooning since vendor-specifc data, if any, can now be exported through a union within the wrapper structure and through __dynamic_array in mce_record tracepoint. Furthermore, new internal kernel fields can be added to the wrapper struct without impacting the user space API. [ bp: Restore reverse x-mas tree order of function vars declarations. ] Suggested-by: Borislav Petkov (AMD) Signed-off-by: Avadhut Naik Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Link: https://lore.kernel.org/r/20241022194158.110073-2-avadhut.naik@amd.com --- arch/x86/include/asm/mce.h | 14 ++- arch/x86/kernel/cpu/mce/amd.c | 27 ++--- arch/x86/kernel/cpu/mce/apei.c | 45 ++++---- arch/x86/kernel/cpu/mce/core.c | 207 ++++++++++++++++++++----------------- arch/x86/kernel/cpu/mce/genpool.c | 18 ++-- arch/x86/kernel/cpu/mce/inject.c | 6 +- arch/x86/kernel/cpu/mce/internal.h | 4 +- include/trace/events/mce.h | 42 ++++---- 8 files changed, 202 insertions(+), 161 deletions(-) (limited to 'include/trace') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 3b9970117a0f..4e45f45673a3 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -187,6 +187,16 @@ enum mce_notifier_prios { MCE_PRIO_HIGHEST = MCE_PRIO_CEC }; +/** + * struct mce_hw_err - Hardware Error Record. + * @m: Machine Check record. + */ +struct mce_hw_err { + struct mce m; +}; + +#define to_mce_hw_err(mce) container_of(mce, struct mce_hw_err, m) + struct notifier_block; extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); @@ -221,8 +231,8 @@ static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { return -EINVAL; } #endif -void mce_prep_record(struct mce *m); -void mce_log(struct mce *m); +void mce_prep_record(struct mce_hw_err *err); +void mce_log(struct mce_hw_err *err); DECLARE_PER_CPU(struct device *, mce_device); /* Maximum number of MCA banks per CPU. */ diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 14bf8c232e45..5b4d266500b2 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -778,29 +778,30 @@ bool amd_mce_usable_address(struct mce *m) static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { - struct mce m; + struct mce_hw_err err; + struct mce *m = &err.m; - mce_prep_record(&m); + mce_prep_record(&err); - m.status = status; - m.misc = misc; - m.bank = bank; - m.tsc = rdtsc(); + m->status = status; + m->misc = misc; + m->bank = bank; + m->tsc = rdtsc(); - if (m.status & MCI_STATUS_ADDRV) { - m.addr = addr; + if (m->status & MCI_STATUS_ADDRV) { + m->addr = addr; - smca_extract_err_addr(&m); + smca_extract_err_addr(m); } if (mce_flags.smca) { - rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid); + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); - if (m.status & MCI_STATUS_SYNDV) - rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd); + if (m->status & MCI_STATUS_SYNDV) + rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); } - mce_log(&m); + mce_log(&err); } DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 3885fe05f01e..7f582b4ca1ca 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -28,7 +28,8 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) { - struct mce m; + struct mce_hw_err err; + struct mce *m; int lsb; if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) @@ -44,22 +45,23 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) else lsb = PAGE_SHIFT; - mce_prep_record(&m); - m.bank = -1; + mce_prep_record(&err); + m = &err.m; + m->bank = -1; /* Fake a memory read error with unknown channel */ - m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; - m.misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; + m->status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; + m->misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; if (severity >= GHES_SEV_RECOVERABLE) - m.status |= MCI_STATUS_UC; + m->status |= MCI_STATUS_UC; if (severity >= GHES_SEV_PANIC) { - m.status |= MCI_STATUS_PCC; - m.tsc = rdtsc(); + m->status |= MCI_STATUS_PCC; + m->tsc = rdtsc(); } - m.addr = mem_err->physical_addr; - mce_log(&m); + m->addr = mem_err->physical_addr; + mce_log(&err); } EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); @@ -67,8 +69,9 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { const u64 *i_mce = ((const u64 *) (ctx_info + 1)); bool apicid_found = false; + struct mce_hw_err err; unsigned int cpu; - struct mce m; + struct mce *m; if (!boot_cpu_has(X86_FEATURE_SMCA)) return -EINVAL; @@ -108,18 +111,20 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) if (!apicid_found) return -EINVAL; - mce_prep_record_common(&m); - mce_prep_record_per_cpu(cpu, &m); + m = &err.m; + memset(&err, 0, sizeof(struct mce_hw_err)); + mce_prep_record_common(m); + mce_prep_record_per_cpu(cpu, m); - m.bank = (ctx_info->msr_addr >> 4) & 0xFF; - m.status = *i_mce; - m.addr = *(i_mce + 1); - m.misc = *(i_mce + 2); + m->bank = (ctx_info->msr_addr >> 4) & 0xFF; + m->status = *i_mce; + m->addr = *(i_mce + 1); + m->misc = *(i_mce + 2); /* Skipping MCA_CONFIG */ - m.ipid = *(i_mce + 4); - m.synd = *(i_mce + 5); + m->ipid = *(i_mce + 4); + m->synd = *(i_mce + 5); - mce_log(&m); + mce_log(&err); return 0; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 2a938f429c4d..28e28b69d84d 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -88,7 +88,7 @@ struct mca_config mca_cfg __read_mostly = { .monarch_timeout = -1 }; -static DEFINE_PER_CPU(struct mce, mces_seen); +static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen); static unsigned long mce_need_notify; /* @@ -119,8 +119,6 @@ BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); void mce_prep_record_common(struct mce *m) { - memset(m, 0, sizeof(struct mce)); - m->cpuid = cpuid_eax(1); m->cpuvendor = boot_cpu_data.x86_vendor; m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); @@ -138,9 +136,12 @@ void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m) m->socketid = topology_physical_package_id(cpu); } -/* Do initial initialization of a struct mce */ -void mce_prep_record(struct mce *m) +/* Do initial initialization of struct mce_hw_err */ +void mce_prep_record(struct mce_hw_err *err) { + struct mce *m = &err->m; + + memset(err, 0, sizeof(struct mce_hw_err)); mce_prep_record_common(m); mce_prep_record_per_cpu(smp_processor_id(), m); } @@ -148,9 +149,9 @@ void mce_prep_record(struct mce *m) DEFINE_PER_CPU(struct mce, injectm); EXPORT_PER_CPU_SYMBOL_GPL(injectm); -void mce_log(struct mce *m) +void mce_log(struct mce_hw_err *err) { - if (!mce_gen_pool_add(m)) + if (!mce_gen_pool_add(err)) irq_work_queue(&mce_irq_work); } EXPORT_SYMBOL_GPL(mce_log); @@ -171,8 +172,10 @@ void mce_unregister_decode_chain(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); -static void __print_mce(struct mce *m) +static void __print_mce(struct mce_hw_err *err) { + struct mce *m = &err->m; + pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", m->extcpu, (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""), @@ -214,9 +217,11 @@ static void __print_mce(struct mce *m) m->microcode); } -static void print_mce(struct mce *m) +static void print_mce(struct mce_hw_err *err) { - __print_mce(m); + struct mce *m = &err->m; + + __print_mce(err); if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON) pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); @@ -251,7 +256,7 @@ static const char *mce_dump_aux_info(struct mce *m) return NULL; } -static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) +static noinstr void mce_panic(const char *msg, struct mce_hw_err *final, char *exp) { struct llist_node *pending; struct mce_evt_llist *l; @@ -282,20 +287,22 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) pending = mce_gen_pool_prepare_records(); /* First print corrected ones that are still unlogged */ llist_for_each_entry(l, pending, llnode) { - struct mce *m = &l->mce; + struct mce_hw_err *err = &l->err; + struct mce *m = &err->m; if (!(m->status & MCI_STATUS_UC)) { - print_mce(m); + print_mce(err); if (!apei_err) apei_err = apei_write_mce(m); } } /* Now print uncorrected but with the final one last */ llist_for_each_entry(l, pending, llnode) { - struct mce *m = &l->mce; + struct mce_hw_err *err = &l->err; + struct mce *m = &err->m; if (!(m->status & MCI_STATUS_UC)) continue; - if (!final || mce_cmp(m, final)) { - print_mce(m); + if (!final || mce_cmp(m, &final->m)) { + print_mce(err); if (!apei_err) apei_err = apei_write_mce(m); } @@ -303,12 +310,12 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) if (final) { print_mce(final); if (!apei_err) - apei_err = apei_write_mce(final); + apei_err = apei_write_mce(&final->m); } if (exp) pr_emerg(HW_ERR "Machine check: %s\n", exp); - memmsg = mce_dump_aux_info(final); + memmsg = mce_dump_aux_info(&final->m); if (memmsg) pr_emerg(HW_ERR "Machine check: %s\n", memmsg); @@ -323,9 +330,9 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) * panic. */ if (kexec_crash_loaded()) { - if (final && (final->status & MCI_STATUS_ADDRV)) { + if (final && (final->m.status & MCI_STATUS_ADDRV)) { struct page *p; - p = pfn_to_online_page(final->addr >> PAGE_SHIFT); + p = pfn_to_online_page(final->m.addr >> PAGE_SHIFT); if (p) SetPageHWPoison(p); } @@ -445,16 +452,18 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) * check into our "mce" struct so that we can use it later to assess * the severity of the problem as we read per-bank specific details. */ -static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) +static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs) { + struct mce *m; /* * Enable instrumentation around mce_prep_record() which calls external * facilities. */ instrumentation_begin(); - mce_prep_record(m); + mce_prep_record(err); instrumentation_end(); + m = &err->m; m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); if (regs) { /* @@ -574,13 +583,13 @@ EXPORT_SYMBOL_GPL(mce_is_correctable); static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { - struct mce *m = (struct mce *)data; + struct mce_hw_err *err = to_mce_hw_err(data); - if (!m) + if (!err) return NOTIFY_DONE; /* Emit the trace record: */ - trace_mce_record(m); + trace_mce_record(err); set_bit(0, &mce_need_notify); @@ -624,13 +633,13 @@ static struct notifier_block mce_uc_nb = { static int mce_default_notifier(struct notifier_block *nb, unsigned long val, void *data) { - struct mce *m = (struct mce *)data; + struct mce_hw_err *err = to_mce_hw_err(data); - if (!m) + if (!err) return NOTIFY_DONE; - if (mca_cfg.print_all || !m->kflags) - __print_mce(m); + if (mca_cfg.print_all || !(err->m.kflags)) + __print_mce(err); return NOTIFY_DONE; } @@ -644,8 +653,10 @@ static struct notifier_block mce_default_nb = { /* * Read ADDR and MISC registers. */ -static noinstr void mce_read_aux(struct mce *m, int i) +static noinstr void mce_read_aux(struct mce_hw_err *err, int i) { + struct mce *m = &err->m; + if (m->status & MCI_STATUS_MISCV) m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); @@ -692,26 +703,28 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - struct mce m; + struct mce_hw_err err; + struct mce *m; int i; this_cpu_inc(mce_poll_count); - mce_gather_info(&m, NULL); + mce_gather_info(&err, NULL); + m = &err.m; if (flags & MCP_TIMESTAMP) - m.tsc = rdtsc(); + m->tsc = rdtsc(); for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; - m.misc = 0; - m.addr = 0; - m.bank = i; + m->misc = 0; + m->addr = 0; + m->bank = i; barrier(); - m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); /* * Update storm tracking here, before checking for the @@ -721,17 +734,17 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * storm status. */ if (!mca_cfg.cmci_disabled) - mce_track_storm(&m); + mce_track_storm(m); /* If this entry is not valid, ignore it */ - if (!(m.status & MCI_STATUS_VAL)) + if (!(m->status & MCI_STATUS_VAL)) continue; /* * If we are logging everything (at CPU online) or this * is a corrected error, then we must log it. */ - if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC)) + if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) goto log_it; /* @@ -741,20 +754,20 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * everything else. */ if (!mca_cfg.ser) { - if (m.status & MCI_STATUS_UC) + if (m->status & MCI_STATUS_UC) continue; goto log_it; } /* Log "not enabled" (speculative) errors */ - if (!(m.status & MCI_STATUS_EN)) + if (!(m->status & MCI_STATUS_EN)) goto log_it; /* * Log UCNA (SDM: 15.6.3 "UCR Error Classification") * UC == 1 && PCC == 0 && S == 0 */ - if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S)) + if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) goto log_it; /* @@ -768,20 +781,20 @@ log_it: if (flags & MCP_DONTLOG) goto clear_it; - mce_read_aux(&m, i); - m.severity = mce_severity(&m, NULL, NULL, false); + mce_read_aux(&err, i); + m->severity = mce_severity(m, NULL, NULL, false); /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (mca_cfg.dont_log_ce && !mce_usable_address(&m)) + if (mca_cfg.dont_log_ce && !mce_usable_address(m)) goto clear_it; if (flags & MCP_QUEUE_LOG) - mce_gen_pool_add(&m); + mce_gen_pool_add(&err); else - mce_log(&m); + mce_log(&err); clear_it: /* @@ -905,9 +918,10 @@ static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_reg * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ -static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, +static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, unsigned long *validp, struct pt_regs *regs) { + struct mce *m = &err->m; char *tmp = *msg; int i; @@ -925,7 +939,7 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo m->bank = i; if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) { - mce_read_aux(m, i); + mce_read_aux(err, i); *msg = tmp; return 1; } @@ -1016,10 +1030,11 @@ out: */ static void mce_reign(void) { - int cpu; + struct mce_hw_err *err = NULL; struct mce *m = NULL; int global_worst = 0; char *msg = NULL; + int cpu; /* * This CPU is the Monarch and the other CPUs have run @@ -1027,11 +1042,13 @@ static void mce_reign(void) * Grade the severity of the errors of all the CPUs. */ for_each_possible_cpu(cpu) { - struct mce *mtmp = &per_cpu(mces_seen, cpu); + struct mce_hw_err *etmp = &per_cpu(hw_errs_seen, cpu); + struct mce *mtmp = &etmp->m; if (mtmp->severity > global_worst) { global_worst = mtmp->severity; - m = &per_cpu(mces_seen, cpu); + err = &per_cpu(hw_errs_seen, cpu); + m = &err->m; } } @@ -1043,7 +1060,7 @@ static void mce_reign(void) if (m && global_worst >= MCE_PANIC_SEVERITY) { /* call mce_severity() to get "msg" for panic */ mce_severity(m, NULL, &msg, true); - mce_panic("Fatal machine check", m, msg); + mce_panic("Fatal machine check", err, msg); } /* @@ -1060,11 +1077,11 @@ static void mce_reign(void) mce_panic("Fatal machine check from unknown source", NULL, NULL); /* - * Now clear all the mces_seen so that they don't reappear on + * Now clear all the hw_errs_seen so that they don't reappear on * the next mce. */ for_each_possible_cpu(cpu) - memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); + memset(&per_cpu(hw_errs_seen, cpu), 0, sizeof(struct mce_hw_err)); } static atomic_t global_nwo; @@ -1268,13 +1285,14 @@ static noinstr bool mce_check_crashing_cpu(void) } static __always_inline int -__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, - unsigned long *toclear, unsigned long *valid_banks, int no_way_out, - int *worst) +__mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs, + struct mce_hw_err *final, unsigned long *toclear, + unsigned long *valid_banks, int no_way_out, int *worst) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; int severity, i, taint = 0; + struct mce *m = &err->m; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { arch___clear_bit(i, toclear); @@ -1319,7 +1337,7 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, if (severity == MCE_NO_SEVERITY) continue; - mce_read_aux(m, i); + mce_read_aux(err, i); /* assuming valid severity level != 0 */ m->severity = severity; @@ -1329,17 +1347,17 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, * done in #MC context, where instrumentation is disabled. */ instrumentation_begin(); - mce_log(m); + mce_log(err); instrumentation_end(); if (severity > *worst) { - *final = *m; + *final = *err; *worst = severity; } } /* mce_clear_state will clear *final, save locally for use later */ - *m = *final; + *err = *final; return taint; } @@ -1399,9 +1417,10 @@ static void kill_me_never(struct callback_head *cb) set_mce_nospec(pfn); } -static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) +static void queue_task_work(struct mce_hw_err *err, char *msg, void (*func)(struct callback_head *)) { int count = ++current->mce_count; + struct mce *m = &err->m; /* First call, save all the details */ if (count == 1) { @@ -1414,11 +1433,12 @@ static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callba /* Ten is likely overkill. Don't expect more than two faults before task_work() */ if (count > 10) - mce_panic("Too many consecutive machine checks while accessing user data", m, msg); + mce_panic("Too many consecutive machine checks while accessing user data", + err, msg); /* Second or later call, make sure page address matches the one from first call */ if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT)) - mce_panic("Consecutive machine checks to different user pages", m, msg); + mce_panic("Consecutive machine checks to different user pages", err, msg); /* Do not call task_work_add() more than once */ if (count > 1) @@ -1467,8 +1487,10 @@ noinstr void do_machine_check(struct pt_regs *regs) int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0; DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 }; DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 }; - struct mce m, *final; + struct mce_hw_err *final; + struct mce_hw_err err; char *msg = NULL; + struct mce *m; if (unlikely(mce_flags.p5)) return pentium_machine_check(regs); @@ -1506,13 +1528,14 @@ noinstr void do_machine_check(struct pt_regs *regs) this_cpu_inc(mce_exception_count); - mce_gather_info(&m, regs); - m.tsc = rdtsc(); + mce_gather_info(&err, regs); + m = &err.m; + m->tsc = rdtsc(); - final = this_cpu_ptr(&mces_seen); - *final = m; + final = this_cpu_ptr(&hw_errs_seen); + *final = err; - no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); + no_way_out = mce_no_way_out(&err, &msg, valid_banks, regs); barrier(); @@ -1521,15 +1544,15 @@ noinstr void do_machine_check(struct pt_regs *regs) * Assume the worst for now, but if we find the * severity is MCE_AR_SEVERITY we have other options. */ - if (!(m.mcgstatus & MCG_STATUS_RIPV)) + if (!(m->mcgstatus & MCG_STATUS_RIPV)) kill_current_task = 1; /* * Check if this MCE is signaled to only this logical processor, * on Intel, Zhaoxin only. */ - if (m.cpuvendor == X86_VENDOR_INTEL || - m.cpuvendor == X86_VENDOR_ZHAOXIN) - lmce = m.mcgstatus & MCG_STATUS_LMCES; + if (m->cpuvendor == X86_VENDOR_INTEL || + m->cpuvendor == X86_VENDOR_ZHAOXIN) + lmce = m->mcgstatus & MCG_STATUS_LMCES; /* * Local machine check may already know that we have to panic. @@ -1540,12 +1563,12 @@ noinstr void do_machine_check(struct pt_regs *regs) */ if (lmce) { if (no_way_out) - mce_panic("Fatal local machine check", &m, msg); + mce_panic("Fatal local machine check", &err, msg); } else { order = mce_start(&no_way_out); } - taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); + taint = __mc_scan_banks(&err, regs, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1560,7 +1583,7 @@ noinstr void do_machine_check(struct pt_regs *regs) no_way_out = worst >= MCE_PANIC_SEVERITY; if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); + mce_panic("Fatal machine check on current CPU", &err, msg); } } else { /* @@ -1572,8 +1595,8 @@ noinstr void do_machine_check(struct pt_regs *regs) * make sure we have the right "msg". */ if (worst >= MCE_PANIC_SEVERITY) { - mce_severity(&m, regs, &msg, true); - mce_panic("Local fatal machine check!", &m, msg); + mce_severity(m, regs, &msg, true); + mce_panic("Local fatal machine check!", &err, msg); } } @@ -1591,16 +1614,16 @@ noinstr void do_machine_check(struct pt_regs *regs) goto out; /* Fault was in user mode and we need to take some action */ - if ((m.cs & 3) == 3) { + if ((m->cs & 3) == 3) { /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - if (!mce_usable_address(&m)) - queue_task_work(&m, msg, kill_me_now); + if (!mce_usable_address(m)) + queue_task_work(&err, msg, kill_me_now); else - queue_task_work(&m, msg, kill_me_maybe); + queue_task_work(&err, msg, kill_me_maybe); - } else if (m.mcgstatus & MCG_STATUS_SEAM_NR) { + } else if (m->mcgstatus & MCG_STATUS_SEAM_NR) { /* * Saved RIP on stack makes it look like the machine check * was taken in the kernel on the instruction following @@ -1612,8 +1635,8 @@ noinstr void do_machine_check(struct pt_regs *regs) * not occur there. Mark the page as poisoned so it won't * be added to free list when the guest is terminated. */ - if (mce_usable_address(&m)) { - struct page *p = pfn_to_online_page(m.addr >> PAGE_SHIFT); + if (mce_usable_address(m)) { + struct page *p = pfn_to_online_page(m->addr >> PAGE_SHIFT); if (p) SetPageHWPoison(p); @@ -1628,13 +1651,13 @@ noinstr void do_machine_check(struct pt_regs *regs) * corresponding exception handler which would do that is the * proper one. */ - if (m.kflags & MCE_IN_KERNEL_RECOV) { + if (m->kflags & MCE_IN_KERNEL_RECOV) { if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) - mce_panic("Failed kernel mode recovery", &m, msg); + mce_panic("Failed kernel mode recovery", &err, msg); } - if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, msg, kill_me_never); + if (m->kflags & MCE_IN_KERNEL_COPYIN) + queue_task_work(&err, msg, kill_me_never); } out: diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c index 4284749ec803..d0be6dda0c14 100644 --- a/arch/x86/kernel/cpu/mce/genpool.c +++ b/arch/x86/kernel/cpu/mce/genpool.c @@ -31,15 +31,15 @@ static LLIST_HEAD(mce_event_llist); */ static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l) { + struct mce_hw_err *err1, *err2; struct mce_evt_llist *node; - struct mce *m1, *m2; - m1 = &t->mce; + err1 = &t->err; llist_for_each_entry(node, &l->llnode, llnode) { - m2 = &node->mce; + err2 = &node->err; - if (!mce_cmp(m1, m2)) + if (!mce_cmp(&err1->m, &err2->m)) return true; } return false; @@ -73,8 +73,8 @@ struct llist_node *mce_gen_pool_prepare_records(void) void mce_gen_pool_process(struct work_struct *__unused) { - struct llist_node *head; struct mce_evt_llist *node, *tmp; + struct llist_node *head; struct mce *mce; head = llist_del_all(&mce_event_llist); @@ -83,7 +83,7 @@ void mce_gen_pool_process(struct work_struct *__unused) head = llist_reverse_order(head); llist_for_each_entry_safe(node, tmp, head, llnode) { - mce = &node->mce; + mce = &node->err.m; blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); } @@ -94,11 +94,11 @@ bool mce_gen_pool_empty(void) return llist_empty(&mce_event_llist); } -int mce_gen_pool_add(struct mce *mce) +int mce_gen_pool_add(struct mce_hw_err *err) { struct mce_evt_llist *node; - if (filter_mce(mce)) + if (filter_mce(&err->m)) return -EINVAL; if (!mce_evt_pool) @@ -110,7 +110,7 @@ int mce_gen_pool_add(struct mce *mce) return -ENOMEM; } - memcpy(&node->mce, mce, sizeof(*mce)); + memcpy(&node->err, err, sizeof(*err)); llist_add(&node->llnode, &mce_event_llist); return 0; diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 49ed3428785d..313fe682db33 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -502,8 +502,9 @@ static void prepare_msrs(void *info) static void do_inject(void) { - u64 mcg_status = 0; unsigned int cpu = i_mce.extcpu; + struct mce_hw_err err; + u64 mcg_status = 0; u8 b = i_mce.bank; i_mce.tsc = rdtsc_ordered(); @@ -517,7 +518,8 @@ static void do_inject(void) i_mce.status |= MCI_STATUS_SYNDV; if (inj_type == SW_INJ) { - mce_log(&i_mce); + err.m = i_mce; + mce_log(&err); return; } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 43c7f3b71df5..84f810598231 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -26,12 +26,12 @@ extern struct blocking_notifier_head x86_mce_decoder_chain; struct mce_evt_llist { struct llist_node llnode; - struct mce mce; + struct mce_hw_err err; }; void mce_gen_pool_process(struct work_struct *__unused); bool mce_gen_pool_empty(void); -int mce_gen_pool_add(struct mce *mce); +int mce_gen_pool_add(struct mce_hw_err *err); int mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h index f0f7b3cb2041..65aba1afcd07 100644 --- a/include/trace/events/mce.h +++ b/include/trace/events/mce.h @@ -19,9 +19,9 @@ TRACE_EVENT(mce_record, - TP_PROTO(struct mce *m), + TP_PROTO(struct mce_hw_err *err), - TP_ARGS(m), + TP_ARGS(err), TP_STRUCT__entry( __field( u64, mcgcap ) @@ -46,25 +46,25 @@ TRACE_EVENT(mce_record, ), TP_fast_assign( - __entry->mcgcap = m->mcgcap; - __entry->mcgstatus = m->mcgstatus; - __entry->status = m->status; - __entry->addr = m->addr; - __entry->misc = m->misc; - __entry->synd = m->synd; - __entry->ipid = m->ipid; - __entry->ip = m->ip; - __entry->tsc = m->tsc; - __entry->ppin = m->ppin; - __entry->walltime = m->time; - __entry->cpu = m->extcpu; - __entry->cpuid = m->cpuid; - __entry->apicid = m->apicid; - __entry->socketid = m->socketid; - __entry->cs = m->cs; - __entry->bank = m->bank; - __entry->cpuvendor = m->cpuvendor; - __entry->microcode = m->microcode; + __entry->mcgcap = err->m.mcgcap; + __entry->mcgstatus = err->m.mcgstatus; + __entry->status = err->m.status; + __entry->addr = err->m.addr; + __entry->misc = err->m.misc; + __entry->synd = err->m.synd; + __entry->ipid = err->m.ipid; + __entry->ip = err->m.ip; + __entry->tsc = err->m.tsc; + __entry->ppin = err->m.ppin; + __entry->walltime = err->m.time; + __entry->cpu = err->m.extcpu; + __entry->cpuid = err->m.cpuid; + __entry->apicid = err->m.apicid; + __entry->socketid = err->m.socketid; + __entry->cs = err->m.cs; + __entry->bank = err->m.bank; + __entry->cpuvendor = err->m.cpuvendor; + __entry->microcode = err->m.microcode; ), TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR: %016Lx, MISC: %016Lx, SYND: %016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x", -- cgit v1.2.3 From e52750fb1458ae9ea5860a08ed7a149185bc5b97 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Oct 2024 19:36:28 +0000 Subject: tracing: Add __print_dynamic_array() helper When printing a dynamic array in a trace event, the method is rather ugly. It has the format of: __print_array(__get_dynamic_array(array), __get_dynmaic_array_len(array) / el_size, el_size) Since dynamic arrays are known to the tracing infrastructure, create a helper macro that does the above for you. __print_dynamic_array(array, el_size) Which would expand to the same output. Signed-off-by: Steven Rostedt (Google) Signed-off-by: Avadhut Naik Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Link: https://lore.kernel.org/r/20241022194158.110073-3-avadhut.naik@amd.com --- include/trace/stages/stage3_trace_output.h | 8 ++++++++ include/trace/stages/stage7_class_define.h | 1 + samples/trace_events/trace-events-sample.h | 7 ++++++- 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/trace') diff --git a/include/trace/stages/stage3_trace_output.h b/include/trace/stages/stage3_trace_output.h index c1fb1355d309..1e7b0bef95f5 100644 --- a/include/trace/stages/stage3_trace_output.h +++ b/include/trace/stages/stage3_trace_output.h @@ -119,6 +119,14 @@ trace_print_array_seq(p, array, count, el_size); \ }) +#undef __print_dynamic_array +#define __print_dynamic_array(array, el_size) \ + ({ \ + __print_array(__get_dynamic_array(array), \ + __get_dynamic_array_len(array) / (el_size), \ + (el_size)); \ + }) + #undef __print_hex_dump #define __print_hex_dump(prefix_str, prefix_type, \ rowsize, groupsize, buf, len, ascii) \ diff --git a/include/trace/stages/stage7_class_define.h b/include/trace/stages/stage7_class_define.h index bcb960d16fc0..fcd564a590f4 100644 --- a/include/trace/stages/stage7_class_define.h +++ b/include/trace/stages/stage7_class_define.h @@ -22,6 +22,7 @@ #undef __get_rel_cpumask #undef __get_rel_sockaddr #undef __print_array +#undef __print_dynamic_array #undef __print_hex_dump #undef __get_buf diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 55f9a3da92d5..999f78d380ae 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -319,7 +319,7 @@ TRACE_EVENT(foo_bar, __assign_cpumask(cpum, cpumask_bits(mask)); ), - TP_printk("foo %s %d %s %s %s %s %s (%s) (%s) %s", __entry->foo, __entry->bar, + TP_printk("foo %s %d %s %s %s %s %s %s (%s) (%s) %s", __entry->foo, __entry->bar, /* * Notice here the use of some helper functions. This includes: @@ -363,6 +363,11 @@ TRACE_EVENT(foo_bar, __print_array(__get_dynamic_array(list), __get_dynamic_array_len(list) / sizeof(int), sizeof(int)), + +/* A shortcut is to use __print_dynamic_array for dynamic arrays */ + + __print_dynamic_array(list, sizeof(int)), + __get_str(str), __get_str(lstr), __get_bitmask(cpus), __get_cpumask(cpum), __get_str(vstr)) -- cgit v1.2.3 From d4fca1358ea9096f2f6ed942e2cb3a820073dfc1 Mon Sep 17 00:00:00 2001 From: Avadhut Naik Date: Tue, 22 Oct 2024 19:36:29 +0000 Subject: x86/MCE/AMD: Add support for new MCA_SYND{1,2} registers Starting with Zen4, AMD's Scalable MCA systems incorporate two new registers: MCA_SYND1 and MCA_SYND2. These registers will include supplemental error information in addition to the existing MCA_SYND register. The data within these registers is considered valid if MCA_STATUS[SyndV] is set. Userspace error decoding tools like rasdaemon gather related hardware error information through the tracepoints. Therefore, export these two registers through the mce_record tracepoint so that tools like rasdaemon can parse them and output the supplemental error information like FRU text contained in them. [ bp: Massage. ] Signed-off-by: Yazen Ghannam Signed-off-by: Avadhut Naik Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Link: https://lore.kernel.org/r/20241022194158.110073-4-avadhut.naik@amd.com --- arch/x86/include/asm/mce.h | 21 +++++++++++++++++++++ arch/x86/include/uapi/asm/mce.h | 3 ++- arch/x86/kernel/cpu/mce/amd.c | 5 ++++- arch/x86/kernel/cpu/mce/core.c | 9 ++++++++- drivers/edac/mce_amd.c | 8 ++++++-- include/trace/events/mce.h | 7 +++++-- 6 files changed, 46 insertions(+), 7 deletions(-) (limited to 'include/trace') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 4e45f45673a3..4d936ee20e24 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -122,6 +122,9 @@ #define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008 #define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009 #define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a +/* Registers MISC2 to MISC4 are at offsets B to D. */ +#define MSR_AMD64_SMCA_MC0_SYND1 0xc000200e +#define MSR_AMD64_SMCA_MC0_SYND2 0xc000200f #define MSR_AMD64_SMCA_MCx_CTL(x) (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_STATUS(x) (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_ADDR(x) (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x)) @@ -132,6 +135,8 @@ #define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x))) +#define MSR_AMD64_SMCA_MCx_SYND1(x) (MSR_AMD64_SMCA_MC0_SYND1 + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_SYND2(x) (MSR_AMD64_SMCA_MC0_SYND2 + 0x10*(x)) #define XEC(x, mask) (((x) >> 16) & mask) @@ -190,9 +195,25 @@ enum mce_notifier_prios { /** * struct mce_hw_err - Hardware Error Record. * @m: Machine Check record. + * @vendor: Vendor-specific error information. + * + * Vendor-specific fields should not be added to struct mce. Instead, vendors + * should export their vendor-specific data through their structure in the + * vendor union below. + * + * AMD's vendor data is parsed by error decoding tools for supplemental error + * information. Thus, current offsets of existing fields must be maintained. + * Only add new fields at the end of AMD's vendor structure. */ struct mce_hw_err { struct mce m; + + union vendor_info { + struct { + u64 synd1; /* MCA_SYND1 MSR */ + u64 synd2; /* MCA_SYND2 MSR */ + } amd; + } vendor; }; #define to_mce_hw_err(mce) container_of(mce, struct mce_hw_err, m) diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index db9adc081c5a..cb6b48a7c22b 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -8,7 +8,8 @@ /* * Fields are zero when not available. Also, this struct is shared with * userspace mcelog and thus must keep existing fields at current offsets. - * Only add new fields to the end of the structure + * Only add new, shared fields to the end of the structure. + * Do not add vendor-specific fields. */ struct mce { __u64 status; /* Bank's MCi_STATUS MSR */ diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 5b4d266500b2..6ca80fff1fea 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -797,8 +797,11 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) if (mce_flags.smca) { rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); - if (m->status & MCI_STATUS_SYNDV) + if (m->status & MCI_STATUS_SYNDV) { rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); + rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1); + rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2); + } } mce_log(&err); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 28e28b69d84d..7fb5556a0b53 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -202,6 +202,10 @@ static void __print_mce(struct mce_hw_err *err) if (mce_flags.smca) { if (m->synd) pr_cont("SYND %llx ", m->synd); + if (err->vendor.amd.synd1) + pr_cont("SYND1 %llx ", err->vendor.amd.synd1); + if (err->vendor.amd.synd2) + pr_cont("SYND2 %llx ", err->vendor.amd.synd2); if (m->ipid) pr_cont("IPID %llx ", m->ipid); } @@ -678,8 +682,11 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) if (mce_flags.smca) { m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); - if (m->status & MCI_STATUS_SYNDV) + if (m->status & MCI_STATUS_SYNDV) { m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); + err->vendor.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i)); + err->vendor.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i)); + } } } diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 8130c3dc64da..194d9fd47d20 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -793,6 +793,7 @@ static int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { struct mce *m = (struct mce *)data; + struct mce_hw_err *err = to_mce_hw_err(m); unsigned int fam = x86_family(m->cpuid); int ecc; @@ -850,8 +851,11 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (boot_cpu_has(X86_FEATURE_SMCA)) { pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid); - if (m->status & MCI_STATUS_SYNDV) - pr_cont(", Syndrome: 0x%016llx", m->synd); + if (m->status & MCI_STATUS_SYNDV) { + pr_cont(", Syndrome: 0x%016llx\n", m->synd); + pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx", + err->vendor.amd.synd1, err->vendor.amd.synd2); + } pr_cont("\n"); diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h index 65aba1afcd07..c1c50df9ecfd 100644 --- a/include/trace/events/mce.h +++ b/include/trace/events/mce.h @@ -43,6 +43,7 @@ TRACE_EVENT(mce_record, __field( u8, bank ) __field( u8, cpuvendor ) __field( u32, microcode ) + __dynamic_array(u8, v_data, sizeof(err->vendor)) ), TP_fast_assign( @@ -65,9 +66,10 @@ TRACE_EVENT(mce_record, __entry->bank = err->m.bank; __entry->cpuvendor = err->m.cpuvendor; __entry->microcode = err->m.microcode; + memcpy(__get_dynamic_array(v_data), &err->vendor, sizeof(err->vendor)); ), - TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR: %016Lx, MISC: %016Lx, SYND: %016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x", + TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016llx, IPID: %016llx, ADDR: %016llx, MISC: %016llx, SYND: %016llx, RIP: %02x:<%016llx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x, vendor data: %s", __entry->cpu, __entry->mcgcap, __entry->mcgstatus, __entry->bank, __entry->status, @@ -83,7 +85,8 @@ TRACE_EVENT(mce_record, __entry->walltime, __entry->socketid, __entry->apicid, - __entry->microcode) + __entry->microcode, + __print_dynamic_array(v_data, sizeof(u8))) ); #endif /* _TRACE_MCE_H */ -- cgit v1.2.3 From 654ced4a13774e5aec4d9466c97d22df7cb1e60b Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 31 Oct 2024 11:20:54 -0400 Subject: tracing: Introduce tracepoint_is_faultable() Introduce a "faultable" flag within the extended structure to know whether a tracepoint needs rcu tasks trace grace period before reclaim. This can be queried using tracepoint_is_faultable(). Acked-by: Andrii Nakryiko Tested-by: Jordan Rife Cc: Michael Jeanson Cc: Thomas Gleixner Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Cc: Jordan Rife Cc: linux-trace-kernel@vger.kernel.org Link: https://lore.kernel.org/20241031152056.744137-3-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint-defs.h | 2 ++ include/linux/tracepoint.h | 24 ++++++++++++++++++++++++ include/trace/define_trace.h | 2 +- 3 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include/trace') diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 967c08d9da84..aebf0571c736 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -32,6 +32,8 @@ struct tracepoint_func { struct tracepoint_ext { int (*regfunc)(void); void (*unregfunc)(void); + /* Flags. */ + unsigned int faultable:1; }; struct tracepoint { diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 862ab49177a4..906f3091d23d 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -104,6 +104,12 @@ void for_each_tracepoint_in_module(struct module *mod, * tracepoint_synchronize_unregister must be called between the last tracepoint * probe unregistration and the end of module exit to make sure there is no * caller executing a probe when it is freed. + * + * An alternative is to use the following for batch reclaim associated + * with a given tracepoint: + * + * - tracepoint_is_faultable() == false: call_rcu() + * - tracepoint_is_faultable() == true: call_rcu_tasks_trace() */ #ifdef CONFIG_TRACEPOINTS static inline void tracepoint_synchronize_unregister(void) @@ -111,9 +117,17 @@ static inline void tracepoint_synchronize_unregister(void) synchronize_rcu_tasks_trace(); synchronize_rcu(); } +static inline bool tracepoint_is_faultable(struct tracepoint *tp) +{ + return tp->ext && tp->ext->faultable; +} #else static inline void tracepoint_synchronize_unregister(void) { } +static inline bool tracepoint_is_faultable(struct tracepoint *tp) +{ + return false; +} #endif #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS @@ -345,6 +359,15 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) static struct tracepoint_ext __tracepoint_ext_##_name = { \ .regfunc = _reg, \ .unregfunc = _unreg, \ + .faultable = false, \ + }; \ + __DEFINE_TRACE_EXT(_name, &__tracepoint_ext_##_name, PARAMS(_proto), PARAMS(_args)); + +#define DEFINE_TRACE_SYSCALL(_name, _reg, _unreg, _proto, _args) \ + static struct tracepoint_ext __tracepoint_ext_##_name = { \ + .regfunc = _reg, \ + .unregfunc = _unreg, \ + .faultable = true, \ }; \ __DEFINE_TRACE_EXT(_name, &__tracepoint_ext_##_name, PARAMS(_proto), PARAMS(_args)); @@ -389,6 +412,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE_SYSCALL __DECLARE_TRACE #define DEFINE_TRACE_FN(name, reg, unreg, proto, args) +#define DEFINE_TRACE_SYSCALL(name, reg, unreg, proto, args) #define DEFINE_TRACE(name, proto, args) #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) #define EXPORT_TRACEPOINT_SYMBOL(name) diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index ff5fa17a6259..63fea2218afa 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -48,7 +48,7 @@ #undef TRACE_EVENT_SYSCALL #define TRACE_EVENT_SYSCALL(name, proto, args, struct, assign, print, reg, unreg) \ - DEFINE_TRACE_FN(name, reg, unreg, PARAMS(proto), PARAMS(args)) + DEFINE_TRACE_SYSCALL(name, reg, unreg, PARAMS(proto), PARAMS(args)) #undef TRACE_EVENT_NOP #define TRACE_EVENT_NOP(name, proto, args, struct, assign, print) -- cgit v1.2.3 From ad37bcd965fda43f34cf5cc051f5d310880bd1e7 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Wed, 30 Oct 2024 16:04:25 +0000 Subject: rust: add tracepoint support Make it possible to have Rust code call into tracepoints defined by C code. It is still required that the tracepoint is declared in a C header, and that this header is included in the input to bindgen. Instead of calling __DO_TRACE directly, the exported rust_do_trace_ function calls an inline helper function. This is because the `cond` argument does not exist at the callsite of DEFINE_RUST_DO_TRACE. __DECLARE_TRACE always emits an inline static and an extern declaration that is only used when CREATE_RUST_TRACE_POINTS is set. These should not end up in the final binary so it is not a problem that they sometimes are emitted without a user. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Josh Poimboeuf Cc: Jason Baron Cc: Ard Biesheuvel Cc: Miguel Ojeda Cc: Alex Gaynor Cc: Wedson Almeida Filho Cc: " =?utf-8?q?Bj=C3=B6rn_Roy_Baron?= " Cc: Benno Lossin Cc: Andreas Hindborg Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Sean Christopherson Cc: Uros Bizjak Cc: Catalin Marinas Cc: Will Deacon Cc: Marc Zyngier Cc: Oliver Upton Cc: Mark Rutland Cc: Ryan Roberts Cc: Fuad Tabba Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Anup Patel Cc: Andrew Jones Cc: Alexandre Ghiti Cc: Conor Dooley Cc: Samuel Holland Cc: Huacai Chen Cc: WANG Xuerui Cc: Bibo Mao Cc: Tiezhu Yang Cc: Andrew Morton Cc: Tianrui Zhao Link: https://lore.kernel.org/20241030-tracepoint-v12-2-eec7f0f8ad22@google.com Reviewed-by: Carlos Llamas Reviewed-by: Gary Guo Reviewed-by: Boqun Feng Signed-off-by: Alice Ryhl Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 28 ++++++++++++++++++++++- include/trace/define_trace.h | 12 ++++++++++ rust/bindings/bindings_helper.h | 1 + rust/kernel/lib.rs | 1 + rust/kernel/tracepoint.rs | 49 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 rust/kernel/tracepoint.rs (limited to 'include/trace') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 0dc67fad706c..84c4924e499f 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -225,6 +225,18 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) preempt_enable_notrace(); \ } while (0) +/* + * Declare an exported function that Rust code can call to trigger this + * tracepoint. This function does not include the static branch; that is done + * in Rust to avoid a function call when the tracepoint is disabled. + */ +#define DEFINE_RUST_DO_TRACE(name, proto, args) +#define __DEFINE_RUST_DO_TRACE(name, proto, args) \ + notrace void rust_do_trace_##name(proto) \ + { \ + __rust_do_trace_##name(args); \ + } + /* * Make sure the alignment of the structure in the __tracepoints section will * not add unwanted padding between the beginning of the section and the @@ -240,6 +252,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) extern int __traceiter_##name(data_proto); \ DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \ extern struct tracepoint __tracepoint_##name; \ + extern void rust_do_trace_##name(proto); \ static inline int \ register_trace_##name(void (*probe)(data_proto), void *data) \ { \ @@ -271,6 +284,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \ + static inline void __rust_do_trace_##name(proto) \ + { \ + __DO_TRACE(name, \ + TP_ARGS(args), \ + TP_CONDITION(cond), 0); \ + } \ static inline void trace_##name(proto) \ { \ if (static_branch_unlikely(&__tracepoint_##name.key)) \ @@ -285,6 +304,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE_SYSCALL(name, proto, args, cond, data_proto) \ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \ + static inline void __rust_do_trace_##name(proto) \ + { \ + __DO_TRACE(name, \ + TP_ARGS(args), \ + TP_CONDITION(cond), 1); \ + } \ static inline void trace_##name(proto) \ { \ if (static_branch_unlikely(&__tracepoint_##name.key)) \ @@ -339,7 +364,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) void __probestub_##_name(void *__data, proto) \ { \ } \ - DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name); + DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name); \ + DEFINE_RUST_DO_TRACE(_name, TP_PROTO(proto), TP_ARGS(args)) #define DEFINE_TRACE(name, proto, args) \ DEFINE_TRACE_FN(name, NULL, NULL, PARAMS(proto), PARAMS(args)); diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index ff5fa17a6259..0557626b6f6a 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -76,6 +76,13 @@ #define DECLARE_TRACE(name, proto, args) \ DEFINE_TRACE(name, PARAMS(proto), PARAMS(args)) +/* If requested, create helpers for calling these tracepoints from Rust. */ +#ifdef CREATE_RUST_TRACE_POINTS +#undef DEFINE_RUST_DO_TRACE +#define DEFINE_RUST_DO_TRACE(name, proto, args) \ + __DEFINE_RUST_DO_TRACE(name, PARAMS(proto), PARAMS(args)) +#endif + #undef TRACE_INCLUDE #undef __TRACE_INCLUDE @@ -134,6 +141,11 @@ # undef UNDEF_TRACE_INCLUDE_PATH #endif +#ifdef CREATE_RUST_TRACE_POINTS +# undef DEFINE_RUST_DO_TRACE +# define DEFINE_RUST_DO_TRACE(name, proto, args) +#endif + /* We may be processing more files */ #define CREATE_TRACE_POINTS diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index e0846e7e93e6..752572e638a6 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index 708ff817ccc3..55f81f49024e 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -54,6 +54,7 @@ pub mod str; pub mod sync; pub mod task; pub mod time; +pub mod tracepoint; pub mod types; pub mod uaccess; pub mod workqueue; diff --git a/rust/kernel/tracepoint.rs b/rust/kernel/tracepoint.rs new file mode 100644 index 000000000000..c6e80aa99e8e --- /dev/null +++ b/rust/kernel/tracepoint.rs @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2024 Google LLC. + +//! Logic for tracepoints. + +/// Declare the Rust entry point for a tracepoint. +/// +/// This macro generates an unsafe function that calls into C, and its safety requirements will be +/// whatever the relevant C code requires. To document these safety requirements, you may add +/// doc-comments when invoking the macro. +#[macro_export] +macro_rules! declare_trace { + ($($(#[$attr:meta])* $pub:vis unsafe fn $name:ident($($argname:ident : $argtyp:ty),* $(,)?);)*) => {$( + $( #[$attr] )* + #[inline(always)] + $pub unsafe fn $name($($argname : $argtyp),*) { + #[cfg(CONFIG_TRACEPOINTS)] + { + // SAFETY: It's always okay to query the static key for a tracepoint. + let should_trace = unsafe { + $crate::macros::paste! { + $crate::jump_label::static_branch_unlikely!( + $crate::bindings::[< __tracepoint_ $name >], + $crate::bindings::tracepoint, + key + ) + } + }; + + if should_trace { + $crate::macros::paste! { + // SAFETY: The caller guarantees that it is okay to call this tracepoint. + unsafe { $crate::bindings::[< rust_do_trace_ $name >]($($argname),*) }; + } + } + } + + #[cfg(not(CONFIG_TRACEPOINTS))] + { + // If tracepoints are disabled, insert a trivial use of each argument + // to avoid unused argument warnings. + $( let _unused = $argname; )* + } + } + )*} +} + +pub use declare_trace; -- cgit v1.2.3 From 91d39024e1b02914cc5e2dbc137908e29b269ce4 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Wed, 30 Oct 2024 16:04:26 +0000 Subject: rust: samples: add tracepoint to Rust sample This updates the Rust printing sample to invoke a tracepoint. This ensures that we have a user in-tree from the get-go even though the patch is being merged before its real user. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Josh Poimboeuf Cc: Jason Baron Cc: Ard Biesheuvel Cc: Miguel Ojeda Cc: Alex Gaynor Cc: Wedson Almeida Filho Cc: Gary Guo Cc: " =?utf-8?q?Bj=C3=B6rn_Roy_Baron?= " Cc: Benno Lossin Cc: Andreas Hindborg Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Sean Christopherson Cc: Uros Bizjak Cc: Catalin Marinas Cc: Will Deacon Cc: Marc Zyngier Cc: Oliver Upton Cc: Mark Rutland Cc: Ryan Roberts Cc: Fuad Tabba Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Anup Patel Cc: Andrew Jones Cc: Alexandre Ghiti Cc: Conor Dooley Cc: Samuel Holland Cc: Huacai Chen Cc: WANG Xuerui Cc: Bibo Mao Cc: Tiezhu Yang Cc: Andrew Morton Cc: Tianrui Zhao Link: https://lore.kernel.org/20241030-tracepoint-v12-3-eec7f0f8ad22@google.com Reviewed-by: Boqun Feng Signed-off-by: Alice Ryhl Signed-off-by: Steven Rostedt (Google) --- MAINTAINERS | 1 + include/trace/events/rust_sample.h | 31 +++++++++++++++++++++++++++++++ rust/bindings/bindings_helper.h | 1 + samples/rust/Makefile | 3 ++- samples/rust/rust_print.rs | 18 ++++++++++++++++++ samples/rust/rust_print_events.c | 8 ++++++++ 6 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 include/trace/events/rust_sample.h create mode 100644 samples/rust/rust_print_events.c (limited to 'include/trace') diff --git a/MAINTAINERS b/MAINTAINERS index a097afd76ded..a9b71411d77a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20223,6 +20223,7 @@ C: zulip://rust-for-linux.zulipchat.com P: https://rust-for-linux.com/contributing T: git https://github.com/Rust-for-Linux/linux.git rust-next F: Documentation/rust/ +F: include/trace/events/rust_sample.h F: rust/ F: samples/rust/ F: scripts/*rust* diff --git a/include/trace/events/rust_sample.h b/include/trace/events/rust_sample.h new file mode 100644 index 000000000000..dbc80ca2e465 --- /dev/null +++ b/include/trace/events/rust_sample.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Tracepoints for `samples/rust/rust_print.rs`. + * + * Copyright (C) 2024 Google, Inc. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rust_sample + +#if !defined(_RUST_SAMPLE_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _RUST_SAMPLE_TRACE_H + +#include + +TRACE_EVENT(rust_sample_loaded, + TP_PROTO(int magic_number), + TP_ARGS(magic_number), + TP_STRUCT__entry( + __field(int, magic_number) + ), + TP_fast_assign( + __entry->magic_number = magic_number; + ), + TP_printk("magic=%d", __entry->magic_number) +); + +#endif /* _RUST_SAMPLE_TRACE_H */ + +/* This part must be outside protection */ +#include diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index 752572e638a6..b072c197ce9e 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -23,6 +23,7 @@ #include #include #include +#include /* `bindgen` gets confused at certain things. */ const size_t RUST_CONST_HELPER_ARCH_SLAB_MINALIGN = ARCH_SLAB_MINALIGN; diff --git a/samples/rust/Makefile b/samples/rust/Makefile index 03086dabbea4..f29280ec4820 100644 --- a/samples/rust/Makefile +++ b/samples/rust/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 +ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_SAMPLE_RUST_MINIMAL) += rust_minimal.o -obj-$(CONFIG_SAMPLE_RUST_PRINT) += rust_print.o +obj-$(CONFIG_SAMPLE_RUST_PRINT) += rust_print.o rust_print_events.o subdir-$(CONFIG_SAMPLE_RUST_HOSTPROGS) += hostprogs diff --git a/samples/rust/rust_print.rs b/samples/rust/rust_print.rs index 6eabb0d79ea3..6d14b08cac1c 100644 --- a/samples/rust/rust_print.rs +++ b/samples/rust/rust_print.rs @@ -69,6 +69,8 @@ impl kernel::Module for RustPrint { arc_print()?; + trace::trace_rust_sample_loaded(42); + Ok(RustPrint) } } @@ -78,3 +80,19 @@ impl Drop for RustPrint { pr_info!("Rust printing macros sample (exit)\n"); } } + +mod trace { + use core::ffi::c_int; + + kernel::declare_trace! { + /// # Safety + /// + /// Always safe to call. + unsafe fn rust_sample_loaded(magic: c_int); + } + + pub(crate) fn trace_rust_sample_loaded(magic: i32) { + // SAFETY: Always safe to call. + unsafe { rust_sample_loaded(magic as c_int) } + } +} diff --git a/samples/rust/rust_print_events.c b/samples/rust/rust_print_events.c new file mode 100644 index 000000000000..a9169ff0edf1 --- /dev/null +++ b/samples/rust/rust_print_events.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2024 Google LLC + */ + +#define CREATE_TRACE_POINTS +#define CREATE_RUST_TRACE_POINTS +#include -- cgit v1.2.3 From 0aa3ef3637920799f1b2f67dfff0d698127444ac Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 9 Oct 2024 17:35:50 -0700 Subject: memcg: add tracing for memcg stat updates The memcg stats are maintained in rstat infrastructure which provides very fast updates side and reasonable read side. However memcg added plethora of stats and made the read side, which is cgroup rstat flush, very slow. To solve that, threshold was added in the memcg stats read side i.e. no need to flush the stats if updates are within the threshold. This threshold based improvement worked for sometime but more stats were added to memcg and also the read codepath was getting triggered in the performance sensitive paths which made threshold based ratelimiting ineffective. We need more visibility into the hot and cold stats i.e. stats with a lot of updates. Let's add trace to get that visibility. [shakeel.butt@linux.dev: use unsigned long type for memcg_rstat_events, per Yosry] Link: https://lkml.kernel.org/r/20241015213721.3804209-1-shakeel.butt@linux.dev Link: https://lkml.kernel.org/r/20241010003550.3695245-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Yosry Ahmed Acked-by: Johannes Weiner Reviewed-by: T.J. Mercier Cc: Michal Hocko Cc: Muchun Song Cc: JP Kobryn Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/trace/events/memcg.h | 81 ++++++++++++++++++++++++++++++++++++++++++++ mm/memcontrol.c | 13 +++++-- 2 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 include/trace/events/memcg.h (limited to 'include/trace') diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h new file mode 100644 index 000000000000..8667e57816d2 --- /dev/null +++ b/include/trace/events/memcg.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM memcg + +#if !defined(_TRACE_MEMCG_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MEMCG_H + +#include +#include + + +DECLARE_EVENT_CLASS(memcg_rstat_stats, + + TP_PROTO(struct mem_cgroup *memcg, int item, int val), + + TP_ARGS(memcg, item, val), + + TP_STRUCT__entry( + __field(u64, id) + __field(int, item) + __field(int, val) + ), + + TP_fast_assign( + __entry->id = cgroup_id(memcg->css.cgroup); + __entry->item = item; + __entry->val = val; + ), + + TP_printk("memcg_id=%llu item=%d val=%d", + __entry->id, __entry->item, __entry->val) +); + +DEFINE_EVENT(memcg_rstat_stats, mod_memcg_state, + + TP_PROTO(struct mem_cgroup *memcg, int item, int val), + + TP_ARGS(memcg, item, val) +); + +DEFINE_EVENT(memcg_rstat_stats, mod_memcg_lruvec_state, + + TP_PROTO(struct mem_cgroup *memcg, int item, int val), + + TP_ARGS(memcg, item, val) +); + +DECLARE_EVENT_CLASS(memcg_rstat_events, + + TP_PROTO(struct mem_cgroup *memcg, int item, unsigned long val), + + TP_ARGS(memcg, item, val), + + TP_STRUCT__entry( + __field(u64, id) + __field(int, item) + __field(unsigned long, val) + ), + + TP_fast_assign( + __entry->id = cgroup_id(memcg->css.cgroup); + __entry->item = item; + __entry->val = val; + ), + + TP_printk("memcg_id=%llu item=%d val=%lu", + __entry->id, __entry->item, __entry->val) +); + +DEFINE_EVENT(memcg_rstat_events, count_memcg_events, + + TP_PROTO(struct mem_cgroup *memcg, int item, unsigned long val), + + TP_ARGS(memcg, item, val) +); + + +#endif /* _TRACE_MEMCG_H */ + +/* This part must be outside protection */ +#include diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d6159266185f..c93ecedf7a96 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -71,6 +71,10 @@ #include +#define CREATE_TRACE_POINTS +#include +#undef CREATE_TRACE_POINTS + #include struct cgroup_subsys memory_cgrp_subsys __read_mostly; @@ -682,7 +686,9 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, return; __this_cpu_add(memcg->vmstats_percpu->state[i], val); - memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val)); + val = memcg_state_val_in_pages(idx, val); + memcg_rstat_updated(memcg, val); + trace_mod_memcg_state(memcg, idx, val); } /* idx can be of type enum memcg_stat_item or node_stat_item. */ @@ -741,7 +747,9 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec, /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[i], val); - memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val)); + val = memcg_state_val_in_pages(idx, val); + memcg_rstat_updated(memcg, val); + trace_mod_memcg_lruvec_state(memcg, idx, val); memcg_stats_unlock(); } @@ -832,6 +840,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, memcg_stats_lock(); __this_cpu_add(memcg->vmstats_percpu->events[i], count); memcg_rstat_updated(memcg, count); + trace_count_memcg_events(memcg, idx, count); memcg_stats_unlock(); } -- cgit v1.2.3 From 1f2d03cc535138b7cdbed0122cdc0f9e9626c6bf Mon Sep 17 00:00:00 2001 From: Jaewon Kim Date: Fri, 11 Oct 2024 21:49:28 +0900 Subject: vmscan: add a vmscan event for reclaim_pages reclaim_folio_list uses a dummy reclaim_stat and is not being used. To know the memory stat, add a new trace event. This is useful how how many pages are not reclaimed or why. This is an example: mm_vmscan_reclaim_pages: nid=0 nr_scanned=112 nr_reclaimed=112 nr_dirty=0 nr_writeback=0 nr_congested=0 nr_immediate=0 nr_activate_anon=0 nr_activate_file=0 nr_ref_keep=0 nr_unmap_fail=0 Currently reclaim_folio_list is only called by reclaim_pages, and reclaim_pages is used by damon and madvise. In the latest Android, reclaim_pages is also used by shmem to reclaim all pages in a address_space. [jaewon31.kim@samsung.com: use sc.nr_scanned rather than new counting] Link: https://lkml.kernel.org/r/20241016143227.961162-1-jaewon31.kim@samsung.com Link: https://lkml.kernel.org/r/20241011124928.1224813-1-jaewon31.kim@samsung.com Signed-off-by: Jaewon Kim Acked-by: Vlastimil Babka Cc: Jaewon Kim Cc: Kalesh Singh Cc: Minchan Kim Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/trace/events/vmscan.h | 45 +++++++++++++++++++++++++++++++++++++++++++ mm/vmscan.c | 5 +++-- 2 files changed, 48 insertions(+), 2 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 1a488c30afa5..490958fa10de 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -346,6 +346,51 @@ TRACE_EVENT(mm_vmscan_write_folio, show_reclaim_flags(__entry->reclaim_flags)) ); +TRACE_EVENT(mm_vmscan_reclaim_pages, + + TP_PROTO(int nid, + unsigned long nr_scanned, unsigned long nr_reclaimed, + struct reclaim_stat *stat), + + TP_ARGS(nid, nr_scanned, nr_reclaimed, stat), + + TP_STRUCT__entry( + __field(int, nid) + __field(unsigned long, nr_scanned) + __field(unsigned long, nr_reclaimed) + __field(unsigned long, nr_dirty) + __field(unsigned long, nr_writeback) + __field(unsigned long, nr_congested) + __field(unsigned long, nr_immediate) + __field(unsigned int, nr_activate0) + __field(unsigned int, nr_activate1) + __field(unsigned long, nr_ref_keep) + __field(unsigned long, nr_unmap_fail) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->nr_scanned = nr_scanned; + __entry->nr_reclaimed = nr_reclaimed; + __entry->nr_dirty = stat->nr_dirty; + __entry->nr_writeback = stat->nr_writeback; + __entry->nr_congested = stat->nr_congested; + __entry->nr_immediate = stat->nr_immediate; + __entry->nr_activate0 = stat->nr_activate[0]; + __entry->nr_activate1 = stat->nr_activate[1]; + __entry->nr_ref_keep = stat->nr_ref_keep; + __entry->nr_unmap_fail = stat->nr_unmap_fail; + ), + + TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld", + __entry->nid, + __entry->nr_scanned, __entry->nr_reclaimed, + __entry->nr_dirty, __entry->nr_writeback, + __entry->nr_congested, __entry->nr_immediate, + __entry->nr_activate0, __entry->nr_activate1, + __entry->nr_ref_keep, __entry->nr_unmap_fail) +); + TRACE_EVENT(mm_vmscan_lru_shrink_inactive, TP_PROTO(int nid, diff --git a/mm/vmscan.c b/mm/vmscan.c index 6a3c498383fa..5bec29914f12 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2129,7 +2129,7 @@ static void shrink_active_list(unsigned long nr_to_scan, static unsigned int reclaim_folio_list(struct list_head *folio_list, struct pglist_data *pgdat) { - struct reclaim_stat dummy_stat; + struct reclaim_stat stat; unsigned int nr_reclaimed; struct folio *folio; struct scan_control sc = { @@ -2140,12 +2140,13 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list, .no_demotion = 1, }; - nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, true); + nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true); while (!list_empty(folio_list)) { folio = lru_to_folio(folio_list); list_del(&folio->lru); folio_putback_lru(folio); } + trace_mm_vmscan_reclaim_pages(pgdat->node_id, sc.nr_scanned, nr_reclaimed, &stat); return nr_reclaimed; } -- cgit v1.2.3 From fc9de52de38f656399d2ce40f7349a6b5f86e787 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 Nov 2024 13:03:22 +0000 Subject: rxrpc: Fix missing locking causing hanging calls If a call gets aborted (e.g. because kafs saw a signal) between it being queued for connection and the I/O thread picking up the call, the abort will be prioritised over the connection and it will be removed from local->new_client_calls by rxrpc_disconnect_client_call() without a lock being held. This may cause other calls on the list to disappear if a race occurs. Fix this by taking the client_call_lock when removing a call from whatever list its ->wait_link happens to be on. Signed-off-by: David Howells cc: linux-afs@lists.infradead.org Reported-by: Marc Dionne Fixes: 9d35d880e0e4 ("rxrpc: Move client call connection to the I/O thread") Link: https://patch.msgid.link/726660.1730898202@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 1 + net/rxrpc/conn_client.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index a1b126a6b0d7..cc22596c7250 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -287,6 +287,7 @@ EM(rxrpc_call_see_input, "SEE input ") \ EM(rxrpc_call_see_release, "SEE release ") \ EM(rxrpc_call_see_userid_exists, "SEE u-exists") \ + EM(rxrpc_call_see_waiting_call, "SEE q-conn ") \ E_(rxrpc_call_see_zap, "SEE zap ") #define rxrpc_txqueue_traces \ diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index d25bf1cf3670..bb11e8289d6d 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -516,6 +516,7 @@ void rxrpc_connect_client_calls(struct rxrpc_local *local) spin_lock(&local->client_call_lock); list_move_tail(&call->wait_link, &bundle->waiting_calls); + rxrpc_see_call(call, rxrpc_call_see_waiting_call); spin_unlock(&local->client_call_lock); if (rxrpc_bundle_has_space(bundle)) @@ -586,7 +587,10 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call _debug("call is waiting"); ASSERTCMP(call->call_id, ==, 0); ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags)); + /* May still be on ->new_client_calls. */ + spin_lock(&local->client_call_lock); list_del_init(&call->wait_link); + spin_unlock(&local->client_call_lock); return; } -- cgit v1.2.3 From 93970b6a143bd9f184ebab37c5862a204fb5690e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 29 Oct 2024 15:21:43 -0400 Subject: sunrpc: remove newlines from tracepoints Tracepoint strings don't require newlines (and in fact, they are undesirable). Signed-off-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/sunrpc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 5e8495216689..b13dc275ef4a 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -719,7 +719,7 @@ TRACE_EVENT(rpc_xdr_overflow, ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER - " %sv%d %s requested=%zu p=%p end=%p xdr=[%p,%zu]/%u/[%p,%zu]/%u\n", + " %sv%d %s requested=%zu p=%p end=%p xdr=[%p,%zu]/%u/[%p,%zu]/%u", __entry->task_id, __entry->client_id, __get_str(progname), __entry->version, __get_str(procedure), __entry->requested, __entry->p, __entry->end, @@ -777,7 +777,7 @@ TRACE_EVENT(rpc_xdr_alignment, ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER - " %sv%d %s offset=%zu copied=%u xdr=[%p,%zu]/%u/[%p,%zu]/%u\n", + " %sv%d %s offset=%zu copied=%u xdr=[%p,%zu]/%u/[%p,%zu]/%u", __entry->task_id, __entry->client_id, __get_str(progname), __entry->version, __get_str(procedure), __entry->offset, __entry->copied, -- cgit v1.2.3 From f914ac96ee8828368f5a24553e75216d76da0b42 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Mon, 28 Oct 2024 19:11:06 -0700 Subject: memcg: add flush tracepoint This tracepoint gives visibility on how often the flushing of memcg stats occurs and contains info on whether it was forced, skipped, and the value of stats updated. It can help with understanding how readers are affected by having to perform the flush, and the effectiveness of the flush by inspecting the number of stats updated. Paired with the recently added tracepoints for tracing rstat updates, it can also help show correlation where stats exceed thresholds frequently. Link: https://lkml.kernel.org/r/20241029021106.25587-3-inwardvessel@gmail.com Signed-off-by: JP Kobryn Reviewed-by: Yosry Ahmed Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/trace/events/memcg.h | 25 +++++++++++++++++++++++++ mm/memcontrol.c | 7 ++++++- 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/trace') diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h index 8667e57816d2..dfe2f51019b4 100644 --- a/include/trace/events/memcg.h +++ b/include/trace/events/memcg.h @@ -74,6 +74,31 @@ DEFINE_EVENT(memcg_rstat_events, count_memcg_events, TP_ARGS(memcg, item, val) ); +TRACE_EVENT(memcg_flush_stats, + + TP_PROTO(struct mem_cgroup *memcg, s64 stats_updates, + bool force, bool needs_flush), + + TP_ARGS(memcg, stats_updates, force, needs_flush), + + TP_STRUCT__entry( + __field(u64, id) + __field(s64, stats_updates) + __field(bool, force) + __field(bool, needs_flush) + ), + + TP_fast_assign( + __entry->id = cgroup_id(memcg->css.cgroup); + __entry->stats_updates = stats_updates; + __entry->force = force; + __entry->needs_flush = needs_flush; + ), + + TP_printk("memcg_id=%llu stats_updates=%lld force=%d needs_flush=%d", + __entry->id, __entry->stats_updates, + __entry->force, __entry->needs_flush) +); #endif /* _TRACE_MEMCG_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a4da834b4aa3..6486e9652843 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -597,7 +597,12 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force) { - if (!force && !memcg_vmstats_needs_flush(memcg->vmstats)) + bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats); + + trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates), + force, needs_flush); + + if (!force && !needs_flush) return; if (mem_cgroup_is_root(memcg)) -- cgit v1.2.3 From c28b97f53be7b598ae5fb47cc9dc80912019d7a8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 24 Sep 2024 17:12:32 +0100 Subject: btrfs: qgroups: remove bytenr field from struct btrfs_qgroup_extent_record Now that we track qgroup extent records in a xarray we don't need to have a "bytenr" field in struct btrfs_qgroup_extent_record, since we can get it from the index of the record in the xarray. So remove the field and grab the bytenr from either the index key or any other place where it's available (delayed refs). This reduces the size of struct btrfs_qgroup_extent_record from 40 bytes down to 32 bytes, meaning that we now can store 128 instances of this structure instead of 102 per 4K page. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 8 ++++---- fs/btrfs/qgroup.c | 31 +++++++++++++++++-------------- fs/btrfs/qgroup.h | 13 ++++++++++--- include/trace/events/btrfs.h | 17 ++++++++++------- 4 files changed, 41 insertions(+), 28 deletions(-) (limited to 'include/trace') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 65d841d7142c..1684857554c6 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -830,7 +830,6 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, qrecord->data_rsv = reserved; qrecord->data_rsv_refroot = generic_ref->ref_root; } - qrecord->bytenr = generic_ref->bytenr; qrecord->num_bytes = generic_ref->num_bytes; qrecord->old_roots = NULL; } @@ -860,11 +859,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, if (qrecord) { int ret; - ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord); + ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord, + head_ref->bytenr); if (ret) { /* Clean up if insertion fails or item exists. */ xa_release(&delayed_refs->dirty_extents, - qrecord->bytenr >> fs_info->sectorsize_bits); + head_ref->bytenr >> fs_info->sectorsize_bits); /* Caller responsible for freeing qrecord on error. */ if (ret < 0) return ERR_PTR(ret); @@ -1074,7 +1074,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_node_cachep, node); if (qrecord_inserted) - return btrfs_qgroup_trace_extent_post(trans, record); + return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr); return 0; free_record: diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a0e8deca87a7..da618e34acd4 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2001,27 +2001,28 @@ out: * Return <0 for insertion failure, caller can free @record safely. */ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record) + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record, + u64 bytenr) { struct btrfs_qgroup_extent_record *existing, *ret; - const unsigned long index = (record->bytenr >> fs_info->sectorsize_bits); + const unsigned long index = (bytenr >> fs_info->sectorsize_bits); if (!btrfs_qgroup_full_accounting(fs_info)) return 1; #if BITS_PER_LONG == 32 - if (record->bytenr >= MAX_LFS_FILESIZE) { + if (bytenr >= MAX_LFS_FILESIZE) { btrfs_err_rl(fs_info, "qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit", - record->bytenr); + bytenr); btrfs_err_32bit_limit(fs_info); return -EOVERFLOW; } #endif lockdep_assert_held(&delayed_refs->lock); - trace_btrfs_qgroup_trace_extent(fs_info, record); + trace_btrfs_qgroup_trace_extent(fs_info, record, bytenr); xa_lock(&delayed_refs->dirty_extents); existing = xa_load(&delayed_refs->dirty_extents, index); @@ -2066,7 +2067,8 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, * transaction committing, but not now as qgroup accounting will be wrong again. */ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, - struct btrfs_qgroup_extent_record *qrecord) + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr) { struct btrfs_backref_walk_ctx ctx = { 0 }; int ret; @@ -2097,7 +2099,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) return 0; - ctx.bytenr = qrecord->bytenr; + ctx.bytenr = bytenr; ctx.fs_info = trans->fs_info; ret = btrfs_find_all_roots(&ctx, true); @@ -2154,12 +2156,11 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, } delayed_refs = &trans->transaction->delayed_refs; - record->bytenr = bytenr; record->num_bytes = num_bytes; record->old_roots = NULL; spin_lock(&delayed_refs->lock); - ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); + ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr); spin_unlock(&delayed_refs->lock); if (ret) { /* Clean up if insertion fails or item exists. */ @@ -2167,7 +2168,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, kfree(record); return 0; } - return btrfs_qgroup_trace_extent_post(trans, record); + return btrfs_qgroup_trace_extent_post(trans, record, bytenr); } /* @@ -3043,14 +3044,16 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) delayed_refs = &trans->transaction->delayed_refs; qgroup_to_skip = delayed_refs->qgroup_to_skip; xa_for_each(&delayed_refs->dirty_extents, index, record) { + const u64 bytenr = (((u64)index) << fs_info->sectorsize_bits); + num_dirty_extents++; - trace_btrfs_qgroup_account_extents(fs_info, record); + trace_btrfs_qgroup_account_extents(fs_info, record, bytenr); if (!ret && !(fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { struct btrfs_backref_walk_ctx ctx = { 0 }; - ctx.bytenr = record->bytenr; + ctx.bytenr = bytenr; ctx.fs_info = fs_info; /* @@ -3092,7 +3095,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) ulist_del(record->old_roots, qgroup_to_skip, 0); } - ret = btrfs_qgroup_account_extent(trans, record->bytenr, + ret = btrfs_qgroup_account_extent(trans, bytenr, record->num_bytes, record->old_roots, new_roots); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index c229256d6fd5..c36019abc82f 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -127,7 +127,12 @@ struct btrfs_inode; * Record a dirty extent, and info qgroup to update quota on it */ struct btrfs_qgroup_extent_record { - u64 bytenr; + /* + * The bytenr of the extent is given by its index in the dirty_extents + * xarray of struct btrfs_delayed_ref_root left shifted by + * fs_info->sectorsize_bits. + */ + u64 num_bytes; /* @@ -345,9 +350,11 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); int btrfs_qgroup_trace_extent_nolock( struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record); + struct btrfs_qgroup_extent_record *record, + u64 bytenr); int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, - struct btrfs_qgroup_extent_record *qrecord); + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr); int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index af6b3827fb1d..8d2ff32fb3b0 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1706,9 +1706,10 @@ DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data, DECLARE_EVENT_CLASS(btrfs_qgroup_extent, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_qgroup_extent_record *rec), + const struct btrfs_qgroup_extent_record *rec, + u64 bytenr), - TP_ARGS(fs_info, rec), + TP_ARGS(fs_info, rec, bytenr), TP_STRUCT__entry_btrfs( __field( u64, bytenr ) @@ -1716,7 +1717,7 @@ DECLARE_EVENT_CLASS(btrfs_qgroup_extent, ), TP_fast_assign_btrfs(fs_info, - __entry->bytenr = rec->bytenr; + __entry->bytenr = bytenr; __entry->num_bytes = rec->num_bytes; ), @@ -1727,17 +1728,19 @@ DECLARE_EVENT_CLASS(btrfs_qgroup_extent, DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_qgroup_extent_record *rec), + const struct btrfs_qgroup_extent_record *rec, + u64 bytenr), - TP_ARGS(fs_info, rec) + TP_ARGS(fs_info, rec, bytenr) ); DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_trace_extent, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_qgroup_extent_record *rec), + const struct btrfs_qgroup_extent_record *rec, + u64 bytenr), - TP_ARGS(fs_info, rec) + TP_ARGS(fs_info, rec, bytenr) ); TRACE_EVENT(qgroup_num_dirty_extents, -- cgit v1.2.3 From b628c139519ae0e5453e5327161a41bae966201d Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 3 Oct 2024 15:27:27 +0100 Subject: btrfs: remove unused btrfs_try_tree_write_lock() btrfs_try_tree_write_lock() has been unused since commit 50b21d7a066f ("btrfs: submit a writeback bio per extent_buffer"). Remove it as we don't need it anymore. Reviewed-by: Christoph Hellwig Reviewed-by: Qu Wenruo Signed-off-by: Dr. David Alan Gilbert Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/locking.c | 15 --------------- fs/btrfs/locking.h | 1 - include/trace/events/btrfs.h | 1 - 3 files changed, 17 deletions(-) (limited to 'include/trace') diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 6a0b7abb5bd9..9a7a7b723305 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -161,21 +161,6 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) return 0; } -/* - * Try-lock for write. - * - * Return 1 if the rwlock has been taken, 0 otherwise - */ -int btrfs_try_tree_write_lock(struct extent_buffer *eb) -{ - if (down_write_trylock(&eb->lock)) { - btrfs_set_eb_lock_owner(eb, current->pid); - trace_btrfs_try_tree_write_lock(eb); - return 1; - } - return 0; -} - /* * Release read lock. */ diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 3c15c75e0582..46c8be2afab1 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -180,7 +180,6 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb) void btrfs_tree_read_unlock(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); -int btrfs_try_tree_write_lock(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 8d2ff32fb3b0..bd515415ea8b 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2344,7 +2344,6 @@ DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_unlock_blocking); DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_read); DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_write); DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_read_lock); -DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_write_lock); DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_lock_atomic); DECLARE_EVENT_CLASS(btrfs__space_info_update, -- cgit v1.2.3 From 70a5f9e266cf1f544f33ce56f1eee634d79f3030 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 4 Sep 2024 17:03:43 +0100 Subject: btrfs: simplify tracking progress for the extent map shrinker Now that the extent map shrinker can only be run by a single task (as a work queue item) there is no need to keep the progress of the shrinker protected by a spinlock and passing the progress to trace events as parameters. So remove the lock and simplify the arguments for the trace events. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 -- fs/btrfs/extent_map.c | 55 ++++++++++---------------------------------- fs/btrfs/fs.h | 1 - include/trace/events/btrfs.h | 21 ++++++++--------- 4 files changed, 22 insertions(+), 57 deletions(-) (limited to 'include/trace') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8ad625a8d9f9..0cbc31908264 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2852,8 +2852,6 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; - spin_lock_init(&fs_info->extent_map_shrinker_lock); - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) return ret; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 4b9ab3f9c731..f6e338dd28ec 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1118,8 +1118,6 @@ out_free_pre: struct btrfs_em_shrink_ctx { long nr_to_scan; long scanned; - u64 last_ino; - u64 last_root; }; static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx) @@ -1211,16 +1209,17 @@ next: static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_inode *inode; long nr_dropped = 0; - u64 min_ino = ctx->last_ino + 1; + u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1; inode = btrfs_find_first_inode(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); min_ino = btrfs_ino(inode) + 1; - ctx->last_ino = btrfs_ino(inode); + fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode); btrfs_add_delayed_iput(inode); if (ctx->scanned >= ctx->nr_to_scan || @@ -1240,14 +1239,14 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx * inode if there is one or we will find out this was the last * one and move to the next root. */ - ctx->last_root = btrfs_root_id(root); + fs_info->extent_map_shrinker_last_root = btrfs_root_id(root); } else { /* * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so * that when processing the next root we start from its first inode. */ - ctx->last_ino = 0; - ctx->last_root = btrfs_root_id(root) + 1; + fs_info->extent_map_shrinker_last_ino = 0; + fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1; } return nr_dropped; @@ -1267,25 +1266,13 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) ctx.scanned = 0; ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan); - /* - * In case we have multiple tasks running this shrinker, make the next - * one start from the next inode in case it starts before we finish. - */ - spin_lock(&fs_info->extent_map_shrinker_lock); - ctx.last_ino = fs_info->extent_map_shrinker_last_ino; - fs_info->extent_map_shrinker_last_ino++; - ctx.last_root = fs_info->extent_map_shrinker_last_root; - spin_unlock(&fs_info->extent_map_shrinker_lock); - - start_root_id = ctx.last_root; - next_root_id = ctx.last_root; + start_root_id = fs_info->extent_map_shrinker_last_root; + next_root_id = fs_info->extent_map_shrinker_last_root; if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan, - nr, ctx.last_root, - ctx.last_ino); + trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr); } while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) { @@ -1302,8 +1289,8 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) spin_unlock(&fs_info->fs_roots_radix_lock); if (start_root_id > 0 && !cycled) { next_root_id = 0; - ctx.last_root = 0; - ctx.last_ino = 0; + fs_info->extent_map_shrinker_last_root = 0; + fs_info->extent_map_shrinker_last_ino = 0; cycled = true; continue; } @@ -1322,28 +1309,10 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) btrfs_put_root(root); } - /* - * In case of multiple tasks running this extent map shrinking code this - * isn't perfect but it's simple and silences things like KCSAN. It's - * not possible to know which task made more progress because we can - * cycle back to the first root and first inode if it's not the first - * time the shrinker ran, see the above logic. Also a task that started - * later may finish earlier than another task and made less progress. So - * make this simple and update to the progress of the last task that - * finished, with the occasional possibility of having two consecutive - * runs of the shrinker process the same inodes. - */ - spin_lock(&fs_info->extent_map_shrinker_lock); - fs_info->extent_map_shrinker_last_ino = ctx.last_ino; - fs_info->extent_map_shrinker_last_root = ctx.last_root; - spin_unlock(&fs_info->extent_map_shrinker_lock); - if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, - nr, ctx.last_root, - ctx.last_ino); + trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr); } atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 678ce5259332..827af2b027df 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -635,7 +635,6 @@ struct btrfs_fs_info { s32 delalloc_batch; struct percpu_counter evictable_extent_maps; - spinlock_t extent_map_shrinker_lock; u64 extent_map_shrinker_last_root; u64 extent_map_shrinker_last_ino; atomic64_t extent_map_shrinker_nr_to_scan; diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index bd515415ea8b..879016b4e391 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2555,10 +2555,9 @@ TRACE_EVENT(btrfs_extent_map_shrinker_count, TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, - TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr, - u64 last_root_id, u64 last_ino), + TP_PROTO(const struct btrfs_fs_info *fs_info, long nr), - TP_ARGS(fs_info, nr_to_scan, nr, last_root_id, last_ino), + TP_ARGS(fs_info, nr), TP_STRUCT__entry_btrfs( __field( long, nr_to_scan ) @@ -2568,10 +2567,11 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, ), TP_fast_assign_btrfs(fs_info, - __entry->nr_to_scan = nr_to_scan; + __entry->nr_to_scan = \ + atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan); __entry->nr = nr; - __entry->last_root_id = last_root_id; - __entry->last_ino = last_ino; + __entry->last_root_id = fs_info->extent_map_shrinker_last_root; + __entry->last_ino = fs_info->extent_map_shrinker_last_ino; ), TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", @@ -2581,10 +2581,9 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit, - TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr, - u64 last_root_id, u64 last_ino), + TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr), - TP_ARGS(fs_info, nr_dropped, nr, last_root_id, last_ino), + TP_ARGS(fs_info, nr_dropped, nr), TP_STRUCT__entry_btrfs( __field( long, nr_dropped ) @@ -2596,8 +2595,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit, TP_fast_assign_btrfs(fs_info, __entry->nr_dropped = nr_dropped; __entry->nr = nr; - __entry->last_root_id = last_root_id; - __entry->last_ino = last_ino; + __entry->last_root_id = fs_info->extent_map_shrinker_last_root; + __entry->last_ino = fs_info->extent_map_shrinker_last_ino; ), TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", -- cgit v1.2.3 From e7fa845010f12bb98251a2bf9dcf39fd601f9424 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 5 Sep 2024 11:31:49 +0100 Subject: btrfs: rename extent map shrinker members from struct btrfs_fs_info The names for the members of struct btrfs_fs_info related to the extent map shrinker are a bit too long, so rename them to be shorter by replacing the "extent_map_" prefix with the "em_" prefix. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_map.c | 32 ++++++++++++++++---------------- fs/btrfs/fs.h | 8 ++++---- include/trace/events/btrfs.h | 10 +++++----- 4 files changed, 26 insertions(+), 26 deletions(-) (limited to 'include/trace') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0cbc31908264..bcf6e53bc2a7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4291,7 +4291,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); - cancel_work_sync(&fs_info->extent_map_shrinker_work); + cancel_work_sync(&fs_info->em_shrinker_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index f6e338dd28ec..67ce85ff0ae2 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1212,14 +1212,14 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_inode *inode; long nr_dropped = 0; - u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1; + u64 min_ino = fs_info->em_shrinker_last_ino + 1; inode = btrfs_find_first_inode(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); min_ino = btrfs_ino(inode) + 1; - fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode); + fs_info->em_shrinker_last_ino = btrfs_ino(inode); btrfs_add_delayed_iput(inode); if (ctx->scanned >= ctx->nr_to_scan || @@ -1239,14 +1239,14 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx * inode if there is one or we will find out this was the last * one and move to the next root. */ - fs_info->extent_map_shrinker_last_root = btrfs_root_id(root); + fs_info->em_shrinker_last_root = btrfs_root_id(root); } else { /* * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so * that when processing the next root we start from its first inode. */ - fs_info->extent_map_shrinker_last_ino = 0; - fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1; + fs_info->em_shrinker_last_ino = 0; + fs_info->em_shrinker_last_root = btrfs_root_id(root) + 1; } return nr_dropped; @@ -1261,13 +1261,13 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) bool cycled = false; long nr_dropped = 0; - fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work); + fs_info = container_of(work, struct btrfs_fs_info, em_shrinker_work); ctx.scanned = 0; - ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan); + ctx.nr_to_scan = atomic64_read(&fs_info->em_shrinker_nr_to_scan); - start_root_id = fs_info->extent_map_shrinker_last_root; - next_root_id = fs_info->extent_map_shrinker_last_root; + start_root_id = fs_info->em_shrinker_last_root; + next_root_id = fs_info->em_shrinker_last_root; if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); @@ -1289,8 +1289,8 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) spin_unlock(&fs_info->fs_roots_radix_lock); if (start_root_id > 0 && !cycled) { next_root_id = 0; - fs_info->extent_map_shrinker_last_root = 0; - fs_info->extent_map_shrinker_last_ino = 0; + fs_info->em_shrinker_last_root = 0; + fs_info->em_shrinker_last_ino = 0; cycled = true; continue; } @@ -1315,7 +1315,7 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr); } - atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); + atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); } void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) @@ -1335,14 +1335,14 @@ void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) * current value is zero, instead of incrementing the counter by * nr_to_scan. */ - if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0) + if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0) return; - queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work); + queue_work(system_unbound_wq, &fs_info->em_shrinker_work); } void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) { - atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); - INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker); + atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); + INIT_WORK(&fs_info->em_shrinker_work, btrfs_extent_map_shrinker_worker); } diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 827af2b027df..79a1a3d6f04d 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -635,10 +635,10 @@ struct btrfs_fs_info { s32 delalloc_batch; struct percpu_counter evictable_extent_maps; - u64 extent_map_shrinker_last_root; - u64 extent_map_shrinker_last_ino; - atomic64_t extent_map_shrinker_nr_to_scan; - struct work_struct extent_map_shrinker_work; + u64 em_shrinker_last_root; + u64 em_shrinker_last_ino; + atomic64_t em_shrinker_nr_to_scan; + struct work_struct em_shrinker_work; /* Protected by 'trans_lock'. */ struct list_head dirty_cowonly_roots; diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 879016b4e391..4df93ca9b7a8 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2568,10 +2568,10 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, TP_fast_assign_btrfs(fs_info, __entry->nr_to_scan = \ - atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan); + atomic64_read(&fs_info->em_shrinker_nr_to_scan); __entry->nr = nr; - __entry->last_root_id = fs_info->extent_map_shrinker_last_root; - __entry->last_ino = fs_info->extent_map_shrinker_last_ino; + __entry->last_root_id = fs_info->em_shrinker_last_root; + __entry->last_ino = fs_info->em_shrinker_last_ino; ), TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", @@ -2595,8 +2595,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit, TP_fast_assign_btrfs(fs_info, __entry->nr_dropped = nr_dropped; __entry->nr = nr; - __entry->last_root_id = fs_info->extent_map_shrinker_last_root; - __entry->last_ino = fs_info->extent_map_shrinker_last_ino; + __entry->last_root_id = fs_info->em_shrinker_last_root; + __entry->last_ino = fs_info->em_shrinker_last_ino; ), TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", -- cgit v1.2.3 From 8b9a7bd4d6c83300e50bb1d7071c6032a07e2fed Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 Nov 2024 13:00:45 +0000 Subject: rxrpc: Add a tracepoint for aborts being proposed Add a tracepoint to rxrpc to trace the proposal of an abort. The abort is performed asynchronously by the I/O thread. Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/726356.1730898045@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 25 +++++++++++++++++++++++++ net/rxrpc/sendmsg.c | 1 + 2 files changed, 26 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index cc22596c7250..d03e0bd8c028 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -773,6 +773,31 @@ TRACE_EVENT(rxrpc_rx_done, TP_printk("r=%d a=%d", __entry->result, __entry->abort_code) ); +TRACE_EVENT(rxrpc_abort_call, + TP_PROTO(const struct rxrpc_call *call, int abort_code), + + TP_ARGS(call, abort_code), + + TP_STRUCT__entry( + __field(unsigned int, call_nr) + __field(enum rxrpc_abort_reason, why) + __field(int, abort_code) + __field(int, error) + ), + + TP_fast_assign( + __entry->call_nr = call->debug_id; + __entry->why = call->send_abort_why; + __entry->abort_code = abort_code; + __entry->error = call->send_abort_err; + ), + + TP_printk("c=%08x a=%d e=%d %s", + __entry->call_nr, + __entry->abort_code, __entry->error, + __print_symbolic(__entry->why, rxrpc_abort_reasons)) + ); + TRACE_EVENT(rxrpc_abort, TP_PROTO(unsigned int call_nr, enum rxrpc_abort_reason why, u32 cid, u32 call_id, rxrpc_seq_t seq, int abort_code, int error), diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 23d18fe5de9f..6abb8eec1b2b 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -29,6 +29,7 @@ bool rxrpc_propose_abort(struct rxrpc_call *call, s32 abort_code, int error, call->send_abort_why = why; call->send_abort_err = error; call->send_abort_seq = 0; + trace_rxrpc_abort_call(call, abort_code); /* Request abort locklessly vs rxrpc_input_call_event(). */ smp_store_release(&call->send_abort, abort_code); rxrpc_poke_call(call, rxrpc_call_poke_abort); -- cgit v1.2.3 From 9b5c87d47949292ff21ee2fadd1e83820662a430 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 5 Nov 2024 12:34:57 +0100 Subject: mm: mmap_lock: check trace_mmap_lock_$type_enabled() instead of regcount Since 7d6be67cfdd4 ("mm: mmap_lock: replace get_memcg_path_buf() with on-stack buffer") we use trace_mmap_lock_reg()/unreg() only to maintain an atomic reg_refcount which is checked to avoid performing get_mm_memcg_path() in case none of the tracepoints using it is enabled. This can be achieved directly by putting all the work needed for the tracepoint behind the trace_mmap_lock_##type##_enabled(), as suggested by Documentation/trace/tracepoints.rst and with the following advantages: - uses the tracepoint's static key instead of evaluating a branch - the check tracepoint specific, not shared by all of them - we can get rid of trace_mmap_lock_reg()/unreg() completely Thus use the trace_..._enabled() check and remove unnecessary code. Link: https://lkml.kernel.org/r/20241105113456.95066-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Axel Rasmussen Cc: Tetsuo Handa Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Signed-off-by: Andrew Morton --- include/trace/events/mmap_lock.h | 14 ++++---------- mm/mmap_lock.c | 39 ++++++++------------------------------- 2 files changed, 12 insertions(+), 41 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h index f2827f98a44f..bc2e3ad787b3 100644 --- a/include/trace/events/mmap_lock.h +++ b/include/trace/events/mmap_lock.h @@ -10,9 +10,6 @@ struct mm_struct; -extern int trace_mmap_lock_reg(void); -extern void trace_mmap_lock_unreg(void); - DECLARE_EVENT_CLASS(mmap_lock, TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), @@ -40,16 +37,15 @@ DECLARE_EVENT_CLASS(mmap_lock, ); #define DEFINE_MMAP_LOCK_EVENT(name) \ - DEFINE_EVENT_FN(mmap_lock, name, \ + DEFINE_EVENT(mmap_lock, name, \ TP_PROTO(struct mm_struct *mm, const char *memcg_path, \ bool write), \ - TP_ARGS(mm, memcg_path, write), \ - trace_mmap_lock_reg, trace_mmap_lock_unreg) + TP_ARGS(mm, memcg_path, write)) DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking); DEFINE_MMAP_LOCK_EVENT(mmap_lock_released); -TRACE_EVENT_FN(mmap_lock_acquire_returned, +TRACE_EVENT(mmap_lock_acquire_returned, TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write, bool success), @@ -76,9 +72,7 @@ TRACE_EVENT_FN(mmap_lock_acquire_returned, __get_str(memcg_path), __entry->write ? "true" : "false", __entry->success ? "true" : "false" - ), - - trace_mmap_lock_reg, trace_mmap_lock_unreg + ) ); #endif /* _TRACE_MMAP_LOCK_H */ diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 368b840e7508..f186d57df2c6 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -19,43 +19,23 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released); #ifdef CONFIG_MEMCG -static atomic_t reg_refcount; - /* * Size of the buffer for memcg path names. Ignoring stack trace support, * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it. */ #define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL -int trace_mmap_lock_reg(void) -{ - atomic_inc(®_refcount); - return 0; -} - -void trace_mmap_lock_unreg(void) -{ - atomic_dec(®_refcount); -} - -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ - do { \ - char buf[MEMCG_PATH_BUF_SIZE]; \ - get_mm_memcg_path(mm, buf, sizeof(buf)); \ - trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \ +#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ + do { \ + if (trace_mmap_lock_##type##_enabled()) { \ + char buf[MEMCG_PATH_BUF_SIZE]; \ + get_mm_memcg_path(mm, buf, sizeof(buf)); \ + trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \ + } \ } while (0) #else /* !CONFIG_MEMCG */ -int trace_mmap_lock_reg(void) -{ - return 0; -} - -void trace_mmap_lock_unreg(void) -{ -} - #define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ trace_mmap_lock_##type(mm, "", ##__VA_ARGS__) @@ -65,16 +45,13 @@ void trace_mmap_lock_unreg(void) #ifdef CONFIG_MEMCG /* * Write the given mm_struct's memcg path to a buffer. If the path cannot be - * determined or the trace event is being unregistered, empty string is written. + * determined, empty string is written. */ static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen) { struct mem_cgroup *memcg; buf[0] = '\0'; - /* No need to get path if no trace event is registered. */ - if (!atomic_read(®_refcount)) - return; memcg = get_mem_cgroup_from_mm(mm); if (memcg == NULL) return; -- cgit v1.2.3 From 6975c1a486a40446b5bc77a89d9c520f8296fd08 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Nov 2024 18:00:39 +0100 Subject: block: remove the ioprio field from struct request The request ioprio is only initialized from the first attached bio, so requests without a bio already never set it. Directly use the bio field instead. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20241112170050.1612998-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-merge.c | 10 ++++------ block/blk-mq.c | 3 +-- include/linux/blk-mq.h | 7 +++---- include/trace/events/block.h | 6 +++--- 4 files changed, 11 insertions(+), 15 deletions(-) (limited to 'include/trace') diff --git a/block/blk-merge.c b/block/blk-merge.c index 2306014c108d..df36f83f3738 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -871,11 +871,10 @@ static struct request *attempt_merge(struct request_queue *q, /* Don't merge requests with different write hints. */ if (req->bio->bi_write_hint != next->bio->bi_write_hint) return NULL; + if (req->bio->bi_ioprio != next->bio->bi_ioprio) + return NULL; } - if (req->ioprio != next->ioprio) - return NULL; - if (!blk_atomic_write_mergeable_rqs(req, next)) return NULL; @@ -1007,11 +1006,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) /* Don't merge requests with different write hints. */ if (rq->bio->bi_write_hint != bio->bi_write_hint) return false; + if (rq->bio->bi_ioprio != bio->bi_ioprio) + return false; } - if (rq->ioprio != bio_prio(bio)) - return false; - if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false) return false; diff --git a/block/blk-mq.c b/block/blk-mq.c index 65e6b86d341c..3c6cadba75e3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -842,7 +842,7 @@ static void blk_print_req_error(struct request *req, blk_status_t status) blk_op_str(req_op(req)), (__force u32)(req->cmd_flags & ~REQ_OP_MASK), req->nr_phys_segments, - IOPRIO_PRIO_CLASS(req->ioprio)); + IOPRIO_PRIO_CLASS(req_get_ioprio(req))); } /* @@ -3306,7 +3306,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, rq->special_vec = rq_src->special_vec; } rq->nr_phys_segments = rq_src->nr_phys_segments; - rq->ioprio = rq_src->ioprio; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2804fe181d9d..a28264442948 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -156,8 +156,6 @@ struct request { struct blk_crypto_keyslot *crypt_keyslot; #endif - unsigned short ioprio; - enum mq_rq_state state; atomic_t ref; @@ -221,7 +219,9 @@ static inline bool blk_rq_is_passthrough(struct request *rq) static inline unsigned short req_get_ioprio(struct request *req) { - return req->ioprio; + if (req->bio) + return req->bio->bi_ioprio; + return 0; } #define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) @@ -984,7 +984,6 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, rq->nr_phys_segments = nr_segs; rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; - rq->ioprio = bio_prio(bio); } void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 1527d5d45e01..bd0ea07338eb 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -99,7 +99,7 @@ TRACE_EVENT(block_rq_requeue, __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); - __entry->ioprio = rq->ioprio; + __entry->ioprio = req_get_ioprio(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; @@ -136,7 +136,7 @@ DECLARE_EVENT_CLASS(block_rq_completion, __entry->sector = blk_rq_pos(rq); __entry->nr_sector = nr_bytes >> 9; __entry->error = blk_status_to_errno(error); - __entry->ioprio = rq->ioprio; + __entry->ioprio = req_get_ioprio(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; @@ -209,7 +209,7 @@ DECLARE_EVENT_CLASS(block_rq, __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->bytes = blk_rq_bytes(rq); - __entry->ioprio = rq->ioprio; + __entry->ioprio = req_get_ioprio(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; -- cgit v1.2.3 From 948ccbd95016f50ce01df5eef9440eede3b8c713 Mon Sep 17 00:00:00 2001 From: Xianglai Li Date: Wed, 13 Nov 2024 16:18:26 +0800 Subject: LoongArch: KVM: Add iocsr and mmio bus simulation in kernel Add iocsr and mmio memory read and write simulation to the kernel. When the VM accesses the device address space through iocsr instructions or mmio, it does not need to return to the qemu user mode but can directly completes the access in the kernel mode. Signed-off-by: Tianrui Zhao Signed-off-by: Xianglai Li Signed-off-by: Huacai Chen --- arch/loongarch/kvm/exit.c | 82 ++++++++++++++++++++++++++++++++-------------- include/linux/kvm_host.h | 1 + include/trace/events/kvm.h | 35 ++++++++++++++++++++ 3 files changed, 94 insertions(+), 24 deletions(-) (limited to 'include/trace') diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index 90894f70ff4a..69f3e3782cc9 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -157,7 +157,7 @@ static int kvm_handle_csr(struct kvm_vcpu *vcpu, larch_inst inst) int kvm_emu_iocsr(larch_inst inst, struct kvm_run *run, struct kvm_vcpu *vcpu) { int ret; - unsigned long val; + unsigned long *val; u32 addr, rd, rj, opcode; /* @@ -170,6 +170,7 @@ int kvm_emu_iocsr(larch_inst inst, struct kvm_run *run, struct kvm_vcpu *vcpu) ret = EMULATE_DO_IOCSR; run->iocsr_io.phys_addr = addr; run->iocsr_io.is_write = 0; + val = &vcpu->arch.gprs[rd]; /* LoongArch is Little endian */ switch (opcode) { @@ -202,16 +203,25 @@ int kvm_emu_iocsr(larch_inst inst, struct kvm_run *run, struct kvm_vcpu *vcpu) run->iocsr_io.is_write = 1; break; default: - ret = EMULATE_FAIL; - break; + return EMULATE_FAIL; } - if (ret == EMULATE_DO_IOCSR) { - if (run->iocsr_io.is_write) { - val = vcpu->arch.gprs[rd]; - memcpy(run->iocsr_io.data, &val, run->iocsr_io.len); - } - vcpu->arch.io_gpr = rd; + if (run->iocsr_io.is_write) { + if (!kvm_io_bus_write(vcpu, KVM_IOCSR_BUS, addr, run->iocsr_io.len, val)) + ret = EMULATE_DONE; + else + /* Save data and let user space to write it */ + memcpy(run->iocsr_io.data, val, run->iocsr_io.len); + + trace_kvm_iocsr(KVM_TRACE_IOCSR_WRITE, run->iocsr_io.len, addr, val); + } else { + if (!kvm_io_bus_read(vcpu, KVM_IOCSR_BUS, addr, run->iocsr_io.len, val)) + ret = EMULATE_DONE; + else + /* Save register id for iocsr read completion */ + vcpu->arch.io_gpr = rd; + + trace_kvm_iocsr(KVM_TRACE_IOCSR_READ, run->iocsr_io.len, addr, NULL); } return ret; @@ -447,19 +457,33 @@ int kvm_emu_mmio_read(struct kvm_vcpu *vcpu, larch_inst inst) } if (ret == EMULATE_DO_MMIO) { + trace_kvm_mmio(KVM_TRACE_MMIO_READ, run->mmio.len, run->mmio.phys_addr, NULL); + + /* + * If mmio device such as PCH-PIC is emulated in KVM, + * it need not return to user space to handle the mmio + * exception. + */ + ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, vcpu->arch.badv, + run->mmio.len, &vcpu->arch.gprs[rd]); + if (!ret) { + update_pc(&vcpu->arch); + vcpu->mmio_needed = 0; + return EMULATE_DONE; + } + /* Set for kvm_complete_mmio_read() use */ vcpu->arch.io_gpr = rd; run->mmio.is_write = 0; vcpu->mmio_is_write = 0; - trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, run->mmio.len, - run->mmio.phys_addr, NULL); - } else { - kvm_err("Read not supported Inst=0x%08x @%lx BadVaddr:%#lx\n", - inst.word, vcpu->arch.pc, vcpu->arch.badv); - kvm_arch_vcpu_dump_regs(vcpu); - vcpu->mmio_needed = 0; + return EMULATE_DO_MMIO; } + kvm_err("Read not supported Inst=0x%08x @%lx BadVaddr:%#lx\n", + inst.word, vcpu->arch.pc, vcpu->arch.badv); + kvm_arch_vcpu_dump_regs(vcpu); + vcpu->mmio_needed = 0; + return ret; } @@ -600,19 +624,29 @@ int kvm_emu_mmio_write(struct kvm_vcpu *vcpu, larch_inst inst) } if (ret == EMULATE_DO_MMIO) { + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, run->mmio.len, run->mmio.phys_addr, data); + + /* + * If mmio device such as PCH-PIC is emulated in KVM, + * it need not return to user space to handle the mmio + * exception. + */ + ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, vcpu->arch.badv, run->mmio.len, data); + if (!ret) + return EMULATE_DONE; + run->mmio.is_write = 1; vcpu->mmio_needed = 1; vcpu->mmio_is_write = 1; - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, run->mmio.len, - run->mmio.phys_addr, data); - } else { - vcpu->arch.pc = curr_pc; - kvm_err("Write not supported Inst=0x%08x @%lx BadVaddr:%#lx\n", - inst.word, vcpu->arch.pc, vcpu->arch.badv); - kvm_arch_vcpu_dump_regs(vcpu); - /* Rollback PC if emulation was unsuccessful */ + return EMULATE_DO_MMIO; } + vcpu->arch.pc = curr_pc; + kvm_err("Write not supported Inst=0x%08x @%lx BadVaddr:%#lx\n", + inst.word, vcpu->arch.pc, vcpu->arch.badv); + kvm_arch_vcpu_dump_regs(vcpu); + /* Rollback PC if emulation was unsuccessful */ + return ret; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 45be36e5285f..164d9725cb08 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -219,6 +219,7 @@ enum kvm_bus { KVM_PIO_BUS, KVM_VIRTIO_CCW_NOTIFY_BUS, KVM_FAST_MMIO_BUS, + KVM_IOCSR_BUS, KVM_NR_BUSES }; diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 74e40d5d4af4..fc7d0f8ff078 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -236,6 +236,41 @@ TRACE_EVENT(kvm_mmio, __entry->len, __entry->gpa, __entry->val) ); +#define KVM_TRACE_IOCSR_READ_UNSATISFIED 0 +#define KVM_TRACE_IOCSR_READ 1 +#define KVM_TRACE_IOCSR_WRITE 2 + +#define kvm_trace_symbol_iocsr \ + { KVM_TRACE_IOCSR_READ_UNSATISFIED, "unsatisfied-read" }, \ + { KVM_TRACE_IOCSR_READ, "read" }, \ + { KVM_TRACE_IOCSR_WRITE, "write" } + +TRACE_EVENT(kvm_iocsr, + TP_PROTO(int type, int len, u64 gpa, void *val), + TP_ARGS(type, len, gpa, val), + + TP_STRUCT__entry( + __field( u32, type ) + __field( u32, len ) + __field( u64, gpa ) + __field( u64, val ) + ), + + TP_fast_assign( + __entry->type = type; + __entry->len = len; + __entry->gpa = gpa; + __entry->val = 0; + if (val) + memcpy(&__entry->val, val, + min_t(u32, sizeof(__entry->val), len)); + ), + + TP_printk("iocsr %s len %u gpa 0x%llx val 0x%llx", + __print_symbolic(__entry->type, kvm_trace_symbol_iocsr), + __entry->len, __entry->gpa, __entry->val) +); + #define kvm_fpu_load_symbol \ {0, "unload"}, \ {1, "load"} -- cgit v1.2.3