From 26662d7347a058ca497792c4b22ac91cc415cbf6 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 20 Apr 2023 00:14:12 -0700 Subject: bpf: Add bpf_dynptr_size bpf_dynptr_size returns the number of usable bytes in a dynptr. Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20230420071414.570108-4-joannelkoong@gmail.com --- kernel/trace/bpf_trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index bcf91bc7bf71..8deb22a99abe 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1349,9 +1349,9 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, } return verify_pkcs7_signature(data_ptr->data, - bpf_dynptr_get_size(data_ptr), + __bpf_dynptr_size(data_ptr), sig_ptr->data, - bpf_dynptr_get_size(sig_ptr), + __bpf_dynptr_size(sig_ptr), trusted_keyring->key, VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); -- cgit v1.2.3 From ee7751b564a90f337330efc1221df40647d68756 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Fri, 5 May 2023 13:58:55 -0700 Subject: tracing/user_events: Use long vs int for atomic bit ops Each event stores a int to track which bit to set/clear when enablement changes. On big endian 64-bit configurations, it's possible this could cause memory corruption when it's used for atomic bit operations. Use unsigned long for enablement values to ensure any possible corruption cannot occur. Downcast to int after mask for the bit target. Link: https://lore.kernel.org/all/6f758683-4e5e-41c3-9b05-9efc703e827c@kili.mountain/ Link: https://lore.kernel.org/linux-trace-kernel/20230505205855.6407-1-beaub@linux.microsoft.com Fixes: dcb8177c1395 ("tracing/user_events: Add ioctl for disabling addresses") Reported-by: Dan Carpenter Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index b1ecd7677642..e37c7f168c44 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -101,7 +101,7 @@ struct user_event_enabler { unsigned long addr; /* Track enable bit, flags, etc. Aligned for bitops. 
*/ - unsigned int values; + unsigned long values; }; /* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */ @@ -116,7 +116,9 @@ struct user_event_enabler { /* Only duplicate the bit value */ #define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK -#define ENABLE_BITOPS(e) ((unsigned long *)&(e)->values) +#define ENABLE_BITOPS(e) (&(e)->values) + +#define ENABLE_BIT(e) ((int)((e)->values & ENABLE_VAL_BIT_MASK)) /* Used for asynchronous faulting in of pages */ struct user_event_enabler_fault { @@ -423,9 +425,9 @@ static int user_event_enabler_write(struct user_event_mm *mm, /* Update bit atomically, user tracers must be atomic as well */ if (enabler->event && enabler->event->status) - set_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr); + set_bit(ENABLE_BIT(enabler), ptr); else - clear_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr); + clear_bit(ENABLE_BIT(enabler), ptr); kunmap_local(kaddr); unpin_user_pages_dirty_lock(&page, 1, true); @@ -440,8 +442,7 @@ static bool user_event_enabler_exists(struct user_event_mm *mm, struct user_event_enabler *next; list_for_each_entry_safe(enabler, next, &mm->enablers, link) { - if (enabler->addr == uaddr && - (enabler->values & ENABLE_VAL_BIT_MASK) == bit) + if (enabler->addr == uaddr && ENABLE_BIT(enabler) == bit) return true; } @@ -2272,7 +2273,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) list_for_each_entry_safe(enabler, next, &mm->enablers, link) if (enabler->addr == reg.disable_addr && - (enabler->values & ENABLE_VAL_BIT_MASK) == reg.disable_bit) { + ENABLE_BIT(enabler) == reg.disable_bit) { set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)); if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler))) -- cgit v1.2.3 From 632478a05821bc1c9b55c3a1dd0fb1be7bfa1acc Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Thu, 11 May 2023 18:32:01 +0200 Subject: tracing/timerlat: Always wakeup the timerlat thread While testing rtla timerlat auto analysis, I reach a condition where the interface was not receiving tracing data. I was able to manually reproduce the problem with these steps: # echo 0 > tracing_on # disable trace # echo 1 > osnoise/stop_tracing_us # stop trace if timerlat irq > 1 us # echo timerlat > current_tracer # enable timerlat tracer # sleep 1 # wait... that is the time when rtla # apply configs like prio or cgroup # echo 1 > tracing_on # start tracing # cat trace # tracer: timerlat # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / _-=> migrate-disable # |||| / delay # ||||| ACTIVATION # TASK-PID CPU# ||||| TIMESTAMP ID CONTEXT LATENCY # | | | ||||| | | | | NOTHING! Then, trying to enable tracing again with echo 1 > tracing_on resulted in no change: the trace was still not tracing. This problem happens because the timerlat IRQ hits the stop tracing condition while tracing is off, and do not wake up the timerlat thread, so the timerlat threads are kept sleeping forever, resulting in no trace, even after re-enabling the tracer. Avoid this condition by always waking up the threads, even after stopping tracing, allowing the tracer to return to its normal operating after a new tracing on. 
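For readers unfamiliar with the kthread sleep/wake pattern this fix depends on, here is a minimal, hypothetical sketch (not the actual timerlat code; names are illustrative): a thread parked like this never runs again unless something calls wake_up_process() on it, which is exactly the call the IRQ handler was skipping once tracing had been stopped.

	#include <linux/kthread.h>
	#include <linux/sched.h>

	/* Hypothetical sleeper, mirroring how the timerlat thread waits:
	 * if the IRQ side stops tracing without calling wake_up_process(),
	 * this loop stays blocked in schedule() forever. */
	static int sleeper_thread(void *data)
	{
		while (!kthread_should_stop()) {
			set_current_state(TASK_INTERRUPTIBLE);
			/* arm the timer / wait for the next activation */
			schedule();
			__set_current_state(TASK_RUNNING);
			/* process the wakeup (take the thread sample) */
		}
		return 0;
	}

With the one-line wake_up_process(tlat->kthread) added in the diff below, the stop-tracing path always unparks the thread, so a later "echo 1 > tracing_on" resumes normal operation.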
Link: https://lore.kernel.org/linux-trace-kernel/1ed8f830638b20a39d535d27d908e319a9a3c4e2.1683822622.git.bristot@kernel.org Cc: Juri Lelli Cc: stable@vger.kernel.org Fixes: a955d7eac177 ("trace: Add timerlat tracer") Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index efbbec2caff8..e97e3fa5cbed 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1652,6 +1652,8 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) osnoise_stop_tracing(); notify_new_max_latency(diff); + wake_up_process(tlat->kthread); + return HRTIMER_NORESTART; } } -- cgit v1.2.3 From 3e0fea09b17fa2255f6cb0108bbffd4a505f8925 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 19 May 2023 16:07:38 -0700 Subject: tracing/user_events: Split up mm alloc and attach When a new mm is being created in a fork() path it currently is allocated and then attached in one go. This leaves the mm exposed out to the tracing register callbacks while any parent enabler locations are copied in. This should not happen. Split up mm alloc and attach as unique operations. When duplicating enablers, first alloc, then duplicate, and only upon success, attach. This prevents any timing window outside of the event_reg mutex for enablement walking. This allows for dropping RCU requirement for enablement walking in later patches. Link: https://lkml.kernel.org/r/20230519230741.669-2-beaub@linux.microsoft.com Link: https://lore.kernel.org/linux-trace-kernel/CAHk-=whTBvXJuoi_kACo3qi5WZUmRrhyA-_=rRFsycTytmB6qw@mail.gmail.com/ Signed-off-by: Linus Torvalds [ change log written by Beau Belgrave ] Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index e37c7f168c44..599aab46a94b 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -539,10 +539,9 @@ static struct user_event_mm *user_event_mm_get_all(struct user_event *user) return found; } -static struct user_event_mm *user_event_mm_create(struct task_struct *t) +static struct user_event_mm *user_event_mm_alloc(struct task_struct *t) { struct user_event_mm *user_mm; - unsigned long flags; user_mm = kzalloc(sizeof(*user_mm), GFP_KERNEL_ACCOUNT); @@ -554,12 +553,6 @@ static struct user_event_mm *user_event_mm_create(struct task_struct *t) refcount_set(&user_mm->refcnt, 1); refcount_set(&user_mm->tasks, 1); - spin_lock_irqsave(&user_event_mms_lock, flags); - list_add_rcu(&user_mm->link, &user_event_mms); - spin_unlock_irqrestore(&user_event_mms_lock, flags); - - t->user_event_mm = user_mm; - /* * The lifetime of the memory descriptor can slightly outlast * the task lifetime if a ref to the user_event_mm is taken @@ -573,6 +566,17 @@ static struct user_event_mm *user_event_mm_create(struct task_struct *t) return user_mm; } +static void user_event_mm_attach(struct user_event_mm *user_mm, struct task_struct *t) +{ + unsigned long flags; + + spin_lock_irqsave(&user_event_mms_lock, flags); + list_add_rcu(&user_mm->link, &user_event_mms); + spin_unlock_irqrestore(&user_event_mms_lock, flags); + + t->user_event_mm = user_mm; +} + static struct user_event_mm *current_user_event_mm(void) { struct 
user_event_mm *user_mm = current->user_event_mm; @@ -580,10 +584,12 @@ static struct user_event_mm *current_user_event_mm(void) if (user_mm) goto inc; - user_mm = user_event_mm_create(current); + user_mm = user_event_mm_alloc(current); if (!user_mm) goto error; + + user_event_mm_attach(user_mm, current); inc: refcount_inc(&user_mm->refcnt); error: @@ -671,7 +677,7 @@ void user_event_mm_remove(struct task_struct *t) void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) { - struct user_event_mm *mm = user_event_mm_create(t); + struct user_event_mm *mm = user_event_mm_alloc(t); struct user_event_enabler *enabler; if (!mm) @@ -685,10 +691,11 @@ void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) rcu_read_unlock(); + user_event_mm_attach(mm, t); return; error: rcu_read_unlock(); - user_event_mm_remove(t); + user_event_mm_destroy(mm); } static bool current_user_event_enabler_exists(unsigned long uaddr, -- cgit v1.2.3 From aaecdaf922835ed9a8ce56cdd9a8d40fe630257a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 19 May 2023 16:07:39 -0700 Subject: tracing/user_events: Remove RCU lock while pinning pages pin_user_pages_remote() can reschedule which means we cannot hold any RCU lock while using it. Now that enablers are not exposed out to the tracing register callbacks during fork(), there is clearly no need to require the RCU lock as event_mutex is enough to protect changes. Remove unneeded RCU usages when pinning pages and walking enablers with event_mutex held. Cleanup a misleading "safe" list walk that is not needed. During fork() duplication, remove unneeded RCU list add, since the list is not exposed yet. Link: https://lkml.kernel.org/r/20230519230741.669-3-beaub@linux.microsoft.com Link: https://lore.kernel.org/linux-trace-kernel/CAHk-=wiiBfT4zNS29jA0XEsy8EmbqTH1hAPdRJCDAJMD8Gxt5A@mail.gmail.com/ Fixes: 7235759084a4 ("tracing/user_events: Use remote writes for event enablement") Signed-off-by: Linus Torvalds [ change log written by Beau Belgrave ] Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 599aab46a94b..d34a59630e70 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -439,9 +439,8 @@ static bool user_event_enabler_exists(struct user_event_mm *mm, unsigned long uaddr, unsigned char bit) { struct user_event_enabler *enabler; - struct user_event_enabler *next; - list_for_each_entry_safe(enabler, next, &mm->enablers, link) { + list_for_each_entry(enabler, &mm->enablers, link) { if (enabler->addr == uaddr && ENABLE_BIT(enabler) == bit) return true; } @@ -456,19 +455,19 @@ static void user_event_enabler_update(struct user_event *user) struct user_event_mm *next; int attempt; + lockdep_assert_held(&event_mutex); + while (mm) { next = mm->next; mmap_read_lock(mm->mm); - rcu_read_lock(); - list_for_each_entry_rcu(enabler, &mm->enablers, link) { + list_for_each_entry(enabler, &mm->enablers, link) { if (enabler->event == user) { attempt = 0; user_event_enabler_write(mm, enabler, true, &attempt); } } - rcu_read_unlock(); mmap_read_unlock(mm->mm); user_event_mm_put(mm); mm = next; @@ -496,7 +495,9 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig, enabler->values = orig->values & ENABLE_VAL_DUP_MASK; refcount_inc(&enabler->event->refcnt); - 
list_add_rcu(&enabler->link, &mm->enablers); + + /* Enablers not exposed yet, RCU not required */ + list_add(&enabler->link, &mm->enablers); return true; } -- cgit v1.2.3 From dcbd1ac2668b5fa02069ea96d581ca3f70a7543c Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Fri, 19 May 2023 16:07:40 -0700 Subject: tracing/user_events: Rename link fields for clarity Currently most list_head fields of various structs within user_events are simply named link. This causes folks to keep additional context in their head when working with the code, which can be confusing. Instead of using link, describe what the actual link is, for example: list_del_rcu(&mm->link); Changes into: list_del_rcu(&mm->mms_link); The reader now is given a hint the link is to the mms global list instead of having to remember or spot check within the code. Link: https://lkml.kernel.org/r/20230519230741.669-4-beaub@linux.microsoft.com Link: https://lore.kernel.org/linux-trace-kernel/CAHk-=wicngggxVpbnrYHjRTwGE0WYscPRM+L2HO2BF8ia1EXgQ@mail.gmail.com/ Suggested-by: Linus Torvalds Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/linux/user_events.h | 2 +- kernel/trace/trace_events_user.c | 40 ++++++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 19 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 2847f5a18a86..17d452b389de 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -17,7 +17,7 @@ #ifdef CONFIG_USER_EVENTS struct user_event_mm { - struct list_head link; + struct list_head mms_link; struct list_head enablers; struct mm_struct *mm; struct user_event_mm *next; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index d34a59630e70..238c7a0615fa 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -96,7 +96,7 @@ struct user_event { * these to track enablement sites that are tied to an event. 
*/ struct user_event_enabler { - struct list_head link; + struct list_head mm_enablers_link; struct user_event *event; unsigned long addr; @@ -155,7 +155,7 @@ struct user_event_file_info { #define VALIDATOR_REL (1 << 1) struct user_event_validator { - struct list_head link; + struct list_head user_event_link; int offset; int flags; }; @@ -261,7 +261,7 @@ error: static void user_event_enabler_destroy(struct user_event_enabler *enabler) { - list_del_rcu(&enabler->link); + list_del_rcu(&enabler->mm_enablers_link); /* No longer tracking the event via the enabler */ refcount_dec(&enabler->event->refcnt); @@ -440,7 +440,7 @@ static bool user_event_enabler_exists(struct user_event_mm *mm, { struct user_event_enabler *enabler; - list_for_each_entry(enabler, &mm->enablers, link) { + list_for_each_entry(enabler, &mm->enablers, mm_enablers_link) { if (enabler->addr == uaddr && ENABLE_BIT(enabler) == bit) return true; } @@ -461,7 +461,7 @@ static void user_event_enabler_update(struct user_event *user) next = mm->next; mmap_read_lock(mm->mm); - list_for_each_entry(enabler, &mm->enablers, link) { + list_for_each_entry(enabler, &mm->enablers, mm_enablers_link) { if (enabler->event == user) { attempt = 0; user_event_enabler_write(mm, enabler, true, &attempt); @@ -497,7 +497,7 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig, refcount_inc(&enabler->event->refcnt); /* Enablers not exposed yet, RCU not required */ - list_add(&enabler->link, &mm->enablers); + list_add(&enabler->mm_enablers_link, &mm->enablers); return true; } @@ -527,13 +527,15 @@ static struct user_event_mm *user_event_mm_get_all(struct user_event *user) */ rcu_read_lock(); - list_for_each_entry_rcu(mm, &user_event_mms, link) - list_for_each_entry_rcu(enabler, &mm->enablers, link) + list_for_each_entry_rcu(mm, &user_event_mms, mms_link) { + list_for_each_entry_rcu(enabler, &mm->enablers, mm_enablers_link) { if (enabler->event == user) { mm->next = found; found = user_event_mm_get(mm); break; } + } + } rcu_read_unlock(); @@ -572,7 +574,7 @@ static void user_event_mm_attach(struct user_event_mm *user_mm, struct task_stru unsigned long flags; spin_lock_irqsave(&user_event_mms_lock, flags); - list_add_rcu(&user_mm->link, &user_event_mms); + list_add_rcu(&user_mm->mms_link, &user_event_mms); spin_unlock_irqrestore(&user_event_mms_lock, flags); t->user_event_mm = user_mm; @@ -601,7 +603,7 @@ static void user_event_mm_destroy(struct user_event_mm *mm) { struct user_event_enabler *enabler, *next; - list_for_each_entry_safe(enabler, next, &mm->enablers, link) + list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) user_event_enabler_destroy(enabler); mmdrop(mm->mm); @@ -638,7 +640,7 @@ void user_event_mm_remove(struct task_struct *t) /* Remove the mm from the list, so it can no longer be enabled */ spin_lock_irqsave(&user_event_mms_lock, flags); - list_del_rcu(&mm->link); + list_del_rcu(&mm->mms_link); spin_unlock_irqrestore(&user_event_mms_lock, flags); /* @@ -686,9 +688,10 @@ void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) rcu_read_lock(); - list_for_each_entry_rcu(enabler, &old_mm->enablers, link) + list_for_each_entry_rcu(enabler, &old_mm->enablers, mm_enablers_link) { if (!user_event_enabler_dup(enabler, mm)) goto error; + } rcu_read_unlock(); @@ -757,7 +760,7 @@ retry: */ if (!*write_result) { refcount_inc(&enabler->event->refcnt); - list_add_rcu(&enabler->link, &user_mm->enablers); + list_add_rcu(&enabler->mm_enablers_link, &user_mm->enablers); } mutex_unlock(&event_mutex); @@ 
-913,8 +916,8 @@ static void user_event_destroy_validators(struct user_event *user) struct user_event_validator *validator, *next; struct list_head *head = &user->validators; - list_for_each_entry_safe(validator, next, head, link) { - list_del(&validator->link); + list_for_each_entry_safe(validator, next, head, user_event_link) { + list_del(&validator->user_event_link); kfree(validator); } } @@ -968,7 +971,7 @@ add_validator: validator->offset = offset; /* Want sequential access when validating */ - list_add_tail(&validator->link, &user->validators); + list_add_tail(&validator->user_event_link, &user->validators); add_field: field->type = type; @@ -1358,7 +1361,7 @@ static int user_event_validate(struct user_event *user, void *data, int len) void *pos, *end = data + len; u32 loc, offset, size; - list_for_each_entry(validator, head, link) { + list_for_each_entry(validator, head, user_event_link) { pos = data + validator->offset; /* Already done min_size check, no bounds check here */ @@ -2279,7 +2282,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) */ mutex_lock(&event_mutex); - list_for_each_entry_safe(enabler, next, &mm->enablers, link) + list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) { if (enabler->addr == reg.disable_addr && ENABLE_BIT(enabler) == reg.disable_bit) { set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)); @@ -2290,6 +2293,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) /* Removed at least one */ ret = 0; } + } mutex_unlock(&event_mutex); -- cgit v1.2.3 From ff9e1632d69e596d8ca256deb07433a8f3565038 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Fri, 19 May 2023 16:07:41 -0700 Subject: tracing/user_events: Document user_event_mm one-shot list usage During 6.4 development it became clear that the one-shot list used by the user_event_mm's next field was confusing to others. It is not clear how this list is protected or what the next field usage is for unless you are familiar with the code. Add comments into the user_event_mm struct indicating lock requirement and usage. Also document how and why this approach was used via comments in both user_event_enabler_update() and user_event_mm_get_all() and the rules to properly use it. 
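Condensed into code, the usage rules being documented look roughly like this (a sketch distilled from user_event_enabler_update() as shown in the diff below; declarations and error handling are omitted):

	/* Must hold event_mutex for the whole walk: the one-shot list
	 * built through mm->next is only valid under it. */
	lockdep_assert_held(&event_mutex);

	/* Takes a reference on every mm it links, so nothing is freed
	 * out from under us while we may sleep below. */
	mm = user_event_mm_get_all(user);

	while (mm) {
		next = mm->next;          /* one-shot link, read it first */

		mmap_read_lock(mm->mm);
		/* walk mm->enablers and update the user-space bits ... */
		mmap_read_unlock(mm->mm);

		user_event_mm_put(mm);    /* drop the reference; may wait */
		mm = next;
	}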
Link: https://lkml.kernel.org/r/20230519230741.669-5-beaub@linux.microsoft.com Link: https://lore.kernel.org/linux-trace-kernel/CAHk-=wicngggxVpbnrYHjRTwGE0WYscPRM+L2HO2BF8ia1EXgQ@mail.gmail.com/ Suggested-by: Linus Torvalds Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/linux/user_events.h | 1 + kernel/trace/trace_events_user.c | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 17d452b389de..8afa8c3a0973 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -20,6 +20,7 @@ struct user_event_mm { struct list_head mms_link; struct list_head enablers; struct mm_struct *mm; + /* Used for one-shot lists, protected by event_mutex */ struct user_event_mm *next; refcount_t refcnt; refcount_t tasks; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 238c7a0615fa..dbb14705d0d3 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -451,12 +451,25 @@ static bool user_event_enabler_exists(struct user_event_mm *mm, static void user_event_enabler_update(struct user_event *user) { struct user_event_enabler *enabler; - struct user_event_mm *mm = user_event_mm_get_all(user); struct user_event_mm *next; + struct user_event_mm *mm; int attempt; lockdep_assert_held(&event_mutex); + /* + * We need to build a one-shot list of all the mms that have an + * enabler for the user_event passed in. This list is only valid + * while holding the event_mutex. The only reason for this is due + * to the global mm list being RCU protected and we use methods + * which can wait (mmap_read_lock and pin_user_pages_remote). + * + * NOTE: user_event_mm_get_all() increments the ref count of each + * mm that is added to the list to prevent removal timing windows. + * We must always put each mm after they are used, which may wait. + */ + mm = user_event_mm_get_all(user); + while (mm) { next = mm->next; mmap_read_lock(mm->mm); @@ -515,6 +528,14 @@ static struct user_event_mm *user_event_mm_get_all(struct user_event *user) struct user_event_enabler *enabler; struct user_event_mm *mm; + /* + * We use the mm->next field to build a one-shot list from the global + * RCU protected list. To build this list the event_mutex must be held. + * This lets us build a list without requiring allocs that could fail + * when user based events are most wanted for diagnostics. + */ + lockdep_assert_held(&event_mutex); + /* * We do not want to block fork/exec while enablements are being * updated, so we use RCU to walk the current tasks that have used -- cgit v1.2.3 From e30fbc618e97b38dbb49f1d44dcd0778d3f23b8c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 23 May 2023 22:11:08 -0400 Subject: tracing/histograms: Allow variables to have some modifiers Modifiers are used to change the behavior of keys. For instance, they can grouped into buckets, converted to syscall names (from the syscall identifier), show task->comm of the current pid, be an array of longs that represent a stacktrace, and more. It was found that nothing stopped a value from taking a modifier. As values are simple counters. If this happened, it would call code that was not expecting a modifier and crash the kernel. This was fixed by having the ___create_val_field() function test if a modifier was present and fail if one was. This fixed the crash. Now there's a problem with variables. 
Variables are used to pass fields from one event to another. Variables are allowed to have some modifiers, as the processing may need to happen at the time of the event (like stacktraces and comm names of the current pid). The issue is that it too uses __create_val_field(). Now that fails on modifiers, variables can no longer use them (this is a regression). As not all modifiers are for variables, have them use a separate check. Link: https://lore.kernel.org/linux-trace-kernel/20230523221108.064a5d82@rorschach.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Tom Zanussi Cc: Mark Rutland Fixes: e0213434fe3e4 ("tracing: Do not let histogram values have some modifiers") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 486cca3c2b75..543cb7dc84ad 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4238,13 +4238,19 @@ static int __create_val_field(struct hist_trigger_data *hist_data, goto out; } - /* Some types cannot be a value */ - if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | - HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2 | - HIST_FIELD_FL_SYM | HIST_FIELD_FL_SYM_OFFSET | - HIST_FIELD_FL_SYSCALL | HIST_FIELD_FL_STACKTRACE)) { - hist_err(file->tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(field_str)); - ret = -EINVAL; + /* values and variables should not have some modifiers */ + if (hist_field->flags & HIST_FIELD_FL_VAR) { + /* Variable */ + if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | + HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2)) + goto err; + } else { + /* Value */ + if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | + HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2 | + HIST_FIELD_FL_SYM | HIST_FIELD_FL_SYM_OFFSET | + HIST_FIELD_FL_SYSCALL | HIST_FIELD_FL_STACKTRACE)) + goto err; } hist_data->fields[val_idx] = hist_field; @@ -4256,6 +4262,9 @@ static int __create_val_field(struct hist_trigger_data *hist_data, ret = -EINVAL; out: return ret; + err: + hist_err(file->tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(field_str)); + return -EINVAL; } static int create_val_field(struct hist_trigger_data *hist_data, -- cgit v1.2.3 From 4b512860bdbdddcf41467ebd394f27cb8dfb528c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 23 May 2023 23:09:13 -0400 Subject: tracing: Rename stacktrace field to common_stacktrace The histogram and synthetic events can use a pseudo event called "stacktrace" that will create a stacktrace at the time of the event and use it just like it was a normal field. We have other pseudo events such as "common_cpu" and "common_timestamp". To stay consistent with that, convert "stacktrace" to "common_stacktrace". As this was used in older kernels, to keep backward compatibility, this will act just like "common_cpu" did with "cpu". That is, "cpu" will be the same as "common_cpu" unless the event has a "cpu" field. In which case, the event's field is used. The same is true with "stacktrace". Also update the documentation to reflect this change. 
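The backward-compatibility rule reduces to a small fallback in the field parser. A condensed sketch of the parse_field() logic from the diff below (not the complete function; the surrounding lookup and error reporting are simplified):

	field = trace_find_event_field(file->event_call, field_name);

	if (!field || !field->size) {
		/* The event has no real field by that name, so the legacy
		 * "cpu"/"stacktrace" spellings keep resolving to the common
		 * pseudo fields registered with FILTER_CPU / FILTER_STACKTRACE. */
		if (field && field->filter_type == FILTER_CPU)
			*flags |= HIST_FIELD_FL_CPU;
		else if (field && field->filter_type == FILTER_STACKTRACE)
			*flags |= HIST_FIELD_FL_STACKTRACE;
		else
			return ERR_PTR(-EINVAL);	/* genuinely unknown field */
	}

If the event does define its own "cpu" or "stacktrace" field, the lookup above succeeds with a real field and the fallback never runs, which is the behavior the commit message describes.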
Link: https://lore.kernel.org/linux-trace-kernel/20230523230913.6860e28d@rorschach.local.home Cc: Masami Hiramatsu Cc: Tom Zanussi Cc: Mark Rutland Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/histogram.rst | 64 +++++++++++++++++++-------------------- include/linux/trace_events.h | 1 + kernel/trace/trace.c | 2 +- kernel/trace/trace_events.c | 2 ++ kernel/trace/trace_events_hist.c | 16 ++++++---- 5 files changed, 46 insertions(+), 39 deletions(-) (limited to 'kernel/trace') diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index 479c9eac6335..3c9b263de9c2 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -35,7 +35,7 @@ Documentation written by Tom Zanussi in place of an explicit value field - this is simply a count of event hits. If 'values' isn't specified, an implicit 'hitcount' value will be automatically created and used as the only value. - Keys can be any field, or the special string 'stacktrace', which + Keys can be any field, or the special string 'common_stacktrace', which will use the event's kernel stacktrace as the key. The keywords 'keys' or 'key' can be used to specify keys, and the keywords 'values', 'vals', or 'val' can be used to specify values. Compound @@ -54,7 +54,7 @@ Documentation written by Tom Zanussi 'compatible' if the fields named in the trigger share the same number and type of fields and those fields also have the same names. Note that any two events always share the compatible 'hitcount' and - 'stacktrace' fields and can therefore be combined using those + 'common_stacktrace' fields and can therefore be combined using those fields, however pointless that may be. 'hist' triggers add a 'hist' file to each event's subdirectory. @@ -547,9 +547,9 @@ Extended error information the hist trigger display symbolic call_sites, we can have the hist trigger additionally display the complete set of kernel stack traces that led to each call_site. To do that, we simply use the special - value 'stacktrace' for the key parameter:: + value 'common_stacktrace' for the key parameter:: - # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \ + # echo 'hist:keys=common_stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \ /sys/kernel/tracing/events/kmem/kmalloc/trigger The above trigger will use the kernel stack trace in effect when an @@ -561,9 +561,9 @@ Extended error information every callpath to a kmalloc for a kernel compile):: # cat /sys/kernel/tracing/events/kmem/kmalloc/hist - # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active] + # trigger info: hist:keys=common_stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active] - { stacktrace: + { common_stacktrace: __kmalloc_track_caller+0x10b/0x1a0 kmemdup+0x20/0x50 hidraw_report_event+0x8a/0x120 [hid] @@ -581,7 +581,7 @@ Extended error information cpu_startup_entry+0x315/0x3e0 rest_init+0x7c/0x80 } hitcount: 3 bytes_req: 21 bytes_alloc: 24 - { stacktrace: + { common_stacktrace: __kmalloc_track_caller+0x10b/0x1a0 kmemdup+0x20/0x50 hidraw_report_event+0x8a/0x120 [hid] @@ -596,7 +596,7 @@ Extended error information do_IRQ+0x5a/0xf0 ret_from_intr+0x0/0x30 } hitcount: 3 bytes_req: 21 bytes_alloc: 24 - { stacktrace: + { common_stacktrace: kmem_cache_alloc_trace+0xeb/0x150 aa_alloc_task_context+0x27/0x40 apparmor_cred_prepare+0x1f/0x50 @@ -608,7 +608,7 @@ Extended error information . . . 
- { stacktrace: + { common_stacktrace: __kmalloc+0x11b/0x1b0 i915_gem_execbuffer2+0x6c/0x2c0 [i915] drm_ioctl+0x349/0x670 [drm] @@ -616,7 +616,7 @@ Extended error information SyS_ioctl+0x81/0xa0 system_call_fastpath+0x12/0x6a } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808 - { stacktrace: + { common_stacktrace: __kmalloc+0x11b/0x1b0 load_elf_phdrs+0x76/0xa0 load_elf_binary+0x102/0x1650 @@ -625,7 +625,7 @@ Extended error information SyS_execve+0x3a/0x50 return_from_execve+0x0/0x23 } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048 - { stacktrace: + { common_stacktrace: kmem_cache_alloc_trace+0xeb/0x150 apparmor_file_alloc_security+0x27/0x40 security_file_alloc+0x16/0x20 @@ -636,7 +636,7 @@ Extended error information SyS_open+0x1e/0x20 system_call_fastpath+0x12/0x6a } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376 - { stacktrace: + { common_stacktrace: __kmalloc+0x11b/0x1b0 seq_buf_alloc+0x1b/0x50 seq_read+0x2cc/0x370 @@ -1026,7 +1026,7 @@ Extended error information First we set up an initially paused stacktrace trigger on the netif_receive_skb event:: - # echo 'hist:key=stacktrace:vals=len:pause' > \ + # echo 'hist:key=common_stacktrace:vals=len:pause' > \ /sys/kernel/tracing/events/net/netif_receive_skb/trigger Next, we set up an 'enable_hist' trigger on the sched_process_exec @@ -1060,9 +1060,9 @@ Extended error information $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz # cat /sys/kernel/tracing/events/net/netif_receive_skb/hist - # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused] + # trigger info: hist:keys=common_stacktrace:vals=len:sort=hitcount:size=2048 [paused] - { stacktrace: + { common_stacktrace: __netif_receive_skb_core+0x46d/0x990 __netif_receive_skb+0x18/0x60 netif_receive_skb_internal+0x23/0x90 @@ -1079,7 +1079,7 @@ Extended error information kthread+0xd2/0xf0 ret_from_fork+0x42/0x70 } hitcount: 85 len: 28884 - { stacktrace: + { common_stacktrace: __netif_receive_skb_core+0x46d/0x990 __netif_receive_skb+0x18/0x60 netif_receive_skb_internal+0x23/0x90 @@ -1097,7 +1097,7 @@ Extended error information irq_thread+0x11f/0x150 kthread+0xd2/0xf0 } hitcount: 98 len: 664329 - { stacktrace: + { common_stacktrace: __netif_receive_skb_core+0x46d/0x990 __netif_receive_skb+0x18/0x60 process_backlog+0xa8/0x150 @@ -1115,7 +1115,7 @@ Extended error information inet_sendmsg+0x64/0xa0 sock_sendmsg+0x3d/0x50 } hitcount: 115 len: 13030 - { stacktrace: + { common_stacktrace: __netif_receive_skb_core+0x46d/0x990 __netif_receive_skb+0x18/0x60 netif_receive_skb_internal+0x23/0x90 @@ -1142,14 +1142,14 @@ Extended error information into the histogram. In order to avoid having to set everything up again, we can just clear the histogram first:: - # echo 'hist:key=stacktrace:vals=len:clear' >> \ + # echo 'hist:key=common_stacktrace:vals=len:clear' >> \ /sys/kernel/tracing/events/net/netif_receive_skb/trigger Just to verify that it is in fact cleared, here's what we now see in the hist file:: # cat /sys/kernel/tracing/events/net/netif_receive_skb/hist - # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused] + # trigger info: hist:keys=common_stacktrace:vals=len:sort=hitcount:size=2048 [paused] Totals: Hits: 0 @@ -1485,12 +1485,12 @@ Extended error information And here's an example that shows how to combine histogram data from any two events even if they don't share any 'compatible' fields - other than 'hitcount' and 'stacktrace'. These commands create a + other than 'hitcount' and 'common_stacktrace'. 
These commands create a couple of triggers named 'bar' using those fields:: - # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \ + # echo 'hist:name=bar:key=common_stacktrace:val=hitcount' > \ /sys/kernel/tracing/events/sched/sched_process_fork/trigger - # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \ + # echo 'hist:name=bar:key=common_stacktrace:val=hitcount' > \ /sys/kernel/tracing/events/net/netif_rx/trigger And displaying the output of either shows some interesting if @@ -1501,16 +1501,16 @@ Extended error information # event histogram # - # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active] + # trigger info: hist:name=bar:keys=common_stacktrace:vals=hitcount:sort=hitcount:size=2048 [active] # - { stacktrace: + { common_stacktrace: kernel_clone+0x18e/0x330 kernel_thread+0x29/0x30 kthreadd+0x154/0x1b0 ret_from_fork+0x3f/0x70 } hitcount: 1 - { stacktrace: + { common_stacktrace: netif_rx_internal+0xb2/0xd0 netif_rx_ni+0x20/0x70 dev_loopback_xmit+0xaa/0xd0 @@ -1528,7 +1528,7 @@ Extended error information call_cpuidle+0x3b/0x60 cpu_startup_entry+0x22d/0x310 } hitcount: 1 - { stacktrace: + { common_stacktrace: netif_rx_internal+0xb2/0xd0 netif_rx_ni+0x20/0x70 dev_loopback_xmit+0xaa/0xd0 @@ -1543,7 +1543,7 @@ Extended error information SyS_sendto+0xe/0x10 entry_SYSCALL_64_fastpath+0x12/0x6a } hitcount: 2 - { stacktrace: + { common_stacktrace: netif_rx_internal+0xb2/0xd0 netif_rx+0x1c/0x60 loopback_xmit+0x6c/0xb0 @@ -1561,7 +1561,7 @@ Extended error information sock_sendmsg+0x38/0x50 ___sys_sendmsg+0x14e/0x270 } hitcount: 76 - { stacktrace: + { common_stacktrace: netif_rx_internal+0xb2/0xd0 netif_rx+0x1c/0x60 loopback_xmit+0x6c/0xb0 @@ -1579,7 +1579,7 @@ Extended error information sock_sendmsg+0x38/0x50 ___sys_sendmsg+0x269/0x270 } hitcount: 77 - { stacktrace: + { common_stacktrace: netif_rx_internal+0xb2/0xd0 netif_rx+0x1c/0x60 loopback_xmit+0x6c/0xb0 @@ -1597,7 +1597,7 @@ Extended error information sock_sendmsg+0x38/0x50 SYSC_sendto+0xef/0x170 } hitcount: 88 - { stacktrace: + { common_stacktrace: kernel_clone+0x18e/0x330 SyS_clone+0x19/0x20 entry_SYSCALL_64_fastpath+0x12/0x6a @@ -1949,7 +1949,7 @@ uninterruptible state:: # cd /sys/kernel/tracing # echo 's:block_lat pid_t pid; u64 delta; unsigned long[] stack;' > dynamic_events - # echo 'hist:keys=next_pid:ts=common_timestamp.usecs,st=stacktrace if prev_state == 2' >> events/sched/sched_switch/trigger + # echo 'hist:keys=next_pid:ts=common_timestamp.usecs,st=common_stacktrace if prev_state == 2' >> events/sched/sched_switch/trigger # echo 'hist:keys=prev_pid:delta=common_timestamp.usecs-$ts,s=$st:onmax($delta).trace(block_lat,prev_pid,$delta,$s)' >> events/sched/sched_switch/trigger # echo 1 > events/synthetic/block_lat/enable # cat trace diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 0e373222a6df..7c4a0b72334e 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -806,6 +806,7 @@ enum { FILTER_TRACE_FN, FILTER_COMM, FILTER_CPU, + FILTER_STACKTRACE, }; extern int trace_event_raw_init(struct trace_event_call *call); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ebc59781456a..81801dc31784 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5752,7 +5752,7 @@ static const char readme_msg[] = "\t table using the key(s) and value(s) named, and the value of a\n" "\t sum called 'hitcount' is incremented. Keys and values\n" "\t correspond to fields in the event's format description. 
Keys\n" - "\t can be any field, or the special string 'stacktrace'.\n" + "\t can be any field, or the special string 'common_stacktrace'.\n" "\t Compound keys consisting of up to two fields can be specified\n" "\t by the 'keys' keyword. Values must correspond to numeric\n" "\t fields. Sort keys consisting of up to two fields can be\n" diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 654ffa40457a..57e539d47989 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -194,6 +194,8 @@ static int trace_define_generic_fields(void) __generic_field(int, common_cpu, FILTER_CPU); __generic_field(char *, COMM, FILTER_COMM); __generic_field(char *, comm, FILTER_COMM); + __generic_field(char *, stacktrace, FILTER_STACKTRACE); + __generic_field(char *, STACKTRACE, FILTER_STACKTRACE); return ret; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 543cb7dc84ad..b97d3ad832f1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1364,7 +1364,7 @@ static const char *hist_field_name(struct hist_field *field, if (field->field) field_name = field->field->name; else - field_name = "stacktrace"; + field_name = "common_stacktrace"; } else if (field->flags & HIST_FIELD_FL_HITCOUNT) field_name = "hitcount"; @@ -2367,7 +2367,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->enable_timestamps = true; if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS) hist_data->attrs->ts_in_usecs = true; - } else if (strcmp(field_name, "stacktrace") == 0) { + } else if (strcmp(field_name, "common_stacktrace") == 0) { *flags |= HIST_FIELD_FL_STACKTRACE; } else if (strcmp(field_name, "common_cpu") == 0) *flags |= HIST_FIELD_FL_CPU; @@ -2378,11 +2378,15 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, if (!field || !field->size) { /* * For backward compatibility, if field_name - * was "cpu", then we treat this the same as - * common_cpu. This also works for "CPU". + * was "cpu" or "stacktrace", then we treat this + * the same as common_cpu and common_stacktrace + * respectively. This also works for "CPU", and + * "STACKTRACE". */ if (field && field->filter_type == FILTER_CPU) { *flags |= HIST_FIELD_FL_CPU; + } else if (field && field->filter_type == FILTER_STACKTRACE) { + *flags |= HIST_FIELD_FL_STACKTRACE; } else { hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); @@ -5394,7 +5398,7 @@ static void hist_trigger_print_key(struct seq_file *m, if (key_field->field) seq_printf(m, "%s.stacktrace", key_field->field->name); else - seq_puts(m, "stacktrace:\n"); + seq_puts(m, "common_stacktrace:\n"); hist_trigger_stacktrace_print(m, key + key_field->offset, HIST_STACKTRACE_DEPTH); @@ -5977,7 +5981,7 @@ static int event_hist_trigger_print(struct seq_file *m, if (field->field) seq_printf(m, "%s.stacktrace", field->field->name); else - seq_puts(m, "stacktrace"); + seq_puts(m, "common_stacktrace"); } else hist_field_print(m, field); } -- cgit v1.2.3 From 5bd4990f19b0fedbba5511b44c1cd1fdb7061f3c Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 14:50:13 +0100 Subject: trace: Convert trace/seq to use copy_splice_read() For the splice from the trace seq buffer, just use copy_splice_read(). In the future, something better can probably be done by gifting pages from seq->buf into the pipe, but that would require changing seq->buf into a vmap over an array of pages. 
Signed-off-by: David Howells cc: Christoph Hellwig cc: Al Viro cc: Jens Axboe cc: Steven Rostedt cc: Masami Hiramatsu cc: linux-kernel@vger.kernel.org cc: linux-trace-kernel@vger.kernel.org cc: linux-fsdevel@vger.kernel.org cc: linux-block@vger.kernel.org cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/20230522135018.2742245-27-dhowells@redhat.com Signed-off-by: Jens Axboe --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ebc59781456a..c210d02fac97 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5171,7 +5171,7 @@ static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, .read_iter = seq_read_iter, - .splice_read = generic_file_splice_read, + .splice_read = copy_splice_read, .write = tracing_write_stub, .llseek = tracing_lseek, .release = tracing_release, -- cgit v1.2.3 From c7dce4c5d9f6b17feec5ec6056453d019ee4d13b Mon Sep 17 00:00:00 2001 From: Azeem Shaikh Date: Tue, 16 May 2023 14:39:56 +0000 Subject: tracing: Replace all non-returning strlcpy with strscpy strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated [1]. In an effort to remove strlcpy() completely [2], replace strlcpy() here with strscpy(). No return values were used, so direct replacement with strlcpy is safe. [1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [2] https://github.com/KSPP/linux/issues/89 Signed-off-by: Azeem Shaikh Acked-by: Masami Hiramatsu (Google) Reviewed-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20230516143956.1367827-1-azeemshaikh38@gmail.com --- kernel/trace/trace.c | 8 ++++---- kernel/trace/trace_events.c | 4 ++-- kernel/trace/trace_events_inject.c | 4 ++-- kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_probe.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ebc59781456a..28ccd0c9bdf0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -196,7 +196,7 @@ static int boot_snapshot_index; static int __init set_cmdline_ftrace(char *str) { - strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); + strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ ring_buffer_expanded = true; @@ -281,7 +281,7 @@ static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) { - strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); + strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); return 1; } __setup("trace_options=", set_trace_boot_options); @@ -291,7 +291,7 @@ static char *trace_boot_clock __initdata; static int __init set_trace_boot_clock(char *str) { - strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); + strscpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); trace_boot_clock = trace_boot_clock_buf; return 1; } @@ -2521,7 +2521,7 @@ static void __trace_find_cmdline(int pid, char comm[]) if (map != NO_CMDLINE_MAP) { tpid = savedcmd->map_cmdline_to_pid[map]; if (tpid == pid) { - strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); + strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); return; } } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 
654ffa40457a..dc83a259915b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2831,7 +2831,7 @@ static __init int setup_trace_triggers(char *str) char *buf; int i; - strlcpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); + strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); ring_buffer_expanded = true; disable_tracing_selftest("running event triggers"); @@ -3621,7 +3621,7 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; static __init int setup_trace_event(char *str) { - strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); + strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE); ring_buffer_expanded = true; disable_tracing_selftest("running event tracing"); diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index d6b4935a78c0..abe805d471eb 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -217,7 +217,7 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) char *addr = (char *)(unsigned long) val; if (field->filter_type == FILTER_STATIC_STRING) { - strlcpy(entry + field->offset, addr, field->size); + strscpy(entry + field->offset, addr, field->size); } else if (field->filter_type == FILTER_DYN_STRING || field->filter_type == FILTER_RDYN_STRING) { int str_len = strlen(addr) + 1; @@ -232,7 +232,7 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) } entry = *pentry; - strlcpy(entry + (entry_size - str_len), addr, str_len); + strscpy(entry + (entry_size - str_len), addr, str_len); str_item = (u32 *)(entry + field->offset); if (field->filter_type == FILTER_RDYN_STRING) str_loc -= field->offset + field->size; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 59cda19a9033..1b3fa7b854aa 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -30,7 +30,7 @@ static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata; static int __init set_kprobe_boot_events(char *str) { - strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); + strscpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); disable_tracing_selftest("running kprobe events"); return 1; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 2d2616678295..73055ba8d8ef 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -254,7 +254,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, trace_probe_log_err(offset, GROUP_TOO_LONG); return -EINVAL; } - strlcpy(buf, event, slash - event + 1); + strscpy(buf, event, slash - event + 1); if (!is_good_system_name(buf)) { trace_probe_log_err(offset, BAD_GROUP_NAME); return -EINVAL; -- cgit v1.2.3 From e8352cf5778c53b80fdcee086278b2048ddb8f98 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 28 May 2023 01:17:38 -0400 Subject: tracing: Move setting of tracing_selftest_running out of register_tracer() The variables tracing_selftest_running and tracing_selftest_disabled are only used for when CONFIG_FTRACE_STARTUP_TEST is enabled. Make them only visible within the selftest code. The setting of those variables are in the register_tracer() call, and set in a location where they do not need to be. Create a wrapper around run_tracer_selftest() called do_run_tracer_selftest() which sets those variables, and have register_tracer() call that instead. 
Having those variables only set within the CONFIG_FTRACE_STARTUP_TEST scope gets rid of them (and also the ability to remove testing against them) when the startup tests are not enabled (most cases). Link: https://lkml.kernel.org/r/20230528051742.1325503-2-rostedt@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 81801dc31784..87e5920b141f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2041,6 +2041,17 @@ static int run_tracer_selftest(struct tracer *type) return 0; } +static int do_run_tracer_selftest(struct tracer *type) +{ + int ret; + + tracing_selftest_running = true; + ret = run_tracer_selftest(type); + tracing_selftest_running = false; + + return ret; +} + static __init int init_trace_selftests(void) { struct trace_selftests *p, *n; @@ -2092,6 +2103,10 @@ static inline int run_tracer_selftest(struct tracer *type) { return 0; } +static inline int do_run_tracer_selftest(struct tracer *type) +{ + return 0; +} #endif /* CONFIG_FTRACE_STARTUP_TEST */ static void add_tracer_options(struct trace_array *tr, struct tracer *t); @@ -2127,8 +2142,6 @@ int __init register_tracer(struct tracer *type) mutex_lock(&trace_types_lock); - tracing_selftest_running = true; - for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ @@ -2157,7 +2170,7 @@ int __init register_tracer(struct tracer *type) /* store the tracer for __set_tracer_option */ type->flags->trace = type; - ret = run_tracer_selftest(type); + ret = do_run_tracer_selftest(type); if (ret < 0) goto out; @@ -2166,7 +2179,6 @@ int __init register_tracer(struct tracer *type) add_tracer_options(&global_trace, type); out: - tracing_selftest_running = false; mutex_unlock(&trace_types_lock); if (ret || !default_bootup_tracer) -- cgit v1.2.3 From 9da705d432a07927526005a0688d81fbbf30e349 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 28 May 2023 01:17:39 -0400 Subject: tracing: Have tracer selftests call cond_resched() before running As there are more and more internal selftests being added to the Linux kernel (KSAN, lockdep, etc) the selftests are taking longer to run when these are enabled. Add a cond_resched() to the calling of do_run_tracer_selftest() to force a schedule if NEED_RESCHED is set, otherwise the soft lockup watchdog may trigger on boot up. Link: https://lkml.kernel.org/r/20230528051742.1325503-3-rostedt@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 87e5920b141f..70f2b511b9cd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2045,6 +2045,13 @@ static int do_run_tracer_selftest(struct tracer *type) { int ret; + /* + * Tests can take a long time, especially if they are run one after the + * other, as does happen during bootup when all the tracers are + * registered. This could cause the soft lockup watchdog to trigger. 
+ */ + cond_resched(); + tracing_selftest_running = true; ret = run_tracer_selftest(type); tracing_selftest_running = false; -- cgit v1.2.3 From a3ae76d7ff781208100e6acc58eb09afb7b4b177 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 28 May 2023 01:17:40 -0400 Subject: tracing: Make tracing_selftest_running/delete nops when not used There's no reason to test the condition variables tracing_selftest_running or tracing_selftest_delete when tracing selftests are not enabled. Make them define 0s when not the selftests are not configured in. Link: https://lkml.kernel.org/r/20230528051742.1325503-4-rostedt@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 70f2b511b9cd..004f5f99e943 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -60,6 +60,7 @@ */ bool ring_buffer_expanded; +#ifdef CONFIG_FTRACE_STARTUP_TEST /* * We need to change this state when a selftest is running. * A selftest will lurk into the ring-buffer to count the @@ -75,7 +76,6 @@ static bool __read_mostly tracing_selftest_running; */ bool __read_mostly tracing_selftest_disabled; -#ifdef CONFIG_FTRACE_STARTUP_TEST void __init disable_tracing_selftest(const char *reason) { if (!tracing_selftest_disabled) { @@ -83,6 +83,9 @@ void __init disable_tracing_selftest(const char *reason) pr_info("Ftrace startup test is disabled due to %s\n", reason); } } +#else +#define tracing_selftest_running 0 +#define tracing_selftest_disabled 0 #endif /* Pipe tracepoints to printk */ -- cgit v1.2.3 From ac9d2cb1d5f8e22235c399338504dadc87d14e74 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 28 May 2023 01:17:41 -0400 Subject: tracing: Only make selftest conditionals affect the global_trace The tracing_selftest_running and tracing_selftest_disabled variables were to keep trace_printk() and other writes from affecting the tracing selftests, as the tracing selftests would examine the ring buffer to see if it contained what it expected or not. trace_printk() and friends could add to the ring buffer and cause the selftests to fail (and then disable the tracer that was being tested). To keep that from happening, these variables were added and would keep trace_printk() and friends from writing to the ring buffer while the tests were going on. But this was only the top level ring buffer (owned by the global_trace instance). There is no reason to prevent writing into ring buffers of other instances via the trace_array_printk() and friends. For the functions that could be used by other instances, check if the global_trace is the tracer instance that is being written to before deciding to not allow the write. 
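The shape of the guard this adds (mirroring the diff below):

	/* Only writes aimed at the top-level buffer are dropped while the
	 * boot-time selftests run; private instances keep tracing. */
	if (unlikely(tracing_selftest_running && tr == &global_trace))
		return 0;

	if (unlikely(tracing_disabled))
		return 0;

So, for example, trace_array_printk() into an instance created by a module is no longer silenced during the selftests, while trace_printk(), which writes to the top-level buffer, still is.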
Link: https://lkml.kernel.org/r/20230528051742.1325503-5-rostedt@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 004f5f99e943..64a4dde073ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1054,7 +1054,10 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; - if (unlikely(tracing_selftest_running || tracing_disabled)) + if (unlikely(tracing_selftest_running && tr == &global_trace)) + return 0; + + if (unlikely(tracing_disabled)) return 0; alloc = sizeof(*entry) + size + 2; /* possible \n added */ @@ -3512,7 +3515,7 @@ __trace_array_vprintk(struct trace_buffer *buffer, unsigned int trace_ctx; char *tbuffer; - if (tracing_disabled || tracing_selftest_running) + if (tracing_disabled) return 0; /* Don't pollute graph traces with trace_vprintk internals */ @@ -3560,6 +3563,9 @@ __printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { + if (tracing_selftest_running && tr == &global_trace) + return 0; + return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); } -- cgit v1.2.3 From a2d910f02231f33118647fc438157ae69c073f89 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 28 May 2023 01:17:42 -0400 Subject: tracing: Have function_graph selftest call cond_resched() When all kernel debugging is enabled (lockdep, KSAN, etc), the function graph enabling and disabling can take several seconds to complete. The function_graph selftest enables and disables function graph tracing several times. With full debugging enabled, the soft lockup watchdog was triggering because the selftest was running without ever scheduling. Add cond_resched() throughout the test to make sure it does not trigger the soft lockup detector. Link: https://lkml.kernel.org/r/20230528051742.1325503-6-rostedt@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_selftest.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a931d9aaea26..529590499b1f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -848,6 +848,12 @@ trace_selftest_startup_function_graph(struct tracer *trace, } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + /* + * These tests can take some time to run. Make sure on non PREEMPT + * kernels, we do not trigger the softlockup detector. + */ + cond_resched(); + tracing_reset_online_cpus(&tr->array_buffer); set_graph_array(tr); @@ -869,6 +875,8 @@ trace_selftest_startup_function_graph(struct tracer *trace, if (ret) goto out; + cond_resched(); + ret = register_ftrace_graph(&fgraph_ops); if (ret) { warn_failed_init_tracer(trace, ret); @@ -891,6 +899,8 @@ trace_selftest_startup_function_graph(struct tracer *trace, if (ret) goto out; + cond_resched(); + tracing_start(); if (!ret && !count) { -- cgit v1.2.3 From d0c2d66fcc8db748edfe60e3b443eaff931f50e9 Mon Sep 17 00:00:00 2001 From: Azeem Shaikh Date: Wed, 17 May 2023 14:53:23 +0000 Subject: ftrace: Replace all non-returning strlcpy with strscpy strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated [1]. 
In an effort to remove strlcpy() completely [2], replace strlcpy() here with strscpy(). No return values were used, so direct replacement is safe. [1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [2] https://github.com/KSPP/linux/issues/89 Signed-off-by: Azeem Shaikh Reviewed-by: Kees Cook Acked-by: Masami Hiramatsu (Google) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20230517145323.1522010-1-azeemshaikh38@gmail.com --- kernel/trace/ftrace.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 764668467155..6a77edb51f18 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5743,7 +5743,7 @@ bool ftrace_filter_param __initdata; static int __init set_ftrace_notrace(char *str) { ftrace_filter_param = true; - strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + strscpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_notrace=", set_ftrace_notrace); @@ -5751,7 +5751,7 @@ __setup("ftrace_notrace=", set_ftrace_notrace); static int __init set_ftrace_filter(char *str) { ftrace_filter_param = true; - strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + strscpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_filter=", set_ftrace_filter); @@ -5763,14 +5763,14 @@ static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); static int __init set_graph_function(char *str) { - strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + strscpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_graph_filter=", set_graph_function); static int __init set_graph_notrace_function(char *str) { - strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE); + strscpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_graph_notrace=", set_graph_notrace_function); @@ -6569,8 +6569,8 @@ static int ftrace_get_trampoline_kallsym(unsigned int symnum, continue; *value = op->trampoline; *type = 't'; - strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN); - strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN); + strscpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN); + strscpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN); *exported = 0; return 0; } @@ -6933,7 +6933,7 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, if (off) *off = addr - found_func->ip; if (sym) - strlcpy(sym, found_func->name, KSYM_NAME_LEN); + strscpy(sym, found_func->name, KSYM_NAME_LEN); return found_func->name; } @@ -6987,8 +6987,8 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, *value = mod_func->ip; *type = 'T'; - strlcpy(name, mod_func->name, KSYM_NAME_LEN); - strlcpy(module_name, mod_map->mod->name, MODULE_NAME_LEN); + strscpy(name, mod_func->name, KSYM_NAME_LEN); + strscpy(module_name, mod_map->mod->name, MODULE_NAME_LEN); *exported = 1; preempt_enable(); return 0; -- cgit v1.2.3 From 81d0fa4cb4fc0e1a49c2b22f92c43d9fe972ebcf Mon Sep 17 00:00:00 2001 From: Pietro Borrello Date: Sat, 28 Jan 2023 16:23:41 +0000 Subject: tracing/probe: trace_probe_primary_from_call(): checked list_first_entry All callers of trace_probe_primary_from_call() check the return value to be non NULL. However, the function returns list_first_entry(&tpe->probes, ...) which can never be NULL. Additionally, it does not check for the list being possibly empty, possibly causing a type confusion on empty lists. 
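A sketch of the difference (illustrative only, simplified from trace_probe_primary_from_call()):

  /* If tpe->probes is empty, the list head points at itself, so
   * list_first_entry() does container_of() on the head and returns a
   * bogus - but non-NULL - struct trace_probe pointer that the callers'
   * NULL checks cannot catch. */
  tp = list_first_entry(&tpe->probes, struct trace_probe, list);

  /* list_first_entry_or_null() checks for the empty list first and
   * returns NULL, which the existing callers already handle. */
  tp = list_first_entry_or_null(&tpe->probes, struct trace_probe, list);
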
Use list_first_entry_or_null() which solves both problems. Link: https://lore.kernel.org/linux-trace-kernel/20230128-list-entry-null-check-v1-1-8bde6a3da2ef@diag.uniroma1.it/ Fixes: 60d53e2c3b75 ("tracing/probe: Split trace_event related data from trace_probe") Signed-off-by: Pietro Borrello Reviewed-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) Acked-by: Mukesh Ojha Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index ef8ed3b65d05..6a4ecfb1da43 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -308,7 +308,7 @@ trace_probe_primary_from_call(struct trace_event_call *call) { struct trace_probe_event *tpe = trace_probe_event_from_call(call); - return list_first_entry(&tpe->probes, struct trace_probe, list); + return list_first_entry_or_null(&tpe->probes, struct trace_probe, list); } static inline struct list_head *trace_probe_probe_list(struct trace_probe *tp) -- cgit v1.2.3 From cb16330d12741f6dae56aad5acf62f5be3a06c4e Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 6 Jun 2023 21:39:55 +0900 Subject: fprobe: Pass return address to the handlers Pass return address as 'ret_ip' to the fprobe entry and return handlers so that the fprobe user handler can get the reutrn address without analyzing arch-dependent pt_regs. Link: https://lore.kernel.org/all/168507467664.913472.11642316698862778600.stgit@mhiramat.roam.corp.google.com/ Signed-off-by: Masami Hiramatsu (Google) --- include/linux/fprobe.h | 6 ++++-- include/linux/rethook.h | 2 +- kernel/kprobes.c | 1 + kernel/trace/bpf_trace.c | 6 ++++-- kernel/trace/fprobe.c | 6 +++--- kernel/trace/rethook.c | 3 ++- lib/test_fprobe.c | 10 +++++++--- samples/fprobe/fprobe_example.c | 6 ++++-- 8 files changed, 26 insertions(+), 14 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 47fefc7f363b..134f0f59ffa8 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -35,9 +35,11 @@ struct fprobe { int nr_maxactive; int (*entry_handler)(struct fprobe *fp, unsigned long entry_ip, - struct pt_regs *regs, void *entry_data); + unsigned long ret_ip, struct pt_regs *regs, + void *entry_data); void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip, - struct pt_regs *regs, void *entry_data); + unsigned long ret_ip, struct pt_regs *regs, + void *entry_data); }; /* This fprobe is soft-disabled. */ diff --git a/include/linux/rethook.h b/include/linux/rethook.h index c8ac1e5afcd1..fdf26cd0e742 100644 --- a/include/linux/rethook.h +++ b/include/linux/rethook.h @@ -14,7 +14,7 @@ struct rethook_node; -typedef void (*rethook_handler_t) (struct rethook_node *, void *, struct pt_regs *); +typedef void (*rethook_handler_t) (struct rethook_node *, void *, unsigned long, struct pt_regs *); /** * struct rethook - The rethook management data structure. 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 00e177de91cc..ce13f1a35251 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2127,6 +2127,7 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) NOKPROBE_SYMBOL(pre_handler_kretprobe); static void kretprobe_rethook_handler(struct rethook_node *rh, void *data, + unsigned long ret_addr, struct pt_regs *regs) { struct kretprobe *rp = (struct kretprobe *)data; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9a050e36dc6c..987c76d94604 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2642,7 +2642,8 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, static int kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, - struct pt_regs *regs, void *data) + unsigned long ret_ip, struct pt_regs *regs, + void *data) { struct bpf_kprobe_multi_link *link; @@ -2653,7 +2654,8 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, static void kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip, - struct pt_regs *regs, void *data) + unsigned long ret_ip, struct pt_regs *regs, + void *data) { struct bpf_kprobe_multi_link *link; diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 18d36842faf5..32994815edf6 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -46,7 +46,7 @@ static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip, } if (fp->entry_handler) - ret = fp->entry_handler(fp, ip, ftrace_get_regs(fregs), entry_data); + ret = fp->entry_handler(fp, ip, parent_ip, ftrace_get_regs(fregs), entry_data); /* If entry_handler returns !0, nmissed is not counted. */ if (rh) { @@ -112,7 +112,7 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, } static void fprobe_exit_handler(struct rethook_node *rh, void *data, - struct pt_regs *regs) + unsigned long ret_ip, struct pt_regs *regs) { struct fprobe *fp = (struct fprobe *)data; struct fprobe_rethook_node *fpr; @@ -133,7 +133,7 @@ static void fprobe_exit_handler(struct rethook_node *rh, void *data, return; } - fp->exit_handler(fp, fpr->entry_ip, regs, + fp->exit_handler(fp, fpr->entry_ip, ret_ip, regs, fp->entry_data_size ? 
(void *)fpr->data : NULL); ftrace_test_recursion_unlock(bit); } diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index 60f6cb2b486b..f32ee484391a 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -301,7 +301,8 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs, break; handler = READ_ONCE(rhn->rethook->handler); if (handler) - handler(rhn, rhn->rethook->data, regs); + handler(rhn, rhn->rethook->data, + correct_ret_addr, regs); if (first == node) break; diff --git a/lib/test_fprobe.c b/lib/test_fprobe.c index 079435a2e26c..24de0e5ff859 100644 --- a/lib/test_fprobe.c +++ b/lib/test_fprobe.c @@ -39,7 +39,8 @@ static noinline u32 fprobe_selftest_nest_target(u32 value, u32 (*nest)(u32)) } static notrace int fp_entry_handler(struct fprobe *fp, unsigned long ip, - struct pt_regs *regs, void *data) + unsigned long ret_ip, + struct pt_regs *regs, void *data) { KUNIT_EXPECT_FALSE(current_test, preemptible()); /* This can be called on the fprobe_selftest_target and the fprobe_selftest_target2 */ @@ -57,6 +58,7 @@ static notrace int fp_entry_handler(struct fprobe *fp, unsigned long ip, } static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip, + unsigned long ret_ip, struct pt_regs *regs, void *data) { unsigned long ret = regs_return_value(regs); @@ -78,14 +80,16 @@ static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip, } static notrace int nest_entry_handler(struct fprobe *fp, unsigned long ip, - struct pt_regs *regs, void *data) + unsigned long ret_ip, + struct pt_regs *regs, void *data) { KUNIT_EXPECT_FALSE(current_test, preemptible()); return 0; } static notrace void nest_exit_handler(struct fprobe *fp, unsigned long ip, - struct pt_regs *regs, void *data) + unsigned long ret_ip, + struct pt_regs *regs, void *data) { KUNIT_EXPECT_FALSE(current_test, preemptible()); KUNIT_EXPECT_EQ(current_test, ip, target_nest_ip); diff --git a/samples/fprobe/fprobe_example.c b/samples/fprobe/fprobe_example.c index 4efc8feb6277..64e715e7ed11 100644 --- a/samples/fprobe/fprobe_example.c +++ b/samples/fprobe/fprobe_example.c @@ -49,6 +49,7 @@ static void show_backtrace(void) } static int sample_entry_handler(struct fprobe *fp, unsigned long ip, + unsigned long ret_ip, struct pt_regs *regs, void *data) { if (use_trace) @@ -65,10 +66,11 @@ static int sample_entry_handler(struct fprobe *fp, unsigned long ip, return 0; } -static void sample_exit_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs, +static void sample_exit_handler(struct fprobe *fp, unsigned long ip, + unsigned long ret_ip, struct pt_regs *regs, void *data) { - unsigned long rip = instruction_pointer(regs); + unsigned long rip = ret_ip; if (use_trace) /* -- cgit v1.2.3 From 30460c21ed40a10bf541c4e93ba5e80bb4aac5da Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 6 Jun 2023 21:39:55 +0900 Subject: tracing/probes: Avoid setting TPARG_FL_FENTRY and TPARG_FL_RETURN When parsing a kprobe event, the return probe always sets both TPARG_FL_RETURN and TPARG_FL_FENTRY, but this is not useful because some fetchargs are only for return probe and some others only for function entry. Make it obviously mutual exclusive. 
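A simplified sketch of the fixed flag selection in __trace_kprobe_create() (error handling and the deferred ENOENT case omitted):

  if (is_return)
          flags |= TPARG_FL_RETURN;

  ret = kprobe_on_func_entry(NULL, symbol, offset);
  /* TPARG_FL_FENTRY now marks only function-entry probes, so fetchargs
   * that are valid only at entry ($argN) or only at exit ($retval) can
   * be rejected when used with the wrong probe type. */
  if (ret == 0 && !is_return)
          flags |= TPARG_FL_FENTRY;
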
Link: https://lore.kernel.org/all/168507468731.913472.11354553441385410734.stgit@mhiramat.roam.corp.google.com/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_probe.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 59cda19a9033..867ffb7ee31d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -825,7 +825,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) if (is_return) flags |= TPARG_FL_RETURN; ret = kprobe_on_func_entry(NULL, symbol, offset); - if (ret == 0) + if (ret == 0 && !is_return) flags |= TPARG_FL_FENTRY; /* Defer the ENOENT case until register kprobe */ if (ret == -EINVAL && is_return) { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 6a4ecfb1da43..5df59714f9f5 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -357,6 +357,11 @@ int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_a #define trace_probe_for_each_link_rcu(pos, tp) \ list_for_each_entry_rcu(pos, &(tp)->event->files, list) +/* + * The flags used for parsing trace_probe arguments. + * TPARG_FL_RETURN, TPARG_FL_FENTRY and TPARG_FL_TPOINT are mutually exclusive. + * TPARG_FL_KERNEL and TPARG_FL_USER are also mutually exclusive. + */ #define TPARG_FL_RETURN BIT(0) #define TPARG_FL_KERNEL BIT(1) #define TPARG_FL_FENTRY BIT(2) -- cgit v1.2.3 From 334e5519c3757019cc591d4539d5aca199bdb114 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 6 Jun 2023 21:39:55 +0900 Subject: tracing/probes: Add fprobe events for tracing function entry and exit. Add fprobe events for tracing function entry and exit instead of kprobe events. With this change, we can continue to trace function entry/exit even if the CONFIG_KPROBES_ON_FTRACE is not available. Since CONFIG_KPROBES_ON_FTRACE requires the CONFIG_DYNAMIC_FTRACE_WITH_REGS, it is not available if the architecture only supports CONFIG_DYNAMIC_FTRACE_WITH_ARGS. And that means kprobe events can not probe function entry/exit effectively on such architecture. But this can be solved if the dynamic events supports fprobe events. The fprobe event is a new dynamic events which is only for the function (symbol) entry and exit. This event accepts non register fetch arguments so that user can trace the function arguments and return values. The fprobe events syntax is here; f[:[GRP/][EVENT]] FUNCTION [FETCHARGS] f[MAXACTIVE][:[GRP/][EVENT]] FUNCTION%return [FETCHARGS] E.g. # echo 'f vfs_read $arg1' >> dynamic_events # echo 'f vfs_read%return $retval' >> dynamic_events # cat dynamic_events f:fprobes/vfs_read__entry vfs_read arg1=$arg1 f:fprobes/vfs_read__exit vfs_read%return arg1=$retval # echo 1 > events/fprobes/enable # head -n 20 trace | tail # TASK-PID CPU# ||||| TIMESTAMP FUNCTION # | | | ||||| | | sh-142 [005] ...1. 448.386420: vfs_read__entry: (vfs_read+0x4/0x340) arg1=0xffff888007f7c540 sh-142 [005] ..... 448.386436: vfs_read__exit: (ksys_read+0x75/0x100 <- vfs_read) arg1=0x1 sh-142 [005] ...1. 448.386451: vfs_read__entry: (vfs_read+0x4/0x340) arg1=0xffff888007f7c540 sh-142 [005] ..... 448.386458: vfs_read__exit: (ksys_read+0x75/0x100 <- vfs_read) arg1=0x1 sh-142 [005] ...1. 448.386469: vfs_read__entry: (vfs_read+0x4/0x340) arg1=0xffff888007f7c540 sh-142 [005] ..... 448.386476: vfs_read__exit: (ksys_read+0x75/0x100 <- vfs_read) arg1=0x1 sh-142 [005] ...1. 
448.602073: vfs_read__entry: (vfs_read+0x4/0x340) arg1=0xffff888007f7c540 sh-142 [005] ..... 448.602089: vfs_read__exit: (ksys_read+0x75/0x100 <- vfs_read) arg1=0x1 Link: https://lore.kernel.org/all/168507469754.913472.6112857614708350210.stgit@mhiramat.roam.corp.google.com/ Reported-by: kernel test robot Link: https://lore.kernel.org/all/202302011530.7vm4O8Ro-lkp@intel.com/ Signed-off-by: Masami Hiramatsu (Google) --- include/linux/fprobe.h | 5 + include/linux/trace_events.h | 3 + kernel/trace/Kconfig | 14 + kernel/trace/Makefile | 1 + kernel/trace/fprobe.c | 11 +- kernel/trace/trace.c | 8 +- kernel/trace/trace.h | 11 + kernel/trace/trace_fprobe.c | 1053 ++++++++++++++++++++ kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_probe.c | 4 +- kernel/trace/trace_probe.h | 3 +- .../ftrace/test.d/kprobe/kprobe_syntax_errors.tc | 2 +- 12 files changed, 1109 insertions(+), 8 deletions(-) create mode 100644 kernel/trace/trace_fprobe.c (limited to 'kernel/trace') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 134f0f59ffa8..3e03758151f4 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -66,6 +66,7 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num); int register_fprobe_syms(struct fprobe *fp, const char **syms, int num); int unregister_fprobe(struct fprobe *fp); +bool fprobe_is_registered(struct fprobe *fp); #else static inline int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter) { @@ -83,6 +84,10 @@ static inline int unregister_fprobe(struct fprobe *fp) { return -EOPNOTSUPP; } +static inline bool fprobe_is_registered(struct fprobe *fp) +{ + return false; +} #endif /** diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 7c4a0b72334e..3930e676436c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -318,6 +318,7 @@ enum { TRACE_EVENT_FL_KPROBE_BIT, TRACE_EVENT_FL_UPROBE_BIT, TRACE_EVENT_FL_EPROBE_BIT, + TRACE_EVENT_FL_FPROBE_BIT, TRACE_EVENT_FL_CUSTOM_BIT, }; @@ -332,6 +333,7 @@ enum { * KPROBE - Event is a kprobe * UPROBE - Event is a uprobe * EPROBE - Event is an event probe + * FPROBE - Event is an function probe * CUSTOM - Event is a custom event (to be attached to an exsiting tracepoint) * This is set when the custom event has not been attached * to a tracepoint yet, then it is cleared when it is. @@ -346,6 +348,7 @@ enum { TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT), TRACE_EVENT_FL_EPROBE = (1 << TRACE_EVENT_FL_EPROBE_BIT), + TRACE_EVENT_FL_FPROBE = (1 << TRACE_EVENT_FL_FPROBE_BIT), TRACE_EVENT_FL_CUSTOM = (1 << TRACE_EVENT_FL_CUSTOM_BIT), }; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8cf97fa4a4b3..8e10a9453c96 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -650,6 +650,20 @@ config BLK_DEV_IO_TRACE If unsure, say N. +config FPROBE_EVENTS + depends on FPROBE + depends on HAVE_REGS_AND_STACK_ACCESS_API + bool "Enable fprobe-based dynamic events" + select TRACING + select PROBE_EVENTS + select DYNAMIC_EVENTS + default y + help + This allows user to add tracing events on the function entry and + exit via ftrace interface. The syntax is same as the kprobe events + and the kprobe events on function entry and exit will be + transparently converted to this fprobe events. 
+ config KPROBE_EVENTS depends on KPROBES depends on HAVE_REGS_AND_STACK_ACCESS_API diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c6651e16b557..64b61f67a403 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -104,6 +104,7 @@ obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o obj-$(CONFIG_FPROBE) += fprobe.o obj-$(CONFIG_RETHOOK) += rethook.o +obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o obj-$(CONFIG_RV) += rv/ diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 32994815edf6..e4704ec26df7 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -348,6 +348,14 @@ int register_fprobe_syms(struct fprobe *fp, const char **syms, int num) } EXPORT_SYMBOL_GPL(register_fprobe_syms); +bool fprobe_is_registered(struct fprobe *fp) +{ + if (!fp || (fp->ops.saved_func != fprobe_handler && + fp->ops.saved_func != fprobe_kprobe_handler)) + return false; + return true; +} + /** * unregister_fprobe() - Unregister fprobe from ftrace * @fp: A fprobe data structure to be unregistered. @@ -360,8 +368,7 @@ int unregister_fprobe(struct fprobe *fp) { int ret; - if (!fp || (fp->ops.saved_func != fprobe_handler && - fp->ops.saved_func != fprobe_kprobe_handler)) + if (!fprobe_is_registered(fp)) return -EINVAL; /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 64a4dde073ef..755b0bf2e1ac 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5672,10 +5672,16 @@ static const char readme_msg[] = " uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif -#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) || \ + defined(CONFIG_FPROBE_EVENTS) "\t accepts: event-definitions (one definition per line)\n" +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) "\t Format: p[:[/][]] []\n" "\t r[maxactive][:[/][]] []\n" +#endif +#ifdef CONFIG_FPROBE_EVENTS + "\t f[:[/][]] [%return] []\n" +#endif #ifdef CONFIG_HIST_TRIGGERS "\t s:[synthetic/] []\n" #endif diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 79bdefe9261b..b5ab5479f9e3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -148,6 +148,17 @@ struct kretprobe_trace_entry_head { unsigned long ret_ip; }; +struct fentry_trace_entry_head { + struct trace_entry ent; + unsigned long ip; +}; + +struct fexit_trace_entry_head { + struct trace_entry ent; + unsigned long func; + unsigned long ret_ip; +}; + #define TRACE_BUF_SIZE 1024 struct trace_array; diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c new file mode 100644 index 000000000000..48dbbc72b7dd --- /dev/null +++ b/kernel/trace/trace_fprobe.c @@ -0,0 +1,1053 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Fprobe-based tracing events + * Copyright (C) 2022 Google LLC. 
+ */ +#define pr_fmt(fmt) "trace_fprobe: " fmt + +#include +#include +#include +#include +#include + +#include "trace_dynevent.h" +#include "trace_probe.h" +#include "trace_probe_kernel.h" +#include "trace_probe_tmpl.h" + +#define FPROBE_EVENT_SYSTEM "fprobes" +#define RETHOOK_MAXACTIVE_MAX 4096 + +static int trace_fprobe_create(const char *raw_command); +static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev); +static int trace_fprobe_release(struct dyn_event *ev); +static bool trace_fprobe_is_busy(struct dyn_event *ev); +static bool trace_fprobe_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev); + +static struct dyn_event_operations trace_fprobe_ops = { + .create = trace_fprobe_create, + .show = trace_fprobe_show, + .is_busy = trace_fprobe_is_busy, + .free = trace_fprobe_release, + .match = trace_fprobe_match, +}; + +/* + * Fprobe event core functions + */ +struct trace_fprobe { + struct dyn_event devent; + struct fprobe fp; + const char *symbol; + struct trace_probe tp; +}; + +static bool is_trace_fprobe(struct dyn_event *ev) +{ + return ev->ops == &trace_fprobe_ops; +} + +static struct trace_fprobe *to_trace_fprobe(struct dyn_event *ev) +{ + return container_of(ev, struct trace_fprobe, devent); +} + +/** + * for_each_trace_fprobe - iterate over the trace_fprobe list + * @pos: the struct trace_fprobe * for each entry + * @dpos: the struct dyn_event * to use as a loop cursor + */ +#define for_each_trace_fprobe(pos, dpos) \ + for_each_dyn_event(dpos) \ + if (is_trace_fprobe(dpos) && (pos = to_trace_fprobe(dpos))) + +static bool trace_fprobe_is_return(struct trace_fprobe *tf) +{ + return tf->fp.exit_handler != NULL; +} + +static const char *trace_fprobe_symbol(struct trace_fprobe *tf) +{ + return tf->symbol ? tf->symbol : "unknown"; +} + +static bool trace_fprobe_is_busy(struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + + return trace_probe_is_enabled(&tf->tp); +} + +static bool trace_fprobe_match_command_head(struct trace_fprobe *tf, + int argc, const char **argv) +{ + char buf[MAX_ARGSTR_LEN + 1]; + + if (!argc) + return true; + + snprintf(buf, sizeof(buf), "%s", trace_fprobe_symbol(tf)); + if (strcmp(buf, argv[0])) + return false; + argc--; argv++; + + return trace_probe_match_command_args(&tf->tp, argc, argv); +} + +static bool trace_fprobe_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + + if (event[0] != '\0' && strcmp(trace_probe_name(&tf->tp), event)) + return false; + + if (system && strcmp(trace_probe_group_name(&tf->tp), system)) + return false; + + return trace_fprobe_match_command_head(tf, argc, argv); +} + +static bool trace_fprobe_is_registered(struct trace_fprobe *tf) +{ + return fprobe_is_registered(&tf->fp); +} + +/* + * Note that we don't verify the fetch_insn code, since it does not come + * from user space. 
+ */ +static int +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, + void *base) +{ + struct pt_regs *regs = rec; + unsigned long val; + int ret; + +retry: + /* 1st stage: get value from context */ + switch (code->op) { + case FETCH_OP_STACK: + val = regs_get_kernel_stack_nth(regs, code->param); + break; + case FETCH_OP_STACKP: + val = kernel_stack_pointer(regs); + break; + case FETCH_OP_RETVAL: + val = regs_return_value(regs); + break; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + case FETCH_OP_ARG: + val = regs_get_kernel_argument(regs, code->param); + break; +#endif + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + ret = process_common_fetch_insn(code, &val); + if (ret < 0) + return ret; + } + code++; + + return process_fetch_insn_bottom(code, val, dest, base); +} +NOKPROBE_SYMBOL(process_fetch_insn) + +/* function entry handler */ +static nokprobe_inline void +__fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + struct pt_regs *regs, + struct trace_event_file *trace_file) +{ + struct fentry_trace_entry_head *entry; + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + struct trace_event_buffer fbuffer; + int dsize; + + if (WARN_ON_ONCE(call != trace_file->event_call)) + return; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + dsize = __get_data_size(&tf->tp, regs); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tf->tp.size + dsize); + if (!entry) + return; + + fbuffer.regs = regs; + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); + entry->ip = entry_ip; + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + + trace_event_buffer_commit(&fbuffer); +} + +static void +fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + struct pt_regs *regs) +{ + struct event_file_link *link; + + trace_probe_for_each_link_rcu(link, &tf->tp) + __fentry_trace_func(tf, entry_ip, regs, link->file); +} +NOKPROBE_SYMBOL(fentry_trace_func); + +/* Kretprobe handler */ +static nokprobe_inline void +__fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + struct trace_event_file *trace_file) +{ + struct fexit_trace_entry_head *entry; + struct trace_event_buffer fbuffer; + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + int dsize; + + if (WARN_ON_ONCE(call != trace_file->event_call)) + return; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + dsize = __get_data_size(&tf->tp, regs); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tf->tp.size + dsize); + if (!entry) + return; + + fbuffer.regs = regs; + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); + entry->func = entry_ip; + entry->ret_ip = ret_ip; + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + + trace_event_buffer_commit(&fbuffer); +} + +static void +fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs) +{ + struct event_file_link *link; + + trace_probe_for_each_link_rcu(link, &tf->tp) + __fexit_trace_func(tf, entry_ip, ret_ip, regs, link->file); +} +NOKPROBE_SYMBOL(fexit_trace_func); + +#ifdef CONFIG_PERF_EVENTS + +static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, + struct pt_regs *regs) +{ + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + struct fentry_trace_entry_head *entry; + struct hlist_head *head; + int size, 
__size, dsize; + int rctx; + + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return 0; + + dsize = __get_data_size(&tf->tp, regs); + __size = sizeof(*entry) + tf->tp.size + dsize; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + entry = perf_trace_buf_alloc(size, NULL, &rctx); + if (!entry) + return 0; + + entry->ip = entry_ip; + memset(&entry[1], 0, dsize); + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); + return 0; +} +NOKPROBE_SYMBOL(fentry_perf_func); + +static void +fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs) +{ + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + struct fexit_trace_entry_head *entry; + struct hlist_head *head; + int size, __size, dsize; + int rctx; + + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return; + + dsize = __get_data_size(&tf->tp, regs); + __size = sizeof(*entry) + tf->tp.size + dsize; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + entry = perf_trace_buf_alloc(size, NULL, &rctx); + if (!entry) + return; + + entry->func = entry_ip; + entry->ret_ip = ret_ip; + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); +} +NOKPROBE_SYMBOL(fexit_perf_func); +#endif /* CONFIG_PERF_EVENTS */ + +static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + void *entry_data) +{ + struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); + int ret = 0; + + if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) + fentry_trace_func(tf, entry_ip, regs); +#ifdef CONFIG_PERF_EVENTS + if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) + ret = fentry_perf_func(tf, entry_ip, regs); +#endif + return ret; +} +NOKPROBE_SYMBOL(fentry_dispatcher); + +static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + void *entry_data) +{ + struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); + + if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) + fexit_trace_func(tf, entry_ip, ret_ip, regs); +#ifdef CONFIG_PERF_EVENTS + if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) + fexit_perf_func(tf, entry_ip, ret_ip, regs); +#endif +} +NOKPROBE_SYMBOL(fexit_dispatcher); + +static void free_trace_fprobe(struct trace_fprobe *tf) +{ + if (tf) { + trace_probe_cleanup(&tf->tp); + kfree(tf->symbol); + kfree(tf); + } +} + +/* + * Allocate new trace_probe and initialize it (including fprobe). 
+ */ +static struct trace_fprobe *alloc_trace_fprobe(const char *group, + const char *event, + const char *symbol, + int maxactive, + int nargs, bool is_return) +{ + struct trace_fprobe *tf; + int ret = -ENOMEM; + + tf = kzalloc(struct_size(tf, tp.args, nargs), GFP_KERNEL); + if (!tf) + return ERR_PTR(ret); + + tf->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tf->symbol) + goto error; + + if (is_return) + tf->fp.exit_handler = fexit_dispatcher; + else + tf->fp.entry_handler = fentry_dispatcher; + + tf->fp.nr_maxactive = maxactive; + + ret = trace_probe_init(&tf->tp, event, group, false); + if (ret < 0) + goto error; + + dyn_event_init(&tf->devent, &trace_fprobe_ops); + return tf; +error: + free_trace_fprobe(tf); + return ERR_PTR(ret); +} + +static struct trace_fprobe *find_trace_fprobe(const char *event, + const char *group) +{ + struct dyn_event *pos; + struct trace_fprobe *tf; + + for_each_trace_fprobe(tf, pos) + if (strcmp(trace_probe_name(&tf->tp), event) == 0 && + strcmp(trace_probe_group_name(&tf->tp), group) == 0) + return tf; + return NULL; +} + +static inline int __enable_trace_fprobe(struct trace_fprobe *tf) +{ + if (trace_fprobe_is_registered(tf)) + enable_fprobe(&tf->fp); + + return 0; +} + +static void __disable_trace_fprobe(struct trace_probe *tp) +{ + struct trace_fprobe *tf; + + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + if (!trace_fprobe_is_registered(tf)) + continue; + disable_fprobe(&tf->fp); + } +} + +/* + * Enable trace_probe + * if the file is NULL, enable "perf" handler, or enable "trace" handler. + */ +static int enable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + struct trace_fprobe *tf; + bool enabled; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ + if (file) { + ret = trace_probe_add_file(tp, file); + if (ret) + return ret; + } else + trace_probe_set_flag(tp, TP_FLAG_PROFILE); + + if (!enabled) { + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + /* TODO: check the fprobe is gone */ + __enable_trace_fprobe(tf); + } + } + + return 0; +} + +/* + * Disable trace_probe + * if the file is NULL, disable "perf" handler, or disable "trace" handler. + */ +static int disable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + if (file) { + if (!trace_probe_get_file_link(tp, file)) + return -ENOENT; + if (!trace_probe_has_single_file(tp)) + goto out; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); + } else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + + if (!trace_probe_is_enabled(tp)) + __disable_trace_fprobe(tp); + + out: + if (file) + /* + * Synchronization is done in below function. For perf event, + * file == NULL and perf_trace_event_unreg() calls + * tracepoint_synchronize_unregister() to ensure synchronize + * event. We don't need to care about it. 
+ */ + trace_probe_remove_file(tp, file); + + return 0; +} + +/* Event entry printers */ +static enum print_line_t +print_fentry_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct fentry_trace_entry_head *field; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + + field = (struct fentry_trace_entry_head *)iter->ent; + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; + + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); + + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_putc(s, ')'); + + if (trace_probe_print_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; + + trace_seq_putc(s, '\n'); + out: + return trace_handle_return(s); +} + +static enum print_line_t +print_fexit_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct fexit_trace_entry_head *field; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + + field = (struct fexit_trace_entry_head *)iter->ent; + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; + + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); + + if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_puts(s, " <- "); + + if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_putc(s, ')'); + + if (trace_probe_print_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; + + trace_seq_putc(s, '\n'); + + out: + return trace_handle_return(s); +} + +static int fentry_event_define_fields(struct trace_event_call *event_call) +{ + int ret; + struct fentry_trace_entry_head field; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; + + DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); + + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); +} + +static int fexit_event_define_fields(struct trace_event_call *event_call) +{ + int ret; + struct fexit_trace_entry_head field; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; + + DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); + + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); +} + +static struct trace_event_functions fentry_funcs = { + .trace = print_fentry_event +}; + +static struct trace_event_functions fexit_funcs = { + .trace = print_fexit_event +}; + +static struct trace_event_fields fentry_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = fentry_event_define_fields }, + {} +}; + +static struct trace_event_fields fexit_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = fexit_event_define_fields }, + {} +}; + +static int fprobe_register(struct trace_event_call *event, + enum trace_reg type, void *data); + +static inline void init_trace_event_call(struct trace_fprobe *tf) +{ + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + + if (trace_fprobe_is_return(tf)) { + call->event.funcs = &fexit_funcs; + call->class->fields_array = fexit_fields_array; + } else { + call->event.funcs = &fentry_funcs; + call->class->fields_array = fentry_fields_array; + } + + call->flags = TRACE_EVENT_FL_FPROBE; + call->class->reg = 
fprobe_register; +} + +static int register_fprobe_event(struct trace_fprobe *tf) +{ + init_trace_event_call(tf); + + return trace_probe_register_event_call(&tf->tp); +} + +static int unregister_fprobe_event(struct trace_fprobe *tf) +{ + return trace_probe_unregister_event_call(&tf->tp); +} + +/* Internal register function - just handle fprobe and flags */ +static int __register_trace_fprobe(struct trace_fprobe *tf) +{ + int i, ret; + + /* Should we need new LOCKDOWN flag for fprobe? */ + ret = security_locked_down(LOCKDOWN_KPROBES); + if (ret) + return ret; + + if (trace_fprobe_is_registered(tf)) + return -EINVAL; + + for (i = 0; i < tf->tp.nr_args; i++) { + ret = traceprobe_update_arg(&tf->tp.args[i]); + if (ret) + return ret; + } + + /* Set/clear disabled flag according to tp->flag */ + if (trace_probe_is_enabled(&tf->tp)) + tf->fp.flags &= ~FPROBE_FL_DISABLED; + else + tf->fp.flags |= FPROBE_FL_DISABLED; + + /* TODO: handle filter, nofilter or symbol list */ + return register_fprobe(&tf->fp, tf->symbol, NULL); +} + +/* Internal unregister function - just handle fprobe and flags */ +static void __unregister_trace_fprobe(struct trace_fprobe *tf) +{ + if (trace_fprobe_is_registered(tf)) { + unregister_fprobe(&tf->fp); + memset(&tf->fp, 0, sizeof(tf->fp)); + } +} + +/* TODO: make this trace_*probe common function */ +/* Unregister a trace_probe and probe_event */ +static int unregister_trace_fprobe(struct trace_fprobe *tf) +{ + /* If other probes are on the event, just unregister fprobe */ + if (trace_probe_has_sibling(&tf->tp)) + goto unreg; + + /* Enabled event can not be unregistered */ + if (trace_probe_is_enabled(&tf->tp)) + return -EBUSY; + + /* If there's a reference to the dynamic event */ + if (trace_event_dyn_busy(trace_probe_event_call(&tf->tp))) + return -EBUSY; + + /* Will fail if probe is being used by ftrace or perf */ + if (unregister_fprobe_event(tf)) + return -EBUSY; + +unreg: + __unregister_trace_fprobe(tf); + dyn_event_remove(&tf->devent); + trace_probe_unlink(&tf->tp); + + return 0; +} + +static bool trace_fprobe_has_same_fprobe(struct trace_fprobe *orig, + struct trace_fprobe *comp) +{ + struct trace_probe_event *tpe = orig->tp.event; + int i; + + list_for_each_entry(orig, &tpe->probes, tp.list) { + if (strcmp(trace_fprobe_symbol(orig), + trace_fprobe_symbol(comp))) + continue; + + /* + * trace_probe_compare_arg_type() ensured that nr_args and + * each argument name and type are same. Let's compare comm. 
+ */ + for (i = 0; i < orig->tp.nr_args; i++) { + if (strcmp(orig->tp.args[i].comm, + comp->tp.args[i].comm)) + break; + } + + if (i == orig->tp.nr_args) + return true; + } + + return false; +} + +static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) +{ + int ret; + + if (trace_fprobe_is_return(tf) != trace_fprobe_is_return(to)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, DIFF_PROBE_TYPE); + return -EEXIST; + } + ret = trace_probe_compare_arg_type(&tf->tp, &to->tp); + if (ret) { + /* Note that argument starts index = 2 */ + trace_probe_log_set_index(ret + 1); + trace_probe_log_err(0, DIFF_ARG_TYPE); + return -EEXIST; + } + if (trace_fprobe_has_same_fprobe(to, tf)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, SAME_PROBE); + return -EEXIST; + } + + /* Append to existing event */ + ret = trace_probe_append(&tf->tp, &to->tp); + if (ret) + return ret; + + ret = __register_trace_fprobe(tf); + if (ret) + trace_probe_unlink(&tf->tp); + else + dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp)); + + return ret; +} + +/* Register a trace_probe and probe_event */ +static int register_trace_fprobe(struct trace_fprobe *tf) +{ + struct trace_fprobe *old_tf; + int ret; + + mutex_lock(&event_mutex); + + old_tf = find_trace_fprobe(trace_probe_name(&tf->tp), + trace_probe_group_name(&tf->tp)); + if (old_tf) { + ret = append_trace_fprobe(tf, old_tf); + goto end; + } + + /* Register new event */ + ret = register_fprobe_event(tf); + if (ret) { + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } else + pr_warn("Failed to register probe event(%d)\n", ret); + goto end; + } + + /* Register fprobe */ + ret = __register_trace_fprobe(tf); + if (ret < 0) + unregister_fprobe_event(tf); + else + dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp)); + +end: + mutex_unlock(&event_mutex); + return ret; +} + +static int __trace_fprobe_create(int argc, const char *argv[]) +{ + /* + * Argument syntax: + * - Add fentry probe: + * f[:[GRP/][EVENT]] [MOD:]KSYM [FETCHARGS] + * - Add fexit probe: + * f[N][:[GRP/][EVENT]] [MOD:]KSYM%return [FETCHARGS] + * + * Fetch args: + * $retval : fetch return value + * $stack : fetch stack address + * $stackN : fetch Nth entry of stack (N:0-) + * $argN : fetch Nth argument (N:1-) + * $comm : fetch current task comm + * @ADDR : fetch memory at ADDR (ADDR should be in kernel) + * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) + * Dereferencing memory fetch: + * +|-offs(ARG) : fetch memory at ARG +|- offs address. + * Alias name of args: + * NAME=FETCHARG : set NAME as alias of FETCHARG. + * Type of args: + * FETCHARG:TYPE : use TYPE instead of unsigned long. 
+ */ + struct trace_fprobe *tf = NULL; + int i, len, ret = 0; + bool is_return = false; + char *symbol = NULL, *tmp = NULL; + const char *event = NULL, *group = FPROBE_EVENT_SYSTEM; + int maxactive = 0; + char buf[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; + unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE; + + if (argv[0][0] != 'f' || argc < 2) + return -ECANCELED; + + trace_probe_log_init("trace_fprobe", argc, argv); + + event = strchr(&argv[0][1], ':'); + if (event) + event++; + + if (isdigit(argv[0][1])) { + if (event) + len = event - &argv[0][1] - 1; + else + len = strlen(&argv[0][1]); + if (len > MAX_EVENT_NAME_LEN - 1) { + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; + } + memcpy(buf, &argv[0][1], len); + buf[len] = '\0'; + ret = kstrtouint(buf, 0, &maxactive); + if (ret || !maxactive) { + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; + } + /* fprobe rethook instances are iterated over via a list. The + * maximum should stay reasonable. + */ + if (maxactive > RETHOOK_MAXACTIVE_MAX) { + trace_probe_log_err(1, MAXACT_TOO_BIG); + goto parse_error; + } + } + + trace_probe_log_set_index(1); + + /* a symbol specified */ + symbol = kstrdup(argv[1], GFP_KERNEL); + if (!symbol) + return -ENOMEM; + + tmp = strchr(symbol, '%'); + if (tmp) { + if (!strcmp(tmp, "%return")) { + *tmp = '\0'; + is_return = true; + } else { + trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX); + goto parse_error; + } + } + if (!is_return && maxactive) { + trace_probe_log_set_index(0); + trace_probe_log_err(1, BAD_MAXACT_TYPE); + goto parse_error; + } + + if (is_return) + flags |= TPARG_FL_RETURN; + else + flags |= TPARG_FL_FENTRY; + + trace_probe_log_set_index(0); + if (event) { + ret = traceprobe_parse_event_name(&event, &group, gbuf, + event - argv[0]); + if (ret) + goto parse_error; + } + + if (!event) { + /* Make a new event name */ + snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, + is_return ? "exit" : "entry"); + sanitize_event_name(buf); + event = buf; + } + + /* setup a probe */ + tf = alloc_trace_fprobe(group, event, symbol, maxactive, + argc - 2, is_return); + if (IS_ERR(tf)) { + ret = PTR_ERR(tf); + /* This must return -ENOMEM, else there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); + goto out; /* We know tf is not allocated */ + } + argc -= 2; argv += 2; + + /* parse arguments */ + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + trace_probe_log_set_index(i + 2); + ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], flags); + if (ret) + goto error; /* This can be -ENOMEM */ + } + + ret = traceprobe_set_print_fmt(&tf->tp, + is_return ? 
PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL); + if (ret < 0) + goto error; + + ret = register_trace_fprobe(tf); + if (ret) { + trace_probe_log_set_index(1); + if (ret == -EILSEQ) + trace_probe_log_err(0, BAD_INSN_BNDRY); + else if (ret == -ENOENT) + trace_probe_log_err(0, BAD_PROBE_ADDR); + else if (ret != -ENOMEM && ret != -EEXIST) + trace_probe_log_err(0, FAIL_REG_PROBE); + goto error; + } + +out: + trace_probe_log_clear(); + kfree(symbol); + return ret; + +parse_error: + ret = -EINVAL; +error: + free_trace_fprobe(tf); + goto out; +} + +static int trace_fprobe_create(const char *raw_command) +{ + return trace_probe_create(raw_command, __trace_fprobe_create); +} + +static int trace_fprobe_release(struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + int ret = unregister_trace_fprobe(tf); + + if (!ret) + free_trace_fprobe(tf); + return ret; +} + +static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + int i; + + seq_putc(m, 'f'); + if (trace_fprobe_is_return(tf) && tf->fp.nr_maxactive) + seq_printf(m, "%d", tf->fp.nr_maxactive); + seq_printf(m, ":%s/%s", trace_probe_group_name(&tf->tp), + trace_probe_name(&tf->tp)); + + seq_printf(m, " %s%s", trace_fprobe_symbol(tf), + trace_fprobe_is_return(tf) ? "%return" : ""); + + for (i = 0; i < tf->tp.nr_args; i++) + seq_printf(m, " %s=%s", tf->tp.args[i].name, tf->tp.args[i].comm); + seq_putc(m, '\n'); + + return 0; +} + +/* + * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. + */ +static int fprobe_register(struct trace_event_call *event, + enum trace_reg type, void *data) +{ + struct trace_event_file *file = data; + + switch (type) { + case TRACE_REG_REGISTER: + return enable_trace_fprobe(event, file); + case TRACE_REG_UNREGISTER: + return disable_trace_fprobe(event, file); + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return enable_trace_fprobe(event, NULL); + case TRACE_REG_PERF_UNREGISTER: + return disable_trace_fprobe(event, NULL); + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; +#endif + } + return 0; +} + +/* + * Register dynevent at core_initcall. This allows kernel to setup fprobe + * events in postcore_initcall without tracefs. 
+ */ +static __init int init_fprobe_trace_early(void) +{ + int ret; + + ret = dyn_event_register(&trace_fprobe_ops); + if (ret) + return ret; + + return 0; +} +core_initcall(init_fprobe_trace_early); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 867ffb7ee31d..b7a4409674b3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -764,7 +764,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) if (isdigit(argv[0][1])) { if (!is_return) { - trace_probe_log_err(1, MAXACT_NO_KPROBE); + trace_probe_log_err(1, BAD_MAXACT_TYPE); goto parse_error; } if (event) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 2d2616678295..c39860fb2e41 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -393,8 +393,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type, break; case '%': /* named register */ - if (flags & TPARG_FL_TPOINT) { - /* eprobes do not handle registers */ + if (flags & (TPARG_FL_TPOINT | TPARG_FL_FPROBE)) { + /* eprobe and fprobe do not handle registers */ trace_probe_log_err(offs, BAD_VAR); break; } diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5df59714f9f5..8f4f23e8b234 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -367,6 +367,7 @@ int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_a #define TPARG_FL_FENTRY BIT(2) #define TPARG_FL_TPOINT BIT(3) #define TPARG_FL_USER BIT(4) +#define TPARG_FL_FPROBE BIT(5) #define TPARG_FL_MASK GENMASK(4, 0) extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, @@ -409,7 +410,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(REFCNT_OPEN_BRACE, "Reference counter brace is not closed"), \ C(BAD_REFCNT_SUFFIX, "Reference counter has wrong suffix"), \ C(BAD_UPROBE_OFFS, "Invalid uprobe offset"), \ - C(MAXACT_NO_KPROBE, "Maxactive is not for kprobe"), \ + C(BAD_MAXACT_TYPE, "Maxactive is only for function exit"), \ C(BAD_MAXACT, "Invalid maxactive number"), \ C(MAXACT_TOO_BIG, "Maxactive is too big"), \ C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \ diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc index 9e85d3019ff0..97c08867490a 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc @@ -8,7 +8,7 @@ check_error() { # command-with-error-pos-by-^ } if grep -q 'r\[maxactive\]' README; then -check_error 'p^100 vfs_read' # MAXACT_NO_KPROBE +check_error 'p^100 vfs_read' # BAD_MAXACT_TYPE check_error 'r^1a111 vfs_read' # BAD_MAXACT check_error 'r^100000 vfs_read' # MAXACT_TOO_BIG fi -- cgit v1.2.3 From e2d0d7b2f42dcaf924e9c891c91c9aa22cbbebce Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 6 Jun 2023 21:39:55 +0900 Subject: tracing/probes: Add tracepoint support on fprobe_events Allow fprobe_events to trace raw tracepoints so that user can trace tracepoints which don't have traceevent wrappers. This new event is always available if the fprobe_events is enabled (thus no kconfig), because the fprobe_events depends on the trace-event and traceporint. e.g. 
# echo 't sched_overutilized_tp' >> dynamic_events # echo 't 9p_client_req' >> dynamic_events # cat dynamic_events t:tracepoints/sched_overutilized_tp sched_overutilized_tp t:tracepoints/_9p_client_req 9p_client_req The event name is based on the tracepoint name, but if it is started with digit character, an underscore '_' will be added. NOTE: to avoid further confusion, this renames TPARG_FL_TPOINT to TPARG_FL_TEVENT because this flag is used for eprobe (trace-event probe). And reuse TPARG_FL_TPOINT for this raw tracepoint probe. Link: https://lore.kernel.org/all/168507471874.913472.17214624519622959593.stgit@mhiramat.roam.corp.google.com/ Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202305020453.afTJ3VVp-lkp@intel.com/ Signed-off-by: Masami Hiramatsu (Google) --- include/linux/tracepoint-defs.h | 1 + include/linux/tracepoint.h | 5 ++ kernel/trace/trace.c | 1 + kernel/trace/trace_eprobe.c | 2 +- kernel/trace/trace_fprobe.c | 134 +++++++++++++++++++++++++++++++++++++--- kernel/trace/trace_probe.c | 15 +++-- kernel/trace/trace_probe.h | 15 ++++- 7 files changed, 157 insertions(+), 16 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index e7c2276be33e..4dc4955f0fbf 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -35,6 +35,7 @@ struct tracepoint { struct static_call_key *static_call_key; void *static_call_tramp; void *iterator; + void *probestub; int (*regfunc)(void); void (*unregfunc)(void); struct tracepoint_func __rcu *funcs; diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 6811e43c1b5c..88c0ba623ee6 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -303,6 +303,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) __section("__tracepoints_strings") = #_name; \ extern struct static_call_key STATIC_CALL_KEY(tp_func_##_name); \ int __traceiter_##_name(void *__data, proto); \ + void __probestub_##_name(void *__data, proto); \ struct tracepoint __tracepoint_##_name __used \ __section("__tracepoints") = { \ .name = __tpstrtab_##_name, \ @@ -310,6 +311,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) .static_call_key = &STATIC_CALL_KEY(tp_func_##_name), \ .static_call_tramp = STATIC_CALL_TRAMP_ADDR(tp_func_##_name), \ .iterator = &__traceiter_##_name, \ + .probestub = &__probestub_##_name, \ .regfunc = _reg, \ .unregfunc = _unreg, \ .funcs = NULL }; \ @@ -330,6 +332,9 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) } \ return 0; \ } \ + void __probestub_##_name(void *__data, proto) \ + { \ + } \ DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name); #define DEFINE_TRACE(name, proto, args) \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 755b0bf2e1ac..fa4e1a18da70 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5681,6 +5681,7 @@ static const char readme_msg[] = #endif #ifdef CONFIG_FPROBE_EVENTS "\t f[:[/][]] [%return] []\n" + "\t t[:[/][]] []\n" #endif #ifdef CONFIG_HIST_TRIGGERS "\t s:[synthetic/] []\n" diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 67e854979d53..fd64cd5d5745 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -817,7 +817,7 @@ find_and_get_event(const char *system, const char *event_name) static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i) { - unsigned int flags = 
TPARG_FL_KERNEL | TPARG_FL_TPOINT; + unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT; int ret; ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], flags); diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 48dbbc72b7dd..aa71ccb4205c 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "trace_dynevent.h" @@ -17,6 +18,7 @@ #include "trace_probe_tmpl.h" #define FPROBE_EVENT_SYSTEM "fprobes" +#define TRACEPOINT_EVENT_SYSTEM "tracepoints" #define RETHOOK_MAXACTIVE_MAX 4096 static int trace_fprobe_create(const char *raw_command); @@ -41,6 +43,8 @@ struct trace_fprobe { struct dyn_event devent; struct fprobe fp; const char *symbol; + struct tracepoint *tpoint; + struct module *mod; struct trace_probe tp; }; @@ -68,6 +72,11 @@ static bool trace_fprobe_is_return(struct trace_fprobe *tf) return tf->fp.exit_handler != NULL; } +static bool trace_fprobe_is_tracepoint(struct trace_fprobe *tf) +{ + return tf->tpoint != NULL; +} + static const char *trace_fprobe_symbol(struct trace_fprobe *tf) { return tf->symbol ? tf->symbol : "unknown"; @@ -668,6 +677,21 @@ static int __register_trace_fprobe(struct trace_fprobe *tf) else tf->fp.flags |= FPROBE_FL_DISABLED; + if (trace_fprobe_is_tracepoint(tf)) { + struct tracepoint *tpoint = tf->tpoint; + unsigned long ip = (unsigned long)tpoint->probestub; + /* + * Here, we do 2 steps to enable fprobe on a tracepoint. + * At first, put __probestub_##TP function on the tracepoint + * and put a fprobe on the stub function. + */ + ret = tracepoint_probe_register_prio_may_exist(tpoint, + tpoint->probestub, NULL, 0); + if (ret < 0) + return ret; + return register_fprobe_ips(&tf->fp, &ip, 1); + } + /* TODO: handle filter, nofilter or symbol list */ return register_fprobe(&tf->fp, tf->symbol, NULL); } @@ -678,6 +702,12 @@ static void __unregister_trace_fprobe(struct trace_fprobe *tf) if (trace_fprobe_is_registered(tf)) { unregister_fprobe(&tf->fp); memset(&tf->fp, 0, sizeof(tf->fp)); + if (trace_fprobe_is_tracepoint(tf)) { + tracepoint_probe_unregister(tf->tpoint, + tf->tpoint->probestub, NULL); + tf->tpoint = NULL; + tf->mod = NULL; + } } } @@ -741,7 +771,8 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) { int ret; - if (trace_fprobe_is_return(tf) != trace_fprobe_is_return(to)) { + if (trace_fprobe_is_return(tf) != trace_fprobe_is_return(to) || + trace_fprobe_is_tracepoint(tf) != trace_fprobe_is_tracepoint(to)) { trace_probe_log_set_index(0); trace_probe_log_err(0, DIFF_PROBE_TYPE); return -EEXIST; @@ -811,6 +842,60 @@ end: return ret; } +#ifdef CONFIG_MODULES +static int __tracepoint_probe_module_cb(struct notifier_block *self, + unsigned long val, void *data) +{ + struct tp_module *tp_mod = data; + struct trace_fprobe *tf; + struct dyn_event *pos; + + if (val != MODULE_STATE_GOING) + return NOTIFY_DONE; + + mutex_lock(&event_mutex); + for_each_trace_fprobe(tf, pos) { + if (tp_mod->mod == tf->mod) { + tracepoint_probe_unregister(tf->tpoint, + tf->tpoint->probestub, NULL); + tf->tpoint = NULL; + tf->mod = NULL; + } + } + mutex_unlock(&event_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block tracepoint_module_nb = { + .notifier_call = __tracepoint_probe_module_cb, +}; +#endif /* CONFIG_MODULES */ + +struct __find_tracepoint_cb_data { + const char *tp_name; + struct tracepoint *tpoint; +}; + +static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) +{ + struct __find_tracepoint_cb_data 
*data = priv; + + if (!data->tpoint && !strcmp(data->tp_name, tp->name)) + data->tpoint = tp; +} + +static struct tracepoint *find_tracepoint(const char *tp_name) +{ + struct __find_tracepoint_cb_data data = { + .tp_name = tp_name, + }; + + for_each_kernel_tracepoint(__find_tracepoint_cb, &data); + + return data.tpoint; +} + static int __trace_fprobe_create(int argc, const char *argv[]) { /* @@ -819,6 +904,8 @@ static int __trace_fprobe_create(int argc, const char *argv[]) * f[:[GRP/][EVENT]] [MOD:]KSYM [FETCHARGS] * - Add fexit probe: * f[N][:[GRP/][EVENT]] [MOD:]KSYM%return [FETCHARGS] + * - Add tracepoint probe: + * t[:[GRP/][EVENT]] TRACEPOINT [FETCHARGS] * * Fetch args: * $retval : fetch return value @@ -844,10 +931,16 @@ static int __trace_fprobe_create(int argc, const char *argv[]) char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE; + bool is_tracepoint = false; - if (argv[0][0] != 'f' || argc < 2) + if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2) return -ECANCELED; + if (argv[0][0] == 't') { + is_tracepoint = true; + group = TRACEPOINT_EVENT_SYSTEM; + } + trace_probe_log_init("trace_fprobe", argc, argv); event = strchr(&argv[0][1], ':'); @@ -881,14 +974,14 @@ static int __trace_fprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(1); - /* a symbol specified */ + /* a symbol(or tracepoint) must be specified */ symbol = kstrdup(argv[1], GFP_KERNEL); if (!symbol) return -ENOMEM; tmp = strchr(symbol, '%'); if (tmp) { - if (!strcmp(tmp, "%return")) { + if (!is_tracepoint && !strcmp(tmp, "%return")) { *tmp = '\0'; is_return = true; } else { @@ -907,6 +1000,9 @@ static int __trace_fprobe_create(int argc, const char *argv[]) else flags |= TPARG_FL_FENTRY; + if (is_tracepoint) + flags |= TPARG_FL_TPOINT; + trace_probe_log_set_index(0); if (event) { ret = traceprobe_parse_event_name(&event, &group, gbuf, @@ -917,8 +1013,11 @@ static int __trace_fprobe_create(int argc, const char *argv[]) if (!event) { /* Make a new event name */ - snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, - is_return ? "exit" : "entry"); + if (is_tracepoint) + strscpy(buf, symbol, MAX_EVENT_NAME_LEN); + else + snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, + is_return ? 
"exit" : "entry"); sanitize_event_name(buf); event = buf; } @@ -932,6 +1031,18 @@ static int __trace_fprobe_create(int argc, const char *argv[]) WARN_ON_ONCE(ret != -ENOMEM); goto out; /* We know tf is not allocated */ } + + if (is_tracepoint) { + tf->tpoint = find_tracepoint(tf->symbol); + if (!tf->tpoint) { + trace_probe_log_set_index(1); + trace_probe_log_err(0, NO_TRACEPOINT); + goto parse_error; + } + tf->mod = __module_text_address( + (unsigned long)tf->tpoint->probestub); + } + argc -= 2; argv += 2; /* parse arguments */ @@ -991,7 +1102,10 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev) struct trace_fprobe *tf = to_trace_fprobe(ev); int i; - seq_putc(m, 'f'); + if (trace_fprobe_is_tracepoint(tf)) + seq_putc(m, 't'); + else + seq_putc(m, 'f'); if (trace_fprobe_is_return(tf) && tf->fp.nr_maxactive) seq_printf(m, "%d", tf->fp.nr_maxactive); seq_printf(m, ":%s/%s", trace_probe_group_name(&tf->tp), @@ -1048,6 +1162,12 @@ static __init int init_fprobe_trace_early(void) if (ret) return ret; +#ifdef CONFIG_MODULES + ret = register_tracepoint_module_notifier(&tracepoint_module_nb); + if (ret) + return ret; +#endif + return 0; } core_initcall(init_fprobe_trace_early); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index c39860fb2e41..798f18d24ebc 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -292,7 +292,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, int ret = 0; int len; - if (flags & TPARG_FL_TPOINT) { + if (flags & TPARG_FL_TEVENT) { if (code->data) return -EFAULT; code->data = kstrdup(arg, GFP_KERNEL); @@ -326,8 +326,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } else if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { code->op = FETCH_OP_COMM; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API - } else if (((flags & TPARG_FL_MASK) == - (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && + } else if (tparg_is_function_entry(flags) && (len = str_has_prefix(arg, "arg"))) { ret = kstrtoul(arg + len, 10, ¶m); if (ret) { @@ -338,6 +337,12 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; + /* + * The tracepoint probe will probe a stub function, and the + * first parameter of the stub is a dummy and should be ignored. + */ + if (flags & TPARG_FL_TPOINT) + code->param++; #endif } else goto inval_var; @@ -393,7 +398,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, break; case '%': /* named register */ - if (flags & (TPARG_FL_TPOINT | TPARG_FL_FPROBE)) { + if (flags & (TPARG_FL_TEVENT | TPARG_FL_FPROBE)) { /* eprobe and fprobe do not handle registers */ trace_probe_log_err(offs, BAD_VAR); break; @@ -633,7 +638,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, * Since $comm and immediate string can not be dereferenced, * we can find those by strcmp. But ignore for eprobes. */ - if (!(flags & TPARG_FL_TPOINT) && + if (!(flags & TPARG_FL_TEVENT) && (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 || strncmp(arg, "\\\"", 2) == 0)) { /* The type of $comm must be "string", and not an array. */ diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 8f4f23e8b234..e6b94fcdb886 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -359,16 +359,24 @@ int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_a /* * The flags used for parsing trace_probe arguments. 
- * TPARG_FL_RETURN, TPARG_FL_FENTRY and TPARG_FL_TPOINT are mutually exclusive. + * TPARG_FL_RETURN, TPARG_FL_FENTRY and TPARG_FL_TEVENT are mutually exclusive. * TPARG_FL_KERNEL and TPARG_FL_USER are also mutually exclusive. + * TPARG_FL_FPROBE and TPARG_FL_TPOINT are optional but it should be with + * TPARG_FL_KERNEL. */ #define TPARG_FL_RETURN BIT(0) #define TPARG_FL_KERNEL BIT(1) #define TPARG_FL_FENTRY BIT(2) -#define TPARG_FL_TPOINT BIT(3) +#define TPARG_FL_TEVENT BIT(3) #define TPARG_FL_USER BIT(4) #define TPARG_FL_FPROBE BIT(5) -#define TPARG_FL_MASK GENMASK(4, 0) +#define TPARG_FL_TPOINT BIT(6) +#define TPARG_FL_LOC_MASK GENMASK(4, 0) + +static inline bool tparg_is_function_entry(unsigned int flags) +{ + return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY); +} extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *argv, unsigned int flags); @@ -415,6 +423,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(MAXACT_TOO_BIG, "Maxactive is too big"), \ C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \ C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ + C(NO_TRACEPOINT, "Tracepoint is not found"), \ C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \ C(NO_GROUP_NAME, "Group name is not specified"), \ C(GROUP_TOO_LONG, "Group name is too long"), \ -- cgit v1.2.3 From 1b8b0cd754cdbb54058165992456368495a695ac Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 6 Jun 2023 21:39:56 +0900 Subject: tracing/probes: Move event parameter fetching code to common parser Move trace event parameter fetching code to common parser in trace_probe.c. This simplifies eprobe's trace-event variable fetching code by introducing a parse context data structure. Link: https://lore.kernel.org/all/168507472950.913472.2812253181558471278.stgit@mhiramat.roam.corp.google.com/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_eprobe.c | 44 +-------- kernel/trace/trace_fprobe.c | 4 +- kernel/trace/trace_kprobe.c | 4 +- kernel/trace/trace_probe.c | 218 ++++++++++++++++++++++++++------------------ kernel/trace/trace_probe.h | 9 +- kernel/trace/trace_uprobe.c | 8 +- 6 files changed, 155 insertions(+), 132 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index fd64cd5d5745..cb0077ba2b49 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -227,37 +227,6 @@ error: return ERR_PTR(ret); } -static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) -{ - struct probe_arg *parg = &ep->tp.args[i]; - struct ftrace_event_field *field; - struct list_head *head; - int ret = -ENOENT; - - head = trace_get_fields(ep->event); - list_for_each_entry(field, head, link) { - if (!strcmp(parg->code->data, field->name)) { - kfree(parg->code->data); - parg->code->data = field; - return 0; - } - } - - /* - * Argument not found on event. But allow for comm and COMM - * to be used to get the current->comm. 
- */ - if (strcmp(parg->code->data, "COMM") == 0 || - strcmp(parg->code->data, "comm") == 0) { - parg->code->op = FETCH_OP_COMM; - ret = 0; - } - - kfree(parg->code->data); - parg->code->data = NULL; - return ret; -} - static int eprobe_event_define_fields(struct trace_event_call *event_call) { struct eprobe_trace_entry_head field; @@ -817,19 +786,16 @@ find_and_get_event(const char *system, const char *event_name) static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i) { - unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT; + struct traceprobe_parse_context ctx = { + .event = ep->event, + .flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT, + }; int ret; - ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], flags); + ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx); if (ret) return ret; - if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) { - ret = trace_eprobe_tp_arg_update(ep, i); - if (ret) - trace_probe_log_err(0, BAD_ATTACH_ARG); - } - /* Handle symbols "@" */ if (!ret) ret = traceprobe_update_arg(&ep->tp.args[i]); diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index aa71ccb4205c..7d144e4a3fb6 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -1047,8 +1047,10 @@ static int __trace_fprobe_create(int argc, const char *argv[]) /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + struct traceprobe_parse_context ctx = { .flags = flags }; + trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], flags); + ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx); if (ret) goto error; /* This can be -ENOMEM */ } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b7a4409674b3..1a3497719ada 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -867,8 +867,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + struct traceprobe_parse_context ctx = { .flags = flags }; + trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], flags); + ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx); if (ret) goto error; /* This can be -ENOMEM */ } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 798f18d24ebc..9ebefacb6372 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -283,74 +283,114 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, return 0; } +static int parse_trace_event_arg(char *arg, struct fetch_insn *code, + struct traceprobe_parse_context *ctx) +{ + struct ftrace_event_field *field; + struct list_head *head; + + head = trace_get_fields(ctx->event); + list_for_each_entry(field, head, link) { + if (!strcmp(arg, field->name)) { + code->op = FETCH_OP_TP_ARG; + code->data = field; + return 0; + } + } + return -ENOENT; +} + #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_insn *code, unsigned int flags, int offs) + struct fetch_insn *code, + struct traceprobe_parse_context *ctx) { unsigned long param; + int err = TP_ERR_BAD_VAR; int ret = 0; int len; - if (flags & TPARG_FL_TEVENT) { + if (ctx->flags & TPARG_FL_TEVENT) { if (code->data) return -EFAULT; - code->data = kstrdup(arg, GFP_KERNEL); - if (!code->data) - return -ENOMEM; - code->op = FETCH_OP_TP_ARG; - } else if (strcmp(arg, "retval") == 0) { - if 
(flags & TPARG_FL_RETURN) { + ret = parse_trace_event_arg(arg, code, ctx); + if (!ret) + return 0; + if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { + code->op = FETCH_OP_COMM; + return 0; + } + /* backward compatibility */ + ctx->offset = 0; + goto inval; + } + + if (strcmp(arg, "retval") == 0) { + if (ctx->flags & TPARG_FL_RETURN) { code->op = FETCH_OP_RETVAL; - } else { - trace_probe_log_err(offs, RETVAL_ON_PROBE); - ret = -EINVAL; + return 0; } - } else if ((len = str_has_prefix(arg, "stack"))) { + err = TP_ERR_RETVAL_ON_PROBE; + goto inval; + } + + len = str_has_prefix(arg, "stack"); + if (len) { + if (arg[len] == '\0') { code->op = FETCH_OP_STACKP; - } else if (isdigit(arg[len])) { + return 0; + } + + if (isdigit(arg[len])) { ret = kstrtoul(arg + len, 10, ¶m); - if (ret) { - goto inval_var; - } else if ((flags & TPARG_FL_KERNEL) && - param > PARAM_MAX_STACK) { - trace_probe_log_err(offs, BAD_STACK_NUM); - ret = -EINVAL; - } else { - code->op = FETCH_OP_STACK; - code->param = (unsigned int)param; + if (ret) + goto inval; + + if ((ctx->flags & TPARG_FL_KERNEL) && + param > PARAM_MAX_STACK) { + err = TP_ERR_BAD_STACK_NUM; + goto inval; } - } else - goto inval_var; - } else if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { + code->op = FETCH_OP_STACK; + code->param = (unsigned int)param; + return 0; + } + goto inval; + } + + if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { code->op = FETCH_OP_COMM; + return 0; + } + #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API - } else if (tparg_is_function_entry(flags) && - (len = str_has_prefix(arg, "arg"))) { + len = str_has_prefix(arg, "arg"); + if (len && tparg_is_function_entry(ctx->flags)) { ret = kstrtoul(arg + len, 10, ¶m); - if (ret) { - goto inval_var; - } else if (!param || param > PARAM_MAX_STACK) { - trace_probe_log_err(offs, BAD_ARG_NUM); - return -EINVAL; + if (ret) + goto inval; + + if (!param || param > PARAM_MAX_STACK) { + err = TP_ERR_BAD_ARG_NUM; + goto inval; } + code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; /* * The tracepoint probe will probe a stub function, and the * first parameter of the stub is a dummy and should be ignored. 
*/ - if (flags & TPARG_FL_TPOINT) + if (ctx->flags & TPARG_FL_TPOINT) code->param++; + return 0; + } #endif - } else - goto inval_var; - return ret; - -inval_var: - trace_probe_log_err(offs, BAD_VAR); +inval: + __trace_probe_log_err(ctx->offset, err); return -EINVAL; } @@ -383,7 +423,7 @@ static int __parse_imm_string(char *str, char **pbuf, int offs) static int parse_probe_arg(char *arg, const struct fetch_type *type, struct fetch_insn **pcode, struct fetch_insn *end, - unsigned int flags, int offs) + struct traceprobe_parse_context *ctx) { struct fetch_insn *code = *pcode; unsigned long param; @@ -394,13 +434,13 @@ parse_probe_arg(char *arg, const struct fetch_type *type, switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, type, code, flags, offs); + ret = parse_probe_vars(arg + 1, type, code, ctx); break; case '%': /* named register */ - if (flags & (TPARG_FL_TEVENT | TPARG_FL_FPROBE)) { + if (ctx->flags & (TPARG_FL_TEVENT | TPARG_FL_FPROBE)) { /* eprobe and fprobe do not handle registers */ - trace_probe_log_err(offs, BAD_VAR); + trace_probe_log_err(ctx->offset, BAD_VAR); break; } ret = regs_query_register_offset(arg + 1); @@ -409,14 +449,14 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->param = (unsigned int)ret; ret = 0; } else - trace_probe_log_err(offs, BAD_REG_NAME); + trace_probe_log_err(ctx->offset, BAD_REG_NAME); break; case '@': /* memory, file-offset or symbol */ if (isdigit(arg[1])) { ret = kstrtoul(arg + 1, 0, ¶m); if (ret) { - trace_probe_log_err(offs, BAD_MEM_ADDR); + trace_probe_log_err(ctx->offset, BAD_MEM_ADDR); break; } /* load address */ @@ -424,13 +464,13 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->immediate = param; } else if (arg[1] == '+') { /* kprobes don't support file offsets */ - if (flags & TPARG_FL_KERNEL) { - trace_probe_log_err(offs, FILE_ON_KPROBE); + if (ctx->flags & TPARG_FL_KERNEL) { + trace_probe_log_err(ctx->offset, FILE_ON_KPROBE); return -EINVAL; } ret = kstrtol(arg + 2, 0, &offset); if (ret) { - trace_probe_log_err(offs, BAD_FILE_OFFS); + trace_probe_log_err(ctx->offset, BAD_FILE_OFFS); break; } @@ -438,8 +478,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->immediate = (unsigned long)offset; // imm64? } else { /* uprobes don't support symbols */ - if (!(flags & TPARG_FL_KERNEL)) { - trace_probe_log_err(offs, SYM_ON_UPROBE); + if (!(ctx->flags & TPARG_FL_KERNEL)) { + trace_probe_log_err(ctx->offset, SYM_ON_UPROBE); return -EINVAL; } /* Preserve symbol for updating */ @@ -448,7 +488,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, if (!code->data) return -ENOMEM; if (++code == end) { - trace_probe_log_err(offs, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); return -EINVAL; } code->op = FETCH_OP_IMM; @@ -456,7 +496,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } /* These are fetching from memory */ if (++code == end) { - trace_probe_log_err(offs, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); return -EINVAL; } *pcode = code; @@ -475,36 +515,38 @@ parse_probe_arg(char *arg, const struct fetch_type *type, arg++; /* Skip '+', because kstrtol() rejects it. */ tmp = strchr(arg, '('); if (!tmp) { - trace_probe_log_err(offs, DEREF_NEED_BRACE); + trace_probe_log_err(ctx->offset, DEREF_NEED_BRACE); return -EINVAL; } *tmp = '\0'; ret = kstrtol(arg, 0, &offset); if (ret) { - trace_probe_log_err(offs, BAD_DEREF_OFFS); + trace_probe_log_err(ctx->offset, BAD_DEREF_OFFS); break; } - offs += (tmp + 1 - arg) + (arg[0] != '-' ? 
1 : 0); + ctx->offset += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0); arg = tmp + 1; tmp = strrchr(arg, ')'); if (!tmp) { - trace_probe_log_err(offs + strlen(arg), + trace_probe_log_err(ctx->offset + strlen(arg), DEREF_OPEN_BRACE); return -EINVAL; } else { - const struct fetch_type *t2 = find_fetch_type(NULL, flags); + const struct fetch_type *t2 = find_fetch_type(NULL, ctx->flags); + int cur_offs = ctx->offset; *tmp = '\0'; - ret = parse_probe_arg(arg, t2, &code, end, flags, offs); + ret = parse_probe_arg(arg, t2, &code, end, ctx); if (ret) break; + ctx->offset = cur_offs; if (code->op == FETCH_OP_COMM || code->op == FETCH_OP_DATA) { - trace_probe_log_err(offs, COMM_CANT_DEREF); + trace_probe_log_err(ctx->offset, COMM_CANT_DEREF); return -EINVAL; } if (++code == end) { - trace_probe_log_err(offs, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); return -EINVAL; } *pcode = code; @@ -515,7 +557,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, break; case '\\': /* Immediate value */ if (arg[1] == '"') { /* Immediate string */ - ret = __parse_imm_string(arg + 2, &tmp, offs + 2); + ret = __parse_imm_string(arg + 2, &tmp, ctx->offset + 2); if (ret) break; code->op = FETCH_OP_DATA; @@ -523,7 +565,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } else { ret = str_to_immediate(arg + 1, &code->immediate); if (ret) - trace_probe_log_err(offs + 1, BAD_IMM); + trace_probe_log_err(ctx->offset + 1, BAD_IMM); else code->op = FETCH_OP_IMM; } @@ -531,7 +573,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } if (!ret && code->op == FETCH_OP_NOP) { /* Parsed, but do not find fetch method */ - trace_probe_log_err(offs, BAD_FETCH_ARG); + trace_probe_log_err(ctx->offset, BAD_FETCH_ARG); ret = -EINVAL; } return ret; @@ -576,12 +618,13 @@ static int __parse_bitfield_probe_arg(const char *bf, /* String length checking wrapper */ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, - struct probe_arg *parg, unsigned int flags, int offset) + struct probe_arg *parg, + struct traceprobe_parse_context *ctx) { struct fetch_insn *code, *scode, *tmp = NULL; char *t, *t2, *t3; - char *arg; int ret, len; + char *arg; arg = kstrdup(argv, GFP_KERNEL); if (!arg) @@ -590,10 +633,10 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, ret = -EINVAL; len = strlen(arg); if (len > MAX_ARGSTR_LEN) { - trace_probe_log_err(offset, ARG_TOO_LONG); + trace_probe_log_err(ctx->offset, ARG_TOO_LONG); goto out; } else if (len == 0) { - trace_probe_log_err(offset, NO_ARG_BODY); + trace_probe_log_err(ctx->offset, NO_ARG_BODY); goto out; } @@ -611,23 +654,24 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, *t2++ = '\0'; t3 = strchr(t2, ']'); if (!t3) { - offset += t2 + strlen(t2) - arg; - trace_probe_log_err(offset, + int offs = t2 + strlen(t2) - arg; + + trace_probe_log_err(ctx->offset + offs, ARRAY_NO_CLOSE); goto out; } else if (t3[1] != '\0') { - trace_probe_log_err(offset + t3 + 1 - arg, + trace_probe_log_err(ctx->offset + t3 + 1 - arg, BAD_ARRAY_SUFFIX); goto out; } *t3 = '\0'; if (kstrtouint(t2, 0, &parg->count) || !parg->count) { - trace_probe_log_err(offset + t2 - arg, + trace_probe_log_err(ctx->offset + t2 - arg, BAD_ARRAY_NUM); goto out; } if (parg->count > MAX_ARRAY_LEN) { - trace_probe_log_err(offset + t2 - arg, + trace_probe_log_err(ctx->offset + t2 - arg, ARRAY_TOO_BIG); goto out; } @@ -638,17 +682,17 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, * Since $comm and 
immediate string can not be dereferenced, * we can find those by strcmp. But ignore for eprobes. */ - if (!(flags & TPARG_FL_TEVENT) && + if (!(ctx->flags & TPARG_FL_TEVENT) && (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 || strncmp(arg, "\\\"", 2) == 0)) { /* The type of $comm must be "string", and not an array. */ if (parg->count || (t && strcmp(t, "string"))) goto out; - parg->type = find_fetch_type("string", flags); + parg->type = find_fetch_type("string", ctx->flags); } else - parg->type = find_fetch_type(t, flags); + parg->type = find_fetch_type(t, ctx->flags); if (!parg->type) { - trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE); + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE); goto out; } parg->offset = *size; @@ -670,7 +714,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], - flags, offset); + ctx); if (ret) goto fail; @@ -681,7 +725,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (code->op != FETCH_OP_REG && code->op != FETCH_OP_STACK && code->op != FETCH_OP_RETVAL && code->op != FETCH_OP_ARG && code->op != FETCH_OP_DEREF && code->op != FETCH_OP_TP_ARG) { - trace_probe_log_err(offset + (t ? (t - arg) : 0), + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_SYMSTRING); goto fail; } @@ -689,7 +733,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) { - trace_probe_log_err(offset + (t ? (t - arg) : 0), + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_STRING); goto fail; } @@ -708,7 +752,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, */ code++; if (code->op != FETCH_OP_NOP) { - trace_probe_log_err(offset, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); goto fail; } } @@ -731,7 +775,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, } else { code++; if (code->op != FETCH_OP_NOP) { - trace_probe_log_err(offset, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); goto fail; } code->op = FETCH_OP_ST_RAW; @@ -742,7 +786,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (t != NULL) { ret = __parse_bitfield_probe_arg(t, parg->type, &code); if (ret) { - trace_probe_log_err(offset + t - arg, BAD_BITFIELD); + trace_probe_log_err(ctx->offset + t - arg, BAD_BITFIELD); goto fail; } } @@ -752,13 +796,13 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (scode->op != FETCH_OP_ST_MEM && scode->op != FETCH_OP_ST_STRING && scode->op != FETCH_OP_ST_USTRING) { - trace_probe_log_err(offset + (t ? (t - arg) : 0), + trace_probe_log_err(ctx->offset + (t ? 
(t - arg) : 0), BAD_STRING); goto fail; } code++; if (code->op != FETCH_OP_NOP) { - trace_probe_log_err(offset, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); goto fail; } code->op = FETCH_OP_LP_ARRAY; @@ -807,7 +851,7 @@ static int traceprobe_conflict_field_name(const char *name, } int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, - unsigned int flags) + struct traceprobe_parse_context *ctx) { struct probe_arg *parg = &tp->args[i]; const char *body; @@ -842,9 +886,9 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, trace_probe_log_err(0, USED_ARG_NAME); return -EINVAL; } + ctx->offset = body - arg; /* Parse fetch argument */ - return traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags, - body - arg); + return traceprobe_parse_probe_arg_body(body, &tp->size, parg, ctx); } void traceprobe_free_probe_arg(struct probe_arg *arg) diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index e6b94fcdb886..f622340ae171 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -378,8 +378,15 @@ static inline bool tparg_is_function_entry(unsigned int flags) return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY); } +struct traceprobe_parse_context { + struct trace_event_call *event; + unsigned int flags; + int offset; +}; + extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, - const char *argv, unsigned int flags); + const char *argv, + struct traceprobe_parse_context *ctx); extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8b92e34ff0c8..fa09b33ee731 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -686,10 +686,12 @@ static int __trace_uprobe_create(int argc, const char **argv) /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + struct traceprobe_parse_context ctx = { + .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER, + }; + trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], - (is_return ? TPARG_FL_RETURN : 0) | - TPARG_FL_USER); + ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx); if (ret) goto error; } -- cgit v1.2.3 From b576e09701c7d045bbe5cd85d53e2f34426aa214 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 6 Jun 2023 21:39:56 +0900 Subject: tracing/probes: Support function parameters if BTF is available Support function or tracepoint parameters by name if BTF support is enabled and the event is for function entry (this feature can be used with kprobe- events, fprobe-events and tracepoint probe events.) Note that the BTF variable syntax does not require a prefix. If it starts with an alphabetic character or an underscore ('_') without a prefix like '$' and '%', it is considered as a BTF variable. If you specify only the BTF variable name, the argument name will also be the same name instead of 'arg*'. 
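When no ':type' suffix is given, the storing type is likewise taken from BTF (see the parse_btf_arg_type() handling added below): a signed int parameter is stored as s32, a pointer as x64 on a 64-bit kernel, and so on. As an inferred illustration only (not an example from this patch), the size_t 'count' parameter of vfs_read should resolve to u64 on a 64-bit kernel, so the following two definitions are expected to behave the same:

 # echo 'p vfs_read count' >> dynamic_events
 # echo 'p vfs_read count=count:u64' >> dynamic_events

The examples below are the ones from this patch.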
# echo 'p vfs_read count pos' >> dynamic_events # echo 'f vfs_write count pos' >> dynamic_events # echo 't sched_overutilized_tp rd overutilized' >> dynamic_events # cat dynamic_events p:kprobes/p_vfs_read_0 vfs_read count=count pos=pos f:fprobes/vfs_write__entry vfs_write count=count pos=pos t:tracepoints/sched_overutilized_tp sched_overutilized_tp rd=rd overutilized=overutilized Link: https://lore.kernel.org/all/168507474014.913472.16963996883278039183.stgit@mhiramat.roam.corp.google.com/ Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Alan Maguire Tested-by: Alan Maguire --- kernel/trace/Kconfig | 12 +++ kernel/trace/trace.c | 4 + kernel/trace/trace_fprobe.c | 53 ++++++----- kernel/trace/trace_kprobe.c | 12 +-- kernel/trace/trace_probe.c | 211 +++++++++++++++++++++++++++++++++++++++++++- kernel/trace/trace_probe.h | 9 +- 6 files changed, 270 insertions(+), 31 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8e10a9453c96..b3f90d602896 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -664,6 +664,18 @@ config FPROBE_EVENTS and the kprobe events on function entry and exit will be transparently converted to this fprobe events. +config PROBE_EVENTS_BTF_ARGS + depends on HAVE_FUNCTION_ARG_ACCESS_API + depends on FPROBE_EVENTS || KPROBE_EVENTS + depends on DEBUG_INFO_BTF && BPF_SYSCALL + bool "Support BTF function arguments for probe events" + default y + help + The user can specify the arguments of the probe event using the names + of the arguments of the probed function, when the probe location is a + kernel function entry or a tracepoint. + This is available only if BTF (BPF Type Format) support is enabled. + config KPROBE_EVENTS depends on KPROBES depends on HAVE_REGS_AND_STACK_ACCESS_API diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fa4e1a18da70..a70b22235eaf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5698,7 +5698,11 @@ static const char readme_msg[] = "\t args: =fetcharg[:type]\n" "\t fetcharg: (%|$), @
, @[+|-],\n" #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API +#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS + "\t $stack, $stack, $retval, $comm, $arg, \n" +#else "\t $stack, $stack, $retval, $comm, $arg,\n" +#endif #else "\t $stack, $stack, $retval, $comm,\n" #endif diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 7d144e4a3fb6..2dd884609321 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -366,6 +366,7 @@ static void free_trace_fprobe(struct trace_fprobe *tf) static struct trace_fprobe *alloc_trace_fprobe(const char *group, const char *event, const char *symbol, + struct tracepoint *tpoint, int maxactive, int nargs, bool is_return) { @@ -385,6 +386,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, else tf->fp.entry_handler = fentry_dispatcher; + tf->tpoint = tpoint; tf->fp.nr_maxactive = maxactive; ret = trace_probe_init(&tf->tp, event, group, false); @@ -930,8 +932,12 @@ static int __trace_fprobe_create(int argc, const char *argv[]) int maxactive = 0; char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; - unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE; + char sbuf[KSYM_NAME_LEN]; bool is_tracepoint = false; + struct tracepoint *tpoint = NULL; + struct traceprobe_parse_context ctx = { + .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, + }; if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2) return -ECANCELED; @@ -995,14 +1001,6 @@ static int __trace_fprobe_create(int argc, const char *argv[]) goto parse_error; } - if (is_return) - flags |= TPARG_FL_RETURN; - else - flags |= TPARG_FL_FENTRY; - - if (is_tracepoint) - flags |= TPARG_FL_TPOINT; - trace_probe_log_set_index(0); if (event) { ret = traceprobe_parse_event_name(&event, &group, gbuf, @@ -1014,7 +1012,8 @@ static int __trace_fprobe_create(int argc, const char *argv[]) if (!event) { /* Make a new event name */ if (is_tracepoint) - strscpy(buf, symbol, MAX_EVENT_NAME_LEN); + snprintf(buf, MAX_EVENT_NAME_LEN, "%s%s", + isdigit(*symbol) ? "_" : "", symbol); else snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, is_return ? 
"exit" : "entry"); @@ -1022,8 +1021,27 @@ static int __trace_fprobe_create(int argc, const char *argv[]) event = buf; } + if (is_return) + ctx.flags |= TPARG_FL_RETURN; + else + ctx.flags |= TPARG_FL_FENTRY; + + if (is_tracepoint) { + ctx.flags |= TPARG_FL_TPOINT; + tpoint = find_tracepoint(symbol); + if (!tpoint) { + trace_probe_log_set_index(1); + trace_probe_log_err(0, NO_TRACEPOINT); + goto parse_error; + } + ctx.funcname = kallsyms_lookup( + (unsigned long)tpoint->probestub, + NULL, NULL, NULL, sbuf); + } else + ctx.funcname = symbol; + /* setup a probe */ - tf = alloc_trace_fprobe(group, event, symbol, maxactive, + tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive, argc - 2, is_return); if (IS_ERR(tf)) { ret = PTR_ERR(tf); @@ -1032,24 +1050,15 @@ static int __trace_fprobe_create(int argc, const char *argv[]) goto out; /* We know tf is not allocated */ } - if (is_tracepoint) { - tf->tpoint = find_tracepoint(tf->symbol); - if (!tf->tpoint) { - trace_probe_log_set_index(1); - trace_probe_log_err(0, NO_TRACEPOINT); - goto parse_error; - } + if (is_tracepoint) tf->mod = __module_text_address( (unsigned long)tf->tpoint->probestub); - } argc -= 2; argv += 2; - /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - struct traceprobe_parse_context ctx = { .flags = flags }; - trace_probe_log_set_index(i + 2); + ctx.offset = 0; ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx); if (ret) goto error; /* This can be -ENOMEM */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1a3497719ada..7cc32da3e8e8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -742,7 +742,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; - unsigned int flags = TPARG_FL_KERNEL; + struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; switch (argv[0][0]) { case 'r': @@ -823,10 +823,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) goto parse_error; } if (is_return) - flags |= TPARG_FL_RETURN; + ctx.flags |= TPARG_FL_RETURN; ret = kprobe_on_func_entry(NULL, symbol, offset); if (ret == 0 && !is_return) - flags |= TPARG_FL_FENTRY; + ctx.flags |= TPARG_FL_FENTRY; /* Defer the ENOENT case until register kprobe */ if (ret == -EINVAL && is_return) { trace_probe_log_err(0, BAD_RETPROBE); @@ -856,7 +856,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, - argc - 2, is_return); + argc - 2, is_return); if (IS_ERR(tk)) { ret = PTR_ERR(tk); /* This must return -ENOMEM, else there is a bug */ @@ -866,10 +866,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) argc -= 2; argv += 2; /* parse arguments */ + ctx.funcname = symbol; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - struct traceprobe_parse_context ctx = { .flags = flags }; - trace_probe_log_set_index(i + 2); + ctx.offset = 0; ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx); if (ret) goto error; /* This can be -ENOMEM */ diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 9ebefacb6372..08c18d9d4cf2 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -11,6 +11,8 @@ */ #define pr_fmt(fmt) "trace_probe: " fmt +#include + #include "trace_probe.h" #undef C @@ -300,6 +302,171 @@ static int parse_trace_event_arg(char *arg, struct fetch_insn *code, return -ENOENT; } +#ifdef 
CONFIG_PROBE_EVENTS_BTF_ARGS + +static struct btf *traceprobe_get_btf(void) +{ + struct btf *btf = bpf_get_btf_vmlinux(); + + if (IS_ERR_OR_NULL(btf)) + return NULL; + + return btf; +} + +static u32 btf_type_int(const struct btf_type *t) +{ + return *(u32 *)(t + 1); +} + +static const char *type_from_btf_id(struct btf *btf, s32 id) +{ + const struct btf_type *t; + u32 intdata; + s32 tid; + + /* TODO: const char * could be converted as a string */ + t = btf_type_skip_modifiers(btf, id, &tid); + + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_ENUM: + /* enum is "int", so convert to "s32" */ + return "s32"; + case BTF_KIND_ENUM64: + return "s64"; + case BTF_KIND_PTR: + /* pointer will be converted to "x??" */ + if (IS_ENABLED(CONFIG_64BIT)) + return "x64"; + else + return "x32"; + case BTF_KIND_INT: + intdata = btf_type_int(t); + if (BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED) { + switch (BTF_INT_BITS(intdata)) { + case 8: + return "s8"; + case 16: + return "s16"; + case 32: + return "s32"; + case 64: + return "s64"; + } + } else { /* unsigned */ + switch (BTF_INT_BITS(intdata)) { + case 8: + return "u8"; + case 16: + return "u16"; + case 32: + return "u32"; + case 64: + return "u64"; + } + } + } + /* TODO: support other types */ + + return NULL; +} + +static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr) +{ + struct btf *btf = traceprobe_get_btf(); + const struct btf_type *t; + s32 id; + + if (!btf || !funcname || !nr) + return ERR_PTR(-EINVAL); + + id = btf_find_by_name_kind(btf, funcname, BTF_KIND_FUNC); + if (id <= 0) + return ERR_PTR(-ENOENT); + + /* Get BTF_KIND_FUNC type */ + t = btf_type_by_id(btf, id); + if (!btf_type_is_func(t)) + return ERR_PTR(-ENOENT); + + /* The type of BTF_KIND_FUNC is BTF_KIND_FUNC_PROTO */ + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_func_proto(t)) + return ERR_PTR(-ENOENT); + + *nr = btf_type_vlen(t); + + if (*nr) + return (const struct btf_param *)(t + 1); + else + return NULL; +} + +static int parse_btf_arg(const char *varname, struct fetch_insn *code, + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = traceprobe_get_btf(); + const struct btf_param *params; + int i; + + if (!btf) { + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EOPNOTSUPP; + } + + if (WARN_ON_ONCE(!ctx->funcname)) + return -EINVAL; + + if (!ctx->params) { + params = find_btf_func_param(ctx->funcname, &ctx->nr_params); + if (IS_ERR(params)) { + trace_probe_log_err(ctx->offset, NO_BTF_ENTRY); + return PTR_ERR(params); + } + ctx->params = params; + } else + params = ctx->params; + + for (i = 0; i < ctx->nr_params; i++) { + const char *name = btf_name_by_offset(btf, params[i].name_off); + + if (name && !strcmp(name, varname)) { + code->op = FETCH_OP_ARG; + code->param = i; + return 0; + } + } + trace_probe_log_err(ctx->offset, NO_BTFARG); + return -ENOENT; +} + +static const struct fetch_type *parse_btf_arg_type(int arg_idx, + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = traceprobe_get_btf(); + const char *typestr = NULL; + + if (btf && ctx->params) + typestr = type_from_btf_id(btf, ctx->params[arg_idx].type); + + return find_fetch_type(typestr, ctx->flags); +} +#else +static struct btf *traceprobe_get_btf(void) +{ + return NULL; +} + +static int parse_btf_arg(const char *varname, struct fetch_insn *code, + struct traceprobe_parse_context *ctx) +{ + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EOPNOTSUPP; +} +#define parse_btf_arg_type(idx, ctx) \ + find_fetch_type(NULL, ctx->flags) +#endif + 
#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, @@ -570,6 +737,15 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->op = FETCH_OP_IMM; } break; + default: + if (isalpha(arg[0]) || arg[0] == '_') { /* BTF variable */ + if (!tparg_is_function_entry(ctx->flags)) { + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EINVAL; + } + ret = parse_btf_arg(arg, code, ctx); + break; + } } if (!ret && code->op == FETCH_OP_NOP) { /* Parsed, but do not find fetch method */ @@ -718,6 +894,11 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (ret) goto fail; + /* Update storing type if BTF is available */ + if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && + !t && code->op == FETCH_OP_ARG) + parg->type = parse_btf_arg_type(code->param, ctx); + ret = -EINVAL; /* Store operation */ if (parg->type->is_string) { @@ -850,6 +1031,33 @@ static int traceprobe_conflict_field_name(const char *name, return 0; } +static char *generate_probe_arg_name(const char *arg, int idx) +{ + char *name = NULL; + const char *end; + + /* + * If argument name is omitted, try arg as a name (BTF variable) + * or "argN". + */ + if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS)) { + end = strchr(arg, ':'); + if (!end) + end = arg + strlen(arg); + + name = kmemdup_nul(arg, end - arg, GFP_KERNEL); + if (!name || !is_good_name(name)) { + kfree(name); + name = NULL; + } + } + + if (!name) + name = kasprintf(GFP_KERNEL, "arg%d", idx + 1); + + return name; +} + int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, struct traceprobe_parse_context *ctx) { @@ -871,8 +1079,7 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); body++; } else { - /* If argument name is omitted, set "argN" */ - parg->name = kasprintf(GFP_KERNEL, "arg%d", i + 1); + parg->name = generate_probe_arg_name(arg, i); body = arg; } if (!parg->name) diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index f622340ae171..7af121996ce9 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "trace.h" @@ -380,6 +381,9 @@ static inline bool tparg_is_function_entry(unsigned int flags) struct traceprobe_parse_context { struct trace_event_call *event; + const struct btf_param *params; + s32 nr_params; + const char *funcname; unsigned int flags; int offset; }; @@ -478,7 +482,10 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_EVENT_INFO, "This requires both group and event name to attach"),\ C(BAD_ATTACH_EVENT, "Attached event does not exist"),\ C(BAD_ATTACH_ARG, "Attached event does not have this field"),\ - C(NO_EP_FILTER, "No filter rule after 'if'"), + C(NO_EP_FILTER, "No filter rule after 'if'"), \ + C(NOSUP_BTFARG, "BTF is not available or not supported"), \ + C(NO_BTFARG, "This variable is not found at this probe point"),\ + C(NO_BTF_ENTRY, "No BTF entry for this probe point"), #undef C #define C(a, b) TP_ERR_##a -- cgit v1.2.3 From 18b1e870a49671745c31434b18bcfdd6f20cb6a1 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 6 Jun 2023 21:39:56 +0900 Subject: tracing/probes: Add $arg* meta argument for all function args Add the '$arg*' meta fetch argument for function-entry probe events. 
This will be expanded to the all arguments of the function and the tracepoint using BTF function argument information. e.g. # echo 'p vfs_read $arg*' >> dynamic_events # echo 'f vfs_write $arg*' >> dynamic_events # echo 't sched_overutilized_tp $arg*' >> dynamic_events # cat dynamic_events p:kprobes/p_vfs_read_0 vfs_read file=file buf=buf count=count pos=pos f:fprobes/vfs_write__entry vfs_write file=file buf=buf count=count pos=pos t:tracepoints/sched_overutilized_tp sched_overutilized_tp rd=rd overutilized=overutilized Also, single '$arg[0-9]*' will be converted to the BTF function argument. NOTE: This seems like a wildcard, but a fake one at this moment. This is just for telling user that this can be expanded to several arguments. And it is not like other $-vars, you can not use this $arg* as a part of fetch args, e.g. specifying name "foo=$arg*" and using it in dereferences "+0($arg*)" will lead a parse error. Link: https://lore.kernel.org/all/168507475126.913472.18329684401466211816.stgit@mhiramat.roam.corp.google.com/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_fprobe.c | 21 +++++- kernel/trace/trace_kprobe.c | 23 ++++-- kernel/trace/trace_probe.c | 169 ++++++++++++++++++++++++++++++++++++++++++-- kernel/trace/trace_probe.h | 11 ++- 4 files changed, 212 insertions(+), 12 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 2dd884609321..dfe2e546acdc 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -925,14 +925,16 @@ static int __trace_fprobe_create(int argc, const char *argv[]) * FETCHARG:TYPE : use TYPE instead of unsigned long. */ struct trace_fprobe *tf = NULL; - int i, len, ret = 0; + int i, len, new_argc = 0, ret = 0; bool is_return = false; char *symbol = NULL, *tmp = NULL; const char *event = NULL, *group = FPROBE_EVENT_SYSTEM; + const char **new_argv = NULL; int maxactive = 0; char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; char sbuf[KSYM_NAME_LEN]; + char abuf[MAX_BTF_ARGS_LEN]; bool is_tracepoint = false; struct tracepoint *tpoint = NULL; struct traceprobe_parse_context ctx = { @@ -1040,9 +1042,22 @@ static int __trace_fprobe_create(int argc, const char *argv[]) } else ctx.funcname = symbol; + argc -= 2; argv += 2; + new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, + abuf, MAX_BTF_ARGS_LEN, &ctx); + if (IS_ERR(new_argv)) { + ret = PTR_ERR(new_argv); + new_argv = NULL; + goto out; + } + if (new_argv) { + argc = new_argc; + argv = new_argv; + } + /* setup a probe */ tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive, - argc - 2, is_return); + argc, is_return); if (IS_ERR(tf)) { ret = PTR_ERR(tf); /* This must return -ENOMEM, else there is a bug */ @@ -1054,7 +1069,6 @@ static int __trace_fprobe_create(int argc, const char *argv[]) tf->mod = __module_text_address( (unsigned long)tf->tpoint->probestub); - argc -= 2; argv += 2; /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { trace_probe_log_set_index(i + 2); @@ -1083,6 +1097,7 @@ static int __trace_fprobe_create(int argc, const char *argv[]) out: trace_probe_log_clear(); + kfree(new_argv); kfree(symbol); return ret; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7cc32da3e8e8..74adb82331dd 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -732,9 +732,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) * FETCHARG:TYPE : use TYPE instead of unsigned long. 
*/ struct trace_kprobe *tk = NULL; - int i, len, ret = 0; + int i, len, new_argc = 0, ret = 0; bool is_return = false; char *symbol = NULL, *tmp = NULL; + const char **new_argv = NULL; const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; enum probe_print_type ptype; int maxactive = 0; @@ -742,6 +743,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; + char abuf[MAX_BTF_ARGS_LEN]; struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; switch (argv[0][0]) { @@ -854,19 +856,31 @@ static int __trace_kprobe_create(int argc, const char *argv[]) event = buf; } + argc -= 2; argv += 2; + ctx.funcname = symbol; + new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, + abuf, MAX_BTF_ARGS_LEN, &ctx); + if (IS_ERR(new_argv)) { + ret = PTR_ERR(new_argv); + new_argv = NULL; + goto out; + } + if (new_argv) { + argc = new_argc; + argv = new_argv; + } + /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, - argc - 2, is_return); + argc, is_return); if (IS_ERR(tk)) { ret = PTR_ERR(tk); /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); goto out; /* We know tk is not allocated */ } - argc -= 2; argv += 2; /* parse arguments */ - ctx.funcname = symbol; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { trace_probe_log_set_index(i + 2); ctx.offset = 0; @@ -894,6 +908,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) out: trace_probe_log_clear(); + kfree(new_argv); kfree(symbol); return ret; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 08c18d9d4cf2..7216435d6728 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -371,9 +371,11 @@ static const char *type_from_btf_id(struct btf *btf, s32 id) return NULL; } -static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr) +static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr, + bool tracepoint) { struct btf *btf = traceprobe_get_btf(); + const struct btf_param *param; const struct btf_type *t; s32 id; @@ -395,9 +397,16 @@ static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr return ERR_PTR(-ENOENT); *nr = btf_type_vlen(t); + param = (const struct btf_param *)(t + 1); - if (*nr) - return (const struct btf_param *)(t + 1); + /* Hide the first 'data' argument of tracepoint */ + if (tracepoint) { + (*nr)--; + param++; + } + + if (*nr > 0) + return param; else return NULL; } @@ -418,7 +427,8 @@ static int parse_btf_arg(const char *varname, struct fetch_insn *code, return -EINVAL; if (!ctx->params) { - params = find_btf_func_param(ctx->funcname, &ctx->nr_params); + params = find_btf_func_param(ctx->funcname, &ctx->nr_params, + ctx->flags & TPARG_FL_TPOINT); if (IS_ERR(params)) { trace_probe_log_err(ctx->offset, NO_BTF_ENTRY); return PTR_ERR(params); @@ -451,12 +461,19 @@ static const struct fetch_type *parse_btf_arg_type(int arg_idx, return find_fetch_type(typestr, ctx->flags); } + #else static struct btf *traceprobe_get_btf(void) { return NULL; } +static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr, + bool tracepoint) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static int parse_btf_arg(const char *varname, struct fetch_insn *code, struct traceprobe_parse_context *ctx) { @@ -1114,6 +1131,150 @@ void traceprobe_free_probe_arg(struct probe_arg *arg) kfree(arg->fmt); } +static int argv_has_var_arg(int argc, const char 
*argv[], int *args_idx, + struct traceprobe_parse_context *ctx) +{ + int i, found = 0; + + for (i = 0; i < argc; i++) + if (str_has_prefix(argv[i], "$arg")) { + trace_probe_log_set_index(i + 2); + + if (!tparg_is_function_entry(ctx->flags)) { + trace_probe_log_err(0, NOFENTRY_ARGS); + return -EINVAL; + } + + if (isdigit(argv[i][4])) { + found = 1; + continue; + } + + if (argv[i][4] != '*') { + trace_probe_log_err(0, BAD_VAR); + return -EINVAL; + } + + if (*args_idx >= 0 && *args_idx < argc) { + trace_probe_log_err(0, DOUBLE_ARGS); + return -EINVAL; + } + found = 1; + *args_idx = i; + } + + return found; +} + +static int sprint_nth_btf_arg(int idx, const char *type, + char *buf, int bufsize, + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = traceprobe_get_btf(); + const char *name; + int ret; + + if (idx >= ctx->nr_params) { + trace_probe_log_err(0, NO_BTFARG); + return -ENOENT; + } + name = btf_name_by_offset(btf, ctx->params[idx].name_off); + if (!name) { + trace_probe_log_err(0, NO_BTF_ENTRY); + return -ENOENT; + } + ret = snprintf(buf, bufsize, "%s%s", name, type); + if (ret >= bufsize) { + trace_probe_log_err(0, ARGS_2LONG); + return -E2BIG; + } + return ret; +} + +/* Return new_argv which must be freed after use */ +const char **traceprobe_expand_meta_args(int argc, const char *argv[], + int *new_argc, char *buf, int bufsize, + struct traceprobe_parse_context *ctx) +{ + const struct btf_param *params = NULL; + int i, j, n, used, ret, args_idx = -1; + const char **new_argv = NULL; + int nr_params; + + ret = argv_has_var_arg(argc, argv, &args_idx, ctx); + if (ret < 0) + return ERR_PTR(ret); + + if (!ret) { + *new_argc = argc; + return NULL; + } + + params = find_btf_func_param(ctx->funcname, &nr_params, + ctx->flags & TPARG_FL_TPOINT); + if (IS_ERR(params)) { + if (args_idx != -1) { + /* $arg* requires BTF info */ + trace_probe_log_err(0, NOSUP_BTFARG); + return (const char **)params; + } + return 0; + } + ctx->params = params; + ctx->nr_params = nr_params; + + if (args_idx >= 0) + *new_argc = argc + ctx->nr_params - 1; + else + *new_argc = argc; + + new_argv = kcalloc(*new_argc, sizeof(char *), GFP_KERNEL); + if (!new_argv) + return ERR_PTR(-ENOMEM); + + used = 0; + for (i = 0, j = 0; i < argc; i++) { + trace_probe_log_set_index(i + 2); + if (i == args_idx) { + for (n = 0; n < nr_params; n++) { + ret = sprint_nth_btf_arg(n, "", buf + used, + bufsize - used, ctx); + if (ret < 0) + goto error; + + new_argv[j++] = buf + used; + used += ret + 1; + } + continue; + } + + if (str_has_prefix(argv[i], "$arg")) { + char *type = NULL; + + n = simple_strtoul(argv[i] + 4, &type, 10); + if (type && !(*type == ':' || *type == '\0')) { + trace_probe_log_err(0, BAD_VAR); + ret = -ENOENT; + goto error; + } + /* Note: $argN starts from $arg1 */ + ret = sprint_nth_btf_arg(n - 1, type, buf + used, + bufsize - used, ctx); + if (ret < 0) + goto error; + new_argv[j++] = buf + used; + used += ret + 1; + } else + new_argv[j++] = argv[i]; + } + + return new_argv; + +error: + kfree(new_argv); + return ERR_PTR(ret); +} + int traceprobe_update_arg(struct probe_arg *arg) { struct fetch_insn *code = arg->code; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 7af121996ce9..e7fa2f2ed01c 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -33,7 +33,9 @@ #define MAX_ARGSTR_LEN 63 #define MAX_ARRAY_LEN 64 #define MAX_ARG_NAME_LEN 32 +#define MAX_BTF_ARGS_LEN 128 #define MAX_STRING_SIZE PATH_MAX +#define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN) /* 
Reserved field names */ #define FIELD_STRING_IP "__probe_ip" @@ -391,6 +393,9 @@ struct traceprobe_parse_context { extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *argv, struct traceprobe_parse_context *ctx); +const char **traceprobe_expand_meta_args(int argc, const char *argv[], + int *new_argc, char *buf, int bufsize, + struct traceprobe_parse_context *ctx); extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); @@ -485,7 +490,11 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_EP_FILTER, "No filter rule after 'if'"), \ C(NOSUP_BTFARG, "BTF is not available or not supported"), \ C(NO_BTFARG, "This variable is not found at this probe point"),\ - C(NO_BTF_ENTRY, "No BTF entry for this probe point"), + C(NO_BTF_ENTRY, "No BTF entry for this probe point"), \ + C(BAD_VAR_ARGS, "$arg* must be an independent parameter without name etc."),\ + C(NOFENTRY_ARGS, "$arg* can be used only on function entry"), \ + C(DOUBLE_ARGS, "$arg* can be used only once in the parameters"), \ + C(ARGS_2LONG, "$arg* failed because the argument list is too long"), #undef C #define C(a, b) TP_ERR_##a -- cgit v1.2.3 From fd26290ec89d4eae8570e027df3b8c519d285fd0 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 6 Jun 2023 21:39:56 +0900 Subject: tracing/probes: Add BTF retval type support Check the target function has non-void retval type and set the correct fetch type if user doesn't specify it. If the function returns void, $retval is rejected as below; # echo 'f unregister_kprobes%return $retval' >> dynamic_events sh: write error: No such file or directory # cat error_log [ 37.488397] trace_fprobe: error: This function returns 'void' type Command: f unregister_kprobes%return $retval ^ Link: https://lore.kernel.org/all/168507476195.913472.16290308831790216609.stgit@mhiramat.roam.corp.google.com/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe.c | 69 +++++++++++++++++++++++++++++++++++++++++----- kernel/trace/trace_probe.h | 1 + 2 files changed, 63 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 7216435d6728..ba1c6e059b51 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -371,15 +371,13 @@ static const char *type_from_btf_id(struct btf *btf, s32 id) return NULL; } -static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr, - bool tracepoint) +static const struct btf_type *find_btf_func_proto(const char *funcname) { struct btf *btf = traceprobe_get_btf(); - const struct btf_param *param; const struct btf_type *t; s32 id; - if (!btf || !funcname || !nr) + if (!btf || !funcname) return ERR_PTR(-EINVAL); id = btf_find_by_name_kind(btf, funcname, BTF_KIND_FUNC); @@ -396,6 +394,22 @@ static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr if (!btf_type_is_func_proto(t)) return ERR_PTR(-ENOENT); + return t; +} + +static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr, + bool tracepoint) +{ + const struct btf_param *param; + const struct btf_type *t; + + if (!funcname || !nr) + return ERR_PTR(-EINVAL); + + t = find_btf_func_proto(funcname); + if (IS_ERR(t)) + return (const struct btf_param *)t; + *nr = btf_type_vlen(t); param = (const struct btf_param *)(t + 1); @@ -462,6 +476,32 @@ static const struct fetch_type *parse_btf_arg_type(int arg_idx, return 
find_fetch_type(typestr, ctx->flags); } +static const struct fetch_type *parse_btf_retval_type( + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = traceprobe_get_btf(); + const char *typestr = NULL; + const struct btf_type *t; + + if (btf && ctx->funcname) { + t = find_btf_func_proto(ctx->funcname); + if (!IS_ERR(t)) + typestr = type_from_btf_id(btf, t->type); + } + + return find_fetch_type(typestr, ctx->flags); +} + +static bool is_btf_retval_void(const char *funcname) +{ + const struct btf_type *t; + + t = find_btf_func_proto(funcname); + if (IS_ERR(t)) + return false; + + return t->type == 0; +} #else static struct btf *traceprobe_get_btf(void) { @@ -480,8 +520,15 @@ static int parse_btf_arg(const char *varname, struct fetch_insn *code, trace_probe_log_err(ctx->offset, NOSUP_BTFARG); return -EOPNOTSUPP; } + #define parse_btf_arg_type(idx, ctx) \ find_fetch_type(NULL, ctx->flags) + +#define parse_btf_retval_type(ctx) \ + find_fetch_type(NULL, ctx->flags) + +#define is_btf_retval_void(funcname) (false) + #endif #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) @@ -512,6 +559,11 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, if (strcmp(arg, "retval") == 0) { if (ctx->flags & TPARG_FL_RETURN) { + if ((ctx->flags & TPARG_FL_KERNEL) && + is_btf_retval_void(ctx->funcname)) { + err = TP_ERR_NO_RETVAL; + goto inval; + } code->op = FETCH_OP_RETVAL; return 0; } @@ -912,9 +964,12 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, goto fail; /* Update storing type if BTF is available */ - if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && - !t && code->op == FETCH_OP_ARG) - parg->type = parse_btf_arg_type(code->param, ctx); + if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && !t) { + if (code->op == FETCH_OP_ARG) + parg->type = parse_btf_arg_type(code->param, ctx); + else if (code->op == FETCH_OP_RETVAL) + parg->type = parse_btf_retval_type(ctx); + } ret = -EINVAL; /* Store operation */ diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index e7fa2f2ed01c..01ea148723de 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -449,6 +449,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \ C(EVENT_EXIST, "Given group/event name is already used by another event"), \ C(RETVAL_ON_PROBE, "$retval is not available on probe"), \ + C(NO_RETVAL, "This function returns 'void' type"), \ C(BAD_STACK_NUM, "Invalid stack number"), \ C(BAD_ARG_NUM, "Invalid argument number"), \ C(BAD_VAR, "Invalid $-valiable specified"), \ -- cgit v1.2.3 From f46fab0e36e611a2389d3843f34658c849b6bd60 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 6 Jun 2023 11:17:14 -0700 Subject: bpf: Add extra path pointer check to d_path helper Anastasios reported crash on stable 5.15 kernel with following BPF attached to lsm hook: SEC("lsm.s/bprm_creds_for_exec") int BPF_PROG(bprm_creds_for_exec, struct linux_binprm *bprm) { struct path *path = &bprm->executable->f_path; char p[128] = { 0 }; bpf_d_path(path, p, 128); return 0; } But bprm->executable can be NULL, so bpf_d_path call will crash: BUG: kernel NULL pointer dereference, address: 0000000000000018 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC NOPTI ... RIP: 0010:d_path+0x22/0x280 ... 
Call Trace: bpf_d_path+0x21/0x60 bpf_prog_db9cf176e84498d9_bprm_creds_for_exec+0x94/0x99 bpf_trampoline_6442506293_0+0x55/0x1000 bpf_lsm_bprm_creds_for_exec+0x5/0x10 security_bprm_creds_for_exec+0x29/0x40 bprm_execve+0x1c1/0x900 do_execveat_common.isra.0+0x1af/0x260 __x64_sys_execve+0x32/0x40 It's a problem for all stable trees with the bpf_d_path helper, which was added in 5.9. This issue is fixed in the current bpf code, where we identify and mark trusted pointers, so the above code would fail even to load. For the sake of the stable trees, and to work around a potentially broken verifier in the future, add code that reads the path object from the passed pointer and verifies that it's valid in kernel space. Fixes: 6e22ab9da793 ("bpf: Add d_path helper") Reported-by: Anastasios Papagiannis Suggested-by: Alexei Starovoitov Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20230606181714.532998-1-jolsa@kernel.org --- kernel/trace/bpf_trace.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9a050e36dc6c..1f4b07da327a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -900,13 +900,23 @@ static const struct bpf_func_proto bpf_send_signal_thread_proto = { BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz) { + struct path copy; long len; char *p; if (!sz) return 0; - p = d_path(path, buf, sz); + /* + * The path pointer is verified as trusted and safe to use, + * but let's double check it's valid anyway to workaround + * potentially broken verifier. + */ + len = copy_from_kernel_nofault(&copy, path, sizeof(*path)); + if (len < 0) + return len; + + p = d_path(&copy, buf, sz); if (IS_ERR(p)) { len = PTR_ERR(p); } else { -- cgit v1.2.3 From 0b295316b3a9b7858eafbebdc31b4827a6edde03 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 17 May 2023 20:25:36 +0100 Subject: mm/gup: remove unused vmas parameter from pin_user_pages_remote() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No invocation of pin_user_pages_remote() uses the vmas parameter, so remove it. This forms part of a larger patch set eliminating the use of the vmas parameters altogether. 
Link: https://lkml.kernel.org/r/28f000beb81e45bf538a2aaa77c90f5482b67a32.1684350871.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Cc: Catalin Marinas Cc: Christian König Cc: Dennis Dalessandro Cc: Greg Kroah-Hartman Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jens Axboe Cc: Matthew Wilcox (Oracle) Cc: Sakari Ailus Cc: Sean Christopherson Signed-off-by: Andrew Morton --- drivers/iommu/iommufd/pages.c | 4 ++-- drivers/vfio/vfio_iommu_type1.c | 2 +- include/linux/mm.h | 2 +- kernel/trace/trace_events_user.c | 2 +- mm/gup.c | 8 +++----- mm/process_vm_access.c | 2 +- 6 files changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel/trace') diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 3c47846cc5ef..412ca96be128 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -786,7 +786,7 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user, user->locked = 1; } rc = pin_user_pages_remote(pages->source_mm, uptr, npages, - user->gup_flags, user->upages, NULL, + user->gup_flags, user->upages, &user->locked); } if (rc <= 0) { @@ -1799,7 +1799,7 @@ static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index, rc = pin_user_pages_remote( pages->source_mm, (uintptr_t)(pages->uptr + index * PAGE_SIZE), 1, (flags & IOMMUFD_ACCESS_RW_WRITE) ? FOLL_WRITE : 0, &page, - NULL, NULL); + NULL); mmap_read_unlock(pages->source_mm); if (rc != 1) { if (WARN_ON(rc >= 0)) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 0d2f805468e1..306e6f1d1c70 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -562,7 +562,7 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, mmap_read_lock(mm); ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM, - pages, NULL, NULL); + pages, NULL); if (ret > 0) { int i; diff --git a/include/linux/mm.h b/include/linux/mm.h index 6336253c18e2..cf17ffdf4fbf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2367,7 +2367,7 @@ long get_user_pages_remote(struct mm_struct *mm, long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked); + int *locked); long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages); long pin_user_pages(unsigned long start, unsigned long nr_pages, diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index b1ecd7677642..bdc2666e8d39 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -406,7 +406,7 @@ static int user_event_enabler_write(struct user_event_mm *mm, return -EBUSY; ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT, - &page, NULL, NULL); + &page, NULL); if (unlikely(ret <= 0)) { if (!fixup_fault) diff --git a/mm/gup.c b/mm/gup.c index 21daeee5f163..edf0fe2695b0 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3100,8 +3100,6 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. 
* @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. @@ -3116,14 +3114,14 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) + int *locked) { int local_locked = 1; - if (!is_valid_gup_args(pages, vmas, locked, &gup_flags, + if (!is_valid_gup_args(pages, NULL, locked, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE)) return 0; - return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, + return __gup_longterm_locked(mm, start, nr_pages, pages, NULL, locked ? locked : &local_locked, gup_flags); } diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 78dfaf9e8990..0523edab03a6 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -104,7 +104,7 @@ static int process_vm_rw_single_vec(unsigned long addr, mmap_read_lock(mm); pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages, flags, process_pages, - NULL, &locked); + &locked); if (locked) mmap_read_unlock(mm); if (pinned_pages <= 0) -- cgit v1.2.3 From ba470eebc2f6c2f704872955a715b9555328e7d0 Mon Sep 17 00:00:00 2001 From: sunliming Date: Mon, 29 May 2023 11:21:00 +0800 Subject: tracing/user_events: Prevent same name but different args event User processes register name_args for events. If the same name but different args event are registered. The trace outputs of second event are printed as the first event. This is incorrect. Return EADDRINUSE back to the user process if the same name but different args event has being registered. Link: https://lore.kernel.org/linux-trace-kernel/20230529032100.286534-1-sunliming@kylinos.cn Signed-off-by: sunliming Reviewed-by: Masami Hiramatsu (Google) Acked-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 36 +++++++++++++++++++---- tools/testing/selftests/user_events/ftrace_test.c | 6 ++++ 2 files changed, 36 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index dbb14705d0d3..37a38496a6be 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1786,6 +1786,8 @@ static int user_event_parse(struct user_event_group *group, char *name, int ret; u32 key; struct user_event *user; + int argc = 0; + char **argv; /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); @@ -1793,13 +1795,35 @@ static int user_event_parse(struct user_event_group *group, char *name, mutex_unlock(&event_mutex); if (user) { - *newuser = user; - /* - * Name is allocated by caller, free it since it already exists. - * Caller only worries about failure cases for freeing. - */ - kfree(name); + if (args) { + argv = argv_split(GFP_KERNEL, args, &argc); + if (!argv) { + ret = -ENOMEM; + goto error; + } + + ret = user_fields_match(user, argc, (const char **)argv); + argv_free(argv); + + } else + ret = list_empty(&user->fields); + + if (ret) { + *newuser = user; + /* + * Name is allocated by caller, free it since it already exists. + * Caller only worries about failure cases for freeing. 
+ */ + kfree(name); + } else { + ret = -EADDRINUSE; + goto error; + } + return 0; +error: + refcount_dec(&user->refcnt); + return ret; } user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT); diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c index 7c99cef94a65..6e8c4b47281c 100644 --- a/tools/testing/selftests/user_events/ftrace_test.c +++ b/tools/testing/selftests/user_events/ftrace_test.c @@ -228,6 +228,12 @@ TEST_F(user, register_events) { ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg)); ASSERT_EQ(0, reg.write_index); + /* Multiple registers to same name but different args should fail */ + reg.enable_bit = 29; + reg.name_args = (__u64)"__test_event u32 field1;"; + ASSERT_EQ(-1, ioctl(self->data_fd, DIAG_IOCSREG, &reg)); + ASSERT_EQ(EADDRINUSE, errno); + /* Ensure disabled */ self->enable_fd = open(enable_file, O_RDWR); ASSERT_NE(-1, self->enable_fd); -- cgit v1.2.3 From cfac4ed7279d056df6167bd665e460787dc9e0c4 Mon Sep 17 00:00:00 2001 From: sunliming Date: Mon, 29 May 2023 14:51:10 +0800 Subject: tracing/user_events: Handle matching arguments that are null from dyn_events When a user event registered from dyn_events has no arguments, it passes the matching check regardless of whether there is an existing user event with the same name but different arguments. Add a matching check for the case where the arguments of the registering user event are null. Link: https://lore.kernel.org/linux-trace-kernel/20230529065110.303440-1-sunliming@kylinos.cn Signed-off-by: sunliming Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 37a38496a6be..afe61dc86543 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1745,6 +1745,8 @@ static bool user_event_match(const char *system, const char *event, if (match && argc > 0) match = user_fields_match(user, argc, argv); + else if (match && argc == 0) + match = list_empty(&user->fields); return match; } -- cgit v1.2.3 From e70bb54d7a51299e7feca9169c07d79f9a3fc01d Mon Sep 17 00:00:00 2001 From: sunliming Date: Thu, 25 May 2023 16:52:32 +0800 Subject: tracing: Modify print_fields() for fields output order Currently, print_fields() prints trace event fields in reverse order. Modify it to print the fields in their declaration order. Example outputs for a user event: test0 u32 count1; u32 count2 Output before: example-2547 [000] ..... 325.666387: test0: count2=0x2 (2) count1=0x1 (1) Output after: example-2742 [002] ..... 
429.769370: test0: count1=0x1 (1) count2=0x2 (2) Link: https://lore.kernel.org/linux-trace-kernel/20230525085232.5096-1-sunliming@kylinos.cn Fixes: 80a76994b2d88 ("tracing: Add "fields" option to show raw trace event fields") Signed-off-by: sunliming Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 15f05faaae44..1e33f367783e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -847,7 +847,7 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c int ret; void *pos; - list_for_each_entry(field, head, link) { + list_for_each_entry_reverse(field, head, link) { trace_seq_printf(&iter->seq, " %s=", field->name); if (field->offset + field->size > iter->ent_size) { trace_seq_puts(&iter->seq, ""); -- cgit v1.2.3 From 6f05dcabe5c241d066ec472cf38ac8b84f8c9c6f Mon Sep 17 00:00:00 2001 From: sunliming Date: Tue, 6 Jun 2023 14:20:24 +0800 Subject: tracing/user_events: Fix the incorrect trace record for empty arguments events The user_events support events that has empty arguments. But the trace event is discarded and not really committed when the arguments is empty. Fix this by not attempting to copy in zero-length data. Link: https://lkml.kernel.org/r/20230606062027.1008398-2-sunliming@kylinos.cn Acked-by: Beau Belgrave Acked-by: Masami Hiramatsu (Google) Signed-off-by: sunliming Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index afe61dc86543..49914b6cb651 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1432,7 +1432,7 @@ static void user_event_ftrace(struct user_event *user, struct iov_iter *i, if (unlikely(!entry)) return; - if (unlikely(!copy_nofault(entry + 1, i->count, i))) + if (unlikely(i->count != 0 && !copy_nofault(entry + 1, i->count, i))) goto discard; if (!list_empty(&user->validators) && @@ -1473,7 +1473,7 @@ static void user_event_perf(struct user_event *user, struct iov_iter *i, perf_fetch_caller_regs(regs); - if (unlikely(!copy_nofault(perf_entry + 1, i->count, i))) + if (unlikely(i->count != 0 && !copy_nofault(perf_entry + 1, i->count, i))) goto discard; if (!list_empty(&user->validators) && -- cgit v1.2.3 From ed0e0ae0c932188654422d01f4e0ea14ff97c063 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Thu, 1 Jun 2023 15:49:28 -0700 Subject: tracing/user_events: Remove user_ns walk for groups During discussions it was suggested that user_ns is not a good place to try to attach a tracing namespace. The current code has stubs to enable that work that are very likely to change and incur a performance cost. Remove the user_ns walk when creating a group and determining the system name to use, since it's unlikely user_ns will be used in the future. 
Link: https://lore.kernel.org/all/20230601-urenkel-holzofen-cd9403b9cadd@brauner/ Link: https://lore.kernel.org/linux-trace-kernel/20230601224928.301-1-beaub@linux.microsoft.com Suggested-by: Christian Brauner Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 42 +++++----------------------------------- 1 file changed, 5 insertions(+), 37 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 49914b6cb651..cf6d4c02c363 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -182,21 +182,11 @@ static void user_event_group_destroy(struct user_event_group *group) kfree(group); } -static char *user_event_group_system_name(struct user_namespace *user_ns) +static char *user_event_group_system_name(void) { char *system_name; int len = sizeof(USER_EVENTS_SYSTEM) + 1; - if (user_ns != &init_user_ns) { - /* - * Unexpected at this point: - * We only currently support init_user_ns. - * When we enable more, this will trigger a failure so log. - */ - pr_warn("user_events: Namespace other than init_user_ns!\n"); - return NULL; - } - system_name = kmalloc(len, GFP_KERNEL); if (!system_name) @@ -207,34 +197,12 @@ static char *user_event_group_system_name(struct user_namespace *user_ns) return system_name; } -static inline struct user_event_group -*user_event_group_from_user_ns(struct user_namespace *user_ns) -{ - if (user_ns == &init_user_ns) - return init_group; - - return NULL; -} - static struct user_event_group *current_user_event_group(void) { - struct user_namespace *user_ns = current_user_ns(); - struct user_event_group *group = NULL; - - while (user_ns) { - group = user_event_group_from_user_ns(user_ns); - - if (group) - break; - - user_ns = user_ns->parent; - } - - return group; + return init_group; } -static struct user_event_group -*user_event_group_create(struct user_namespace *user_ns) +static struct user_event_group *user_event_group_create(void) { struct user_event_group *group; @@ -243,7 +211,7 @@ static struct user_event_group if (!group) return NULL; - group->system_name = user_event_group_system_name(user_ns); + group->system_name = user_event_group_system_name(); if (!group->system_name) goto error; @@ -2603,7 +2571,7 @@ static int __init trace_events_user_init(void) if (!fault_cache) return -ENOMEM; - init_group = user_event_group_create(&init_user_ns); + init_group = user_event_group_create(); if (!init_group) { kmem_cache_destroy(fault_cache); -- cgit v1.2.3 From b08d72580584ab89c41d6fc0f15cd1cf4ce2ed93 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Wed, 14 Jun 2023 09:33:31 -0700 Subject: tracing/user_events: Store register flags on events Currently we don't have any available flags for user processes to use to indicate options for user_events. We will soon have a flag to indicate the event should or should not auto-delete once it's not being used by anyone. Add a reg_flags field to user_events and parameters to existing functions to allow for this in future patches. 
Link: https://lkml.kernel.org/r/20230614163336.5797-2-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index cf6d4c02c363..629823e21447 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -87,6 +87,7 @@ struct user_event { struct list_head validators; refcount_t refcnt; int min_size; + int reg_flags; char status; }; @@ -165,7 +166,7 @@ typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, - struct user_event **newuser); + struct user_event **newuser, int reg_flags); static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); static struct user_event_mm *user_event_mm_get_all(struct user_event *user); @@ -810,7 +811,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call) * Upon success user_event has its ref count increased by 1. */ static int user_event_parse_cmd(struct user_event_group *group, - char *raw_command, struct user_event **newuser) + char *raw_command, struct user_event **newuser, + int reg_flags) { char *name = raw_command; char *args = strpbrk(name, " "); @@ -824,7 +826,7 @@ static int user_event_parse_cmd(struct user_event_group *group, if (flags) *flags++ = '\0'; - return user_event_parse(group, name, args, flags, newuser); + return user_event_parse(group, name, args, flags, newuser, reg_flags); } static int user_field_array_size(const char *type) @@ -1588,7 +1590,7 @@ static int user_event_create(const char *raw_command) mutex_lock(&group->reg_mutex); - ret = user_event_parse_cmd(group, name, &user); + ret = user_event_parse_cmd(group, name, &user, 0); if (!ret) refcount_dec(&user->refcnt); @@ -1751,7 +1753,7 @@ static int user_event_trace_register(struct user_event *user) */ static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, - struct user_event **newuser) + struct user_event **newuser, int reg_flags) { int ret; u32 key; @@ -1846,6 +1848,8 @@ error: if (ret) goto put_user_lock; + user->reg_flags = reg_flags; + /* Ensure we track self ref and caller ref (2) */ refcount_set(&user->refcnt, 2); @@ -2144,7 +2148,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, return ret; } - ret = user_event_parse_cmd(info->group, name, &user); + ret = user_event_parse_cmd(info->group, name, &user, reg.flags); if (ret) { kfree(name); -- cgit v1.2.3 From f0dbf6fd0bddb9290c89fdd7afc53dad5051030e Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Wed, 14 Jun 2023 09:33:32 -0700 Subject: tracing/user_events: Track refcount consistently via put/get Various parts of the code today track user_event's refcnt field directly via a refcount_add/dec. This makes it hard to modify the behavior of the last reference decrement in all code paths consistently. For example, in the future we will auto-delete events upon the last reference going away. This last reference could happen in many places, but we want it to be consistently handled. Add user_event_get() and user_event_put() for the add/dec. Update all places where direct refcounts are being used to utilize these new functions. In each location pass if event_mutex is locked or not. 
This allows us to drop events automatically in future patches clearly. Ensure when caller states the lock is held, it really is (or is not) held. Link: https://lkml.kernel.org/r/20230614163336.5797-3-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 69 ++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 28 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 629823e21447..c064458eea5c 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -177,6 +177,28 @@ static u32 user_event_key(char *name) return jhash(name, strlen(name), 0); } +static struct user_event *user_event_get(struct user_event *user) +{ + refcount_inc(&user->refcnt); + + return user; +} + +static void user_event_put(struct user_event *user, bool locked) +{ +#ifdef CONFIG_LOCKDEP + if (locked) + lockdep_assert_held(&event_mutex); + else + lockdep_assert_not_held(&event_mutex); +#endif + + if (unlikely(!user)) + return; + + refcount_dec(&user->refcnt); +} + static void user_event_group_destroy(struct user_event_group *group) { kfree(group->system_name); @@ -228,12 +250,13 @@ error: return NULL; }; -static void user_event_enabler_destroy(struct user_event_enabler *enabler) +static void user_event_enabler_destroy(struct user_event_enabler *enabler, + bool locked) { list_del_rcu(&enabler->mm_enablers_link); /* No longer tracking the event via the enabler */ - refcount_dec(&enabler->event->refcnt); + user_event_put(enabler->event, locked); kfree(enabler); } @@ -295,7 +318,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work) /* User asked for enabler to be removed during fault */ if (test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))) { - user_event_enabler_destroy(enabler); + user_event_enabler_destroy(enabler, true); goto out; } @@ -470,14 +493,12 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig, if (!enabler) return false; - enabler->event = orig->event; + enabler->event = user_event_get(orig->event); enabler->addr = orig->addr; /* Only dup part of value (ignore future flags, etc) */ enabler->values = orig->values & ENABLE_VAL_DUP_MASK; - refcount_inc(&enabler->event->refcnt); - /* Enablers not exposed yet, RCU not required */ list_add(&enabler->mm_enablers_link, &mm->enablers); @@ -594,7 +615,7 @@ static void user_event_mm_destroy(struct user_event_mm *mm) struct user_event_enabler *enabler, *next; list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) - user_event_enabler_destroy(enabler); + user_event_enabler_destroy(enabler, false); mmdrop(mm->mm); kfree(mm); @@ -749,7 +770,7 @@ retry: * exit or run exec(), which includes forks and clones. 
*/ if (!*write_result) { - refcount_inc(&enabler->event->refcnt); + user_event_get(user); list_add_rcu(&enabler->mm_enablers_link, &user_mm->enablers); } @@ -1337,10 +1358,8 @@ static struct user_event *find_user_event(struct user_event_group *group, *outkey = key; hash_for_each_possible(group->register_table, user, node, key) - if (!strcmp(EVENT_NAME(user), name)) { - refcount_inc(&user->refcnt); - return user; - } + if (!strcmp(EVENT_NAME(user), name)) + return user_event_get(user); return NULL; } @@ -1554,12 +1573,12 @@ static int user_event_reg(struct trace_event_call *call, return ret; inc: - refcount_inc(&user->refcnt); + user_event_get(user); update_enable_bit_for(user); return 0; dec: update_enable_bit_for(user); - refcount_dec(&user->refcnt); + user_event_put(user, true); return 0; } @@ -1593,7 +1612,7 @@ static int user_event_create(const char *raw_command) ret = user_event_parse_cmd(group, name, &user, 0); if (!ret) - refcount_dec(&user->refcnt); + user_event_put(user, false); mutex_unlock(&group->reg_mutex); @@ -1794,7 +1813,7 @@ static int user_event_parse(struct user_event_group *group, char *name, return 0; error: - refcount_dec(&user->refcnt); + user_event_put(user, false); return ret; } @@ -1883,7 +1902,7 @@ static int delete_user_event(struct user_event_group *group, char *name) if (!user) return -ENOENT; - refcount_dec(&user->refcnt); + user_event_put(user, true); if (!user_event_last_ref(user)) return -EBUSY; @@ -2042,9 +2061,7 @@ static int user_events_ref_add(struct user_event_file_info *info, for (i = 0; i < count; ++i) new_refs->events[i] = refs->events[i]; - new_refs->events[i] = user; - - refcount_inc(&user->refcnt); + new_refs->events[i] = user_event_get(user); rcu_assign_pointer(info->refs, new_refs); @@ -2158,7 +2175,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, ret = user_events_ref_add(info, user); /* No longer need parse ref, ref_add either worked or not */ - refcount_dec(&user->refcnt); + user_event_put(user, false); /* Positive number is index and valid */ if (ret < 0) @@ -2307,7 +2324,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)); if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler))) - user_event_enabler_destroy(enabler); + user_event_enabler_destroy(enabler, true); /* Removed at least one */ ret = 0; @@ -2365,7 +2382,6 @@ static int user_events_release(struct inode *node, struct file *file) struct user_event_file_info *info = file->private_data; struct user_event_group *group; struct user_event_refs *refs; - struct user_event *user; int i; if (!info) @@ -2389,12 +2405,9 @@ static int user_events_release(struct inode *node, struct file *file) * The underlying user_events are ref counted, and cannot be freed. * After this decrement, the user_events may be freed elsewhere. */ - for (i = 0; i < refs->count; ++i) { - user = refs->events[i]; + for (i = 0; i < refs->count; ++i) + user_event_put(refs->events[i], false); - if (user) - refcount_dec(&user->refcnt); - } out: file->private_data = NULL; -- cgit v1.2.3 From a65442edb47a6d9c974b50049296ac9ddb378fee Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Wed, 14 Jun 2023 09:33:33 -0700 Subject: tracing/user_events: Add auto cleanup and future persist flag Currently user events need to be manually deleted via the delete IOCTL call or via the dynamic_events file. 
Most operators and processes wish to have these events auto cleanup when they are no longer used by anything to prevent them piling without manual maintenance. However, some operators may not want this, such as pre-registering events via the dynamic_events tracefs file. Update user_event_put() to attempt an auto delete of the event if it's the last reference. The auto delete must run in a work queue to ensure proper behavior of class->reg() invocations that don't expect the call to go away from underneath them during the unregister. Add work_struct to user_event struct to ensure we can do this reliably. Add a persist flag, that is not yet exposed, to ensure we can toggle between auto-cleanup and leaving the events existing in the future. When a non-zero flag is seen during register, return -EINVAL to ensure ABI is clear for the user processes while we work out the best approach for persistent events. Link: https://lkml.kernel.org/r/20230614163336.5797-4-beaub@linux.microsoft.com Link: https://lore.kernel.org/linux-trace-kernel/20230518093600.3f119d68@rorschach.local.home/ Suggested-by: Steven Rostedt Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 139 +++++++++++++++++++++++++++++++++++---- 1 file changed, 126 insertions(+), 13 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index c064458eea5c..8df0550415e7 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -49,6 +49,18 @@ #define EVENT_STATUS_PERF BIT(1) #define EVENT_STATUS_OTHER BIT(7) +/* + * User register flags are not allowed yet, keep them here until we are + * ready to expose them out to the user ABI. + */ +enum user_reg_flag { + /* Event will not delete upon last reference closing */ + USER_EVENT_REG_PERSIST = 1U << 0, + + /* This value or above is currently non-ABI */ + USER_EVENT_REG_MAX = 1U << 1, +}; + /* * Stores the system name, tables, and locks for a group of events. This * allows isolation for events by various means. @@ -85,6 +97,7 @@ struct user_event { struct hlist_node node; struct list_head fields; struct list_head validators; + struct work_struct put_work; refcount_t refcnt; int min_size; int reg_flags; @@ -171,6 +184,7 @@ static int user_event_parse(struct user_event_group *group, char *name, static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); static struct user_event_mm *user_event_mm_get_all(struct user_event *user); static void user_event_mm_put(struct user_event_mm *mm); +static int destroy_user_event(struct user_event *user); static u32 user_event_key(char *name) { @@ -184,19 +198,103 @@ static struct user_event *user_event_get(struct user_event *user) return user; } +static void delayed_destroy_user_event(struct work_struct *work) +{ + struct user_event *user = container_of( + work, struct user_event, put_work); + + mutex_lock(&event_mutex); + + if (!refcount_dec_and_test(&user->refcnt)) + goto out; + + if (destroy_user_event(user)) { + /* + * The only reason this would fail here is if we cannot + * update the visibility of the event. In this case the + * event stays in the hashtable, waiting for someone to + * attempt to delete it later. 
+ */ + pr_warn("user_events: Unable to delete event\n"); + refcount_set(&user->refcnt, 1); + } +out: + mutex_unlock(&event_mutex); +} + static void user_event_put(struct user_event *user, bool locked) { -#ifdef CONFIG_LOCKDEP - if (locked) - lockdep_assert_held(&event_mutex); - else - lockdep_assert_not_held(&event_mutex); -#endif + bool delete; if (unlikely(!user)) return; - refcount_dec(&user->refcnt); + /* + * When the event is not enabled for auto-delete there will always + * be at least 1 reference to the event. During the event creation + * we initially set the refcnt to 2 to achieve this. In those cases + * the caller must acquire event_mutex and after decrement check if + * the refcnt is 1, meaning this is the last reference. When auto + * delete is enabled, there will only be 1 ref, IE: refcnt will be + * only set to 1 during creation to allow the below checks to go + * through upon the last put. The last put must always be done with + * the event mutex held. + */ + if (!locked) { + lockdep_assert_not_held(&event_mutex); + delete = refcount_dec_and_mutex_lock(&user->refcnt, &event_mutex); + } else { + lockdep_assert_held(&event_mutex); + delete = refcount_dec_and_test(&user->refcnt); + } + + if (!delete) + return; + + /* + * We now have the event_mutex in all cases, which ensures that + * no new references will be taken until event_mutex is released. + * New references come through find_user_event(), which requires + * the event_mutex to be held. + */ + + if (user->reg_flags & USER_EVENT_REG_PERSIST) { + /* We should not get here when persist flag is set */ + pr_alert("BUG: Auto-delete engaged on persistent event\n"); + goto out; + } + + /* + * Unfortunately we have to attempt the actual destroy in a work + * queue. This is because not all cases handle a trace_event_call + * being removed within the class->reg() operation for unregister. + */ + INIT_WORK(&user->put_work, delayed_destroy_user_event); + + /* + * Since the event is still in the hashtable, we have to re-inc + * the ref count to 1. This count will be decremented and checked + * in the work queue to ensure it's still the last ref. This is + * needed because a user-process could register the same event in + * between the time of event_mutex release and the work queue + * running the delayed destroy. If we removed the item now from + * the hashtable, this would result in a timing window where a + * user process would fail a register because the trace_event_call + * register would fail in the tracing layers. + */ + refcount_set(&user->refcnt, 1); + + if (WARN_ON_ONCE(!schedule_work(&user->put_work))) { + /* + * If we fail we must wait for an admin to attempt delete or + * another register/close of the event, whichever is first. 
+ */ + pr_warn("user_events: Unable to queue delayed destroy\n"); + } +out: + /* Ensure if we didn't have event_mutex before we unlock it */ + if (!locked) + mutex_unlock(&event_mutex); } static void user_event_group_destroy(struct user_event_group *group) @@ -793,7 +891,12 @@ out: static __always_inline __must_check bool user_event_last_ref(struct user_event *user) { - return refcount_read(&user->refcnt) == 1; + int last = 0; + + if (user->reg_flags & USER_EVENT_REG_PERSIST) + last = 1; + + return refcount_read(&user->refcnt) == last; } static __always_inline __must_check @@ -1609,7 +1712,8 @@ static int user_event_create(const char *raw_command) mutex_lock(&group->reg_mutex); - ret = user_event_parse_cmd(group, name, &user, 0); + /* Dyn events persist, otherwise they would cleanup immediately */ + ret = user_event_parse_cmd(group, name, &user, USER_EVENT_REG_PERSIST); if (!ret) user_event_put(user, false); @@ -1780,6 +1884,10 @@ static int user_event_parse(struct user_event_group *group, char *name, int argc = 0; char **argv; + /* User register flags are not ready yet */ + if (reg_flags != 0 || flags != NULL) + return -EINVAL; + /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); user = find_user_event(group, name, &key); @@ -1869,8 +1977,13 @@ error: user->reg_flags = reg_flags; - /* Ensure we track self ref and caller ref (2) */ - refcount_set(&user->refcnt, 2); + if (user->reg_flags & USER_EVENT_REG_PERSIST) { + /* Ensure we track self ref and caller ref (2) */ + refcount_set(&user->refcnt, 2); + } else { + /* Ensure we track only caller ref (1) */ + refcount_set(&user->refcnt, 1); + } dyn_event_init(&user->devent, &user_event_dops); dyn_event_add(&user->devent, &user->call); @@ -2092,8 +2205,8 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) if (ret) return ret; - /* Ensure no flags, since we don't support any yet */ - if (kreg->flags != 0) + /* Ensure only valid flags */ + if (kreg->flags & ~(USER_EVENT_REG_MAX-1)) return -EINVAL; /* Ensure supported size */ -- cgit v1.2.3 From f3d40e6545594c22733d091c5ec6b8ff345cbd57 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 20 Jun 2023 18:34:55 -0400 Subject: fgraph: Add declaration of "struct fgraph_ret_regs" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In final testing of: https://patchwork.kernel.org/project/linux-trace-kernel/patch/1fc502712c981e0e6742185ba242992170ac9da8.1680954589.git.pengdonglin@sangfor.com.cn/ "function_graph: Support recording and printing the return value of function" The test failed due to a new warning found in the build: kernel/trace/fgraph.c:243:56: warning: ‘struct fgraph_ret_regs’ declared inside parameter list will not be visible outside of this definition or declaration Instead of asking to send another patch series, just add it and then apply the updates. Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 218cd95bf8e4..ea3d7bb235d3 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -236,6 +236,9 @@ static struct notifier_block ftrace_suspend_notifier = { .notifier_call = ftrace_suspend_notifier_call, }; +/* fgraph_ret_regs is not defined without CONFIG_FUNCTION_GRAPH_RETVAL */ +struct fgraph_ret_regs; + /* * Send the trace to the ring-buffer. * @return the original return address. 
-- cgit v1.2.3 From a1be9ccc57f07d54278be34eed6bd679bc941c97 Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Sat, 8 Apr 2023 05:42:15 -0700 Subject: function_graph: Support recording and printing the return value of function Analyzing system call failures with the function_graph tracer can be a time-consuming process, particularly when locating the kernel function that first returns an error in the trace logs. This change aims to simplify the process by recording the function return value to the 'retval' member of 'ftrace_graph_ret' and printing it when outputting the trace log. We have introduced new trace options: funcgraph-retval and funcgraph-retval-hex. The former controls whether to display the return value, while the latter controls the display format. Please note that even if a function's return type is void, a return value will still be printed. You can simply ignore it. This patch only establishes the fundamental infrastructure. Subsequent patches will make this feature available on some commonly used processor architectures. Here is an example: I attempted to attach the demo process to a cpu cgroup, but it failed: echo `pidof demo` > /sys/fs/cgroup/cpu/test/tasks -bash: echo: write error: Invalid argument The strace logs indicate that the write system call returned -EINVAL(-22): ... write(1, "273\n", 4) = -1 EINVAL (Invalid argument) ... To capture trace logs during a write system call, use the following commands: cd /sys/kernel/debug/tracing/ echo 0 > tracing_on echo > trace echo *sys_write > set_graph_function echo *spin* > set_graph_notrace echo *rcu* >> set_graph_notrace echo *alloc* >> set_graph_notrace echo preempt* >> set_graph_notrace echo kfree* >> set_graph_notrace echo $$ > set_ftrace_pid echo function_graph > current_tracer echo 1 > options/funcgraph-retval echo 0 > options/funcgraph-retval-hex echo 1 > tracing_on echo `pidof demo` > /sys/fs/cgroup/cpu/test/tasks echo 0 > tracing_on cat trace > ~/trace.log To locate the root cause, search for error code -22 directly in the file trace.log and identify the first function that returned -22. Once you have identified this function, examine its code to determine the root cause. For example, in the trace log below, cpu_cgroup_can_attach returned -22 first, so we can focus our analysis on this function to identify the root cause. ... 1) | cgroup_migrate() { 1) 0.651 us | cgroup_migrate_add_task(); /* = 0xffff93fcfd346c00 */ 1) | cgroup_migrate_execute() { 1) | cpu_cgroup_can_attach() { 1) | cgroup_taskset_first() { 1) 0.732 us | cgroup_taskset_next(); /* = 0xffff93fc8fb20000 */ 1) 1.232 us | } /* cgroup_taskset_first = 0xffff93fc8fb20000 */ 1) 0.380 us | sched_rt_can_attach(); /* = 0x0 */ 1) 2.335 us | } /* cpu_cgroup_can_attach = -22 */ 1) 4.369 us | } /* cgroup_migrate_execute = -22 */ 1) 7.143 us | } /* cgroup_migrate = -22 */ ... 
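A minimal way to jump straight to that first error return, assuming the trace was saved to ~/trace.log as in the steps above (any other path works the same way):

  grep -n -m 1 '= -22' ~/trace.log    # line number and text of the first function that returned -22
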
Link: https://lkml.kernel.org/r/1fc502712c981e0e6742185ba242992170ac9da8.1680954589.git.pengdonglin@sangfor.com.cn Tested-by: Florian Kauer Acked-by: Masami Hiramatsu (Google) Signed-off-by: Donglin Peng Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 3 ++ kernel/trace/Kconfig | 15 ++++++ kernel/trace/fgraph.c | 23 ++++++++- kernel/trace/trace.h | 2 + kernel/trace/trace_entries.h | 26 ++++++++++ kernel/trace/trace_functions_graph.c | 93 ++++++++++++++++++++++++++++++++---- 6 files changed, 151 insertions(+), 11 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b23bdd414394..49f279f4c3a1 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1018,6 +1018,9 @@ struct ftrace_graph_ent { */ struct ftrace_graph_ret { unsigned long func; /* Current function */ +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + unsigned long retval; +#endif int depth; /* Number of functions that overran the depth limit for current task */ unsigned int overrun; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8cf97fa4a4b3..abe5c583bd59 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -31,6 +31,9 @@ config HAVE_FUNCTION_GRAPH_TRACER help See Documentation/trace/ftrace-design.rst +config HAVE_FUNCTION_GRAPH_RETVAL + bool + config HAVE_DYNAMIC_FTRACE bool help @@ -227,6 +230,18 @@ config FUNCTION_GRAPH_TRACER the return value. This is done by setting the current return address on the current task structure into a stack of calls. +config FUNCTION_GRAPH_RETVAL + bool "Kernel Function Graph Return Value" + depends on HAVE_FUNCTION_GRAPH_RETVAL + depends on FUNCTION_GRAPH_TRACER + default n + help + Support recording and printing the function return value when + using function graph tracer. It can be helpful to locate functions + that return errors. This feature is off by default, and you can + enable it via the trace option funcgraph-retval. + See Documentation/trace/ftrace.rst + config DYNAMIC_FTRACE bool "enable/disable function tracing dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index ea3d7bb235d3..cd2c35b1dd8f 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -243,12 +243,16 @@ struct fgraph_ret_regs; * Send the trace to the ring-buffer. * @return the original return address. */ -unsigned long ftrace_return_to_handler(unsigned long frame_pointer) +static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs, + unsigned long frame_pointer) { struct ftrace_graph_ret trace; unsigned long ret; ftrace_pop_return_trace(&trace, &ret, frame_pointer); +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + trace.retval = fgraph_ret_regs_return_value(ret_regs); +#endif trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); /* @@ -269,6 +273,23 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +/* + * After all architecures have selected HAVE_FUNCTION_GRAPH_RETVAL, we can + * leave only ftrace_return_to_handler(ret_regs). 
+ */ +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL +unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs) +{ + return __ftrace_return_to_handler(ret_regs, + fgraph_ret_regs_frame_pointer(ret_regs)); +} +#else +unsigned long ftrace_return_to_handler(unsigned long frame_pointer) +{ + return __ftrace_return_to_handler(NULL, frame_pointer); +} +#endif + /** * ftrace_graph_get_ret_stack - return the entry of the shadow stack * @task: The task to read the shadow stack from diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 79bdefe9261b..e6407a27d644 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -832,6 +832,8 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_TAIL 0x100 #define TRACE_GRAPH_SLEEP_TIME 0x200 #define TRACE_GRAPH_GRAPH_TIME 0x400 +#define TRACE_GRAPH_PRINT_RETVAL 0x800 +#define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index cd41e863b51c..340b2fa98218 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -86,6 +86,30 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, ); /* Function return entry */ +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + +FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, + + TRACE_GRAPH_RET, + + F_STRUCT( + __field_struct( struct ftrace_graph_ret, ret ) + __field_packed( unsigned long, ret, func ) + __field_packed( unsigned long, ret, retval ) + __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, overrun ) + __field_packed( unsigned long long, ret, calltime) + __field_packed( unsigned long long, ret, rettime ) + ), + + F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx", + (void *)__entry->func, __entry->depth, + __entry->calltime, __entry->rettime, + __entry->depth, __entry->retval) +); + +#else + FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, TRACE_GRAPH_RET, @@ -105,6 +129,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __entry->depth) ); +#endif + /* * Context switch trace entry - which task (and prio) we switched from/to: * diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 203204cadf92..c35fbaab2a47 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -58,6 +58,12 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, /* Display function name after trailing } */ { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + /* Display function return value ? */ + { TRACER_OPT(funcgraph-retval, TRACE_GRAPH_PRINT_RETVAL) }, + /* Display function return value in hexadecimal format ? 
*/ + { TRACER_OPT(funcgraph-retval-hex, TRACE_GRAPH_PRINT_RETVAL_HEX) }, +#endif /* Include sleep time (scheduled out) between entry and return */ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, @@ -619,6 +625,56 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, trace_seq_puts(s, "| "); } +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + +#define __TRACE_GRAPH_PRINT_RETVAL TRACE_GRAPH_PRINT_RETVAL + +static void print_graph_retval(struct trace_seq *s, unsigned long retval, + bool leaf, void *func, bool hex_format) +{ + unsigned long err_code = 0; + + if (retval == 0 || hex_format) + goto done; + + /* Check if the return value matches the negative format */ + if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) && + (((u64)retval) >> 32) == 0) { + /* sign extension */ + err_code = (unsigned long)(s32)retval; + } else { + err_code = retval; + } + + if (!IS_ERR_VALUE(err_code)) + err_code = 0; + +done: + if (leaf) { + if (hex_format || (err_code == 0)) + trace_seq_printf(s, "%ps(); /* = 0x%lx */\n", + func, retval); + else + trace_seq_printf(s, "%ps(); /* = %ld */\n", + func, err_code); + } else { + if (hex_format || (err_code == 0)) + trace_seq_printf(s, "} /* %ps = 0x%lx */\n", + func, retval); + else + trace_seq_printf(s, "} /* %ps = %ld */\n", + func, err_code); + } +} + +#else + +#define __TRACE_GRAPH_PRINT_RETVAL 0 + +#define print_graph_retval(_seq, _retval, _leaf, _func, _format) do {} while (0) + +#endif + /* Case of a leaf function on its call entry */ static enum print_line_t print_graph_entry_leaf(struct trace_iterator *iter, @@ -663,7 +719,15 @@ print_graph_entry_leaf(struct trace_iterator *iter, for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) trace_seq_putc(s, ' '); - trace_seq_printf(s, "%ps();\n", (void *)call->func); + /* + * Write out the function return value if the option function-retval is + * enabled. + */ + if (flags & __TRACE_GRAPH_PRINT_RETVAL) + print_graph_retval(s, graph_ret->retval, true, (void *)call->func, + !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX)); + else + trace_seq_printf(s, "%ps();\n", (void *)call->func); print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, cpu, iter->ent->pid, flags); @@ -942,16 +1006,25 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, trace_seq_putc(s, ' '); /* - * If the return function does not have a matching entry, - * then the entry was lost. Instead of just printing - * the '}' and letting the user guess what function this - * belongs to, write out the function name. Always do - * that if the funcgraph-tail option is enabled. + * Always write out the function name and its return value if the + * function-retval option is enabled. */ - if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) - trace_seq_puts(s, "}\n"); - else - trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); + if (flags & __TRACE_GRAPH_PRINT_RETVAL) { + print_graph_retval(s, trace->retval, false, (void *)trace->func, + !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX)); + } else { + /* + * If the return function does not have a matching entry, + * then the entry was lost. Instead of just printing + * the '}' and letting the user guess what function this + * belongs to, write out the function name. Always do + * that if the funcgraph-tail option is enabled. 
+ */ + if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) + trace_seq_puts(s, "}\n"); + else + trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); + } /* Overrun */ if (flags & TRACE_GRAPH_PRINT_OVERRUN) -- cgit v1.2.3 From 83f74441bcb16c324b7bdba0ab4261a44cb1ac21 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 11 Jun 2023 15:00:29 +0200 Subject: ftrace: Show all functions with addresses in available_filter_functions_addrs Adding new available_filter_functions_addrs file that shows all available functions (same as available_filter_functions) together with addresses, like: # cat available_filter_functions_addrs | head ffffffff81000770 __traceiter_initcall_level ffffffff810007c0 __traceiter_initcall_start ffffffff81000810 __traceiter_initcall_finish ffffffff81000860 trace_initcall_finish_cb ... Note displayed address is the patch-site address and can differ from /proc/kallsyms address. It's useful to have address avilable for traceable symbols, so we don't need to allways cross check kallsyms with available_filter_functions (or the other way around) and have all the data in single file. For backwards compatibility reasons we can't change the existing available_filter_functions file output, but we need to add new file. The problem is that we need to do 2 passes: - through available_filter_functions and find out if the function is traceable - through /proc/kallsyms to get the address for traceable function Having available_filter_functions symbols together with addresses allow us to skip the kallsyms step and we are ok with the address in available_filter_functions_addr not being the function entry, because kprobe_multi uses fprobe and that handles both entry and patch-site address properly. We have 2 interfaces how to create kprobe_multi link: a) passing symbols to kernel 1) user gathers symbols and need to ensure that they are trace-able -> pass through available_filter_functions file 2) kernel takes those symbols and translates them to addresses through kallsyms api 3) addresses are passed to fprobe/ftrace through: register_fprobe_ips -> ftrace_set_filter_ips b) passing addresses to kernel 1) user gathers symbols and needs to ensure that they are trace-able -> pass through available_filter_functions file 2) user takes those symbols and translates them to addresses through /proc/kallsyms 3) addresses are passed to the kernel and kernel calls: register_fprobe_ips -> ftrace_set_filter_ips The new available_filter_functions_addrs file helps us with option b), because we can make 'b 1' and 'b 2' in one step - while filtering traceable functions, we get the address directly. Link: https://lore.kernel.org/linux-trace-kernel/20230611130029.1202298-1-jolsa@kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrii Nakryiko Tested-by: Jackie Liu # x86 Suggested-by: Steven Rostedt (Google) Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/ftrace.rst | 6 ++++++ include/linux/ftrace.h | 1 + kernel/trace/ftrace.c | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+) (limited to 'kernel/trace') diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index df2d3e57a83f..b7308ab10c0e 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -324,6 +324,12 @@ of ftrace. Here is a list of some of the key files: "set_graph_function", or "set_graph_notrace". (See the section "dynamic ftrace" below for more details.) 
+ available_filter_functions_addrs: + + Similar to available_filter_functions, but with address displayed + for each function. The displayed address is the patch-site address + and can differ from /proc/kallsyms address. + dyn_ftrace_total_info: This file is for debugging purposes. The number of functions that diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 49f279f4c3a1..8e59bd954153 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -633,6 +633,7 @@ enum { FTRACE_ITER_MOD = (1 << 5), FTRACE_ITER_ENABLED = (1 << 6), FTRACE_ITER_TOUCHED = (1 << 7), + FTRACE_ITER_ADDRS = (1 << 8), }; void arch_ftrace_update_code(int command); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 764668467155..b24c573934af 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3861,6 +3861,9 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; + if (iter->flags & FTRACE_ITER_ADDRS) + seq_printf(m, "%lx ", rec->ip); + if (print_rec(m, rec->ip)) { /* This should only happen when a rec is disabled */ WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED)); @@ -3996,6 +3999,30 @@ ftrace_touched_open(struct inode *inode, struct file *file) return 0; } +static int +ftrace_avail_addrs_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ADDRS; + iter->ops = &global_ops; + + return 0; +} + /** * ftrace_regex_open - initialize function tracer filter files * @ops: The ftrace_ops that hold the hash filters @@ -5916,6 +5943,13 @@ static const struct file_operations ftrace_touched_fops = { .release = seq_release_private, }; +static const struct file_operations ftrace_avail_addrs_fops = { + .open = ftrace_avail_addrs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, @@ -6377,6 +6411,9 @@ static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) trace_create_file("available_filter_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_avail_fops); + trace_create_file("available_filter_functions_addrs", TRACE_MODE_READ, + d_tracer, NULL, &ftrace_avail_addrs_fops); + trace_create_file("enabled_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_enabled_fops); -- cgit v1.2.3 From 4998e7fda149d2392ea6aa9879299d8a32019dbe Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 6 Jun 2023 17:12:25 +0200 Subject: tracing/osnoise: Switch from PF_NO_SETAFFINITY to migrate_disable Currently, osnoise/timerlat threads run with PF_NO_SETAFFINITY set. It works well, however, cgroups do not allow PF_NO_SETAFFINITY threads to be accepted, and this creates a limitation to osnoise/timerlat. To avoid this limitation, disable migration of the threads as soon as they start to run, and then clean the PF_NO_SETAFFINITY flag (still) used during thread creation. If for some reason a thread migration is requested, e.g., via sched_settafinity, the tracer thread will notice and exit. 
Link: https://lkml.kernel.org/r/8ba8bc9c15b3ea40cf73cf67a9bc061a264609f0.1686063934.git.bristot@kernel.org Cc: Juri Lelli Cc: William White Cc: Daniel Bristot de Oliveira Cc: Masami Hiramatsu Cc: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 68 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index e97e3fa5cbed..c265ec5f1726 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1545,6 +1545,39 @@ static void osnoise_sleep(void) } } +/* + * osnoise_migration_pending - checks if the task needs to migrate + * + * osnoise/timerlat threads are per-cpu. If there is a pending request to + * migrate the thread away from the current CPU, something bad has happened. + * Play the good citizen and leave. + * + * Returns 0 if it is safe to continue, 1 otherwise. + */ +static inline int osnoise_migration_pending(void) +{ + if (!current->migration_pending) + return 0; + + /* + * If migration is pending, there is a task waiting for the + * tracer to enable migration. The tracer does not allow migration, + * thus: taint and leave to unblock the blocked thread. + */ + osnoise_taint("migration requested to osnoise threads, leaving."); + + /* + * Unset this thread from the threads managed by the interface. + * The tracers are responsible for cleaning their env before + * exiting. + */ + mutex_lock(&interface_lock); + this_cpu_osn_var()->kthread = NULL; + mutex_unlock(&interface_lock); + + return 1; +} + /* * osnoise_main - The osnoise detection kernel thread * @@ -1553,12 +1586,29 @@ static void osnoise_sleep(void) */ static int osnoise_main(void *data) { + unsigned long flags; + + /* + * This thread was created pinned to the CPU using PF_NO_SETAFFINITY. + * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread. + * + * To work around this limitation, disable migration and remove the + * flag. + */ + migrate_disable(); + raw_spin_lock_irqsave(&current->pi_lock, flags); + current->flags &= ~(PF_NO_SETAFFINITY); + raw_spin_unlock_irqrestore(&current->pi_lock, flags); while (!kthread_should_stop()) { + if (osnoise_migration_pending()) + break; + run_osnoise(); osnoise_sleep(); } + migrate_enable(); return 0; } @@ -1706,6 +1756,7 @@ static int timerlat_main(void *data) struct timerlat_variables *tlat = this_cpu_tmr_var(); struct timerlat_sample s; struct sched_param sp; + unsigned long flags; u64 now, diff; /* @@ -1714,6 +1765,18 @@ static int timerlat_main(void *data) sp.sched_priority = DEFAULT_TIMERLAT_PRIO; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + /* + * This thread was created pinned to the CPU using PF_NO_SETAFFINITY. + * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread. + * + * To work around this limitation, disable migration and remove the + * flag. 
+ */ + migrate_disable(); + raw_spin_lock_irqsave(¤t->pi_lock, flags); + current->flags &= ~(PF_NO_SETAFFINITY); + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + tlat->count = 0; tlat->tracing_thread = false; @@ -1731,6 +1794,7 @@ static int timerlat_main(void *data) osn_var->sampling = 1; while (!kthread_should_stop()) { + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); diff = now - tlat->abs_period; @@ -1749,10 +1813,14 @@ static int timerlat_main(void *data) if (time_to_us(diff) >= osnoise_data.stop_tracing_total) osnoise_stop_tracing(); + if (osnoise_migration_pending()) + break; + wait_next_period(tlat); } hrtimer_cancel(&tlat->timer); + migrate_enable(); return 0; } #else /* CONFIG_TIMERLAT_TRACER */ -- cgit v1.2.3 From cb7ca871c883eed5132e106cda44b2b060e6f52e Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 6 Jun 2023 17:12:26 +0200 Subject: tracing/osnoise: Skip running osnoise if all instances are off In the case of all tracing instances being off, sleep for the entire period. Q: Why not kill all threads so? A: It is valid and useful to start the threads with tracing off. For example, rtla disables tracing, starts the tracer, applies the scheduling setup to the threads, e.g., sched priority and cgroup, and then begin tracing with all set. Skipping the period helps to speed up rtla setup and save the trace after a stop tracing. Link: https://lkml.kernel.org/r/aa4dd9b7e76fcb63901fe5407e15ec002b318599.1686063934.git.bristot@kernel.org Cc: Juri Lelli Cc: William White Cc: Daniel Bristot de Oliveira Cc: Masami Hiramatsu Cc: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index c265ec5f1726..220172cb874d 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1285,6 +1285,22 @@ static __always_inline void osnoise_stop_tracing(void) rcu_read_unlock(); } +/* + * osnoise_has_tracing_on - Check if there is at least one instance on + */ +static __always_inline int osnoise_has_tracing_on(void) +{ + struct osnoise_instance *inst; + int trace_is_on = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) + trace_is_on += tracer_tracing_is_on(inst->tr); + rcu_read_unlock(); + + return trace_is_on; +} + /* * notify_new_max_latency - Notify a new max latency via fsnotify interface. 
*/ @@ -1517,13 +1533,16 @@ static struct cpumask save_cpumask; /* * osnoise_sleep - sleep until the next period */ -static void osnoise_sleep(void) +static void osnoise_sleep(bool skip_period) { u64 interval; ktime_t wake_time; mutex_lock(&interface_lock); - interval = osnoise_data.sample_period - osnoise_data.sample_runtime; + if (skip_period) + interval = osnoise_data.sample_period; + else + interval = osnoise_data.sample_period - osnoise_data.sample_runtime; mutex_unlock(&interface_lock); /* @@ -1604,8 +1623,14 @@ static int osnoise_main(void *data) if (osnoise_migration_pending()) break; + /* skip a period if tracing is off on all instances */ + if (!osnoise_has_tracing_on()) { + osnoise_sleep(true); + continue; + } + run_osnoise(); - osnoise_sleep(); + osnoise_sleep(false); } migrate_enable(); -- cgit v1.2.3 From e88ed227f639ebcb31ed4e5b88756b47d904584b Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 6 Jun 2023 17:12:27 +0200 Subject: tracing/timerlat: Add user-space interface Going a step further, we propose a way to use any user-space workload as the task waiting for the timerlat timer. This is done via a per-CPU file named osnoise/cpu$id/timerlat_fd file. The tracef_fd allows a task to open at a time. When a task reads the file, the timerlat timer is armed for future osnoise/timerlat_period_us time. When the timer fires, it prints the IRQ latency and wakes up the user-space thread waiting in the timerlat_fd. The thread then starts to run, executes the timerlat measurement, prints the thread scheduling latency and returns to user-space. When the thread rereads the timerlat_fd, the tracer will print the user-ret(urn) latency, which is an additional metric. This additional metric is also traced by the tracer and can be used, for example of measuring the context switch overhead from kernel-to-user and user-to-kernel, or the response time for an arbitrary execution in user-space. The tracer supports one thread per CPU, the thread must be pinned to the CPU, and it cannot migrate while holding the timerlat_fd. The reason is that the tracer is per CPU (nothing prohibits the tracer from allowing migrations in the future). The tracer monitors the migration of the thread and disables the tracer if detected. The timerlat_fd is only available for opening/reading when timerlat tracer is enabled, and NO_OSNOISE_WORKLOAD is set. The simplest way to activate this feature from user-space is: -------------------------------- %< ----------------------------------- int main(void) { char buffer[1024]; int timerlat_fd; int retval; long cpu = 0; /* place in CPU 0 */ cpu_set_t set; CPU_ZERO(&set); CPU_SET(cpu, &set); if (sched_setaffinity(gettid(), sizeof(set), &set) == -1) return 1; snprintf(buffer, sizeof(buffer), "/sys/kernel/tracing/osnoise/per_cpu/cpu%ld/timerlat_fd", cpu); timerlat_fd = open(buffer, O_RDONLY); if (timerlat_fd < 0) { printf("error opening %s: %s\n", buffer, strerror(errno)); exit(1); } for (;;) { retval = read(timerlat_fd, buffer, 1024); if (retval < 0) break; } close(timerlat_fd); exit(0); } -------------------------------- >% ----------------------------------- When disabling timerlat, if there is a workload holding the timerlat_fd, the SIGKILL will be sent to the thread. 
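
As a usage sketch (default tracefs mount point assumed; 'timerlat_user' is a
hypothetical name for a binary built from the example above), the tracer has
to be set up before the user-space thread opens the file:

  # cd /sys/kernel/tracing
  # echo NO_OSNOISE_WORKLOAD > osnoise/options   # no in-kernel workload
  # echo timerlat > current_tracer               # enable the timerlat tracer
  # ./timerlat_user                              # example program, pinned to CPU 0
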
Link: https://lkml.kernel.org/r/69fe66a863d2792ff4c3a149bf9e32e26468bb3a.1686063934.git.bristot@kernel.org Cc: Juri Lelli Cc: William White Cc: Daniel Bristot de Oliveira Cc: Masami Hiramatsu Cc: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/timerlat-tracer.rst | 78 +++++++ kernel/trace/trace_osnoise.c | 378 +++++++++++++++++++++++++++++++- kernel/trace/trace_output.c | 4 +- 3 files changed, 455 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/Documentation/trace/timerlat-tracer.rst b/Documentation/trace/timerlat-tracer.rst index db17df312bc8..53a56823e903 100644 --- a/Documentation/trace/timerlat-tracer.rst +++ b/Documentation/trace/timerlat-tracer.rst @@ -180,3 +180,81 @@ dummy_load_1ms_pd_init, which had the following code (on purpose):: return 0; } + +User-space interface +--------------------------- + +Timerlat allows user-space threads to use timerlat infra-structure to +measure scheduling latency. This interface is accessible via a per-CPU +file descriptor inside $tracing_dir/osnoise/per_cpu/cpu$ID/timerlat_fd. + +This interface is accessible under the following conditions: + + - timerlat tracer is enable + - osnoise workload option is set to NO_OSNOISE_WORKLOAD + - The user-space thread is affined to a single processor + - The thread opens the file associated with its single processor + - Only one thread can access the file at a time + +The open() syscall will fail if any of these conditions are not met. +After opening the file descriptor, the user space can read from it. + +The read() system call will run a timerlat code that will arm the +timer in the future and wait for it as the regular kernel thread does. + +When the timer IRQ fires, the timerlat IRQ will execute, report the +IRQ latency and wake up the thread waiting in the read. The thread will be +scheduled and report the thread latency via tracer - as for the kernel +thread. + +The difference from the in-kernel timerlat is that, instead of re-arming +the timer, timerlat will return to the read() system call. At this point, +the user can run any code. + +If the application rereads the file timerlat file descriptor, the tracer +will report the return from user-space latency, which is the total +latency. If this is the end of the work, it can be interpreted as the +response time for the request. + +After reporting the total latency, timerlat will restart the cycle, arm +a timer, and go to sleep for the following activation. + +If at any time one of the conditions is broken, e.g., the thread migrates +while in user space, or the timerlat tracer is disabled, the SIG_KILL +signal will be sent to the user-space thread. 
+ +Here is an basic example of user-space code for timerlat:: + + int main(void) + { + char buffer[1024]; + int timerlat_fd; + int retval; + long cpu = 0; /* place in CPU 0 */ + cpu_set_t set; + + CPU_ZERO(&set); + CPU_SET(cpu, &set); + + if (sched_setaffinity(gettid(), sizeof(set), &set) == -1) + return 1; + + snprintf(buffer, sizeof(buffer), + "/sys/kernel/tracing/osnoise/per_cpu/cpu%ld/timerlat_fd", + cpu); + + timerlat_fd = open(buffer, O_RDONLY); + if (timerlat_fd < 0) { + printf("error opening %s: %s\n", buffer, strerror(errno)); + exit(1); + } + + for (;;) { + retval = read(timerlat_fd, buffer, 1024); + if (retval < 0) + break; + } + + close(timerlat_fd); + exit(0); + } diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 220172cb874d..bd0d01d00fb9 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -181,6 +181,7 @@ struct osn_irq { #define IRQ_CONTEXT 0 #define THREAD_CONTEXT 1 +#define THREAD_URET 2 /* * sofirq runtime info. */ @@ -238,6 +239,7 @@ struct timerlat_variables { u64 abs_period; bool tracing_thread; u64 count; + bool uthread_migrate; }; static DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var); @@ -1181,6 +1183,78 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) osn_var->thread.arrival_time = 0; } +#ifdef CONFIG_TIMERLAT_TRACER +/* + * osnoise_stop_exception - Stop tracing and the tracer. + */ +static __always_inline void osnoise_stop_exception(char *msg, int cpu) +{ + struct osnoise_instance *inst; + struct trace_array *tr; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + tr = inst->tr; + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, + "stop tracing hit on cpu %d due to exception: %s\n", + smp_processor_id(), + msg); + + if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options)) + panic("tracer hit on cpu %d due to exception: %s\n", + smp_processor_id(), + msg); + + tracer_tracing_off(tr); + } + rcu_read_unlock(); +} + +/* + * trace_sched_migrate_callback - sched:sched_migrate_task trace event handler + * + * his function is hooked to the sched:sched_migrate_task trace event, and monitors + * timerlat user-space thread migration. + */ +static void trace_sched_migrate_callback(void *data, struct task_struct *p, int dest_cpu) +{ + struct osnoise_variables *osn_var; + long cpu = task_cpu(p); + + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + if (osn_var->pid == p->pid && dest_cpu != cpu) { + per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1; + osnoise_taint("timerlat user-thread migrated\n"); + osnoise_stop_exception("timerlat user-thread migrated", cpu); + } +} + +static int register_migration_monitor(void) +{ + int ret = 0; + + /* + * Timerlat thread migration check is only required when running timerlat in user-space. + * Thus, enable callback only if timerlat is set with no workload. 
+ */ + if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) + ret = register_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); + + return ret; +} + +static void unregister_migration_monitor(void) +{ + if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) + unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); +} +#else +static int register_migration_monitor(void) +{ + return 0; +} +static void unregister_migration_monitor(void) {} +#endif /* * trace_sched_switch - sched:sched_switch trace event handler * @@ -1204,7 +1278,7 @@ trace_sched_switch_callback(void *data, bool preempt, } /* - * hook_thread_events - Hook the insturmentation for thread noise + * hook_thread_events - Hook the instrumentation for thread noise * * Hook the osnoise tracer callbacks to handle the noise from other * threads on the necessary kernel events. @@ -1217,11 +1291,19 @@ static int hook_thread_events(void) if (ret) return -EINVAL; + ret = register_migration_monitor(); + if (ret) + goto out_unreg; + return 0; + +out_unreg: + unregister_trace_sched_switch(trace_sched_switch_callback, NULL); + return -EINVAL; } /* - * unhook_thread_events - *nhook the insturmentation for thread noise + * unhook_thread_events - unhook the instrumentation for thread noise * * Unook the osnoise tracer callbacks to handle the noise from other * threads on the necessary kernel events. @@ -1229,6 +1311,7 @@ static int hook_thread_events(void) static void unhook_thread_events(void) { unregister_trace_sched_switch(trace_sched_switch_callback, NULL); + unregister_migration_monitor(); } /* @@ -1864,10 +1947,24 @@ static void stop_kthread(unsigned int cpu) kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; if (kthread) { - kthread_stop(kthread); + if (test_bit(OSN_WORKLOAD, &osnoise_options)) { + kthread_stop(kthread); + } else { + /* + * This is a user thread waiting on the timerlat_fd. We need + * to close all users, and the best way to guarantee this is + * by killing the thread. NOTE: this is a purpose specific file. + */ + kill_pid(kthread->thread_pid, SIGKILL, 1); + put_task_struct(kthread); + } per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; } else { + /* if no workload, just return */ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + /* + * This is set in the osnoise tracer case. + */ per_cpu(per_cpu_osnoise_var, cpu).sampling = false; barrier(); return; @@ -1912,7 +2009,6 @@ static int start_kthread(unsigned int cpu) barrier(); return 0; } - snprintf(comm, 24, "osnoise/%d", cpu); } @@ -1941,6 +2037,11 @@ static int start_per_cpu_kthreads(void) int retval = 0; int cpu; + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + if (timerlat_enabled()) + return 0; + } + cpus_read_lock(); /* * Run only on online CPUs in which osnoise is allowed to run. @@ -2281,6 +2382,223 @@ err_free: return err; } +#ifdef CONFIG_TIMERLAT_TRACER +static int timerlat_fd_open(struct inode *inode, struct file *file) +{ + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat; + long cpu = (long) inode->i_cdev; + + mutex_lock(&interface_lock); + + /* + * This file is accessible only if timerlat is enabled, and + * NO_OSNOISE_WORKLOAD is set. + */ + if (!timerlat_enabled() || test_bit(OSN_WORKLOAD, &osnoise_options)) { + mutex_unlock(&interface_lock); + return -EINVAL; + } + + migrate_disable(); + + osn_var = this_cpu_osn_var(); + + /* + * The osn_var->pid holds the single access to this file. 
+ */ + if (osn_var->pid) { + mutex_unlock(&interface_lock); + migrate_enable(); + return -EBUSY; + } + + /* + * timerlat tracer is a per-cpu tracer. Check if the user-space too + * is pinned to a single CPU. The tracer laters monitor if the task + * migrates and then disables tracer if it does. However, it is + * worth doing this basic acceptance test to avoid obviusly wrong + * setup. + */ + if (current->nr_cpus_allowed > 1 || cpu != smp_processor_id()) { + mutex_unlock(&interface_lock); + migrate_enable(); + return -EPERM; + } + + /* + * From now on, it is good to go. + */ + file->private_data = inode->i_cdev; + + get_task_struct(current); + + osn_var->kthread = current; + osn_var->pid = current->pid; + + /* + * Setup is done. + */ + mutex_unlock(&interface_lock); + + tlat = this_cpu_tmr_var(); + tlat->count = 0; + + migrate_enable(); + return 0; +}; + +/* + * timerlat_fd_read - Read function for "timerlat_fd" file + * @file: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * Prints 1 on timerlat, the number of interferences on osnoise, -1 on error. + */ +static ssize_t +timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, + loff_t *ppos) +{ + long cpu = (long) file->private_data; + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat; + struct timerlat_sample s; + s64 diff; + u64 now; + + migrate_disable(); + + tlat = this_cpu_tmr_var(); + + /* + * While in user-space, the thread is migratable. There is nothing + * we can do about it. + * So, if the thread is running on another CPU, stop the machinery. + */ + if (cpu == smp_processor_id()) { + if (tlat->uthread_migrate) { + migrate_enable(); + return -EINVAL; + } + } else { + per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1; + osnoise_taint("timerlat user thread migrate\n"); + osnoise_stop_tracing(); + migrate_enable(); + return -EINVAL; + } + + osn_var = this_cpu_osn_var(); + + /* + * The timerlat in user-space runs in a different order: + * the read() starts from the execution of the previous occurrence, + * sleeping for the next occurrence. + * + * So, skip if we are entering on read() before the first wakeup + * from timerlat IRQ: + */ + if (likely(osn_var->sampling)) { + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); + diff = now - tlat->abs_period; + + /* + * it was not a timer firing, but some other signal? + */ + if (diff < 0) + goto out; + + s.seqnum = tlat->count; + s.timer_latency = diff; + s.context = THREAD_URET; + + trace_timerlat_sample(&s); + + notify_new_max_latency(diff); + + tlat->tracing_thread = false; + if (osnoise_data.stop_tracing_total) + if (time_to_us(diff) >= osnoise_data.stop_tracing_total) + osnoise_stop_tracing(); + } else { + tlat->tracing_thread = false; + tlat->kthread = current; + + hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); + tlat->timer.function = timerlat_irq; + + /* Annotate now to drift new period */ + tlat->abs_period = hrtimer_cb_get_time(&tlat->timer); + + osn_var->sampling = 1; + } + + /* wait for the next period */ + wait_next_period(tlat); + + /* This is the wakeup from this cycle */ + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); + diff = now - tlat->abs_period; + + /* + * it was not a timer firing, but some other signal? 
+ */ + if (diff < 0) + goto out; + + s.seqnum = tlat->count; + s.timer_latency = diff; + s.context = THREAD_CONTEXT; + + trace_timerlat_sample(&s); + + if (osnoise_data.stop_tracing_total) { + if (time_to_us(diff) >= osnoise_data.stop_tracing_total) { + timerlat_dump_stack(time_to_us(diff)); + notify_new_max_latency(diff); + osnoise_stop_tracing(); + } + } + +out: + migrate_enable(); + return 0; +} + +static int timerlat_fd_release(struct inode *inode, struct file *file) +{ + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat_var; + long cpu = (long) file->private_data; + + migrate_disable(); + mutex_lock(&interface_lock); + + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); + + hrtimer_cancel(&tlat_var->timer); + memset(tlat_var, 0, sizeof(*tlat_var)); + + osn_var->sampling = 0; + osn_var->pid = 0; + + /* + * We are leaving, not being stopped... see stop_kthread(); + */ + if (osn_var->kthread) { + put_task_struct(osn_var->kthread); + osn_var->kthread = NULL; + } + + mutex_unlock(&interface_lock); + migrate_enable(); + return 0; +} +#endif + /* * osnoise/runtime_us: cannot be greater than the period. */ @@ -2344,6 +2662,13 @@ static struct trace_min_max_param timerlat_period = { .max = &timerlat_max_period, .min = &timerlat_min_period, }; + +static const struct file_operations timerlat_fd_fops = { + .open = timerlat_fd_open, + .read = timerlat_fd_read, + .release = timerlat_fd_release, + .llseek = generic_file_llseek, +}; #endif static const struct file_operations cpus_fops = { @@ -2381,18 +2706,63 @@ static int init_timerlat_stack_tracefs(struct dentry *top_dir) } #endif /* CONFIG_STACKTRACE */ +static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir) +{ + struct dentry *timerlat_fd; + struct dentry *per_cpu; + struct dentry *cpu_dir; + char cpu_str[30]; /* see trace.c: tracing_init_tracefs_percpu() */ + long cpu; + + /* + * Why not using tracing instance per_cpu/ dir? + * + * Because osnoise/timerlat have a single workload, having + * multiple files like these are wast of memory. 
+ */ + per_cpu = tracefs_create_dir("per_cpu", top_dir); + if (!per_cpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + snprintf(cpu_str, 30, "cpu%ld", cpu); + cpu_dir = tracefs_create_dir(cpu_str, per_cpu); + if (!cpu_dir) + goto out_clean; + + timerlat_fd = trace_create_file("timerlat_fd", TRACE_MODE_READ, + cpu_dir, NULL, &timerlat_fd_fops); + if (!timerlat_fd) + goto out_clean; + + /* Record the CPU */ + d_inode(timerlat_fd)->i_cdev = (void *)(cpu); + } + + return 0; + +out_clean: + tracefs_remove(per_cpu); + return -ENOMEM; +} + /* * init_timerlat_tracefs - A function to initialize the timerlat interface files */ static int init_timerlat_tracefs(struct dentry *top_dir) { struct dentry *tmp; + int retval; tmp = tracefs_create_file("timerlat_period_us", TRACE_MODE_WRITE, top_dir, &timerlat_period, &trace_min_max_fops); if (!tmp) return -ENOMEM; + retval = osnoise_create_cpu_timerlat_fd(top_dir); + if (retval) + return retval; + return init_timerlat_stack_tracefs(top_dir); } #else /* CONFIG_TIMERLAT_TRACER */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 15f05faaae44..9f10c0071c4f 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1446,6 +1446,8 @@ static struct trace_event trace_osnoise_event = { }; /* TRACE_TIMERLAT */ + +static char *timerlat_lat_context[] = {"irq", "thread", "user-ret"}; static enum print_line_t trace_timerlat_print(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -1458,7 +1460,7 @@ trace_timerlat_print(struct trace_iterator *iter, int flags, trace_seq_printf(s, "#%-5u context %6s timer_latency %9llu ns\n", field->seqnum, - field->context ? "thread" : "irq", + timerlat_lat_context[field->context], field->timer_latency); return trace_handle_return(s); -- cgit v1.2.3 From 38638ffa6059049334b4d87bd4d85cf3418b5e27 Mon Sep 17 00:00:00 2001 From: Azeem Shaikh Date: Tue, 13 Jun 2023 00:41:25 +0000 Subject: tracing/boot: Replace strlcpy with strscpy strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated [1]. In an effort to remove strlcpy() completely [2], replace strlcpy() here with strscpy(). Direct replacement is safe here since return value of -E2BIG is used to check for truncation instead of sizeof(dest). 
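
A side-by-side sketch of the two truncation checks (illustrative buffer and
error message only):

    char buf[32];

    /* strlcpy() returns the length of the string it tried to create, so it
     * must walk all of 'p' even when 'p' does not fit (and may overread a
     * source that is not NUL-terminated):
     */
    if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
        pr_err("String is too long: %s\n", p);

    /* strscpy() is bounded by the destination size and reports truncation
     * directly:
     */
    if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG)
        pr_err("String is too long: %s\n", p);
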
[1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [2] https://github.com/KSPP/linux/issues/89 Link: https://lore.kernel.org/linux-trace-kernel/20230613004125.3539934-1-azeemshaikh38@gmail.com Cc: Masami Hiramatsu Signed-off-by: Azeem Shaikh Reviewed-by: Kees Cook Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_boot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 778200dd8ede..5fe525f1b8cc 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -31,7 +31,7 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) /* Common ftrace options */ xbc_node_for_each_array_value(node, "options", anode, p) { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) { + if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) { pr_err("String is too long: %s\n", p); continue; } @@ -87,7 +87,7 @@ trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node) const char *p; xbc_node_for_each_array_value(node, "events", anode, p) { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) { + if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) { pr_err("String is too long: %s\n", p); continue; } @@ -486,7 +486,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, p = xbc_node_find_value(enode, "filter", NULL); if (p && *p != '\0') { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) + if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) pr_err("filter string is too long: %s\n", p); else if (apply_event_filter(file, buf) < 0) pr_err("Failed to apply filter: %s\n", buf); @@ -494,7 +494,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, if (IS_ENABLED(CONFIG_HIST_TRIGGERS)) { xbc_node_for_each_array_value(enode, "actions", anode, p) { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) + if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) pr_err("action string is too long: %s\n", p); else if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply an action: %s\n", p); -- cgit v1.2.3 From ed5f297802fca41d88fbfa6f9c13b218e7c6f5cb Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sun, 4 Jun 2023 11:29:00 +0900 Subject: tracing/probes: Fix to return NULL and keep using current argc Fix to return NULL and keep using current argc when there is $argN and the BTF is not available. 
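
For reference, the $argN syntax this path serves looks like the following
(illustrative probe; $arg1/$arg2 fetch the first two function arguments and
do not require BTF by themselves, which is why argc must be preserved):

  # echo 'p:myprobe vfs_read $arg1 $arg2' >> /sys/kernel/tracing/dynamic_events
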
Link: https://lore.kernel.org/all/168584574094.2056209.2694238431743782342.stgit@mhiramat.roam.corp.google.com/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202306030940.Cej2JoUx-lkp@intel.com/ Reviewed-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index ba1c6e059b51..473e1c43bc57 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1273,7 +1273,8 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], trace_probe_log_err(0, NOSUP_BTFARG); return (const char **)params; } - return 0; + *new_argc = argc; + return NULL; } ctx->params = params; ctx->nr_params = nr_params; -- cgit v1.2.3 From 53431798f4bb60d214ae1ec4a79eefdd414f577b Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 12 Jun 2023 20:58:57 +0900 Subject: tracing/probes: Fix tracepoint event with $arg* to fetch correct argument To hide the first dummy 'data' argument on the tracepoint probe events, the BTF argument array was modified (skip the first argument for tracepoint), but the '$arg*' meta argument parser missed that. Fix to increment the argument index if it is tracepoint probe. And decrement the index when searching the type of the argument. Link: https://lore.kernel.org/all/168657113778.3038017.12245893750241701312.stgit@mhiramat.roam.corp.google.com/ Reviewed-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 473e1c43bc57..643aa3a51d5a 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -456,7 +456,10 @@ static int parse_btf_arg(const char *varname, struct fetch_insn *code, if (name && !strcmp(name, varname)) { code->op = FETCH_OP_ARG; - code->param = i; + if (ctx->flags & TPARG_FL_TPOINT) + code->param = i + 1; + else + code->param = i; return 0; } } @@ -470,8 +473,11 @@ static const struct fetch_type *parse_btf_arg_type(int arg_idx, struct btf *btf = traceprobe_get_btf(); const char *typestr = NULL; - if (btf && ctx->params) + if (btf && ctx->params) { + if (ctx->flags & TPARG_FL_TPOINT) + arg_idx--; typestr = type_from_btf_id(btf, ctx->params[arg_idx].type); + } return find_fetch_type(typestr, ctx->flags); } -- cgit v1.2.3 From 5f81018753dfd4989e33ece1f0cb6b8aae498b82 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 15 Jun 2023 13:52:36 +0200 Subject: fprobe: Release rethook after the ftrace_ops is unregistered While running bpf selftests it's possible to get following fault: general protection fault, probably for non-canonical address \ 0x6b6b6b6b6b6b6b6b: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC NOPTI ... Call Trace: fprobe_handler+0xc1/0x270 ? __pfx_bpf_testmod_init+0x10/0x10 ? __pfx_bpf_testmod_init+0x10/0x10 ? bpf_fentry_test1+0x5/0x10 ? bpf_fentry_test1+0x5/0x10 ? bpf_testmod_init+0x22/0x80 ? do_one_initcall+0x63/0x2e0 ? rcu_is_watching+0xd/0x40 ? kmalloc_trace+0xaf/0xc0 ? do_init_module+0x60/0x250 ? __do_sys_finit_module+0xac/0x120 ? do_syscall_64+0x37/0x90 ? entry_SYSCALL_64_after_hwframe+0x72/0xdc In unregister_fprobe function we can't release fp->rethook while it's possible there are some of its users still running on another cpu. 
Moving rethook_free call after fp->ops is unregistered with unregister_ftrace_function call. Link: https://lore.kernel.org/all/20230615115236.3476617-1-jolsa@kernel.org/ Fixes: 5b0ab78998e3 ("fprobe: Add exit_handler support") Cc: stable@vger.kernel.org Reviewed-by: Steven Rostedt (Google) Signed-off-by: Jiri Olsa Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/fprobe.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 18d36842faf5..0121e8c0d54e 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -364,19 +364,13 @@ int unregister_fprobe(struct fprobe *fp) fp->ops.saved_func != fprobe_kprobe_handler)) return -EINVAL; - /* - * rethook_free() starts disabling the rethook, but the rethook handlers - * may be running on other processors at this point. To make sure that all - * current running handlers are finished, call unregister_ftrace_function() - * after this. - */ - if (fp->rethook) - rethook_free(fp->rethook); - ret = unregister_ftrace_function(&fp->ops); if (ret < 0) return ret; + if (fp->rethook) + rethook_free(fp->rethook); + ftrace_free_filter(&fp->ops); return ret; -- cgit v1.2.3 From f6d026eea390d59787a6cdc2ef5c983d02e029d0 Mon Sep 17 00:00:00 2001 From: sunliming Date: Mon, 26 Jun 2023 19:13:42 +0800 Subject: tracing/user_events: Fix incorrect return value for writing operation when events are disabled The writing operation return the count of writes regardless of whether events are enabled or disabled. Switch it to return -EBADF to indicates that the event is disabled. Link: https://lkml.kernel.org/r/20230626111344.19136-2-sunliming@kylinos.cn Cc: stable@vger.kernel.org 7f5a08c79df35 ("user_events: Add minimal support for trace_event into ftrace") Acked-by: Beau Belgrave Signed-off-by: sunliming Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 8df0550415e7..09f7d9167b8e 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -2096,7 +2096,8 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) if (unlikely(faulted)) return -EFAULT; - } + } else + return -EBADF; return ret; } -- cgit v1.2.3 From 02b0095e2fbbc060560c1065f86a211d91e27b26 Mon Sep 17 00:00:00 2001 From: Mateusz Stachyra Date: Tue, 4 Jul 2023 12:27:06 +0200 Subject: tracing: Fix null pointer dereference in tracing_err_log_open() Fix an issue in function 'tracing_err_log_open'. The function doesn't call 'seq_open' if the file is opened only with write permissions, which results in 'file->private_data' being left as null. If we then use 'lseek' on that opened file, 'seq_lseek' dereferences 'file->private_data' in 'mutex_lock(&m->lock)', resulting in a kernel panic. Writing to this node requires root privileges, therefore this bug has very little security impact. 
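
A minimal user-space sequence that hit this path before the fix (sketch based
on the description above; requires root):

    int fd = open("/sys/kernel/tracing/error_log", O_WRONLY);

    /* tracing_err_log_open() skipped seq_open() for the write-only open,
     * so file->private_data is NULL and seq_lseek() dereferences it here.
     */
    lseek(fd, 0, SEEK_SET);
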
Tracefs node: /sys/kernel/tracing/error_log Example Kernel panic: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000038 Call trace: mutex_lock+0x30/0x110 seq_lseek+0x34/0xb8 __arm64_sys_lseek+0x6c/0xb8 invoke_syscall+0x58/0x13c el0_svc_common+0xc4/0x10c do_el0_svc+0x24/0x98 el0_svc+0x24/0x88 el0t_64_sync_handler+0x84/0xe4 el0t_64_sync+0x1b4/0x1b8 Code: d503201f aa0803e0 aa1f03e1 aa0103e9 (c8e97d02) ---[ end trace 561d1b49c12cf8a5 ]--- Kernel panic - not syncing: Oops: Fatal exception Link: https://lore.kernel.org/linux-trace-kernel/20230703155237eucms1p4dfb6a19caa14c79eb6c823d127b39024@eucms1p4 Link: https://lore.kernel.org/linux-trace-kernel/20230704102706eucms1p30d7ecdcc287f46ad67679fc8491b2e0f@eucms1p3 Cc: stable@vger.kernel.org Fixes: 8a062902be725 ("tracing: Add tracing error log") Signed-off-by: Mateusz Stachyra Suggested-by: Steven Rostedt Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 64a4dde073ef..3d34e6fea6b2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8135,7 +8135,7 @@ static const struct file_operations tracing_err_log_fops = { .open = tracing_err_log_open, .write = tracing_err_log_write, .read = seq_read, - .llseek = seq_lseek, + .llseek = tracing_lseek, .release = tracing_err_log_release, }; -- cgit v1.2.3 From fddca7db4a4c17f7333793dfb5308d80c76d2896 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 4 Jul 2023 10:08:07 -0400 Subject: tracing/boot: Test strscpy() against less than zero for error Instead of checking for -E2BIG, it is better to just check for less than zero of strscpy() for error. Testing for -E2BIG is not very robust, and the calling code does not really care about the error code, just that there was an error. One of the updates to convert strlcpy() to strscpy() had a v2 version that changed the test from testing against -E2BIG to less than zero, but I took the v1 version that still tested for -E2BIG. 
Link: https://lore.kernel.org/linux-trace-kernel/20230615180420.400769-1-azeemshaikh38@gmail.com/ Link: https://lore.kernel.org/linux-trace-kernel/20230704100807.707d1605@rorschach.local.home Cc: Mark Rutland Cc: Azeem Shaikh Cc: Kees Cook Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_boot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 5fe525f1b8cc..7ccc7a8e155b 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -31,7 +31,7 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) /* Common ftrace options */ xbc_node_for_each_array_value(node, "options", anode, p) { - if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) { + if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) { pr_err("String is too long: %s\n", p); continue; } @@ -87,7 +87,7 @@ trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node) const char *p; xbc_node_for_each_array_value(node, "events", anode, p) { - if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) { + if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) { pr_err("String is too long: %s\n", p); continue; } @@ -486,7 +486,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, p = xbc_node_find_value(enode, "filter", NULL); if (p && *p != '\0') { - if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) + if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) pr_err("filter string is too long: %s\n", p); else if (apply_event_filter(file, buf) < 0) pr_err("Failed to apply filter: %s\n", buf); @@ -494,7 +494,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, if (IS_ENABLED(CONFIG_HIST_TRIGGERS)) { xbc_node_for_each_array_value(enode, "actions", anode, p) { - if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) + if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) pr_err("action string is too long: %s\n", p); else if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply an action: %s\n", p); -- cgit v1.2.3 From cf0a624dc706c306294c14e6b3e7694702f25191 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Mon, 3 Jul 2023 07:28:53 +0300 Subject: kernel/trace: Fix cleanup logic of enable_trace_eprobe The enable_trace_eprobe() function enables all event probes, attached to given trace probe. If an error occurs in enabling one of the event probes, all others should be roll backed. There is a bug in that roll back logic - instead of all event probes, only the failed one is disabled. 
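
Reduced to a sketch, the roll-back pattern the fix below implements is
(enable_one()/disable_one() are hypothetical stand-ins for the eprobe calls):

    list_for_each_entry(pos, probe_list, list) {
        ret = enable_one(pos);
        if (ret)
            break;
        cnt++;                  /* remember how many were enabled */
    }

    if (ret && cnt) {
        /* undo only the 'cnt' probes that actually got enabled */
        list_for_each_entry(pos, probe_list, list) {
            disable_one(pos);
            if (!--cnt)
                break;
        }
    }
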
Link: https://lore.kernel.org/all/20230703042853.1427493-1-tz.stoyanov@gmail.com/ Reported-by: Dan Carpenter Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events") Signed-off-by: Tzvetomir Stoyanov (VMware) Acked-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_eprobe.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 67e854979d53..3f04f0ffe0d7 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -675,6 +675,7 @@ static int enable_trace_eprobe(struct trace_event_call *call, struct trace_eprobe *ep; bool enabled; int ret = 0; + int cnt = 0; tp = trace_probe_primary_from_call(call); if (WARN_ON_ONCE(!tp)) @@ -698,12 +699,25 @@ static int enable_trace_eprobe(struct trace_event_call *call, if (ret) break; enabled = true; + cnt++; } if (ret) { /* Failed to enable one of them. Roll back all */ - if (enabled) - disable_eprobe(ep, file->tr); + if (enabled) { + /* + * It's a bug if one failed for something other than memory + * not being available but another eprobe succeeded. + */ + WARN_ON_ONCE(ret != -ENOMEM); + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + ep = container_of(pos, struct trace_eprobe, tp); + disable_eprobe(ep, file->tr); + if (!--cnt) + break; + } + } if (file) trace_probe_remove_file(tp, file); else -- cgit v1.2.3 From 5f0c584daf7464f04114c65dd07269ee2bfedc13 Mon Sep 17 00:00:00 2001 From: Ze Gao Date: Mon, 3 Jul 2023 17:23:36 +0800 Subject: fprobe: add unlock to match a succeeded ftrace_test_recursion_trylock Unlock ftrace recursion lock when fprobe_kprobe_handler() is failed because of some running kprobe. Link: https://lore.kernel.org/all/20230703092336.268371-1-zegao@tencent.com/ Fixes: 3cc4e2c5fbae ("fprobe: make fprobe_kprobe_handler recursion free") Reported-by: Yafang Closes: https://lore.kernel.org/linux-trace-kernel/CALOAHbC6UpfFOOibdDiC7xFc5YFUgZnk3MZ=3Ny6we=AcrNbew@mail.gmail.com/ Signed-off-by: Ze Gao Acked-by: Masami Hiramatsu (Google) Acked-by: Yafang Shao Reviewed-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/fprobe.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 0121e8c0d54e..58e4b3607aef 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -102,12 +102,14 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, if (unlikely(kprobe_running())) { fp->nmissed++; - return; + goto recursion_unlock; } kprobe_busy_begin(); __fprobe_handler(ip, parent_ip, ops, fregs); kprobe_busy_end(); + +recursion_unlock: ftrace_test_recursion_unlock(bit); } -- cgit v1.2.3 From 195b9cb5b288fec1c871ef89f78cc9a7461aad3a Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 7 Jul 2023 23:03:19 +0900 Subject: fprobe: Ensure running fprobe_exit_handler() finished before calling rethook_free() Ensure running fprobe_exit_handler() has finished before calling rethook_free() in the unregister_fprobe() so that caller can free the fprobe right after unregister_fprobe(). unregister_fprobe() ensured that all running fprobe_entry/exit_handler() have finished by calling unregister_ftrace_function() which synchronizes RCU. 
But commit 5f81018753df ("fprobe: Release rethook after the ftrace_ops is unregistered") changed to call rethook_free() after unregister_ftrace_function(). So call rethook_stop() to make rethook disabled before unregister_ftrace_function() and ensure it again. Here is the possible code flow that can call the exit handler after unregister_fprobe(). ------ CPU1 CPU2 call unregister_fprobe(fp) ... __fprobe_handler() rethook_hook() on probed function unregister_ftrace_function() return from probed function rethook hooks find rh->handler == fprobe_exit_handler call fprobe_exit_handler() rethook_free(): set rh->handler = NULL; return from unreigster_fprobe; call fp->exit_handler() <- (*) ------ (*) At this point, the exit handler is called after returning from unregister_fprobe(). This fixes it as following; ------ CPU1 CPU2 call unregister_fprobe() ... rethook_stop(): set rh->handler = NULL; __fprobe_handler() rethook_hook() on probed function unregister_ftrace_function() return from probed function rethook hooks find rh->handler == NULL return from rethook rethook_free() return from unreigster_fprobe; ------ Link: https://lore.kernel.org/all/168873859949.156157.13039240432299335849.stgit@devnote2/ Fixes: 5f81018753df ("fprobe: Release rethook after the ftrace_ops is unregistered") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- include/linux/rethook.h | 1 + kernel/trace/fprobe.c | 3 +++ kernel/trace/rethook.c | 13 +++++++++++++ 3 files changed, 17 insertions(+) (limited to 'kernel/trace') diff --git a/include/linux/rethook.h b/include/linux/rethook.h index c8ac1e5afcd1..bdbe6717f45a 100644 --- a/include/linux/rethook.h +++ b/include/linux/rethook.h @@ -59,6 +59,7 @@ struct rethook_node { }; struct rethook *rethook_alloc(void *data, rethook_handler_t handler); +void rethook_stop(struct rethook *rh); void rethook_free(struct rethook *rh); void rethook_add_node(struct rethook *rh, struct rethook_node *node); struct rethook_node *rethook_try_get(struct rethook *rh); diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 58e4b3607aef..2571f7f3d5f2 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -366,6 +366,9 @@ int unregister_fprobe(struct fprobe *fp) fp->ops.saved_func != fprobe_kprobe_handler)) return -EINVAL; + if (fp->rethook) + rethook_stop(fp->rethook); + ret = unregister_ftrace_function(&fp->ops); if (ret < 0) return ret; diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index 60f6cb2b486b..468006cce7ca 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -53,6 +53,19 @@ static void rethook_free_rcu(struct rcu_head *head) kfree(rh); } +/** + * rethook_stop() - Stop using a rethook. + * @rh: the struct rethook to stop. + * + * Stop using a rethook to prepare for freeing it. If you want to wait for + * all running rethook handler before calling rethook_free(), you need to + * call this first and wait RCU, and call rethook_free(). + */ +void rethook_stop(struct rethook *rh) +{ + WRITE_ONCE(rh->handler, NULL); +} + /** * rethook_free() - Free struct rethook. * @rh: the struct rethook to be freed. -- cgit v1.2.3 From d0a3022f30629a208e5944022caeca3568add9e7 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Thu, 29 Jun 2023 23:50:48 +0000 Subject: tracing/user_events: Fix struct arg size match check When users register an event the name of the event and it's argument are checked to ensure they match if the event already exists. 
Normally all arguments are in the form of "type name", except for when the type starts with "struct ". In those cases, the size of the struct is passed in addition to the name, IE: "struct my_struct a 20" for an argument that is of type "struct my_struct" with a field name of "a" and has the size of 20 bytes. The current code does not honor the above case properly when comparing a match. This causes the event register to fail even when the same string was used for events that contain a struct argument within them. The example above "struct my_struct a 20" generates a match string of "struct my_struct a" omitting the size field. Add the struct size of the existing field when generating a comparison string for a struct field to ensure proper match checking. Link: https://lkml.kernel.org/r/20230629235049.581-2-beaub@linux.microsoft.com Cc: stable@vger.kernel.org Fixes: e6f89a149872 ("tracing/user_events: Ensure user provided strings are safely formatted") Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 4f5e74bbdab2..33cb6af31f39 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1317,6 +1317,9 @@ static int user_field_set_string(struct ftrace_event_field *field, pos += snprintf(buf + pos, LEN_OR_ZERO, " "); pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name); + if (str_has_prefix(field->type, "struct ")) + pos += snprintf(buf + pos, LEN_OR_ZERO, " %d", field->size); + if (colon) pos += snprintf(buf + pos, LEN_OR_ZERO, ";"); -- cgit v1.2.3 From 7d8b31b73c79835572611ed1eed649e4d2e14245 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 May 2023 14:51:48 +0200 Subject: tracing: arm64: Avoid missing-prototype warnings These are all tracing W=1 warnings in arm64 allmodconfig about missing prototypes: kernel/trace/trace_kprobe_selftest.c:7:5: error: no previous prototype for 'kprobe_trace_selftest_target' [-Werror=missing-pro totypes] kernel/trace/ftrace.c:329:5: error: no previous prototype for '__register_ftrace_function' [-Werror=missing-prototypes] kernel/trace/ftrace.c:372:5: error: no previous prototype for '__unregister_ftrace_function' [-Werror=missing-prototypes] kernel/trace/ftrace.c:4130:15: error: no previous prototype for 'arch_ftrace_match_adjust' [-Werror=missing-prototypes] kernel/trace/fgraph.c:243:15: error: no previous prototype for 'ftrace_return_to_handler' [-Werror=missing-prototypes] kernel/trace/fgraph.c:358:6: error: no previous prototype for 'ftrace_graph_sleep_time_control' [-Werror=missing-prototypes] arch/arm64/kernel/ftrace.c:460:6: error: no previous prototype for 'prepare_ftrace_return' [-Werror=missing-prototypes] arch/arm64/kernel/ptrace.c:2172:5: error: no previous prototype for 'syscall_trace_enter' [-Werror=missing-prototypes] arch/arm64/kernel/ptrace.c:2195:6: error: no previous prototype for 'syscall_trace_exit' [-Werror=missing-prototypes] Move the declarations to an appropriate header where they can be seen by the caller and callee, and make sure the headers are included where needed. 
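
The warning pattern being silenced, in reduced form (hypothetical helper
name; the fix is to make one declaration visible to both the definition and
its callers):

    /* foo.c -- built with W=1, i.e. -Wmissing-prototypes */
    int my_helper(void)     /* warning: no previous prototype for 'my_helper' */
    {
            return 0;
    }

    /* foo.h -- included by foo.c and by every caller, silences the warning */
    int my_helper(void);
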
Link: https://lore.kernel.org/linux-trace-kernel/20230517125215.930689-1-arnd@kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Will Deacon Cc: Kees Cook Cc: Florent Revest Signed-off-by: Arnd Bergmann Acked-by: Catalin Marinas [ Fixed ftrace_return_to_handler() to handle CONFIG_HAVE_FUNCTION_GRAPH_RETVAL case ] Signed-off-by: Steven Rostedt (Google) --- arch/arm64/include/asm/ftrace.h | 4 ++++ arch/arm64/include/asm/syscall.h | 3 +++ arch/arm64/kernel/syscall.c | 3 --- include/linux/ftrace.h | 9 +++++++++ kernel/trace/fgraph.c | 1 + kernel/trace/ftrace_internal.h | 5 +++-- kernel/trace/trace_kprobe_selftest.c | 3 +++ 7 files changed, 23 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h index 21ac1c5c71d3..ab158196480c 100644 --- a/arch/arm64/include/asm/ftrace.h +++ b/arch/arm64/include/asm/ftrace.h @@ -211,6 +211,10 @@ static inline unsigned long fgraph_ret_regs_frame_pointer(struct fgraph_ret_regs { return ret_regs->fp; } + +void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, + unsigned long frame_pointer); + #endif /* ifdef CONFIG_FUNCTION_GRAPH_TRACER */ #endif diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h index 4cfe9b49709b..ab8e14b96f68 100644 --- a/arch/arm64/include/asm/syscall.h +++ b/arch/arm64/include/asm/syscall.h @@ -85,4 +85,7 @@ static inline int syscall_get_arch(struct task_struct *task) return AUDIT_ARCH_AARCH64; } +int syscall_trace_enter(struct pt_regs *regs); +void syscall_trace_exit(struct pt_regs *regs); + #endif /* __ASM_SYSCALL_H */ diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index 5a668d7f3c1f..b1ae2f2eaf77 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -75,9 +75,6 @@ static inline bool has_syscall_work(unsigned long flags) return unlikely(flags & _TIF_SYSCALL_WORK); } -int syscall_trace_enter(struct pt_regs *regs); -void syscall_trace_exit(struct pt_regs *regs); - static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, const syscall_fn_t syscall_table[]) { diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8e59bd954153..ce156c7704ee 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -41,6 +41,15 @@ struct ftrace_ops; struct ftrace_regs; struct dyn_ftrace; +char *arch_ftrace_match_adjust(char *str, const char *search); + +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL +struct fgraph_ret_regs; +unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs); +#else +unsigned long ftrace_return_to_handler(unsigned long frame_pointer); +#endif + #ifdef CONFIG_FUNCTION_TRACER /* * If the arch's mcount caller does not support all of ftrace's diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index cd2c35b1dd8f..c83c005e654e 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -15,6 +15,7 @@ #include #include "ftrace_internal.h" +#include "trace.h" #ifdef CONFIG_DYNAMIC_FTRACE #define ASSIGN_OPS_HASH(opsname, val) \ diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h index 382775edf690..5012c04f92c0 100644 --- a/kernel/trace/ftrace_internal.h +++ b/kernel/trace/ftrace_internal.h @@ -2,6 +2,9 @@ #ifndef _LINUX_KERNEL_FTRACE_INTERNAL_H #define _LINUX_KERNEL_FTRACE_INTERNAL_H +int __register_ftrace_function(struct ftrace_ops *ops); +int __unregister_ftrace_function(struct ftrace_ops *ops); + #ifdef CONFIG_FUNCTION_TRACER extern struct mutex ftrace_lock; @@ -15,8 +18,6 @@ int 
ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs); #else /* !CONFIG_DYNAMIC_FTRACE */ -int __register_ftrace_function(struct ftrace_ops *ops); -int __unregister_ftrace_function(struct ftrace_ops *ops); /* Keep as macros so we do not need to define the commands */ # define ftrace_startup(ops, command) \ ({ \ diff --git a/kernel/trace/trace_kprobe_selftest.c b/kernel/trace/trace_kprobe_selftest.c index 16548ee4c8c6..3851cd1e6a62 100644 --- a/kernel/trace/trace_kprobe_selftest.c +++ b/kernel/trace/trace_kprobe_selftest.c @@ -1,4 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 + +#include "trace_kprobe_selftest.h" + /* * Function used during the kprobe self test. This function is in a separate * compile unit so it can be compile with CC_FLAGS_FTRACE to ensure that it -- cgit v1.2.3 From 7e42907f3a7b4ce3a2d1757f6d78336984daf8f5 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Sun, 9 Jul 2023 06:51:44 +0800 Subject: ring-buffer: Fix deadloop issue on reading trace_pipe Soft lockup occurs when reading file 'trace_pipe': watchdog: BUG: soft lockup - CPU#6 stuck for 22s! [cat:4488] [...] RIP: 0010:ring_buffer_empty_cpu+0xed/0x170 RSP: 0018:ffff88810dd6fc48 EFLAGS: 00000246 RAX: 0000000000000000 RBX: 0000000000000246 RCX: ffffffff93d1aaeb RDX: ffff88810a280040 RSI: 0000000000000008 RDI: ffff88811164b218 RBP: ffff88811164b218 R08: 0000000000000000 R09: ffff88815156600f R10: ffffed102a2acc01 R11: 0000000000000001 R12: 0000000051651901 R13: 0000000000000000 R14: ffff888115e49500 R15: 0000000000000000 [...] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8d853c2000 CR3: 000000010dcd8000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __find_next_entry+0x1a8/0x4b0 ? peek_next_entry+0x250/0x250 ? down_write+0xa5/0x120 ? down_write_killable+0x130/0x130 trace_find_next_entry_inc+0x3b/0x1d0 tracing_read_pipe+0x423/0xae0 ? tracing_splice_read_pipe+0xcb0/0xcb0 vfs_read+0x16b/0x490 ksys_read+0x105/0x210 ? __ia32_sys_pwrite64+0x200/0x200 ? switch_fpu_return+0x108/0x220 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x61/0xc6 Through the vmcore, I found it's because in tracing_read_pipe(), ring_buffer_empty_cpu() found some buffer is not empty but then it cannot read anything due to "rb_num_of_entries() == 0" always true, Then it infinitely loop the procedure due to user buffer not been filled, see following code path: tracing_read_pipe() { ... ... waitagain: tracing_wait_pipe() // 1. find non-empty buffer here trace_find_next_entry_inc() // 2. loop here try to find an entry __find_next_entry() ring_buffer_empty_cpu(); // 3. find non-empty buffer peek_next_entry() // 4. but peek always return NULL ring_buffer_peek() rb_buffer_peek() rb_get_reader_page() // 5. because rb_num_of_entries() == 0 always true here // then return NULL // 6. user buffer not been filled so goto 'waitgain' // and eventually leads to an deadloop in kernel!!! } By some analyzing, I found that when resetting ringbuffer, the 'entries' of its pages are not all cleared (see rb_reset_cpu()). Then when reducing the ringbuffer, and if some reduced pages exist dirty 'entries' data, they will be added into 'cpu_buffer->overrun' (see rb_remove_pages()), which cause wrong 'overrun' count and eventually cause the deadloop issue. To fix it, we need to clear every pages in rb_reset_cpu(). 
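
In rough terms, the reader-side accounting that gets stuck is (sketch of what
rb_num_of_entries() computes, not literal code):

    /* entries still readable in the per-cpu buffer */
    readable = entries_written - (overrun + entries_read);

With stale page->entries folded into 'overrun' by rb_remove_pages(), this
evaluates to 0 even though ring_buffer_empty_cpu() sees a non-empty page,
hence the read loop never makes progress.
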
Link: https://lore.kernel.org/linux-trace-kernel/20230708225144.3785600-1-zhengyejian1@huawei.com Cc: stable@vger.kernel.org Fixes: a5fb833172eca ("ring-buffer: Fix uninitialized read_stamp") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 834b361a4a66..14d8001140c8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5242,28 +5242,34 @@ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_size); +static void rb_clear_buffer_page(struct buffer_page *page) +{ + local_set(&page->write, 0); + local_set(&page->entries, 0); + rb_init_page(page->page); + page->read = 0; +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { + struct buffer_page *page; + rb_head_page_deactivate(cpu_buffer); cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); - local_set(&cpu_buffer->head_page->write, 0); - local_set(&cpu_buffer->head_page->entries, 0); - local_set(&cpu_buffer->head_page->page->commit, 0); - - cpu_buffer->head_page->read = 0; + rb_clear_buffer_page(cpu_buffer->head_page); + list_for_each_entry(page, cpu_buffer->pages, list) { + rb_clear_buffer_page(page); + } cpu_buffer->tail_page = cpu_buffer->head_page; cpu_buffer->commit_page = cpu_buffer->head_page; INIT_LIST_HEAD(&cpu_buffer->reader_page->list); INIT_LIST_HEAD(&cpu_buffer->new_pages); - local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->entries, 0); - local_set(&cpu_buffer->reader_page->page->commit, 0); - cpu_buffer->reader_page->read = 0; + rb_clear_buffer_page(cpu_buffer->reader_page); local_set(&cpu_buffer->entries_bytes, 0); local_set(&cpu_buffer->overrun, 0); -- cgit v1.2.3 From 26efd79c4624294e553aeaa3439c646729bad084 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 12 Jul 2023 14:04:52 +0800 Subject: ftrace: Fix possible warning on checking all pages used in ftrace_process_locs() As comments in ftrace_process_locs(), there may be NULL pointers in mcount_loc section: > Some architecture linkers will pad between > the different mcount_loc sections of different > object files to satisfy alignments. > Skip any NULL pointers. After commit 20e5227e9f55 ("ftrace: allow NULL pointers in mcount_loc"), NULL pointers will be accounted when allocating ftrace pages but skipped before adding into ftrace pages, this may result in some pages not being used. Then after commit 706c81f87f84 ("ftrace: Remove extra helper functions"), warning may occur at: WARN_ON(pg->next); To fix it, only warn for case that no pointers skipped but pages not used up, then free those unused pages after releasing ftrace_lock. 
Link: https://lore.kernel.org/linux-trace-kernel/20230712060452.3175675-1-zhengyejian1@huawei.com Cc: stable@vger.kernel.org Fixes: 706c81f87f84 ("ftrace: Remove extra helper functions") Suggested-by: Steven Rostedt Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3740aca79fe7..05c0024815bf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3305,6 +3305,22 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) return cnt; } +static void ftrace_free_pages(struct ftrace_page *pages) +{ + struct ftrace_page *pg = pages; + + while (pg) { + if (pg->records) { + free_pages((unsigned long)pg->records, pg->order); + ftrace_number_of_pages -= 1 << pg->order; + } + pages = pg->next; + kfree(pg); + pg = pages; + ftrace_number_of_groups--; + } +} + static struct ftrace_page * ftrace_allocate_pages(unsigned long num_to_init) { @@ -3343,17 +3359,7 @@ ftrace_allocate_pages(unsigned long num_to_init) return start_pg; free_pages: - pg = start_pg; - while (pg) { - if (pg->records) { - free_pages((unsigned long)pg->records, pg->order); - ftrace_number_of_pages -= 1 << pg->order; - } - start_pg = pg->next; - kfree(pg); - pg = start_pg; - ftrace_number_of_groups--; - } + ftrace_free_pages(start_pg); pr_info("ftrace: FAILED to allocate memory for functions\n"); return NULL; } @@ -6471,9 +6477,11 @@ static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { + struct ftrace_page *pg_unuse = NULL; struct ftrace_page *start_pg; struct ftrace_page *pg; struct dyn_ftrace *rec; + unsigned long skipped = 0; unsigned long count; unsigned long *p; unsigned long addr; @@ -6536,8 +6544,10 @@ static int ftrace_process_locs(struct module *mod, * object files to satisfy alignments. * Skip any NULL pointers. */ - if (!addr) + if (!addr) { + skipped++; continue; + } end_offset = (pg->index+1) * sizeof(pg->records[0]); if (end_offset > PAGE_SIZE << pg->order) { @@ -6551,8 +6561,10 @@ static int ftrace_process_locs(struct module *mod, rec->ip = addr; } - /* We should have used all pages */ - WARN_ON(pg->next); + if (pg->next) { + pg_unuse = pg->next; + pg->next = NULL; + } /* Assign the last page to ftrace_pages */ ftrace_pages = pg; @@ -6574,6 +6586,11 @@ static int ftrace_process_locs(struct module *mod, out: mutex_unlock(&ftrace_lock); + /* We should have used all pages unless we skipped some */ + if (pg_unuse) { + WARN_ON(!skipped); + ftrace_free_pages(pg_unuse); + } return ret; } -- cgit v1.2.3 From bec3c25c247c4f88a33d79675a09e1644c9a3114 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 12 Jul 2023 10:52:35 -0400 Subject: tracing: Stop FORTIFY_SOURCE complaining about stack trace caller The stack_trace event is an event created by the tracing subsystem to store stack traces. It originally just contained a hard coded array of 8 words to hold the stack, and a "size" to know how many entries are there. 
This is exported to user space as: name: kernel_stack ID: 4 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:int size; offset:8; size:4; signed:1; field:unsigned long caller[8]; offset:16; size:64; signed:0; print fmt: "\t=> %ps\n\t=> %ps\n\t=> %ps\n" "\t=> %ps\n\t=> %ps\n\t=> %ps\n" "\t=> %ps\n\t=> %ps\n",i (void *)REC->caller[0], (void *)REC->caller[1], (void *)REC->caller[2], (void *)REC->caller[3], (void *)REC->caller[4], (void *)REC->caller[5], (void *)REC->caller[6], (void *)REC->caller[7] Where the user space tracers could parse the stack. The library was updated for this specific event to only look at the size, and not the array. But some older users still look at the array (note, the older code still checks to make sure the array fits inside the event that it read. That is, if only 4 words were saved, the parser would not read the fifth word because it will see that it was outside of the event size). This event was changed a while ago to be more dynamic, and would save a full stack even if it was greater than 8 words. It does this by simply allocating more ring buffer to hold the extra words. Then it copies in the stack via: memcpy(&entry->caller, fstack->calls, size); As the entry is struct stack_entry, that is created by a macro to both create the structure and export this to user space, it still had the caller field of entry defined as: unsigned long caller[8]. When the stack is greater than 8, the FORTIFY_SOURCE code notices that the amount being copied is greater than the source array and complains about it. It has no idea that the source is pointing to the ring buffer with the required allocation. To hide this from the FORTIFY_SOURCE logic, pointer arithmetic is used: ptr = ring_buffer_event_data(event); entry = ptr; ptr += offsetof(typeof(*entry), caller); memcpy(ptr, fstack->calls, size); Link: https://lore.kernel.org/all/20230612160748.4082850-1-svens@linux.ibm.com/ Link: https://lore.kernel.org/linux-trace-kernel/20230712105235.5fc441aa@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Reported-by: Sven Schnelle Tested-by: Sven Schnelle Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4529e264cb86..20122eeccf97 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3118,6 +3118,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, struct ftrace_stack *fstack; struct stack_entry *entry; int stackidx; + void *ptr; /* * Add one, for this function and the call to save_stack_trace() @@ -3161,9 +3162,25 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, trace_ctx); if (!event) goto out; - entry = ring_buffer_event_data(event); + ptr = ring_buffer_event_data(event); + entry = ptr; + + /* + * For backward compatibility reasons, the entry->caller is an + * array of 8 slots to store the stack. This is also exported + * to user space. The amount allocated on the ring buffer actually + * holds enough for the stack specified by nr_entries. This will + * go into the location of entry->caller. Due to string fortifiers + * checking the size of the destination of memcpy() it triggers + * when it detects that size is greater than 8. 
To hide this from + * the fortifiers, we use "ptr" and pointer arithmetic to assign caller. + * + * The below is really just: + * memcpy(&entry->caller, fstack->calls, size); + */ + ptr += offsetof(typeof(*entry), caller); + memcpy(ptr, fstack->calls, size); - memcpy(&entry->caller, fstack->calls, size); entry->size = nr_entries; if (!call_filter_check_discard(call, entry, buffer, event)) -- cgit v1.2.3 From 6018b585e8c6fa7d85d4b38d9ce49a5b67be7078 Mon Sep 17 00:00:00 2001 From: Mohamed Khalfella Date: Wed, 12 Jul 2023 22:30:21 +0000 Subject: tracing/histograms: Add histograms to hist_vars if they have referenced variables Hist triggers can have referenced variables without having direct variable fields. This can be the case if referenced variables are added for trigger actions. In this case the newly added references will not have field variables. Not taking such referenced variables into consideration can result in a bug where it would be possible to remove a hist trigger whose variables are being referenced. This results in a bug that is easily reproducible, like so: $ cd /sys/kernel/tracing $ echo 'synthetic_sys_enter char[] comm; long id' >> synthetic_events $ echo 'hist:keys=common_pid.execname,id.syscall:vals=hitcount:comm=common_pid.execname' >> events/raw_syscalls/sys_enter/trigger $ echo 'hist:keys=common_pid.execname,id.syscall:onmatch(raw_syscalls.sys_enter).synthetic_sys_enter($comm, id)' >> events/raw_syscalls/sys_enter/trigger $ echo '!hist:keys=common_pid.execname,id.syscall:vals=hitcount:comm=common_pid.execname' >> events/raw_syscalls/sys_enter/trigger [ 100.263533] ================================================================== [ 100.264634] BUG: KASAN: slab-use-after-free in resolve_var_refs+0xc7/0x180 [ 100.265520] Read of size 8 at addr ffff88810375d0f0 by task bash/439 [ 100.266320] [ 100.266533] CPU: 2 PID: 439 Comm: bash Not tainted 6.5.0-rc1 #4 [ 100.267277] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-20220807_005459-localhost 04/01/2014 [ 100.268561] Call Trace: [ 100.268902] [ 100.269189] dump_stack_lvl+0x4c/0x70 [ 100.269680] print_report+0xc5/0x600 [ 100.270165] ? resolve_var_refs+0xc7/0x180 [ 100.270697] ? kasan_complete_mode_report_info+0x80/0x1f0 [ 100.271389] ? resolve_var_refs+0xc7/0x180 [ 100.271913] kasan_report+0xbd/0x100 [ 100.272380] ? resolve_var_refs+0xc7/0x180 [ 100.272920] __asan_load8+0x71/0xa0 [ 100.273377] resolve_var_refs+0xc7/0x180 [ 100.273888] event_hist_trigger+0x749/0x860 [ 100.274505] ? kasan_save_stack+0x2a/0x50 [ 100.275024] ? kasan_set_track+0x29/0x40 [ 100.275536] ? __pfx_event_hist_trigger+0x10/0x10 [ 100.276138] ? ksys_write+0xd1/0x170 [ 100.276607] ? do_syscall_64+0x3c/0x90 [ 100.277099] ? entry_SYSCALL_64_after_hwframe+0x6e/0xd8 [ 100.277771] ? destroy_hist_data+0x446/0x470 [ 100.278324] ? event_hist_trigger_parse+0xa6c/0x3860 [ 100.278962] ? __pfx_event_hist_trigger_parse+0x10/0x10 [ 100.279627] ? __kasan_check_write+0x18/0x20 [ 100.280177] ? mutex_unlock+0x85/0xd0 [ 100.280660] ? __pfx_mutex_unlock+0x10/0x10 [ 100.281200] ? kfree+0x7b/0x120 [ 100.281619] ? ____kasan_slab_free+0x15d/0x1d0 [ 100.282197] ? event_trigger_write+0xac/0x100 [ 100.282764] ? __kasan_slab_free+0x16/0x20 [ 100.283293] ? __kmem_cache_free+0x153/0x2f0 [ 100.283844] ? sched_mm_cid_remote_clear+0xb1/0x250 [ 100.284550] ? __pfx_sched_mm_cid_remote_clear+0x10/0x10 [ 100.285221] ? event_trigger_write+0xbc/0x100 [ 100.285781] ? __kasan_check_read+0x15/0x20 [ 100.286321] ? __bitmap_weight+0x66/0xa0 [ 100.286833] ? _find_next_bit+0x46/0xe0 [ 100.287334] ? task_mm_cid_work+0x37f/0x450 [ 100.287872] event_triggers_call+0x84/0x150 [ 100.288408] trace_event_buffer_commit+0x339/0x430 [ 100.289073] ? ring_buffer_event_data+0x3f/0x60 [ 100.292189] trace_event_raw_event_sys_enter+0x8b/0xe0 [ 100.295434] syscall_trace_enter.constprop.0+0x18f/0x1b0 [ 100.298653] syscall_enter_from_user_mode+0x32/0x40 [ 100.301808] do_syscall_64+0x1a/0x90 [ 100.304748] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 [ 100.307775] RIP: 0033:0x7f686c75c1cb [ 100.310617] Code: 73 01 c3 48 8b 0d 65 3c 10 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 21 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 35 3c 10 00 f7 d8 64 89 01 48 [ 100.317847] RSP: 002b:00007ffc60137a38 EFLAGS: 00000246 ORIG_RAX: 0000000000000021 [ 100.321200] RAX: ffffffffffffffda RBX: 000055f566469ea0 RCX: 00007f686c75c1cb [ 100.324631] RDX: 0000000000000001 RSI: 0000000000000001 RDI: 000000000000000a [ 100.328104] RBP: 00007ffc60137ac0 R08: 00007f686c818460 R09: 000000000000000a [ 100.331509] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000009 [ 100.334992] R13: 0000000000000007 R14: 000000000000000a R15: 0000000000000007 [ 100.338381] We hit the bug because, when the second hist trigger was created, has_hist_vars() returned false since the hist trigger did not have variables. As a result, save_hist_vars() was not called to add the trigger to trace_array->hist_vars. Later on, when we attempted to remove the first histogram, find_any_var_ref() failed to detect that it was being used because it did not find the second trigger in the hist_vars list. With this change we wait until the trigger actions are created so we can take into consideration whether the hist trigger has variable references. Also, now we check the return value of save_hist_vars() and fail trigger creation if save_hist_vars() fails. Link: https://lore.kernel.org/linux-trace-kernel/20230712223021.636335-1-mkhalfella@purestorage.com Cc: stable@vger.kernel.org Fixes: 067fe038e70f6 ("tracing: Add variable reference handling to hist triggers") Signed-off-by: Mohamed Khalfella Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index b97d3ad832f1..c8c61381eba4 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6663,13 +6663,15 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, if (get_named_trigger_data(trigger_data)) goto enable; - if (has_hist_vars(hist_data)) - save_hist_vars(hist_data); - ret = create_actions(hist_data); if (ret) goto out_unreg; + if (has_hist_vars(hist_data) || hist_data->n_var_refs) { + if (save_hist_vars(hist_data)) + goto out_unreg; + } + ret = tracing_map_init(hist_data->map); if (ret) goto out_unreg; -- cgit v1.2.3 From d5a821896360cc8b93a15bd888fabc858c038dc0 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Thu, 13 Jul 2023 22:14:35 +0800 Subject: tracing: Fix memory leak of iter->temp when reading trace_pipe kmemleak reports: unreferenced object 0xffff88814d14e200 (size 256): comm "cat", pid 336, jiffies 4294871818 (age 779.490s) hex dump (first 32 bytes): 04 00 01 03 00 00 00 00 08 00 00 00 00 00 00 00 ................ 0c d8 c8 9b ff ff ff ff 04 5a ca 9b ff ff ff ff .........Z......
backtrace: [] __kmalloc+0x4f/0x140 [] trace_find_next_entry+0xbb/0x1d0 [] trace_print_lat_context+0xaf/0x4e0 [] print_trace_line+0x3e0/0x950 [] tracing_read_pipe+0x2d9/0x5a0 [] vfs_read+0x143/0x520 [] ksys_read+0xbd/0x160 [] do_syscall_64+0x3f/0x90 [] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 When reading the file 'trace_pipe', 'iter->temp' is allocated or reallocated in trace_find_next_entry() but is not freed before 'trace_pipe' is closed. To fix it, free 'iter->temp' in tracing_release_pipe(). Link: https://lore.kernel.org/linux-trace-kernel/20230713141435.1133021-1-zhengyejian1@huawei.com Cc: stable@vger.kernel.org Fixes: ff895103a84ab ("tracing: Save off entry when peeking at next entry") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 20122eeccf97..be847d45d81c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6781,6 +6781,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) free_cpumask_var(iter->started); kfree(iter->fmt); + kfree(iter->temp); mutex_destroy(&iter->mutex); kfree(iter); -- cgit v1.2.3 From d5f28bb1ce04636b285726ee2a5afa3a514025f4 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 8 Jul 2023 01:38:03 +0900 Subject: fprobes: Add a comment why fprobe_kprobe_handler exits if kprobe is running Add a comment explaining why fprobe_kprobe_handler() exits if any other kprobe is running. Link: https://lore.kernel.org/all/168874788299.159442.2485957441413653858.stgit@devnote2/ Suggested-by: Steven Rostedt Link: https://lore.kernel.org/all/20230706120916.3c6abf15@gandalf.local.home/ Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/fprobe.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 2571f7f3d5f2..59321d22f43e 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -100,6 +100,12 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, return; } + /* + * This user handler is shared with other kprobes and is not expected to be + * called recursively. So if any other kprobe handler is running, this will + * exit as kprobe does. See the section 'Share the callbacks with kprobes' + * in Documentation/trace/fprobe.rst for more information. + */ if (unlikely(kprobe_running())) { fp->nmissed++; goto recursion_unlock; -- cgit v1.2.3 From 66bcf65d6cf0ca6540e2341e88ee7ef02dbdda08 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Jul 2023 23:15:29 +0900 Subject: tracing/probes: Fix to avoid double count of the string length on the array If an array is specified with ustring or symstr, the length of the strings is accumulated in both 'ret' and 'total', which means the length is double-counted. Just set the length as the 'ret' value to avoid double counting.
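In condensed form, the double counting comes from the interaction below (a simplified sketch of the stage3/array handling in trace_probe_tmpl.h; only the relevant lines are shown):

	case FETCH_OP_ST_USTRING:
		ret = fetch_store_strlen_user(val + code->offset);	/* was "ret +=" */
		code++;
		goto array;
	...
	array:	/* the last stage: Loop on array */
	if (code->op == FETCH_OP_LP_ARRAY) {
		/* 'ret' is already added to 'total' once per element here, so
		 * accumulating into 'ret' as well counts each string twice */
		total += ret;
		...
	}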
Link: https://lore.kernel.org/all/168908492917.123124.15076463491122036025.stgit@devnote2/ Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/8819b154-2ba1-43c3-98a2-cbde20892023@moroto.mountain/ Fixes: 88903c464321 ("tracing/probe: Add ustring type for user-space string") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace_probe_tmpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 00707630788d..4735c5cb76fa 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -156,11 +156,11 @@ stage3: code++; goto array; case FETCH_OP_ST_USTRING: - ret += fetch_store_strlen_user(val + code->offset); + ret = fetch_store_strlen_user(val + code->offset); code++; goto array; case FETCH_OP_ST_SYMSTR: - ret += fetch_store_symstrlen(val + code->offset); + ret = fetch_store_symstrlen(val + code->offset); code++; goto array; default: -- cgit v1.2.3 From b41326b5e0f82e93592c4366359917b5d67b529f Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Jul 2023 23:15:38 +0900 Subject: tracing/probes: Fix not to count error code to total length Fix not to count the error code (which is a negative value) into the total used length of the array, because it can mess up the return code of process_fetch_insn_bottom(). Also clear the 'ret' value because it will be used for calculating the next data_loc entry. Link: https://lore.kernel.org/all/168908493827.123124.2175257289106364229.stgit@devnote2/ Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/8819b154-2ba1-43c3-98a2-cbde20892023@moroto.mountain/ Fixes: 9b960a38835f ("tracing: probeevent: Unify fetch_insn processing common part") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace_probe_tmpl.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 4735c5cb76fa..ed9d57c6b041 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -204,6 +204,8 @@ stage3: array: /* the last stage: Loop on array */ if (code->op == FETCH_OP_LP_ARRAY) { + if (ret < 0) + ret = 0; total += ret; if (++i < code->param) { code = s3; -- cgit v1.2.3 From e38e2c6a9efc435f9de344b7c91f7697e01b47d5 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Jul 2023 23:15:48 +0900 Subject: tracing/probes: Fix to update dynamic data counter if fetcharg uses it Fix to update the dynamic data counter ('dyndata') and max length ('maxlen') only if the fetcharg uses the dynamic data. Also move arg->dynamic out of unlikely(). Without this fix, the dynamic data address becomes wrong if process_fetch_insn() returns an error in the !arg->dynamic case.
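For reference, the pre-fix flow (the removed lines in the diff that follows) shows where the counter went wrong:

	ret = process_fetch_insn(arg->code, rec, dl, base);
	if (unlikely(ret < 0 && arg->dynamic)) {
		*dl = make_data_loc(0, dyndata - base);
	} else {
		/* also reached with ret < 0 when the argument is NOT dynamic,
		 * so a negative 'ret' corrupted the dynamic data counter */
		dyndata += ret;
		maxlen -= ret;
	}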
Link: https://lore.kernel.org/all/168908494781.123124.8160245359962103684.stgit@devnote2/ Suggested-by: Steven Rostedt Link: https://lore.kernel.org/all/20230710233400.5aaf024e@gandalf.local.home/ Fixes: 9178412ddf5a ("tracing: probeevent: Return consumed bytes of dynamic area") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace_probe_tmpl.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index ed9d57c6b041..185da001f4c3 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -267,11 +267,13 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec, if (unlikely(arg->dynamic)) *dl = make_data_loc(maxlen, dyndata - base); ret = process_fetch_insn(arg->code, rec, dl, base); - if (unlikely(ret < 0 && arg->dynamic)) { - *dl = make_data_loc(0, dyndata - base); - } else { - dyndata += ret; - maxlen -= ret; + if (arg->dynamic) { + if (unlikely(ret < 0)) { + *dl = make_data_loc(0, dyndata - base); + } else { + dyndata += ret; + maxlen -= ret; + } } } } -- cgit v1.2.3 From 4ed8f337dee32df71435689c19d22e4ee846e15a Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Jul 2023 23:15:57 +0900 Subject: Revert "tracing: Add "(fault)" name injection to kernel probes" This reverts commit 2e9906f84fc7c99388bb7123ade167250d50f1c0. It turned out that commit 2e9906f84fc7 ("tracing: Add "(fault)" name injection to kernel probes") did not work correctly, and probe events still show just '(fault)' (instead of '"(fault)"'). Also, the current '(fault)' output is more explicit about the fact that it faulted. This also moves the FAULT_STRING macro to trace.h so that synthetic events can keep using it, and uses it in trace_probe.c too. Link: https://lore.kernel.org/all/168908495772.123124.1250788051922100079.stgit@devnote2/ Link: https://lore.kernel.org/all/20230706230642.3793a593@rorschach.local.home/ Cc: stable@vger.kernel.org Cc: Andrew Morton Cc: Tom Zanussi Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 2 ++ kernel/trace/trace_probe.c | 2 +- kernel/trace/trace_probe_kernel.h | 31 ++++++------------------------- 3 files changed, 9 insertions(+), 26 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 79bdefe9261b..eee1f3ca4749 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -113,6 +113,8 @@ enum trace_type { #define MEM_FAIL(condition, fmt, ...)
\ DO_ONCE_LITE_IF(condition, pr_err, "ERROR: " fmt, ##__VA_ARGS__) +#define FAULT_STRING "(fault)" + #define HIST_STACKTRACE_DEPTH 16 #define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) #define HIST_STACKTRACE_SKIP 5 diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 2d2616678295..591399ddcee5 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -65,7 +65,7 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, void *data, void *ent) int len = *(u32 *)data >> 16; if (!len) - trace_seq_puts(s, "(fault)"); + trace_seq_puts(s, FAULT_STRING); else trace_seq_printf(s, "\"%s\"", (const char *)get_loc_data(data, ent)); diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h index c4e1d4c03a85..6deae2ce34f8 100644 --- a/kernel/trace/trace_probe_kernel.h +++ b/kernel/trace/trace_probe_kernel.h @@ -2,8 +2,6 @@ #ifndef __TRACE_PROBE_KERNEL_H_ #define __TRACE_PROBE_KERNEL_H_ -#define FAULT_STRING "(fault)" - /* * This depends on trace_probe.h, but can not include it due to * the way trace_probe_tmpl.h is used by trace_kprobe.c and trace_eprobe.c. @@ -15,16 +13,8 @@ static nokprobe_inline int fetch_store_strlen_user(unsigned long addr) { const void __user *uaddr = (__force const void __user *)addr; - int ret; - ret = strnlen_user_nofault(uaddr, MAX_STRING_SIZE); - /* - * strnlen_user_nofault returns zero on fault, insert the - * FAULT_STRING when that occurs. - */ - if (ret <= 0) - return strlen(FAULT_STRING) + 1; - return ret; + return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); } /* Return the length of string -- including null terminal byte */ @@ -44,18 +34,7 @@ fetch_store_strlen(unsigned long addr) len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); - /* For faults, return enough to hold the FAULT_STRING */ - return (ret < 0) ? strlen(FAULT_STRING) + 1 : len; -} - -static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base, int len) -{ - if (ret >= 0) { - *(u32 *)dest = make_data_loc(ret, __dest - base); - } else { - strscpy(__dest, FAULT_STRING, len); - ret = strlen(__dest) + 1; - } + return (ret < 0) ? ret : len; } /* @@ -76,7 +55,8 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) __dest = get_loc_data(dest, base); ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); - set_data_loc(ret, dest, __dest, base, maxlen); + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, __dest - base); return ret; } @@ -107,7 +87,8 @@ fetch_store_string(unsigned long addr, void *dest, void *base) * probing. */ ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); - set_data_loc(ret, dest, __dest, base, maxlen); + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, __dest - base); return ret; } -- cgit v1.2.3 From 797311bce5c2ac90b8d65e357603cfd410d36ebb Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Jul 2023 23:16:07 +0900 Subject: tracing/probes: Fix to record 0-length data_loc in fetch_store_string*() if fails Fix to record 0-length data to data_loc in fetch_store_string*() if it fails to get the string data. Currently those expect that the data_loc is updated by store_trace_args() if it returns the error code. However, that does not work correctly if the argument is an array of strings. In that case, store_trace_args() only clears the first entry of the array (which may have no error) and leaves other entries. So it should be cleared by fetch_store_string*() itself. 
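To illustrate (a hypothetical two-element string array; the slot layout shown here is only illustrative):

	/* arg is an array of two strings; each element has its own data_loc slot */
	ret = process_fetch_insn(arg->code, rec, dl, base);	/* element 1 faults, ret < 0 */
	if (unlikely(ret < 0))
		*dl = make_data_loc(0, dyndata - base);		/* resets slot 0 only */
	/* slot 1 still holds whatever was stored before the fault, which is why
	 * fetch_store_string*() must record the 0-length data_loc itself */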
Also, 'dyndata' and 'maxlen' in store_trace_args() should be updated only if it is used (ret > 0 and argument is a dynamic data.) Link: https://lore.kernel.org/all/168908496683.123124.4761206188794205601.stgit@devnote2/ Fixes: 40b53b771806 ("tracing: probeevent: Add array type support") Cc: stable@vger.kernel.org Reviewed-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe_kernel.h | 13 +++++++++---- kernel/trace/trace_probe_tmpl.h | 10 +++------- kernel/trace/trace_uprobe.c | 3 ++- 3 files changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h index 6deae2ce34f8..bb723eefd7b7 100644 --- a/kernel/trace/trace_probe_kernel.h +++ b/kernel/trace/trace_probe_kernel.h @@ -37,6 +37,13 @@ fetch_store_strlen(unsigned long addr) return (ret < 0) ? ret : len; } +static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base) +{ + if (ret < 0) + ret = 0; + *(u32 *)dest = make_data_loc(ret, __dest - base); +} + /* * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf * with max length and relative data location. @@ -55,8 +62,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) __dest = get_loc_data(dest, base); ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); + set_data_loc(ret, dest, __dest, base); return ret; } @@ -87,8 +93,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base) * probing. */ ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); + set_data_loc(ret, dest, __dest, base); return ret; } diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 185da001f4c3..3935b347f874 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -267,13 +267,9 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec, if (unlikely(arg->dynamic)) *dl = make_data_loc(maxlen, dyndata - base); ret = process_fetch_insn(arg->code, rec, dl, base); - if (arg->dynamic) { - if (unlikely(ret < 0)) { - *dl = make_data_loc(0, dyndata - base); - } else { - dyndata += ret; - maxlen -= ret; - } + if (arg->dynamic && likely(ret > 0)) { + dyndata += ret; + maxlen -= ret; } } } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8b92e34ff0c8..7b47e9a2c010 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -170,7 +170,8 @@ fetch_store_string(unsigned long addr, void *dest, void *base) */ ret++; *(u32 *)dest = make_data_loc(ret, (void *)dst - base); - } + } else + *(u32 *)dest = make_data_loc(0, (void *)dst - base); return ret; } -- cgit v1.2.3