diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-01-21 21:52:03 +0300 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-01-21 21:52:03 +0300 |
| commit | 6c4aa896ebee5edf2b35a9d071e5a468797f96d8 (patch) | |
| tree | be4307f37214a440755fb5791ca13a1da0499049 /include/linux | |
| parent | a6640c8c2fc029f015c87672585931c6106971c1 (diff) | |
| parent | b709eb872e19a19607bbb6d2975bc264d59735cf (diff) | |
| download | linux-6c4aa896ebee5edf2b35a9d071e5a468797f96d8.tar.xz | |
Merge tag 'perf-core-2025-01-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull performance events updates from Ingo Molnar:
"Seqlock optimizations that arose in a perf context and were merged
into the perf tree:
- seqlock: Add raw_seqcount_try_begin (Suren Baghdasaryan)
- mm: Convert mm_lock_seq to a proper seqcount (Suren Baghdasaryan)
- mm: Introduce mmap_lock_speculate_{try_begin|retry} (Suren
Baghdasaryan)
- mm/gup: Use raw_seqcount_try_begin() (Peter Zijlstra)
Core perf enhancements:
- Reduce 'struct page' footprint of perf by mapping pages in advance
(Lorenzo Stoakes)
- Save raw sample data conditionally based on sample type (Yabin Cui)
- Reduce sampling overhead by checking sample_type in
perf_sample_save_callchain() and perf_sample_save_brstack() (Yabin
Cui)
- Export perf_exclude_event() (Namhyung Kim)
Uprobes scalability enhancements: (Andrii Nakryiko)
- Simplify find_active_uprobe_rcu() VMA checks
- Add speculative lockless VMA-to-inode-to-uprobe resolution
- Simplify session consumer tracking
- Decouple return_instance list traversal and freeing
- Ensure return_instance is detached from the list before freeing
- Reuse return_instances between multiple uretprobes within task
- Guard against kmemdup() failing in dup_return_instance()
AMD core PMU driver enhancements:
- Relax privilege filter restriction on AMD IBS (Namhyung Kim)
AMD RAPL energy counters support: (Dhananjay Ugwekar)
- Introduce topology_logical_core_id() (K Prateek Nayak)
- Remove the unused get_rapl_pmu_cpumask() function
- Remove the cpu_to_rapl_pmu() function
- Rename rapl_pmu variables
- Make rapl_model struct global
- Add arguments to the init and cleanup functions
- Modify the generic variable names to *_pkg*
- Remove the global variable rapl_msrs
- Move the cntr_mask to rapl_pmus struct
- Add core energy counter support for AMD CPUs
Intel core PMU driver enhancements:
- Support RDPMC 'metrics clear mode' feature (Kan Liang)
- Clarify adaptive PEBS processing (Kan Liang)
- Factor out functions for PEBS records processing (Kan Liang)
- Simplify the PEBS records processing for adaptive PEBS (Kan Liang)
Intel uncore driver enhancements: (Kan Liang)
- Convert buggy pmu->func_id use to pmu->registered
- Support more units on Granite Rapids"
* tag 'perf-core-2025-01-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
perf: map pages in advance
perf/x86/intel/uncore: Support more units on Granite Rapids
perf/x86/intel/uncore: Clean up func_id
perf/x86/intel: Support RDPMC metrics clear mode
uprobes: Guard against kmemdup() failing in dup_return_instance()
perf/x86: Relax privilege filter restriction on AMD IBS
perf/core: Export perf_exclude_event()
uprobes: Reuse return_instances between multiple uretprobes within task
uprobes: Ensure return_instance is detached from the list before freeing
uprobes: Decouple return_instance list traversal and freeing
uprobes: Simplify session consumer tracking
uprobes: add speculative lockless VMA-to-inode-to-uprobe resolution
uprobes: simplify find_active_uprobe_rcu() VMA checks
mm: introduce mmap_lock_speculate_{try_begin|retry}
mm: convert mm_lock_seq to a proper seqcount
mm/gup: Use raw_seqcount_try_begin()
seqlock: add raw_seqcount_try_begin
perf/x86/rapl: Add core energy counter support for AMD CPUs
perf/x86/rapl: Move the cntr_mask to rapl_pmus struct
perf/x86/rapl: Remove the global variable rapl_msrs
...
Diffstat (limited to 'include/linux')
| -rw-r--r-- | include/linux/mm.h | 12 | ||||
| -rw-r--r-- | include/linux/mm_types.h | 7 | ||||
| -rw-r--r-- | include/linux/mmap_lock.h | 84 | ||||
| -rw-r--r-- | include/linux/perf_event.h | 32 | ||||
| -rw-r--r-- | include/linux/seqlock.h | 22 | ||||
| -rw-r--r-- | include/linux/uprobes.h | 16 |
6 files changed, 138 insertions, 35 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h index b1c3db9cf355..f02925447e59 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -711,7 +711,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) return false; if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) @@ -728,7 +728,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ - if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { up_read(&vma->vm_lock->lock); return false; } @@ -743,7 +743,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) +static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -751,7 +751,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ - *mm_lock_seq = vma->vm_mm->mm_lock_seq; + *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; return (vma->vm_lock_seq == *mm_lock_seq); } @@ -762,7 +762,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) */ static inline void vma_start_write(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; if (__is_vma_write_locked(vma, &mm_lock_seq)) return; @@ -780,7 +780,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) static inline void vma_assert_write_locked(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 332cee285662..825c04b56403 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -727,7 +727,7 @@ struct vm_area_struct { * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ - int vm_lock_seq; + unsigned int vm_lock_seq; /* Unstable RCU readers are allowed to read this. */ struct vma_lock *vm_lock; #endif @@ -921,6 +921,9 @@ struct mm_struct { * Roughly speaking, incrementing the sequence number is * equivalent to releasing locks on VMAs; reading the sequence * number can be part of taking a read lock on a VMA. + * Incremented every time mmap_lock is write-locked/unlocked. + * Initialized to 0, therefore odd values indicate mmap_lock + * is write-locked and even values that it's released. * * Can be modified under write mmap_lock using RELEASE * semantics. @@ -929,7 +932,7 @@ struct mm_struct { * Can be read with ACQUIRE semantics if not holding write * mmap_lock. */ - int mm_lock_seq; + seqcount_t mm_lock_seq; #endif diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index de9dc20b01ba..45a21faa3ff6 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -71,39 +71,68 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK -/* - * Drop all currently-held per-VMA locks. - * This is called from the mmap_lock implementation directly before releasing - * a write-locked mmap_lock (or downgrading it to read-locked). - * This should normally NOT be called manually from other places. - * If you want to call this manually anyway, keep in mind that this will release - * *all* VMA write locks, including ones from further up the stack. - */ -static inline void vma_end_write_all(struct mm_struct *mm) + +static inline void mm_lock_seqcount_init(struct mm_struct *mm) +{ + seqcount_init(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) +{ + do_raw_write_seqcount_begin(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_end(struct mm_struct *mm) +{ + ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq); + do_raw_write_seqcount_end(&mm->mm_lock_seq); +} + +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) { - mmap_assert_write_locked(mm); /* - * Nobody can concurrently modify mm->mm_lock_seq due to exclusive - * mmap_lock being held. - * We need RELEASE semantics here to ensure that preceding stores into - * the VMA take effect before we unlock it with this store. - * Pairs with ACQUIRE semantics in vma_start_read(). + * Since mmap_lock is a sleeping lock, and waiting for it to become + * unlocked is more or less equivalent with taking it ourselves, don't + * bother with the speculative path if mmap_lock is already write-locked + * and take the slow path, which takes the lock. */ - smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1); + return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq); +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return read_seqcount_retry(&mm->mm_lock_seq, seq); } -#else -static inline void vma_end_write_all(struct mm_struct *mm) {} -#endif + +#else /* CONFIG_PER_VMA_LOCK */ + +static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} + +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + return false; +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return true; +} + +#endif /* CONFIG_PER_VMA_LOCK */ static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); + mm_lock_seqcount_init(mm); } static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); down_write(&mm->mmap_lock); + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } @@ -111,6 +140,7 @@ static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { __mmap_lock_trace_start_locking(mm, true); down_write_nested(&mm->mmap_lock, subclass); + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } @@ -120,10 +150,26 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm) __mmap_lock_trace_start_locking(mm, true); ret = down_write_killable(&mm->mmap_lock); + if (!ret) + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, ret == 0); return ret; } +/* + * Drop all currently-held per-VMA locks. + * This is called from the mmap_lock implementation directly before releasing + * a write-locked mmap_lock (or downgrading it to read-locked). + * This should normally NOT be called manually from other places. + * If you want to call this manually anyway, keep in mind that this will release + * *all* VMA write locks, including ones from further up the stack. + */ +static inline void vma_end_write_all(struct mm_struct *mm) +{ + mmap_assert_write_locked(mm); + mm_lock_seqcount_end(mm); +} + static inline void mmap_write_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, true); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index cb99ec8c9e96..8333f132f4a9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1279,6 +1279,11 @@ static inline void perf_sample_save_callchain(struct perf_sample_data *data, { int size = 1; + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) + return; + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) + return; + data->callchain = perf_callchain(event, regs); size += data->callchain->nr; @@ -1287,12 +1292,18 @@ static inline void perf_sample_save_callchain(struct perf_sample_data *data, } static inline void perf_sample_save_raw_data(struct perf_sample_data *data, + struct perf_event *event, struct perf_raw_record *raw) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; int size; + if (!(event->attr.sample_type & PERF_SAMPLE_RAW)) + return; + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_RAW)) + return; + do { sum += frag->size; if (perf_raw_frag_last(frag)) @@ -1309,6 +1320,11 @@ static inline void perf_sample_save_raw_data(struct perf_sample_data *data, data->sample_flags |= PERF_SAMPLE_RAW; } +static inline bool has_branch_stack(struct perf_event *event) +{ + return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; +} + static inline void perf_sample_save_brstack(struct perf_sample_data *data, struct perf_event *event, struct perf_branch_stack *brs, @@ -1316,6 +1332,11 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data, { int size = sizeof(u64); /* nr */ + if (!has_branch_stack(event)) + return; + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_BRANCH_STACK)) + return; + if (branch_sample_hw_index(event)) size += sizeof(u64); size += brs->nr * sizeof(struct perf_branch_entry); @@ -1669,6 +1690,8 @@ static inline int perf_allow_tracepoint(struct perf_event_attr *attr) return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); } +extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs); + extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, @@ -1705,11 +1728,6 @@ static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) # define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) #endif -static inline bool has_branch_stack(struct perf_event *event) -{ - return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; -} - static inline bool needs_branch_stack(struct perf_event *event) { return event->attr.branch_sample_type != 0; @@ -1879,6 +1897,10 @@ static inline u64 perf_event_pause(struct perf_event *event, bool reset) { return 0; } +static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) +{ + return 0; +} #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index eb20dcaa51b5..d1a2346cf0f8 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -319,6 +319,28 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) }) /** + * raw_seqcount_try_begin() - begin a seqcount_t read critical section + * w/o lockdep and w/o counter stabilization + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants + * + * Similar to raw_seqcount_begin(), except it enables eliding the critical + * section entirely if odd, instead of doing the speculation knowing it will + * fail. + * + * Useful when counter stabilization is more or less equivalent to taking + * the lock and there is a slowpath that does that. + * + * If true, start will be set to the (even) sequence count read. + * + * Return: true when a read critical section is started. + */ +#define raw_seqcount_try_begin(s, start) \ +({ \ + start = raw_read_seqcount(s); \ + !(start & 1); \ +}) + +/** * raw_seqcount_begin() - begin a seqcount_t read critical section w/o * lockdep and w/o counter stabilization * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index e0a4c2082245..b1df7d792fa1 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -16,6 +16,7 @@ #include <linux/types.h> #include <linux/wait.h> #include <linux/timer.h> +#include <linux/seqlock.h> struct uprobe; struct vm_area_struct; @@ -124,6 +125,10 @@ struct uprobe_task { unsigned int depth; struct return_instance *return_instances; + struct return_instance *ri_pool; + struct timer_list ri_timer; + seqcount_t ri_seqcount; + union { struct { struct arch_uprobe_task autask; @@ -137,7 +142,6 @@ struct uprobe_task { }; struct uprobe *active_uprobe; - struct timer_list ri_timer; unsigned long xol_vaddr; struct arch_uprobe *auprobe; @@ -154,12 +158,18 @@ struct return_instance { unsigned long stack; /* stack pointer */ unsigned long orig_ret_vaddr; /* original return address */ bool chained; /* true, if instance is nested */ - int consumers_cnt; + int cons_cnt; /* total number of session consumers */ struct return_instance *next; /* keep as stack */ struct rcu_head rcu; - struct return_consumer consumers[] __counted_by(consumers_cnt); + /* singular pre-allocated return_consumer instance for common case */ + struct return_consumer consumer; + /* + * extra return_consumer instances for rare cases of multiple session consumers, + * contains (cons_cnt - 1) elements + */ + struct return_consumer *extra_consumers; } ____cacheline_aligned; enum rp_check { |
