summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-01-21 21:52:03 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2025-01-21 21:52:03 +0300
commit6c4aa896ebee5edf2b35a9d071e5a468797f96d8 (patch)
treebe4307f37214a440755fb5791ca13a1da0499049 /include/linux
parenta6640c8c2fc029f015c87672585931c6106971c1 (diff)
parentb709eb872e19a19607bbb6d2975bc264d59735cf (diff)
downloadlinux-6c4aa896ebee5edf2b35a9d071e5a468797f96d8.tar.xz
Merge tag 'perf-core-2025-01-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull performance events updates from Ingo Molnar: "Seqlock optimizations that arose in a perf context and were merged into the perf tree: - seqlock: Add raw_seqcount_try_begin (Suren Baghdasaryan) - mm: Convert mm_lock_seq to a proper seqcount (Suren Baghdasaryan) - mm: Introduce mmap_lock_speculate_{try_begin|retry} (Suren Baghdasaryan) - mm/gup: Use raw_seqcount_try_begin() (Peter Zijlstra) Core perf enhancements: - Reduce 'struct page' footprint of perf by mapping pages in advance (Lorenzo Stoakes) - Save raw sample data conditionally based on sample type (Yabin Cui) - Reduce sampling overhead by checking sample_type in perf_sample_save_callchain() and perf_sample_save_brstack() (Yabin Cui) - Export perf_exclude_event() (Namhyung Kim) Uprobes scalability enhancements: (Andrii Nakryiko) - Simplify find_active_uprobe_rcu() VMA checks - Add speculative lockless VMA-to-inode-to-uprobe resolution - Simplify session consumer tracking - Decouple return_instance list traversal and freeing - Ensure return_instance is detached from the list before freeing - Reuse return_instances between multiple uretprobes within task - Guard against kmemdup() failing in dup_return_instance() AMD core PMU driver enhancements: - Relax privilege filter restriction on AMD IBS (Namhyung Kim) AMD RAPL energy counters support: (Dhananjay Ugwekar) - Introduce topology_logical_core_id() (K Prateek Nayak) - Remove the unused get_rapl_pmu_cpumask() function - Remove the cpu_to_rapl_pmu() function - Rename rapl_pmu variables - Make rapl_model struct global - Add arguments to the init and cleanup functions - Modify the generic variable names to *_pkg* - Remove the global variable rapl_msrs - Move the cntr_mask to rapl_pmus struct - Add core energy counter support for AMD CPUs Intel core PMU driver enhancements: - Support RDPMC 'metrics clear mode' feature (Kan Liang) - Clarify adaptive PEBS processing (Kan Liang) - Factor out functions for PEBS records processing (Kan Liang) - Simplify the PEBS records processing for adaptive PEBS (Kan Liang) Intel uncore driver enhancements: (Kan Liang) - Convert buggy pmu->func_id use to pmu->registered - Support more units on Granite Rapids" * tag 'perf-core-2025-01-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits) perf: map pages in advance perf/x86/intel/uncore: Support more units on Granite Rapids perf/x86/intel/uncore: Clean up func_id perf/x86/intel: Support RDPMC metrics clear mode uprobes: Guard against kmemdup() failing in dup_return_instance() perf/x86: Relax privilege filter restriction on AMD IBS perf/core: Export perf_exclude_event() uprobes: Reuse return_instances between multiple uretprobes within task uprobes: Ensure return_instance is detached from the list before freeing uprobes: Decouple return_instance list traversal and freeing uprobes: Simplify session consumer tracking uprobes: add speculative lockless VMA-to-inode-to-uprobe resolution uprobes: simplify find_active_uprobe_rcu() VMA checks mm: introduce mmap_lock_speculate_{try_begin|retry} mm: convert mm_lock_seq to a proper seqcount mm/gup: Use raw_seqcount_try_begin() seqlock: add raw_seqcount_try_begin perf/x86/rapl: Add core energy counter support for AMD CPUs perf/x86/rapl: Move the cntr_mask to rapl_pmus struct perf/x86/rapl: Remove the global variable rapl_msrs ...
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/mm.h12
-rw-r--r--include/linux/mm_types.h7
-rw-r--r--include/linux/mmap_lock.h84
-rw-r--r--include/linux/perf_event.h32
-rw-r--r--include/linux/seqlock.h22
-rw-r--r--include/linux/uprobes.h16
6 files changed, 138 insertions, 35 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b1c3db9cf355..f02925447e59 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -711,7 +711,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
* we don't rely on for anything - the mm_lock_seq read against which we
* need ordering is below.
*/
- if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
+ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
return false;
if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
@@ -728,7 +728,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
* after it has been unlocked.
* This pairs with RELEASE semantics in vma_end_write_all().
*/
- if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
+ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
up_read(&vma->vm_lock->lock);
return false;
}
@@ -743,7 +743,7 @@ static inline void vma_end_read(struct vm_area_struct *vma)
}
/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
-static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
+static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
{
mmap_assert_write_locked(vma->vm_mm);
@@ -751,7 +751,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
* current task is holding mmap_write_lock, both vma->vm_lock_seq and
* mm->mm_lock_seq can't be concurrently modified.
*/
- *mm_lock_seq = vma->vm_mm->mm_lock_seq;
+ *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
return (vma->vm_lock_seq == *mm_lock_seq);
}
@@ -762,7 +762,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
*/
static inline void vma_start_write(struct vm_area_struct *vma)
{
- int mm_lock_seq;
+ unsigned int mm_lock_seq;
if (__is_vma_write_locked(vma, &mm_lock_seq))
return;
@@ -780,7 +780,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
- int mm_lock_seq;
+ unsigned int mm_lock_seq;
VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 332cee285662..825c04b56403 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -727,7 +727,7 @@ struct vm_area_struct {
* counter reuse can only lead to occasional unnecessary use of the
* slowpath.
*/
- int vm_lock_seq;
+ unsigned int vm_lock_seq;
/* Unstable RCU readers are allowed to read this. */
struct vma_lock *vm_lock;
#endif
@@ -921,6 +921,9 @@ struct mm_struct {
* Roughly speaking, incrementing the sequence number is
* equivalent to releasing locks on VMAs; reading the sequence
* number can be part of taking a read lock on a VMA.
+ * Incremented every time mmap_lock is write-locked/unlocked.
+ * Initialized to 0, therefore odd values indicate mmap_lock
+ * is write-locked and even values that it's released.
*
* Can be modified under write mmap_lock using RELEASE
* semantics.
@@ -929,7 +932,7 @@ struct mm_struct {
* Can be read with ACQUIRE semantics if not holding write
* mmap_lock.
*/
- int mm_lock_seq;
+ seqcount_t mm_lock_seq;
#endif
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index de9dc20b01ba..45a21faa3ff6 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -71,39 +71,68 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm)
}
#ifdef CONFIG_PER_VMA_LOCK
-/*
- * Drop all currently-held per-VMA locks.
- * This is called from the mmap_lock implementation directly before releasing
- * a write-locked mmap_lock (or downgrading it to read-locked).
- * This should normally NOT be called manually from other places.
- * If you want to call this manually anyway, keep in mind that this will release
- * *all* VMA write locks, including ones from further up the stack.
- */
-static inline void vma_end_write_all(struct mm_struct *mm)
+
+static inline void mm_lock_seqcount_init(struct mm_struct *mm)
+{
+ seqcount_init(&mm->mm_lock_seq);
+}
+
+static inline void mm_lock_seqcount_begin(struct mm_struct *mm)
+{
+ do_raw_write_seqcount_begin(&mm->mm_lock_seq);
+}
+
+static inline void mm_lock_seqcount_end(struct mm_struct *mm)
+{
+ ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq);
+ do_raw_write_seqcount_end(&mm->mm_lock_seq);
+}
+
+static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
{
- mmap_assert_write_locked(mm);
/*
- * Nobody can concurrently modify mm->mm_lock_seq due to exclusive
- * mmap_lock being held.
- * We need RELEASE semantics here to ensure that preceding stores into
- * the VMA take effect before we unlock it with this store.
- * Pairs with ACQUIRE semantics in vma_start_read().
+ * Since mmap_lock is a sleeping lock, and waiting for it to become
+ * unlocked is more or less equivalent with taking it ourselves, don't
+ * bother with the speculative path if mmap_lock is already write-locked
+ * and take the slow path, which takes the lock.
*/
- smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1);
+ return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq);
+}
+
+static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
+{
+ return read_seqcount_retry(&mm->mm_lock_seq, seq);
}
-#else
-static inline void vma_end_write_all(struct mm_struct *mm) {}
-#endif
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
+static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {}
+static inline void mm_lock_seqcount_end(struct mm_struct *mm) {}
+
+static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
+{
+ return false;
+}
+
+static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
+{
+ return true;
+}
+
+#endif /* CONFIG_PER_VMA_LOCK */
static inline void mmap_init_lock(struct mm_struct *mm)
{
init_rwsem(&mm->mmap_lock);
+ mm_lock_seqcount_init(mm);
}
static inline void mmap_write_lock(struct mm_struct *mm)
{
__mmap_lock_trace_start_locking(mm, true);
down_write(&mm->mmap_lock);
+ mm_lock_seqcount_begin(mm);
__mmap_lock_trace_acquire_returned(mm, true, true);
}
@@ -111,6 +140,7 @@ static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
{
__mmap_lock_trace_start_locking(mm, true);
down_write_nested(&mm->mmap_lock, subclass);
+ mm_lock_seqcount_begin(mm);
__mmap_lock_trace_acquire_returned(mm, true, true);
}
@@ -120,10 +150,26 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm)
__mmap_lock_trace_start_locking(mm, true);
ret = down_write_killable(&mm->mmap_lock);
+ if (!ret)
+ mm_lock_seqcount_begin(mm);
__mmap_lock_trace_acquire_returned(mm, true, ret == 0);
return ret;
}
+/*
+ * Drop all currently-held per-VMA locks.
+ * This is called from the mmap_lock implementation directly before releasing
+ * a write-locked mmap_lock (or downgrading it to read-locked).
+ * This should normally NOT be called manually from other places.
+ * If you want to call this manually anyway, keep in mind that this will release
+ * *all* VMA write locks, including ones from further up the stack.
+ */
+static inline void vma_end_write_all(struct mm_struct *mm)
+{
+ mmap_assert_write_locked(mm);
+ mm_lock_seqcount_end(mm);
+}
+
static inline void mmap_write_unlock(struct mm_struct *mm)
{
__mmap_lock_trace_released(mm, true);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index cb99ec8c9e96..8333f132f4a9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1279,6 +1279,11 @@ static inline void perf_sample_save_callchain(struct perf_sample_data *data,
{
int size = 1;
+ if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
+ return;
+ if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_CALLCHAIN))
+ return;
+
data->callchain = perf_callchain(event, regs);
size += data->callchain->nr;
@@ -1287,12 +1292,18 @@ static inline void perf_sample_save_callchain(struct perf_sample_data *data,
}
static inline void perf_sample_save_raw_data(struct perf_sample_data *data,
+ struct perf_event *event,
struct perf_raw_record *raw)
{
struct perf_raw_frag *frag = &raw->frag;
u32 sum = 0;
int size;
+ if (!(event->attr.sample_type & PERF_SAMPLE_RAW))
+ return;
+ if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_RAW))
+ return;
+
do {
sum += frag->size;
if (perf_raw_frag_last(frag))
@@ -1309,6 +1320,11 @@ static inline void perf_sample_save_raw_data(struct perf_sample_data *data,
data->sample_flags |= PERF_SAMPLE_RAW;
}
+static inline bool has_branch_stack(struct perf_event *event)
+{
+ return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
+}
+
static inline void perf_sample_save_brstack(struct perf_sample_data *data,
struct perf_event *event,
struct perf_branch_stack *brs,
@@ -1316,6 +1332,11 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data,
{
int size = sizeof(u64); /* nr */
+ if (!has_branch_stack(event))
+ return;
+ if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_BRANCH_STACK))
+ return;
+
if (branch_sample_hw_index(event))
size += sizeof(u64);
size += brs->nr * sizeof(struct perf_branch_entry);
@@ -1669,6 +1690,8 @@ static inline int perf_allow_tracepoint(struct perf_event_attr *attr)
return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT);
}
+extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs);
+
extern void perf_event_init(void);
extern void perf_tp_event(u16 event_type, u64 count, void *record,
int entry_size, struct pt_regs *regs,
@@ -1705,11 +1728,6 @@ static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs)
# define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs)
#endif
-static inline bool has_branch_stack(struct perf_event *event)
-{
- return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
-}
-
static inline bool needs_branch_stack(struct perf_event *event)
{
return event->attr.branch_sample_type != 0;
@@ -1879,6 +1897,10 @@ static inline u64 perf_event_pause(struct perf_event *event, bool reset)
{
return 0;
}
+static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs)
+{
+ return 0;
+}
#endif
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index eb20dcaa51b5..d1a2346cf0f8 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -319,6 +319,28 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
})
/**
+ * raw_seqcount_try_begin() - begin a seqcount_t read critical section
+ * w/o lockdep and w/o counter stabilization
+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
+ *
+ * Similar to raw_seqcount_begin(), except it enables eliding the critical
+ * section entirely if odd, instead of doing the speculation knowing it will
+ * fail.
+ *
+ * Useful when counter stabilization is more or less equivalent to taking
+ * the lock and there is a slowpath that does that.
+ *
+ * If true, start will be set to the (even) sequence count read.
+ *
+ * Return: true when a read critical section is started.
+ */
+#define raw_seqcount_try_begin(s, start) \
+({ \
+ start = raw_read_seqcount(s); \
+ !(start & 1); \
+})
+
+/**
* raw_seqcount_begin() - begin a seqcount_t read critical section w/o
* lockdep and w/o counter stabilization
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index e0a4c2082245..b1df7d792fa1 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -16,6 +16,7 @@
#include <linux/types.h>
#include <linux/wait.h>
#include <linux/timer.h>
+#include <linux/seqlock.h>
struct uprobe;
struct vm_area_struct;
@@ -124,6 +125,10 @@ struct uprobe_task {
unsigned int depth;
struct return_instance *return_instances;
+ struct return_instance *ri_pool;
+ struct timer_list ri_timer;
+ seqcount_t ri_seqcount;
+
union {
struct {
struct arch_uprobe_task autask;
@@ -137,7 +142,6 @@ struct uprobe_task {
};
struct uprobe *active_uprobe;
- struct timer_list ri_timer;
unsigned long xol_vaddr;
struct arch_uprobe *auprobe;
@@ -154,12 +158,18 @@ struct return_instance {
unsigned long stack; /* stack pointer */
unsigned long orig_ret_vaddr; /* original return address */
bool chained; /* true, if instance is nested */
- int consumers_cnt;
+ int cons_cnt; /* total number of session consumers */
struct return_instance *next; /* keep as stack */
struct rcu_head rcu;
- struct return_consumer consumers[] __counted_by(consumers_cnt);
+ /* singular pre-allocated return_consumer instance for common case */
+ struct return_consumer consumer;
+ /*
+ * extra return_consumer instances for rare cases of multiple session consumers,
+ * contains (cons_cnt - 1) elements
+ */
+ struct return_consumer *extra_consumers;
} ____cacheline_aligned;
enum rp_check {