summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2025-09-30 21:11:21 +0300
committer Linus Torvalds <torvalds@linux-foundation.org> 2025-09-30 21:11:21 +0300
commit e4dcbdff114e2c0a8059c396e233aa5d9637afce (patch)
tree 6fb072aa7dabaa12eecae8d9522d9832b110d093 /arch
parent 6c7340a7a8d2b6ecad1ad108f6daa73ba1dc082f (diff)
parent 6d48436560e91be858158e227f21aab71698814e (diff)
download linux-e4dcbdff114e2c0a8059c396e233aa5d9637afce.tar.xz
Merge tag 'perf-core-2025-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull performance events updates from Ingo Molnar: "Core perf code updates: - Convert mmap() related reference counts to refcount_t. This is in reaction to the recently fixed refcount bugs, which could have been detected earlier and could have mitigated the bug somewhat (Thomas Gleixner, Peter Zijlstra) - Clean up and simplify the callchain code, in preparation for sframes (Steven Rostedt, Josh Poimboeuf) Uprobes updates: - Add support to optimize usdt probes on x86-64, which gives a substantial speedup (Jiri Olsa) - Cleanups and fixes on x86 (Peter Zijlstra) PMU driver updates: - Various optimizations and fixes to the Intel PMU driver (Dapeng Mi) Misc cleanups and fixes: - Remove redundant __GFP_NOWARN (Qianfeng Rong)" * tag 'perf-core-2025-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits) selftests/bpf: Fix uprobe_sigill test for uprobe syscall error value uprobes/x86: Return error from uprobe syscall when not called from trampoline perf: Skip user unwind if the task is a kernel thread perf: Simplify get_perf_callchain() user logic perf: Use current->flags & PF_KTHREAD|PF_USER_WORKER instead of current->mm == NULL perf: Have get_perf_callchain() return NULL if crosstask and user are set perf: Remove get_perf_callchain() init_nr argument perf/x86: Print PMU counters bitmap in x86_pmu_show_pmu_cap() perf/x86/intel: Add ICL_FIXED_0_ADAPTIVE bit into INTEL_FIXED_BITS_MASK perf/x86/intel: Change macro GLOBAL_CTRL_EN_PERF_METRICS to BIT_ULL(48) perf/x86: Add PERF_CAP_PEBS_TIMING_INFO flag perf/x86/intel: Fix IA32_PMC_x_CFG_B MSRs access error perf/x86/intel: Use early_initcall() to hook bts_init() uprobes: Remove redundant __GFP_NOWARN selftests/seccomp: validate uprobe syscall passes through seccomp seccomp: passthrough uprobe systemcall without filtering selftests/bpf: Fix uprobe syscall shadow stack test selftests/bpf: Change test_uretprobe_regs_change for uprobe and uretprobe selftests/bpf: Add uprobe_regs_equal test selftests/bpf: Add 
optimized usdt variant for basic usdt test ...
Diffstat (limited to 'arch')
-rw-r--r-- arch/arm/probes/uprobes/core.c 2
-rw-r--r-- arch/x86/entry/syscalls/syscall_64.tbl 1
-rw-r--r-- arch/x86/events/core.c 16
-rw-r--r-- arch/x86/events/intel/bts.c 2
-rw-r--r-- arch/x86/events/intel/core.c 21
-rw-r--r-- arch/x86/include/asm/msr-index.h 14
-rw-r--r-- arch/x86/include/asm/perf_event.h 8
-rw-r--r-- arch/x86/include/asm/shstk.h 4
-rw-r--r-- arch/x86/include/asm/uprobes.h 7
-rw-r--r-- arch/x86/kernel/shstk.c 40
-rw-r--r-- arch/x86/kernel/uprobes.c 635
-rw-r--r-- arch/x86/kvm/pmu.h 2
12 files changed, 700 insertions, 52 deletions
diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c
index 885e0c5e8c20..3d96fb41d624 100644
--- a/arch/arm/probes/uprobes/core.c
+++ b/arch/arm/probes/uprobes/core.c
@@ -30,7 +30,7 @@ int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
unsigned long vaddr)
{
return uprobe_write_opcode(auprobe, vma, vaddr,
- __opcode_to_mem_arm(auprobe->bpinsn));
+ __opcode_to_mem_arm(auprobe->bpinsn), true);
}
bool arch_uprobe_ignore(struct arch_uprobe *auprobe, struct pt_regs *regs)
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 92cf0fe2291e..ced2a1deecd7 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -345,6 +345,7 @@
333 common io_pgetevents sys_io_pgetevents
334 common rseq sys_rseq
335 common uretprobe sys_uretprobe
+336 common uprobe sys_uprobe
# don't use numbers 387 through 423, add new calls after the last
# 'common' entry
424 common pidfd_send_signal sys_pidfd_send_signal
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 7610f26dfbd9..745caa6c15a3 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2069,13 +2069,15 @@ static void _x86_pmu_read(struct perf_event *event)
void x86_pmu_show_pmu_cap(struct pmu *pmu)
{
- pr_info("... version: %d\n", x86_pmu.version);
- pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
- pr_info("... generic registers: %d\n", x86_pmu_num_counters(pmu));
- pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
- pr_info("... max period: %016Lx\n", x86_pmu.max_period);
- pr_info("... fixed-purpose events: %d\n", x86_pmu_num_counters_fixed(pmu));
- pr_info("... event mask: %016Lx\n", hybrid(pmu, intel_ctrl));
+ pr_info("... version: %d\n", x86_pmu.version);
+ pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
+ pr_info("... generic counters: %d\n", x86_pmu_num_counters(pmu));
+ pr_info("... generic bitmap: %016llx\n", hybrid(pmu, cntr_mask64));
+ pr_info("... fixed-purpose counters: %d\n", x86_pmu_num_counters_fixed(pmu));
+ pr_info("... fixed-purpose bitmap: %016llx\n", hybrid(pmu, fixed_cntr_mask64));
+ pr_info("... value mask: %016llx\n", x86_pmu.cntval_mask);
+ pr_info("... max period: %016llx\n", x86_pmu.max_period);
+ pr_info("... global_ctrl mask: %016llx\n", hybrid(pmu, intel_ctrl));
}
static int __init init_hw_perf_events(void)
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 61da6b8a3d51..cbac54cb3a9e 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -643,4 +643,4 @@ static __init int bts_init(void)
return perf_pmu_register(&bts_pmu, "intel_bts", -1);
}
-arch_initcall(bts_init);
+early_initcall(bts_init);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index c2fb729c270e..28f5468a6ea3 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2845,8 +2845,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- u64 mask, bits = 0;
int idx = hwc->idx;
+ u64 bits = 0;
if (is_topdown_idx(idx)) {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -2885,14 +2885,10 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
idx -= INTEL_PMC_IDX_FIXED;
bits = intel_fixed_bits_by_idx(idx, bits);
- mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
-
- if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) {
+ if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip)
bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE);
- mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE);
- }
- cpuc->fixed_ctrl_val &= ~mask;
+ cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
cpuc->fixed_ctrl_val |= bits;
}
@@ -2997,7 +2993,8 @@ static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc)
if (event->group_leader != leader->group_leader)
break;
for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) {
- if (WARN_ON_ONCE(i + idx > cpuc->n_events))
+ if (i + idx >= cpuc->n_events ||
+ !is_acr_event_group(cpuc->event_list[i + idx]))
return;
__set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1);
}
@@ -5318,9 +5315,9 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu)
0, x86_pmu_num_counters(&pmu->pmu), 0, 0);
if (pmu->intel_cap.perf_metrics)
- pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
+ pmu->intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS;
else
- pmu->intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS);
+ pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS;
intel_pmu_check_event_constraints(pmu->event_constraints,
pmu->cntr_mask64,
@@ -5455,7 +5452,7 @@ static void intel_pmu_cpu_starting(int cpu)
rdmsrq(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities);
if (!perf_cap.perf_metrics) {
x86_pmu.intel_cap.perf_metrics = 0;
- x86_pmu.intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS);
+ x86_pmu.intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS;
}
}
@@ -7789,7 +7786,7 @@ __init int intel_pmu_init(void)
}
if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics)
- x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
+ x86_pmu.intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS;
if (x86_pmu.intel_cap.pebs_timing_info)
x86_pmu.flags |= PMU_FL_RETIRE_LATENCY;
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index b65c3ba5fa14..f627196eb796 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -315,12 +315,14 @@
#define PERF_CAP_PT_IDX 16
#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
-#define PERF_CAP_PEBS_TRAP BIT_ULL(6)
-#define PERF_CAP_ARCH_REG BIT_ULL(7)
-#define PERF_CAP_PEBS_FORMAT 0xf00
-#define PERF_CAP_PEBS_BASELINE BIT_ULL(14)
-#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
- PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE)
+#define PERF_CAP_PEBS_TRAP BIT_ULL(6)
+#define PERF_CAP_ARCH_REG BIT_ULL(7)
+#define PERF_CAP_PEBS_FORMAT 0xf00
+#define PERF_CAP_PEBS_BASELINE BIT_ULL(14)
+#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17)
+#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
+ PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \
+ PERF_CAP_PEBS_TIMING_INFO)
#define MSR_IA32_RTIT_CTL 0x00000570
#define RTIT_CTL_TRACEEN BIT(0)
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 70d1d94aca7e..49a4d442f3fc 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -35,7 +35,6 @@
#define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36)
#define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40)
-#define INTEL_FIXED_BITS_MASK 0xFULL
#define INTEL_FIXED_BITS_STRIDE 4
#define INTEL_FIXED_0_KERNEL (1ULL << 0)
#define INTEL_FIXED_0_USER (1ULL << 1)
@@ -48,6 +47,11 @@
#define ICL_EVENTSEL_ADAPTIVE (1ULL << 34)
#define ICL_FIXED_0_ADAPTIVE (1ULL << 32)
+#define INTEL_FIXED_BITS_MASK \
+ (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | \
+ INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI | \
+ ICL_FIXED_0_ADAPTIVE)
+
#define intel_fixed_bits_by_idx(_idx, _bits) \
((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE))
@@ -430,7 +434,7 @@ static inline bool is_topdown_idx(int idx)
#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT)
#define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48
-#define GLOBAL_CTRL_EN_PERF_METRICS 48
+#define GLOBAL_CTRL_EN_PERF_METRICS BIT_ULL(48)
/*
* We model guest LBR event tracing as another fixed-mode PMC like BTS.
*
diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h
index 0f50e0125943..fc7dcec58fd4 100644
--- a/arch/x86/include/asm/shstk.h
+++ b/arch/x86/include/asm/shstk.h
@@ -23,6 +23,8 @@ int setup_signal_shadow_stack(struct ksignal *ksig);
int restore_signal_shadow_stack(void);
int shstk_update_last_frame(unsigned long val);
bool shstk_is_enabled(void);
+int shstk_pop(u64 *val);
+int shstk_push(u64 val);
#else
static inline long shstk_prctl(struct task_struct *task, int option,
unsigned long arg2) { return -EINVAL; }
@@ -35,6 +37,8 @@ static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; }
static inline int restore_signal_shadow_stack(void) { return 0; }
static inline int shstk_update_last_frame(unsigned long val) { return 0; }
static inline bool shstk_is_enabled(void) { return false; }
+static inline int shstk_pop(u64 *val) { return -ENOTSUPP; }
+static inline int shstk_push(u64 val) { return -ENOTSUPP; }
#endif /* CONFIG_X86_USER_SHADOW_STACK */
#endif /* __ASSEMBLER__ */
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 678fb546f0a7..1ee2e5115955 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t;
#define UPROBE_SWBP_INSN 0xcc
#define UPROBE_SWBP_INSN_SIZE 1
+enum {
+ ARCH_UPROBE_FLAG_CAN_OPTIMIZE = 0,
+ ARCH_UPROBE_FLAG_OPTIMIZE_FAIL = 1,
+};
+
struct uprobe_xol_ops;
struct arch_uprobe {
@@ -45,6 +50,8 @@ struct arch_uprobe {
u8 ilen;
} push;
};
+
+ unsigned long flags;
};
struct arch_uprobe_task {
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 5eba6c5a6775..978232b6d48d 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -246,6 +246,46 @@ static unsigned long get_user_shstk_addr(void)
return ssp;
}
+int shstk_pop(u64 *val)
+{
+ int ret = 0;
+ u64 ssp;
+
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return -ENOTSUPP;
+
+ fpregs_lock_and_load();
+
+ rdmsrq(MSR_IA32_PL3_SSP, ssp);
+ if (val && get_user(*val, (__user u64 *)ssp))
+ ret = -EFAULT;
+ else
+ wrmsrq(MSR_IA32_PL3_SSP, ssp + SS_FRAME_SIZE);
+ fpregs_unlock();
+
+ return ret;
+}
+
+int shstk_push(u64 val)
+{
+ u64 ssp;
+ int ret;
+
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return -ENOTSUPP;
+
+ fpregs_lock_and_load();
+
+ rdmsrq(MSR_IA32_PL3_SSP, ssp);
+ ssp -= SS_FRAME_SIZE;
+ ret = write_user_shstk_64((__user void *)ssp, val);
+ if (!ret)
+ wrmsrq(MSR_IA32_PL3_SSP, ssp);
+ fpregs_unlock();
+
+ return ret;
+}
+
#define SHSTK_DATA_BIT BIT(63)
static int put_shstk_data(u64 __user *addr, u64 data)
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6d383839e839..845aeaf36b8d 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -18,6 +18,7 @@
#include <asm/processor.h>
#include <asm/insn.h>
#include <asm/mmu_context.h>
+#include <asm/nops.h>
/* Post-execution fixups. */
@@ -310,25 +311,32 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
#ifdef CONFIG_X86_64
+struct uretprobe_syscall_args {
+ unsigned long r11;
+ unsigned long cx;
+ unsigned long ax;
+};
+
asm (
".pushsection .rodata\n"
".global uretprobe_trampoline_entry\n"
"uretprobe_trampoline_entry:\n"
- "pushq %rax\n"
- "pushq %rcx\n"
- "pushq %r11\n"
- "movq $" __stringify(__NR_uretprobe) ", %rax\n"
+ "push %rax\n"
+ "push %rcx\n"
+ "push %r11\n"
+ "mov $" __stringify(__NR_uretprobe) ", %rax\n"
"syscall\n"
".global uretprobe_syscall_check\n"
"uretprobe_syscall_check:\n"
- "popq %r11\n"
- "popq %rcx\n"
-
- /* The uretprobe syscall replaces stored %rax value with final
+ "pop %r11\n"
+ "pop %rcx\n"
+ /*
+ * The uretprobe syscall replaces stored %rax value with final
* return address, so we don't restore %rax in here and just
* call ret.
*/
- "retq\n"
+ "ret\n"
+ "int3\n"
".global uretprobe_trampoline_end\n"
"uretprobe_trampoline_end:\n"
".popsection\n"
@@ -338,7 +346,7 @@ extern u8 uretprobe_trampoline_entry[];
extern u8 uretprobe_trampoline_end[];
extern u8 uretprobe_syscall_check[];
-void *arch_uprobe_trampoline(unsigned long *psize)
+void *arch_uretprobe_trampoline(unsigned long *psize)
{
static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
struct pt_regs *regs = task_pt_regs(current);
@@ -365,7 +373,8 @@ static unsigned long trampoline_check_ip(unsigned long tramp)
SYSCALL_DEFINE0(uretprobe)
{
struct pt_regs *regs = task_pt_regs(current);
- unsigned long err, ip, sp, r11_cx_ax[3], tramp;
+ struct uretprobe_syscall_args args;
+ unsigned long err, ip, sp, tramp;
/* If there's no trampoline, we are called from wrong place. */
tramp = uprobe_get_trampoline_vaddr();
@@ -376,15 +385,15 @@ SYSCALL_DEFINE0(uretprobe)
if (unlikely(regs->ip != trampoline_check_ip(tramp)))
goto sigill;
- err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax));
+ err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
if (err)
goto sigill;
/* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */
- regs->r11 = r11_cx_ax[0];
- regs->cx = r11_cx_ax[1];
- regs->ax = r11_cx_ax[2];
- regs->sp += sizeof(r11_cx_ax);
+ regs->r11 = args.r11;
+ regs->cx = args.cx;
+ regs->ax = args.ax;
+ regs->sp += sizeof(args);
regs->orig_ax = -1;
ip = regs->ip;
@@ -400,21 +409,21 @@ SYSCALL_DEFINE0(uretprobe)
*/
if (regs->sp != sp || shstk_is_enabled())
return regs->ax;
- regs->sp -= sizeof(r11_cx_ax);
+ regs->sp -= sizeof(args);
/* for the case uprobe_consumer has changed r11/cx */
- r11_cx_ax[0] = regs->r11;
- r11_cx_ax[1] = regs->cx;
+ args.r11 = regs->r11;
+ args.cx = regs->cx;
/*
* ax register is passed through as return value, so we can use
* its space on stack for ip value and jump to it through the
* trampoline's ret instruction
*/
- r11_cx_ax[2] = regs->ip;
+ args.ax = regs->ip;
regs->ip = ip;
- err = copy_to_user((void __user *)regs->sp, r11_cx_ax, sizeof(r11_cx_ax));
+ err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
if (err)
goto sigill;
@@ -608,6 +617,581 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
*sr = utask->autask.saved_scratch_register;
}
}
+
+static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
+{
+ return -EPERM;
+}
+
+static struct page *tramp_mapping_pages[2] __ro_after_init;
+
+static struct vm_special_mapping tramp_mapping = {
+ .name = "[uprobes-trampoline]",
+ .mremap = tramp_mremap,
+ .pages = tramp_mapping_pages,
+};
+
+struct uprobe_trampoline {
+ struct hlist_node node;
+ unsigned long vaddr;
+};
+
+static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr)
+{
+ long delta = (long)(vaddr + 5 - vtramp);
+
+ return delta >= INT_MIN && delta <= INT_MAX;
+}
+
+static unsigned long find_nearest_trampoline(unsigned long vaddr)
+{
+ struct vm_unmapped_area_info info = {
+ .length = PAGE_SIZE,
+ .align_mask = ~PAGE_MASK,
+ };
+ unsigned long low_limit, high_limit;
+ unsigned long low_tramp, high_tramp;
+ unsigned long call_end = vaddr + 5;
+
+ if (check_add_overflow(call_end, INT_MIN, &low_limit))
+ low_limit = PAGE_SIZE;
+
+ high_limit = call_end + INT_MAX;
+
+ /* Search up from the caller address. */
+ info.low_limit = call_end;
+ info.high_limit = min(high_limit, TASK_SIZE);
+ high_tramp = vm_unmapped_area(&info);
+
+ /* Search down from the caller address. */
+ info.low_limit = max(low_limit, PAGE_SIZE);
+ info.high_limit = call_end;
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ low_tramp = vm_unmapped_area(&info);
+
+ if (IS_ERR_VALUE(high_tramp) && IS_ERR_VALUE(low_tramp))
+ return -ENOMEM;
+ if (IS_ERR_VALUE(high_tramp))
+ return low_tramp;
+ if (IS_ERR_VALUE(low_tramp))
+ return high_tramp;
+
+ /* Return address that's closest to the caller address. */
+ if (call_end - low_tramp < high_tramp - call_end)
+ return low_tramp;
+ return high_tramp;
+}
+
+static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct mm_struct *mm = current->mm;
+ struct uprobe_trampoline *tramp;
+ struct vm_area_struct *vma;
+
+ if (!user_64bit_mode(regs))
+ return NULL;
+
+ vaddr = find_nearest_trampoline(vaddr);
+ if (IS_ERR_VALUE(vaddr))
+ return NULL;
+
+ tramp = kzalloc(sizeof(*tramp), GFP_KERNEL);
+ if (unlikely(!tramp))
+ return NULL;
+
+ tramp->vaddr = vaddr;
+ vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE,
+ VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
+ &tramp_mapping);
+ if (IS_ERR(vma)) {
+ kfree(tramp);
+ return NULL;
+ }
+ return tramp;
+}
+
+static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new)
+{
+ struct uprobes_state *state = &current->mm->uprobes_state;
+ struct uprobe_trampoline *tramp = NULL;
+
+ if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE)
+ return NULL;
+
+ hlist_for_each_entry(tramp, &state->head_tramps, node) {
+ if (is_reachable_by_call(tramp->vaddr, vaddr)) {
+ *new = false;
+ return tramp;
+ }
+ }
+
+ tramp = create_uprobe_trampoline(vaddr);
+ if (!tramp)
+ return NULL;
+
+ *new = true;
+ hlist_add_head(&tramp->node, &state->head_tramps);
+ return tramp;
+}
+
+static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp)
+{
+ /*
+ * We do not unmap and release uprobe trampoline page itself,
+ * because there's no easy way to make sure none of the threads
+ * is still inside the trampoline.
+ */
+ hlist_del(&tramp->node);
+ kfree(tramp);
+}
+
+void arch_uprobe_init_state(struct mm_struct *mm)
+{
+ INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps);
+}
+
+void arch_uprobe_clear_state(struct mm_struct *mm)
+{
+ struct uprobes_state *state = &mm->uprobes_state;
+ struct uprobe_trampoline *tramp;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node)
+ destroy_uprobe_trampoline(tramp);
+}
+
+static bool __in_uprobe_trampoline(unsigned long ip)
+{
+ struct vm_area_struct *vma = vma_lookup(current->mm, ip);
+
+ return vma && vma_is_special_mapping(vma, &tramp_mapping);
+}
+
+static bool in_uprobe_trampoline(unsigned long ip)
+{
+ struct mm_struct *mm = current->mm;
+ bool found, retry = true;
+ unsigned int seq;
+
+ rcu_read_lock();
+ if (mmap_lock_speculate_try_begin(mm, &seq)) {
+ found = __in_uprobe_trampoline(ip);
+ retry = mmap_lock_speculate_retry(mm, seq);
+ }
+ rcu_read_unlock();
+
+ if (retry) {
+ mmap_read_lock(mm);
+ found = __in_uprobe_trampoline(ip);
+ mmap_read_unlock(mm);
+ }
+ return found;
+}
+
+/*
+ * See uprobe syscall trampoline; the call to the trampoline will push
+ * the return address on the stack, the trampoline itself then pushes
+ * cx, r11 and ax.
+ */
+struct uprobe_syscall_args {
+ unsigned long ax;
+ unsigned long r11;
+ unsigned long cx;
+ unsigned long retaddr;
+};
+
+SYSCALL_DEFINE0(uprobe)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct uprobe_syscall_args args;
+ unsigned long ip, sp, sret;
+ int err;
+
+ /* Allow execution only from uprobe trampolines. */
+ if (!in_uprobe_trampoline(regs->ip))
+ return -ENXIO;
+
+ err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
+ if (err)
+ goto sigill;
+
+ ip = regs->ip;
+
+ /*
+ * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus:
+ * - adjust ip to the probe address; the call saved the next instruction's address
+ * - adjust sp to the probe's stack frame (check trampoline code)
+ */
+ regs->ax = args.ax;
+ regs->r11 = args.r11;
+ regs->cx = args.cx;
+ regs->ip = args.retaddr - 5;
+ regs->sp += sizeof(args);
+ regs->orig_ax = -1;
+
+ sp = regs->sp;
+
+ err = shstk_pop((u64 *)&sret);
+ if (err == -EFAULT || (!err && sret != args.retaddr))
+ goto sigill;
+
+ handle_syscall_uprobe(regs, regs->ip);
+
+ /*
+ * Some of the uprobe consumers have changed sp; we can do nothing,
+ * just return via iret.
+ */
+ if (regs->sp != sp) {
+ /* skip the trampoline call */
+ if (args.retaddr - 5 == regs->ip)
+ regs->ip += 5;
+ return regs->ax;
+ }
+
+ regs->sp -= sizeof(args);
+
+ /* for the case uprobe_consumer has changed ax/r11/cx */
+ args.ax = regs->ax;
+ args.r11 = regs->r11;
+ args.cx = regs->cx;
+
+ /* keep return address unless we are instructed otherwise */
+ if (args.retaddr - 5 != regs->ip)
+ args.retaddr = regs->ip;
+
+ if (shstk_push(args.retaddr) == -EFAULT)
+ goto sigill;
+
+ regs->ip = ip;
+
+ err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
+ if (err)
+ goto sigill;
+
+ /* ensure sysret, see do_syscall_64() */
+ regs->r11 = regs->flags;
+ regs->cx = regs->ip;
+ return 0;
+
+sigill:
+ force_sig(SIGILL);
+ return -1;
+}
+
+asm (
+ ".pushsection .rodata\n"
+ ".balign " __stringify(PAGE_SIZE) "\n"
+ "uprobe_trampoline_entry:\n"
+ "push %rcx\n"
+ "push %r11\n"
+ "push %rax\n"
+ "mov $" __stringify(__NR_uprobe) ", %rax\n"
+ "syscall\n"
+ "pop %rax\n"
+ "pop %r11\n"
+ "pop %rcx\n"
+ "ret\n"
+ "int3\n"
+ ".balign " __stringify(PAGE_SIZE) "\n"
+ ".popsection\n"
+);
+
+extern u8 uprobe_trampoline_entry[];
+
+static int __init arch_uprobes_init(void)
+{
+ tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry);
+ return 0;
+}
+
+late_initcall(arch_uprobes_init);
+
+enum {
+ EXPECT_SWBP,
+ EXPECT_CALL,
+};
+
+struct write_opcode_ctx {
+ unsigned long base;
+ int expect;
+};
+
+static int is_call_insn(uprobe_opcode_t *insn)
+{
+ return *insn == CALL_INSN_OPCODE;
+}
+
+/*
+ * Verification callback used by int3_update uprobe_write calls to make sure
+ * the underlying instruction is as expected - either int3 or call.
+ */
+static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
+ int nbytes, void *data)
+{
+ struct write_opcode_ctx *ctx = data;
+ uprobe_opcode_t old_opcode[5];
+
+ uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5);
+
+ switch (ctx->expect) {
+ case EXPECT_SWBP:
+ if (is_swbp_insn(&old_opcode[0]))
+ return 1;
+ break;
+ case EXPECT_CALL:
+ if (is_call_insn(&old_opcode[0]))
+ return 1;
+ break;
+ }
+
+ return -1;
+}
+
+/*
+ * Modify multi-byte instructions by using INT3 breakpoints on SMP.
+ * We completely avoid using stop_machine() here, and achieve the
+ * synchronization using INT3 breakpoints and SMP cross-calls.
+ * (borrowed comment from smp_text_poke_batch_finish)
+ *
+ * The way it is done:
+ * - Add an INT3 trap to the address that will be patched
+ * - SMP sync all CPUs
+ * - Update all but the first byte of the patched range
+ * - SMP sync all CPUs
+ * - Replace the first byte (INT3) by the first byte of the replacing opcode
+ * - SMP sync all CPUs
+ */
+static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr, char *insn, bool optimize)
+{
+ uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
+ struct write_opcode_ctx ctx = {
+ .base = vaddr,
+ };
+ int err;
+
+ /*
+ * Write int3 trap.
+ *
+ * The swbp_optimize path comes with the breakpoint already installed,
+ * so we can skip this step for optimize == true.
+ */
+ if (!optimize) {
+ ctx.expect = EXPECT_CALL;
+ err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn,
+ true /* is_register */, false /* do_update_ref_ctr */,
+ &ctx);
+ if (err)
+ return err;
+ }
+
+ smp_text_poke_sync_each_cpu();
+
+ /* Write all but the first byte of the patched range. */
+ ctx.expect = EXPECT_SWBP;
+ err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn,
+ true /* is_register */, false /* do_update_ref_ctr */,
+ &ctx);
+ if (err)
+ return err;
+
+ smp_text_poke_sync_each_cpu();
+
+ /*
+ * Write first byte.
+ *
+ * The swbp_unoptimize needs to finish uprobe removal together
+ * with ref_ctr update, using uprobe_write with proper flags.
+ */
+ err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn,
+ optimize /* is_register */, !optimize /* do_update_ref_ctr */,
+ &ctx);
+ if (err)
+ return err;
+
+ smp_text_poke_sync_each_cpu();
+ return 0;
+}
+
+static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr, unsigned long tramp)
+{
+ u8 call[5];
+
+ __text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr,
+ (const void *) tramp, CALL_INSN_SIZE);
+ return int3_update(auprobe, vma, vaddr, call, true /* optimize */);
+}
+
+static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */);
+}
+
+static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len)
+{
+ unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD;
+ struct vm_area_struct *vma;
+ struct page *page;
+
+ page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ uprobe_copy_from_page(page, vaddr, dst, len);
+ put_page(page);
+ return 0;
+}
+
+static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
+{
+ struct __packed __arch_relative_insn {
+ u8 op;
+ s32 raddr;
+ } *call = (struct __arch_relative_insn *) insn;
+
+ if (!is_call_insn(insn))
+ return false;
+ return __in_uprobe_trampoline(vaddr + 5 + call->raddr);
+}
+
+static int is_optimized(struct mm_struct *mm, unsigned long vaddr)
+{
+ uprobe_opcode_t insn[5];
+ int err;
+
+ err = copy_from_vaddr(mm, vaddr, &insn, 5);
+ if (err)
+ return err;
+ return __is_optimized((uprobe_opcode_t *)&insn, vaddr);
+}
+
+static bool should_optimize(struct arch_uprobe *auprobe)
+{
+ return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) &&
+ test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+}
+
+int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ if (should_optimize(auprobe)) {
+ /*
+ * We could race with another thread that already optimized the probe,
+ * so let's not overwrite it with int3 again in this case.
+ */
+ int ret = is_optimized(vma->vm_mm, vaddr);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+ }
+ return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN,
+ true /* is_register */);
+}
+
+int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) {
+ int ret = is_optimized(vma->vm_mm, vaddr);
+ if (ret < 0)
+ return ret;
+ if (ret) {
+ ret = swbp_unoptimize(auprobe, vma, vaddr);
+ WARN_ON_ONCE(ret);
+ return ret;
+ }
+ }
+ return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn,
+ false /* is_register */);
+}
+
+static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm,
+ unsigned long vaddr)
+{
+ struct uprobe_trampoline *tramp;
+ struct vm_area_struct *vma;
+ bool new = false;
+ int err = 0;
+
+ vma = find_vma(mm, vaddr);
+ if (!vma)
+ return -EINVAL;
+ tramp = get_uprobe_trampoline(vaddr, &new);
+ if (!tramp)
+ return -EINVAL;
+ err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr);
+ if (WARN_ON_ONCE(err) && new)
+ destroy_uprobe_trampoline(tramp);
+ return err;
+}
+
+void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+ struct mm_struct *mm = current->mm;
+ uprobe_opcode_t insn[5];
+
+ if (!should_optimize(auprobe))
+ return;
+
+ mmap_write_lock(mm);
+
+ /*
+ * Check if some other thread already optimized the uprobe for us;
+ * if that's the case, just return silently.
+ */
+ if (copy_from_vaddr(mm, vaddr, &insn, 5))
+ goto unlock;
+ if (!is_swbp_insn((uprobe_opcode_t*) &insn))
+ goto unlock;
+
+ /*
+ * If we fail to optimize the uprobe we set the fail bit so the
+ * above should_optimize will fail from now on.
+ */
+ if (__arch_uprobe_optimize(auprobe, mm, vaddr))
+ set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
+
+unlock:
+ mmap_write_unlock(mm);
+}
+
+static bool insn_is_nop(struct insn *insn)
+{
+ return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90;
+}
+
+static bool insn_is_nopl(struct insn *insn)
+{
+ if (insn->opcode.nbytes != 2)
+ return false;
+
+ if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f)
+ return false;
+
+ if (!insn->modrm.nbytes)
+ return false;
+
+ if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0)
+ return false;
+
+ /* 0f 1f /0 - NOPL */
+ return true;
+}
+
+static bool can_optimize(struct insn *insn, unsigned long vaddr)
+{
+ if (!insn->x86_64 || insn->length != 5)
+ return false;
+
+ if (!insn_is_nop(insn) && !insn_is_nopl(insn))
+ return false;
+
+ /* We can't do cross page atomic writes yet. */
+ return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
+}
#else /* 32-bit: */
/*
* No RIP-relative addressing on 32-bit
@@ -621,6 +1205,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
}
+static bool can_optimize(struct insn *insn, unsigned long vaddr)
+{
+ return false;
+}
#endif /* CONFIG_X86_64 */
struct uprobe_xol_ops {
@@ -979,14 +1567,17 @@ static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
*/
int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
{
- struct insn insn;
u8 fix_ip_or_call = UPROBE_FIX_IP;
+ struct insn insn;
int ret;
ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm));
if (ret)
return ret;
+ if (can_optimize(&insn, addr))
+ set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+
ret = branch_setup_xol_ops(auprobe, &insn);
if (ret != -ENOSYS)
return ret;
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index ad89d0bd6005..103604c4b33b 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -13,7 +13,7 @@
#define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \
MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
-/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
+/* retrieve a fixed counter's bits out of IA32_FIXED_CTR_CTRL */
#define fixed_ctrl_field(ctrl_reg, idx) \
(((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)