summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-01-28 20:44:15 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2020-01-28 20:44:15 +0300
commitc0e809e244804d428bcd976eaf9369f60508ea8a (patch)
tree99fa85899a3c11d2ebeb6d090f218fda968a0e6a /arch
parent2180f214f4a5d8e2d8b7138d9a59246ee05753b9 (diff)
parent0cc4bd8f70d1ea2940295f1050508c663fe9eff9 (diff)
downloadlinux-c0e809e244804d428bcd976eaf9369f60508ea8a.tar.xz
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf updates from Ingo Molnar: "Kernel side changes: - Ftrace is one of the last W^X violators (after this only KLP is left). These patches move it over to the generic text_poke() interface and thereby get rid of this oddity. This requires a surprising amount of surgery, by Peter Zijlstra. - x86/AMD PMUs: add support for 'Large Increment per Cycle Events' to count certain types of events that have a special, quirky hw ABI (by Kim Phillips) - kprobes fixes by Masami Hiramatsu Lots of tooling updates as well, the following subcommands were updated: annotate/report/top, c2c, clang, record, report/top TUI, sched timehist, tests; plus updates were done to the gtk ui, libperf, headers and the parser" * 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits) perf/x86/amd: Add support for Large Increment per Cycle Events perf/x86/amd: Constrain Large Increment per Cycle events perf/x86/intel/rapl: Add Comet Lake support tracing: Initialize ret in syscall_enter_define_fields() perf header: Use last modification time for timestamp perf c2c: Fix return type for histogram sorting comparision functions perf beauty sockaddr: Fix augmented syscall format warning perf/ui/gtk: Fix gtk2 build perf ui gtk: Add missing zalloc object perf tools: Use %define api.pure full instead of %pure-parser libperf: Setup initial evlist::all_cpus value perf report: Fix no libunwind compiled warning break s390 issue perf tools: Support --prefix/--prefix-strip perf report: Clarify in help that --children is default tools build: Fix test-clang.cpp with Clang 8+ perf clang: Fix build with Clang 9 kprobes: Fix optimize_kprobe()/unoptimize_kprobe() cancellation logic tools lib: Fix builds when glibc contains strlcpy() perf report/top: Make 'e' visible in the help and make it toggle showing callchains perf report/top: Do not offer annotation for symbols without samples ...
Diffstat (limited to 'arch')
-rw-r--r--arch/arm/kernel/Makefile4
-rw-r--r--arch/arm/kernel/ftrace.c10
-rw-r--r--arch/nds32/kernel/ftrace.c12
-rw-r--r--arch/x86/events/amd/core.c109
-rw-r--r--arch/x86/events/core.c74
-rw-r--r--arch/x86/events/intel/rapl.c2
-rw-r--r--arch/x86/events/perf_event.h20
-rw-r--r--arch/x86/include/asm/ftrace.h2
-rw-r--r--arch/x86/include/asm/kprobes.h14
-rw-r--r--arch/x86/include/asm/set_memory.h2
-rw-r--r--arch/x86/include/asm/text-patching.h86
-rw-r--r--arch/x86/kernel/alternative.c198
-rw-r--r--arch/x86/kernel/ftrace.c688
-rw-r--r--arch/x86/kernel/jump_label.c116
-rw-r--r--arch/x86/kernel/kprobes/core.c20
-rw-r--r--arch/x86/kernel/kprobes/opt.c67
-rw-r--r--arch/x86/kernel/traps.c9
-rw-r--r--arch/x86/mm/init_32.c28
-rw-r--r--arch/x86/mm/init_64.c36
19 files changed, 594 insertions, 903 deletions
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 8b679e2ca3c3..89e5d864e923 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -53,8 +53,8 @@ obj-$(CONFIG_HAVE_ARM_SCU) += smp_scu.o
obj-$(CONFIG_HAVE_ARM_TWD) += smp_twd.o
obj-$(CONFIG_ARM_ARCH_TIMER) += arch_timer.o
obj-$(CONFIG_FUNCTION_TRACER) += entry-ftrace.o
-obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o insn.o
-obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o insn.o
+obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o insn.o patch.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o insn.o patch.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o insn.o patch.o
obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o
# Main staffs in KPROBES are in arch/arm/probes/ .
diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c
index bda949fd84e8..2a5ff69c28e6 100644
--- a/arch/arm/kernel/ftrace.c
+++ b/arch/arm/kernel/ftrace.c
@@ -22,6 +22,7 @@
#include <asm/ftrace.h>
#include <asm/insn.h>
#include <asm/set_memory.h>
+#include <asm/patch.h>
#ifdef CONFIG_THUMB2_KERNEL
#define NOP 0xf85deb04 /* pop.w {lr} */
@@ -35,9 +36,7 @@ static int __ftrace_modify_code(void *data)
{
int *command = data;
- set_kernel_text_rw();
ftrace_modify_all_code(*command);
- set_kernel_text_ro();
return 0;
}
@@ -59,13 +58,11 @@ static unsigned long adjust_address(struct dyn_ftrace *rec, unsigned long addr)
int ftrace_arch_code_modify_prepare(void)
{
- set_all_modules_text_rw();
return 0;
}
int ftrace_arch_code_modify_post_process(void)
{
- set_all_modules_text_ro();
/* Make sure any TLB misses during machine stop are cleared. */
flush_tlb_all();
return 0;
@@ -97,10 +94,7 @@ static int ftrace_modify_code(unsigned long pc, unsigned long old,
return -EINVAL;
}
- if (probe_kernel_write((void *)pc, &new, MCOUNT_INSN_SIZE))
- return -EPERM;
-
- flush_icache_range(pc, pc + MCOUNT_INSN_SIZE);
+ __patch_text((void *)pc, new);
return 0;
}
diff --git a/arch/nds32/kernel/ftrace.c b/arch/nds32/kernel/ftrace.c
index fd2a54b8cd57..22ab77ea27ad 100644
--- a/arch/nds32/kernel/ftrace.c
+++ b/arch/nds32/kernel/ftrace.c
@@ -89,18 +89,6 @@ int __init ftrace_dyn_arch_init(void)
return 0;
}
-int ftrace_arch_code_modify_prepare(void)
-{
- set_all_modules_text_rw();
- return 0;
-}
-
-int ftrace_arch_code_modify_post_process(void)
-{
- set_all_modules_text_ro();
- return 0;
-}
-
static unsigned long gen_sethi_insn(unsigned long addr)
{
unsigned long opcode = 0x46000000;
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index a7752cd78b89..1f22b6bbda68 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -14,6 +14,10 @@
static DEFINE_PER_CPU(unsigned long, perf_nmi_tstamp);
static unsigned long perf_nmi_window;
+/* AMD Event 0xFFF: Merge. Used with Large Increment per Cycle events */
+#define AMD_MERGE_EVENT ((0xFULL << 32) | 0xFFULL)
+#define AMD_MERGE_EVENT_ENABLE (AMD_MERGE_EVENT | ARCH_PERFMON_EVENTSEL_ENABLE)
+
static __initconst const u64 amd_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -301,6 +305,25 @@ static inline int amd_pmu_addr_offset(int index, bool eventsel)
return offset;
}
+/*
+ * AMD64 events are detected based on their event codes.
+ */
+static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
+{
+ return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
+}
+
+static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc)
+{
+ if (!(x86_pmu.flags & PMU_FL_PAIR))
+ return false;
+
+ switch (amd_get_event_code(hwc)) {
+ case 0x003: return true; /* Retired SSE/AVX FLOPs */
+ default: return false;
+ }
+}
+
static int amd_core_hw_config(struct perf_event *event)
{
if (event->attr.exclude_host && event->attr.exclude_guest)
@@ -316,15 +339,10 @@ static int amd_core_hw_config(struct perf_event *event)
else if (event->attr.exclude_guest)
event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
- return 0;
-}
+ if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw))
+ event->hw.flags |= PERF_X86_EVENT_PAIR;
-/*
- * AMD64 events are detected based on their event codes.
- */
-static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
-{
- return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
+ return 0;
}
static inline int amd_is_nb_event(struct hw_perf_event *hwc)
@@ -855,6 +873,29 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
}
}
+static struct event_constraint pair_constraint;
+
+static struct event_constraint *
+amd_get_event_constraints_f17h(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (amd_is_pair_event_code(hwc))
+ return &pair_constraint;
+
+ return &unconstrained;
+}
+
+static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (is_counter_pair(hwc))
+ --cpuc->n_pair;
+}
+
static ssize_t amd_event_sysfs_show(char *page, u64 config)
{
u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
@@ -898,33 +939,15 @@ static __initconst const struct x86_pmu amd_pmu = {
static int __init amd_core_pmu_init(void)
{
+ u64 even_ctr_mask = 0ULL;
+ int i;
+
if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
return 0;
- /* Avoid calulating the value each time in the NMI handler */
+ /* Avoid calculating the value each time in the NMI handler */
perf_nmi_window = msecs_to_jiffies(100);
- switch (boot_cpu_data.x86) {
- case 0x15:
- pr_cont("Fam15h ");
- x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
- break;
- case 0x17:
- pr_cont("Fam17h ");
- /*
- * In family 17h, there are no event constraints in the PMC hardware.
- * We fallback to using default amd_get_event_constraints.
- */
- break;
- case 0x18:
- pr_cont("Fam18h ");
- /* Using default amd_get_event_constraints. */
- break;
- default:
- pr_err("core perfctr but no constraints; unknown hardware!\n");
- return -ENODEV;
- }
-
/*
* If core performance counter extensions exists, we must use
* MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
@@ -939,6 +962,32 @@ static int __init amd_core_pmu_init(void)
*/
x86_pmu.amd_nb_constraints = 0;
+ if (boot_cpu_data.x86 == 0x15) {
+ pr_cont("Fam15h ");
+ x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
+ }
+ if (boot_cpu_data.x86 >= 0x17) {
+ pr_cont("Fam17h+ ");
+ /*
+ * Family 17h and compatibles have constraints for Large
+ * Increment per Cycle events: they may only be assigned an
+ * even numbered counter that has a consecutive adjacent odd
+ * numbered counter following it.
+ */
+ for (i = 0; i < x86_pmu.num_counters - 1; i += 2)
+ even_ctr_mask |= 1 << i;
+
+ pair_constraint = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, even_ctr_mask, 0,
+ x86_pmu.num_counters / 2, 0,
+ PERF_X86_EVENT_PAIR);
+
+ x86_pmu.get_event_constraints = amd_get_event_constraints_f17h;
+ x86_pmu.put_event_constraints = amd_put_event_constraints_f17h;
+ x86_pmu.perf_ctr_pair_en = AMD_MERGE_EVENT_ENABLE;
+ x86_pmu.flags |= PMU_FL_PAIR;
+ }
+
pr_cont("core perfctr, ");
return 0;
}
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index f118af9f0718..3bb738f5a472 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -618,6 +618,7 @@ void x86_pmu_disable_all(void)
int idx;
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
u64 val;
if (!test_bit(idx, cpuc->active_mask))
@@ -627,6 +628,8 @@ void x86_pmu_disable_all(void)
continue;
val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
wrmsrl(x86_pmu_config_addr(idx), val);
+ if (is_counter_pair(hwc))
+ wrmsrl(x86_pmu_config_addr(idx + 1), 0);
}
}
@@ -699,7 +702,7 @@ struct sched_state {
int counter; /* counter index */
int unassigned; /* number of events to be assigned left */
int nr_gp; /* number of GP counters used */
- unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ u64 used;
};
/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
@@ -756,8 +759,12 @@ static bool perf_sched_restore_state(struct perf_sched *sched)
sched->saved_states--;
sched->state = sched->saved[sched->saved_states];
- /* continue with next counter: */
- clear_bit(sched->state.counter++, sched->state.used);
+ /* this assignment didn't work out */
+ /* XXX broken vs EVENT_PAIR */
+ sched->state.used &= ~BIT_ULL(sched->state.counter);
+
+ /* try the next one */
+ sched->state.counter++;
return true;
}
@@ -782,20 +789,32 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
idx = INTEL_PMC_IDX_FIXED;
for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
- if (!__test_and_set_bit(idx, sched->state.used))
- goto done;
+ u64 mask = BIT_ULL(idx);
+
+ if (sched->state.used & mask)
+ continue;
+
+ sched->state.used |= mask;
+ goto done;
}
}
/* Grab the first unused counter starting with idx */
idx = sched->state.counter;
for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
- if (!__test_and_set_bit(idx, sched->state.used)) {
- if (sched->state.nr_gp++ >= sched->max_gp)
- return false;
+ u64 mask = BIT_ULL(idx);
- goto done;
- }
+ if (c->flags & PERF_X86_EVENT_PAIR)
+ mask |= mask << 1;
+
+ if (sched->state.used & mask)
+ continue;
+
+ if (sched->state.nr_gp++ >= sched->max_gp)
+ return false;
+
+ sched->state.used |= mask;
+ goto done;
}
return false;
@@ -872,12 +891,10 @@ EXPORT_SYMBOL_GPL(perf_assign_events);
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
struct event_constraint *c;
- unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
struct perf_event *e;
int n0, i, wmin, wmax, unsched = 0;
struct hw_perf_event *hwc;
-
- bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+ u64 used_mask = 0;
/*
* Compute the number of events already present; see x86_pmu_add(),
@@ -920,6 +937,8 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
* fastpath, try to reuse previous register
*/
for (i = 0; i < n; i++) {
+ u64 mask;
+
hwc = &cpuc->event_list[i]->hw;
c = cpuc->event_constraint[i];
@@ -931,11 +950,16 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
if (!test_bit(hwc->idx, c->idxmsk))
break;
+ mask = BIT_ULL(hwc->idx);
+ if (is_counter_pair(hwc))
+ mask |= mask << 1;
+
/* not already used */
- if (test_bit(hwc->idx, used_mask))
+ if (used_mask & mask)
break;
- __set_bit(hwc->idx, used_mask);
+ used_mask |= mask;
+
if (assign)
assign[i] = hwc->idx;
}
@@ -958,6 +982,15 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
READ_ONCE(cpuc->excl_cntrs->exclusive_present))
gpmax /= 2;
+ /*
+ * Reduce the amount of available counters to allow fitting
+ * the extra Merge events needed by large increment events.
+ */
+ if (x86_pmu.flags & PMU_FL_PAIR) {
+ gpmax = x86_pmu.num_counters - cpuc->n_pair;
+ WARN_ON(gpmax <= 0);
+ }
+
unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
wmax, gpmax, assign);
}
@@ -1038,6 +1071,8 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
return -EINVAL;
cpuc->event_list[n] = leader;
n++;
+ if (is_counter_pair(&leader->hw))
+ cpuc->n_pair++;
}
if (!dogrp)
return n;
@@ -1052,6 +1087,8 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
cpuc->event_list[n] = event;
n++;
+ if (is_counter_pair(&event->hw))
+ cpuc->n_pair++;
}
return n;
}
@@ -1238,6 +1275,13 @@ int x86_perf_event_set_period(struct perf_event *event)
wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
/*
+ * Clear the Merge event counter's upper 16 bits since
+ * we currently declare a 48-bit counter width
+ */
+ if (is_counter_pair(hwc))
+ wrmsrl(x86_pmu_event_addr(idx + 1), 0);
+
+ /*
* Due to erratum on certan cpu we need
* a second write to be sure the register
* is updated properly
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index 5053a403e4ae..09913121e726 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -741,6 +741,8 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = {
X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, model_hsw),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_L, model_skl),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE, model_skl),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_COMETLAKE_L, model_skl),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_COMETLAKE, model_skl),
{},
};
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 930611db8f9a..f1cd1ca1a77b 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -77,6 +77,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */
#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */
#define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */
+#define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */
struct amd_nb {
int nb_id; /* NorthBridge id */
@@ -272,6 +273,7 @@ struct cpu_hw_events {
struct amd_nb *amd_nb;
/* Inverted mask of bits to clear in the perf_ctr ctrl registers */
u64 perf_ctr_virt_mask;
+ int n_pair; /* Large increment events */
void *kfree_on_online[X86_PERF_KFREE_MAX];
};
@@ -694,6 +696,7 @@ struct x86_pmu {
* AMD bits
*/
unsigned int amd_nb_constraints : 1;
+ u64 perf_ctr_pair_en;
/*
* Extra registers for events
@@ -743,6 +746,7 @@ do { \
#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */
#define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */
#define PMU_FL_TFA 0x20 /* deal with TSX force abort */
+#define PMU_FL_PAIR 0x40 /* merge counters for large incr. events */
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -838,6 +842,11 @@ int x86_pmu_hw_config(struct perf_event *event);
void x86_pmu_disable_all(void);
+static inline bool is_counter_pair(struct hw_perf_event *hwc)
+{
+ return hwc->flags & PERF_X86_EVENT_PAIR;
+}
+
static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
u64 enable_mask)
{
@@ -845,6 +854,14 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
if (hwc->extra_reg.reg)
wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
+
+ /*
+ * Add enabled Merge event on next counter
+ * if large increment event being enabled on this counter
+ */
+ if (is_counter_pair(hwc))
+ wrmsrl(x86_pmu_config_addr(hwc->idx + 1), x86_pmu.perf_ctr_pair_en);
+
wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
}
@@ -861,6 +878,9 @@ static inline void x86_pmu_disable_event(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
wrmsrl(hwc->config_base, hwc->config);
+
+ if (is_counter_pair(hwc))
+ wrmsrl(x86_pmu_config_addr(hwc->idx + 1), 0);
}
void x86_pmu_enable_event(struct perf_event *event);
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index c2a7458f912c..85be2f506272 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -47,8 +47,6 @@ struct dyn_arch_ftrace {
/* No extra data needed for x86 */
};
-int ftrace_int3_handler(struct pt_regs *regs);
-
#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR
#endif /* CONFIG_DYNAMIC_FTRACE */
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 5dc909d9ad81..95b1f053bd96 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -11,12 +11,11 @@
#include <asm-generic/kprobes.h>
-#define BREAKPOINT_INSTRUCTION 0xcc
-
#ifdef CONFIG_KPROBES
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/percpu.h>
+#include <asm/text-patching.h>
#include <asm/insn.h>
#define __ARCH_WANT_KPROBES_INSN_SLOT
@@ -25,10 +24,7 @@ struct pt_regs;
struct kprobe;
typedef u8 kprobe_opcode_t;
-#define RELATIVEJUMP_OPCODE 0xe9
-#define RELATIVEJUMP_SIZE 5
-#define RELATIVECALL_OPCODE 0xe8
-#define RELATIVE_ADDR_SIZE 4
+
#define MAX_STACK_SIZE 64
#define CUR_STACK_SIZE(ADDR) \
(current_top_of_stack() - (unsigned long)(ADDR))
@@ -43,11 +39,11 @@ extern __visible kprobe_opcode_t optprobe_template_entry[];
extern __visible kprobe_opcode_t optprobe_template_val[];
extern __visible kprobe_opcode_t optprobe_template_call[];
extern __visible kprobe_opcode_t optprobe_template_end[];
-#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE)
+#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + DISP32_SIZE)
#define MAX_OPTINSN_SIZE \
(((unsigned long)optprobe_template_end - \
(unsigned long)optprobe_template_entry) + \
- MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE)
+ MAX_OPTIMIZED_LENGTH + JMP32_INSN_SIZE)
extern const int kretprobe_blacklist_size;
@@ -73,7 +69,7 @@ struct arch_specific_insn {
struct arch_optimized_insn {
/* copy of the original instructions */
- kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE];
+ kprobe_opcode_t copied_insn[DISP32_SIZE];
/* detour code buffer */
kprobe_opcode_t *insn;
/* the size of instructions copied to detour code buffer */
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 2ee8e469dcf5..64c3dce374e5 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -81,8 +81,6 @@ int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
extern int kernel_set_to_readonly;
-void set_kernel_text_rw(void);
-void set_kernel_text_ro(void);
#ifdef CONFIG_X86_64
static inline int set_mce_nospec(unsigned long pfn)
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 23c626a742e8..67315fa3956a 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -25,14 +25,6 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
*/
#define POKE_MAX_OPCODE_SIZE 5
-struct text_poke_loc {
- void *addr;
- int len;
- s32 rel32;
- u8 opcode;
- const u8 text[POKE_MAX_OPCODE_SIZE];
-};
-
extern void text_poke_early(void *addr, const void *opcode, size_t len);
/*
@@ -50,21 +42,13 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len);
* an inconsistent instruction while you patch.
*/
extern void *text_poke(void *addr, const void *opcode, size_t len);
+extern void text_poke_sync(void);
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
extern int poke_int3_handler(struct pt_regs *regs);
extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
-extern void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries);
-extern void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
- const void *opcode, size_t len, const void *emulate);
-extern int after_bootmem;
-extern __ro_after_init struct mm_struct *poking_mm;
-extern __ro_after_init unsigned long poking_addr;
-#ifndef CONFIG_UML_X86
-static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
-{
- regs->ip = ip;
-}
+extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate);
+extern void text_poke_finish(void);
#define INT3_INSN_SIZE 1
#define INT3_INSN_OPCODE 0xCC
@@ -78,6 +62,67 @@ static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
#define JMP8_INSN_SIZE 2
#define JMP8_INSN_OPCODE 0xEB
+#define DISP32_SIZE 4
+
+static inline int text_opcode_size(u8 opcode)
+{
+ int size = 0;
+
+#define __CASE(insn) \
+ case insn##_INSN_OPCODE: size = insn##_INSN_SIZE; break
+
+ switch(opcode) {
+ __CASE(INT3);
+ __CASE(CALL);
+ __CASE(JMP32);
+ __CASE(JMP8);
+ }
+
+#undef __CASE
+
+ return size;
+}
+
+union text_poke_insn {
+ u8 text[POKE_MAX_OPCODE_SIZE];
+ struct {
+ u8 opcode;
+ s32 disp;
+ } __attribute__((packed));
+};
+
+static __always_inline
+void *text_gen_insn(u8 opcode, const void *addr, const void *dest)
+{
+ static union text_poke_insn insn; /* per instance */
+ int size = text_opcode_size(opcode);
+
+ insn.opcode = opcode;
+
+ if (size > 1) {
+ insn.disp = (long)dest - (long)(addr + size);
+ if (size == 2) {
+ /*
+ * Ensure that for JMP9 the displacement
+ * actually fits the signed byte.
+ */
+ BUG_ON((insn.disp >> 31) != (insn.disp >> 7));
+ }
+ }
+
+ return &insn.text;
+}
+
+extern int after_bootmem;
+extern __ro_after_init struct mm_struct *poking_mm;
+extern __ro_after_init unsigned long poking_addr;
+
+#ifndef CONFIG_UML_X86
+static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
+{
+ regs->ip = ip;
+}
+
static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val)
{
/*
@@ -85,6 +130,9 @@ static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val)
* stack where the break point happened, and the saving of
* pt_regs. We can extend the original stack because of
* this gap. See the idtentry macro's create_gap option.
+ *
+ * Similarly entry_32.S will have a gap on the stack for (any) hardware
+ * exception and pt_regs; see FIXUP_FRAME.
*/
regs->sp -= sizeof(unsigned long);
*(unsigned long *)regs->sp = val;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 9ec463fe96f2..34360ca301a2 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -936,44 +936,81 @@ static void do_sync_core(void *info)
sync_core();
}
-static struct bp_patching_desc {
+void text_poke_sync(void)
+{
+ on_each_cpu(do_sync_core, NULL, 1);
+}
+
+struct text_poke_loc {
+ s32 rel_addr; /* addr := _stext + rel_addr */
+ s32 rel32;
+ u8 opcode;
+ const u8 text[POKE_MAX_OPCODE_SIZE];
+};
+
+struct bp_patching_desc {
struct text_poke_loc *vec;
int nr_entries;
-} bp_patching;
+ atomic_t refs;
+};
+
+static struct bp_patching_desc *bp_desc;
+
+static inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
+{
+ struct bp_patching_desc *desc = READ_ONCE(*descp); /* rcu_dereference */
+
+ if (!desc || !atomic_inc_not_zero(&desc->refs))
+ return NULL;
+
+ return desc;
+}
+
+static inline void put_desc(struct bp_patching_desc *desc)
+{
+ smp_mb__before_atomic();
+ atomic_dec(&desc->refs);
+}
-static int patch_cmp(const void *key, const void *elt)
+static inline void *text_poke_addr(struct text_poke_loc *tp)
+{
+ return _stext + tp->rel_addr;
+}
+
+static int notrace patch_cmp(const void *key, const void *elt)
{
struct text_poke_loc *tp = (struct text_poke_loc *) elt;
- if (key < tp->addr)
+ if (key < text_poke_addr(tp))
return -1;
- if (key > tp->addr)
+ if (key > text_poke_addr(tp))
return 1;
return 0;
}
NOKPROBE_SYMBOL(patch_cmp);
-int poke_int3_handler(struct pt_regs *regs)
+int notrace poke_int3_handler(struct pt_regs *regs)
{
+ struct bp_patching_desc *desc;
struct text_poke_loc *tp;
+ int len, ret = 0;
void *ip;
+ if (user_mode(regs))
+ return 0;
+
/*
* Having observed our INT3 instruction, we now must observe
- * bp_patching.nr_entries.
+ * bp_desc:
*
- * nr_entries != 0 INT3
+ * bp_desc = desc INT3
* WMB RMB
- * write INT3 if (nr_entries)
- *
- * Idem for other elements in bp_patching.
+ * write INT3 if (desc)
*/
smp_rmb();
- if (likely(!bp_patching.nr_entries))
- return 0;
-
- if (user_mode(regs))
+ desc = try_get_desc(&bp_desc);
+ if (!desc)
return 0;
/*
@@ -984,19 +1021,20 @@ int poke_int3_handler(struct pt_regs *regs)
/*
* Skip the binary search if there is a single member in the vector.
*/
- if (unlikely(bp_patching.nr_entries > 1)) {
- tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries,
+ if (unlikely(desc->nr_entries > 1)) {
+ tp = bsearch(ip, desc->vec, desc->nr_entries,
sizeof(struct text_poke_loc),
patch_cmp);
if (!tp)
- return 0;
+ goto out_put;
} else {
- tp = bp_patching.vec;
- if (tp->addr != ip)
- return 0;
+ tp = desc->vec;
+ if (text_poke_addr(tp) != ip)
+ goto out_put;
}
- ip += tp->len;
+ len = text_opcode_size(tp->opcode);
+ ip += len;
switch (tp->opcode) {
case INT3_INSN_OPCODE:
@@ -1004,7 +1042,7 @@ int poke_int3_handler(struct pt_regs *regs)
* Someone poked an explicit INT3, they'll want to handle it,
* do not consume.
*/
- return 0;
+ goto out_put;
case CALL_INSN_OPCODE:
int3_emulate_call(regs, (long)ip + tp->rel32);
@@ -1019,10 +1057,18 @@ int poke_int3_handler(struct pt_regs *regs)
BUG();
}
- return 1;
+ ret = 1;
+
+out_put:
+ put_desc(desc);
+ return ret;
}
NOKPROBE_SYMBOL(poke_int3_handler);
+#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
+static struct text_poke_loc tp_vec[TP_VEC_MAX];
+static int tp_vec_nr;
+
/**
* text_poke_bp_batch() -- update instructions on live kernel on SMP
* @tp: vector of instructions to patch
@@ -1044,16 +1090,20 @@ NOKPROBE_SYMBOL(poke_int3_handler);
* replacing opcode
* - sync cores
*/
-void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
+static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
{
+ struct bp_patching_desc desc = {
+ .vec = tp,
+ .nr_entries = nr_entries,
+ .refs = ATOMIC_INIT(1),
+ };
unsigned char int3 = INT3_INSN_OPCODE;
unsigned int i;
int do_sync;
lockdep_assert_held(&text_mutex);
- bp_patching.vec = tp;
- bp_patching.nr_entries = nr_entries;
+ smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */
/*
* Corresponding read barrier in int3 notifier for making sure the
@@ -1065,18 +1115,20 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
* First step: add a int3 trap to the address that will be patched.
*/
for (i = 0; i < nr_entries; i++)
- text_poke(tp[i].addr, &int3, sizeof(int3));
+ text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
- on_each_cpu(do_sync_core, NULL, 1);
+ text_poke_sync();
/*
* Second step: update all but the first byte of the patched range.
*/
for (do_sync = 0, i = 0; i < nr_entries; i++) {
- if (tp[i].len - sizeof(int3) > 0) {
- text_poke((char *)tp[i].addr + sizeof(int3),
- (const char *)tp[i].text + sizeof(int3),
- tp[i].len - sizeof(int3));
+ int len = text_opcode_size(tp[i].opcode);
+
+ if (len - INT3_INSN_SIZE > 0) {
+ text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
+ (const char *)tp[i].text + INT3_INSN_SIZE,
+ len - INT3_INSN_SIZE);
do_sync++;
}
}
@@ -1087,7 +1139,7 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
* not necessary and we'd be safe even without it. But
* better safe than sorry (plus there's not only Intel).
*/
- on_each_cpu(do_sync_core, NULL, 1);
+ text_poke_sync();
}
/*
@@ -1098,19 +1150,20 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
if (tp[i].text[0] == INT3_INSN_OPCODE)
continue;
- text_poke(tp[i].addr, tp[i].text, sizeof(int3));
+ text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE);
do_sync++;
}
if (do_sync)
- on_each_cpu(do_sync_core, NULL, 1);
+ text_poke_sync();
/*
- * sync_core() implies an smp_mb() and orders this store against
- * the writing of the new instruction.
+ * Remove and synchronize_rcu(), except we have a very primitive
+ * refcount based completion.
*/
- bp_patching.vec = NULL;
- bp_patching.nr_entries = 0;
+ WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */
+ if (!atomic_dec_and_test(&desc.refs))
+ atomic_cond_read_acquire(&desc.refs, !VAL);
}
void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
@@ -1118,11 +1171,7 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
{
struct insn insn;
- if (!opcode)
- opcode = (void *)tp->text;
- else
- memcpy((void *)tp->text, opcode, len);
-
+ memcpy((void *)tp->text, opcode, len);
if (!emulate)
emulate = opcode;
@@ -1132,8 +1181,7 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
BUG_ON(!insn_complete(&insn));
BUG_ON(len != insn.length);
- tp->addr = addr;
- tp->len = len;
+ tp->rel_addr = addr - (void *)_stext;
tp->opcode = insn.opcode.bytes[0];
switch (tp->opcode) {
@@ -1167,6 +1215,55 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
}
}
+/*
+ * We hard rely on the tp_vec being ordered; ensure this is so by flushing
+ * early if needed.
+ */
+static bool tp_order_fail(void *addr)
+{
+ struct text_poke_loc *tp;
+
+ if (!tp_vec_nr)
+ return false;
+
+ if (!addr) /* force */
+ return true;
+
+ tp = &tp_vec[tp_vec_nr - 1];
+ if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
+ return true;
+
+ return false;
+}
+
+static void text_poke_flush(void *addr)
+{
+ if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
+ text_poke_bp_batch(tp_vec, tp_vec_nr);
+ tp_vec_nr = 0;
+ }
+}
+
+void text_poke_finish(void)
+{
+ text_poke_flush(NULL);
+}
+
+void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
+{
+ struct text_poke_loc *tp;
+
+ if (unlikely(system_state == SYSTEM_BOOTING)) {
+ text_poke_early(addr, opcode, len);
+ return;
+ }
+
+ text_poke_flush(addr);
+
+ tp = &tp_vec[tp_vec_nr++];
+ text_poke_loc_init(tp, addr, opcode, len, emulate);
+}
+
/**
* text_poke_bp() -- update instructions on live kernel on SMP
* @addr: address to patch
@@ -1178,10 +1275,15 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
* dynamically allocated memory. This function should be used when it is
* not possible to allocate memory.
*/
-void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
+void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
{
struct text_poke_loc tp;
+ if (unlikely(system_state == SYSTEM_BOOTING)) {
+ text_poke_early(addr, opcode, len);
+ return;
+ }
+
text_poke_loc_init(&tp, addr, opcode, len, emulate);
text_poke_bp_batch(&tp, 1);
}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 2009047bb015..37a0aeaf89e7 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -35,6 +35,8 @@
#ifdef CONFIG_DYNAMIC_FTRACE
+static int ftrace_poke_late = 0;
+
int ftrace_arch_code_modify_prepare(void)
__acquires(&text_mutex)
{
@@ -44,84 +46,37 @@ int ftrace_arch_code_modify_prepare(void)
* ftrace has it set to "read/write".
*/
mutex_lock(&text_mutex);
- set_kernel_text_rw();
- set_all_modules_text_rw();
+ ftrace_poke_late = 1;
return 0;
}
int ftrace_arch_code_modify_post_process(void)
__releases(&text_mutex)
{
- set_all_modules_text_ro();
- set_kernel_text_ro();
+ /*
+ * ftrace_make_{call,nop}() may be called during
+ * module load, and we need to finish the text_poke_queue()
+ * that they do, here.
+ */
+ text_poke_finish();
+ ftrace_poke_late = 0;
mutex_unlock(&text_mutex);
return 0;
}
-union ftrace_code_union {
- char code[MCOUNT_INSN_SIZE];
- struct {
- unsigned char op;
- int offset;
- } __attribute__((packed));
-};
-
-static int ftrace_calc_offset(long ip, long addr)
-{
- return (int)(addr - ip);
-}
-
-static unsigned char *
-ftrace_text_replace(unsigned char op, unsigned long ip, unsigned long addr)
+static const char *ftrace_nop_replace(void)
{
- static union ftrace_code_union calc;
-
- calc.op = op;
- calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
-
- return calc.code;
-}
-
-static unsigned char *
-ftrace_call_replace(unsigned long ip, unsigned long addr)
-{
- return ftrace_text_replace(0xe8, ip, addr);
-}
-
-static inline int
-within(unsigned long addr, unsigned long start, unsigned long end)
-{
- return addr >= start && addr < end;
+ return ideal_nops[NOP_ATOMIC5];
}
-static unsigned long text_ip_addr(unsigned long ip)
+static const char *ftrace_call_replace(unsigned long ip, unsigned long addr)
{
- /*
- * On x86_64, kernel text mappings are mapped read-only, so we use
- * the kernel identity mapping instead of the kernel text mapping
- * to modify the kernel text.
- *
- * For 32bit kernels, these mappings are same and we can use
- * kernel identity mapping to modify code.
- */
- if (within(ip, (unsigned long)_text, (unsigned long)_etext))
- ip = (unsigned long)__va(__pa_symbol(ip));
-
- return ip;
+ return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr);
}
-static const unsigned char *ftrace_nop_replace(void)
+static int ftrace_verify_code(unsigned long ip, const char *old_code)
{
- return ideal_nops[NOP_ATOMIC5];
-}
-
-static int
-ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code)
-{
- unsigned char replaced[MCOUNT_INSN_SIZE];
-
- ftrace_expected = old_code;
+ char cur_code[MCOUNT_INSN_SIZE];
/*
* Note:
@@ -130,31 +85,46 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
* Carefully read and modify the code with probe_kernel_*(), and make
* sure what we read is what we expected it to be before modifying it.
*/
-
/* read the text we want to modify */
- if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+ if (probe_kernel_read(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) {
+ WARN_ON(1);
return -EFAULT;
+ }
/* Make sure it is what we expect it to be */
- if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
+ if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) {
+ WARN_ON(1);
return -EINVAL;
+ }
- ip = text_ip_addr(ip);
-
- /* replace the text with the new text */
- if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
- return -EPERM;
+ return 0;
+}
- sync_core();
+/*
+ * Marked __ref because it calls text_poke_early() which is .init.text. That is
+ * ok because that call will happen early, during boot, when .init sections are
+ * still present.
+ */
+static int __ref
+ftrace_modify_code_direct(unsigned long ip, const char *old_code,
+ const char *new_code)
+{
+ int ret = ftrace_verify_code(ip, old_code);
+ if (ret)
+ return ret;
+ /* replace the text with the new text */
+ if (ftrace_poke_late)
+ text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
+ else
+ text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
return 0;
}
-int ftrace_make_nop(struct module *mod,
- struct dyn_ftrace *rec, unsigned long addr)
+int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr)
{
- unsigned const char *new, *old;
unsigned long ip = rec->ip;
+ const char *new, *old;
old = ftrace_call_replace(ip, addr);
new = ftrace_nop_replace();
@@ -168,19 +138,20 @@ int ftrace_make_nop(struct module *mod,
* just modify the code directly.
*/
if (addr == MCOUNT_ADDR)
- return ftrace_modify_code_direct(rec->ip, old, new);
+ return ftrace_modify_code_direct(ip, old, new);
- ftrace_expected = NULL;
-
- /* Normal cases use add_brk_on_nop */
+ /*
+ * x86 overrides ftrace_replace_code -- this function will never be used
+ * in this case.
+ */
WARN_ONCE(1, "invalid use of ftrace_make_nop");
return -EINVAL;
}
int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
{
- unsigned const char *new, *old;
unsigned long ip = rec->ip;
+ const char *new, *old;
old = ftrace_nop_replace();
new = ftrace_call_replace(ip, addr);
@@ -190,43 +161,6 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
}
/*
- * The modifying_ftrace_code is used to tell the breakpoint
- * handler to call ftrace_int3_handler(). If it fails to
- * call this handler for a breakpoint added by ftrace, then
- * the kernel may crash.
- *
- * As atomic_writes on x86 do not need a barrier, we do not
- * need to add smp_mb()s for this to work. It is also considered
- * that we can not read the modifying_ftrace_code before
- * executing the breakpoint. That would be quite remarkable if
- * it could do that. Here's the flow that is required:
- *
- * CPU-0 CPU-1
- *
- * atomic_inc(mfc);
- * write int3s
- * <trap-int3> // implicit (r)mb
- * if (atomic_read(mfc))
- * call ftrace_int3_handler()
- *
- * Then when we are finished:
- *
- * atomic_dec(mfc);
- *
- * If we hit a breakpoint that was not set by ftrace, it does not
- * matter if ftrace_int3_handler() is called or not. It will
- * simply be ignored. But it is crucial that a ftrace nop/caller
- * breakpoint is handled. No other user should ever place a
- * breakpoint on an ftrace nop/caller location. It must only
- * be done by this code.
- */
-atomic_t modifying_ftrace_code __read_mostly;
-
-static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code);
-
-/*
* Should never be called:
* As it is only called by __ftrace_replace_code() which is called by
* ftrace_replace_code() that x86 overrides, and by ftrace_update_code()
@@ -238,452 +172,84 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
unsigned long addr)
{
WARN_ON(1);
- ftrace_expected = NULL;
return -EINVAL;
}
-static unsigned long ftrace_update_func;
-static unsigned long ftrace_update_func_call;
-
-static int update_ftrace_func(unsigned long ip, void *new)
-{
- unsigned char old[MCOUNT_INSN_SIZE];
- int ret;
-
- memcpy(old, (void *)ip, MCOUNT_INSN_SIZE);
-
- ftrace_update_func = ip;
- /* Make sure the breakpoints see the ftrace_update_func update */
- smp_wmb();
-
- /* See comment above by declaration of modifying_ftrace_code */
- atomic_inc(&modifying_ftrace_code);
-
- ret = ftrace_modify_code(ip, old, new);
-
- atomic_dec(&modifying_ftrace_code);
-
- return ret;
-}
-
int ftrace_update_ftrace_func(ftrace_func_t func)
{
- unsigned long ip = (unsigned long)(&ftrace_call);
- unsigned char *new;
- int ret;
-
- ftrace_update_func_call = (unsigned long)func;
-
- new = ftrace_call_replace(ip, (unsigned long)func);
- ret = update_ftrace_func(ip, new);
-
- /* Also update the regs callback function */
- if (!ret) {
- ip = (unsigned long)(&ftrace_regs_call);
- new = ftrace_call_replace(ip, (unsigned long)func);
- ret = update_ftrace_func(ip, new);
- }
-
- return ret;
-}
-
-static nokprobe_inline int is_ftrace_caller(unsigned long ip)
-{
- if (ip == ftrace_update_func)
- return 1;
-
- return 0;
-}
-
-/*
- * A breakpoint was added to the code address we are about to
- * modify, and this is the handle that will just skip over it.
- * We are either changing a nop into a trace call, or a trace
- * call to a nop. While the change is taking place, we treat
- * it just like it was a nop.
- */
-int ftrace_int3_handler(struct pt_regs *regs)
-{
unsigned long ip;
+ const char *new;
- if (WARN_ON_ONCE(!regs))
- return 0;
-
- ip = regs->ip - INT3_INSN_SIZE;
-
- if (ftrace_location(ip)) {
- int3_emulate_call(regs, (unsigned long)ftrace_regs_caller);
- return 1;
- } else if (is_ftrace_caller(ip)) {
- if (!ftrace_update_func_call) {
- int3_emulate_jmp(regs, ip + CALL_INSN_SIZE);
- return 1;
- }
- int3_emulate_call(regs, ftrace_update_func_call);
- return 1;
- }
-
- return 0;
-}
-NOKPROBE_SYMBOL(ftrace_int3_handler);
-
-static int ftrace_write(unsigned long ip, const char *val, int size)
-{
- ip = text_ip_addr(ip);
-
- if (probe_kernel_write((void *)ip, val, size))
- return -EPERM;
-
- return 0;
-}
-
-static int add_break(unsigned long ip, const char *old)
-{
- unsigned char replaced[MCOUNT_INSN_SIZE];
- unsigned char brk = BREAKPOINT_INSTRUCTION;
-
- if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
-
- ftrace_expected = old;
-
- /* Make sure it is what we expect it to be */
- if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
- return -EINVAL;
-
- return ftrace_write(ip, &brk, 1);
-}
-
-static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
-{
- unsigned const char *old;
- unsigned long ip = rec->ip;
-
- old = ftrace_call_replace(ip, addr);
-
- return add_break(rec->ip, old);
-}
-
-
-static int add_brk_on_nop(struct dyn_ftrace *rec)
-{
- unsigned const char *old;
-
- old = ftrace_nop_replace();
-
- return add_break(rec->ip, old);
-}
-
-static int add_breakpoints(struct dyn_ftrace *rec, bool enable)
-{
- unsigned long ftrace_addr;
- int ret;
-
- ftrace_addr = ftrace_get_addr_curr(rec);
-
- ret = ftrace_test_record(rec, enable);
-
- switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
-
- case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return add_brk_on_nop(rec);
-
- case FTRACE_UPDATE_MODIFY_CALL:
- case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return add_brk_on_call(rec, ftrace_addr);
- }
- return 0;
-}
-
-/*
- * On error, we need to remove breakpoints. This needs to
- * be done caefully. If the address does not currently have a
- * breakpoint, we know we are done. Otherwise, we look at the
- * remaining 4 bytes of the instruction. If it matches a nop
- * we replace the breakpoint with the nop. Otherwise we replace
- * it with the call instruction.
- */
-static int remove_breakpoint(struct dyn_ftrace *rec)
-{
- unsigned char ins[MCOUNT_INSN_SIZE];
- unsigned char brk = BREAKPOINT_INSTRUCTION;
- const unsigned char *nop;
- unsigned long ftrace_addr;
- unsigned long ip = rec->ip;
-
- /* If we fail the read, just give up */
- if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
-
- /* If this does not have a breakpoint, we are done */
- if (ins[0] != brk)
- return 0;
-
- nop = ftrace_nop_replace();
-
- /*
- * If the last 4 bytes of the instruction do not match
- * a nop, then we assume that this is a call to ftrace_addr.
- */
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
- /*
- * For extra paranoidism, we check if the breakpoint is on
- * a call that would actually jump to the ftrace_addr.
- * If not, don't touch the breakpoint, we make just create
- * a disaster.
- */
- ftrace_addr = ftrace_get_addr_new(rec);
- nop = ftrace_call_replace(ip, ftrace_addr);
-
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
- goto update;
-
- /* Check both ftrace_addr and ftrace_old_addr */
- ftrace_addr = ftrace_get_addr_curr(rec);
- nop = ftrace_call_replace(ip, ftrace_addr);
-
- ftrace_expected = nop;
-
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
- return -EINVAL;
- }
-
- update:
- return ftrace_write(ip, nop, 1);
-}
-
-static int add_update_code(unsigned long ip, unsigned const char *new)
-{
- /* skip breakpoint */
- ip++;
- new++;
- return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1);
-}
-
-static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_call_replace(ip, addr);
- return add_update_code(ip, new);
-}
-
-static int add_update_nop(struct dyn_ftrace *rec)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_nop_replace();
- return add_update_code(ip, new);
-}
-
-static int add_update(struct dyn_ftrace *rec, bool enable)
-{
- unsigned long ftrace_addr;
- int ret;
-
- ret = ftrace_test_record(rec, enable);
-
- ftrace_addr = ftrace_get_addr_new(rec);
-
- switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
-
- case FTRACE_UPDATE_MODIFY_CALL:
- case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return add_update_call(rec, ftrace_addr);
-
- case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return add_update_nop(rec);
- }
-
- return 0;
-}
-
-static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_call_replace(ip, addr);
-
- return ftrace_write(ip, new, 1);
-}
-
-static int finish_update_nop(struct dyn_ftrace *rec)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_nop_replace();
-
- return ftrace_write(ip, new, 1);
-}
-
-static int finish_update(struct dyn_ftrace *rec, bool enable)
-{
- unsigned long ftrace_addr;
- int ret;
-
- ret = ftrace_update_record(rec, enable);
-
- ftrace_addr = ftrace_get_addr_new(rec);
-
- switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
-
- case FTRACE_UPDATE_MODIFY_CALL:
- case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return finish_update_call(rec, ftrace_addr);
+ ip = (unsigned long)(&ftrace_call);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
- case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return finish_update_nop(rec);
- }
+ ip = (unsigned long)(&ftrace_regs_call);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
return 0;
}
-static void do_sync_core(void *data)
-{
- sync_core();
-}
-
-static void run_sync(void)
-{
- int enable_irqs;
-
- /* No need to sync if there's only one CPU */
- if (num_online_cpus() == 1)
- return;
-
- enable_irqs = irqs_disabled();
-
- /* We may be called with interrupts disabled (on bootup). */
- if (enable_irqs)
- local_irq_enable();
- on_each_cpu(do_sync_core, NULL, 1);
- if (enable_irqs)
- local_irq_disable();
-}
-
void ftrace_replace_code(int enable)
{
struct ftrace_rec_iter *iter;
struct dyn_ftrace *rec;
- const char *report = "adding breakpoints";
- int count = 0;
+ const char *new, *old;
int ret;
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
- ret = add_breakpoints(rec, enable);
- if (ret)
- goto remove_breakpoints;
- count++;
- }
-
- run_sync();
+ switch (ftrace_test_record(rec, enable)) {
+ case FTRACE_UPDATE_IGNORE:
+ default:
+ continue;
- report = "updating code";
- count = 0;
+ case FTRACE_UPDATE_MAKE_CALL:
+ old = ftrace_nop_replace();
+ break;
- for_ftrace_rec_iter(iter) {
- rec = ftrace_rec_iter_record(iter);
+ case FTRACE_UPDATE_MODIFY_CALL:
+ case FTRACE_UPDATE_MAKE_NOP:
+ old = ftrace_call_replace(rec->ip, ftrace_get_addr_curr(rec));
+ break;
+ }
- ret = add_update(rec, enable);
- if (ret)
- goto remove_breakpoints;
- count++;
+ ret = ftrace_verify_code(rec->ip, old);
+ if (ret) {
+ ftrace_bug(ret, rec);
+ return;
+ }
}
- run_sync();
-
- report = "removing breakpoints";
- count = 0;
-
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
- ret = finish_update(rec, enable);
- if (ret)
- goto remove_breakpoints;
- count++;
- }
+ switch (ftrace_test_record(rec, enable)) {
+ case FTRACE_UPDATE_IGNORE:
+ default:
+ continue;
- run_sync();
+ case FTRACE_UPDATE_MAKE_CALL:
+ case FTRACE_UPDATE_MODIFY_CALL:
+ new = ftrace_call_replace(rec->ip, ftrace_get_addr_new(rec));
+ break;
- return;
+ case FTRACE_UPDATE_MAKE_NOP:
+ new = ftrace_nop_replace();
+ break;
+ }
- remove_breakpoints:
- pr_warn("Failed on %s (%d):\n", report, count);
- ftrace_bug(ret, rec);
- for_ftrace_rec_iter(iter) {
- rec = ftrace_rec_iter_record(iter);
- /*
- * Breakpoints are handled only when this function is in
- * progress. The system could not work with them.
- */
- if (remove_breakpoint(rec))
- BUG();
+ text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL);
+ ftrace_update_record(rec, enable);
}
- run_sync();
-}
-
-static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code)
-{
- int ret;
-
- ret = add_break(ip, old_code);
- if (ret)
- goto out;
-
- run_sync();
-
- ret = add_update_code(ip, new_code);
- if (ret)
- goto fail_update;
-
- run_sync();
-
- ret = ftrace_write(ip, new_code, 1);
- /*
- * The breakpoint is handled only when this function is in progress.
- * The system could not work if we could not remove it.
- */
- BUG_ON(ret);
- out:
- run_sync();
- return ret;
-
- fail_update:
- /* Also here the system could not work with the breakpoint */
- if (ftrace_write(ip, old_code, 1))
- BUG();
- goto out;
+ text_poke_finish();
}
void arch_ftrace_update_code(int command)
{
- /* See comment above by declaration of modifying_ftrace_code */
- atomic_inc(&modifying_ftrace_code);
-
ftrace_modify_all_code(command);
-
- atomic_dec(&modifying_ftrace_code);
}
int __init ftrace_dyn_arch_init(void)
@@ -748,6 +314,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
unsigned long start_offset;
unsigned long end_offset;
unsigned long op_offset;
+ unsigned long call_offset;
unsigned long offset;
unsigned long npages;
unsigned long size;
@@ -764,10 +331,12 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
start_offset = (unsigned long)ftrace_regs_caller;
end_offset = (unsigned long)ftrace_regs_caller_end;
op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
+ call_offset = (unsigned long)ftrace_regs_call;
} else {
start_offset = (unsigned long)ftrace_caller;
end_offset = (unsigned long)ftrace_epilogue;
op_offset = (unsigned long)ftrace_caller_op_ptr;
+ call_offset = (unsigned long)ftrace_call;
}
size = end_offset - start_offset;
@@ -824,16 +393,21 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* put in the new offset to the ftrace_ops */
memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
+ /* put in the call to the function */
+ mutex_lock(&text_mutex);
+ call_offset -= start_offset;
+ memcpy(trampoline + call_offset,
+ text_gen_insn(CALL_INSN_OPCODE,
+ trampoline + call_offset,
+ ftrace_ops_get_func(ops)), CALL_INSN_SIZE);
+ mutex_unlock(&text_mutex);
+
/* ALLOC_TRAMP flags lets us know we created it */
ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
set_vm_flush_reset_perms(trampoline);
- /*
- * Module allocation needs to be completed by making the page
- * executable. The page is still writable, which is a security hazard,
- * but anyhow ftrace breaks W^X completely.
- */
+ set_memory_ro((unsigned long)trampoline, npages);
set_memory_x((unsigned long)trampoline, npages);
return (unsigned long)trampoline;
fail:
@@ -860,62 +434,54 @@ static unsigned long calc_trampoline_call_offset(bool save_regs)
void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
{
ftrace_func_t func;
- unsigned char *new;
unsigned long offset;
unsigned long ip;
unsigned int size;
- int ret, npages;
+ const char *new;
- if (ops->trampoline) {
- /*
- * The ftrace_ops caller may set up its own trampoline.
- * In such a case, this code must not modify it.
- */
- if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
- return;
- npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT;
- set_memory_rw(ops->trampoline, npages);
- } else {
+ if (!ops->trampoline) {
ops->trampoline = create_trampoline(ops, &size);
if (!ops->trampoline)
return;
ops->trampoline_size = size;
- npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ return;
}
+ /*
+ * The ftrace_ops caller may set up its own trampoline.
+ * In such a case, this code must not modify it.
+ */
+ if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ return;
+
offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
ip = ops->trampoline + offset;
-
func = ftrace_ops_get_func(ops);
- ftrace_update_func_call = (unsigned long)func;
-
+ mutex_lock(&text_mutex);
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
- ret = update_ftrace_func(ip, new);
- set_memory_ro(ops->trampoline, npages);
-
- /* The update should never fail */
- WARN_ON(ret);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ mutex_unlock(&text_mutex);
}
/* Return the address of the function the trampoline calls */
static void *addr_from_call(void *ptr)
{
- union ftrace_code_union calc;
+ union text_poke_insn call;
int ret;
- ret = probe_kernel_read(&calc, ptr, MCOUNT_INSN_SIZE);
+ ret = probe_kernel_read(&call, ptr, CALL_INSN_SIZE);
if (WARN_ON_ONCE(ret < 0))
return NULL;
/* Make sure this is a call */
- if (WARN_ON_ONCE(calc.op != 0xe8)) {
- pr_warn("Expected e8, got %x\n", calc.op);
+ if (WARN_ON_ONCE(call.opcode != CALL_INSN_OPCODE)) {
+ pr_warn("Expected E8, got %x\n", call.opcode);
return NULL;
}
- return ptr + MCOUNT_INSN_SIZE + calc.offset;
+ return ptr + CALL_INSN_SIZE + call.disp;
}
void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
@@ -982,19 +548,18 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
#ifdef CONFIG_DYNAMIC_FTRACE
extern void ftrace_graph_call(void);
-static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
+static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
{
- return ftrace_text_replace(0xe9, ip, addr);
+ return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr);
}
static int ftrace_mod_jmp(unsigned long ip, void *func)
{
- unsigned char *new;
+ const char *new;
- ftrace_update_func_call = 0UL;
new = ftrace_jmp_replace(ip, (unsigned long)func);
-
- return update_ftrace_func(ip, new);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ return 0;
}
int ftrace_enable_ftrace_graph_caller(void)
@@ -1020,10 +585,9 @@ int ftrace_disable_ftrace_graph_caller(void)
void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
unsigned long frame_pointer)
{
+ unsigned long return_hooker = (unsigned long)&return_to_handler;
unsigned long old;
int faulted;
- unsigned long return_hooker = (unsigned long)
- &return_to_handler;
/*
* When resuming from suspend-to-ram, this function can be indirectly
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index c1a8b9e71408..9c4498ea0b3c 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -16,15 +16,7 @@
#include <asm/alternative.h>
#include <asm/text-patching.h>
-union jump_code_union {
- char code[JUMP_LABEL_NOP_SIZE];
- struct {
- char jump;
- int offset;
- } __attribute__((packed));
-};
-
-static void bug_at(unsigned char *ip, int line)
+static void bug_at(const void *ip, int line)
{
/*
* The location is not an op that we were expecting.
@@ -35,42 +27,42 @@ static void bug_at(unsigned char *ip, int line)
BUG();
}
-static void __jump_label_set_jump_code(struct jump_entry *entry,
- enum jump_label_type type,
- union jump_code_union *code,
- int init)
+static const void *
+__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init)
{
const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
- const void *expect;
+ const void *expect, *code;
+ const void *addr, *dest;
int line;
- code->jump = 0xe9;
- code->offset = jump_entry_target(entry) -
- (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
+ addr = (void *)jump_entry_code(entry);
+ dest = (void *)jump_entry_target(entry);
+
+ code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest);
if (init) {
expect = default_nop; line = __LINE__;
} else if (type == JUMP_LABEL_JMP) {
expect = ideal_nop; line = __LINE__;
} else {
- expect = code->code; line = __LINE__;
+ expect = code; line = __LINE__;
}
- if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE))
- bug_at((void *)jump_entry_code(entry), line);
+ if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE))
+ bug_at(addr, line);
if (type == JUMP_LABEL_NOP)
- memcpy(code, ideal_nop, JUMP_LABEL_NOP_SIZE);
+ code = ideal_nop;
+
+ return code;
}
-static void __ref __jump_label_transform(struct jump_entry *entry,
- enum jump_label_type type,
- int init)
+static void inline __jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type,
+ int init)
{
- union jump_code_union code;
-
- __jump_label_set_jump_code(entry, type, &code, init);
+ const void *opcode = __jump_label_set_jump_code(entry, type, init);
/*
* As long as only a single processor is running and the code is still
@@ -84,31 +76,33 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
* always nop being the 'currently valid' instruction
*/
if (init || system_state == SYSTEM_BOOTING) {
- text_poke_early((void *)jump_entry_code(entry), &code,
+ text_poke_early((void *)jump_entry_code(entry), opcode,
JUMP_LABEL_NOP_SIZE);
return;
}
- text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, NULL);
+ text_poke_bp((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE, NULL);
}
-void arch_jump_label_transform(struct jump_entry *entry,
- enum jump_label_type type)
+static void __ref jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type,
+ int init)
{
mutex_lock(&text_mutex);
- __jump_label_transform(entry, type, 0);
+ __jump_label_transform(entry, type, init);
mutex_unlock(&text_mutex);
}
-#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
-static struct text_poke_loc tp_vec[TP_VEC_MAX];
-static int tp_vec_nr;
+void arch_jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
+{
+ jump_label_transform(entry, type, 0);
+}
bool arch_jump_label_transform_queue(struct jump_entry *entry,
enum jump_label_type type)
{
- struct text_poke_loc *tp;
- void *entry_code;
+ const void *opcode;
if (system_state == SYSTEM_BOOTING) {
/*
@@ -118,53 +112,19 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry,
return true;
}
- /*
- * No more space in the vector, tell upper layer to apply
- * the queue before continuing.
- */
- if (tp_vec_nr == TP_VEC_MAX)
- return false;
-
- tp = &tp_vec[tp_vec_nr];
-
- entry_code = (void *)jump_entry_code(entry);
-
- /*
- * The INT3 handler will do a bsearch in the queue, so we need entries
- * to be sorted. We can survive an unsorted list by rejecting the entry,
- * forcing the generic jump_label code to apply the queue. Warning once,
- * to raise the attention to the case of an unsorted entry that is
- * better not happen, because, in the worst case we will perform in the
- * same way as we do without batching - with some more overhead.
- */
- if (tp_vec_nr > 0) {
- int prev = tp_vec_nr - 1;
- struct text_poke_loc *prev_tp = &tp_vec[prev];
-
- if (WARN_ON_ONCE(prev_tp->addr > entry_code))
- return false;
- }
-
- __jump_label_set_jump_code(entry, type,
- (union jump_code_union *)&tp->text, 0);
-
- text_poke_loc_init(tp, entry_code, NULL, JUMP_LABEL_NOP_SIZE, NULL);
-
- tp_vec_nr++;
-
+ mutex_lock(&text_mutex);
+ opcode = __jump_label_set_jump_code(entry, type, 0);
+ text_poke_queue((void *)jump_entry_code(entry),
+ opcode, JUMP_LABEL_NOP_SIZE, NULL);
+ mutex_unlock(&text_mutex);
return true;
}
void arch_jump_label_transform_apply(void)
{
- if (!tp_vec_nr)
- return;
-
mutex_lock(&text_mutex);
- text_poke_bp_batch(tp_vec, tp_vec_nr);
+ text_poke_finish();
mutex_unlock(&text_mutex);
-
- tp_vec_nr = 0;
}
static enum {
@@ -193,5 +153,5 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
jlstate = JL_STATE_NO_UPDATE;
}
if (jlstate == JL_STATE_UPDATE)
- __jump_label_transform(entry, type, 1);
+ jump_label_transform(entry, type, 1);
}
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index a0c223ab7264..4d7022a740ab 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -120,14 +120,14 @@ __synthesize_relative_insn(void *dest, void *from, void *to, u8 op)
/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
void synthesize_reljump(void *dest, void *from, void *to)
{
- __synthesize_relative_insn(dest, from, to, RELATIVEJUMP_OPCODE);
+ __synthesize_relative_insn(dest, from, to, JMP32_INSN_OPCODE);
}
NOKPROBE_SYMBOL(synthesize_reljump);
/* Insert a call instruction at address 'from', which calls address 'to'.*/
void synthesize_relcall(void *dest, void *from, void *to)
{
- __synthesize_relative_insn(dest, from, to, RELATIVECALL_OPCODE);
+ __synthesize_relative_insn(dest, from, to, CALL_INSN_OPCODE);
}
NOKPROBE_SYMBOL(synthesize_relcall);
@@ -302,7 +302,7 @@ static int can_probe(unsigned long paddr)
* Another debugging subsystem might insert this breakpoint.
* In that case, we can't recover it.
*/
- if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
return 0;
addr += insn.length;
}
@@ -357,7 +357,7 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
return 0;
/* Another subsystem puts a breakpoint, failed to recover */
- if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ if (insn->opcode.bytes[0] == INT3_INSN_OPCODE)
return 0;
/* We should not singlestep on the exception masking instructions */
@@ -401,14 +401,14 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p,
int len = insn->length;
if (can_boost(insn, p->addr) &&
- MAX_INSN_SIZE - len >= RELATIVEJUMP_SIZE) {
+ MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) {
/*
* These instructions can be executed directly if it
* jumps back to correct address.
*/
synthesize_reljump(buf + len, p->ainsn.insn + len,
p->addr + insn->length);
- len += RELATIVEJUMP_SIZE;
+ len += JMP32_INSN_SIZE;
p->ainsn.boostable = true;
} else {
p->ainsn.boostable = false;
@@ -502,12 +502,14 @@ int arch_prepare_kprobe(struct kprobe *p)
void arch_arm_kprobe(struct kprobe *p)
{
- text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
+ text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1);
+ text_poke_sync();
}
void arch_disarm_kprobe(struct kprobe *p)
{
text_poke(p->addr, &p->opcode, 1);
+ text_poke_sync();
}
void arch_remove_kprobe(struct kprobe *p)
@@ -610,7 +612,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
regs->flags |= X86_EFLAGS_TF;
regs->flags &= ~X86_EFLAGS_IF;
/* single step inline if the instruction is an int3 */
- if (p->opcode == BREAKPOINT_INSTRUCTION)
+ if (p->opcode == INT3_INSN_OPCODE)
regs->ip = (unsigned long)p->addr;
else
regs->ip = (unsigned long)p->ainsn.insn;
@@ -696,7 +698,7 @@ int kprobe_int3_handler(struct pt_regs *regs)
reset_current_kprobe();
return 1;
}
- } else if (*addr != BREAKPOINT_INSTRUCTION) {
+ } else if (*addr != INT3_INSN_OPCODE) {
/*
* The breakpoint instruction was removed right
* after we hit it. Another cpu has removed
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 8900329c28a7..3f45b5c43a71 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -38,7 +38,7 @@ unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
long offs;
int i;
- for (i = 0; i < RELATIVEJUMP_SIZE; i++) {
+ for (i = 0; i < JMP32_INSN_SIZE; i++) {
kp = get_kprobe((void *)addr - i);
/* This function only handles jump-optimized kprobe */
if (kp && kprobe_optimized(kp)) {
@@ -62,10 +62,10 @@ found:
if (addr == (unsigned long)kp->addr) {
buf[0] = kp->opcode;
- memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+ memcpy(buf + 1, op->optinsn.copied_insn, DISP32_SIZE);
} else {
offs = addr - (unsigned long)kp->addr - 1;
- memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs);
+ memcpy(buf, op->optinsn.copied_insn + offs, DISP32_SIZE - offs);
}
return (unsigned long)buf;
@@ -141,8 +141,6 @@ STACK_FRAME_NON_STANDARD(optprobe_template_func);
#define TMPL_END_IDX \
((long)optprobe_template_end - (long)optprobe_template_entry)
-#define INT3_SIZE sizeof(kprobe_opcode_t)
-
/* Optimized kprobe call back function: called from optinsn */
static void
optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
@@ -162,7 +160,7 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
regs->cs |= get_kernel_rpl();
regs->gs = 0;
#endif
- regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
+ regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE;
regs->orig_ax = ~0UL;
__this_cpu_write(current_kprobe, &op->kp);
@@ -179,7 +177,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
struct insn insn;
int len = 0, ret;
- while (len < RELATIVEJUMP_SIZE) {
+ while (len < JMP32_INSN_SIZE) {
ret = __copy_instruction(dest + len, src + len, real + len, &insn);
if (!ret || !can_boost(&insn, src + len))
return -EINVAL;
@@ -271,7 +269,7 @@ static int can_optimize(unsigned long paddr)
return 0;
/* Check there is enough space for a relative jump. */
- if (size - offset < RELATIVEJUMP_SIZE)
+ if (size - offset < JMP32_INSN_SIZE)
return 0;
/* Decode instructions */
@@ -290,15 +288,15 @@ static int can_optimize(unsigned long paddr)
kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
insn_get_length(&insn);
/* Another subsystem puts a breakpoint */
- if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
return 0;
/* Recover address */
insn.kaddr = (void *)addr;
insn.next_byte = (void *)(addr + insn.length);
/* Check any instructions don't jump into target */
if (insn_is_indirect_jump(&insn) ||
- insn_jump_into_range(&insn, paddr + INT3_SIZE,
- RELATIVE_ADDR_SIZE))
+ insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE,
+ DISP32_SIZE))
return 0;
addr += insn.length;
}
@@ -374,7 +372,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
* Verify if the address gap is in 2GB range, because this uses
* a relative jump.
*/
- rel = (long)slot - (long)op->kp.addr + RELATIVEJUMP_SIZE;
+ rel = (long)slot - (long)op->kp.addr + JMP32_INSN_SIZE;
if (abs(rel) > 0x7fffffff) {
ret = -ERANGE;
goto err;
@@ -401,7 +399,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
/* Set returning jmp instruction at the tail of out-of-line buffer */
synthesize_reljump(buf + len, slot + len,
(u8 *)op->kp.addr + op->optinsn.size);
- len += RELATIVEJUMP_SIZE;
+ len += JMP32_INSN_SIZE;
/* We have to use text_poke() for instruction buffer because it is RO */
text_poke(slot, buf, len);
@@ -416,49 +414,50 @@ err:
}
/*
- * Replace breakpoints (int3) with relative jumps.
+ * Replace breakpoints (INT3) with relative jumps (JMP.d32).
* Caller must call with locking kprobe_mutex and text_mutex.
+ *
+ * The caller will have installed a regular kprobe and after that issued
+ * syncrhonize_rcu_tasks(), this ensures that the instruction(s) that live in
+ * the 4 bytes after the INT3 are unused and can now be overwritten.
*/
void arch_optimize_kprobes(struct list_head *oplist)
{
struct optimized_kprobe *op, *tmp;
- u8 insn_buff[RELATIVEJUMP_SIZE];
+ u8 insn_buff[JMP32_INSN_SIZE];
list_for_each_entry_safe(op, tmp, oplist, list) {
s32 rel = (s32)((long)op->optinsn.insn -
- ((long)op->kp.addr + RELATIVEJUMP_SIZE));
+ ((long)op->kp.addr + JMP32_INSN_SIZE));
WARN_ON(kprobe_disabled(&op->kp));
/* Backup instructions which will be replaced by jump address */
- memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
- RELATIVE_ADDR_SIZE);
+ memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_INSN_SIZE,
+ DISP32_SIZE);
- insn_buff[0] = RELATIVEJUMP_OPCODE;
+ insn_buff[0] = JMP32_INSN_OPCODE;
*(s32 *)(&insn_buff[1]) = rel;
- text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, NULL);
+ text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL);
list_del_init(&op->list);
}
}
-/* Replace a relative jump with a breakpoint (int3). */
+/*
+ * Replace a relative jump (JMP.d32) with a breakpoint (INT3).
+ *
+ * After that, we can restore the 4 bytes after the INT3 to undo what
+ * arch_optimize_kprobes() scribbled. This is safe since those bytes will be
+ * unused once the INT3 lands.
+ */
void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
- u8 insn_buff[RELATIVEJUMP_SIZE];
- u8 emulate_buff[RELATIVEJUMP_SIZE];
-
- /* Set int3 to first byte for kprobes */
- insn_buff[0] = BREAKPOINT_INSTRUCTION;
- memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
-
- emulate_buff[0] = RELATIVEJUMP_OPCODE;
- *(s32 *)(&emulate_buff[1]) = (s32)((long)op->optinsn.insn -
- ((long)op->kp.addr + RELATIVEJUMP_SIZE));
-
- text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
- emulate_buff);
+ arch_arm_kprobe(&op->kp);
+ text_poke(op->kp.addr + INT3_INSN_SIZE,
+ op->optinsn.copied_insn, DISP32_SIZE);
+ text_poke_sync();
}
/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 05da6b5b167b..f19de6f45d48 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -572,15 +572,6 @@ NOKPROBE_SYMBOL(do_general_protection);
dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
{
-#ifdef CONFIG_DYNAMIC_FTRACE
- /*
- * ftrace must be first, everything else may cause a recursive crash.
- * See note by declaration of modifying_ftrace_code in ftrace.c
- */
- if (unlikely(atomic_read(&modifying_ftrace_code)) &&
- ftrace_int3_handler(regs))
- return;
-#endif
if (poke_int3_handler(regs))
return;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8d29ae8d3eb7..23df4885bbed 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -873,34 +873,6 @@ void arch_remove_memory(int nid, u64 start, u64 size,
int kernel_set_to_readonly __read_mostly;
-void set_kernel_text_rw(void)
-{
- unsigned long start = PFN_ALIGN(_text);
- unsigned long size = PFN_ALIGN(_etext) - start;
-
- if (!kernel_set_to_readonly)
- return;
-
- pr_debug("Set kernel text: %lx - %lx for read write\n",
- start, start+size);
-
- set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
-}
-
-void set_kernel_text_ro(void)
-{
- unsigned long start = PFN_ALIGN(_text);
- unsigned long size = PFN_ALIGN(_etext) - start;
-
- if (!kernel_set_to_readonly)
- return;
-
- pr_debug("Set kernel text: %lx - %lx for read only\n",
- start, start+size);
-
- set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
-}
-
static void mark_nxdata_nx(void)
{
/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bcfede46fe02..abbdecb75fad 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1258,42 +1258,6 @@ void __init mem_init(void)
int kernel_set_to_readonly;
-void set_kernel_text_rw(void)
-{
- unsigned long start = PFN_ALIGN(_text);
- unsigned long end = PFN_ALIGN(_etext);
-
- if (!kernel_set_to_readonly)
- return;
-
- pr_debug("Set kernel text: %lx - %lx for read write\n",
- start, end);
-
- /*
- * Make the kernel identity mapping for text RW. Kernel text
- * mapping will always be RO. Refer to the comment in
- * static_protections() in pageattr.c
- */
- set_memory_rw(start, (end - start) >> PAGE_SHIFT);
-}
-
-void set_kernel_text_ro(void)
-{
- unsigned long start = PFN_ALIGN(_text);
- unsigned long end = PFN_ALIGN(_etext);
-
- if (!kernel_set_to_readonly)
- return;
-
- pr_debug("Set kernel text: %lx - %lx for read only\n",
- start, end);
-
- /*
- * Set the kernel identity mapping for text RO.
- */
- set_memory_ro(start, (end - start) >> PAGE_SHIFT);
-}
-
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);