From 510419435c6948fb32959d691bf84eaba41ca474 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 15 Dec 2011 17:56:36 +0100 Subject: perf/x86: Implement IBS event configuration This patch implements perf configuration for AMD IBS. The IBS pmu is selected using the type attribute in sysfs. There are two types of ibs pmus, for instruction fetch (IBS_FETCH) and for instruction execution (IBS_OP): /sys/bus/event_source/devices/ibs_fetch/type /sys/bus/event_source/devices/ibs_op/type Except for the sample period IBS can only be set up with raw config values and raw data samples. The event attributes for the syscall should be programmed like this (IBS_FETCH): type = get_pmu_type("/sys/bus/event_source/devices/ibs_fetch/type"); memset(&attr, 0, sizeof(attr)); attr.type = type; attr.sample_type = PERF_SAMPLE_CPU | PERF_SAMPLE_RAW; attr.config = IBS_FETCH_CONFIG_DEFAULT; This implementation does not yet support 64 bit counters. It is limited to the hardware counter bit width which is 20 bits. 64 bit support can be added later. Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1323968199-9326-2-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 92 +++++++++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 3b8a2d30d14e..36684eb248de 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -16,12 +16,67 @@ static u32 ibs_caps; #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) -static struct pmu perf_ibs; +#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) +#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT + +struct perf_ibs { + struct pmu pmu; + unsigned int msr; + u64 config_mask; + u64 cnt_mask; + u64 enable_mask; +}; + +static struct perf_ibs perf_ibs_fetch; +static struct perf_ibs perf_ibs_op; + +static struct perf_ibs *get_ibs_pmu(int type) +{ + if (perf_ibs_fetch.pmu.type == type) + return &perf_ibs_fetch; + if (perf_ibs_op.pmu.type == type) + return &perf_ibs_op; + return NULL; +} static int perf_ibs_init(struct perf_event *event) { - if (perf_ibs.type != event->attr.type) + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs; + u64 max_cnt, config; + + perf_ibs = get_ibs_pmu(event->attr.type); + if (!perf_ibs) return -ENOENT; + + config = event->attr.config; + if (config & ~perf_ibs->config_mask) + return -EINVAL; + + if (hwc->sample_period) { + if (config & perf_ibs->cnt_mask) + /* raw max_cnt may not be set */ + return -EINVAL; + if (hwc->sample_period & 0x0f) + /* lower 4 bits can not be set in ibs max cnt */ + return -EINVAL; + max_cnt = hwc->sample_period >> 4; + if (max_cnt & ~perf_ibs->cnt_mask) + /* out of range */ + return -EINVAL; + config |= max_cnt; + } else { + max_cnt = config & perf_ibs->cnt_mask; + event->attr.sample_period = max_cnt << 4; + hwc->sample_period = event->attr.sample_period; + } + + if (!max_cnt) + return -EINVAL; + + hwc->config_base = perf_ibs->msr; + hwc->config = config; + return 0; } @@ -34,10 +89,32 @@ static void perf_ibs_del(struct perf_event *event, int flags) { } -static struct pmu perf_ibs = { - .event_init= perf_ibs_init, - .add= perf_ibs_add, - .del= perf_ibs_del, +static struct perf_ibs perf_ibs_fetch = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + }, + .msr = MSR_AMD64_IBSFETCHCTL, + .config_mask = IBS_FETCH_CONFIG_MASK, + .cnt_mask = IBS_FETCH_MAX_CNT, + .enable_mask = IBS_FETCH_ENABLE, +}; + +static struct perf_ibs perf_ibs_op = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + }, + .msr = MSR_AMD64_IBSOPCTL, + .config_mask = IBS_OP_CONFIG_MASK, + .cnt_mask = IBS_OP_MAX_CNT, + .enable_mask = IBS_OP_ENABLE, }; static __init int perf_event_ibs_init(void) @@ -45,7 +122,8 @@ static __init int perf_event_ibs_init(void) if (!ibs_caps) return -ENODEV; /* ibs not supported by the cpu */ - perf_pmu_register(&perf_ibs, "ibs", -1); + perf_pmu_register(&perf_ibs_fetch.pmu, "ibs_fetch", -1); + perf_pmu_register(&perf_ibs_op.pmu, "ibs_op", -1); printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); return 0; -- cgit v1.2.3 From b7074f1fbd6149eac1ec25063e4a364c39a85473 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 15 Dec 2011 17:56:37 +0100 Subject: perf/x86: Implement IBS interrupt handler This patch implements code to handle ibs interrupts. If ibs data is available a raw perf_event data sample is created and sent back to the userland. This patch only implements the storage of ibs data in the raw sample, but this could be extended in a later patch by generating generic event data such as the rip from the ibs sampling data. Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1323968199-9326-3-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 5 ++ arch/x86/kernel/cpu/perf_event_amd_ibs.c | 84 ++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index a6962d9161a0..4e3cd382a06f 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -127,6 +127,8 @@ #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 +#define MSR_AMD64_IBSFETCH_REG_COUNT 3 +#define MSR_AMD64_IBSFETCH_REG_MASK ((1UL< +#include + +#include + #define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) #define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT @@ -25,6 +30,18 @@ struct perf_ibs { u64 config_mask; u64 cnt_mask; u64 enable_mask; + u64 valid_mask; + unsigned long offset_mask[1]; + int offset_max; +}; + +struct perf_ibs_data { + u32 size; + union { + u32 data[0]; /* data buffer starts here */ + u32 caps; + }; + u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; }; static struct perf_ibs perf_ibs_fetch; @@ -101,6 +118,9 @@ static struct perf_ibs perf_ibs_fetch = { .config_mask = IBS_FETCH_CONFIG_MASK, .cnt_mask = IBS_FETCH_MAX_CNT, .enable_mask = IBS_FETCH_ENABLE, + .valid_mask = IBS_FETCH_VAL, + .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, + .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, }; static struct perf_ibs perf_ibs_op = { @@ -115,8 +135,71 @@ static struct perf_ibs perf_ibs_op = { .config_mask = IBS_OP_CONFIG_MASK, .cnt_mask = IBS_OP_MAX_CNT, .enable_mask = IBS_OP_ENABLE, + .valid_mask = IBS_OP_VAL, + .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, + .offset_max = MSR_AMD64_IBSOP_REG_COUNT, }; +static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) +{ + struct perf_event *event = NULL; + struct hw_perf_event *hwc = &event->hw; + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + struct perf_ibs_data ibs_data; + int offset, size; + unsigned int msr; + u64 *buf; + + msr = hwc->config_base; + buf = ibs_data.regs; + rdmsrl(msr, *buf); + if (!(*buf++ & perf_ibs->valid_mask)) + return 0; + + perf_sample_data_init(&data, 0); + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + ibs_data.caps = ibs_caps; + size = 1; + offset = 1; + do { + rdmsrl(msr + offset, *buf++); + size++; + offset = find_next_bit(perf_ibs->offset_mask, + perf_ibs->offset_max, + offset + 1); + } while (offset < perf_ibs->offset_max); + raw.size = sizeof(u32) + sizeof(u64) * size; + raw.data = ibs_data.data; + data.raw = &raw; + } + + regs = *iregs; /* XXX: update ip from ibs sample */ + + if (perf_event_overflow(event, &data, ®s)) + ; /* stop */ + else + /* reenable */ + wrmsrl(hwc->config_base, hwc->config | perf_ibs->enable_mask); + + return 1; +} + +static int __kprobes +perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) +{ + int handled = 0; + + handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs); + handled += perf_ibs_handle_irq(&perf_ibs_op, regs); + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} + static __init int perf_event_ibs_init(void) { if (!ibs_caps) @@ -124,6 +207,7 @@ static __init int perf_event_ibs_init(void) perf_pmu_register(&perf_ibs_fetch.pmu, "ibs_fetch", -1); perf_pmu_register(&perf_ibs_op.pmu, "ibs_op", -1); + register_nmi_handler(NMI_LOCAL, &perf_ibs_nmi_handler, 0, "perf_ibs"); printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); return 0; -- cgit v1.2.3 From 4db2e8e6500d9ba6406f2714fa3968b39a325274 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 15 Dec 2011 17:56:38 +0100 Subject: perf/x86: Implement IBS pmu control ops Add code to control the IBS pmu. We need to maintain per-cpu states. Since some states are used and changed by the nmi handler, access to these states must be atomic. Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1323968199-9326-4-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 106 ++++++++++++++++++++++++++++++- 1 file changed, 103 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index a7ec6bdf0a63..40a6d9d5dd23 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -24,6 +24,19 @@ static u32 ibs_caps; #define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) #define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT +enum ibs_states { + IBS_ENABLED = 0, + IBS_STARTED = 1, + IBS_STOPPING = 2, + + IBS_MAX_STATES, +}; + +struct cpu_perf_ibs { + struct perf_event *event; + unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)]; +}; + struct perf_ibs { struct pmu pmu; unsigned int msr; @@ -33,6 +46,7 @@ struct perf_ibs { u64 valid_mask; unsigned long offset_mask[1]; int offset_max; + struct cpu_perf_ibs __percpu *pcpu; }; struct perf_ibs_data { @@ -97,15 +111,66 @@ static int perf_ibs_init(struct perf_event *event) return 0; } +static void perf_ibs_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (test_and_set_bit(IBS_STARTED, pcpu->state)) + return; + + wrmsrl(hwc->config_base, hwc->config | perf_ibs->enable_mask); +} + +static void perf_ibs_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 val; + + if (!test_and_clear_bit(IBS_STARTED, pcpu->state)) + return; + + set_bit(IBS_STOPPING, pcpu->state); + + rdmsrl(hwc->config_base, val); + val &= ~perf_ibs->enable_mask; + wrmsrl(hwc->config_base, val); +} + static int perf_ibs_add(struct perf_event *event, int flags) { + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (test_and_set_bit(IBS_ENABLED, pcpu->state)) + return -ENOSPC; + + pcpu->event = event; + + if (flags & PERF_EF_START) + perf_ibs_start(event, PERF_EF_RELOAD); + return 0; } static void perf_ibs_del(struct perf_event *event, int flags) { + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (!test_and_clear_bit(IBS_ENABLED, pcpu->state)) + return; + + perf_ibs_stop(event, 0); + + pcpu->event = NULL; } +static void perf_ibs_read(struct perf_event *event) { } + static struct perf_ibs perf_ibs_fetch = { .pmu = { .task_ctx_nr = perf_invalid_context, @@ -113,6 +178,9 @@ static struct perf_ibs perf_ibs_fetch = { .event_init = perf_ibs_init, .add = perf_ibs_add, .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, }, .msr = MSR_AMD64_IBSFETCHCTL, .config_mask = IBS_FETCH_CONFIG_MASK, @@ -130,6 +198,9 @@ static struct perf_ibs perf_ibs_op = { .event_init = perf_ibs_init, .add = perf_ibs_add, .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, }, .msr = MSR_AMD64_IBSOPCTL, .config_mask = IBS_OP_CONFIG_MASK, @@ -142,7 +213,8 @@ static struct perf_ibs perf_ibs_op = { static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) { - struct perf_event *event = NULL; + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + struct perf_event *event = pcpu->event; struct hw_perf_event *hwc = &event->hw; struct perf_sample_data data; struct perf_raw_record raw; @@ -152,6 +224,14 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) unsigned int msr; u64 *buf; + if (!test_bit(IBS_STARTED, pcpu->state)) { + /* Catch spurious interrupts after stopping IBS: */ + if (!test_and_clear_bit(IBS_STOPPING, pcpu->state)) + return 0; + rdmsrl(perf_ibs->msr, *ibs_data.regs); + return (*ibs_data.regs & perf_ibs->valid_mask) ? 1 : 0; + } + msr = hwc->config_base; buf = ibs_data.regs; rdmsrl(msr, *buf); @@ -200,13 +280,33 @@ perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) return handled; } +static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) +{ + struct cpu_perf_ibs __percpu *pcpu; + int ret; + + pcpu = alloc_percpu(struct cpu_perf_ibs); + if (!pcpu) + return -ENOMEM; + + perf_ibs->pcpu = pcpu; + + ret = perf_pmu_register(&perf_ibs->pmu, name, -1); + if (ret) { + perf_ibs->pcpu = NULL; + free_percpu(pcpu); + } + + return ret; +} + static __init int perf_event_ibs_init(void) { if (!ibs_caps) return -ENODEV; /* ibs not supported by the cpu */ - perf_pmu_register(&perf_ibs_fetch.pmu, "ibs_fetch", -1); - perf_pmu_register(&perf_ibs_op.pmu, "ibs_op", -1); + perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); register_nmi_handler(NMI_LOCAL, &perf_ibs_nmi_handler, 0, "perf_ibs"); printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); -- cgit v1.2.3 From db98c5faf8cb350212ea3af786cb3ba0d4e7a01e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 15 Dec 2011 17:56:39 +0100 Subject: perf/x86: Implement 64-bit counter support for IBS This patch implements 64 bit counter support for IBS. The sampling period is no longer limited to the hw counter width. The functions perf_event_set_period() and perf_event_try_update() can be used as generic functions. They can replace similar code that is duplicate across architectures. Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1323968199-9326-5-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 2 + arch/x86/kernel/cpu/perf_event_amd_ibs.c | 204 +++++++++++++++++++++++++++---- 2 files changed, 185 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index e8fb2c7a5f4f..9cf66965141d 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -177,6 +177,8 @@ struct x86_pmu_capability { #define IBS_FETCH_MAX_CNT 0x0000FFFFULL /* IbsOpCtl bits */ +/* lower 4 bits of the current count are ignored: */ +#define IBS_OP_CUR_CNT (0xFFFF0ULL<<32) #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 40a6d9d5dd23..573d24873459 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -44,9 +44,11 @@ struct perf_ibs { u64 cnt_mask; u64 enable_mask; u64 valid_mask; + u64 max_period; unsigned long offset_mask[1]; int offset_max; struct cpu_perf_ibs __percpu *pcpu; + u64 (*get_count)(u64 config); }; struct perf_ibs_data { @@ -58,6 +60,78 @@ struct perf_ibs_data { u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; }; +static int +perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *count) +{ + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int overflow = 0; + + /* + * If we are way outside a reasonable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + if (unlikely(left <= 0)) { + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + if (unlikely(left < min)) + left = min; + + if (left > max) + left = max; + + *count = (u64)left; + + return overflow; +} + +static int +perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width) +{ + struct hw_perf_event *hwc = &event->hw; + int shift = 64 - width; + u64 prev_raw_count; + u64 delta; + + /* + * Careful: an NMI might modify the previous event value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic event atomically: + */ + prev_raw_count = local64_read(&hwc->prev_count); + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + return 0; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); + + return 1; +} + static struct perf_ibs perf_ibs_fetch; static struct perf_ibs perf_ibs_op; @@ -91,18 +165,14 @@ static int perf_ibs_init(struct perf_event *event) if (hwc->sample_period & 0x0f) /* lower 4 bits can not be set in ibs max cnt */ return -EINVAL; - max_cnt = hwc->sample_period >> 4; - if (max_cnt & ~perf_ibs->cnt_mask) - /* out of range */ - return -EINVAL; - config |= max_cnt; } else { max_cnt = config & perf_ibs->cnt_mask; + config &= ~perf_ibs->cnt_mask; event->attr.sample_period = max_cnt << 4; hwc->sample_period = event->attr.sample_period; } - if (!max_cnt) + if (!hwc->sample_period) return -EINVAL; hwc->config_base = perf_ibs->msr; @@ -111,16 +181,71 @@ static int perf_ibs_init(struct perf_event *event) return 0; } +static int perf_ibs_set_period(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 *period) +{ + int ret; + + /* ignore lower 4 bits in min count: */ + ret = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); + local64_set(&hwc->prev_count, 0); + + return ret; +} + +static u64 get_ibs_fetch_count(u64 config) +{ + return (config & IBS_FETCH_CNT) >> 12; +} + +static u64 get_ibs_op_count(u64 config) +{ + return (config & IBS_OP_CUR_CNT) >> 32; +} + +static void +perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, + u64 config) +{ + u64 count = perf_ibs->get_count(config); + + while (!perf_event_try_update(event, count, 20)) { + rdmsrl(event->hw.config_base, config); + count = perf_ibs->get_count(config); + } +} + +/* Note: The enable mask must be encoded in the config argument. */ +static inline void perf_ibs_enable_event(struct hw_perf_event *hwc, u64 config) +{ + wrmsrl(hwc->config_base, hwc->config | config); +} + +/* + * We cannot restore the ibs pmu state, so we always needs to update + * the event while stopping it and then reset the state when starting + * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in + * perf_ibs_start()/perf_ibs_stop() and instead always do it. + */ static void perf_ibs_start(struct perf_event *event, int flags) { struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 config; - if (test_and_set_bit(IBS_STARTED, pcpu->state)) + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) return; - wrmsrl(hwc->config_base, hwc->config | perf_ibs->enable_mask); + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + hwc->state = 0; + + perf_ibs_set_period(perf_ibs, hwc, &config); + config = (config >> 4) | perf_ibs->enable_mask; + set_bit(IBS_STARTED, pcpu->state); + perf_ibs_enable_event(hwc, config); + + perf_event_update_userpage(event); } static void perf_ibs_stop(struct perf_event *event, int flags) @@ -129,15 +254,28 @@ static void perf_ibs_stop(struct perf_event *event, int flags) struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); u64 val; + int stopping; - if (!test_and_clear_bit(IBS_STARTED, pcpu->state)) - return; + stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); - set_bit(IBS_STOPPING, pcpu->state); + if (!stopping && (hwc->state & PERF_HES_UPTODATE)) + return; rdmsrl(hwc->config_base, val); - val &= ~perf_ibs->enable_mask; - wrmsrl(hwc->config_base, val); + + if (stopping) { + set_bit(IBS_STOPPING, pcpu->state); + val &= ~perf_ibs->enable_mask; + wrmsrl(hwc->config_base, val); + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + if (hwc->state & PERF_HES_UPTODATE) + return; + + perf_ibs_event_update(perf_ibs, event, val); + hwc->state |= PERF_HES_UPTODATE; } static int perf_ibs_add(struct perf_event *event, int flags) @@ -148,6 +286,8 @@ static int perf_ibs_add(struct perf_event *event, int flags) if (test_and_set_bit(IBS_ENABLED, pcpu->state)) return -ENOSPC; + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + pcpu->event = event; if (flags & PERF_EF_START) @@ -164,9 +304,11 @@ static void perf_ibs_del(struct perf_event *event, int flags) if (!test_and_clear_bit(IBS_ENABLED, pcpu->state)) return; - perf_ibs_stop(event, 0); + perf_ibs_stop(event, PERF_EF_UPDATE); pcpu->event = NULL; + + perf_event_update_userpage(event); } static void perf_ibs_read(struct perf_event *event) { } @@ -187,8 +329,11 @@ static struct perf_ibs perf_ibs_fetch = { .cnt_mask = IBS_FETCH_MAX_CNT, .enable_mask = IBS_FETCH_ENABLE, .valid_mask = IBS_FETCH_VAL, + .max_period = IBS_FETCH_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, + + .get_count = get_ibs_fetch_count, }; static struct perf_ibs perf_ibs_op = { @@ -207,8 +352,11 @@ static struct perf_ibs perf_ibs_op = { .cnt_mask = IBS_OP_MAX_CNT, .enable_mask = IBS_OP_ENABLE, .valid_mask = IBS_OP_VAL, + .max_period = IBS_OP_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, .offset_max = MSR_AMD64_IBSOP_REG_COUNT, + + .get_count = get_ibs_op_count, }; static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) @@ -220,9 +368,9 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) struct perf_raw_record raw; struct pt_regs regs; struct perf_ibs_data ibs_data; - int offset, size; + int offset, size, overflow, reenable; unsigned int msr; - u64 *buf; + u64 *buf, config; if (!test_bit(IBS_STARTED, pcpu->state)) { /* Catch spurious interrupts after stopping IBS: */ @@ -257,11 +405,25 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) regs = *iregs; /* XXX: update ip from ibs sample */ - if (perf_event_overflow(event, &data, ®s)) - ; /* stop */ - else - /* reenable */ - wrmsrl(hwc->config_base, hwc->config | perf_ibs->enable_mask); + /* + * Emulate IbsOpCurCnt in MSRC001_1033 (IbsOpCtl), not + * supported in all cpus. As this triggered an interrupt, we + * set the current count to the max count. + */ + config = ibs_data.regs[0]; + if (perf_ibs == &perf_ibs_op && !(ibs_caps & IBS_CAPS_RDWROPCNT)) { + config &= ~IBS_OP_CUR_CNT; + config |= (config & IBS_OP_MAX_CNT) << 36; + } + + perf_ibs_event_update(perf_ibs, event, config); + + overflow = perf_ibs_set_period(perf_ibs, hwc, &config); + reenable = !(overflow && perf_event_overflow(event, &data, ®s)); + config = (config >> 4) | (reenable ? perf_ibs->enable_mask : 0); + perf_ibs_enable_event(hwc, config); + + perf_event_update_userpage(event); return 1; } -- cgit v1.2.3 From fab06992de6433af097c4a1d2d1b119812753ca7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Apr 2012 12:55:22 +0200 Subject: perf/x86: Clean up register_nmi_handler() usage A function name represents the pointer to it - no need to take the address of it. (Fixing this helps us introduce some macro magic around register_nmi_handler() in the future.) Cc: Robert Richter Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 573d24873459..8ff74d439041 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -469,7 +469,7 @@ static __init int perf_event_ibs_init(void) perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); - register_nmi_handler(NMI_LOCAL, &perf_ibs_nmi_handler, 0, "perf_ibs"); + register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); return 0; -- cgit v1.2.3 From 10c250234c98928d1e15c4cea1c44b9a25354ccf Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 5 Apr 2012 18:24:41 +0200 Subject: perf: Trivial cleanup of duplicate code Removing duplicate code. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333643084-26776-2-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index bb8e03407e18..e33e9cf160eb 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -484,9 +484,6 @@ static int __x86_pmu_event_init(struct perf_event *event) /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; - - /* mark not used */ - event->hw.extra_reg.idx = EXTRA_REG_NONE; event->hw.branch_reg.idx = EXTRA_REG_NONE; return x86_pmu.hw_config(event); -- cgit v1.2.3 From 5f09fc688936705b2020ca247df39ee27283668a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 5 Apr 2012 18:24:42 +0200 Subject: perf/x86: Fix cmpxchg() usage in amd_put_event_constraints() Now the return value of cmpxchg() is used to match an event. The change removes the duplicate event comparison and traverses the list until an event was removed. This also fixes the following warning: arch/x86/kernel/cpu/perf_event_amd.c:170: warning: value computed is not used Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333643084-26776-3-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 95e7fe1c5f0b..589286f28877 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -205,10 +205,8 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc, * when we come here */ for (i = 0; i < x86_pmu.num_counters; i++) { - if (nb->owners[i] == event) { - cmpxchg(nb->owners+i, event, NULL); + if (cmpxchg(nb->owners + i, event, NULL) == event) break; - } } } -- cgit v1.2.3 From 08d636b6d4fb80647fe8869ea1cd97b2c26a4751 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Aug 2011 09:57:10 -0400 Subject: ftrace/x86: Have arch x86_64 use breakpoints instead of stop machine This method changes x86 to add a breakpoint to the mcount locations instead of calling stop machine. Now that iret can be handled by NMIs, we perform the following to update code: 1) Add a breakpoint to all locations that will be modified 2) Sync all cores 3) Update all locations to be either a nop or call (except breakpoint op) 4) Sync all cores 5) Remove the breakpoint with the new code. 6) Sync all cores [ Added updates that Masami suggested: Use unlikely(modifying_ftrace_code) in int3 trap to keep kprobes efficient. Don't use NOTIFY_* in ftrace handler in int3 as it is not a notifier. ] Cc: H. Peter Anvin Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- arch/x86/include/asm/ftrace.h | 3 + arch/x86/kernel/ftrace.c | 342 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/traps.c | 8 +- include/linux/ftrace.h | 6 + 4 files changed, 358 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 268c783ab1c0..18d9005d9e4f 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -34,6 +34,7 @@ #ifndef __ASSEMBLY__ extern void mcount(void); +extern int modifying_ftrace_code; static inline unsigned long ftrace_call_adjust(unsigned long addr) { @@ -50,6 +51,8 @@ struct dyn_arch_ftrace { /* No extra data needed for x86 */ }; +int ftrace_int3_handler(struct pt_regs *regs); + #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c9a281f272fd..80af34739a9a 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -334,6 +335,347 @@ int ftrace_update_ftrace_func(ftrace_func_t func) return ret; } +int modifying_ftrace_code __read_mostly; + +/* + * A breakpoint was added to the code address we are about to + * modify, and this is the handle that will just skip over it. + * We are either changing a nop into a trace call, or a trace + * call to a nop. While the change is taking place, we treat + * it just like it was a nop. + */ +int ftrace_int3_handler(struct pt_regs *regs) +{ + if (WARN_ON_ONCE(!regs)) + return 0; + + if (!ftrace_location(regs->ip - 1)) + return 0; + + regs->ip += MCOUNT_INSN_SIZE - 1; + + return 1; +} + +static int ftrace_write(unsigned long ip, const char *val, int size) +{ + /* + * On x86_64, kernel text mappings are mapped read-only with + * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead + * of the kernel text mapping to modify the kernel text. + * + * For 32bit kernels, these mappings are same and we can use + * kernel identity mapping to modify code. + */ + if (within(ip, (unsigned long)_text, (unsigned long)_etext)) + ip = (unsigned long)__va(__pa(ip)); + + return probe_kernel_write((void *)ip, val, size); +} + +static int add_break(unsigned long ip, const char *old) +{ + unsigned char replaced[MCOUNT_INSN_SIZE]; + unsigned char brk = BREAKPOINT_INSTRUCTION; + + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure it is what we expect it to be */ + if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + + if (ftrace_write(ip, &brk, 1)) + return -EPERM; + + return 0; +} + +static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned const char *old; + unsigned long ip = rec->ip; + + old = ftrace_call_replace(ip, addr); + + return add_break(rec->ip, old); +} + + +static int add_brk_on_nop(struct dyn_ftrace *rec) +{ + unsigned const char *old; + + old = ftrace_nop_replace(); + + return add_break(rec->ip, old); +} + +static int add_breakpoints(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ret = ftrace_test_record(rec, enable); + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: + /* converting nop to call */ + return add_brk_on_nop(rec); + + case FTRACE_UPDATE_MAKE_NOP: + /* converting a call to a nop */ + return add_brk_on_call(rec, ftrace_addr); + } + return 0; +} + +/* + * On error, we need to remove breakpoints. This needs to + * be done caefully. If the address does not currently have a + * breakpoint, we know we are done. Otherwise, we look at the + * remaining 4 bytes of the instruction. If it matches a nop + * we replace the breakpoint with the nop. Otherwise we replace + * it with the call instruction. + */ +static int remove_breakpoint(struct dyn_ftrace *rec) +{ + unsigned char ins[MCOUNT_INSN_SIZE]; + unsigned char brk = BREAKPOINT_INSTRUCTION; + const unsigned char *nop; + unsigned long ftrace_addr; + unsigned long ip = rec->ip; + + /* If we fail the read, just give up */ + if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* If this does not have a breakpoint, we are done */ + if (ins[0] != brk) + return -1; + + nop = ftrace_nop_replace(); + + /* + * If the last 4 bytes of the instruction do not match + * a nop, then we assume that this is a call to ftrace_addr. + */ + if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) { + /* + * For extra paranoidism, we check if the breakpoint is on + * a call that would actually jump to the ftrace_addr. + * If not, don't touch the breakpoint, we make just create + * a disaster. + */ + ftrace_addr = (unsigned long)FTRACE_ADDR; + nop = ftrace_call_replace(ip, ftrace_addr); + + if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) + return -EINVAL; + } + + return probe_kernel_write((void *)ip, &nop[0], 1); +} + +static int add_update_code(unsigned long ip, unsigned const char *new) +{ + /* skip breakpoint */ + ip++; + new++; + if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1)) + return -EPERM; + return 0; +} + +static int add_update_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_call_replace(ip, addr); + return add_update_code(ip, new); +} + +static int add_update_nop(struct dyn_ftrace *rec) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_nop_replace(); + return add_update_code(ip, new); +} + +static int add_update(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ret = ftrace_test_record(rec, enable); + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: + /* converting nop to call */ + return add_update_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + /* converting a call to a nop */ + return add_update_nop(rec); + } + + return 0; +} + +static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_call_replace(ip, addr); + + if (ftrace_write(ip, new, 1)) + return -EPERM; + + return 0; +} + +static int finish_update_nop(struct dyn_ftrace *rec) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_nop_replace(); + + if (ftrace_write(ip, new, 1)) + return -EPERM; + return 0; +} + +static int finish_update(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ret = ftrace_update_record(rec, enable); + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: + /* converting nop to call */ + return finish_update_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + /* converting a call to a nop */ + return finish_update_nop(rec); + } + + return 0; +} + +static void do_sync_core(void *data) +{ + sync_core(); +} + +static void run_sync(void) +{ + int enable_irqs = irqs_disabled(); + + /* We may be called with interrupts disbled (on bootup). */ + if (enable_irqs) + local_irq_enable(); + on_each_cpu(do_sync_core, NULL, 1); + if (enable_irqs) + local_irq_disable(); +} + +static void ftrace_replace_code(int enable) +{ + struct ftrace_rec_iter *iter; + struct dyn_ftrace *rec; + const char *report = "adding breakpoints"; + int count = 0; + int ret; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + ret = add_breakpoints(rec, enable); + if (ret) + goto remove_breakpoints; + count++; + } + + run_sync(); + + report = "updating code"; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + ret = add_update(rec, enable); + if (ret) + goto remove_breakpoints; + } + + run_sync(); + + report = "removing breakpoints"; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + ret = finish_update(rec, enable); + if (ret) + goto remove_breakpoints; + } + + run_sync(); + + return; + + remove_breakpoints: + ftrace_bug(ret, rec ? rec->ip : 0); + printk(KERN_WARNING "Failed on %s (%d):\n", report, count); + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + remove_breakpoint(rec); + } +} + +void arch_ftrace_update_code(int command) +{ + modifying_ftrace_code++; + + if (command & FTRACE_UPDATE_CALLS) + ftrace_replace_code(1); + else if (command & FTRACE_DISABLE_CALLS) + ftrace_replace_code(0); + + if (command & FTRACE_UPDATE_TRACE_FUNC) + ftrace_update_ftrace_func(ftrace_trace_function); + + if (command & FTRACE_START_FUNC_RET) + ftrace_enable_ftrace_graph_caller(); + else if (command & FTRACE_STOP_FUNC_RET) + ftrace_disable_ftrace_graph_caller(); + + modifying_ftrace_code--; +} + int __init ftrace_dyn_arch_init(void *data) { /* The return code is retured via data */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ff9281f16029..92d5756d85fc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -303,8 +304,13 @@ gp_in_kernel: } /* May run on IST stack. */ -dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) +dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) { +#ifdef CONFIG_DYNAMIC_FTRACE + /* ftrace must be first, everything else may cause a recursive crash */ + if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs)) + return; +#endif #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 72a6cabb4d5b..0b5590330bca 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -286,6 +286,12 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void); struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter); struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter); +#define for_ftrace_rec_iter(iter) \ + for (iter = ftrace_rec_iter_start(); \ + iter; \ + iter = ftrace_rec_iter_next(iter)) + + int ftrace_update_record(struct dyn_ftrace *rec, int enable); int ftrace_test_record(struct dyn_ftrace *rec, int enable); void ftrace_run_stop_machine(int command); -- cgit v1.2.3 From 4a6d70c9505fef1d8906b1d61db3de5d8ecf9454 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Apr 2012 16:31:07 -0400 Subject: ftrace/x86: Remove the complex ftrace NMI handling code As ftrace function tracing would require modifying code that could be executed in NMI context, which is not stopped with stop_machine(), ftrace had to do a complex algorithm with various stages of setup and memory barriers to make it work. With the new breakpoint method, this is no longer required. The changes to the code can be done without any problem in NMI context, as well as without stop machine altogether. Remove the complex code as it is no longer needed. Also, a lot of the notrace annotations could be removed from the NMI code as it is now safe to trace them. With the exception of do_nmi itself, which does some special work to handle running in the debug stack. The breakpoint method can cause NMIs to double nest the debug stack if it's not setup properly, and that is done in do_nmi(), thus that function must not be traced. (Note the arch sh may want to do the same) Cc: Paul Mundt Cc: H. Peter Anvin Signed-off-by: Steven Rostedt --- arch/x86/Kconfig | 1 - arch/x86/kernel/ftrace.c | 169 +---------------------------------------------- arch/x86/kernel/nmi.c | 10 +-- 3 files changed, 6 insertions(+), 174 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d14cc6b79ad..1324139612e1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -40,7 +40,6 @@ config X86 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_FUNCTION_TRACE_MCOUNT_TEST - select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE select HAVE_SYSCALL_TRACEPOINTS select HAVE_KVM select HAVE_ARCH_KGDB diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 80af34739a9a..cf2d03ec1793 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -27,38 +27,18 @@ #include #include #include -#include - #ifdef CONFIG_DYNAMIC_FTRACE -/* - * modifying_code is set to notify NMIs that they need to use - * memory barriers when entering or exiting. But we don't want - * to burden NMIs with unnecessary memory barriers when code - * modification is not being done (which is most of the time). - * - * A mutex is already held when ftrace_arch_code_modify_prepare - * and post_process are called. No locks need to be taken here. - * - * Stop machine will make sure currently running NMIs are done - * and new NMIs will see the updated variable before we need - * to worry about NMIs doing memory barriers. - */ -static int modifying_code __read_mostly; -static DEFINE_PER_CPU(int, save_modifying_code); - int ftrace_arch_code_modify_prepare(void) { set_kernel_text_rw(); set_all_modules_text_rw(); - modifying_code = 1; return 0; } int ftrace_arch_code_modify_post_process(void) { - modifying_code = 0; set_all_modules_text_ro(); set_kernel_text_ro(); return 0; @@ -91,134 +71,6 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) return calc.code; } -/* - * Modifying code must take extra care. On an SMP machine, if - * the code being modified is also being executed on another CPU - * that CPU will have undefined results and possibly take a GPF. - * We use kstop_machine to stop other CPUS from exectuing code. - * But this does not stop NMIs from happening. We still need - * to protect against that. We separate out the modification of - * the code to take care of this. - * - * Two buffers are added: An IP buffer and a "code" buffer. - * - * 1) Put the instruction pointer into the IP buffer - * and the new code into the "code" buffer. - * 2) Wait for any running NMIs to finish and set a flag that says - * we are modifying code, it is done in an atomic operation. - * 3) Write the code - * 4) clear the flag. - * 5) Wait for any running NMIs to finish. - * - * If an NMI is executed, the first thing it does is to call - * "ftrace_nmi_enter". This will check if the flag is set to write - * and if it is, it will write what is in the IP and "code" buffers. - * - * The trick is, it does not matter if everyone is writing the same - * content to the code location. Also, if a CPU is executing code - * it is OK to write to that code location if the contents being written - * are the same as what exists. - */ - -#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */ -static atomic_t nmi_running = ATOMIC_INIT(0); -static int mod_code_status; /* holds return value of text write */ -static void *mod_code_ip; /* holds the IP to write to */ -static const void *mod_code_newcode; /* holds the text to write to the IP */ - -static unsigned nmi_wait_count; -static atomic_t nmi_update_count = ATOMIC_INIT(0); - -int ftrace_arch_read_dyn_info(char *buf, int size) -{ - int r; - - r = snprintf(buf, size, "%u %u", - nmi_wait_count, - atomic_read(&nmi_update_count)); - return r; -} - -static void clear_mod_flag(void) -{ - int old = atomic_read(&nmi_running); - - for (;;) { - int new = old & ~MOD_CODE_WRITE_FLAG; - - if (old == new) - break; - - old = atomic_cmpxchg(&nmi_running, old, new); - } -} - -static void ftrace_mod_code(void) -{ - /* - * Yes, more than one CPU process can be writing to mod_code_status. - * (and the code itself) - * But if one were to fail, then they all should, and if one were - * to succeed, then they all should. - */ - mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, - MCOUNT_INSN_SIZE); - - /* if we fail, then kill any new writers */ - if (mod_code_status) - clear_mod_flag(); -} - -void ftrace_nmi_enter(void) -{ - __this_cpu_write(save_modifying_code, modifying_code); - - if (!__this_cpu_read(save_modifying_code)) - return; - - if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { - smp_rmb(); - ftrace_mod_code(); - atomic_inc(&nmi_update_count); - } - /* Must have previous changes seen before executions */ - smp_mb(); -} - -void ftrace_nmi_exit(void) -{ - if (!__this_cpu_read(save_modifying_code)) - return; - - /* Finish all executions before clearing nmi_running */ - smp_mb(); - atomic_dec(&nmi_running); -} - -static void wait_for_nmi_and_set_mod_flag(void) -{ - if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)) - return; - - do { - cpu_relax(); - } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)); - - nmi_wait_count++; -} - -static void wait_for_nmi(void) -{ - if (!atomic_read(&nmi_running)) - return; - - do { - cpu_relax(); - } while (atomic_read(&nmi_running)); - - nmi_wait_count++; -} - static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -239,26 +91,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code) if (within(ip, (unsigned long)_text, (unsigned long)_etext)) ip = (unsigned long)__va(__pa(ip)); - mod_code_ip = (void *)ip; - mod_code_newcode = new_code; - - /* The buffers need to be visible before we let NMIs write them */ - smp_mb(); - - wait_for_nmi_and_set_mod_flag(); - - /* Make sure all running NMIs have finished before we write the code */ - smp_mb(); - - ftrace_mod_code(); - - /* Make sure the write happens before clearing the bit */ - smp_mb(); - - clear_mod_flag(); - wait_for_nmi(); - - return mod_code_status; + return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE); } static const unsigned char *ftrace_nop_replace(void) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 47acaf319165..eb1539eac393 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -84,7 +84,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic); #define nmi_to_desc(type) (&nmi_desc[type]) -static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) +static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) { struct nmi_desc *desc = nmi_to_desc(type); struct nmiaction *a; @@ -209,7 +209,7 @@ void unregister_nmi_handler(unsigned int type, const char *name) EXPORT_SYMBOL_GPL(unregister_nmi_handler); -static notrace __kprobes void +static __kprobes void pci_serr_error(unsigned char reason, struct pt_regs *regs) { pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", @@ -236,7 +236,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) outb(reason, NMI_REASON_PORT); } -static notrace __kprobes void +static __kprobes void io_check_error(unsigned char reason, struct pt_regs *regs) { unsigned long i; @@ -263,7 +263,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs) outb(reason, NMI_REASON_PORT); } -static notrace __kprobes void +static __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs *regs) { int handled; @@ -305,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) static DEFINE_PER_CPU(bool, swallow_nmi); static DEFINE_PER_CPU(unsigned long, last_nmi_rip); -static notrace __kprobes void default_do_nmi(struct pt_regs *regs) +static __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int handled; -- cgit v1.2.3 From 59a094c994a138049b41a44bc29cff9407d51c5b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 4 May 2012 09:26:16 -0400 Subject: ftrace/x86: Use asm/kprobes.h instead of linux/kprobes.h If CONFIG_KPROBES is not set, then linux/kprobes.h will not include asm/kprobes.h needed by x86/ftrace.c for the BREAKPOINT macro. The x86/ftrace.c file should just include asm/kprobes.h as it does not need the rest of kprobes. Reported-by: Ingo Molnar Cc: Masami Hiramatsu Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index cf2d03ec1793..4243e8bbdcb1 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -20,11 +20,11 @@ #include #include #include -#include #include #include +#include #include #include -- cgit v1.2.3 From c75841a398d667d9968245b9519d93cedbfb4780 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:07 +0200 Subject: perf/x86-ibs: Fix update of period The last sw period was not correctly updated on overflow and thus led to wrong distribution of events. We always need to properly initialize data.period in struct perf_sample_data. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-2-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 8ff74d439041..c8f69bea6624 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -386,7 +386,21 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) if (!(*buf++ & perf_ibs->valid_mask)) return 0; + /* + * Emulate IbsOpCurCnt in MSRC001_1033 (IbsOpCtl), not + * supported in all cpus. As this triggered an interrupt, we + * set the current count to the max count. + */ + config = ibs_data.regs[0]; + if (perf_ibs == &perf_ibs_op && !(ibs_caps & IBS_CAPS_RDWROPCNT)) { + config &= ~IBS_OP_CUR_CNT; + config |= (config & IBS_OP_MAX_CNT) << 36; + } + + perf_ibs_event_update(perf_ibs, event, config); perf_sample_data_init(&data, 0); + data.period = event->hw.last_period; + if (event->attr.sample_type & PERF_SAMPLE_RAW) { ibs_data.caps = ibs_caps; size = 1; @@ -405,19 +419,6 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) regs = *iregs; /* XXX: update ip from ibs sample */ - /* - * Emulate IbsOpCurCnt in MSRC001_1033 (IbsOpCtl), not - * supported in all cpus. As this triggered an interrupt, we - * set the current count to the max count. - */ - config = ibs_data.regs[0]; - if (perf_ibs == &perf_ibs_op && !(ibs_caps & IBS_CAPS_RDWROPCNT)) { - config &= ~IBS_OP_CUR_CNT; - config |= (config & IBS_OP_MAX_CNT) << 36; - } - - perf_ibs_event_update(perf_ibs, event, config); - overflow = perf_ibs_set_period(perf_ibs, hwc, &config); reenable = !(overflow && perf_event_overflow(event, &data, ®s)); config = (config >> 4) | (reenable ? perf_ibs->enable_mask : 0); -- cgit v1.2.3 From fd0d000b2c34aa43d4e92dcf0dfaeda7e123008a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:08 +0200 Subject: perf: Pass last sampling period to perf_sample_data_init() We always need to pass the last sample period to perf_sample_data_init(), otherwise the event distribution will be wrong. Thus, modifiyng the function interface with the required period as argument. So basically a pattern like this: perf_sample_data_init(&data, ~0ULL); data.period = event->hw.last_period; will now be like that: perf_sample_data_init(&data, ~0ULL, event->hw.last_period); Avoids unininitialized data.period and simplifies code. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-3-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/alpha/kernel/perf_event.c | 3 +-- arch/arm/kernel/perf_event_v6.c | 4 +--- arch/arm/kernel/perf_event_v7.c | 4 +--- arch/arm/kernel/perf_event_xscale.c | 8 ++------ arch/mips/kernel/perf_event_mipsxx.c | 2 +- arch/powerpc/perf/core-book3s.c | 3 +-- arch/powerpc/perf/core-fsl-emb.c | 3 +-- arch/sparc/kernel/perf_event.c | 4 +--- arch/x86/kernel/cpu/perf_event.c | 4 +--- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 3 +-- arch/x86/kernel/cpu/perf_event_intel.c | 4 +--- arch/x86/kernel/cpu/perf_event_intel_ds.c | 6 ++---- arch/x86/kernel/cpu/perf_event_p4.c | 6 +++--- include/linux/perf_event.h | 5 ++++- kernel/events/core.c | 9 ++++----- 15 files changed, 25 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c index 0dae252f7a33..d821b17047e0 100644 --- a/arch/alpha/kernel/perf_event.c +++ b/arch/alpha/kernel/perf_event.c @@ -824,7 +824,6 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr, idx = la_ptr; - perf_sample_data_init(&data, 0); for (j = 0; j < cpuc->n_events; j++) { if (cpuc->current_idx[j] == idx) break; @@ -848,7 +847,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr, hwc = &event->hw; alpha_perf_event_update(event, hwc, idx, alpha_pmu->pmc_max_period[idx]+1); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (alpha_perf_event_set_period(event, hwc, idx)) { if (perf_event_overflow(event, &data, regs)) { diff --git a/arch/arm/kernel/perf_event_v6.c b/arch/arm/kernel/perf_event_v6.c index b78af0cc6ef3..ab627a740fa3 100644 --- a/arch/arm/kernel/perf_event_v6.c +++ b/arch/arm/kernel/perf_event_v6.c @@ -489,8 +489,6 @@ armv6pmu_handle_irq(int irq_num, */ armv6_pmcr_write(pmcr); - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < cpu_pmu->num_events; ++idx) { struct perf_event *event = cpuc->events[idx]; @@ -509,7 +507,7 @@ armv6pmu_handle_irq(int irq_num, hwc = &event->hw; armpmu_event_update(event, hwc, idx); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (!armpmu_event_set_period(event, hwc, idx)) continue; diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c index 00755d82e2f2..d3c536068162 100644 --- a/arch/arm/kernel/perf_event_v7.c +++ b/arch/arm/kernel/perf_event_v7.c @@ -1077,8 +1077,6 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev) */ regs = get_irq_regs(); - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < cpu_pmu->num_events; ++idx) { struct perf_event *event = cpuc->events[idx]; @@ -1097,7 +1095,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev) hwc = &event->hw; armpmu_event_update(event, hwc, idx); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (!armpmu_event_set_period(event, hwc, idx)) continue; diff --git a/arch/arm/kernel/perf_event_xscale.c b/arch/arm/kernel/perf_event_xscale.c index 71a21e6712f5..e34e7254e652 100644 --- a/arch/arm/kernel/perf_event_xscale.c +++ b/arch/arm/kernel/perf_event_xscale.c @@ -248,8 +248,6 @@ xscale1pmu_handle_irq(int irq_num, void *dev) regs = get_irq_regs(); - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < cpu_pmu->num_events; ++idx) { struct perf_event *event = cpuc->events[idx]; @@ -263,7 +261,7 @@ xscale1pmu_handle_irq(int irq_num, void *dev) hwc = &event->hw; armpmu_event_update(event, hwc, idx); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (!armpmu_event_set_period(event, hwc, idx)) continue; @@ -588,8 +586,6 @@ xscale2pmu_handle_irq(int irq_num, void *dev) regs = get_irq_regs(); - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < cpu_pmu->num_events; ++idx) { struct perf_event *event = cpuc->events[idx]; @@ -603,7 +599,7 @@ xscale2pmu_handle_irq(int irq_num, void *dev) hwc = &event->hw; armpmu_event_update(event, hwc, idx); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (!armpmu_event_set_period(event, hwc, idx)) continue; diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index 811084f4e422..ab73fa2fb9b5 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -1325,7 +1325,7 @@ static int mipsxx_pmu_handle_shared_irq(void) regs = get_irq_regs(); - perf_sample_data_init(&data, 0); + perf_sample_data_init(&data, 0, 0); switch (counters) { #define HANDLE_COUNTER(n) \ diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 02aee03e713c..8f84bcba18da 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -1299,8 +1299,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, if (record) { struct perf_sample_data data; - perf_sample_data_init(&data, ~0ULL); - data.period = event->hw.last_period; + perf_sample_data_init(&data, ~0ULL, event->hw.last_period); if (event->attr.sample_type & PERF_SAMPLE_ADDR) perf_get_data_addr(regs, &data.addr); diff --git a/arch/powerpc/perf/core-fsl-emb.c b/arch/powerpc/perf/core-fsl-emb.c index 0a6d2a9d569c..106c53354675 100644 --- a/arch/powerpc/perf/core-fsl-emb.c +++ b/arch/powerpc/perf/core-fsl-emb.c @@ -613,8 +613,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, if (record) { struct perf_sample_data data; - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); if (perf_event_overflow(event, &data, regs)) fsl_emb_pmu_stop(event, 0); diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 28559ce5eeb5..5713957dcb8a 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1296,8 +1296,6 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self, regs = args->regs; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); /* If the PMU has the TOE IRQ enable bits, we need to do a @@ -1321,7 +1319,7 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self, if (val & (1ULL << 31)) continue; - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (!sparc_perf_event_set_period(event, hwc, idx)) continue; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e33e9cf160eb..e049d6da0183 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1183,8 +1183,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); /* @@ -1219,7 +1217,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) * event overflow */ handled++; - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); if (!x86_perf_event_set_period(event)) continue; diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index c8f69bea6624..2317228b5299 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -398,8 +398,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) } perf_ibs_event_update(perf_ibs, event, config); - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (event->attr.sample_type & PERF_SAMPLE_RAW) { ibs_data.caps = ibs_caps; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 26b3e2fef104..166546ec6aef 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1027,8 +1027,6 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) u64 status; int handled; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); /* @@ -1082,7 +1080,7 @@ again: if (!intel_pmu_save_and_restart(event)) continue; - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); if (has_branch_stack(event)) data.br_stack = &cpuc->lbr_stack; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 7f64df19e7dd..5a3edc27f6e5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -316,8 +316,7 @@ int intel_pmu_drain_bts_buffer(void) ds->bts_index = ds->bts_buffer_base; - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); regs.ip = 0; /* @@ -564,8 +563,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, if (!intel_pmu_save_and_restart(event)) return; - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); /* * We use the interrupt regs as a base because the PEBS record diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index a2dfacfd7103..47124a73dd73 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1005,8 +1005,6 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -1034,10 +1032,12 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) handled += overflow; /* event overflow for sure */ - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (!x86_perf_event_set_period(event)) continue; + + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ddbb6a901f65..f32578634d9d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1132,11 +1132,14 @@ struct perf_sample_data { struct perf_branch_stack *br_stack; }; -static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr) +static inline void perf_sample_data_init(struct perf_sample_data *data, + u64 addr, u64 period) { + /* remaining struct members initialized in perf_prepare_sample() */ data->addr = addr; data->raw = NULL; data->br_stack = NULL; + data->period = period; } extern void perf_output_sample(struct perf_output_handle *handle, diff --git a/kernel/events/core.c b/kernel/events/core.c index 9789a56b7d54..00c58df9f4e2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4957,7 +4957,7 @@ void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) if (rctx < 0) return; - perf_sample_data_init(&data, addr); + perf_sample_data_init(&data, addr, 0); do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); @@ -5215,7 +5215,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, .data = record, }; - perf_sample_data_init(&data, addr); + perf_sample_data_init(&data, addr, 0); data.raw = &raw; hlist_for_each_entry_rcu(event, node, head, hlist_entry) { @@ -5318,7 +5318,7 @@ void perf_bp_event(struct perf_event *bp, void *data) struct perf_sample_data sample; struct pt_regs *regs = data; - perf_sample_data_init(&sample, bp->attr.bp_addr); + perf_sample_data_init(&sample, bp->attr.bp_addr, 0); if (!bp->hw.state && !perf_exclude_event(bp, regs)) perf_swevent_event(bp, 1, &sample, regs); @@ -5344,8 +5344,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) event->pmu->read(event); - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); regs = get_irq_regs(); if (regs && !perf_exclude_event(event, regs)) { -- cgit v1.2.3 From 7bf352384fda3f678a283928c6c5b2cd9da877e4 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:09 +0200 Subject: perf/x86-ibs: Enable ibs op micro-ops counting mode Allow enabling ibs op micro-ops counting mode. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-4-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 2317228b5299..ebf169fe40ef 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -468,6 +468,8 @@ static __init int perf_event_ibs_init(void) return -ENODEV; /* ibs not supported by the cpu */ perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + if (ibs_caps & IBS_CAPS_OPCNT) + perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); -- cgit v1.2.3 From 6accb9cf76080422d400a641d9068b6b2a2c216f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:10 +0200 Subject: perf/x86-ibs: Fix frequency profiling Fixing profiling at a fixed frequency, in this case the freq value and sample period was setup incorrectly. Since sampling periods are adjusted we also allow periods that have lower 4 bits set. Another fix is the setup of the hw counter: If we modify hwc->sample_period, we also need to update hwc->last_period and hwc->period_left. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-5-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index ebf169fe40ef..bc401bd9f14a 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -162,9 +162,16 @@ static int perf_ibs_init(struct perf_event *event) if (config & perf_ibs->cnt_mask) /* raw max_cnt may not be set */ return -EINVAL; - if (hwc->sample_period & 0x0f) - /* lower 4 bits can not be set in ibs max cnt */ + if (!event->attr.sample_freq && hwc->sample_period & 0x0f) + /* + * lower 4 bits can not be set in ibs max cnt, + * but allowing it in case we adjust the + * sample period to set a frequency. + */ return -EINVAL; + hwc->sample_period &= ~0x0FULL; + if (!hwc->sample_period) + hwc->sample_period = 0x10; } else { max_cnt = config & perf_ibs->cnt_mask; config &= ~perf_ibs->cnt_mask; @@ -175,6 +182,13 @@ static int perf_ibs_init(struct perf_event *event) if (!hwc->sample_period) return -EINVAL; + /* + * If we modify hwc->sample_period, we also need to update + * hwc->last_period and hwc->period_left. + */ + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + hwc->config_base = perf_ibs->msr; hwc->config = config; -- cgit v1.2.3 From d47e8238cd76f1ffa7c8cd30e08b8e9074fd597e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:11 +0200 Subject: perf/x86-ibs: Take instruction pointer from ibs sample Each IBS sample contains a linear address of the instruction that caused the sample to trigger. This address is more precise than the rip that was taken from the interrupt handler's stack. Update the rip with that address. We use this in the next patch to implement precise-event sampling on AMD systems using IBS. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-6-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 6 ++-- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 48 +++++++++++++++++++++----------- 2 files changed, 35 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8a3c75d824b7..4e40a64315c9 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -158,6 +158,7 @@ struct x86_pmu_capability { #define IBS_CAPS_OPCNT (1U<<4) #define IBS_CAPS_BRNTRGT (1U<<5) #define IBS_CAPS_OPCNTEXT (1U<<6) +#define IBS_CAPS_RIPINVALIDCHK (1U<<7) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ @@ -170,14 +171,14 @@ struct x86_pmu_capability { #define IBSCTL_LVT_OFFSET_VALID (1ULL<<8) #define IBSCTL_LVT_OFFSET_MASK 0x0F -/* IbsFetchCtl bits/masks */ +/* ibs fetch bits/masks */ #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) #define IBS_FETCH_ENABLE (1ULL<<48) #define IBS_FETCH_CNT 0xFFFF0000ULL #define IBS_FETCH_MAX_CNT 0x0000FFFFULL -/* IbsOpCtl bits */ +/* ibs op bits/masks */ /* lower 4 bits of the current count are ignored: */ #define IBS_OP_CUR_CNT (0xFFFF0ULL<<32) #define IBS_OP_CNT_CTL (1ULL<<19) @@ -185,6 +186,7 @@ struct x86_pmu_capability { #define IBS_OP_ENABLE (1ULL<<17) #define IBS_OP_MAX_CNT 0x0000FFFFULL #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ +#define IBS_RIP_INVALID (1ULL<<38) extern u32 get_ibs_caps(void); diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index bc401bd9f14a..cc1f3293d6c2 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -9,6 +9,7 @@ #include #include #include +#include #include @@ -382,7 +383,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) struct perf_raw_record raw; struct pt_regs regs; struct perf_ibs_data ibs_data; - int offset, size, overflow, reenable; + int offset, size, check_rip, offset_max, throttle = 0; unsigned int msr; u64 *buf, config; @@ -413,28 +414,41 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) perf_ibs_event_update(perf_ibs, event, config); perf_sample_data_init(&data, 0, hwc->last_period); + if (!perf_ibs_set_period(perf_ibs, hwc, &config)) + goto out; /* no sw counter overflow */ + + ibs_data.caps = ibs_caps; + size = 1; + offset = 1; + check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); + if (event->attr.sample_type & PERF_SAMPLE_RAW) + offset_max = perf_ibs->offset_max; + else if (check_rip) + offset_max = 2; + else + offset_max = 1; + do { + rdmsrl(msr + offset, *buf++); + size++; + offset = find_next_bit(perf_ibs->offset_mask, + perf_ibs->offset_max, + offset + 1); + } while (offset < offset_max); + ibs_data.size = sizeof(u64) * size; + + regs = *iregs; + if (!check_rip || !(ibs_data.regs[2] & IBS_RIP_INVALID)) + instruction_pointer_set(®s, ibs_data.regs[1]); if (event->attr.sample_type & PERF_SAMPLE_RAW) { - ibs_data.caps = ibs_caps; - size = 1; - offset = 1; - do { - rdmsrl(msr + offset, *buf++); - size++; - offset = find_next_bit(perf_ibs->offset_mask, - perf_ibs->offset_max, - offset + 1); - } while (offset < perf_ibs->offset_max); - raw.size = sizeof(u32) + sizeof(u64) * size; + raw.size = sizeof(u32) + ibs_data.size; raw.data = ibs_data.data; data.raw = &raw; } - regs = *iregs; /* XXX: update ip from ibs sample */ - - overflow = perf_ibs_set_period(perf_ibs, hwc, &config); - reenable = !(overflow && perf_event_overflow(event, &data, ®s)); - config = (config >> 4) | (reenable ? perf_ibs->enable_mask : 0); + throttle = perf_event_overflow(event, &data, ®s); +out: + config = (config >> 4) | (throttle ? 0 : perf_ibs->enable_mask); perf_ibs_enable_event(hwc, config); perf_event_update_userpage(event); -- cgit v1.2.3 From 450bbd493d436f9eadd1b7828158f37559f26674 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 12 Mar 2012 12:54:32 +0100 Subject: perf/x86-ibs: Precise event sampling with IBS for AMD CPUs This patch adds support for precise event sampling with IBS. There are two counting modes to count either cycles or micro-ops. If the corresponding performance counter events (hw events) are setup with the precise flag set, the request is redirected to the ibs pmu: perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count perf record -a -e r076:p ... # same as -e cpu-cycles:p perf record -a -e r0C1:p ... # use ibs op counting micro-ops Each ibs sample contains a linear address that points to the instruction that was causing the sample to trigger. With ibs we have skid 0. Thus, ibs supports precise levels 1 and 2. Samples are marked with the PERF_EFLAGS_EXACT flag set. In rare cases the rip is invalid when IBS was not able to record the rip correctly. Then the PERF_EFLAGS_EXACT flag is cleared and the rip is taken from pt_regs. V2: * don't drop samples in precise level 2 if rip is invalid, instead support the PERF_EFLAGS_EXACT flag Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120502103309.GP18810@erda.amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 7 ++- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 73 ++++++++++++++++++++++++++++++-- 2 files changed, 76 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 589286f28877..65652265fffd 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -134,8 +134,13 @@ static u64 amd_pmu_event_map(int hw_event) static int amd_pmu_hw_config(struct perf_event *event) { - int ret = x86_pmu_hw_config(event); + int ret; + /* pass precise event sampling to ibs: */ + if (event->attr.precise_ip && get_ibs_caps()) + return -ENOENT; + + ret = x86_pmu_hw_config(event); if (ret) return ret; diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index cc1f3293d6c2..34dfa853f6df 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -145,17 +145,80 @@ static struct perf_ibs *get_ibs_pmu(int type) return NULL; } +/* + * Use IBS for precise event sampling: + * + * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count + * perf record -a -e r076:p ... # same as -e cpu-cycles:p + * perf record -a -e r0C1:p ... # use ibs op counting micro-ops + * + * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl, + * MSRC001_1033) is used to select either cycle or micro-ops counting + * mode. + * + * The rip of IBS samples has skid 0. Thus, IBS supports precise + * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the + * rip is invalid when IBS was not able to record the rip correctly. + * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then. + * + */ +static int perf_ibs_precise_event(struct perf_event *event, u64 *config) +{ + switch (event->attr.precise_ip) { + case 0: + return -ENOENT; + case 1: + case 2: + break; + default: + return -EOPNOTSUPP; + } + + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + switch (event->attr.config) { + case PERF_COUNT_HW_CPU_CYCLES: + *config = 0; + return 0; + } + break; + case PERF_TYPE_RAW: + switch (event->attr.config) { + case 0x0076: + *config = 0; + return 0; + case 0x00C1: + *config = IBS_OP_CNT_CTL; + return 0; + } + break; + default: + return -ENOENT; + } + + return -EOPNOTSUPP; +} + static int perf_ibs_init(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs; u64 max_cnt, config; + int ret; perf_ibs = get_ibs_pmu(event->attr.type); - if (!perf_ibs) + if (perf_ibs) { + config = event->attr.config; + } else { + perf_ibs = &perf_ibs_op; + ret = perf_ibs_precise_event(event, &config); + if (ret) + return ret; + } + + if (event->pmu != &perf_ibs->pmu) return -ENOENT; - config = event->attr.config; if (config & ~perf_ibs->config_mask) return -EINVAL; @@ -437,8 +500,12 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) ibs_data.size = sizeof(u64) * size; regs = *iregs; - if (!check_rip || !(ibs_data.regs[2] & IBS_RIP_INVALID)) + if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { + regs.flags &= ~PERF_EFLAGS_EXACT; + } else { instruction_pointer_set(®s, ibs_data.regs[1]); + regs.flags |= PERF_EFLAGS_EXACT; + } if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw.size = sizeof(u32) + ibs_data.size; -- cgit v1.2.3 From 98112d2e957e0d348f06d8a40f2f720204a70b55 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:13 +0200 Subject: perf/x86-ibs: Rename some variables Simple patch that just renames some variables for better understanding. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-8-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 34dfa853f6df..29a1bffe1dfb 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -62,7 +62,7 @@ struct perf_ibs_data { }; static int -perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *count) +perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) { s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; @@ -91,7 +91,7 @@ perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *count) if (left > max) left = max; - *count = (u64)left; + *hw_period = (u64)left; return overflow; } @@ -262,13 +262,13 @@ static int perf_ibs_init(struct perf_event *event) static int perf_ibs_set_period(struct perf_ibs *perf_ibs, struct hw_perf_event *hwc, u64 *period) { - int ret; + int overflow; /* ignore lower 4 bits in min count: */ - ret = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); + overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); local64_set(&hwc->prev_count, 0); - return ret; + return overflow; } static u64 get_ibs_fetch_count(u64 config) -- cgit v1.2.3 From fc006cf7cc7471e1bdf34e40111971e03622af6c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:14 +0200 Subject: perf/x86-ibs: Trigger overflow if remaining period is too small There are cases where the remaining period is smaller than the minimal possible value. In this case the counter is restarted with the minimal period. This is of no use as the interrupt handler will trigger immediately again and most likely hits itself. This biases the results. So, if the remaining period is within the min range, we better do not restart the counter and instead trigger the overflow. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-9-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 29a1bffe1dfb..3e32908292a7 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -78,16 +78,13 @@ perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_perio overflow = 1; } - if (unlikely(left <= 0)) { + if (unlikely(left < (s64)min)) { left += period; local64_set(&hwc->period_left, left); hwc->last_period = period; overflow = 1; } - if (unlikely(left < min)) - left = min; - if (left > max) left = max; -- cgit v1.2.3 From 7caaf4d8241feecafb87919402b0a6dbb1b71d9e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:15 +0200 Subject: perf/x86-ibs: Extend hw period that triggers overflow If the last hw period is too short we might hit the irq handler which biases the results. Thus try to have a max last period that triggers the sw overflow. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-10-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 3e32908292a7..cb51a3e55870 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -85,8 +85,19 @@ perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_perio overflow = 1; } - if (left > max) - left = max; + /* + * If the hw period that triggers the sw overflow is too short + * we might hit the irq handler. This biases the results. + * Thus we shorten the next-to-last period and set the last + * period to the max period. + */ + if (left > max) { + left -= max; + if (left > max) + left = max; + else if (left < min) + left = min; + } *hw_period = (u64)left; -- cgit v1.2.3 From c9574fe0bdb9ac9a2698e02a712088ce8431e9f8 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:16 +0200 Subject: perf/x86-ibs: Implement workaround for IBS erratum #420 When disabling ibs there might be the case where hardware continuously generates interrupts. This is described in erratum #420 (Instruction- Based Sampling Engine May Generate Interrupt that Cannot Be Cleared). To avoid this we must clear the counter mask first and then clear the enable bit. This patch implements this. See Revision Guide for AMD Family 10h Processors, Publication #41322. Note: We now keep track of the last read ibs config value which is then used to disable ibs. To update the config value we pass now a pointer to the functions reading it. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-11-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 62 ++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index cb51a3e55870..b14e71127c82 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -291,20 +291,36 @@ static u64 get_ibs_op_count(u64 config) static void perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, - u64 config) + u64 *config) { - u64 count = perf_ibs->get_count(config); + u64 count = perf_ibs->get_count(*config); while (!perf_event_try_update(event, count, 20)) { - rdmsrl(event->hw.config_base, config); - count = perf_ibs->get_count(config); + rdmsrl(event->hw.config_base, *config); + count = perf_ibs->get_count(*config); } } -/* Note: The enable mask must be encoded in the config argument. */ -static inline void perf_ibs_enable_event(struct hw_perf_event *hwc, u64 config) +static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) { - wrmsrl(hwc->config_base, hwc->config | config); + wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask); +} + +/* + * Erratum #420 Instruction-Based Sampling Engine May Generate + * Interrupt that Cannot Be Cleared: + * + * Must clear counter mask first, then clear the enable bit. See + * Revision Guide for AMD Family 10h Processors, Publication #41322. + */ +static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) +{ + config &= ~perf_ibs->cnt_mask; + wrmsrl(hwc->config_base, config); + config &= ~perf_ibs->enable_mask; + wrmsrl(hwc->config_base, config); } /* @@ -318,7 +334,7 @@ static void perf_ibs_start(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - u64 config; + u64 period; if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) return; @@ -326,10 +342,9 @@ static void perf_ibs_start(struct perf_event *event, int flags) WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); hwc->state = 0; - perf_ibs_set_period(perf_ibs, hwc, &config); - config = (config >> 4) | perf_ibs->enable_mask; + perf_ibs_set_period(perf_ibs, hwc, &period); set_bit(IBS_STARTED, pcpu->state); - perf_ibs_enable_event(hwc, config); + perf_ibs_enable_event(perf_ibs, hwc, period >> 4); perf_event_update_userpage(event); } @@ -339,7 +354,7 @@ static void perf_ibs_stop(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - u64 val; + u64 config; int stopping; stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); @@ -347,12 +362,11 @@ static void perf_ibs_stop(struct perf_event *event, int flags) if (!stopping && (hwc->state & PERF_HES_UPTODATE)) return; - rdmsrl(hwc->config_base, val); + rdmsrl(hwc->config_base, config); if (stopping) { set_bit(IBS_STOPPING, pcpu->state); - val &= ~perf_ibs->enable_mask; - wrmsrl(hwc->config_base, val); + perf_ibs_disable_event(perf_ibs, hwc, config); WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; } @@ -360,7 +374,7 @@ static void perf_ibs_stop(struct perf_event *event, int flags) if (hwc->state & PERF_HES_UPTODATE) return; - perf_ibs_event_update(perf_ibs, event, val); + perf_ibs_event_update(perf_ibs, event, &config); hwc->state |= PERF_HES_UPTODATE; } @@ -456,7 +470,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) struct perf_ibs_data ibs_data; int offset, size, check_rip, offset_max, throttle = 0; unsigned int msr; - u64 *buf, config; + u64 *buf, *config, period; if (!test_bit(IBS_STARTED, pcpu->state)) { /* Catch spurious interrupts after stopping IBS: */ @@ -477,15 +491,15 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) * supported in all cpus. As this triggered an interrupt, we * set the current count to the max count. */ - config = ibs_data.regs[0]; + config = &ibs_data.regs[0]; if (perf_ibs == &perf_ibs_op && !(ibs_caps & IBS_CAPS_RDWROPCNT)) { - config &= ~IBS_OP_CUR_CNT; - config |= (config & IBS_OP_MAX_CNT) << 36; + *config &= ~IBS_OP_CUR_CNT; + *config |= (*config & IBS_OP_MAX_CNT) << 36; } perf_ibs_event_update(perf_ibs, event, config); perf_sample_data_init(&data, 0, hwc->last_period); - if (!perf_ibs_set_period(perf_ibs, hwc, &config)) + if (!perf_ibs_set_period(perf_ibs, hwc, &period)) goto out; /* no sw counter overflow */ ibs_data.caps = ibs_caps; @@ -523,8 +537,10 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) throttle = perf_event_overflow(event, &data, ®s); out: - config = (config >> 4) | (throttle ? 0 : perf_ibs->enable_mask); - perf_ibs_enable_event(hwc, config); + if (throttle) + perf_ibs_disable_event(perf_ibs, hwc, *config); + else + perf_ibs_enable_event(perf_ibs, hwc, period >> 4); perf_event_update_userpage(event); -- cgit v1.2.3 From fc5fb2b5e1874e5894e2ac503bfb744220db89a1 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:17 +0200 Subject: perf/x86-ibs: Catch spurious interrupts after stopping IBS After disabling IBS there could be still incomming NMIs with samples that even have the valid bit cleared. Mark all this NMIs as handled to avoid spurious interrupt messages. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-12-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index b14e71127c82..5a9f95b5cc26 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -473,11 +473,13 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) u64 *buf, *config, period; if (!test_bit(IBS_STARTED, pcpu->state)) { - /* Catch spurious interrupts after stopping IBS: */ - if (!test_and_clear_bit(IBS_STOPPING, pcpu->state)) - return 0; - rdmsrl(perf_ibs->msr, *ibs_data.regs); - return (*ibs_data.regs & perf_ibs->valid_mask) ? 1 : 0; + /* + * Catch spurious interrupts after stopping IBS: After + * disabling IBS there could be still incomming NMIs + * with samples that even have the valid bit cleared. + * Mark all this NMIs as handled. + */ + return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0; } msr = hwc->config_base; -- cgit v1.2.3 From 8b1e13638d465863572c8207a5cfceeef0cf0441 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:18 +0200 Subject: perf/x86-ibs: Fix usage of IBS op current count The value of IbsOpCurCnt rolls over when it reaches IbsOpMaxCnt. Thus, it is reset to zero by hardware. To get the correct count we need to add the max count to it in case we received an ibs sample (valid bit set). Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-13-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 33 ++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 5a9f95b5cc26..da9bcdcd9856 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -286,7 +286,15 @@ static u64 get_ibs_fetch_count(u64 config) static u64 get_ibs_op_count(u64 config) { - return (config & IBS_OP_CUR_CNT) >> 32; + u64 count = 0; + + if (config & IBS_OP_VAL) + count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */ + + if (ibs_caps & IBS_CAPS_RDWROPCNT) + count += (config & IBS_OP_CUR_CNT) >> 32; + + return count; } static void @@ -295,7 +303,12 @@ perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, { u64 count = perf_ibs->get_count(*config); - while (!perf_event_try_update(event, count, 20)) { + /* + * Set width to 64 since we do not overflow on max width but + * instead on max count. In perf_ibs_set_period() we clear + * prev count manually on overflow. + */ + while (!perf_event_try_update(event, count, 64)) { rdmsrl(event->hw.config_base, *config); count = perf_ibs->get_count(*config); } @@ -374,6 +387,12 @@ static void perf_ibs_stop(struct perf_event *event, int flags) if (hwc->state & PERF_HES_UPTODATE) return; + /* + * Clear valid bit to not count rollovers on update, rollovers + * are only updated in the irq handler. + */ + config &= ~perf_ibs->valid_mask; + perf_ibs_event_update(perf_ibs, event, &config); hwc->state |= PERF_HES_UPTODATE; } @@ -488,17 +507,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) if (!(*buf++ & perf_ibs->valid_mask)) return 0; - /* - * Emulate IbsOpCurCnt in MSRC001_1033 (IbsOpCtl), not - * supported in all cpus. As this triggered an interrupt, we - * set the current count to the max count. - */ config = &ibs_data.regs[0]; - if (perf_ibs == &perf_ibs_op && !(ibs_caps & IBS_CAPS_RDWROPCNT)) { - *config &= ~IBS_OP_CUR_CNT; - *config |= (*config & IBS_OP_MAX_CNT) << 36; - } - perf_ibs_event_update(perf_ibs, event, config); perf_sample_data_init(&data, 0, hwc->last_period); if (!perf_ibs_set_period(perf_ibs, hwc, &period)) -- cgit v1.2.3 From 978da300c7a65494692b329a6a4cbf364afc37c5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 11 May 2012 11:44:59 +0200 Subject: perf/x86/ibs: Fix undefined reference to `get_ibs_caps' Fixing i386 allnoconfig built errors: arch/x86/built-in.o: In function `amd_pmu_hw_config': perf_event_amd.c:(.text+0xc3e1): undefined reference to `get_ibs_caps' Reported-by: Andrew Morton Signed-off-by: Robert Richter Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 4e40a64315c9..588f52ea810e 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -188,7 +188,11 @@ struct x86_pmu_capability { #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ #define IBS_RIP_INVALID (1ULL<<38) +#ifdef CONFIG_X86_LOCAL_APIC extern u32 get_ibs_caps(void); +#else +static inline u32 get_ibs_caps(void) { return 0; } +#endif #ifdef CONFIG_PERF_EVENTS extern void perf_events_lapic_init(void); -- cgit v1.2.3 From e4f5d5440bb860a3e8942ca8f7277a7f31798965 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Apr 2012 09:13:18 -0400 Subject: ftrace/x86: Have x86 ftrace use the ftrace_modify_all_code() To remove duplicate code, have the ftrace arch_ftrace_update_code() use the generic ftrace_modify_all_code(). This requires that the default ftrace_replace_code() becomes a weak function so that an arch may override it. Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 15 ++------------- include/linux/ftrace.h | 1 + kernel/trace/ftrace.c | 4 ++-- 3 files changed, 5 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4243e8bbdcb1..32ff36596ab1 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -435,7 +435,7 @@ static void run_sync(void) local_irq_disable(); } -static void ftrace_replace_code(int enable) +void ftrace_replace_code(int enable) { struct ftrace_rec_iter *iter; struct dyn_ftrace *rec; @@ -493,18 +493,7 @@ void arch_ftrace_update_code(int command) { modifying_ftrace_code++; - if (command & FTRACE_UPDATE_CALLS) - ftrace_replace_code(1); - else if (command & FTRACE_DISABLE_CALLS) - ftrace_replace_code(0); - - if (command & FTRACE_UPDATE_TRACE_FUNC) - ftrace_update_ftrace_func(ftrace_trace_function); - - if (command & FTRACE_START_FUNC_RET) - ftrace_enable_ftrace_graph_caller(); - else if (command & FTRACE_STOP_FUNC_RET) - ftrace_disable_ftrace_graph_caller(); + ftrace_modify_all_code(command); modifying_ftrace_code--; } diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index cd72ace7ade3..55e6d63d46d0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -314,6 +314,7 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable); /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern int ftrace_dyn_arch_init(void *data); +extern void ftrace_replace_code(int enable); extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3c345825cc23..a008663d86c8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1683,7 +1683,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return -1; /* unknow ftrace bug */ } -static void ftrace_replace_code(int update) +void __weak ftrace_replace_code(int enable) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -1693,7 +1693,7 @@ static void ftrace_replace_code(int update) return; do_for_each_ftrace_rec(pg, rec) { - failed = __ftrace_replace_code(rec, update); + failed = __ftrace_replace_code(rec, enable); if (failed) { ftrace_bug(failed, rec->ip); /* Stop processing */ -- cgit v1.2.3