summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig17
-rw-r--r--arch/x86/Makefile8
-rw-r--r--arch/x86/include/asm/barrier.h43
-rw-r--r--arch/x86/include/asm/fpu-internal.h13
-rw-r--r--arch/x86/include/asm/hw_irq.h3
-rw-r--r--arch/x86/kernel/apic/io_apic.c20
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c53
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_rapl.c679
-rw-r--r--arch/x86/kernel/entry_32.S4
-rw-r--r--arch/x86/kernel/entry_64.S2
-rw-r--r--arch/x86/kernel/irq.c19
-rw-r--r--arch/x86/kernel/irqinit.c4
-rw-r--r--arch/x86/kvm/lapic.c2
-rw-r--r--arch/x86/mm/fault.c18
-rw-r--r--arch/x86/net/bpf_jit_comp.c14
-rw-r--r--arch/x86/vdso/vclock_gettime.c8
17 files changed, 842 insertions, 67 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0952ecd60eca..838e7c34dd60 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -125,6 +125,7 @@ config X86
select RTC_LIB
select HAVE_DEBUG_STACKOVERFLOW
select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64
+ select HAVE_CC_STACKPROTECTOR
config INSTRUCTION_DECODER
def_bool y
@@ -1617,22 +1618,6 @@ config SECCOMP
If unsure, say Y. Only embedded should say N here.
-config CC_STACKPROTECTOR
- bool "Enable -fstack-protector buffer overflow detection"
- ---help---
- This option turns on the -fstack-protector GCC feature. This
- feature puts, at the beginning of functions, a canary value on
- the stack just before the return address, and validates
- the value just before actually returning. Stack based buffer
- overflows (that need to overwrite this return address) now also
- overwrite the canary, which gets detected and the attack is then
- neutralized via a kernel panic.
-
- This feature requires gcc version 4.2 or above, or a distribution
- gcc with the feature backported. Older versions are automatically
- detected and for those versions, this configuration option is
- ignored. (and a warning is printed during bootup)
-
source kernel/Kconfig.hz
config KEXEC
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 57d021507120..13b22e0f681d 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -89,13 +89,11 @@ else
KBUILD_CFLAGS += -maccumulate-outgoing-args
endif
+# Make sure compiler does not have buggy stack-protector support.
ifdef CONFIG_CC_STACKPROTECTOR
cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh
- ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y)
- stackp-y := -fstack-protector
- KBUILD_CFLAGS += $(stackp-y)
- else
- $(warning stack protector enabled but no compiler support)
+ ifneq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y)
+ $(warning stack-protector enabled but compiler support broken)
endif
endif
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index c6cd358a1eec..04a48903b2eb 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -92,12 +92,53 @@
#endif
#define smp_read_barrier_depends() read_barrier_depends()
#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
-#else
+#else /* !SMP */
#define smp_mb() barrier()
#define smp_rmb() barrier()
#define smp_wmb() barrier()
#define smp_read_barrier_depends() do { } while (0)
#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#endif /* SMP */
+
+#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
+
+/*
+ * For either of these options x86 doesn't have a strong TSO memory
+ * model and we should fall back to full barriers.
+ */
+
+#define smp_store_release(p, v) \
+do { \
+ compiletime_assert_atomic_type(*p); \
+ smp_mb(); \
+ ACCESS_ONCE(*p) = (v); \
+} while (0)
+
+#define smp_load_acquire(p) \
+({ \
+ typeof(*p) ___p1 = ACCESS_ONCE(*p); \
+ compiletime_assert_atomic_type(*p); \
+ smp_mb(); \
+ ___p1; \
+})
+
+#else /* regular x86 TSO memory ordering */
+
+#define smp_store_release(p, v) \
+do { \
+ compiletime_assert_atomic_type(*p); \
+ barrier(); \
+ ACCESS_ONCE(*p) = (v); \
+} while (0)
+
+#define smp_load_acquire(p) \
+({ \
+ typeof(*p) ___p1 = ACCESS_ONCE(*p); \
+ compiletime_assert_atomic_type(*p); \
+ barrier(); \
+ ___p1; \
+})
+
#endif
/*
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index c49a613c6452..cea1c76d49bf 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -293,12 +293,13 @@ static inline int restore_fpu_checking(struct task_struct *tsk)
/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
is pending. Clear the x87 state here by setting it to fixed
values. "m" is a random variable that should be in L1 */
- alternative_input(
- ASM_NOP8 ASM_NOP2,
- "emms\n\t" /* clear stack tags */
- "fildl %P[addr]", /* set F?P to defined value */
- X86_FEATURE_FXSAVE_LEAK,
- [addr] "m" (tsk->thread.fpu.has_fpu));
+ if (unlikely(static_cpu_has(X86_FEATURE_FXSAVE_LEAK))) {
+ asm volatile(
+ "fnclex\n\t"
+ "emms\n\t"
+ "fildl %P[addr]" /* set F?P to defined value */
+ : : [addr] "m" (tsk->thread.fpu.has_fpu));
+ }
return fpu_restore_checking(&tsk->thread.fpu);
}
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index cba45d99ac1a..67d69b8e2d20 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -191,6 +191,9 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
#define trace_interrupt interrupt
#endif
+#define VECTOR_UNDEFINED -1
+#define VECTOR_RETRIGGERED -2
+
typedef int vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);
extern void setup_vector_irq(int cpu);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e63a5bd2a78f..a43f068ebec1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1142,9 +1142,10 @@ next:
if (test_bit(vector, used_vectors))
goto next;
- for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
- if (per_cpu(vector_irq, new_cpu)[vector] != -1)
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
+ if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED)
goto next;
+ }
/* Found one! */
current_vector = vector;
current_offset = offset;
@@ -1183,7 +1184,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
vector = cfg->vector;
for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
- per_cpu(vector_irq, cpu)[vector] = -1;
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
cfg->vector = 0;
cpumask_clear(cfg->domain);
@@ -1191,11 +1192,10 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
if (likely(!cfg->move_in_progress))
return;
for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
- for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
- vector++) {
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
if (per_cpu(vector_irq, cpu)[vector] != irq)
continue;
- per_cpu(vector_irq, cpu)[vector] = -1;
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
break;
}
}
@@ -1228,12 +1228,12 @@ void __setup_vector_irq(int cpu)
/* Mark the free vectors */
for (vector = 0; vector < NR_VECTORS; ++vector) {
irq = per_cpu(vector_irq, cpu)[vector];
- if (irq < 0)
+ if (irq <= VECTOR_UNDEFINED)
continue;
cfg = irq_cfg(irq);
if (!cpumask_test_cpu(cpu, cfg->domain))
- per_cpu(vector_irq, cpu)[vector] = -1;
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
}
raw_spin_unlock(&vector_lock);
}
@@ -2202,13 +2202,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
me = smp_processor_id();
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- unsigned int irq;
+ int irq;
unsigned int irr;
struct irq_desc *desc;
struct irq_cfg *cfg;
irq = __this_cpu_read(vector_irq[vector]);
- if (irq == -1)
+ if (irq <= VECTOR_UNDEFINED)
continue;
desc = irq_to_desc(irq);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 47b56a7e99cb..6359506a19ee 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -36,7 +36,7 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
endif
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_rapl.o
endif
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index e09f0bfb7b8f..4b8e4d3cd6ea 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/ptrace.h>
+#include <linux/syscore_ops.h>
#include <asm/apic.h>
@@ -816,6 +817,18 @@ out:
return ret;
}
+static void ibs_eilvt_setup(void)
+{
+ /*
+ * Force LVT offset assignment for family 10h: The offsets are
+ * not assigned by the BIOS for this family, so the OS is
+ * responsible for doing it. If the OS assignment fails, fall
+ * back to BIOS settings and try to setup this.
+ */
+ if (boot_cpu_data.x86 == 0x10)
+ force_ibs_eilvt_setup();
+}
+
static inline int get_ibs_lvt_offset(void)
{
u64 val;
@@ -851,6 +864,36 @@ static void clear_APIC_ibs(void *dummy)
setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
}
+#ifdef CONFIG_PM
+
+static int perf_ibs_suspend(void)
+{
+ clear_APIC_ibs(NULL);
+ return 0;
+}
+
+static void perf_ibs_resume(void)
+{
+ ibs_eilvt_setup();
+ setup_APIC_ibs(NULL);
+}
+
+static struct syscore_ops perf_ibs_syscore_ops = {
+ .resume = perf_ibs_resume,
+ .suspend = perf_ibs_suspend,
+};
+
+static void perf_ibs_pm_init(void)
+{
+ register_syscore_ops(&perf_ibs_syscore_ops);
+}
+
+#else
+
+static inline void perf_ibs_pm_init(void) { }
+
+#endif
+
static int
perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
@@ -877,18 +920,12 @@ static __init int amd_ibs_init(void)
if (!caps)
return -ENODEV; /* ibs not supported by the cpu */
- /*
- * Force LVT offset assignment for family 10h: The offsets are
- * not assigned by the BIOS for this family, so the OS is
- * responsible for doing it. If the OS assignment fails, fall
- * back to BIOS settings and try to setup this.
- */
- if (boot_cpu_data.x86 == 0x10)
- force_ibs_eilvt_setup();
+ ibs_eilvt_setup();
if (!ibs_eilvt_valid())
goto out;
+ perf_ibs_pm_init();
get_online_cpus();
ibs_caps = caps;
/* make ibs_caps visible to other cpus: */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
new file mode 100644
index 000000000000..5ad35ad94d0f
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -0,0 +1,679 @@
+/*
+ * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
+ * Copyright (C) 2013 Google, Inc., Stephane Eranian
+ *
+ * Intel RAPL interface is specified in the IA-32 Manual Vol3b
+ * section 14.7.1 (September 2013)
+ *
+ * RAPL provides more controls than just reporting energy consumption
+ * however here we only expose the 3 energy consumption free running
+ * counters (pp0, pkg, dram).
+ *
+ * Each of those counters increments in a power unit defined by the
+ * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
+ * but it can vary.
+ *
+ * Counter to rapl events mappings:
+ *
+ * pp0 counter: consumption of all physical cores (power plane 0)
+ * event: rapl_energy_cores
+ * perf code: 0x1
+ *
+ * pkg counter: consumption of the whole processor package
+ * event: rapl_energy_pkg
+ * perf code: 0x2
+ *
+ * dram counter: consumption of the dram domain (servers only)
+ * event: rapl_energy_dram
+ * perf code: 0x3
+ *
+ * dram counter: consumption of the builtin-gpu domain (client only)
+ * event: rapl_energy_gpu
+ * perf code: 0x4
+ *
+ * We manage those counters as free running (read-only). They may be
+ * use simultaneously by other tools, such as turbostat.
+ *
+ * The events only support system-wide mode counting. There is no
+ * sampling support because it does not make sense and is not
+ * supported by the RAPL hardware.
+ *
+ * Because we want to avoid floating-point operations in the kernel,
+ * the events are all reported in fixed point arithmetic (32.32).
+ * Tools must adjust the counts to convert them to Watts using
+ * the duration of the measurement. Tools may use a function such as
+ * ldexp(raw_count, -32);
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "perf_event.h"
+
+/*
+ * RAPL energy status counters
+ */
+#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */
+#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */
+#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */
+#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */
+#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */
+#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */
+#define RAPL_IDX_PP1_NRG_STAT 3 /* DRAM */
+#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
+
+/* Clients have PP0, PKG */
+#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
+ 1<<RAPL_IDX_PKG_NRG_STAT|\
+ 1<<RAPL_IDX_PP1_NRG_STAT)
+
+/* Servers have PP0, PKG, RAM */
+#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\
+ 1<<RAPL_IDX_PKG_NRG_STAT|\
+ 1<<RAPL_IDX_RAM_NRG_STAT)
+
+/*
+ * event code: LSB 8 bits, passed in attr->config
+ * any other bit is reserved
+ */
+#define RAPL_EVENT_MASK 0xFFULL
+
+#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \
+static ssize_t __rapl_##_var##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, \
+ char *page) \
+{ \
+ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
+ return sprintf(page, _format "\n"); \
+} \
+static struct kobj_attribute format_attr_##_var = \
+ __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
+
+#define RAPL_EVENT_DESC(_name, _config) \
+{ \
+ .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \
+ .config = _config, \
+}
+
+#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
+
+struct rapl_pmu {
+ spinlock_t lock;
+ int hw_unit; /* 1/2^hw_unit Joule */
+ int n_active; /* number of active events */
+ struct list_head active_list;
+ struct pmu *pmu; /* pointer to rapl_pmu_class */
+ ktime_t timer_interval; /* in ktime_t unit */
+ struct hrtimer hrtimer;
+};
+
+static struct pmu rapl_pmu_class;
+static cpumask_t rapl_cpu_mask;
+static int rapl_cntr_mask;
+
+static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
+static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
+
+static inline u64 rapl_read_counter(struct perf_event *event)
+{
+ u64 raw;
+ rdmsrl(event->hw.event_base, raw);
+ return raw;
+}
+
+static inline u64 rapl_scale(u64 v)
+{
+ /*
+ * scale delta to smallest unit (1/2^32)
+ * users must then scale back: count * 1/(1e9*2^32) to get Joules
+ * or use ldexp(count, -32).
+ * Watts = Joules/Time delta
+ */
+ return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit);
+}
+
+static u64 rapl_event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev_raw_count, new_raw_count;
+ s64 delta, sdelta;
+ int shift = RAPL_CNTR_WIDTH;
+
+again:
+ prev_raw_count = local64_read(&hwc->prev_count);
+ rdmsrl(event->hw.event_base, new_raw_count);
+
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count) {
+ cpu_relax();
+ goto again;
+ }
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (event-)time and add that to the generic event.
+ *
+ * Careful, not all hw sign-extends above the physical width
+ * of the count.
+ */
+ delta = (new_raw_count << shift) - (prev_raw_count << shift);
+ delta >>= shift;
+
+ sdelta = rapl_scale(delta);
+
+ local64_add(sdelta, &event->count);
+
+ return new_raw_count;
+}
+
+static void rapl_start_hrtimer(struct rapl_pmu *pmu)
+{
+ __hrtimer_start_range_ns(&pmu->hrtimer,
+ pmu->timer_interval, 0,
+ HRTIMER_MODE_REL_PINNED, 0);
+}
+
+static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
+{
+ hrtimer_cancel(&pmu->hrtimer);
+}
+
+static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
+{
+ struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+ struct perf_event *event;
+ unsigned long flags;
+
+ if (!pmu->n_active)
+ return HRTIMER_NORESTART;
+
+ spin_lock_irqsave(&pmu->lock, flags);
+
+ list_for_each_entry(event, &pmu->active_list, active_entry) {
+ rapl_event_update(event);
+ }
+
+ spin_unlock_irqrestore(&pmu->lock, flags);
+
+ hrtimer_forward_now(hrtimer, pmu->timer_interval);
+
+ return HRTIMER_RESTART;
+}
+
+static void rapl_hrtimer_init(struct rapl_pmu *pmu)
+{
+ struct hrtimer *hr = &pmu->hrtimer;
+
+ hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hr->function = rapl_hrtimer_handle;
+}
+
+static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
+ struct perf_event *event)
+{
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ event->hw.state = 0;
+
+ list_add_tail(&event->active_entry, &pmu->active_list);
+
+ local64_set(&event->hw.prev_count, rapl_read_counter(event));
+
+ pmu->n_active++;
+ if (pmu->n_active == 1)
+ rapl_start_hrtimer(pmu);
+}
+
+static void rapl_pmu_event_start(struct perf_event *event, int mode)
+{
+ struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+ unsigned long flags;
+
+ spin_lock_irqsave(&pmu->lock, flags);
+ __rapl_pmu_event_start(pmu, event);
+ spin_unlock_irqrestore(&pmu->lock, flags);
+}
+
+static void rapl_pmu_event_stop(struct perf_event *event, int mode)
+{
+ struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pmu->lock, flags);
+
+ /* mark event as deactivated and stopped */
+ if (!(hwc->state & PERF_HES_STOPPED)) {
+ WARN_ON_ONCE(pmu->n_active <= 0);
+ pmu->n_active--;
+ if (pmu->n_active == 0)
+ rapl_stop_hrtimer(pmu);
+
+ list_del(&event->active_entry);
+
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
+
+ /* check if update of sw counter is necessary */
+ if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of a event
+ * that we are disabling:
+ */
+ rapl_event_update(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+
+ spin_unlock_irqrestore(&pmu->lock, flags);
+}
+
+static int rapl_pmu_event_add(struct perf_event *event, int mode)
+{
+ struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pmu->lock, flags);
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ if (mode & PERF_EF_START)
+ __rapl_pmu_event_start(pmu, event);
+
+ spin_unlock_irqrestore(&pmu->lock, flags);
+
+ return 0;
+}
+
+static void rapl_pmu_event_del(struct perf_event *event, int flags)
+{
+ rapl_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int rapl_pmu_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config & RAPL_EVENT_MASK;
+ int bit, msr, ret = 0;
+
+ /* only look at RAPL events */
+ if (event->attr.type != rapl_pmu_class.type)
+ return -ENOENT;
+
+ /* check only supported bits are set */
+ if (event->attr.config & ~RAPL_EVENT_MASK)
+ return -EINVAL;
+
+ /*
+ * check event is known (determines counter)
+ */
+ switch (cfg) {
+ case INTEL_RAPL_PP0:
+ bit = RAPL_IDX_PP0_NRG_STAT;
+ msr = MSR_PP0_ENERGY_STATUS;
+ break;
+ case INTEL_RAPL_PKG:
+ bit = RAPL_IDX_PKG_NRG_STAT;
+ msr = MSR_PKG_ENERGY_STATUS;
+ break;
+ case INTEL_RAPL_RAM:
+ bit = RAPL_IDX_RAM_NRG_STAT;
+ msr = MSR_DRAM_ENERGY_STATUS;
+ break;
+ case INTEL_RAPL_PP1:
+ bit = RAPL_IDX_PP1_NRG_STAT;
+ msr = MSR_PP1_ENERGY_STATUS;
+ break;
+ default:
+ return -EINVAL;
+ }
+ /* check event supported */
+ if (!(rapl_cntr_mask & (1 << bit)))
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ /* must be done before validate_group */
+ event->hw.event_base = msr;
+ event->hw.config = cfg;
+ event->hw.idx = bit;
+
+ return ret;
+}
+
+static void rapl_pmu_event_read(struct perf_event *event)
+{
+ rapl_event_update(event);
+}
+
+static ssize_t rapl_get_attr_cpumask(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask);
+
+ buf[n++] = '\n';
+ buf[n] = '\0';
+ return n;
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
+
+static struct attribute *rapl_pmu_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group rapl_pmu_attr_group = {
+ .attrs = rapl_pmu_attrs,
+};
+
+EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
+EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
+EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
+EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
+
+EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
+EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
+EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
+EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
+
+/*
+ * we compute in 0.23 nJ increments regardless of MSR
+ */
+EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
+EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
+EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
+EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
+
+static struct attribute *rapl_events_srv_attr[] = {
+ EVENT_PTR(rapl_cores),
+ EVENT_PTR(rapl_pkg),
+ EVENT_PTR(rapl_ram),
+
+ EVENT_PTR(rapl_cores_unit),
+ EVENT_PTR(rapl_pkg_unit),
+ EVENT_PTR(rapl_ram_unit),
+
+ EVENT_PTR(rapl_cores_scale),
+ EVENT_PTR(rapl_pkg_scale),
+ EVENT_PTR(rapl_ram_scale),
+ NULL,
+};
+
+static struct attribute *rapl_events_cln_attr[] = {
+ EVENT_PTR(rapl_cores),
+ EVENT_PTR(rapl_pkg),
+ EVENT_PTR(rapl_gpu),
+
+ EVENT_PTR(rapl_cores_unit),
+ EVENT_PTR(rapl_pkg_unit),
+ EVENT_PTR(rapl_gpu_unit),
+
+ EVENT_PTR(rapl_cores_scale),
+ EVENT_PTR(rapl_pkg_scale),
+ EVENT_PTR(rapl_gpu_scale),
+ NULL,
+};
+
+static struct attribute_group rapl_pmu_events_group = {
+ .name = "events",
+ .attrs = NULL, /* patched at runtime */
+};
+
+DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
+static struct attribute *rapl_formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group rapl_pmu_format_group = {
+ .name = "format",
+ .attrs = rapl_formats_attr,
+};
+
+const struct attribute_group *rapl_attr_groups[] = {
+ &rapl_pmu_attr_group,
+ &rapl_pmu_format_group,
+ &rapl_pmu_events_group,
+ NULL,
+};
+
+static struct pmu rapl_pmu_class = {
+ .attr_groups = rapl_attr_groups,
+ .task_ctx_nr = perf_invalid_context, /* system-wide only */
+ .event_init = rapl_pmu_event_init,
+ .add = rapl_pmu_event_add, /* must have */
+ .del = rapl_pmu_event_del, /* must have */
+ .start = rapl_pmu_event_start,
+ .stop = rapl_pmu_event_stop,
+ .read = rapl_pmu_event_read,
+};
+
+static void rapl_cpu_exit(int cpu)
+{
+ struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
+ int i, phys_id = topology_physical_package_id(cpu);
+ int target = -1;
+
+ /* find a new cpu on same package */
+ for_each_online_cpu(i) {
+ if (i == cpu)
+ continue;
+ if (phys_id == topology_physical_package_id(i)) {
+ target = i;
+ break;
+ }
+ }
+ /*
+ * clear cpu from cpumask
+ * if was set in cpumask and still some cpu on package,
+ * then move to new cpu
+ */
+ if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
+ cpumask_set_cpu(target, &rapl_cpu_mask);
+
+ WARN_ON(cpumask_empty(&rapl_cpu_mask));
+ /*
+ * migrate events and context to new cpu
+ */
+ if (target >= 0)
+ perf_pmu_migrate_context(pmu->pmu, cpu, target);
+
+ /* cancel overflow polling timer for CPU */
+ rapl_stop_hrtimer(pmu);
+}
+
+static void rapl_cpu_init(int cpu)
+{
+ int i, phys_id = topology_physical_package_id(cpu);
+
+ /* check if phys_is is already covered */
+ for_each_cpu(i, &rapl_cpu_mask) {
+ if (phys_id == topology_physical_package_id(i))
+ return;
+ }
+ /* was not found, so add it */
+ cpumask_set_cpu(cpu, &rapl_cpu_mask);
+}
+
+static int rapl_cpu_prepare(int cpu)
+{
+ struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
+ int phys_id = topology_physical_package_id(cpu);
+ u64 ms;
+
+ if (pmu)
+ return 0;
+
+ if (phys_id < 0)
+ return -1;
+
+ pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
+ if (!pmu)
+ return -1;
+
+ spin_lock_init(&pmu->lock);
+
+ INIT_LIST_HEAD(&pmu->active_list);
+
+ /*
+ * grab power unit as: 1/2^unit Joules
+ *
+ * we cache in local PMU instance
+ */
+ rdmsrl(MSR_RAPL_POWER_UNIT, pmu->hw_unit);
+ pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL;
+ pmu->pmu = &rapl_pmu_class;
+
+ /*
+ * use reference of 200W for scaling the timeout
+ * to avoid missing counter overflows.
+ * 200W = 200 Joules/sec
+ * divide interval by 2 to avoid lockstep (2 * 100)
+ * if hw unit is 32, then we use 2 ms 1/200/2
+ */
+ if (pmu->hw_unit < 32)
+ ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1));
+ else
+ ms = 2;
+
+ pmu->timer_interval = ms_to_ktime(ms);
+
+ rapl_hrtimer_init(pmu);
+
+ /* set RAPL pmu for this cpu for now */
+ per_cpu(rapl_pmu, cpu) = pmu;
+ per_cpu(rapl_pmu_to_free, cpu) = NULL;
+
+ return 0;
+}
+
+static void rapl_cpu_kfree(int cpu)
+{
+ struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);
+
+ kfree(pmu);
+
+ per_cpu(rapl_pmu_to_free, cpu) = NULL;
+}
+
+static int rapl_cpu_dying(int cpu)
+{
+ struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
+
+ if (!pmu)
+ return 0;
+
+ per_cpu(rapl_pmu, cpu) = NULL;
+
+ per_cpu(rapl_pmu_to_free, cpu) = pmu;
+
+ return 0;
+}
+
+static int rapl_cpu_notifier(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ rapl_cpu_prepare(cpu);
+ break;
+ case CPU_STARTING:
+ rapl_cpu_init(cpu);
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_DYING:
+ rapl_cpu_dying(cpu);
+ break;
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ rapl_cpu_kfree(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ rapl_cpu_exit(cpu);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static const struct x86_cpu_id rapl_cpu_match[] = {
+ [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
+ [1] = {},
+};
+
+static int __init rapl_pmu_init(void)
+{
+ struct rapl_pmu *pmu;
+ int cpu, ret;
+
+ /*
+ * check for Intel processor family 6
+ */
+ if (!x86_match_cpu(rapl_cpu_match))
+ return 0;
+
+ /* check supported CPU */
+ switch (boot_cpu_data.x86_model) {
+ case 42: /* Sandy Bridge */
+ case 58: /* Ivy Bridge */
+ case 60: /* Haswell */
+ case 69: /* Haswell-Celeron */
+ rapl_cntr_mask = RAPL_IDX_CLN;
+ rapl_pmu_events_group.attrs = rapl_events_cln_attr;
+ break;
+ case 45: /* Sandy Bridge-EP */
+ case 62: /* IvyTown */
+ rapl_cntr_mask = RAPL_IDX_SRV;
+ rapl_pmu_events_group.attrs = rapl_events_srv_attr;
+ break;
+
+ default:
+ /* unsupported */
+ return 0;
+ }
+ get_online_cpus();
+
+ for_each_online_cpu(cpu) {
+ rapl_cpu_prepare(cpu);
+ rapl_cpu_init(cpu);
+ }
+
+ perf_cpu_notifier(rapl_cpu_notifier);
+
+ ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
+ if (WARN_ON(ret)) {
+ pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
+ put_online_cpus();
+ return -1;
+ }
+
+ pmu = __get_cpu_var(rapl_pmu);
+
+ pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
+ " API unit is 2^-32 Joules,"
+ " %d fixed counters"
+ " %llu ms ovfl timer\n",
+ pmu->hw_unit,
+ hweight32(rapl_cntr_mask),
+ ktime_to_ms(pmu->timer_interval));
+
+ put_online_cpus();
+
+ return 0;
+}
+device_initcall(rapl_pmu_init);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 51e2988c5728..a2a4f4697889 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1082,7 +1082,7 @@ ENTRY(ftrace_caller)
pushl $0 /* Pass NULL as regs pointer */
movl 4*4(%esp), %eax
movl 0x4(%ebp), %edx
- leal function_trace_op, %ecx
+ movl function_trace_op, %ecx
subl $MCOUNT_INSN_SIZE, %eax
.globl ftrace_call
@@ -1140,7 +1140,7 @@ ENTRY(ftrace_regs_caller)
movl 12*4(%esp), %eax /* Load ip (1st parameter) */
subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
- leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
+ movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
pushl %esp /* Save pt_regs as 4th parameter */
GLOBAL(ftrace_regs_call)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e21b0785a85b..1e96c3628bf2 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -88,7 +88,7 @@ END(function_hook)
MCOUNT_SAVE_FRAME \skip
/* Load the ftrace_ops into the 3rd parameter */
- leaq function_trace_op, %rdx
+ movq function_trace_op(%rip), %rdx
/* Load ip into the first parameter */
movq RIP(%rsp), %rdi
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 22d0687e7fda..884d875c1434 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -193,9 +193,13 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
if (!handle_irq(irq, regs)) {
ack_APIC_irq();
- if (printk_ratelimit())
- pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
- __func__, smp_processor_id(), vector, irq);
+ if (irq != VECTOR_RETRIGGERED) {
+ pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n",
+ __func__, smp_processor_id(),
+ vector, irq);
+ } else {
+ __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+ }
}
irq_exit();
@@ -344,7 +348,7 @@ void fixup_irqs(void)
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
unsigned int irr;
- if (__this_cpu_read(vector_irq[vector]) < 0)
+ if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED)
continue;
irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -355,11 +359,14 @@ void fixup_irqs(void)
data = irq_desc_get_irq_data(desc);
chip = irq_data_get_irq_chip(data);
raw_spin_lock(&desc->lock);
- if (chip->irq_retrigger)
+ if (chip->irq_retrigger) {
chip->irq_retrigger(data);
+ __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
+ }
raw_spin_unlock(&desc->lock);
}
- __this_cpu_write(vector_irq[vector], -1);
+ if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
+ __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
}
}
#endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a2a1fbc594ff..7f50156542fb 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -52,7 +52,7 @@ static struct irqaction irq2 = {
};
DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
- [0 ... NR_VECTORS - 1] = -1,
+ [0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED,
};
int vector_used_by_percpu_irq(unsigned int vector)
@@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
int cpu;
for_each_online_cpu(cpu) {
- if (per_cpu(vector_irq, cpu)[vector] != -1)
+ if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED)
return 1;
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1673940cf9c3..775702f649ca 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1355,7 +1355,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
vcpu->arch.apic_base = value;
/* update jump label if enable bit changes */
- if ((vcpu->arch.apic_base ^ value) & MSR_IA32_APICBASE_ENABLE) {
+ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
if (value & MSR_IA32_APICBASE_ENABLE)
static_key_slow_dec_deferred(&apic_hw_disabled);
else
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9ff85bb8dd69..9d591c895803 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -641,6 +641,20 @@ no_context(struct pt_regs *regs, unsigned long error_code,
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs)) {
+ /*
+ * Any interrupt that takes a fault gets the fixup. This makes
+ * the below recursive fault logic only apply to a faults from
+ * task context.
+ */
+ if (in_interrupt())
+ return;
+
+ /*
+ * Per the above we're !in_interrupt(), aka. task context.
+ *
+ * In this case we need to make sure we're not recursively
+ * faulting through the emulate_vsyscall() logic.
+ */
if (current_thread_info()->sig_on_uaccess_error && signal) {
tsk->thread.trap_nr = X86_TRAP_PF;
tsk->thread.error_code = error_code | PF_USER;
@@ -649,6 +663,10 @@ no_context(struct pt_regs *regs, unsigned long error_code,
/* XXX: hwpoison faults will set the wrong code. */
force_sig_info_fault(signal, si_code, address, tsk, 0);
}
+
+ /*
+ * Barring that, we can do the fixup and be happy.
+ */
return;
}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 26328e800869..4ed75dd81d05 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -359,15 +359,21 @@ void bpf_jit_compile(struct sk_filter *fp)
EMIT2(0x89, 0xd0); /* mov %edx,%eax */
break;
case BPF_S_ALU_MOD_K: /* A %= K; */
+ if (K == 1) {
+ CLEAR_A();
+ break;
+ }
EMIT2(0x31, 0xd2); /* xor %edx,%edx */
EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */
EMIT2(0xf7, 0xf1); /* div %ecx */
EMIT2(0x89, 0xd0); /* mov %edx,%eax */
break;
- case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K); */
- EMIT3(0x48, 0x69, 0xc0); /* imul imm32,%rax,%rax */
- EMIT(K, 4);
- EMIT4(0x48, 0xc1, 0xe8, 0x20); /* shr $0x20,%rax */
+ case BPF_S_ALU_DIV_K: /* A /= K */
+ if (K == 1)
+ break;
+ EMIT2(0x31, 0xd2); /* xor %edx,%edx */
+ EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */
+ EMIT2(0xf7, 0xf1); /* div %ecx */
break;
case BPF_S_ALU_AND_X:
seen |= SEEN_XREG;
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 2ada505067cc..eb5d7a56f8d4 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -178,7 +178,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
ts->tv_nsec = 0;
do {
- seq = read_seqcount_begin_no_lockdep(&gtod->seq);
+ seq = raw_read_seqcount_begin(&gtod->seq);
mode = gtod->clock.vclock_mode;
ts->tv_sec = gtod->wall_time_sec;
ns = gtod->wall_time_snsec;
@@ -198,7 +198,7 @@ notrace static int do_monotonic(struct timespec *ts)
ts->tv_nsec = 0;
do {
- seq = read_seqcount_begin_no_lockdep(&gtod->seq);
+ seq = raw_read_seqcount_begin(&gtod->seq);
mode = gtod->clock.vclock_mode;
ts->tv_sec = gtod->monotonic_time_sec;
ns = gtod->monotonic_time_snsec;
@@ -214,7 +214,7 @@ notrace static int do_realtime_coarse(struct timespec *ts)
{
unsigned long seq;
do {
- seq = read_seqcount_begin_no_lockdep(&gtod->seq);
+ seq = raw_read_seqcount_begin(&gtod->seq);
ts->tv_sec = gtod->wall_time_coarse.tv_sec;
ts->tv_nsec = gtod->wall_time_coarse.tv_nsec;
} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
@@ -225,7 +225,7 @@ notrace static int do_monotonic_coarse(struct timespec *ts)
{
unsigned long seq;
do {
- seq = read_seqcount_begin_no_lockdep(&gtod->seq);
+ seq = raw_read_seqcount_begin(&gtod->seq);
ts->tv_sec = gtod->monotonic_time_coarse.tv_sec;
ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec;
} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));