summaryrefslogtreecommitdiff
path: root/arch/x86/kernel/cpu
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r--arch/x86/kernel/cpu/Makefile3
-rw-r--r--arch/x86/kernel/cpu/amd.c9
-rw-r--r--arch/x86/kernel/cpu/common.c126
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c715
-rw-r--r--arch/x86/kernel/cpu/intel_pt.h131
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h11
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c66
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c154
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c11
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c63
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c1
-rw-r--r--arch/x86/kernel/cpu/microcode/core_early.c75
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c4
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_early.c345
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_lib.c22
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh2
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c12
-rw-r--r--arch/x86/kernel/cpu/perf_event.c223
-rw-r--r--arch/x86/kernel/cpu/perf_event.h181
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c9
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c12
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c908
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_bts.c525
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_cqm.c1379
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c39
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c321
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_pt.c1100
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_rapl.c94
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c3
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
30 files changed, 5347 insertions, 1198 deletions
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 80091ae54c2b..9bff68798836 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -39,7 +39,8 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
endif
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o
obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
perf_event_intel_uncore_snb.o \
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a220239cea65..fd470ebf924e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
#include <linux/io.h>
#include <linux/sched.h>
+#include <linux/random.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cpu.h>
@@ -488,6 +489,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
va_align.mask = (upperbit - 1) & PAGE_MASK;
va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
+
+ /* A random value per boot for bit slice [12:upper_bit) */
+ va_align.bits = get_random_int() & va_align.mask;
}
}
@@ -711,6 +715,11 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
+ /* 3DNow or LM implies PREFETCHW */
+ if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
+ if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
+ set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2346c95c6ab1..a62cf04dac8a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -646,6 +646,30 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[10] = eax;
}
+ /* Additional Intel-defined flags: level 0x0000000F */
+ if (c->cpuid_level >= 0x0000000F) {
+ u32 eax, ebx, ecx, edx;
+
+ /* QoS sub-leaf, EAX=0Fh, ECX=0 */
+ cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[11] = edx;
+ if (cpu_has(c, X86_FEATURE_CQM_LLC)) {
+ /* will be overridden if occupancy monitoring exists */
+ c->x86_cache_max_rmid = ebx;
+
+ /* QoS sub-leaf, EAX=0Fh, ECX=1 */
+ cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[12] = edx;
+ if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
+ c->x86_cache_max_rmid = ecx;
+ c->x86_cache_occ_scale = ebx;
+ }
+ } else {
+ c->x86_cache_max_rmid = -1;
+ c->x86_cache_occ_scale = -1;
+ }
+ }
+
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
c->extended_cpuid_level = xlvl;
@@ -834,6 +858,20 @@ static void generic_identify(struct cpuinfo_x86 *c)
detect_nopl(c);
}
+static void x86_init_cache_qos(struct cpuinfo_x86 *c)
+{
+ /*
+ * The heavy lifting of max_rmid and cache_occ_scale are handled
+ * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu
+ * in case CQM bits really aren't there in this CPU.
+ */
+ if (c != &boot_cpu_data) {
+ boot_cpu_data.x86_cache_max_rmid =
+ min(boot_cpu_data.x86_cache_max_rmid,
+ c->x86_cache_max_rmid);
+ }
+}
+
/*
* This does the hard work of actually picking apart the CPU stuff...
*/
@@ -923,6 +961,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
init_hypervisor(c);
x86_init_rdrand(c);
+ x86_init_cache_qos(c);
/*
* Clear/Set all flags overriden by options, need do it
@@ -959,38 +998,37 @@ static void identify_cpu(struct cpuinfo_x86 *c)
#endif
}
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_IA32_EMULATION
-/* May not be __init: called during resume */
-static void syscall32_cpu_init(void)
-{
- /* Load these always in case some future AMD CPU supports
- SYSENTER from compat mode too. */
- wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
-
- wrmsrl(MSR_CSTAR, ia32_cstar_target);
-}
-#endif /* CONFIG_IA32_EMULATION */
-#endif /* CONFIG_X86_64 */
-
+/*
+ * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
+ * on 32-bit kernels:
+ */
#ifdef CONFIG_X86_32
void enable_sep_cpu(void)
{
- int cpu = get_cpu();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ struct tss_struct *tss;
+ int cpu;
- if (!boot_cpu_has(X86_FEATURE_SEP)) {
- put_cpu();
- return;
- }
+ cpu = get_cpu();
+ tss = &per_cpu(cpu_tss, cpu);
+
+ if (!boot_cpu_has(X86_FEATURE_SEP))
+ goto out;
+
+ /*
+ * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
+ * see the big comment in struct x86_hw_tss's definition.
+ */
tss->x86_tss.ss1 = __KERNEL_CS;
- tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss;
- wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
- wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
- wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
+ wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
+
+ wrmsr(MSR_IA32_SYSENTER_ESP,
+ (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
+ 0);
+
+ wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0);
+
+out:
put_cpu();
}
#endif
@@ -1118,7 +1156,7 @@ static __init int setup_disablecpuid(char *arg)
__setup("clearcpuid=", setup_disablecpuid);
DEFINE_PER_CPU(unsigned long, kernel_stack) =
- (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+ (unsigned long)&init_thread_union + THREAD_SIZE;
EXPORT_PER_CPU_SYMBOL(kernel_stack);
#ifdef CONFIG_X86_64
@@ -1130,8 +1168,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE) __visible;
/*
- * The following four percpu variables are hot. Align current_task to
- * cacheline size such that all four fall in the same cacheline.
+ * The following percpu variables are hot. Align current_task to
+ * cacheline size such that they fall in the same cacheline.
*/
DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
&init_task;
@@ -1171,10 +1209,23 @@ void syscall_init(void)
*/
wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
wrmsrl(MSR_LSTAR, system_call);
- wrmsrl(MSR_CSTAR, ignore_sysret);
#ifdef CONFIG_IA32_EMULATION
- syscall32_cpu_init();
+ wrmsrl(MSR_CSTAR, ia32_cstar_target);
+ /*
+ * This only works on Intel CPUs.
+ * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
+ * This does not cause SYSENTER to jump to the wrong location, because
+ * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
+ */
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+#else
+ wrmsrl(MSR_CSTAR, ignore_sysret);
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
#endif
/* Flags to clear on syscall */
@@ -1226,6 +1277,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+/*
+ * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
+ * the top of the kernel stack. Use an extra percpu variable to track the
+ * top of the kernel stack directly.
+ */
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
+ (unsigned long)&init_thread_union + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
#ifdef CONFIG_CC_STACKPROTECTOR
DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
#endif
@@ -1307,7 +1367,7 @@ void cpu_init(void)
*/
load_ucode_ap();
- t = &per_cpu(init_tss, cpu);
+ t = &per_cpu(cpu_tss, cpu);
oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
@@ -1391,7 +1451,7 @@ void cpu_init(void)
{
int cpu = smp_processor_id();
struct task_struct *curr = current;
- struct tss_struct *t = &per_cpu(init_tss, cpu);
+ struct tss_struct *t = &per_cpu(cpu_tss, cpu);
struct thread_struct *thread = &curr->thread;
wait_for_master_cpu(cpu);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 659643376dbf..edcb0e28c336 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -7,16 +7,14 @@
* Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
*/
-#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/device.h>
-#include <linux/compiler.h>
+#include <linux/cacheinfo.h>
#include <linux/cpu.h>
#include <linux/sched.h>
+#include <linux/sysfs.h>
#include <linux/pci.h>
#include <asm/processor.h>
-#include <linux/smp.h>
#include <asm/amd_nb.h>
#include <asm/smp.h>
@@ -116,10 +114,10 @@ static const struct _cache_table cache_table[] =
enum _cache_type {
- CACHE_TYPE_NULL = 0,
- CACHE_TYPE_DATA = 1,
- CACHE_TYPE_INST = 2,
- CACHE_TYPE_UNIFIED = 3
+ CTYPE_NULL = 0,
+ CTYPE_DATA = 1,
+ CTYPE_INST = 2,
+ CTYPE_UNIFIED = 3
};
union _cpuid4_leaf_eax {
@@ -159,11 +157,6 @@ struct _cpuid4_info_regs {
struct amd_northbridge *nb;
};
-struct _cpuid4_info {
- struct _cpuid4_info_regs base;
- DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
-};
-
unsigned short num_cache_leaves;
/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -220,6 +213,13 @@ static const unsigned short assocs[] = {
static const unsigned char levels[] = { 1, 1, 2, 3 };
static const unsigned char types[] = { 1, 2, 3, 3 };
+static const enum cache_type cache_type_map[] = {
+ [CTYPE_NULL] = CACHE_TYPE_NOCACHE,
+ [CTYPE_DATA] = CACHE_TYPE_DATA,
+ [CTYPE_INST] = CACHE_TYPE_INST,
+ [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED,
+};
+
static void
amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
union _cpuid4_leaf_ebx *ebx,
@@ -291,14 +291,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
(ebx->split.ways_of_associativity + 1) - 1;
}
-struct _cache_attr {
- struct attribute attr;
- ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int);
- ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count,
- unsigned int);
-};
-
#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
+
/*
* L3 cache descriptors
*/
@@ -325,20 +319,6 @@ static void amd_calc_l3_indices(struct amd_northbridge *nb)
l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
}
-static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
-{
- int node;
-
- /* only for L3, and not in virtualized environments */
- if (index < 3)
- return;
-
- node = amd_get_nb_id(smp_processor_id());
- this_leaf->nb = node_to_amd_nb(node);
- if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
- amd_calc_l3_indices(this_leaf->nb);
-}
-
/*
* check whether a slot used for disabling an L3 index is occupied.
* @l3: L3 cache descriptor
@@ -359,15 +339,13 @@ int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot)
return -1;
}
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
+static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf,
unsigned int slot)
{
int index;
+ struct amd_northbridge *nb = this_leaf->priv;
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- return -EINVAL;
-
- index = amd_get_l3_disable_slot(this_leaf->base.nb, slot);
+ index = amd_get_l3_disable_slot(nb, slot);
if (index >= 0)
return sprintf(buf, "%d\n", index);
@@ -376,9 +354,10 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
#define SHOW_CACHE_DISABLE(slot) \
static ssize_t \
-show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \
- unsigned int cpu) \
+cache_disable_##slot##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
{ \
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
return show_cache_disable(this_leaf, buf, slot); \
}
SHOW_CACHE_DISABLE(0)
@@ -446,25 +425,23 @@ int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot,
return 0;
}
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
- const char *buf, size_t count,
- unsigned int slot)
+static ssize_t store_cache_disable(struct cacheinfo *this_leaf,
+ const char *buf, size_t count,
+ unsigned int slot)
{
unsigned long val = 0;
int cpu, err = 0;
+ struct amd_northbridge *nb = this_leaf->priv;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- return -EINVAL;
-
- cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+ cpu = cpumask_first(&this_leaf->shared_cpu_map);
if (kstrtoul(buf, 10, &val) < 0)
return -EINVAL;
- err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val);
+ err = amd_set_l3_disable_slot(nb, cpu, slot, val);
if (err) {
if (err == -EEXIST)
pr_warning("L3 slot %d in use/index already disabled!\n",
@@ -476,41 +453,36 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
#define STORE_CACHE_DISABLE(slot) \
static ssize_t \
-store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
- const char *buf, size_t count, \
- unsigned int cpu) \
+cache_disable_##slot##_store(struct device *dev, \
+ struct device_attribute *attr, \
+ const char *buf, size_t count) \
{ \
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
return store_cache_disable(this_leaf, buf, count, slot); \
}
STORE_CACHE_DISABLE(0)
STORE_CACHE_DISABLE(1)
-static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
- show_cache_disable_0, store_cache_disable_0);
-static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
- show_cache_disable_1, store_cache_disable_1);
-
-static ssize_t
-show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu)
+static ssize_t subcaches_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- return -EINVAL;
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&this_leaf->shared_cpu_map);
return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
}
-static ssize_t
-store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
- unsigned int cpu)
+static ssize_t subcaches_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&this_leaf->shared_cpu_map);
unsigned long val;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- return -EINVAL;
-
if (kstrtoul(buf, 16, &val) < 0)
return -EINVAL;
@@ -520,9 +492,92 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
return count;
}
-static struct _cache_attr subcaches =
- __ATTR(subcaches, 0644, show_subcaches, store_subcaches);
+static DEVICE_ATTR_RW(cache_disable_0);
+static DEVICE_ATTR_RW(cache_disable_1);
+static DEVICE_ATTR_RW(subcaches);
+
+static umode_t
+cache_private_attrs_is_visible(struct kobject *kobj,
+ struct attribute *attr, int unused)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+ umode_t mode = attr->mode;
+
+ if (!this_leaf->priv)
+ return 0;
+
+ if ((attr == &dev_attr_subcaches.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ return mode;
+
+ if ((attr == &dev_attr_cache_disable_0.attr ||
+ attr == &dev_attr_cache_disable_1.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ return mode;
+
+ return 0;
+}
+
+static struct attribute_group cache_private_group = {
+ .is_visible = cache_private_attrs_is_visible,
+};
+
+static void init_amd_l3_attrs(void)
+{
+ int n = 1;
+ static struct attribute **amd_l3_attrs;
+
+ if (amd_l3_attrs) /* already initialized */
+ return;
+
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ n += 2;
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ n += 1;
+
+ amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL);
+ if (!amd_l3_attrs)
+ return;
+
+ n = 0;
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr;
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr;
+ }
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ amd_l3_attrs[n++] = &dev_attr_subcaches.attr;
+ cache_private_group.attrs = amd_l3_attrs;
+}
+
+const struct attribute_group *
+cache_get_priv_group(struct cacheinfo *this_leaf)
+{
+ struct amd_northbridge *nb = this_leaf->priv;
+
+ if (this_leaf->level < 3 || !nb)
+ return NULL;
+
+ if (nb && nb->l3_cache.indices)
+ init_amd_l3_attrs();
+
+ return &cache_private_group;
+}
+
+static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
+{
+ int node;
+
+ /* only for L3, and not in virtualized environments */
+ if (index < 3)
+ return;
+
+ node = amd_get_nb_id(smp_processor_id());
+ this_leaf->nb = node_to_amd_nb(node);
+ if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
+ amd_calc_l3_indices(this_leaf->nb);
+}
#else
#define amd_init_l3_cache(x, y)
#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */
@@ -546,7 +601,7 @@ cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf)
cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
}
- if (eax.split.type == CACHE_TYPE_NULL)
+ if (eax.split.type == CTYPE_NULL)
return -EIO; /* better error ? */
this_leaf->eax = eax;
@@ -575,7 +630,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
/* Do cpuid(op) loop to find out num_cache_leaves */
cpuid_count(op, i, &eax, &ebx, &ecx, &edx);
cache_eax.full = eax;
- } while (cache_eax.split.type != CACHE_TYPE_NULL);
+ } while (cache_eax.split.type != CTYPE_NULL);
return i;
}
@@ -626,9 +681,9 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
switch (this_leaf.eax.split.level) {
case 1:
- if (this_leaf.eax.split.type == CACHE_TYPE_DATA)
+ if (this_leaf.eax.split.type == CTYPE_DATA)
new_l1d = this_leaf.size/1024;
- else if (this_leaf.eax.split.type == CACHE_TYPE_INST)
+ else if (this_leaf.eax.split.type == CTYPE_INST)
new_l1i = this_leaf.size/1024;
break;
case 2:
@@ -747,55 +802,52 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
return l2;
}
-#ifdef CONFIG_SYSFS
-
-/* pointer to _cpuid4_info array (for each cache leaf) */
-static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
-#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
-
-#ifdef CONFIG_SMP
-
-static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
+static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
+ struct _cpuid4_info_regs *base)
{
- struct _cpuid4_info *this_leaf;
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *this_leaf;
int i, sibling;
if (cpu_has_topoext) {
unsigned int apicid, nshared, first, last;
- if (!per_cpu(ici_cpuid4_info, cpu))
- return 0;
-
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- nshared = this_leaf->base.eax.split.num_threads_sharing + 1;
+ this_leaf = this_cpu_ci->info_list + index;
+ nshared = base->eax.split.num_threads_sharing + 1;
apicid = cpu_data(cpu).apicid;
first = apicid - (apicid % nshared);
last = first + nshared - 1;
for_each_online_cpu(i) {
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
+ continue;
+
apicid = cpu_data(i).apicid;
if ((apicid < first) || (apicid > last))
continue;
- if (!per_cpu(ici_cpuid4_info, i))
- continue;
- this_leaf = CPUID4_INFO_IDX(i, index);
+
+ this_leaf = this_cpu_ci->info_list + index;
for_each_online_cpu(sibling) {
apicid = cpu_data(sibling).apicid;
if ((apicid < first) || (apicid > last))
continue;
- set_bit(sibling, this_leaf->shared_cpu_map);
+ cpumask_set_cpu(sibling,
+ &this_leaf->shared_cpu_map);
}
}
} else if (index == 3) {
for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
- if (!per_cpu(ici_cpuid4_info, i))
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
continue;
- this_leaf = CPUID4_INFO_IDX(i, index);
+ this_leaf = this_cpu_ci->info_list + index;
for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
if (!cpu_online(sibling))
continue;
- set_bit(sibling, this_leaf->shared_cpu_map);
+ cpumask_set_cpu(sibling,
+ &this_leaf->shared_cpu_map);
}
}
} else
@@ -804,457 +856,86 @@ static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
return 1;
}
-static void cache_shared_cpu_map_setup(unsigned int cpu, int index)
+static void __cache_cpumap_setup(unsigned int cpu, int index,
+ struct _cpuid4_info_regs *base)
{
- struct _cpuid4_info *this_leaf, *sibling_leaf;
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *this_leaf, *sibling_leaf;
unsigned long num_threads_sharing;
int index_msb, i;
struct cpuinfo_x86 *c = &cpu_data(cpu);
if (c->x86_vendor == X86_VENDOR_AMD) {
- if (cache_shared_amd_cpu_map_setup(cpu, index))
+ if (__cache_amd_cpumap_setup(cpu, index, base))
return;
}
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing;
+ this_leaf = this_cpu_ci->info_list + index;
+ num_threads_sharing = 1 + base->eax.split.num_threads_sharing;
+ cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map);
if (num_threads_sharing == 1)
- cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
- else {
- index_msb = get_count_order(num_threads_sharing);
-
- for_each_online_cpu(i) {
- if (cpu_data(i).apicid >> index_msb ==
- c->apicid >> index_msb) {
- cpumask_set_cpu(i,
- to_cpumask(this_leaf->shared_cpu_map));
- if (i != cpu && per_cpu(ici_cpuid4_info, i)) {
- sibling_leaf =
- CPUID4_INFO_IDX(i, index);
- cpumask_set_cpu(cpu, to_cpumask(
- sibling_leaf->shared_cpu_map));
- }
- }
- }
- }
-}
-static void cache_remove_shared_cpu_map(unsigned int cpu, int index)
-{
- struct _cpuid4_info *this_leaf, *sibling_leaf;
- int sibling;
-
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
- sibling_leaf = CPUID4_INFO_IDX(sibling, index);
- cpumask_clear_cpu(cpu,
- to_cpumask(sibling_leaf->shared_cpu_map));
- }
-}
-#else
-static void cache_shared_cpu_map_setup(unsigned int cpu, int index)
-{
-}
-
-static void cache_remove_shared_cpu_map(unsigned int cpu, int index)
-{
-}
-#endif
-
-static void free_cache_attributes(unsigned int cpu)
-{
- int i;
-
- for (i = 0; i < num_cache_leaves; i++)
- cache_remove_shared_cpu_map(cpu, i);
-
- kfree(per_cpu(ici_cpuid4_info, cpu));
- per_cpu(ici_cpuid4_info, cpu) = NULL;
-}
-
-static void get_cpu_leaves(void *_retval)
-{
- int j, *retval = _retval, cpu = smp_processor_id();
+ return;
- /* Do cpuid and store the results */
- for (j = 0; j < num_cache_leaves; j++) {
- struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j);
+ index_msb = get_count_order(num_threads_sharing);
- *retval = cpuid4_cache_lookup_regs(j, &this_leaf->base);
- if (unlikely(*retval < 0)) {
- int i;
+ for_each_online_cpu(i)
+ if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) {
+ struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
- for (i = 0; i < j; i++)
- cache_remove_shared_cpu_map(cpu, i);
- break;
+ if (i == cpu || !sib_cpu_ci->info_list)
+ continue;/* skip if itself or no cacheinfo */
+ sibling_leaf = sib_cpu_ci->info_list + index;
+ cpumask_set_cpu(i, &this_leaf->shared_cpu_map);
+ cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map);
}
- cache_shared_cpu_map_setup(cpu, j);
- }
}
-static int detect_cache_attributes(unsigned int cpu)
+static void ci_leaf_init(struct cacheinfo *this_leaf,
+ struct _cpuid4_info_regs *base)
{
- int retval;
-
- if (num_cache_leaves == 0)
- return -ENOENT;
-
- per_cpu(ici_cpuid4_info, cpu) = kzalloc(
- sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
- if (per_cpu(ici_cpuid4_info, cpu) == NULL)
- return -ENOMEM;
-
- smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
- if (retval) {
- kfree(per_cpu(ici_cpuid4_info, cpu));
- per_cpu(ici_cpuid4_info, cpu) = NULL;
- }
-
- return retval;
+ this_leaf->level = base->eax.split.level;
+ this_leaf->type = cache_type_map[base->eax.split.type];
+ this_leaf->coherency_line_size =
+ base->ebx.split.coherency_line_size + 1;
+ this_leaf->ways_of_associativity =
+ base->ebx.split.ways_of_associativity + 1;
+ this_leaf->size = base->size;
+ this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1;
+ this_leaf->physical_line_partition =
+ base->ebx.split.physical_line_partition + 1;
+ this_leaf->priv = base->nb;
}
-#include <linux/kobject.h>
-#include <linux/sysfs.h>
-#include <linux/cpu.h>
-
-/* pointer to kobject for cpuX/cache */
-static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
-
-struct _index_kobject {
- struct kobject kobj;
- unsigned int cpu;
- unsigned short index;
-};
-
-/* pointer to array of kobjects for cpuX/cache/indexY */
-static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
-#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
-
-#define show_one_plus(file_name, object, val) \
-static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \
- unsigned int cpu) \
-{ \
- return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
-}
-
-show_one_plus(level, base.eax.split.level, 0);
-show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1);
-show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1);
-show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1);
-show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1);
-
-static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
- unsigned int cpu)
-{
- return sprintf(buf, "%luK\n", this_leaf->base.size / 1024);
-}
-
-static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
- int type, char *buf)
-{
- const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
- int ret;
-
- if (type)
- ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl",
- cpumask_pr_args(mask));
- else
- ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb",
- cpumask_pr_args(mask));
- buf[ret++] = '\n';
- buf[ret] = '\0';
- return ret;
-}
-
-static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
- unsigned int cpu)
+static int __init_cache_level(unsigned int cpu)
{
- return show_shared_cpu_map_func(leaf, 0, buf);
-}
-
-static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf,
- unsigned int cpu)
-{
- return show_shared_cpu_map_func(leaf, 1, buf);
-}
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
-static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf,
- unsigned int cpu)
-{
- switch (this_leaf->base.eax.split.type) {
- case CACHE_TYPE_DATA:
- return sprintf(buf, "Data\n");
- case CACHE_TYPE_INST:
- return sprintf(buf, "Instruction\n");
- case CACHE_TYPE_UNIFIED:
- return sprintf(buf, "Unified\n");
- default:
- return sprintf(buf, "Unknown\n");
- }
-}
-
-#define to_object(k) container_of(k, struct _index_kobject, kobj)
-#define to_attr(a) container_of(a, struct _cache_attr, attr)
-
-#define define_one_ro(_name) \
-static struct _cache_attr _name = \
- __ATTR(_name, 0444, show_##_name, NULL)
-
-define_one_ro(level);
-define_one_ro(type);
-define_one_ro(coherency_line_size);
-define_one_ro(physical_line_partition);
-define_one_ro(ways_of_associativity);
-define_one_ro(number_of_sets);
-define_one_ro(size);
-define_one_ro(shared_cpu_map);
-define_one_ro(shared_cpu_list);
-
-static struct attribute *default_attrs[] = {
- &type.attr,
- &level.attr,
- &coherency_line_size.attr,
- &physical_line_partition.attr,
- &ways_of_associativity.attr,
- &number_of_sets.attr,
- &size.attr,
- &shared_cpu_map.attr,
- &shared_cpu_list.attr,
- NULL
-};
-
-#ifdef CONFIG_AMD_NB
-static struct attribute **amd_l3_attrs(void)
-{
- static struct attribute **attrs;
- int n;
-
- if (attrs)
- return attrs;
-
- n = ARRAY_SIZE(default_attrs);
-
- if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- n += 2;
-
- if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- n += 1;
-
- attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
- if (attrs == NULL)
- return attrs = default_attrs;
-
- for (n = 0; default_attrs[n]; n++)
- attrs[n] = default_attrs[n];
-
- if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
- attrs[n++] = &cache_disable_0.attr;
- attrs[n++] = &cache_disable_1.attr;
- }
-
- if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- attrs[n++] = &subcaches.attr;
-
- return attrs;
-}
-#endif
-
-static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
- struct _cache_attr *fattr = to_attr(attr);
- struct _index_kobject *this_leaf = to_object(kobj);
- ssize_t ret;
-
- ret = fattr->show ?
- fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
- buf, this_leaf->cpu) :
- 0;
- return ret;
-}
-
-static ssize_t store(struct kobject *kobj, struct attribute *attr,
- const char *buf, size_t count)
-{
- struct _cache_attr *fattr = to_attr(attr);
- struct _index_kobject *this_leaf = to_object(kobj);
- ssize_t ret;
-
- ret = fattr->store ?
- fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
- buf, count, this_leaf->cpu) :
- 0;
- return ret;
-}
-
-static const struct sysfs_ops sysfs_ops = {
- .show = show,
- .store = store,
-};
-
-static struct kobj_type ktype_cache = {
- .sysfs_ops = &sysfs_ops,
- .default_attrs = default_attrs,
-};
-
-static struct kobj_type ktype_percpu_entry = {
- .sysfs_ops = &sysfs_ops,
-};
-
-static void cpuid4_cache_sysfs_exit(unsigned int cpu)
-{
- kfree(per_cpu(ici_cache_kobject, cpu));
- kfree(per_cpu(ici_index_kobject, cpu));
- per_cpu(ici_cache_kobject, cpu) = NULL;
- per_cpu(ici_index_kobject, cpu) = NULL;
- free_cache_attributes(cpu);
-}
-
-static int cpuid4_cache_sysfs_init(unsigned int cpu)
-{
- int err;
-
- if (num_cache_leaves == 0)
+ if (!num_cache_leaves)
return -ENOENT;
-
- err = detect_cache_attributes(cpu);
- if (err)
- return err;
-
- /* Allocate all required memory */
- per_cpu(ici_cache_kobject, cpu) =
- kzalloc(sizeof(struct kobject), GFP_KERNEL);
- if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL))
- goto err_out;
-
- per_cpu(ici_index_kobject, cpu) = kzalloc(
- sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL);
- if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL))
- goto err_out;
-
+ if (!this_cpu_ci)
+ return -EINVAL;
+ this_cpu_ci->num_levels = 3;
+ this_cpu_ci->num_leaves = num_cache_leaves;
return 0;
-
-err_out:
- cpuid4_cache_sysfs_exit(cpu);
- return -ENOMEM;
}
-static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
-
-/* Add/Remove cache interface for CPU device */
-static int cache_add_dev(struct device *dev)
+static int __populate_cache_leaves(unsigned int cpu)
{
- unsigned int cpu = dev->id;
- unsigned long i, j;
- struct _index_kobject *this_object;
- struct _cpuid4_info *this_leaf;
- int retval;
-
- retval = cpuid4_cache_sysfs_init(cpu);
- if (unlikely(retval < 0))
- return retval;
-
- retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
- &ktype_percpu_entry,
- &dev->kobj, "%s", "cache");
- if (retval < 0) {
- cpuid4_cache_sysfs_exit(cpu);
- return retval;
- }
+ unsigned int idx, ret;
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *this_leaf = this_cpu_ci->info_list;
+ struct _cpuid4_info_regs id4_regs = {};
- for (i = 0; i < num_cache_leaves; i++) {
- this_object = INDEX_KOBJECT_PTR(cpu, i);
- this_object->cpu = cpu;
- this_object->index = i;
-
- this_leaf = CPUID4_INFO_IDX(cpu, i);
-
- ktype_cache.default_attrs = default_attrs;
-#ifdef CONFIG_AMD_NB
- if (this_leaf->base.nb)
- ktype_cache.default_attrs = amd_l3_attrs();
-#endif
- retval = kobject_init_and_add(&(this_object->kobj),
- &ktype_cache,
- per_cpu(ici_cache_kobject, cpu),
- "index%1lu", i);
- if (unlikely(retval)) {
- for (j = 0; j < i; j++)
- kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj));
- kobject_put(per_cpu(ici_cache_kobject, cpu));
- cpuid4_cache_sysfs_exit(cpu);
- return retval;
- }
- kobject_uevent(&(this_object->kobj), KOBJ_ADD);
+ for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) {
+ ret = cpuid4_cache_lookup_regs(idx, &id4_regs);
+ if (ret)
+ return ret;
+ ci_leaf_init(this_leaf++, &id4_regs);
+ __cache_cpumap_setup(cpu, idx, &id4_regs);
}
- cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
-
- kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD);
return 0;
}
-static void cache_remove_dev(struct device *dev)
-{
- unsigned int cpu = dev->id;
- unsigned long i;
-
- if (per_cpu(ici_cpuid4_info, cpu) == NULL)
- return;
- if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
- return;
- cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
-
- for (i = 0; i < num_cache_leaves; i++)
- kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj));
- kobject_put(per_cpu(ici_cache_kobject, cpu));
- cpuid4_cache_sysfs_exit(cpu);
-}
-
-static int cacheinfo_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- struct device *dev;
-
- dev = get_cpu_device(cpu);
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- cache_add_dev(dev);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- cache_remove_dev(dev);
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block cacheinfo_cpu_notifier = {
- .notifier_call = cacheinfo_cpu_callback,
-};
-
-static int __init cache_sysfs_init(void)
-{
- int i, err = 0;
-
- if (num_cache_leaves == 0)
- return 0;
-
- cpu_notifier_register_begin();
- for_each_online_cpu(i) {
- struct device *dev = get_cpu_device(i);
-
- err = cache_add_dev(dev);
- if (err)
- goto out;
- }
- __register_hotcpu_notifier(&cacheinfo_cpu_notifier);
-
-out:
- cpu_notifier_register_done();
- return err;
-}
-
-device_initcall(cache_sysfs_init);
-
-#endif
+DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level)
+DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves)
diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h
new file mode 100644
index 000000000000..1c338b0eba05
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_pt.h
@@ -0,0 +1,131 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#ifndef __INTEL_PT_H__
+#define __INTEL_PT_H__
+
+/*
+ * Single-entry ToPA: when this close to region boundary, switch
+ * buffers to avoid losing data.
+ */
+#define TOPA_PMI_MARGIN 512
+
+/*
+ * Table of Physical Addresses bits
+ */
+enum topa_sz {
+ TOPA_4K = 0,
+ TOPA_8K,
+ TOPA_16K,
+ TOPA_32K,
+ TOPA_64K,
+ TOPA_128K,
+ TOPA_256K,
+ TOPA_512K,
+ TOPA_1MB,
+ TOPA_2MB,
+ TOPA_4MB,
+ TOPA_8MB,
+ TOPA_16MB,
+ TOPA_32MB,
+ TOPA_64MB,
+ TOPA_128MB,
+ TOPA_SZ_END,
+};
+
+static inline unsigned int sizes(enum topa_sz tsz)
+{
+ return 1 << (tsz + 12);
+};
+
+struct topa_entry {
+ u64 end : 1;
+ u64 rsvd0 : 1;
+ u64 intr : 1;
+ u64 rsvd1 : 1;
+ u64 stop : 1;
+ u64 rsvd2 : 1;
+ u64 size : 4;
+ u64 rsvd3 : 2;
+ u64 base : 36;
+ u64 rsvd4 : 16;
+};
+
+#define TOPA_SHIFT 12
+#define PT_CPUID_LEAVES 2
+
+enum pt_capabilities {
+ PT_CAP_max_subleaf = 0,
+ PT_CAP_cr3_filtering,
+ PT_CAP_topa_output,
+ PT_CAP_topa_multiple_entries,
+ PT_CAP_payloads_lip,
+};
+
+struct pt_pmu {
+ struct pmu pmu;
+ u32 caps[4 * PT_CPUID_LEAVES];
+};
+
+/**
+ * struct pt_buffer - buffer configuration; one buffer per task_struct or
+ * cpu, depending on perf event configuration
+ * @cpu: cpu for per-cpu allocation
+ * @tables: list of ToPA tables in this buffer
+ * @first: shorthand for first topa table
+ * @last: shorthand for last topa table
+ * @cur: current topa table
+ * @nr_pages: buffer size in pages
+ * @cur_idx: current output region's index within @cur table
+ * @output_off: offset within the current output region
+ * @data_size: running total of the amount of data in this buffer
+ * @lost: if data was lost/truncated
+ * @head: logical write offset inside the buffer
+ * @snapshot: if this is for a snapshot/overwrite counter
+ * @stop_pos: STOP topa entry in the buffer
+ * @intr_pos: INT topa entry in the buffer
+ * @data_pages: array of pages from perf
+ * @topa_index: table of topa entries indexed by page offset
+ */
+struct pt_buffer {
+ int cpu;
+ struct list_head tables;
+ struct topa *first, *last, *cur;
+ unsigned int cur_idx;
+ size_t output_off;
+ unsigned long nr_pages;
+ local_t data_size;
+ local_t lost;
+ local64_t head;
+ bool snapshot;
+ unsigned long stop_pos, intr_pos;
+ void **data_pages;
+ struct topa_entry *topa_index[0];
+};
+
+/**
+ * struct pt - per-cpu pt context
+ * @handle: perf output handle
+ * @handle_nmi: do handle PT PMI on this cpu, there's an active event
+ */
+struct pt {
+ struct perf_output_handle handle;
+ int handle_nmi;
+};
+
+#endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 10b46906767f..fe32074b865b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -14,6 +14,7 @@ enum severity_level {
};
#define ATTR_LEN 16
+#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
/* One object for each MCE bank, shared by all CPUs */
struct mce_bank {
@@ -23,20 +24,20 @@ struct mce_bank {
char attrname[ATTR_LEN]; /* attribute name */
};
-int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
+extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);
extern struct mce_bank *mce_banks;
extern mce_banks_t mce_banks_ce_disabled;
#ifdef CONFIG_X86_MCE_INTEL
-unsigned long mce_intel_adjust_timer(unsigned long interval);
-void mce_intel_cmci_poll(void);
+unsigned long cmci_intel_adjust_timer(unsigned long interval);
+bool mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
void cmci_disable_bank(int bank);
#else
-# define mce_intel_adjust_timer mce_adjust_timer_default
-static inline void mce_intel_cmci_poll(void) { }
+# define cmci_intel_adjust_timer mce_adjust_timer_default
+static inline bool mce_intel_cmci_poll(void) { return false; }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
static inline void cmci_disable_bank(int bank) { }
#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 8bb433043a7f..9c682c222071 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -186,7 +186,61 @@ static int error_context(struct mce *m)
return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
}
-int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
+/*
+ * See AMD Error Scope Hierarchy table in a newer BKDG. For example
+ * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
+ */
+static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
+{
+ enum context ctx = error_context(m);
+
+ /* Processor Context Corrupt, no need to fumble too much, die! */
+ if (m->status & MCI_STATUS_PCC)
+ return MCE_PANIC_SEVERITY;
+
+ if (m->status & MCI_STATUS_UC) {
+
+ /*
+ * On older systems where overflow_recov flag is not present, we
+ * should simply panic if an error overflow occurs. If
+ * overflow_recov flag is present and set, then software can try
+ * to at least kill process to prolong system operation.
+ */
+ if (mce_flags.overflow_recov) {
+ /* software can try to contain */
+ if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL))
+ return MCE_PANIC_SEVERITY;
+
+ /* kill current process */
+ return MCE_AR_SEVERITY;
+ } else {
+ /* at least one error was not logged */
+ if (m->status & MCI_STATUS_OVER)
+ return MCE_PANIC_SEVERITY;
+ }
+
+ /*
+ * For any other case, return MCE_UC_SEVERITY so that we log the
+ * error and exit #MC handler.
+ */
+ return MCE_UC_SEVERITY;
+ }
+
+ /*
+ * deferred error: poll handler catches these and adds to mce_ring so
+ * memory-failure can take recovery actions.
+ */
+ if (m->status & MCI_STATUS_DEFERRED)
+ return MCE_DEFERRED_SEVERITY;
+
+ /*
+ * corrected error: poll handler catches these and passes responsibility
+ * of decoding the error to EDAC
+ */
+ return MCE_KEEP_SEVERITY;
+}
+
+static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
{
enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
enum context ctx = error_context(m);
@@ -216,6 +270,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
}
}
+/* Default to mce_severity_intel */
+int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
+ mce_severity_intel;
+
+void __init mcheck_vendor_init_severity(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ mce_severity = mce_severity_amd;
+}
+
#ifdef CONFIG_DEBUG_FS
static void *s_start(struct seq_file *f, loff_t *pos)
{
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3c036cb4a370..e535533d5ab8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -60,11 +60,12 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
-#define SPINUNIT 100 /* 100ns */
+#define SPINUNIT 100 /* 100ns */
DEFINE_PER_CPU(unsigned, mce_exception_count);
struct mce_bank *mce_banks __read_mostly;
+struct mce_vendor_flags mce_flags __read_mostly;
struct mca_config mca_cfg __read_mostly = {
.bootlog = -1,
@@ -89,9 +90,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
-/* CMCI storm detection filter */
-static DEFINE_PER_CPU(unsigned long, mce_polled_error);
-
/*
* MCA banks polled by the period polling timer for corrected events.
* With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
@@ -622,8 +620,9 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
* is already totally * confused. In this case it's likely it will
* not fully execute the machine check handler either.
*/
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
+ bool error_logged = false;
struct mce m;
int severity;
int i;
@@ -646,7 +645,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (!(m.status & MCI_STATUS_VAL))
continue;
- this_cpu_write(mce_polled_error, 1);
+
/*
* Uncorrected or signalled events are handled by the exception
* handler when it is enabled, so don't process those here.
@@ -679,8 +678,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
- if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
+ if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
+ error_logged = true;
mce_log(&m);
+ }
/*
* Clear state for this bank.
@@ -694,6 +695,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
*/
sync_core();
+
+ return error_logged;
}
EXPORT_SYMBOL_GPL(machine_check_poll);
@@ -813,7 +816,7 @@ static void mce_reign(void)
* other CPUs.
*/
if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
- mce_panic("Fatal Machine check", m, msg);
+ mce_panic("Fatal machine check", m, msg);
/*
* For UC somewhere we let the CPU who detects it handle it.
@@ -826,7 +829,7 @@ static void mce_reign(void)
* source or one CPU is hung. Panic.
*/
if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
- mce_panic("Machine check from unknown source", NULL, NULL);
+ mce_panic("Fatal machine check from unknown source", NULL, NULL);
/*
* Now clear all the mces_seen so that they don't reappear on
@@ -1258,7 +1261,7 @@ void mce_log_therm_throt_event(__u64 status)
* poller finds an MCE, poll 2x faster. When the poller finds no more
* errors, poll 2x slower (up to check_interval seconds).
*/
-static unsigned long check_interval = 5 * 60; /* 5 minutes */
+static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
@@ -1268,49 +1271,57 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
return interval;
}
-static unsigned long (*mce_adjust_timer)(unsigned long interval) =
- mce_adjust_timer_default;
+static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
-static int cmc_error_seen(void)
+static void __restart_timer(struct timer_list *t, unsigned long interval)
{
- unsigned long *v = this_cpu_ptr(&mce_polled_error);
+ unsigned long when = jiffies + interval;
+ unsigned long flags;
+
+ local_irq_save(flags);
- return test_and_clear_bit(0, v);
+ if (timer_pending(t)) {
+ if (time_before(when, t->expires))
+ mod_timer_pinned(t, when);
+ } else {
+ t->expires = round_jiffies(when);
+ add_timer_on(t, smp_processor_id());
+ }
+
+ local_irq_restore(flags);
}
static void mce_timer_fn(unsigned long data)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
+ int cpu = smp_processor_id();
unsigned long iv;
- int notify;
- WARN_ON(smp_processor_id() != data);
+ WARN_ON(cpu != data);
+
+ iv = __this_cpu_read(mce_next_interval);
if (mce_available(this_cpu_ptr(&cpu_info))) {
- machine_check_poll(MCP_TIMESTAMP,
- this_cpu_ptr(&mce_poll_banks));
- mce_intel_cmci_poll();
+ machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
+
+ if (mce_intel_cmci_poll()) {
+ iv = mce_adjust_timer(iv);
+ goto done;
+ }
}
/*
- * Alert userspace if needed. If we logged an MCE, reduce the
- * polling interval, otherwise increase the polling interval.
+ * Alert userspace if needed. If we logged an MCE, reduce the polling
+ * interval, otherwise increase the polling interval.
*/
- iv = __this_cpu_read(mce_next_interval);
- notify = mce_notify_irq();
- notify |= cmc_error_seen();
- if (notify) {
+ if (mce_notify_irq())
iv = max(iv / 2, (unsigned long) HZ/100);
- } else {
+ else
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
- iv = mce_adjust_timer(iv);
- }
+
+done:
__this_cpu_write(mce_next_interval, iv);
- /* Might have become 0 after CMCI storm subsided */
- if (iv) {
- t->expires = jiffies + iv;
- add_timer_on(t, smp_processor_id());
- }
+ __restart_timer(t, iv);
}
/*
@@ -1319,16 +1330,10 @@ static void mce_timer_fn(unsigned long data)
void mce_timer_kick(unsigned long interval)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
- unsigned long when = jiffies + interval;
unsigned long iv = __this_cpu_read(mce_next_interval);
- if (timer_pending(t)) {
- if (time_before(when, t->expires))
- mod_timer_pinned(t, when);
- } else {
- t->expires = round_jiffies(when);
- add_timer_on(t, smp_processor_id());
- }
+ __restart_timer(t, interval);
+
if (interval < iv)
__this_cpu_write(mce_next_interval, interval);
}
@@ -1525,45 +1530,46 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
* Various K7s with broken bank 0 around. Always disable
* by default.
*/
- if (c->x86 == 6 && cfg->banks > 0)
+ if (c->x86 == 6 && cfg->banks > 0)
mce_banks[0].ctl = 0;
- /*
- * Turn off MC4_MISC thresholding banks on those models since
- * they're not supported there.
- */
- if (c->x86 == 0x15 &&
- (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
- int i;
- u64 val, hwcr;
- bool need_toggle;
- u32 msrs[] = {
+ /*
+ * overflow_recov is supported for F15h Models 00h-0fh
+ * even though we don't have a CPUID bit for it.
+ */
+ if (c->x86 == 0x15 && c->x86_model <= 0xf)
+ mce_flags.overflow_recov = 1;
+
+ /*
+ * Turn off MC4_MISC thresholding banks on those models since
+ * they're not supported there.
+ */
+ if (c->x86 == 0x15 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
+ int i;
+ u64 hwcr;
+ bool need_toggle;
+ u32 msrs[] = {
0x00000413, /* MC4_MISC0 */
0xc0000408, /* MC4_MISC1 */
- };
+ };
- rdmsrl(MSR_K7_HWCR, hwcr);
+ rdmsrl(MSR_K7_HWCR, hwcr);
- /* McStatusWrEn has to be set */
- need_toggle = !(hwcr & BIT(18));
+ /* McStatusWrEn has to be set */
+ need_toggle = !(hwcr & BIT(18));
- if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
- for (i = 0; i < ARRAY_SIZE(msrs); i++) {
- rdmsrl(msrs[i], val);
+ /* Clear CntP bit safely */
+ for (i = 0; i < ARRAY_SIZE(msrs); i++)
+ msr_clear_bit(msrs[i], 62);
- /* CntP bit set? */
- if (val & BIT_64(62)) {
- val &= ~BIT_64(62);
- wrmsrl(msrs[i], val);
- }
- }
-
- /* restore old settings */
- if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr);
- }
+ /* restore old settings */
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr);
+ }
}
if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1629,10 +1635,11 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
- mce_adjust_timer = mce_intel_adjust_timer;
+ mce_adjust_timer = cmci_intel_adjust_timer;
break;
case X86_VENDOR_AMD:
mce_amd_feature_init(c);
+ mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
break;
default:
break;
@@ -2017,6 +2024,7 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
mcheck_intel_therm_init();
+ mcheck_vendor_init_severity();
return 0;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f1c3769bbd64..55ad9b37cae8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -79,7 +79,7 @@ static inline bool is_shared_bank(int bank)
return (bank == 4);
}
-static const char * const bank4_names(struct threshold_block *b)
+static const char *bank4_names(const struct threshold_block *b)
{
switch (b->address) {
/* MSR4_MISC0 */
@@ -250,6 +250,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
if (!b.interrupt_capable)
goto init;
+ b.interrupt_enable = 1;
new = (high & MASK_LVTOFF_HI) >> 20;
offset = setup_APIC_mce(offset, new);
@@ -322,6 +323,8 @@ static void amd_threshold_interrupt(void)
log:
mce_setup(&m);
rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status);
+ if (!(m.status & MCI_STATUS_VAL))
+ return;
m.misc = ((u64)high << 32) | low;
m.bank = bank;
mce_log(&m);
@@ -497,10 +500,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
b->interrupt_capable = lvt_interrupt_supported(bank, high);
b->threshold_limit = THRESHOLD_MAX;
- if (b->interrupt_capable)
+ if (b->interrupt_capable) {
threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
- else
+ b->interrupt_enable = 1;
+ } else {
threshold_ktype.default_attrs[2] = NULL;
+ }
INIT_LIST_HEAD(&b->miscj);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index b3c97bafc123..b4a41cf030ed 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -39,6 +39,15 @@
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
/*
+ * CMCI storm detection backoff counter
+ *
+ * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
+ * encountered an error. If not, we decrement it by one. We signal the end of
+ * the CMCI storm when it reaches 0.
+ */
+static DEFINE_PER_CPU(int, cmci_backoff_cnt);
+
+/*
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
@@ -46,7 +55,7 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
#define CMCI_THRESHOLD 1
#define CMCI_POLL_INTERVAL (30 * HZ)
-#define CMCI_STORM_INTERVAL (1 * HZ)
+#define CMCI_STORM_INTERVAL (HZ)
#define CMCI_STORM_THRESHOLD 15
static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
@@ -82,11 +91,21 @@ static int cmci_supported(int *banks)
return !!(cap & MCG_CMCI_P);
}
-void mce_intel_cmci_poll(void)
+bool mce_intel_cmci_poll(void)
{
if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
- return;
- machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+ return false;
+
+ /*
+ * Reset the counter if we've logged an error in the last poll
+ * during the storm.
+ */
+ if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)))
+ this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
+ else
+ this_cpu_dec(cmci_backoff_cnt);
+
+ return true;
}
void mce_intel_hcpu_update(unsigned long cpu)
@@ -97,31 +116,32 @@ void mce_intel_hcpu_update(unsigned long cpu)
per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
}
-unsigned long mce_intel_adjust_timer(unsigned long interval)
+unsigned long cmci_intel_adjust_timer(unsigned long interval)
{
- int r;
-
- if (interval < CMCI_POLL_INTERVAL)
- return interval;
+ if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
+ (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
+ mce_notify_irq();
+ return CMCI_STORM_INTERVAL;
+ }
switch (__this_cpu_read(cmci_storm_state)) {
case CMCI_STORM_ACTIVE:
+
/*
* We switch back to interrupt mode once the poll timer has
- * silenced itself. That means no events recorded and the
- * timer interval is back to our poll interval.
+ * silenced itself. That means no events recorded and the timer
+ * interval is back to our poll interval.
*/
__this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
- r = atomic_sub_return(1, &cmci_storm_on_cpus);
- if (r == 0)
+ if (!atomic_sub_return(1, &cmci_storm_on_cpus))
pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+
/* FALLTHROUGH */
case CMCI_STORM_SUBSIDED:
/*
- * We wait for all cpus to go back to SUBSIDED
- * state. When that happens we switch back to
- * interrupt mode.
+ * We wait for all CPUs to go back to SUBSIDED state. When that
+ * happens we switch back to interrupt mode.
*/
if (!atomic_read(&cmci_storm_on_cpus)) {
__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
@@ -130,10 +150,8 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
}
return CMCI_POLL_INTERVAL;
default:
- /*
- * We have shiny weather. Let the poll do whatever it
- * thinks.
- */
+
+ /* We have shiny weather. Let the poll do whatever it thinks. */
return interval;
}
}
@@ -178,7 +196,8 @@ static bool cmci_storm_detect(void)
cmci_storm_disable_banks();
__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
r = atomic_add_return(1, &cmci_storm_on_cpus);
- mce_timer_kick(CMCI_POLL_INTERVAL);
+ mce_timer_kick(CMCI_STORM_INTERVAL);
+ this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
if (r == 1)
pr_notice("CMCI storm detected: switching to poll mode\n");
@@ -195,6 +214,7 @@ static void intel_threshold_interrupt(void)
{
if (cmci_storm_detect())
return;
+
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
mce_notify_irq();
}
@@ -286,6 +306,7 @@ void cmci_recheck(void)
if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
return;
+
local_irq_save(flags);
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
local_irq_restore(flags);
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index bfbbe6195e2d..12829c3ced3c 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -21,7 +21,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/firmware.h>
-#include <linux/pci_ids.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/kernel.h>
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index d45df4bd16ab..a413a69cbd74 100644
--- a/arch/x86/kernel/cpu/microcode/core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -23,57 +23,6 @@
#include <asm/processor.h>
#include <asm/cmdline.h>
-#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
-#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
-#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
-#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
-#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
-#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
-#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
-
-#define CPUID_IS(a, b, c, ebx, ecx, edx) \
- (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
-
-/*
- * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
- * x86_vendor() gets vendor id for BSP.
- *
- * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
- * coding, we still use x86_vendor() to get vendor id for AP.
- *
- * x86_vendor() gets vendor information directly through cpuid.
- */
-static int x86_vendor(void)
-{
- u32 eax = 0x00000000;
- u32 ebx, ecx = 0, edx;
-
- native_cpuid(&eax, &ebx, &ecx, &edx);
-
- if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
- return X86_VENDOR_INTEL;
-
- if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
- return X86_VENDOR_AMD;
-
- return X86_VENDOR_UNKNOWN;
-}
-
-static int x86_family(void)
-{
- u32 eax = 0x00000001;
- u32 ebx, ecx = 0, edx;
- int x86;
-
- native_cpuid(&eax, &ebx, &ecx, &edx);
-
- x86 = (eax >> 8) & 0xf;
- if (x86 == 15)
- x86 += (eax >> 20) & 0xff;
-
- return x86;
-}
-
static bool __init check_loader_disabled_bsp(void)
{
#ifdef CONFIG_X86_32
@@ -96,7 +45,7 @@ static bool __init check_loader_disabled_bsp(void)
void __init load_ucode_bsp(void)
{
- int vendor, x86;
+ int vendor, family;
if (check_loader_disabled_bsp())
return;
@@ -105,15 +54,15 @@ void __init load_ucode_bsp(void)
return;
vendor = x86_vendor();
- x86 = x86_family();
+ family = x86_family();
switch (vendor) {
case X86_VENDOR_INTEL:
- if (x86 >= 6)
+ if (family >= 6)
load_ucode_intel_bsp();
break;
case X86_VENDOR_AMD:
- if (x86 >= 0x10)
+ if (family >= 0x10)
load_ucode_amd_bsp();
break;
default:
@@ -132,7 +81,7 @@ static bool check_loader_disabled_ap(void)
void load_ucode_ap(void)
{
- int vendor, x86;
+ int vendor, family;
if (check_loader_disabled_ap())
return;
@@ -141,15 +90,15 @@ void load_ucode_ap(void)
return;
vendor = x86_vendor();
- x86 = x86_family();
+ family = x86_family();
switch (vendor) {
case X86_VENDOR_INTEL:
- if (x86 >= 6)
+ if (family >= 6)
load_ucode_intel_ap();
break;
case X86_VENDOR_AMD:
- if (x86 >= 0x10)
+ if (family >= 0x10)
load_ucode_amd_ap();
break;
default:
@@ -179,18 +128,18 @@ int __init save_microcode_in_initrd(void)
void reload_early_microcode(void)
{
- int vendor, x86;
+ int vendor, family;
vendor = x86_vendor();
- x86 = x86_family();
+ family = x86_family();
switch (vendor) {
case X86_VENDOR_INTEL:
- if (x86 >= 6)
+ if (family >= 6)
reload_ucode_intel();
break;
case X86_VENDOR_AMD:
- if (x86 >= 0x10)
+ if (family >= 0x10)
reload_ucode_amd();
break;
default:
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 746e7fd08aad..a41beadb3db9 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -124,7 +124,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
cpf = cpu_sig.pf;
crev = cpu_sig.rev;
- return get_matching_microcode(csig, cpf, mc_intel, crev);
+ return get_matching_microcode(csig, cpf, crev, mc_intel);
}
static int apply_microcode_intel(int cpu)
@@ -226,7 +226,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
csig = uci->cpu_sig.sig;
cpf = uci->cpu_sig.pf;
- if (get_matching_microcode(csig, cpf, mc, new_rev)) {
+ if (get_matching_microcode(csig, cpf, new_rev, mc)) {
vfree(new_mc);
new_rev = mc_header.rev;
new_mc = mc;
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 420eb933189c..2f49ab4ac0ae 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -16,6 +16,14 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
+
+/*
+ * This needs to be before all headers so that pr_debug in printk.h doesn't turn
+ * printk calls into no_printk().
+ *
+ *#define DEBUG
+ */
+
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/slab.h>
@@ -28,6 +36,9 @@
#include <asm/tlbflush.h>
#include <asm/setup.h>
+#undef pr_fmt
+#define pr_fmt(fmt) "microcode: " fmt
+
static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
static struct mc_saved_data {
unsigned int mc_saved_count;
@@ -35,50 +46,45 @@ static struct mc_saved_data {
} mc_saved_data;
static enum ucode_state
-generic_load_microcode_early(struct microcode_intel **mc_saved_p,
- unsigned int mc_saved_count,
- struct ucode_cpu_info *uci)
+load_microcode_early(struct microcode_intel **saved,
+ unsigned int num_saved, struct ucode_cpu_info *uci)
{
struct microcode_intel *ucode_ptr, *new_mc = NULL;
- int new_rev = uci->cpu_sig.rev;
- enum ucode_state state = UCODE_OK;
- unsigned int mc_size;
- struct microcode_header_intel *mc_header;
- unsigned int csig = uci->cpu_sig.sig;
- unsigned int cpf = uci->cpu_sig.pf;
- int i;
+ struct microcode_header_intel *mc_hdr;
+ int new_rev, ret, i;
- for (i = 0; i < mc_saved_count; i++) {
- ucode_ptr = mc_saved_p[i];
+ new_rev = uci->cpu_sig.rev;
- mc_header = (struct microcode_header_intel *)ucode_ptr;
- mc_size = get_totalsize(mc_header);
- if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) {
- new_rev = mc_header->rev;
- new_mc = ucode_ptr;
- }
- }
+ for (i = 0; i < num_saved; i++) {
+ ucode_ptr = saved[i];
+ mc_hdr = (struct microcode_header_intel *)ucode_ptr;
- if (!new_mc) {
- state = UCODE_NFOUND;
- goto out;
+ ret = get_matching_microcode(uci->cpu_sig.sig,
+ uci->cpu_sig.pf,
+ new_rev,
+ ucode_ptr);
+ if (!ret)
+ continue;
+
+ new_rev = mc_hdr->rev;
+ new_mc = ucode_ptr;
}
+ if (!new_mc)
+ return UCODE_NFOUND;
+
uci->mc = (struct microcode_intel *)new_mc;
-out:
- return state;
+ return UCODE_OK;
}
-static void
-microcode_pointer(struct microcode_intel **mc_saved,
- unsigned long *mc_saved_in_initrd,
- unsigned long initrd_start, int mc_saved_count)
+static inline void
+copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd,
+ unsigned long off, int num_saved)
{
int i;
- for (i = 0; i < mc_saved_count; i++)
- mc_saved[i] = (struct microcode_intel *)
- (mc_saved_in_initrd[i] + initrd_start);
+ for (i = 0; i < num_saved; i++)
+ mc_saved[i] = (struct microcode_intel *)(initrd[i] + off);
}
#ifdef CONFIG_X86_32
@@ -102,55 +108,27 @@ microcode_phys(struct microcode_intel **mc_saved_tmp,
#endif
static enum ucode_state
-load_microcode(struct mc_saved_data *mc_saved_data,
- unsigned long *mc_saved_in_initrd,
- unsigned long initrd_start,
- struct ucode_cpu_info *uci)
+load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
+ unsigned long initrd_start, struct ucode_cpu_info *uci)
{
struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
unsigned int count = mc_saved_data->mc_saved_count;
if (!mc_saved_data->mc_saved) {
- microcode_pointer(mc_saved_tmp, mc_saved_in_initrd,
- initrd_start, count);
+ copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count);
- return generic_load_microcode_early(mc_saved_tmp, count, uci);
+ return load_microcode_early(mc_saved_tmp, count, uci);
} else {
#ifdef CONFIG_X86_32
microcode_phys(mc_saved_tmp, mc_saved_data);
- return generic_load_microcode_early(mc_saved_tmp, count, uci);
+ return load_microcode_early(mc_saved_tmp, count, uci);
#else
- return generic_load_microcode_early(mc_saved_data->mc_saved,
+ return load_microcode_early(mc_saved_data->mc_saved,
count, uci);
#endif
}
}
-static u8 get_x86_family(unsigned long sig)
-{
- u8 x86;
-
- x86 = (sig >> 8) & 0xf;
-
- if (x86 == 0xf)
- x86 += (sig >> 20) & 0xff;
-
- return x86;
-}
-
-static u8 get_x86_model(unsigned long sig)
-{
- u8 x86, x86_model;
-
- x86 = get_x86_family(sig);
- x86_model = (sig >> 4) & 0xf;
-
- if (x86 == 0x6 || x86 == 0xf)
- x86_model += ((sig >> 16) & 0xf) << 4;
-
- return x86_model;
-}
-
/*
* Given CPU signature and a microcode patch, this function finds if the
* microcode patch has matching family and model with the CPU.
@@ -159,42 +137,40 @@ static enum ucode_state
matching_model_microcode(struct microcode_header_intel *mc_header,
unsigned long sig)
{
- u8 x86, x86_model;
- u8 x86_ucode, x86_model_ucode;
+ unsigned int fam, model;
+ unsigned int fam_ucode, model_ucode;
struct extended_sigtable *ext_header;
unsigned long total_size = get_totalsize(mc_header);
unsigned long data_size = get_datasize(mc_header);
int ext_sigcount, i;
struct extended_signature *ext_sig;
- x86 = get_x86_family(sig);
- x86_model = get_x86_model(sig);
+ fam = __x86_family(sig);
+ model = x86_model(sig);
- x86_ucode = get_x86_family(mc_header->sig);
- x86_model_ucode = get_x86_model(mc_header->sig);
+ fam_ucode = __x86_family(mc_header->sig);
+ model_ucode = x86_model(mc_header->sig);
- if (x86 == x86_ucode && x86_model == x86_model_ucode)
+ if (fam == fam_ucode && model == model_ucode)
return UCODE_OK;
/* Look for ext. headers: */
if (total_size <= data_size + MC_HEADER_SIZE)
return UCODE_NFOUND;
- ext_header = (struct extended_sigtable *)
- mc_header + data_size + MC_HEADER_SIZE;
+ ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
ext_sigcount = ext_header->count;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
for (i = 0; i < ext_sigcount; i++) {
- x86_ucode = get_x86_family(ext_sig->sig);
- x86_model_ucode = get_x86_model(ext_sig->sig);
+ fam_ucode = __x86_family(ext_sig->sig);
+ model_ucode = x86_model(ext_sig->sig);
- if (x86 == x86_ucode && x86_model == x86_model_ucode)
+ if (fam == fam_ucode && model == model_ucode)
return UCODE_OK;
ext_sig++;
}
-
return UCODE_NFOUND;
}
@@ -204,7 +180,7 @@ save_microcode(struct mc_saved_data *mc_saved_data,
unsigned int mc_saved_count)
{
int i, j;
- struct microcode_intel **mc_saved_p;
+ struct microcode_intel **saved_ptr;
int ret;
if (!mc_saved_count)
@@ -213,39 +189,45 @@ save_microcode(struct mc_saved_data *mc_saved_data,
/*
* Copy new microcode data.
*/
- mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *),
- GFP_KERNEL);
- if (!mc_saved_p)
+ saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL);
+ if (!saved_ptr)
return -ENOMEM;
for (i = 0; i < mc_saved_count; i++) {
- struct microcode_intel *mc = mc_saved_src[i];
- struct microcode_header_intel *mc_header = &mc->hdr;
- unsigned long mc_size = get_totalsize(mc_header);
- mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL);
- if (!mc_saved_p[i]) {
- ret = -ENOMEM;
- goto err;
- }
+ struct microcode_header_intel *mc_hdr;
+ struct microcode_intel *mc;
+ unsigned long size;
+
if (!mc_saved_src[i]) {
ret = -EINVAL;
goto err;
}
- memcpy(mc_saved_p[i], mc, mc_size);
+
+ mc = mc_saved_src[i];
+ mc_hdr = &mc->hdr;
+ size = get_totalsize(mc_hdr);
+
+ saved_ptr[i] = kmalloc(size, GFP_KERNEL);
+ if (!saved_ptr[i]) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ memcpy(saved_ptr[i], mc, size);
}
/*
* Point to newly saved microcode.
*/
- mc_saved_data->mc_saved = mc_saved_p;
+ mc_saved_data->mc_saved = saved_ptr;
mc_saved_data->mc_saved_count = mc_saved_count;
return 0;
err:
for (j = 0; j <= i; j++)
- kfree(mc_saved_p[j]);
- kfree(mc_saved_p);
+ kfree(saved_ptr[j]);
+ kfree(saved_ptr);
return ret;
}
@@ -257,48 +239,45 @@ err:
* - or if it is a newly discovered microcode patch.
*
* The microcode patch should have matching model with CPU.
+ *
+ * Returns: The updated number @num_saved of saved microcode patches.
*/
-static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr,
- unsigned int *mc_saved_count_p)
+static unsigned int _save_mc(struct microcode_intel **mc_saved,
+ u8 *ucode_ptr, unsigned int num_saved)
{
- int i;
- int found = 0;
- unsigned int mc_saved_count = *mc_saved_count_p;
- struct microcode_header_intel *mc_header;
+ struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
+ unsigned int sig, pf, new_rev;
+ int found = 0, i;
+
+ mc_hdr = (struct microcode_header_intel *)ucode_ptr;
+
+ for (i = 0; i < num_saved; i++) {
+ mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
+ sig = mc_saved_hdr->sig;
+ pf = mc_saved_hdr->pf;
+ new_rev = mc_hdr->rev;
+
+ if (!get_matching_sig(sig, pf, new_rev, ucode_ptr))
+ continue;
+
+ found = 1;
+
+ if (!revision_is_newer(mc_hdr, new_rev))
+ continue;
- mc_header = (struct microcode_header_intel *)ucode_ptr;
- for (i = 0; i < mc_saved_count; i++) {
- unsigned int sig, pf;
- unsigned int new_rev;
- struct microcode_header_intel *mc_saved_header =
- (struct microcode_header_intel *)mc_saved[i];
- sig = mc_saved_header->sig;
- pf = mc_saved_header->pf;
- new_rev = mc_header->rev;
-
- if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) {
- found = 1;
- if (update_match_revision(mc_header, new_rev)) {
- /*
- * Found an older ucode saved before.
- * Replace the older one with this newer
- * one.
- */
- mc_saved[i] =
- (struct microcode_intel *)ucode_ptr;
- break;
- }
- }
- }
- if (i >= mc_saved_count && !found)
/*
- * This ucode is first time discovered in ucode file.
- * Save it to memory.
+ * Found an older ucode saved earlier. Replace it with
+ * this newer one.
*/
- mc_saved[mc_saved_count++] =
- (struct microcode_intel *)ucode_ptr;
+ mc_saved[i] = (struct microcode_intel *)ucode_ptr;
+ break;
+ }
+
+ /* Newly detected microcode, save it to memory. */
+ if (i >= num_saved && !found)
+ mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr;
- *mc_saved_count_p = mc_saved_count;
+ return num_saved;
}
/*
@@ -346,7 +325,7 @@ get_matching_model_microcode(int cpu, unsigned long start,
continue;
}
- _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count);
+ mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count);
ucode_ptr += mc_size;
}
@@ -372,7 +351,7 @@ out:
static int collect_cpu_info_early(struct ucode_cpu_info *uci)
{
unsigned int val[2];
- u8 x86, x86_model;
+ unsigned int family, model;
struct cpu_signature csig;
unsigned int eax, ebx, ecx, edx;
@@ -387,10 +366,10 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
native_cpuid(&eax, &ebx, &ecx, &edx);
csig.sig = eax;
- x86 = get_x86_family(csig.sig);
- x86_model = get_x86_model(csig.sig);
+ family = __x86_family(csig.sig);
+ model = x86_model(csig.sig);
- if ((x86_model >= 5) || (x86 > 6)) {
+ if ((model >= 5) || (family > 6)) {
/* get processor flags from MSR 0x17 */
native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
csig.pf = 1 << ((val[1] >> 18) & 7);
@@ -429,8 +408,7 @@ static void __ref show_saved_mc(void)
sig = uci.cpu_sig.sig;
pf = uci.cpu_sig.pf;
rev = uci.cpu_sig.rev;
- pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n",
- smp_processor_id(), sig, pf, rev);
+ pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
struct microcode_header_intel *mc_saved_header;
@@ -457,8 +435,7 @@ static void __ref show_saved_mc(void)
if (total_size <= data_size + MC_HEADER_SIZE)
continue;
- ext_header = (struct extended_sigtable *)
- mc_saved_header + data_size + MC_HEADER_SIZE;
+ ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE;
ext_sigcount = ext_header->count;
ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
@@ -515,8 +492,7 @@ int save_mc_for_early(u8 *mc)
* Save the microcode patch mc in mc_save_tmp structure if it's a newer
* version.
*/
-
- _save_mc(mc_saved_tmp, mc, &mc_saved_count);
+ mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count);
/*
* Save the mc_save_tmp in global mc_saved_data.
@@ -548,12 +524,10 @@ EXPORT_SYMBOL_GPL(save_mc_for_early);
static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
static __init enum ucode_state
-scan_microcode(unsigned long start, unsigned long end,
- struct mc_saved_data *mc_saved_data,
- unsigned long *mc_saved_in_initrd,
- struct ucode_cpu_info *uci)
+scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
+ unsigned long start, unsigned long size,
+ struct ucode_cpu_info *uci)
{
- unsigned int size = end - start + 1;
struct cpio_data cd;
long offset = 0;
#ifdef CONFIG_X86_32
@@ -569,10 +543,8 @@ scan_microcode(unsigned long start, unsigned long end,
if (!cd.data)
return UCODE_ERROR;
-
return get_matching_model_microcode(0, start, cd.data, cd.size,
- mc_saved_data, mc_saved_in_initrd,
- uci);
+ mc_saved_data, initrd, uci);
}
/*
@@ -704,7 +676,7 @@ int __init save_microcode_in_initrd_intel(void)
if (count == 0)
return ret;
- microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count);
+ copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count);
ret = save_microcode(&mc_saved_data, mc_saved, count);
if (ret)
pr_err("Cannot save microcode patches from initrd.\n");
@@ -716,52 +688,44 @@ int __init save_microcode_in_initrd_intel(void)
static void __init
_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
- unsigned long *mc_saved_in_initrd,
- unsigned long initrd_start_early,
- unsigned long initrd_end_early,
- struct ucode_cpu_info *uci)
+ unsigned long *initrd,
+ unsigned long start, unsigned long size)
{
+ struct ucode_cpu_info uci;
enum ucode_state ret;
- collect_cpu_info_early(uci);
- scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data,
- mc_saved_in_initrd, uci);
+ collect_cpu_info_early(&uci);
- ret = load_microcode(mc_saved_data, mc_saved_in_initrd,
- initrd_start_early, uci);
+ ret = scan_microcode(mc_saved_data, initrd, start, size, &uci);
+ if (ret != UCODE_OK)
+ return;
- if (ret == UCODE_OK)
- apply_microcode_early(uci, true);
+ ret = load_microcode(mc_saved_data, initrd, start, &uci);
+ if (ret != UCODE_OK)
+ return;
+
+ apply_microcode_early(&uci, true);
}
-void __init
-load_ucode_intel_bsp(void)
+void __init load_ucode_intel_bsp(void)
{
- u64 ramdisk_image, ramdisk_size;
- unsigned long initrd_start_early, initrd_end_early;
- struct ucode_cpu_info uci;
+ u64 start, size;
#ifdef CONFIG_X86_32
- struct boot_params *boot_params_p;
+ struct boot_params *p;
- boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params);
- ramdisk_image = boot_params_p->hdr.ramdisk_image;
- ramdisk_size = boot_params_p->hdr.ramdisk_size;
- initrd_start_early = ramdisk_image;
- initrd_end_early = initrd_start_early + ramdisk_size;
+ p = (struct boot_params *)__pa_nodebug(&boot_params);
+ start = p->hdr.ramdisk_image;
+ size = p->hdr.ramdisk_size;
_load_ucode_intel_bsp(
- (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
- (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
- initrd_start_early, initrd_end_early, &uci);
+ (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
+ (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
+ start, size);
#else
- ramdisk_image = boot_params.hdr.ramdisk_image;
- ramdisk_size = boot_params.hdr.ramdisk_size;
- initrd_start_early = ramdisk_image + PAGE_OFFSET;
- initrd_end_early = initrd_start_early + ramdisk_size;
-
- _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd,
- initrd_start_early, initrd_end_early,
- &uci);
+ start = boot_params.hdr.ramdisk_image + PAGE_OFFSET;
+ size = boot_params.hdr.ramdisk_size;
+
+ _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size);
#endif
}
@@ -771,6 +735,7 @@ void load_ucode_intel_ap(void)
struct ucode_cpu_info uci;
unsigned long *mc_saved_in_initrd_p;
unsigned long initrd_start_addr;
+ enum ucode_state ret;
#ifdef CONFIG_X86_32
unsigned long *initrd_start_p;
@@ -793,8 +758,12 @@ void load_ucode_intel_ap(void)
return;
collect_cpu_info_early(&uci);
- load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
- initrd_start_addr, &uci);
+ ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
+ initrd_start_addr, &uci);
+
+ if (ret != UCODE_OK)
+ return;
+
apply_microcode_early(&uci, true);
}
@@ -808,8 +777,8 @@ void reload_ucode_intel(void)
collect_cpu_info_early(&uci);
- ret = generic_load_microcode_early(mc_saved_data.mc_saved,
- mc_saved_data.mc_saved_count, &uci);
+ ret = load_microcode_early(mc_saved_data.mc_saved,
+ mc_saved_data.mc_saved_count, &uci);
if (ret != UCODE_OK)
return;
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index ce69320d0179..cd47a510a3f1 100644
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
@@ -38,12 +38,6 @@ update_match_cpu(unsigned int csig, unsigned int cpf,
return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
}
-int
-update_match_revision(struct microcode_header_intel *mc_header, int rev)
-{
- return (mc_header->rev <= rev) ? 0 : 1;
-}
-
int microcode_sanity_check(void *mc, int print_err)
{
unsigned long total_size, data_size, ext_table_size;
@@ -128,10 +122,9 @@ int microcode_sanity_check(void *mc, int print_err)
EXPORT_SYMBOL_GPL(microcode_sanity_check);
/*
- * return 0 - no update found
- * return 1 - found update
+ * Returns 1 if update has been found, 0 otherwise.
*/
-int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
+int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc)
{
struct microcode_header_intel *mc_header = mc;
struct extended_sigtable *ext_header;
@@ -159,16 +152,15 @@ int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
}
/*
- * return 0 - no update found
- * return 1 - found update
+ * Returns 1 if update has been found, 0 otherwise.
*/
-int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev)
+int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc)
{
- struct microcode_header_intel *mc_header = mc;
+ struct microcode_header_intel *mc_hdr = mc;
- if (!update_match_revision(mc_header, rev))
+ if (!revision_is_newer(mc_hdr, rev))
return 0;
- return get_matching_sig(csig, cpf, mc, rev);
+ return get_matching_sig(csig, cpf, rev, mc);
}
EXPORT_SYMBOL_GPL(get_matching_microcode);
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index 36d99a337b49..3f20710a5b23 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -6,7 +6,7 @@
IN=$1
OUT=$2
-function dump_array()
+dump_array()
{
ARRAY=$1
SIZE=$2
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index a041e094b8b9..d76f13d6d8d6 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -404,11 +404,10 @@ static const struct file_operations mtrr_fops = {
static int mtrr_seq_show(struct seq_file *seq, void *offset)
{
char factor;
- int i, max, len;
+ int i, max;
mtrr_type type;
unsigned long base, size;
- len = 0;
max = num_var_ranges;
for (i = 0; i < max; i++) {
mtrr_if->get(i, &base, &size, &type);
@@ -425,11 +424,10 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
size >>= 20 - PAGE_SHIFT;
}
/* Base can be > 32bit */
- len += seq_printf(seq, "reg%02i: base=0x%06lx000 "
- "(%5luMB), size=%5lu%cB, count=%d: %s\n",
- i, base, base >> (20 - PAGE_SHIFT), size,
- factor, mtrr_usage_table[i],
- mtrr_attrib_to_str(type));
+ seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
+ i, base, base >> (20 - PAGE_SHIFT),
+ size, factor,
+ mtrr_usage_table[i], mtrr_attrib_to_str(type));
}
return 0;
}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b71a7f86d68a..87848ebe2bb7 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -263,6 +263,14 @@ static void hw_perf_event_destroy(struct perf_event *event)
}
}
+void hw_perf_lbr_event_destroy(struct perf_event *event)
+{
+ hw_perf_event_destroy(event);
+
+ /* undo the lbr/bts event accounting */
+ x86_del_exclusive(x86_lbr_exclusive_lbr);
+}
+
static inline int x86_pmu_initialized(void)
{
return x86_pmu.handle_irq != NULL;
@@ -302,6 +310,35 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
return x86_pmu_extra_regs(val, event);
}
+/*
+ * Check if we can create event of a certain type (that no conflicting events
+ * are present).
+ */
+int x86_add_exclusive(unsigned int what)
+{
+ int ret = -EBUSY, i;
+
+ if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what]))
+ return 0;
+
+ mutex_lock(&pmc_reserve_mutex);
+ for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++)
+ if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
+ goto out;
+
+ atomic_inc(&x86_pmu.lbr_exclusive[what]);
+ ret = 0;
+
+out:
+ mutex_unlock(&pmc_reserve_mutex);
+ return ret;
+}
+
+void x86_del_exclusive(unsigned int what)
+{
+ atomic_dec(&x86_pmu.lbr_exclusive[what]);
+}
+
int x86_setup_perfctr(struct perf_event *event)
{
struct perf_event_attr *attr = &event->attr;
@@ -346,6 +383,12 @@ int x86_setup_perfctr(struct perf_event *event)
/* BTS is currently only allowed for user-mode. */
if (!attr->exclude_kernel)
return -EOPNOTSUPP;
+
+ /* disallow bts if conflicting events are present */
+ if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+ return -EBUSY;
+
+ event->destroy = hw_perf_lbr_event_destroy;
}
hwc->config |= config;
@@ -399,39 +442,41 @@ int x86_pmu_hw_config(struct perf_event *event)
if (event->attr.precise_ip > precise)
return -EOPNOTSUPP;
- /*
- * check that PEBS LBR correction does not conflict with
- * whatever the user is asking with attr->branch_sample_type
- */
- if (event->attr.precise_ip > 1 &&
- x86_pmu.intel_cap.pebs_format < 2) {
- u64 *br_type = &event->attr.branch_sample_type;
-
- if (has_branch_stack(event)) {
- if (!precise_br_compat(event))
- return -EOPNOTSUPP;
-
- /* branch_sample_type is compatible */
-
- } else {
- /*
- * user did not specify branch_sample_type
- *
- * For PEBS fixups, we capture all
- * the branches at the priv level of the
- * event.
- */
- *br_type = PERF_SAMPLE_BRANCH_ANY;
-
- if (!event->attr.exclude_user)
- *br_type |= PERF_SAMPLE_BRANCH_USER;
-
- if (!event->attr.exclude_kernel)
- *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
- }
+ }
+ /*
+ * check that PEBS LBR correction does not conflict with
+ * whatever the user is asking with attr->branch_sample_type
+ */
+ if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
+ u64 *br_type = &event->attr.branch_sample_type;
+
+ if (has_branch_stack(event)) {
+ if (!precise_br_compat(event))
+ return -EOPNOTSUPP;
+
+ /* branch_sample_type is compatible */
+
+ } else {
+ /*
+ * user did not specify branch_sample_type
+ *
+ * For PEBS fixups, we capture all
+ * the branches at the priv level of the
+ * event.
+ */
+ *br_type = PERF_SAMPLE_BRANCH_ANY;
+
+ if (!event->attr.exclude_user)
+ *br_type |= PERF_SAMPLE_BRANCH_USER;
+
+ if (!event->attr.exclude_kernel)
+ *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
}
}
+ if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
+ event->attach_state |= PERF_ATTACH_TASK_DATA;
+
/*
* Generate PMC IRQs:
* (keep 'enabled' bit clear for now)
@@ -449,6 +494,12 @@ int x86_pmu_hw_config(struct perf_event *event)
if (event->attr.type == PERF_TYPE_RAW)
event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
+ if (event->attr.sample_period && x86_pmu.limit_period) {
+ if (x86_pmu.limit_period(event, event->attr.sample_period) >
+ event->attr.sample_period)
+ return -EINVAL;
+ }
+
return x86_setup_perfctr(event);
}
@@ -728,14 +779,17 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
struct event_constraint *c;
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
struct perf_event *e;
- int i, wmin, wmax, num = 0;
+ int i, wmin, wmax, unsched = 0;
struct hw_perf_event *hwc;
bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+ if (x86_pmu.start_scheduling)
+ x86_pmu.start_scheduling(cpuc);
+
for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
hwc = &cpuc->event_list[i]->hw;
- c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
+ c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
hwc->constraint = c;
wmin = min(wmin, c->weight);
@@ -768,24 +822,30 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
/* slow path */
if (i != n)
- num = perf_assign_events(cpuc->event_list, n, wmin,
- wmax, assign);
+ unsched = perf_assign_events(cpuc->event_list, n, wmin,
+ wmax, assign);
/*
- * Mark the event as committed, so we do not put_constraint()
- * in case new events are added and fail scheduling.
+ * In case of success (unsched = 0), mark events as committed,
+ * so we do not put_constraint() in case new events are added
+ * and fail to be scheduled
+ *
+ * We invoke the lower level commit callback to lock the resource
+ *
+ * We do not need to do all of this in case we are called to
+ * validate an event group (assign == NULL)
*/
- if (!num && assign) {
+ if (!unsched && assign) {
for (i = 0; i < n; i++) {
e = cpuc->event_list[i];
e->hw.flags |= PERF_X86_EVENT_COMMITTED;
+ if (x86_pmu.commit_scheduling)
+ x86_pmu.commit_scheduling(cpuc, e, assign[i]);
}
}
- /*
- * scheduling failed or is just a simulation,
- * free resources if necessary
- */
- if (!assign || num) {
+
+ if (!assign || unsched) {
+
for (i = 0; i < n; i++) {
e = cpuc->event_list[i];
/*
@@ -795,11 +855,18 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
continue;
+ /*
+ * release events that failed scheduling
+ */
if (x86_pmu.put_event_constraints)
x86_pmu.put_event_constraints(cpuc, e);
}
}
- return num ? -EINVAL : 0;
+
+ if (x86_pmu.stop_scheduling)
+ x86_pmu.stop_scheduling(cpuc);
+
+ return unsched ? -EINVAL : 0;
}
/*
@@ -986,6 +1053,9 @@ int x86_perf_event_set_period(struct perf_event *event)
if (left > x86_pmu.max_period)
left = x86_pmu.max_period;
+ if (x86_pmu.limit_period)
+ left = x86_pmu.limit_period(event, left);
+
per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
/*
@@ -1033,7 +1103,6 @@ static int x86_pmu_add(struct perf_event *event, int flags)
hwc = &event->hw;
- perf_pmu_disable(event->pmu);
n0 = cpuc->n_events;
ret = n = collect_events(cpuc, event, false);
if (ret < 0)
@@ -1071,7 +1140,6 @@ done_collect:
ret = 0;
out:
- perf_pmu_enable(event->pmu);
return ret;
}
@@ -1103,7 +1171,7 @@ static void x86_pmu_start(struct perf_event *event, int flags)
void perf_event_print_debug(void)
{
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
- u64 pebs;
+ u64 pebs, debugctl;
struct cpu_hw_events *cpuc;
unsigned long flags;
int cpu, idx;
@@ -1121,14 +1189,20 @@ void perf_event_print_debug(void)
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
- rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
pr_info("\n");
pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
pr_info("CPU#%d: status: %016llx\n", cpu, status);
pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
- pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
+ if (x86_pmu.pebs_constraints) {
+ rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
+ pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
+ }
+ if (x86_pmu.lbr_nr) {
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl);
+ }
}
pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
@@ -1321,11 +1395,12 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
unsigned int cpu = (long)hcpu;
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- int ret = NOTIFY_OK;
+ int i, ret = NOTIFY_OK;
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- cpuc->kfree_on_online = NULL;
+ for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
+ cpuc->kfree_on_online[i] = NULL;
if (x86_pmu.cpu_prepare)
ret = x86_pmu.cpu_prepare(cpu);
break;
@@ -1336,7 +1411,10 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
break;
case CPU_ONLINE:
- kfree(cpuc->kfree_on_online);
+ for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
+ kfree(cpuc->kfree_on_online[i]);
+ cpuc->kfree_on_online[i] = NULL;
+ }
break;
case CPU_DYING:
@@ -1712,7 +1790,7 @@ static int validate_event(struct perf_event *event)
if (IS_ERR(fake_cpuc))
return PTR_ERR(fake_cpuc);
- c = x86_pmu.get_event_constraints(fake_cpuc, event);
+ c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
if (!c || !c->weight)
ret = -EINVAL;
@@ -1914,10 +1992,10 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
NULL,
};
-static void x86_pmu_flush_branch_stack(void)
+static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
- if (x86_pmu.flush_branch_stack)
- x86_pmu.flush_branch_stack();
+ if (x86_pmu.sched_task)
+ x86_pmu.sched_task(ctx, sched_in);
}
void perf_check_microcode(void)
@@ -1949,7 +2027,8 @@ static struct pmu pmu = {
.commit_txn = x86_pmu_commit_txn,
.event_idx = x86_pmu_event_idx,
- .flush_branch_stack = x86_pmu_flush_branch_stack,
+ .sched_task = x86_pmu_sched_task,
+ .task_ctx_size = sizeof(struct x86_perf_task_context),
};
void arch_perf_update_userpage(struct perf_event *event,
@@ -1968,13 +2047,23 @@ void arch_perf_update_userpage(struct perf_event *event,
data = cyc2ns_read_begin();
+ /*
+ * Internal timekeeping for enabled/running/stopped times
+ * is always in the local_clock domain.
+ */
userpg->cap_user_time = 1;
userpg->time_mult = data->cyc2ns_mul;
userpg->time_shift = data->cyc2ns_shift;
userpg->time_offset = data->cyc2ns_offset - now;
- userpg->cap_user_time_zero = 1;
- userpg->time_zero = data->cyc2ns_offset;
+ /*
+ * cap_user_time_zero doesn't make sense when we're using a different
+ * time base for the records.
+ */
+ if (event->clock == &local_clock) {
+ userpg->cap_user_time_zero = 1;
+ userpg->time_zero = data->cyc2ns_offset;
+ }
cyc2ns_read_end(data);
}
@@ -2147,24 +2236,24 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
static unsigned long code_segment_base(struct pt_regs *regs)
{
/*
+ * For IA32 we look at the GDT/LDT segment base to convert the
+ * effective IP to a linear address.
+ */
+
+#ifdef CONFIG_X86_32
+ /*
* If we are in VM86 mode, add the segment offset to convert to a
* linear address.
*/
if (regs->flags & X86_VM_MASK)
return 0x10 * regs->cs;
- /*
- * For IA32 we look at the GDT/LDT segment base to convert the
- * effective IP to a linear address.
- */
-#ifdef CONFIG_X86_32
if (user_mode(regs) && regs->cs != __USER_CS)
return get_segment_base(regs->cs);
#else
- if (test_thread_flag(TIF_IA32)) {
- if (user_mode(regs) && regs->cs != __USER32_CS)
- return get_segment_base(regs->cs);
- }
+ if (user_mode(regs) && !user_64bit_mode(regs) &&
+ regs->cs != __USER32_CS)
+ return get_segment_base(regs->cs);
#endif
return 0;
}
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index df525d2be1e8..6ac5cb7a9e14 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -65,13 +65,15 @@ struct event_constraint {
/*
* struct hw_perf_event.flags flags
*/
-#define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */
-#define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */
-#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style datala, store */
-#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */
-#define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */
-#define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */
-#define PERF_X86_EVENT_RDPMC_ALLOWED 0x40 /* grant rdpmc permission */
+#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */
+#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */
+#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */
+#define PERF_X86_EVENT_COMMITTED 0x0008 /* event passed commit_txn */
+#define PERF_X86_EVENT_PEBS_LD_HSW 0x0010 /* haswell style datala, load */
+#define PERF_X86_EVENT_PEBS_NA_HSW 0x0020 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */
+#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */
+#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */
struct amd_nb {
@@ -123,8 +125,37 @@ struct intel_shared_regs {
unsigned core_id; /* per-core: core id */
};
+enum intel_excl_state_type {
+ INTEL_EXCL_UNUSED = 0, /* counter is unused */
+ INTEL_EXCL_SHARED = 1, /* counter can be used by both threads */
+ INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */
+};
+
+struct intel_excl_states {
+ enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
+ enum intel_excl_state_type state[X86_PMC_IDX_MAX];
+ int num_alloc_cntrs;/* #counters allocated */
+ int max_alloc_cntrs;/* max #counters allowed */
+ bool sched_started; /* true if scheduling has started */
+};
+
+struct intel_excl_cntrs {
+ raw_spinlock_t lock;
+
+ struct intel_excl_states states[2];
+
+ int refcnt; /* per-core: #HT threads */
+ unsigned core_id; /* per-core: core id */
+};
+
#define MAX_LBR_ENTRIES 16
+enum {
+ X86_PERF_KFREE_SHARED = 0,
+ X86_PERF_KFREE_EXCL = 1,
+ X86_PERF_KFREE_MAX
+};
+
struct cpu_hw_events {
/*
* Generic x86 PMC bits
@@ -179,6 +210,12 @@ struct cpu_hw_events {
* used on Intel NHM/WSM/SNB
*/
struct intel_shared_regs *shared_regs;
+ /*
+ * manage exclusive counter access between hyperthread
+ */
+ struct event_constraint *constraint_list; /* in enable order */
+ struct intel_excl_cntrs *excl_cntrs;
+ int excl_thread_id; /* 0 or 1 */
/*
* AMD specific bits
@@ -187,7 +224,7 @@ struct cpu_hw_events {
/* Inverted mask of bits to clear in the perf_ctr ctrl registers */
u64 perf_ctr_virt_mask;
- void *kfree_on_online;
+ void *kfree_on_online[X86_PERF_KFREE_MAX];
};
#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
@@ -202,6 +239,10 @@ struct cpu_hw_events {
#define EVENT_CONSTRAINT(c, n, m) \
__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
+#define INTEL_EXCLEVT_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
+ 0, PERF_X86_EVENT_EXCL)
+
/*
* The overlap flag marks event constraints with overlapping counter
* masks. This is the case if the counter mask of such an event is not
@@ -259,6 +300,10 @@ struct cpu_hw_events {
#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+#define INTEL_EXCLUEVT_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_EXCL)
+
#define INTEL_PLD_CONSTRAINT(c, n) \
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
@@ -283,22 +328,40 @@ struct cpu_hw_events {
/* Check flags and event code, and set the HSW load flag */
#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
- __EVENT_CONSTRAINT(code, n, \
+ __EVENT_CONSTRAINT(code, n, \
ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, \
+ PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
/* Check flags and event code/umask, and set the HSW store flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
__EVENT_CONSTRAINT(code, n, \
INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, \
+ PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL)
+
/* Check flags and event code/umask, and set the HSW load flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
__EVENT_CONSTRAINT(code, n, \
INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, \
+ PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
/* Check flags and event code/umask, and set the HSW N/A flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
__EVENT_CONSTRAINT(code, n, \
@@ -408,6 +471,13 @@ union x86_pmu_config {
#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
+enum {
+ x86_lbr_exclusive_lbr,
+ x86_lbr_exclusive_bts,
+ x86_lbr_exclusive_pt,
+ x86_lbr_exclusive_max,
+};
+
/*
* struct x86_pmu - generic x86 pmu
*/
@@ -443,14 +513,25 @@ struct x86_pmu {
u64 max_period;
struct event_constraint *
(*get_event_constraints)(struct cpu_hw_events *cpuc,
+ int idx,
struct perf_event *event);
void (*put_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
+
+ void (*commit_scheduling)(struct cpu_hw_events *cpuc,
+ struct perf_event *event,
+ int cntr);
+
+ void (*start_scheduling)(struct cpu_hw_events *cpuc);
+
+ void (*stop_scheduling)(struct cpu_hw_events *cpuc);
+
struct event_constraint *event_constraints;
struct x86_pmu_quirk *quirks;
int perfctr_second_write;
bool late_ack;
+ unsigned (*limit_period)(struct perf_event *event, unsigned l);
/*
* sysfs attrs
@@ -472,7 +553,8 @@ struct x86_pmu {
void (*cpu_dead)(int cpu);
void (*check_microcode)(void);
- void (*flush_branch_stack)(void);
+ void (*sched_task)(struct perf_event_context *ctx,
+ bool sched_in);
/*
* Intel Arch Perfmon v2+
@@ -504,10 +586,15 @@ struct x86_pmu {
bool lbr_double_abort; /* duplicated lbr aborts */
/*
+ * Intel PT/LBR/BTS are exclusive
+ */
+ atomic_t lbr_exclusive[x86_lbr_exclusive_max];
+
+ /*
* Extra registers for events
*/
struct extra_reg *extra_regs;
- unsigned int er_flags;
+ unsigned int flags;
/*
* Intel host/guest support (KVM)
@@ -515,6 +602,13 @@ struct x86_pmu {
struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
};
+struct x86_perf_task_context {
+ u64 lbr_from[MAX_LBR_ENTRIES];
+ u64 lbr_to[MAX_LBR_ENTRIES];
+ int lbr_callstack_users;
+ int lbr_stack_state;
+};
+
#define x86_add_quirk(func_) \
do { \
static struct x86_pmu_quirk __quirk __initdata = { \
@@ -524,8 +618,13 @@ do { \
x86_pmu.quirks = &__quirk; \
} while (0)
-#define ERF_NO_HT_SHARING 1
-#define ERF_HAS_RSP_1 2
+/*
+ * x86_pmu flags
+ */
+#define PMU_FL_NO_HT_SHARING 0x1 /* no hyper-threading resource sharing */
+#define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */
+#define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */
+#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -546,6 +645,12 @@ static struct perf_pmu_events_attr event_attr_##v = { \
extern struct x86_pmu x86_pmu __read_mostly;
+static inline bool x86_pmu_has_lbr_callstack(void)
+{
+ return x86_pmu.lbr_sel_map &&
+ x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
+}
+
DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
int x86_perf_event_set_period(struct perf_event *event);
@@ -588,6 +693,12 @@ static inline int x86_pmu_rdpmc_index(int index)
return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
}
+int x86_add_exclusive(unsigned int what);
+
+void x86_del_exclusive(unsigned int what);
+
+void hw_perf_lbr_event_destroy(struct perf_event *event);
+
int x86_setup_perfctr(struct perf_event *event);
int x86_pmu_hw_config(struct perf_event *event);
@@ -674,10 +785,34 @@ static inline int amd_pmu_init(void)
#ifdef CONFIG_CPU_SUP_INTEL
+static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
+{
+ /* user explicitly requested branch sampling */
+ if (has_branch_stack(event))
+ return true;
+
+ /* implicit branch sampling to correct PEBS skid */
+ if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
+ x86_pmu.intel_cap.pebs_format < 2)
+ return true;
+
+ return false;
+}
+
+static inline bool intel_pmu_has_bts(struct perf_event *event)
+{
+ if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
+ !event->attr.freq && event->hw.sample_period == 1)
+ return true;
+
+ return false;
+}
+
int intel_pmu_save_and_restart(struct perf_event *event);
struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event);
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event);
struct intel_shared_regs *allocate_shared_regs(int cpu);
@@ -727,13 +862,15 @@ void intel_pmu_pebs_disable_all(void);
void intel_ds_init(void);
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+
void intel_pmu_lbr_reset(void);
void intel_pmu_lbr_enable(struct perf_event *event);
void intel_pmu_lbr_disable(struct perf_event *event);
-void intel_pmu_lbr_enable_all(void);
+void intel_pmu_lbr_enable_all(bool pmi);
void intel_pmu_lbr_disable_all(void);
@@ -747,8 +884,18 @@ void intel_pmu_lbr_init_atom(void);
void intel_pmu_lbr_init_snb(void);
+void intel_pmu_lbr_init_hsw(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
+void intel_pt_interrupt(void);
+
+int intel_bts_interrupt(void);
+
+void intel_bts_enable_local(void);
+
+void intel_bts_disable_local(void);
+
int p4_pmu_init(void);
int p6_pmu_init(void);
@@ -758,6 +905,10 @@ int knc_pmu_init(void);
ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
char *page);
+static inline int is_ht_workaround_enabled(void)
+{
+ return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
+}
#else /* CONFIG_CPU_SUP_INTEL */
static inline void reserve_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 28926311aac1..1cee5d2d7ece 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -382,6 +382,7 @@ static int amd_pmu_cpu_prepare(int cpu)
static void amd_pmu_cpu_starting(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
struct amd_nb *nb;
int i, nb_id;
@@ -399,7 +400,7 @@ static void amd_pmu_cpu_starting(int cpu)
continue;
if (nb->nb_id == nb_id) {
- cpuc->kfree_on_online = cpuc->amd_nb;
+ *onln = cpuc->amd_nb;
cpuc->amd_nb = nb;
break;
}
@@ -429,7 +430,8 @@ static void amd_pmu_cpu_dead(int cpu)
}
static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
/*
* if not NB event or no NB, then no constraints
@@ -537,7 +539,8 @@ static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
static struct event_constraint *
-amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
+amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
unsigned int event_code = amd_get_event_code(hwc);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index a61f5c6911da..989d3c215d2b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -796,7 +796,7 @@ static int setup_ibs_ctl(int ibs_eilvt_off)
* the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
* is using the new offset.
*/
-static int force_ibs_eilvt_setup(void)
+static void force_ibs_eilvt_setup(void)
{
int offset;
int ret;
@@ -811,26 +811,24 @@ static int force_ibs_eilvt_setup(void)
if (offset == APIC_EILVT_NR_MAX) {
printk(KERN_DEBUG "No EILVT entry available\n");
- return -EBUSY;
+ return;
}
ret = setup_ibs_ctl(offset);
if (ret)
goto out;
- if (!ibs_eilvt_valid()) {
- ret = -EFAULT;
+ if (!ibs_eilvt_valid())
goto out;
- }
pr_info("IBS: LVT offset %d assigned\n", offset);
- return 0;
+ return;
out:
preempt_disable();
put_eilvt(offset);
preempt_enable();
- return ret;
+ return;
}
static void ibs_eilvt_setup(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 258990688a5e..219d3fb423a1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,6 +12,7 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/watchdog.h>
#include <asm/cpufeature.h>
#include <asm/hardirq.h>
@@ -113,6 +114,12 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
EVENT_CONSTRAINT_END
};
@@ -131,15 +138,12 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
- /*
- * Errata BV98 -- MEM_*_RETIRED events can leak between counters of SMT
- * siblings; disable these events because they can corrupt unrelated
- * counters.
- */
- INTEL_EVENT_CONSTRAINT(0xd0, 0x0), /* MEM_UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd1, 0x0), /* MEM_LOAD_UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd2, 0x0), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd3, 0x0), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
EVENT_CONSTRAINT_END
};
@@ -217,6 +221,21 @@ static struct event_constraint intel_hsw_event_constraints[] = {
INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
/* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
+
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_bdw_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
+ INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */
EVENT_CONSTRAINT_END
};
@@ -415,6 +434,202 @@ static __initconst const u64 snb_hw_cache_event_ids
};
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ * reliably counted.
+ */
+
+#define HSW_DEMAND_DATA_RD BIT_ULL(0)
+#define HSW_DEMAND_RFO BIT_ULL(1)
+#define HSW_ANY_RESPONSE BIT_ULL(16)
+#define HSW_SUPPLIER_NONE BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29)
+#define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED BIT_ULL(32)
+#define HSW_SNOOP_MISS BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35)
+#define HSW_SNOOP_HITM BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM BIT_ULL(37)
+#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \
+ HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+ HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+ HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_SNOOP_DRAM (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\
+ HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_LLC_ACCESS HSW_ANY_RESPONSE
+
+#define BDW_L3_MISS_LOCAL BIT(26)
+#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
+
+
+static __initconst const u64 hsw_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+ HSW_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS|HSW_ANY_SNOOP,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+ HSW_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS|HSW_ANY_SNOOP,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS_LOCAL_DRAM|
+ HSW_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS_REMOTE|
+ HSW_SNOOP_DRAM,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS_LOCAL_DRAM|
+ HSW_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS_REMOTE|
+ HSW_SNOOP_DRAM,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
static __initconst const u64 westmere_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -1029,21 +1244,10 @@ static __initconst const u64 slm_hw_cache_event_ids
},
};
-static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
-{
- /* user explicitly requested branch sampling */
- if (has_branch_stack(event))
- return true;
-
- /* implicit branch sampling to correct PEBS skid */
- if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
- x86_pmu.intel_cap.pebs_format < 2)
- return true;
-
- return false;
-}
-
-static void intel_pmu_disable_all(void)
+/*
+ * Use from PMIs where the LBRs are already disabled.
+ */
+static void __intel_pmu_disable_all(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1051,17 +1255,24 @@ static void intel_pmu_disable_all(void)
if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
intel_pmu_disable_bts();
+ else
+ intel_bts_disable_local();
intel_pmu_pebs_disable_all();
+}
+
+static void intel_pmu_disable_all(void)
+{
+ __intel_pmu_disable_all();
intel_pmu_lbr_disable_all();
}
-static void intel_pmu_enable_all(int added)
+static void __intel_pmu_enable_all(int added, bool pmi)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
intel_pmu_pebs_enable_all();
- intel_pmu_lbr_enable_all();
+ intel_pmu_lbr_enable_all(pmi);
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
@@ -1073,7 +1284,13 @@ static void intel_pmu_enable_all(int added)
return;
intel_pmu_enable_bts(event->hw.config);
- }
+ } else
+ intel_bts_enable_local();
+}
+
+static void intel_pmu_enable_all(int added)
+{
+ __intel_pmu_enable_all(added, false);
}
/*
@@ -1207,7 +1424,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
* must disable before any actual event
* because any event may be combined with LBR
*/
- if (intel_pmu_needs_lbr_smpl(event))
+ if (needs_branch_stack(event))
intel_pmu_lbr_disable(event);
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@ -1268,7 +1485,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
* must enabled before any actual event
* because any event may be combined with LBR
*/
- if (intel_pmu_needs_lbr_smpl(event))
+ if (needs_branch_stack(event))
intel_pmu_lbr_enable(event);
if (event->attr.exclude_host)
@@ -1334,6 +1551,18 @@ static void intel_pmu_reset(void)
if (ds)
ds->bts_index = ds->bts_buffer_base;
+ /* Ack all overflows and disable fixed counters */
+ if (x86_pmu.version >= 2) {
+ intel_pmu_ack_status(intel_pmu_get_status());
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+ }
+
+ /* Reset LBRs and LBR freezing */
+ if (x86_pmu.lbr_nr) {
+ update_debugctlmsr(get_debugctlmsr() &
+ ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
+ }
+
local_irq_restore(flags);
}
@@ -1357,8 +1586,9 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
*/
if (!x86_pmu.late_ack)
apic_write(APIC_LVTPC, APIC_DM_NMI);
- intel_pmu_disable_all();
+ __intel_pmu_disable_all();
handled = intel_pmu_drain_bts_buffer();
+ handled += intel_bts_interrupt();
status = intel_pmu_get_status();
if (!status)
goto done;
@@ -1399,6 +1629,14 @@ again:
}
/*
+ * Intel PT
+ */
+ if (__test_and_clear_bit(55, (unsigned long *)&status)) {
+ handled++;
+ intel_pt_interrupt();
+ }
+
+ /*
* Checkpointed counters can lead to 'spurious' PMIs because the
* rollback caused by the PMI will have cleared the overflow status
* bit. Therefore always force probe these counters.
@@ -1433,7 +1671,7 @@ again:
goto again;
done:
- intel_pmu_enable_all(0);
+ __intel_pmu_enable_all(0, true);
/*
* Only unmask the NMI after the overflow counters
* have been reset. This avoids spurious NMIs on
@@ -1464,7 +1702,7 @@ intel_bts_constraints(struct perf_event *event)
static int intel_alt_er(int idx)
{
- if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
+ if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
return idx;
if (idx == EXTRA_REG_RSP_0)
@@ -1624,7 +1862,8 @@ intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
}
struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
struct event_constraint *c;
@@ -1641,7 +1880,8 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
}
static struct event_constraint *
-intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
struct event_constraint *c;
@@ -1657,7 +1897,278 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
if (c)
return c;
- return x86_get_event_constraints(cpuc, event);
+ return x86_get_event_constraints(cpuc, idx, event);
+}
+
+static void
+intel_start_scheduling(struct cpu_hw_events *cpuc)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xl, *xlo;
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid; /* sibling thread */
+
+ /*
+ * nothing needed if in group validation mode
+ */
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return;
+
+ /*
+ * no exclusion needed
+ */
+ if (!excl_cntrs)
+ return;
+
+ xlo = &excl_cntrs->states[o_tid];
+ xl = &excl_cntrs->states[tid];
+
+ xl->sched_started = true;
+ xl->num_alloc_cntrs = 0;
+ /*
+ * lock shared state until we are done scheduling
+ * in stop_event_scheduling()
+ * makes scheduling appear as a transaction
+ */
+ WARN_ON_ONCE(!irqs_disabled());
+ raw_spin_lock(&excl_cntrs->lock);
+
+ /*
+ * save initial state of sibling thread
+ */
+ memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state));
+}
+
+static void
+intel_stop_scheduling(struct cpu_hw_events *cpuc)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xl, *xlo;
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid; /* sibling thread */
+
+ /*
+ * nothing needed if in group validation mode
+ */
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return;
+ /*
+ * no exclusion needed
+ */
+ if (!excl_cntrs)
+ return;
+
+ xlo = &excl_cntrs->states[o_tid];
+ xl = &excl_cntrs->states[tid];
+
+ /*
+ * make new sibling thread state visible
+ */
+ memcpy(xlo->state, xlo->init_state, sizeof(xlo->state));
+
+ xl->sched_started = false;
+ /*
+ * release shared state lock (acquired in intel_start_scheduling())
+ */
+ raw_spin_unlock(&excl_cntrs->lock);
+}
+
+static struct event_constraint *
+intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+ int idx, struct event_constraint *c)
+{
+ struct event_constraint *cx;
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xl, *xlo;
+ int is_excl, i;
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid; /* alternate */
+
+ /*
+ * validating a group does not require
+ * enforcing cross-thread exclusion
+ */
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return c;
+
+ /*
+ * no exclusion needed
+ */
+ if (!excl_cntrs)
+ return c;
+ /*
+ * event requires exclusive counter access
+ * across HT threads
+ */
+ is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+ /*
+ * xl = state of current HT
+ * xlo = state of sibling HT
+ */
+ xl = &excl_cntrs->states[tid];
+ xlo = &excl_cntrs->states[o_tid];
+
+ /*
+ * do not allow scheduling of more than max_alloc_cntrs
+ * which is set to half the available generic counters.
+ * this helps avoid counter starvation of sibling thread
+ * by ensuring at most half the counters cannot be in
+ * exclusive mode. There is not designated counters for the
+ * limits. Any N/2 counters can be used. This helps with
+ * events with specifix counter constraints
+ */
+ if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
+ return &emptyconstraint;
+
+ cx = c;
+
+ /*
+ * because we modify the constraint, we need
+ * to make a copy. Static constraints come
+ * from static const tables.
+ *
+ * only needed when constraint has not yet
+ * been cloned (marked dynamic)
+ */
+ if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
+
+ /* sanity check */
+ if (idx < 0)
+ return &emptyconstraint;
+
+ /*
+ * grab pre-allocated constraint entry
+ */
+ cx = &cpuc->constraint_list[idx];
+
+ /*
+ * initialize dynamic constraint
+ * with static constraint
+ */
+ memcpy(cx, c, sizeof(*cx));
+
+ /*
+ * mark constraint as dynamic, so we
+ * can free it later on
+ */
+ cx->flags |= PERF_X86_EVENT_DYNAMIC;
+ }
+
+ /*
+ * From here on, the constraint is dynamic.
+ * Either it was just allocated above, or it
+ * was allocated during a earlier invocation
+ * of this function
+ */
+
+ /*
+ * Modify static constraint with current dynamic
+ * state of thread
+ *
+ * EXCLUSIVE: sibling counter measuring exclusive event
+ * SHARED : sibling counter measuring non-exclusive event
+ * UNUSED : sibling counter unused
+ */
+ for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) {
+ /*
+ * exclusive event in sibling counter
+ * our corresponding counter cannot be used
+ * regardless of our event
+ */
+ if (xl->state[i] == INTEL_EXCL_EXCLUSIVE)
+ __clear_bit(i, cx->idxmsk);
+ /*
+ * if measuring an exclusive event, sibling
+ * measuring non-exclusive, then counter cannot
+ * be used
+ */
+ if (is_excl && xl->state[i] == INTEL_EXCL_SHARED)
+ __clear_bit(i, cx->idxmsk);
+ }
+
+ /*
+ * recompute actual bit weight for scheduling algorithm
+ */
+ cx->weight = hweight64(cx->idxmsk64);
+
+ /*
+ * if we return an empty mask, then switch
+ * back to static empty constraint to avoid
+ * the cost of freeing later on
+ */
+ if (cx->weight == 0)
+ cx = &emptyconstraint;
+
+ return cx;
+}
+
+static struct event_constraint *
+intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c1 = event->hw.constraint;
+ struct event_constraint *c2;
+
+ /*
+ * first time only
+ * - static constraint: no change across incremental scheduling calls
+ * - dynamic constraint: handled by intel_get_excl_constraints()
+ */
+ c2 = __intel_get_event_constraints(cpuc, idx, event);
+ if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) {
+ bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
+ c1->weight = c2->weight;
+ c2 = c1;
+ }
+
+ if (cpuc->excl_cntrs)
+ return intel_get_excl_constraints(cpuc, event, idx, c2);
+
+ return c2;
+}
+
+static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xlo, *xl;
+ unsigned long flags = 0; /* keep compiler happy */
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid;
+
+ /*
+ * nothing needed if in group validation mode
+ */
+ if (cpuc->is_fake)
+ return;
+
+ WARN_ON_ONCE(!excl_cntrs);
+
+ if (!excl_cntrs)
+ return;
+
+ xl = &excl_cntrs->states[tid];
+ xlo = &excl_cntrs->states[o_tid];
+
+ /*
+ * put_constraint may be called from x86_schedule_events()
+ * which already has the lock held so here make locking
+ * conditional
+ */
+ if (!xl->sched_started)
+ raw_spin_lock_irqsave(&excl_cntrs->lock, flags);
+
+ /*
+ * if event was actually assigned, then mark the
+ * counter state as unused now
+ */
+ if (hwc->idx >= 0)
+ xlo->state[hwc->idx] = INTEL_EXCL_UNUSED;
+
+ if (!xl->sched_started)
+ raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags);
}
static void
@@ -1678,7 +2189,57 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
struct perf_event *event)
{
+ struct event_constraint *c = event->hw.constraint;
+
intel_put_shared_regs_event_constraints(cpuc, event);
+
+ /*
+ * is PMU has exclusive counter restrictions, then
+ * all events are subject to and must call the
+ * put_excl_constraints() routine
+ */
+ if (c && cpuc->excl_cntrs)
+ intel_put_excl_constraints(cpuc, event);
+
+ /* cleanup dynamic constraint */
+ if (c && (c->flags & PERF_X86_EVENT_DYNAMIC))
+ event->hw.constraint = NULL;
+}
+
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc,
+ struct perf_event *event, int cntr)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct event_constraint *c = event->hw.constraint;
+ struct intel_excl_states *xlo, *xl;
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid;
+ int is_excl;
+
+ if (cpuc->is_fake || !c)
+ return;
+
+ is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+ if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+ return;
+
+ WARN_ON_ONCE(!excl_cntrs);
+
+ if (!excl_cntrs)
+ return;
+
+ xl = &excl_cntrs->states[tid];
+ xlo = &excl_cntrs->states[o_tid];
+
+ WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock));
+
+ if (cntr >= 0) {
+ if (is_excl)
+ xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
+ else
+ xlo->init_state[cntr] = INTEL_EXCL_SHARED;
+ }
}
static void intel_pebs_aliases_core2(struct perf_event *event)
@@ -1747,10 +2308,21 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (event->attr.precise_ip && x86_pmu.pebs_aliases)
x86_pmu.pebs_aliases(event);
- if (intel_pmu_needs_lbr_smpl(event)) {
+ if (needs_branch_stack(event)) {
ret = intel_pmu_setup_lbr_filter(event);
if (ret)
return ret;
+
+ /*
+ * BTS is set up earlier in this path, so don't account twice
+ */
+ if (!intel_pmu_has_bts(event)) {
+ /* disallow lbr if conflicting events are present */
+ if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+ return -EBUSY;
+
+ event->destroy = hw_perf_lbr_event_destroy;
+ }
}
if (event->attr.type != PERF_TYPE_RAW)
@@ -1891,9 +2463,12 @@ static struct event_constraint counter2_constraint =
EVENT_CONSTRAINT(0, 0x4, 0);
static struct event_constraint *
-hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
- struct event_constraint *c = intel_get_event_constraints(cpuc, event);
+ struct event_constraint *c;
+
+ c = intel_get_event_constraints(cpuc, idx, event);
/* Handle special quirk on in_tx_checkpointed only in counter 2 */
if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
@@ -1905,6 +2480,32 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
return c;
}
+/*
+ * Broadwell:
+ *
+ * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
+ * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
+ * the two to enforce a minimum period of 128 (the smallest value that has bits
+ * 0-5 cleared and >= 100).
+ *
+ * Because of how the code in x86_perf_event_set_period() works, the truncation
+ * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
+ * to make up for the 'lost' events due to carrying the 'error' in period_left.
+ *
+ * Therefore the effective (average) period matches the requested period,
+ * despite coarser hardware granularity.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+ if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+ X86_CONFIG(.event=0xc0, .umask=0x01)) {
+ if (left < 128)
+ left = 128;
+ left &= ~0x3fu;
+ }
+ return left;
+}
+
PMU_FORMAT_ATTR(event, "config:0-7" );
PMU_FORMAT_ATTR(umask, "config:8-15" );
PMU_FORMAT_ATTR(edge, "config:18" );
@@ -1979,16 +2580,52 @@ struct intel_shared_regs *allocate_shared_regs(int cpu)
return regs;
}
+static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
+{
+ struct intel_excl_cntrs *c;
+ int i;
+
+ c = kzalloc_node(sizeof(struct intel_excl_cntrs),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (c) {
+ raw_spin_lock_init(&c->lock);
+ for (i = 0; i < X86_PMC_IDX_MAX; i++) {
+ c->states[0].state[i] = INTEL_EXCL_UNUSED;
+ c->states[0].init_state[i] = INTEL_EXCL_UNUSED;
+
+ c->states[1].state[i] = INTEL_EXCL_UNUSED;
+ c->states[1].init_state[i] = INTEL_EXCL_UNUSED;
+ }
+ c->core_id = -1;
+ }
+ return c;
+}
+
static int intel_pmu_cpu_prepare(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map))
- return NOTIFY_OK;
+ if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
+ cpuc->shared_regs = allocate_shared_regs(cpu);
+ if (!cpuc->shared_regs)
+ return NOTIFY_BAD;
+ }
- cpuc->shared_regs = allocate_shared_regs(cpu);
- if (!cpuc->shared_regs)
- return NOTIFY_BAD;
+ if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+ size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
+
+ cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
+ if (!cpuc->constraint_list)
+ return NOTIFY_BAD;
+
+ cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
+ if (!cpuc->excl_cntrs) {
+ kfree(cpuc->constraint_list);
+ kfree(cpuc->shared_regs);
+ return NOTIFY_BAD;
+ }
+ cpuc->excl_thread_id = 0;
+ }
return NOTIFY_OK;
}
@@ -2010,13 +2647,15 @@ static void intel_pmu_cpu_starting(int cpu)
if (!cpuc->shared_regs)
return;
- if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) {
+ if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
+ void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
+
for_each_cpu(i, topology_thread_cpumask(cpu)) {
struct intel_shared_regs *pc;
pc = per_cpu(cpu_hw_events, i).shared_regs;
if (pc && pc->core_id == core_id) {
- cpuc->kfree_on_online = cpuc->shared_regs;
+ *onln = cpuc->shared_regs;
cpuc->shared_regs = pc;
break;
}
@@ -2027,6 +2666,44 @@ static void intel_pmu_cpu_starting(int cpu)
if (x86_pmu.lbr_sel_map)
cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
+
+ if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+ int h = x86_pmu.num_counters >> 1;
+
+ for_each_cpu(i, topology_thread_cpumask(cpu)) {
+ struct intel_excl_cntrs *c;
+
+ c = per_cpu(cpu_hw_events, i).excl_cntrs;
+ if (c && c->core_id == core_id) {
+ cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
+ cpuc->excl_cntrs = c;
+ cpuc->excl_thread_id = 1;
+ break;
+ }
+ }
+ cpuc->excl_cntrs->core_id = core_id;
+ cpuc->excl_cntrs->refcnt++;
+ /*
+ * set hard limit to half the number of generic counters
+ */
+ cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
+ cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
+ }
+}
+
+static void free_excl_cntrs(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ struct intel_excl_cntrs *c;
+
+ c = cpuc->excl_cntrs;
+ if (c) {
+ if (c->core_id == -1 || --c->refcnt == 0)
+ kfree(c);
+ cpuc->excl_cntrs = NULL;
+ kfree(cpuc->constraint_list);
+ cpuc->constraint_list = NULL;
+ }
}
static void intel_pmu_cpu_dying(int cpu)
@@ -2041,19 +2718,9 @@ static void intel_pmu_cpu_dying(int cpu)
cpuc->shared_regs = NULL;
}
- fini_debug_store_on_cpu(cpu);
-}
+ free_excl_cntrs(cpu);
-static void intel_pmu_flush_branch_stack(void)
-{
- /*
- * Intel LBR does not tag entries with the
- * PID of the current task, then we need to
- * flush it on ctxsw
- * For now, we simply reset it
- */
- if (x86_pmu.lbr_nr)
- intel_pmu_lbr_reset();
+ fini_debug_store_on_cpu(cpu);
}
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
@@ -2107,7 +2774,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
.guest_get_msrs = intel_guest_get_msrs,
- .flush_branch_stack = intel_pmu_flush_branch_stack,
+ .sched_task = intel_pmu_lbr_sched_task,
};
static __init void intel_clovertown_quirk(void)
@@ -2264,6 +2931,27 @@ static __init void intel_nehalem_quirk(void)
}
}
+/*
+ * enable software workaround for errata:
+ * SNB: BJ122
+ * IVB: BV98
+ * HSW: HSD29
+ *
+ * Only needed when HT is enabled. However detecting
+ * if HT is enabled is difficult (model specific). So instead,
+ * we enable the workaround in the early boot, and verify if
+ * it is needed in a later initcall phase once we have valid
+ * topology information to check if HT is actually enabled
+ */
+static __init void intel_ht_bug(void)
+{
+ x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
+
+ x86_pmu.commit_scheduling = intel_commit_scheduling;
+ x86_pmu.start_scheduling = intel_start_scheduling;
+ x86_pmu.stop_scheduling = intel_stop_scheduling;
+}
+
EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3");
EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82")
@@ -2443,7 +3131,7 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_slm_event_constraints;
x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
x86_pmu.extra_regs = intel_slm_extra_regs;
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
pr_cont("Silvermont events, ");
break;
@@ -2461,7 +3149,7 @@ __init int intel_pmu_init(void)
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
x86_pmu.extra_regs = intel_westmere_extra_regs;
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.cpu_events = nhm_events_attrs;
@@ -2478,6 +3166,7 @@ __init int intel_pmu_init(void)
case 42: /* 32nm SandyBridge */
case 45: /* 32nm SandyBridge-E/EN/EP */
x86_add_quirk(intel_sandybridge_quirk);
+ x86_add_quirk(intel_ht_bug);
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
@@ -2492,9 +3181,11 @@ __init int intel_pmu_init(void)
x86_pmu.extra_regs = intel_snbep_extra_regs;
else
x86_pmu.extra_regs = intel_snb_extra_regs;
+
+
/* all extra regs are per-cpu when HT is on */
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
- x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
x86_pmu.cpu_events = snb_events_attrs;
@@ -2510,6 +3201,7 @@ __init int intel_pmu_init(void)
case 58: /* 22nm IvyBridge */
case 62: /* 22nm IvyBridge-EP/EX */
+ x86_add_quirk(intel_ht_bug);
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
/* dTLB-load-misses on IVB is different than SNB */
@@ -2528,8 +3220,8 @@ __init int intel_pmu_init(void)
else
x86_pmu.extra_regs = intel_snb_extra_regs;
/* all extra regs are per-cpu when HT is on */
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
- x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
x86_pmu.cpu_events = snb_events_attrs;
@@ -2545,19 +3237,20 @@ __init int intel_pmu_init(void)
case 63: /* 22nm Haswell Server */
case 69: /* 22nm Haswell ULT */
case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+ x86_add_quirk(intel_ht_bug);
x86_pmu.late_ack = true;
- memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
- memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
- intel_pmu_lbr_init_snb();
+ intel_pmu_lbr_init_hsw();
x86_pmu.event_constraints = intel_hsw_event_constraints;
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
x86_pmu.extra_regs = intel_snbep_extra_regs;
x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
/* all extra regs are per-cpu when HT is on */
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
- x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
@@ -2566,6 +3259,39 @@ __init int intel_pmu_init(void)
pr_cont("Haswell events, ");
break;
+ case 61: /* 14nm Broadwell Core-M */
+ case 86: /* 14nm Broadwell Xeon D */
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+ /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+ hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
+ BDW_L3_MISS|HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
+ HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
+ BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
+ BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+
+ intel_pmu_lbr_init_hsw();
+
+ x86_pmu.event_constraints = intel_bdw_event_constraints;
+ x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.cpu_events = hsw_events_attrs;
+ x86_pmu.limit_period = bdw_limit_period;
+ pr_cont("Broadwell events, ");
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
@@ -2651,3 +3377,47 @@ __init int intel_pmu_init(void)
return 0;
}
+
+/*
+ * HT bug: phase 2 init
+ * Called once we have valid topology information to check
+ * whether or not HT is enabled
+ * If HT is off, then we disable the workaround
+ */
+static __init int fixup_ht_bug(void)
+{
+ int cpu = smp_processor_id();
+ int w, c;
+ /*
+ * problem not present on this CPU model, nothing to do
+ */
+ if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
+ return 0;
+
+ w = cpumask_weight(topology_thread_cpumask(cpu));
+ if (w > 1) {
+ pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
+ return 0;
+ }
+
+ watchdog_nmi_disable_all();
+
+ x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
+
+ x86_pmu.commit_scheduling = NULL;
+ x86_pmu.start_scheduling = NULL;
+ x86_pmu.stop_scheduling = NULL;
+
+ watchdog_nmi_enable_all();
+
+ get_online_cpus();
+
+ for_each_online_cpu(c) {
+ free_excl_cntrs(c);
+ }
+
+ put_online_cpus();
+ pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
+ return 0;
+}
+subsys_initcall(fixup_ht_bug)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
new file mode 100644
index 000000000000..ac1f0c55f379
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -0,0 +1,525 @@
+/*
+ * BTS PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/coredump.h>
+
+#include <asm-generic/sizes.h>
+#include <asm/perf_event.h>
+
+#include "perf_event.h"
+
+struct bts_ctx {
+ struct perf_output_handle handle;
+ struct debug_store ds_back;
+ int started;
+};
+
+static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
+
+#define BTS_RECORD_SIZE 24
+#define BTS_SAFETY_MARGIN 4080
+
+struct bts_phys {
+ struct page *page;
+ unsigned long size;
+ unsigned long offset;
+ unsigned long displacement;
+};
+
+struct bts_buffer {
+ size_t real_size; /* multiple of BTS_RECORD_SIZE */
+ unsigned int nr_pages;
+ unsigned int nr_bufs;
+ unsigned int cur_buf;
+ bool snapshot;
+ local_t data_size;
+ local_t lost;
+ local_t head;
+ unsigned long end;
+ void **data_pages;
+ struct bts_phys buf[0];
+};
+
+struct pmu bts_pmu;
+
+void intel_pmu_enable_bts(u64 config);
+void intel_pmu_disable_bts(void);
+
+static size_t buf_size(struct page *page)
+{
+ return 1 << (PAGE_SHIFT + page_private(page));
+}
+
+static void *
+bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
+{
+ struct bts_buffer *buf;
+ struct page *page;
+ int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+ unsigned long offset;
+ size_t size = nr_pages << PAGE_SHIFT;
+ int pg, nbuf, pad;
+
+ /* count all the high order buffers */
+ for (pg = 0, nbuf = 0; pg < nr_pages;) {
+ page = virt_to_page(pages[pg]);
+ if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1))
+ return NULL;
+ pg += 1 << page_private(page);
+ nbuf++;
+ }
+
+ /*
+ * to avoid interrupts in overwrite mode, only allow one physical
+ */
+ if (overwrite && nbuf > 1)
+ return NULL;
+
+ buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
+ if (!buf)
+ return NULL;
+
+ buf->nr_pages = nr_pages;
+ buf->nr_bufs = nbuf;
+ buf->snapshot = overwrite;
+ buf->data_pages = pages;
+ buf->real_size = size - size % BTS_RECORD_SIZE;
+
+ for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
+ unsigned int __nr_pages;
+
+ page = virt_to_page(pages[pg]);
+ __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1;
+ buf->buf[nbuf].page = page;
+ buf->buf[nbuf].offset = offset;
+ buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
+ buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
+ pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
+ buf->buf[nbuf].size -= pad;
+
+ pg += __nr_pages;
+ offset += __nr_pages << PAGE_SHIFT;
+ }
+
+ return buf;
+}
+
+static void bts_buffer_free_aux(void *data)
+{
+ kfree(data);
+}
+
+static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
+{
+ return buf->buf[idx].offset + buf->buf[idx].displacement;
+}
+
+static void
+bts_config_buffer(struct bts_buffer *buf)
+{
+ int cpu = raw_smp_processor_id();
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ struct bts_phys *phys = &buf->buf[buf->cur_buf];
+ unsigned long index, thresh = 0, end = phys->size;
+ struct page *page = phys->page;
+
+ index = local_read(&buf->head);
+
+ if (!buf->snapshot) {
+ if (buf->end < phys->offset + buf_size(page))
+ end = buf->end - phys->offset - phys->displacement;
+
+ index -= phys->offset + phys->displacement;
+
+ if (end - index > BTS_SAFETY_MARGIN)
+ thresh = end - BTS_SAFETY_MARGIN;
+ else if (end - index > BTS_RECORD_SIZE)
+ thresh = end - BTS_RECORD_SIZE;
+ else
+ thresh = end;
+ }
+
+ ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
+ ds->bts_index = ds->bts_buffer_base + index;
+ ds->bts_absolute_maximum = ds->bts_buffer_base + end;
+ ds->bts_interrupt_threshold = !buf->snapshot
+ ? ds->bts_buffer_base + thresh
+ : ds->bts_absolute_maximum + BTS_RECORD_SIZE;
+}
+
+static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
+{
+ unsigned long index = head - phys->offset;
+
+ memset(page_address(phys->page) + index, 0, phys->size - index);
+}
+
+static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts)
+{
+ if (buf->snapshot)
+ return false;
+
+ if (local_read(&buf->data_size) >= bts->handle.size ||
+ bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE)
+ return true;
+
+ return false;
+}
+
+static void bts_update(struct bts_ctx *bts)
+{
+ int cpu = raw_smp_processor_id();
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ struct bts_buffer *buf = perf_get_aux(&bts->handle);
+ unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;
+
+ if (!buf)
+ return;
+
+ head = index + bts_buffer_offset(buf, buf->cur_buf);
+ old = local_xchg(&buf->head, head);
+
+ if (!buf->snapshot) {
+ if (old == head)
+ return;
+
+ if (ds->bts_index >= ds->bts_absolute_maximum)
+ local_inc(&buf->lost);
+
+ /*
+ * old and head are always in the same physical buffer, so we
+ * can subtract them to get the data size.
+ */
+ local_add(head - old, &buf->data_size);
+ } else {
+ local_set(&buf->data_size, head);
+ }
+}
+
+static void __bts_event_start(struct perf_event *event)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_buffer *buf = perf_get_aux(&bts->handle);
+ u64 config = 0;
+
+ if (!buf || bts_buffer_is_full(buf, bts))
+ return;
+
+ event->hw.state = 0;
+
+ if (!buf->snapshot)
+ config |= ARCH_PERFMON_EVENTSEL_INT;
+ if (!event->attr.exclude_kernel)
+ config |= ARCH_PERFMON_EVENTSEL_OS;
+ if (!event->attr.exclude_user)
+ config |= ARCH_PERFMON_EVENTSEL_USR;
+
+ bts_config_buffer(buf);
+
+ /*
+ * local barrier to make sure that ds configuration made it
+ * before we enable BTS
+ */
+ wmb();
+
+ intel_pmu_enable_bts(config);
+}
+
+static void bts_event_start(struct perf_event *event, int flags)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ __bts_event_start(event);
+
+ /* PMI handler: this counter is running and likely generating PMIs */
+ ACCESS_ONCE(bts->started) = 1;
+}
+
+static void __bts_event_stop(struct perf_event *event)
+{
+ /*
+ * No extra synchronization is mandated by the documentation to have
+ * BTS data stores globally visible.
+ */
+ intel_pmu_disable_bts();
+
+ if (event->hw.state & PERF_HES_STOPPED)
+ return;
+
+ ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED;
+}
+
+static void bts_event_stop(struct perf_event *event, int flags)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ /* PMI handler: don't restart this counter */
+ ACCESS_ONCE(bts->started) = 0;
+
+ __bts_event_stop(event);
+
+ if (flags & PERF_EF_UPDATE)
+ bts_update(bts);
+}
+
+void intel_bts_enable_local(void)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ if (bts->handle.event && bts->started)
+ __bts_event_start(bts->handle.event);
+}
+
+void intel_bts_disable_local(void)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ if (bts->handle.event)
+ __bts_event_stop(bts->handle.event);
+}
+
+static int
+bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
+{
+ unsigned long head, space, next_space, pad, gap, skip, wakeup;
+ unsigned int next_buf;
+ struct bts_phys *phys, *next_phys;
+ int ret;
+
+ if (buf->snapshot)
+ return 0;
+
+ head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
+ if (WARN_ON_ONCE(head != local_read(&buf->head)))
+ return -EINVAL;
+
+ phys = &buf->buf[buf->cur_buf];
+ space = phys->offset + phys->displacement + phys->size - head;
+ pad = space;
+ if (space > handle->size) {
+ space = handle->size;
+ space -= space % BTS_RECORD_SIZE;
+ }
+ if (space <= BTS_SAFETY_MARGIN) {
+ /* See if next phys buffer has more space */
+ next_buf = buf->cur_buf + 1;
+ if (next_buf >= buf->nr_bufs)
+ next_buf = 0;
+ next_phys = &buf->buf[next_buf];
+ gap = buf_size(phys->page) - phys->displacement - phys->size +
+ next_phys->displacement;
+ skip = pad + gap;
+ if (handle->size >= skip) {
+ next_space = next_phys->size;
+ if (next_space + skip > handle->size) {
+ next_space = handle->size - skip;
+ next_space -= next_space % BTS_RECORD_SIZE;
+ }
+ if (next_space > space || !space) {
+ if (pad)
+ bts_buffer_pad_out(phys, head);
+ ret = perf_aux_output_skip(handle, skip);
+ if (ret)
+ return ret;
+ /* Advance to next phys buffer */
+ phys = next_phys;
+ space = next_space;
+ head = phys->offset + phys->displacement;
+ /*
+ * After this, cur_buf and head won't match ds
+ * anymore, so we must not be racing with
+ * bts_update().
+ */
+ buf->cur_buf = next_buf;
+ local_set(&buf->head, head);
+ }
+ }
+ }
+
+ /* Don't go far beyond wakeup watermark */
+ wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
+ handle->head;
+ if (space > wakeup) {
+ space = wakeup;
+ space -= space % BTS_RECORD_SIZE;
+ }
+
+ buf->end = head + space;
+
+ /*
+ * If we have no space, the lost notification would have been sent when
+ * we hit absolute_maximum - see bts_update()
+ */
+ if (!space)
+ return -ENOSPC;
+
+ return 0;
+}
+
+int intel_bts_interrupt(void)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct perf_event *event = bts->handle.event;
+ struct bts_buffer *buf;
+ s64 old_head;
+ int err;
+
+ if (!event || !bts->started)
+ return 0;
+
+ buf = perf_get_aux(&bts->handle);
+ /*
+ * Skip snapshot counters: they don't use the interrupt, but
+ * there's no other way of telling, because the pointer will
+ * keep moving
+ */
+ if (!buf || buf->snapshot)
+ return 0;
+
+ old_head = local_read(&buf->head);
+ bts_update(bts);
+
+ /* no new data */
+ if (old_head == local_read(&buf->head))
+ return 0;
+
+ perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+ !!local_xchg(&buf->lost, 0));
+
+ buf = perf_aux_output_begin(&bts->handle, event);
+ if (!buf)
+ return 1;
+
+ err = bts_buffer_reset(buf, &bts->handle);
+ if (err)
+ perf_aux_output_end(&bts->handle, 0, false);
+
+ return 1;
+}
+
+static void bts_event_del(struct perf_event *event, int mode)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_buffer *buf = perf_get_aux(&bts->handle);
+
+ bts_event_stop(event, PERF_EF_UPDATE);
+
+ if (buf) {
+ if (buf->snapshot)
+ bts->handle.head =
+ local_xchg(&buf->data_size,
+ buf->nr_pages << PAGE_SHIFT);
+ perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+ !!local_xchg(&buf->lost, 0));
+ }
+
+ cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
+ cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
+ cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
+ cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
+}
+
+static int bts_event_add(struct perf_event *event, int mode)
+{
+ struct bts_buffer *buf;
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ int ret = -EBUSY;
+
+ event->hw.state = PERF_HES_STOPPED;
+
+ if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+ return -EBUSY;
+
+ if (bts->handle.event)
+ return -EBUSY;
+
+ buf = perf_aux_output_begin(&bts->handle, event);
+ if (!buf)
+ return -EINVAL;
+
+ ret = bts_buffer_reset(buf, &bts->handle);
+ if (ret) {
+ perf_aux_output_end(&bts->handle, 0, false);
+ return ret;
+ }
+
+ bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
+ bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
+ bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
+
+ if (mode & PERF_EF_START) {
+ bts_event_start(event, 0);
+ if (hwc->state & PERF_HES_STOPPED) {
+ bts_event_del(event, 0);
+ return -EBUSY;
+ }
+ }
+
+ return 0;
+}
+
+static void bts_event_destroy(struct perf_event *event)
+{
+ x86_del_exclusive(x86_lbr_exclusive_bts);
+}
+
+static int bts_event_init(struct perf_event *event)
+{
+ if (event->attr.type != bts_pmu.type)
+ return -ENOENT;
+
+ if (x86_add_exclusive(x86_lbr_exclusive_bts))
+ return -EBUSY;
+
+ event->destroy = bts_event_destroy;
+
+ return 0;
+}
+
+static void bts_event_read(struct perf_event *event)
+{
+}
+
+static __init int bts_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
+ return -ENODEV;
+
+ bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE;
+ bts_pmu.task_ctx_nr = perf_sw_context;
+ bts_pmu.event_init = bts_event_init;
+ bts_pmu.add = bts_event_add;
+ bts_pmu.del = bts_event_del;
+ bts_pmu.start = bts_event_start;
+ bts_pmu.stop = bts_event_stop;
+ bts_pmu.read = bts_event_read;
+ bts_pmu.setup_aux = bts_buffer_setup_aux;
+ bts_pmu.free_aux = bts_buffer_free_aux;
+
+ return perf_pmu_register(&bts_pmu, "intel_bts", -1);
+}
+
+module_init(bts_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
new file mode 100644
index 000000000000..e4d1b8b738fa
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -0,0 +1,1379 @@
+/*
+ * Intel Cache Quality-of-Service Monitoring (CQM) support.
+ *
+ * Based very, very heavily on work by Peter Zijlstra.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include <asm/cpu_device_id.h>
+#include "perf_event.h"
+
+#define MSR_IA32_PQR_ASSOC 0x0c8f
+#define MSR_IA32_QM_CTR 0x0c8e
+#define MSR_IA32_QM_EVTSEL 0x0c8d
+
+static unsigned int cqm_max_rmid = -1;
+static unsigned int cqm_l3_scale; /* supposedly cacheline size */
+
+struct intel_cqm_state {
+ raw_spinlock_t lock;
+ int rmid;
+ int cnt;
+};
+
+static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
+
+/*
+ * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
+ * Also protects event->hw.cqm_rmid
+ *
+ * Hold either for stability, both for modification of ->hw.cqm_rmid.
+ */
+static DEFINE_MUTEX(cache_mutex);
+static DEFINE_RAW_SPINLOCK(cache_lock);
+
+/*
+ * Groups of events that have the same target(s), one RMID per group.
+ */
+static LIST_HEAD(cache_groups);
+
+/*
+ * Mask of CPUs for reading CQM values. We only need one per-socket.
+ */
+static cpumask_t cqm_cpumask;
+
+#define RMID_VAL_ERROR (1ULL << 63)
+#define RMID_VAL_UNAVAIL (1ULL << 62)
+
+#define QOS_L3_OCCUP_EVENT_ID (1 << 0)
+
+#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
+
+/*
+ * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
+ *
+ * This rmid is always free and is guaranteed to have an associated
+ * near-zero occupancy value, i.e. no cachelines are tagged with this
+ * RMID, once __intel_cqm_rmid_rotate() returns.
+ */
+static unsigned int intel_cqm_rotation_rmid;
+
+#define INVALID_RMID (-1)
+
+/*
+ * Is @rmid valid for programming the hardware?
+ *
+ * rmid 0 is reserved by the hardware for all non-monitored tasks, which
+ * means that we should never come across an rmid with that value.
+ * Likewise, an rmid value of -1 is used to indicate "no rmid currently
+ * assigned" and is used as part of the rotation code.
+ */
+static inline bool __rmid_valid(unsigned int rmid)
+{
+ if (!rmid || rmid == INVALID_RMID)
+ return false;
+
+ return true;
+}
+
+static u64 __rmid_read(unsigned int rmid)
+{
+ u64 val;
+
+ /*
+ * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
+ * it just says that to increase confusion.
+ */
+ wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
+ rdmsrl(MSR_IA32_QM_CTR, val);
+
+ /*
+ * Aside from the ERROR and UNAVAIL bits, assume this thing returns
+ * the number of cachelines tagged with @rmid.
+ */
+ return val;
+}
+
+enum rmid_recycle_state {
+ RMID_YOUNG = 0,
+ RMID_AVAILABLE,
+ RMID_DIRTY,
+};
+
+struct cqm_rmid_entry {
+ unsigned int rmid;
+ enum rmid_recycle_state state;
+ struct list_head list;
+ unsigned long queue_time;
+};
+
+/*
+ * cqm_rmid_free_lru - A least recently used list of RMIDs.
+ *
+ * Oldest entry at the head, newest (most recently used) entry at the
+ * tail. This list is never traversed, it's only used to keep track of
+ * the lru order. That is, we only pick entries of the head or insert
+ * them on the tail.
+ *
+ * All entries on the list are 'free', and their RMIDs are not currently
+ * in use. To mark an RMID as in use, remove its entry from the lru
+ * list.
+ *
+ *
+ * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
+ *
+ * This list is contains RMIDs that no one is currently using but that
+ * may have a non-zero occupancy value associated with them. The
+ * rotation worker moves RMIDs from the limbo list to the free list once
+ * the occupancy value drops below __intel_cqm_threshold.
+ *
+ * Both lists are protected by cache_mutex.
+ */
+static LIST_HEAD(cqm_rmid_free_lru);
+static LIST_HEAD(cqm_rmid_limbo_lru);
+
+/*
+ * We use a simple array of pointers so that we can lookup a struct
+ * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
+ * and __put_rmid() from having to worry about dealing with struct
+ * cqm_rmid_entry - they just deal with rmids, i.e. integers.
+ *
+ * Once this array is initialized it is read-only. No locks are required
+ * to access it.
+ *
+ * All entries for all RMIDs can be looked up in the this array at all
+ * times.
+ */
+static struct cqm_rmid_entry **cqm_rmid_ptrs;
+
+static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
+{
+ struct cqm_rmid_entry *entry;
+
+ entry = cqm_rmid_ptrs[rmid];
+ WARN_ON(entry->rmid != rmid);
+
+ return entry;
+}
+
+/*
+ * Returns < 0 on fail.
+ *
+ * We expect to be called with cache_mutex held.
+ */
+static int __get_rmid(void)
+{
+ struct cqm_rmid_entry *entry;
+
+ lockdep_assert_held(&cache_mutex);
+
+ if (list_empty(&cqm_rmid_free_lru))
+ return INVALID_RMID;
+
+ entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
+ list_del(&entry->list);
+
+ return entry->rmid;
+}
+
+static void __put_rmid(unsigned int rmid)
+{
+ struct cqm_rmid_entry *entry;
+
+ lockdep_assert_held(&cache_mutex);
+
+ WARN_ON(!__rmid_valid(rmid));
+ entry = __rmid_entry(rmid);
+
+ entry->queue_time = jiffies;
+ entry->state = RMID_YOUNG;
+
+ list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
+}
+
+static int intel_cqm_setup_rmid_cache(void)
+{
+ struct cqm_rmid_entry *entry;
+ unsigned int nr_rmids;
+ int r = 0;
+
+ nr_rmids = cqm_max_rmid + 1;
+ cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
+ nr_rmids, GFP_KERNEL);
+ if (!cqm_rmid_ptrs)
+ return -ENOMEM;
+
+ for (; r <= cqm_max_rmid; r++) {
+ struct cqm_rmid_entry *entry;
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ goto fail;
+
+ INIT_LIST_HEAD(&entry->list);
+ entry->rmid = r;
+ cqm_rmid_ptrs[r] = entry;
+
+ list_add_tail(&entry->list, &cqm_rmid_free_lru);
+ }
+
+ /*
+ * RMID 0 is special and is always allocated. It's used for all
+ * tasks that are not monitored.
+ */
+ entry = __rmid_entry(0);
+ list_del(&entry->list);
+
+ mutex_lock(&cache_mutex);
+ intel_cqm_rotation_rmid = __get_rmid();
+ mutex_unlock(&cache_mutex);
+
+ return 0;
+fail:
+ while (r--)
+ kfree(cqm_rmid_ptrs[r]);
+
+ kfree(cqm_rmid_ptrs);
+ return -ENOMEM;
+}
+
+/*
+ * Determine if @a and @b measure the same set of tasks.
+ *
+ * If @a and @b measure the same set of tasks then we want to share a
+ * single RMID.
+ */
+static bool __match_event(struct perf_event *a, struct perf_event *b)
+{
+ /* Per-cpu and task events don't mix */
+ if ((a->attach_state & PERF_ATTACH_TASK) !=
+ (b->attach_state & PERF_ATTACH_TASK))
+ return false;
+
+#ifdef CONFIG_CGROUP_PERF
+ if (a->cgrp != b->cgrp)
+ return false;
+#endif
+
+ /* If not task event, we're machine wide */
+ if (!(b->attach_state & PERF_ATTACH_TASK))
+ return true;
+
+ /*
+ * Events that target same task are placed into the same cache group.
+ */
+ if (a->hw.target == b->hw.target)
+ return true;
+
+ /*
+ * Are we an inherited event?
+ */
+ if (b->parent == a)
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_CGROUP_PERF
+static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
+{
+ if (event->attach_state & PERF_ATTACH_TASK)
+ return perf_cgroup_from_task(event->hw.target);
+
+ return event->cgrp;
+}
+#endif
+
+/*
+ * Determine if @a's tasks intersect with @b's tasks
+ *
+ * There are combinations of events that we explicitly prohibit,
+ *
+ * PROHIBITS
+ * system-wide -> cgroup and task
+ * cgroup -> system-wide
+ * -> task in cgroup
+ * task -> system-wide
+ * -> task in cgroup
+ *
+ * Call this function before allocating an RMID.
+ */
+static bool __conflict_event(struct perf_event *a, struct perf_event *b)
+{
+#ifdef CONFIG_CGROUP_PERF
+ /*
+ * We can have any number of cgroups but only one system-wide
+ * event at a time.
+ */
+ if (a->cgrp && b->cgrp) {
+ struct perf_cgroup *ac = a->cgrp;
+ struct perf_cgroup *bc = b->cgrp;
+
+ /*
+ * This condition should have been caught in
+ * __match_event() and we should be sharing an RMID.
+ */
+ WARN_ON_ONCE(ac == bc);
+
+ if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+ cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+ return true;
+
+ return false;
+ }
+
+ if (a->cgrp || b->cgrp) {
+ struct perf_cgroup *ac, *bc;
+
+ /*
+ * cgroup and system-wide events are mutually exclusive
+ */
+ if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
+ (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
+ return true;
+
+ /*
+ * Ensure neither event is part of the other's cgroup
+ */
+ ac = event_to_cgroup(a);
+ bc = event_to_cgroup(b);
+ if (ac == bc)
+ return true;
+
+ /*
+ * Must have cgroup and non-intersecting task events.
+ */
+ if (!ac || !bc)
+ return false;
+
+ /*
+ * We have cgroup and task events, and the task belongs
+ * to a cgroup. Check for for overlap.
+ */
+ if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+ cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+ return true;
+
+ return false;
+ }
+#endif
+ /*
+ * If one of them is not a task, same story as above with cgroups.
+ */
+ if (!(a->attach_state & PERF_ATTACH_TASK) ||
+ !(b->attach_state & PERF_ATTACH_TASK))
+ return true;
+
+ /*
+ * Must be non-overlapping.
+ */
+ return false;
+}
+
+struct rmid_read {
+ unsigned int rmid;
+ atomic64_t value;
+};
+
+static void __intel_cqm_event_count(void *info);
+
+/*
+ * Exchange the RMID of a group of events.
+ */
+static unsigned int
+intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid)
+{
+ struct perf_event *event;
+ unsigned int old_rmid = group->hw.cqm_rmid;
+ struct list_head *head = &group->hw.cqm_group_entry;
+
+ lockdep_assert_held(&cache_mutex);
+
+ /*
+ * If our RMID is being deallocated, perform a read now.
+ */
+ if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
+ struct rmid_read rr = {
+ .value = ATOMIC64_INIT(0),
+ .rmid = old_rmid,
+ };
+
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
+ &rr, 1);
+ local64_set(&group->count, atomic64_read(&rr.value));
+ }
+
+ raw_spin_lock_irq(&cache_lock);
+
+ group->hw.cqm_rmid = rmid;
+ list_for_each_entry(event, head, hw.cqm_group_entry)
+ event->hw.cqm_rmid = rmid;
+
+ raw_spin_unlock_irq(&cache_lock);
+
+ return old_rmid;
+}
+
+/*
+ * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
+ * cachelines are still tagged with RMIDs in limbo, we progressively
+ * increment the threshold until we find an RMID in limbo with <=
+ * __intel_cqm_threshold lines tagged. This is designed to mitigate the
+ * problem where cachelines tagged with an RMID are not steadily being
+ * evicted.
+ *
+ * On successful rotations we decrease the threshold back towards zero.
+ *
+ * __intel_cqm_max_threshold provides an upper bound on the threshold,
+ * and is measured in bytes because it's exposed to userland.
+ */
+static unsigned int __intel_cqm_threshold;
+static unsigned int __intel_cqm_max_threshold;
+
+/*
+ * Test whether an RMID has a zero occupancy value on this cpu.
+ */
+static void intel_cqm_stable(void *arg)
+{
+ struct cqm_rmid_entry *entry;
+
+ list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+ if (entry->state != RMID_AVAILABLE)
+ break;
+
+ if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
+ entry->state = RMID_DIRTY;
+ }
+}
+
+/*
+ * If we have group events waiting for an RMID that don't conflict with
+ * events already running, assign @rmid.
+ */
+static bool intel_cqm_sched_in_event(unsigned int rmid)
+{
+ struct perf_event *leader, *event;
+
+ lockdep_assert_held(&cache_mutex);
+
+ leader = list_first_entry(&cache_groups, struct perf_event,
+ hw.cqm_groups_entry);
+ event = leader;
+
+ list_for_each_entry_continue(event, &cache_groups,
+ hw.cqm_groups_entry) {
+ if (__rmid_valid(event->hw.cqm_rmid))
+ continue;
+
+ if (__conflict_event(event, leader))
+ continue;
+
+ intel_cqm_xchg_rmid(event, rmid);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Initially use this constant for both the limbo queue time and the
+ * rotation timer interval, pmu::hrtimer_interval_ms.
+ *
+ * They don't need to be the same, but the two are related since if you
+ * rotate faster than you recycle RMIDs, you may run out of available
+ * RMIDs.
+ */
+#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */
+
+static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
+
+/*
+ * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
+ * @nr_available: number of freeable RMIDs on the limbo list
+ *
+ * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
+ * cachelines are tagged with those RMIDs. After this we can reuse them
+ * and know that the current set of active RMIDs is stable.
+ *
+ * Return %true or %false depending on whether stabilization needs to be
+ * reattempted.
+ *
+ * If we return %true then @nr_available is updated to indicate the
+ * number of RMIDs on the limbo list that have been queued for the
+ * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
+ * are above __intel_cqm_threshold.
+ */
+static bool intel_cqm_rmid_stabilize(unsigned int *available)
+{
+ struct cqm_rmid_entry *entry, *tmp;
+
+ lockdep_assert_held(&cache_mutex);
+
+ *available = 0;
+ list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+ unsigned long min_queue_time;
+ unsigned long now = jiffies;
+
+ /*
+ * We hold RMIDs placed into limbo for a minimum queue
+ * time. Before the minimum queue time has elapsed we do
+ * not recycle RMIDs.
+ *
+ * The reasoning is that until a sufficient time has
+ * passed since we stopped using an RMID, any RMID
+ * placed onto the limbo list will likely still have
+ * data tagged in the cache, which means we'll probably
+ * fail to recycle it anyway.
+ *
+ * We can save ourselves an expensive IPI by skipping
+ * any RMIDs that have not been queued for the minimum
+ * time.
+ */
+ min_queue_time = entry->queue_time +
+ msecs_to_jiffies(__rmid_queue_time_ms);
+
+ if (time_after(min_queue_time, now))
+ break;
+
+ entry->state = RMID_AVAILABLE;
+ (*available)++;
+ }
+
+ /*
+ * Fast return if none of the RMIDs on the limbo list have been
+ * sitting on the queue for the minimum queue time.
+ */
+ if (!*available)
+ return false;
+
+ /*
+ * Test whether an RMID is free for each package.
+ */
+ on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
+
+ list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
+ /*
+ * Exhausted all RMIDs that have waited min queue time.
+ */
+ if (entry->state == RMID_YOUNG)
+ break;
+
+ if (entry->state == RMID_DIRTY)
+ continue;
+
+ list_del(&entry->list); /* remove from limbo */
+
+ /*
+ * The rotation RMID gets priority if it's
+ * currently invalid. In which case, skip adding
+ * the RMID to the the free lru.
+ */
+ if (!__rmid_valid(intel_cqm_rotation_rmid)) {
+ intel_cqm_rotation_rmid = entry->rmid;
+ continue;
+ }
+
+ /*
+ * If we have groups waiting for RMIDs, hand
+ * them one now provided they don't conflict.
+ */
+ if (intel_cqm_sched_in_event(entry->rmid))
+ continue;
+
+ /*
+ * Otherwise place it onto the free list.
+ */
+ list_add_tail(&entry->list, &cqm_rmid_free_lru);
+ }
+
+
+ return __rmid_valid(intel_cqm_rotation_rmid);
+}
+
+/*
+ * Pick a victim group and move it to the tail of the group list.
+ * @next: The first group without an RMID
+ */
+static void __intel_cqm_pick_and_rotate(struct perf_event *next)
+{
+ struct perf_event *rotor;
+ unsigned int rmid;
+
+ lockdep_assert_held(&cache_mutex);
+
+ rotor = list_first_entry(&cache_groups, struct perf_event,
+ hw.cqm_groups_entry);
+
+ /*
+ * The group at the front of the list should always have a valid
+ * RMID. If it doesn't then no groups have RMIDs assigned and we
+ * don't need to rotate the list.
+ */
+ if (next == rotor)
+ return;
+
+ rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
+ __put_rmid(rmid);
+
+ list_rotate_left(&cache_groups);
+}
+
+/*
+ * Deallocate the RMIDs from any events that conflict with @event, and
+ * place them on the back of the group list.
+ */
+static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
+{
+ struct perf_event *group, *g;
+ unsigned int rmid;
+
+ lockdep_assert_held(&cache_mutex);
+
+ list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
+ if (group == event)
+ continue;
+
+ rmid = group->hw.cqm_rmid;
+
+ /*
+ * Skip events that don't have a valid RMID.
+ */
+ if (!__rmid_valid(rmid))
+ continue;
+
+ /*
+ * No conflict? No problem! Leave the event alone.
+ */
+ if (!__conflict_event(group, event))
+ continue;
+
+ intel_cqm_xchg_rmid(group, INVALID_RMID);
+ __put_rmid(rmid);
+ }
+}
+
+/*
+ * Attempt to rotate the groups and assign new RMIDs.
+ *
+ * We rotate for two reasons,
+ * 1. To handle the scheduling of conflicting events
+ * 2. To recycle RMIDs
+ *
+ * Rotating RMIDs is complicated because the hardware doesn't give us
+ * any clues.
+ *
+ * There's problems with the hardware interface; when you change the
+ * task:RMID map cachelines retain their 'old' tags, giving a skewed
+ * picture. In order to work around this, we must always keep one free
+ * RMID - intel_cqm_rotation_rmid.
+ *
+ * Rotation works by taking away an RMID from a group (the old RMID),
+ * and assigning the free RMID to another group (the new RMID). We must
+ * then wait for the old RMID to not be used (no cachelines tagged).
+ * This ensure that all cachelines are tagged with 'active' RMIDs. At
+ * this point we can start reading values for the new RMID and treat the
+ * old RMID as the free RMID for the next rotation.
+ *
+ * Return %true or %false depending on whether we did any rotating.
+ */
+static bool __intel_cqm_rmid_rotate(void)
+{
+ struct perf_event *group, *start = NULL;
+ unsigned int threshold_limit;
+ unsigned int nr_needed = 0;
+ unsigned int nr_available;
+ bool rotated = false;
+
+ mutex_lock(&cache_mutex);
+
+again:
+ /*
+ * Fast path through this function if there are no groups and no
+ * RMIDs that need cleaning.
+ */
+ if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
+ goto out;
+
+ list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
+ if (!__rmid_valid(group->hw.cqm_rmid)) {
+ if (!start)
+ start = group;
+ nr_needed++;
+ }
+ }
+
+ /*
+ * We have some event groups, but they all have RMIDs assigned
+ * and no RMIDs need cleaning.
+ */
+ if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
+ goto out;
+
+ if (!nr_needed)
+ goto stabilize;
+
+ /*
+ * We have more event groups without RMIDs than available RMIDs,
+ * or we have event groups that conflict with the ones currently
+ * scheduled.
+ *
+ * We force deallocate the rmid of the group at the head of
+ * cache_groups. The first event group without an RMID then gets
+ * assigned intel_cqm_rotation_rmid. This ensures we always make
+ * forward progress.
+ *
+ * Rotate the cache_groups list so the previous head is now the
+ * tail.
+ */
+ __intel_cqm_pick_and_rotate(start);
+
+ /*
+ * If the rotation is going to succeed, reduce the threshold so
+ * that we don't needlessly reuse dirty RMIDs.
+ */
+ if (__rmid_valid(intel_cqm_rotation_rmid)) {
+ intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
+ intel_cqm_rotation_rmid = __get_rmid();
+
+ intel_cqm_sched_out_conflicting_events(start);
+
+ if (__intel_cqm_threshold)
+ __intel_cqm_threshold--;
+ }
+
+ rotated = true;
+
+stabilize:
+ /*
+ * We now need to stablize the RMID we freed above (if any) to
+ * ensure that the next time we rotate we have an RMID with zero
+ * occupancy value.
+ *
+ * Alternatively, if we didn't need to perform any rotation,
+ * we'll have a bunch of RMIDs in limbo that need stabilizing.
+ */
+ threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
+
+ while (intel_cqm_rmid_stabilize(&nr_available) &&
+ __intel_cqm_threshold < threshold_limit) {
+ unsigned int steal_limit;
+
+ /*
+ * Don't spin if nobody is actively waiting for an RMID,
+ * the rotation worker will be kicked as soon as an
+ * event needs an RMID anyway.
+ */
+ if (!nr_needed)
+ break;
+
+ /* Allow max 25% of RMIDs to be in limbo. */
+ steal_limit = (cqm_max_rmid + 1) / 4;
+
+ /*
+ * We failed to stabilize any RMIDs so our rotation
+ * logic is now stuck. In order to make forward progress
+ * we have a few options:
+ *
+ * 1. rotate ("steal") another RMID
+ * 2. increase the threshold
+ * 3. do nothing
+ *
+ * We do both of 1. and 2. until we hit the steal limit.
+ *
+ * The steal limit prevents all RMIDs ending up on the
+ * limbo list. This can happen if every RMID has a
+ * non-zero occupancy above threshold_limit, and the
+ * occupancy values aren't dropping fast enough.
+ *
+ * Note that there is prioritisation at work here - we'd
+ * rather increase the number of RMIDs on the limbo list
+ * than increase the threshold, because increasing the
+ * threshold skews the event data (because we reuse
+ * dirty RMIDs) - threshold bumps are a last resort.
+ */
+ if (nr_available < steal_limit)
+ goto again;
+
+ __intel_cqm_threshold++;
+ }
+
+out:
+ mutex_unlock(&cache_mutex);
+ return rotated;
+}
+
+static void intel_cqm_rmid_rotate(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
+
+static struct pmu intel_cqm_pmu;
+
+static void intel_cqm_rmid_rotate(struct work_struct *work)
+{
+ unsigned long delay;
+
+ __intel_cqm_rmid_rotate();
+
+ delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
+ schedule_delayed_work(&intel_cqm_rmid_work, delay);
+}
+
+/*
+ * Find a group and setup RMID.
+ *
+ * If we're part of a group, we use the group's RMID.
+ */
+static void intel_cqm_setup_event(struct perf_event *event,
+ struct perf_event **group)
+{
+ struct perf_event *iter;
+ unsigned int rmid;
+ bool conflict = false;
+
+ list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
+ rmid = iter->hw.cqm_rmid;
+
+ if (__match_event(iter, event)) {
+ /* All tasks in a group share an RMID */
+ event->hw.cqm_rmid = rmid;
+ *group = iter;
+ return;
+ }
+
+ /*
+ * We only care about conflicts for events that are
+ * actually scheduled in (and hence have a valid RMID).
+ */
+ if (__conflict_event(iter, event) && __rmid_valid(rmid))
+ conflict = true;
+ }
+
+ if (conflict)
+ rmid = INVALID_RMID;
+ else
+ rmid = __get_rmid();
+
+ event->hw.cqm_rmid = rmid;
+}
+
+static void intel_cqm_event_read(struct perf_event *event)
+{
+ unsigned long flags;
+ unsigned int rmid;
+ u64 val;
+
+ /*
+ * Task events are handled by intel_cqm_event_count().
+ */
+ if (event->cpu == -1)
+ return;
+
+ raw_spin_lock_irqsave(&cache_lock, flags);
+ rmid = event->hw.cqm_rmid;
+
+ if (!__rmid_valid(rmid))
+ goto out;
+
+ val = __rmid_read(rmid);
+
+ /*
+ * Ignore this reading on error states and do not update the value.
+ */
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ goto out;
+
+ local64_set(&event->count, val);
+out:
+ raw_spin_unlock_irqrestore(&cache_lock, flags);
+}
+
+static void __intel_cqm_event_count(void *info)
+{
+ struct rmid_read *rr = info;
+ u64 val;
+
+ val = __rmid_read(rr->rmid);
+
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+
+ atomic64_add(val, &rr->value);
+}
+
+static inline bool cqm_group_leader(struct perf_event *event)
+{
+ return !list_empty(&event->hw.cqm_groups_entry);
+}
+
+static u64 intel_cqm_event_count(struct perf_event *event)
+{
+ unsigned long flags;
+ struct rmid_read rr = {
+ .value = ATOMIC64_INIT(0),
+ };
+
+ /*
+ * We only need to worry about task events. System-wide events
+ * are handled like usual, i.e. entirely with
+ * intel_cqm_event_read().
+ */
+ if (event->cpu != -1)
+ return __perf_event_count(event);
+
+ /*
+ * Only the group leader gets to report values. This stops us
+ * reporting duplicate values to userspace, and gives us a clear
+ * rule for which task gets to report the values.
+ *
+ * Note that it is impossible to attribute these values to
+ * specific packages - we forfeit that ability when we create
+ * task events.
+ */
+ if (!cqm_group_leader(event))
+ return 0;
+
+ /*
+ * Notice that we don't perform the reading of an RMID
+ * atomically, because we can't hold a spin lock across the
+ * IPIs.
+ *
+ * Speculatively perform the read, since @event might be
+ * assigned a different (possibly invalid) RMID while we're
+ * busying performing the IPI calls. It's therefore necessary to
+ * check @event's RMID afterwards, and if it has changed,
+ * discard the result of the read.
+ */
+ rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
+
+ if (!__rmid_valid(rr.rmid))
+ goto out;
+
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+
+ raw_spin_lock_irqsave(&cache_lock, flags);
+ if (event->hw.cqm_rmid == rr.rmid)
+ local64_set(&event->count, atomic64_read(&rr.value));
+ raw_spin_unlock_irqrestore(&cache_lock, flags);
+out:
+ return __perf_event_count(event);
+}
+
+static void intel_cqm_event_start(struct perf_event *event, int mode)
+{
+ struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
+ unsigned int rmid = event->hw.cqm_rmid;
+ unsigned long flags;
+
+ if (!(event->hw.cqm_state & PERF_HES_STOPPED))
+ return;
+
+ event->hw.cqm_state &= ~PERF_HES_STOPPED;
+
+ raw_spin_lock_irqsave(&state->lock, flags);
+
+ if (state->cnt++)
+ WARN_ON_ONCE(state->rmid != rmid);
+ else
+ WARN_ON_ONCE(state->rmid);
+
+ state->rmid = rmid;
+ wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);
+
+ raw_spin_unlock_irqrestore(&state->lock, flags);
+}
+
+static void intel_cqm_event_stop(struct perf_event *event, int mode)
+{
+ struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
+ unsigned long flags;
+
+ if (event->hw.cqm_state & PERF_HES_STOPPED)
+ return;
+
+ event->hw.cqm_state |= PERF_HES_STOPPED;
+
+ raw_spin_lock_irqsave(&state->lock, flags);
+ intel_cqm_event_read(event);
+
+ if (!--state->cnt) {
+ state->rmid = 0;
+ wrmsrl(MSR_IA32_PQR_ASSOC, 0);
+ } else {
+ WARN_ON_ONCE(!state->rmid);
+ }
+
+ raw_spin_unlock_irqrestore(&state->lock, flags);
+}
+
+static int intel_cqm_event_add(struct perf_event *event, int mode)
+{
+ unsigned long flags;
+ unsigned int rmid;
+
+ raw_spin_lock_irqsave(&cache_lock, flags);
+
+ event->hw.cqm_state = PERF_HES_STOPPED;
+ rmid = event->hw.cqm_rmid;
+
+ if (__rmid_valid(rmid) && (mode & PERF_EF_START))
+ intel_cqm_event_start(event, mode);
+
+ raw_spin_unlock_irqrestore(&cache_lock, flags);
+
+ return 0;
+}
+
+static void intel_cqm_event_del(struct perf_event *event, int mode)
+{
+ intel_cqm_event_stop(event, mode);
+}
+
+static void intel_cqm_event_destroy(struct perf_event *event)
+{
+ struct perf_event *group_other = NULL;
+
+ mutex_lock(&cache_mutex);
+
+ /*
+ * If there's another event in this group...
+ */
+ if (!list_empty(&event->hw.cqm_group_entry)) {
+ group_other = list_first_entry(&event->hw.cqm_group_entry,
+ struct perf_event,
+ hw.cqm_group_entry);
+ list_del(&event->hw.cqm_group_entry);
+ }
+
+ /*
+ * And we're the group leader..
+ */
+ if (cqm_group_leader(event)) {
+ /*
+ * If there was a group_other, make that leader, otherwise
+ * destroy the group and return the RMID.
+ */
+ if (group_other) {
+ list_replace(&event->hw.cqm_groups_entry,
+ &group_other->hw.cqm_groups_entry);
+ } else {
+ unsigned int rmid = event->hw.cqm_rmid;
+
+ if (__rmid_valid(rmid))
+ __put_rmid(rmid);
+ list_del(&event->hw.cqm_groups_entry);
+ }
+ }
+
+ mutex_unlock(&cache_mutex);
+}
+
+static int intel_cqm_event_init(struct perf_event *event)
+{
+ struct perf_event *group = NULL;
+ bool rotate = false;
+
+ if (event->attr.type != intel_cqm_pmu.type)
+ return -ENOENT;
+
+ if (event->attr.config & ~QOS_EVENT_MASK)
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ INIT_LIST_HEAD(&event->hw.cqm_group_entry);
+ INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
+
+ event->destroy = intel_cqm_event_destroy;
+
+ mutex_lock(&cache_mutex);
+
+ /* Will also set rmid */
+ intel_cqm_setup_event(event, &group);
+
+ if (group) {
+ list_add_tail(&event->hw.cqm_group_entry,
+ &group->hw.cqm_group_entry);
+ } else {
+ list_add_tail(&event->hw.cqm_groups_entry,
+ &cache_groups);
+
+ /*
+ * All RMIDs are either in use or have recently been
+ * used. Kick the rotation worker to clean/free some.
+ *
+ * We only do this for the group leader, rather than for
+ * every event in a group to save on needless work.
+ */
+ if (!__rmid_valid(event->hw.cqm_rmid))
+ rotate = true;
+ }
+
+ mutex_unlock(&cache_mutex);
+
+ if (rotate)
+ schedule_delayed_work(&intel_cqm_rmid_work, 0);
+
+ return 0;
+}
+
+EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
+EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
+EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
+EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
+EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
+
+static struct attribute *intel_cqm_events_attr[] = {
+ EVENT_PTR(intel_cqm_llc),
+ EVENT_PTR(intel_cqm_llc_pkg),
+ EVENT_PTR(intel_cqm_llc_unit),
+ EVENT_PTR(intel_cqm_llc_scale),
+ EVENT_PTR(intel_cqm_llc_snapshot),
+ NULL,
+};
+
+static struct attribute_group intel_cqm_events_group = {
+ .name = "events",
+ .attrs = intel_cqm_events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *intel_cqm_formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group intel_cqm_format_group = {
+ .name = "format",
+ .attrs = intel_cqm_formats_attr,
+};
+
+static ssize_t
+max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ ssize_t rv;
+
+ mutex_lock(&cache_mutex);
+ rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
+ mutex_unlock(&cache_mutex);
+
+ return rv;
+}
+
+static ssize_t
+max_recycle_threshold_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned int bytes, cachelines;
+ int ret;
+
+ ret = kstrtouint(buf, 0, &bytes);
+ if (ret)
+ return ret;
+
+ mutex_lock(&cache_mutex);
+
+ __intel_cqm_max_threshold = bytes;
+ cachelines = bytes / cqm_l3_scale;
+
+ /*
+ * The new maximum takes effect immediately.
+ */
+ if (__intel_cqm_threshold > cachelines)
+ __intel_cqm_threshold = cachelines;
+
+ mutex_unlock(&cache_mutex);
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(max_recycle_threshold);
+
+static struct attribute *intel_cqm_attrs[] = {
+ &dev_attr_max_recycle_threshold.attr,
+ NULL,
+};
+
+static const struct attribute_group intel_cqm_group = {
+ .attrs = intel_cqm_attrs,
+};
+
+static const struct attribute_group *intel_cqm_attr_groups[] = {
+ &intel_cqm_events_group,
+ &intel_cqm_format_group,
+ &intel_cqm_group,
+ NULL,
+};
+
+static struct pmu intel_cqm_pmu = {
+ .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
+ .attr_groups = intel_cqm_attr_groups,
+ .task_ctx_nr = perf_sw_context,
+ .event_init = intel_cqm_event_init,
+ .add = intel_cqm_event_add,
+ .del = intel_cqm_event_del,
+ .start = intel_cqm_event_start,
+ .stop = intel_cqm_event_stop,
+ .read = intel_cqm_event_read,
+ .count = intel_cqm_event_count,
+};
+
+static inline void cqm_pick_event_reader(int cpu)
+{
+ int phys_id = topology_physical_package_id(cpu);
+ int i;
+
+ for_each_cpu(i, &cqm_cpumask) {
+ if (phys_id == topology_physical_package_id(i))
+ return; /* already got reader for this socket */
+ }
+
+ cpumask_set_cpu(cpu, &cqm_cpumask);
+}
+
+static void intel_cqm_cpu_prepare(unsigned int cpu)
+{
+ struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ raw_spin_lock_init(&state->lock);
+ state->rmid = 0;
+ state->cnt = 0;
+
+ WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
+ WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
+}
+
+static void intel_cqm_cpu_exit(unsigned int cpu)
+{
+ int phys_id = topology_physical_package_id(cpu);
+ int i;
+
+ /*
+ * Is @cpu a designated cqm reader?
+ */
+ if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
+ return;
+
+ for_each_online_cpu(i) {
+ if (i == cpu)
+ continue;
+
+ if (phys_id == topology_physical_package_id(i)) {
+ cpumask_set_cpu(i, &cqm_cpumask);
+ break;
+ }
+ }
+}
+
+static int intel_cqm_cpu_notifier(struct notifier_block *nb,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ intel_cqm_cpu_prepare(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ intel_cqm_cpu_exit(cpu);
+ break;
+ case CPU_STARTING:
+ cqm_pick_event_reader(cpu);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static const struct x86_cpu_id intel_cqm_match[] = {
+ { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
+ {}
+};
+
+static int __init intel_cqm_init(void)
+{
+ char *str, scale[20];
+ int i, cpu, ret;
+
+ if (!x86_match_cpu(intel_cqm_match))
+ return -ENODEV;
+
+ cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+
+ /*
+ * It's possible that not all resources support the same number
+ * of RMIDs. Instead of making scheduling much more complicated
+ * (where we have to match a task's RMID to a cpu that supports
+ * that many RMIDs) just find the minimum RMIDs supported across
+ * all cpus.
+ *
+ * Also, check that the scales match on all cpus.
+ */
+ cpu_notifier_register_begin();
+
+ for_each_online_cpu(cpu) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->x86_cache_max_rmid < cqm_max_rmid)
+ cqm_max_rmid = c->x86_cache_max_rmid;
+
+ if (c->x86_cache_occ_scale != cqm_l3_scale) {
+ pr_err("Multiple LLC scale values, disabling\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /*
+ * A reasonable upper limit on the max threshold is the number
+ * of lines tagged per RMID if all RMIDs have the same number of
+ * lines tagged in the LLC.
+ *
+ * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+ */
+ __intel_cqm_max_threshold =
+ boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
+
+ snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
+ str = kstrdup(scale, GFP_KERNEL);
+ if (!str) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ event_attr_intel_cqm_llc_scale.event_str = str;
+
+ ret = intel_cqm_setup_rmid_cache();
+ if (ret)
+ goto out;
+
+ for_each_online_cpu(i) {
+ intel_cqm_cpu_prepare(i);
+ cqm_pick_event_reader(i);
+ }
+
+ __perf_cpu_notifier(intel_cqm_cpu_notifier);
+
+ ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
+ if (ret)
+ pr_err("Intel CQM perf registration failed: %d\n", ret);
+ else
+ pr_info("Intel CQM monitoring enabled\n");
+
+out:
+ cpu_notifier_register_done();
+
+ return ret;
+}
+device_initcall(intel_cqm_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 073983398364..813f75d71175 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -461,7 +461,8 @@ void intel_pmu_enable_bts(u64 config)
debugctlmsr |= DEBUGCTLMSR_TR;
debugctlmsr |= DEBUGCTLMSR_BTS;
- debugctlmsr |= DEBUGCTLMSR_BTINT;
+ if (config & ARCH_PERFMON_EVENTSEL_INT)
+ debugctlmsr |= DEBUGCTLMSR_BTINT;
if (!(config & ARCH_PERFMON_EVENTSEL_OS))
debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
@@ -557,6 +558,8 @@ struct event_constraint intel_core2_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
EVENT_CONSTRAINT_END
};
@@ -564,6 +567,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
EVENT_CONSTRAINT_END
};
@@ -587,6 +592,8 @@ struct event_constraint intel_nehalem_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
EVENT_CONSTRAINT_END
};
@@ -602,6 +609,8 @@ struct event_constraint intel_westmere_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
EVENT_CONSTRAINT_END
};
@@ -611,6 +620,10 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
/* Allow all events as PEBS with no flags */
INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
EVENT_CONSTRAINT_END
@@ -622,6 +635,10 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = {
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
/* Allow all events as PEBS with no flags */
INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
EVENT_CONSTRAINT_END
@@ -633,16 +650,16 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
- INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
- INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
- INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
/* Allow all events as PEBS with no flags */
INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
EVENT_CONSTRAINT_END
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 58f1a94beaf0..94e5b506caa6 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -39,6 +39,7 @@ static enum {
#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
#define LBR_FAR_BIT 8 /* do not capture far branches */
+#define LBR_CALL_STACK_BIT 9 /* enable call stack */
#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
#define LBR_USER (1 << LBR_USER_BIT)
@@ -49,6 +50,7 @@ static enum {
#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
#define LBR_FAR (1 << LBR_FAR_BIT)
+#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
#define LBR_PLM (LBR_KERNEL | LBR_USER)
@@ -69,33 +71,31 @@ static enum {
#define LBR_FROM_FLAG_IN_TX (1ULL << 62)
#define LBR_FROM_FLAG_ABORT (1ULL << 61)
-#define for_each_branch_sample_type(x) \
- for ((x) = PERF_SAMPLE_BRANCH_USER; \
- (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
-
/*
* x86control flow change classification
* x86control flow changes include branches, interrupts, traps, faults
*/
enum {
- X86_BR_NONE = 0, /* unknown */
-
- X86_BR_USER = 1 << 0, /* branch target is user */
- X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
-
- X86_BR_CALL = 1 << 2, /* call */
- X86_BR_RET = 1 << 3, /* return */
- X86_BR_SYSCALL = 1 << 4, /* syscall */
- X86_BR_SYSRET = 1 << 5, /* syscall return */
- X86_BR_INT = 1 << 6, /* sw interrupt */
- X86_BR_IRET = 1 << 7, /* return from interrupt */
- X86_BR_JCC = 1 << 8, /* conditional */
- X86_BR_JMP = 1 << 9, /* jump */
- X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
- X86_BR_IND_CALL = 1 << 11,/* indirect calls */
- X86_BR_ABORT = 1 << 12,/* transaction abort */
- X86_BR_IN_TX = 1 << 13,/* in transaction */
- X86_BR_NO_TX = 1 << 14,/* not in transaction */
+ X86_BR_NONE = 0, /* unknown */
+
+ X86_BR_USER = 1 << 0, /* branch target is user */
+ X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
+
+ X86_BR_CALL = 1 << 2, /* call */
+ X86_BR_RET = 1 << 3, /* return */
+ X86_BR_SYSCALL = 1 << 4, /* syscall */
+ X86_BR_SYSRET = 1 << 5, /* syscall return */
+ X86_BR_INT = 1 << 6, /* sw interrupt */
+ X86_BR_IRET = 1 << 7, /* return from interrupt */
+ X86_BR_JCC = 1 << 8, /* conditional */
+ X86_BR_JMP = 1 << 9, /* jump */
+ X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
+ X86_BR_IND_CALL = 1 << 11,/* indirect calls */
+ X86_BR_ABORT = 1 << 12,/* transaction abort */
+ X86_BR_IN_TX = 1 << 13,/* in transaction */
+ X86_BR_NO_TX = 1 << 14,/* not in transaction */
+ X86_BR_ZERO_CALL = 1 << 15,/* zero length call */
+ X86_BR_CALL_STACK = 1 << 16,/* call stack */
};
#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -112,13 +112,15 @@ enum {
X86_BR_JMP |\
X86_BR_IRQ |\
X86_BR_ABORT |\
- X86_BR_IND_CALL)
+ X86_BR_IND_CALL |\
+ X86_BR_ZERO_CALL)
#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
#define X86_BR_ANY_CALL \
(X86_BR_CALL |\
X86_BR_IND_CALL |\
+ X86_BR_ZERO_CALL |\
X86_BR_SYSCALL |\
X86_BR_IRQ |\
X86_BR_INT)
@@ -130,17 +132,32 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
* otherwise it becomes near impossible to get a reliable stack.
*/
-static void __intel_pmu_lbr_enable(void)
+static void __intel_pmu_lbr_enable(bool pmi)
{
- u64 debugctl;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 debugctl, lbr_select = 0, orig_debugctl;
- if (cpuc->lbr_sel)
- wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
+ /*
+ * No need to reprogram LBR_SELECT in a PMI, as it
+ * did not change.
+ */
+ if (cpuc->lbr_sel && !pmi) {
+ lbr_select = cpuc->lbr_sel->config;
+ wrmsrl(MSR_LBR_SELECT, lbr_select);
+ }
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
- debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ orig_debugctl = debugctl;
+ debugctl |= DEBUGCTLMSR_LBR;
+ /*
+ * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
+ * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
+ * may cause superfluous increase/decrease of LBR_TOS.
+ */
+ if (!(lbr_select & LBR_CALL_STACK))
+ debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+ if (orig_debugctl != debugctl)
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
}
static void __intel_pmu_lbr_disable(void)
@@ -181,9 +198,116 @@ void intel_pmu_lbr_reset(void)
intel_pmu_lbr_reset_64();
}
+/*
+ * TOS = most recently recorded branch
+ */
+static inline u64 intel_pmu_lbr_tos(void)
+{
+ u64 tos;
+
+ rdmsrl(x86_pmu.lbr_tos, tos);
+ return tos;
+}
+
+enum {
+ LBR_NONE,
+ LBR_VALID,
+};
+
+static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
+{
+ int i;
+ unsigned lbr_idx, mask;
+ u64 tos;
+
+ if (task_ctx->lbr_callstack_users == 0 ||
+ task_ctx->lbr_stack_state == LBR_NONE) {
+ intel_pmu_lbr_reset();
+ return;
+ }
+
+ mask = x86_pmu.lbr_nr - 1;
+ tos = intel_pmu_lbr_tos();
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ lbr_idx = (tos - i) & mask;
+ wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+ wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ }
+ task_ctx->lbr_stack_state = LBR_NONE;
+}
+
+static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
+{
+ int i;
+ unsigned lbr_idx, mask;
+ u64 tos;
+
+ if (task_ctx->lbr_callstack_users == 0) {
+ task_ctx->lbr_stack_state = LBR_NONE;
+ return;
+ }
+
+ mask = x86_pmu.lbr_nr - 1;
+ tos = intel_pmu_lbr_tos();
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ lbr_idx = (tos - i) & mask;
+ rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+ rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ }
+ task_ctx->lbr_stack_state = LBR_VALID;
+}
+
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_task_context *task_ctx;
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ /*
+ * If LBR callstack feature is enabled and the stack was saved when
+ * the task was scheduled out, restore the stack. Otherwise flush
+ * the LBR stack.
+ */
+ task_ctx = ctx ? ctx->task_ctx_data : NULL;
+ if (task_ctx) {
+ if (sched_in) {
+ __intel_pmu_lbr_restore(task_ctx);
+ cpuc->lbr_context = ctx;
+ } else {
+ __intel_pmu_lbr_save(task_ctx);
+ }
+ return;
+ }
+
+ /*
+ * When sampling the branck stack in system-wide, it may be
+ * necessary to flush the stack on context switch. This happens
+ * when the branch stack does not tag its entries with the pid
+ * of the current task. Otherwise it becomes impossible to
+ * associate a branch entry with a task. This ambiguity is more
+ * likely to appear when the branch stack supports priv level
+ * filtering and the user sets it to monitor only at the user
+ * level (which could be a useful measurement in system-wide
+ * mode). In that case, the risk is high of having a branch
+ * stack with branch from multiple tasks.
+ */
+ if (sched_in) {
+ intel_pmu_lbr_reset();
+ cpuc->lbr_context = ctx;
+ }
+}
+
+static inline bool branch_user_callstack(unsigned br_sel)
+{
+ return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
+}
+
void intel_pmu_lbr_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_task_context *task_ctx;
if (!x86_pmu.lbr_nr)
return;
@@ -198,18 +322,33 @@ void intel_pmu_lbr_enable(struct perf_event *event)
}
cpuc->br_sel = event->hw.branch_reg.reg;
+ if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+ event->ctx->task_ctx_data) {
+ task_ctx = event->ctx->task_ctx_data;
+ task_ctx->lbr_callstack_users++;
+ }
+
cpuc->lbr_users++;
+ perf_sched_cb_inc(event->ctx->pmu);
}
void intel_pmu_lbr_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_task_context *task_ctx;
if (!x86_pmu.lbr_nr)
return;
+ if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+ event->ctx->task_ctx_data) {
+ task_ctx = event->ctx->task_ctx_data;
+ task_ctx->lbr_callstack_users--;
+ }
+
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
+ perf_sched_cb_dec(event->ctx->pmu);
if (cpuc->enabled && !cpuc->lbr_users) {
__intel_pmu_lbr_disable();
@@ -218,12 +357,12 @@ void intel_pmu_lbr_disable(struct perf_event *event)
}
}
-void intel_pmu_lbr_enable_all(void)
+void intel_pmu_lbr_enable_all(bool pmi)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (cpuc->lbr_users)
- __intel_pmu_lbr_enable();
+ __intel_pmu_lbr_enable(pmi);
}
void intel_pmu_lbr_disable_all(void)
@@ -234,18 +373,6 @@ void intel_pmu_lbr_disable_all(void)
__intel_pmu_lbr_disable();
}
-/*
- * TOS = most recently recorded branch
- */
-static inline u64 intel_pmu_lbr_tos(void)
-{
- u64 tos;
-
- rdmsrl(x86_pmu.lbr_tos, tos);
-
- return tos;
-}
-
static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
{
unsigned long mask = x86_pmu.lbr_nr - 1;
@@ -350,7 +477,7 @@ void intel_pmu_lbr_read(void)
* - in case there is no HW filter
* - in case the HW filter has errata or limitations
*/
-static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
+static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
{
u64 br_type = event->attr.branch_sample_type;
int mask = 0;
@@ -387,11 +514,21 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
if (br_type & PERF_SAMPLE_BRANCH_COND)
mask |= X86_BR_JCC;
+ if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
+ if (!x86_pmu_has_lbr_callstack())
+ return -EOPNOTSUPP;
+ if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
+ return -EINVAL;
+ mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
+ X86_BR_CALL_STACK;
+ }
+
/*
* stash actual user request into reg, it may
* be used by fixup code for some CPU
*/
event->hw.branch_reg.reg = mask;
+ return 0;
}
/*
@@ -403,14 +540,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
{
struct hw_perf_event_extra *reg;
u64 br_type = event->attr.branch_sample_type;
- u64 mask = 0, m;
- u64 v;
+ u64 mask = 0, v;
+ int i;
- for_each_branch_sample_type(m) {
- if (!(br_type & m))
+ for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
+ if (!(br_type & (1ULL << i)))
continue;
- v = x86_pmu.lbr_sel_map[m];
+ v = x86_pmu.lbr_sel_map[i];
if (v == LBR_NOT_SUPP)
return -EOPNOTSUPP;
@@ -420,8 +557,12 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
reg = &event->hw.branch_reg;
reg->idx = EXTRA_REG_LBR;
- /* LBR_SELECT operates in suppress mode so invert mask */
- reg->config = ~mask & x86_pmu.lbr_sel_mask;
+ /*
+ * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
+ * in suppress mode. So LBR_SELECT should be set to
+ * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
+ */
+ reg->config = mask ^ x86_pmu.lbr_sel_mask;
return 0;
}
@@ -439,7 +580,9 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
/*
* setup SW LBR filter
*/
- intel_pmu_setup_sw_lbr_filter(event);
+ ret = intel_pmu_setup_sw_lbr_filter(event);
+ if (ret)
+ return ret;
/*
* setup HW LBR filter, if any
@@ -568,6 +711,12 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
ret = X86_BR_INT;
break;
case 0xe8: /* call near rel */
+ insn_get_immediate(&insn);
+ if (insn.immediate1.value == 0) {
+ /* zero length call */
+ ret = X86_BR_ZERO_CALL;
+ break;
+ }
case 0x9a: /* call far absolute */
ret = X86_BR_CALL;
break;
@@ -678,35 +827,49 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
/*
* Map interface branch filters onto LBR filters
*/
-static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
- [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
- [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
- [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
- [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
- [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP
- | LBR_IND_JMP | LBR_FAR,
+static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP
+ | LBR_IND_JMP | LBR_FAR,
/*
* NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
*/
- [PERF_SAMPLE_BRANCH_ANY_CALL] =
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
/*
* NHM/WSM erratum: must include IND_JMP to capture IND_CALL
*/
- [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
- [PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
};
-static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
- [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
- [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
- [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
- [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
- [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR,
- [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
- | LBR_FAR,
- [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
- [PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
+static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+};
+
+static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+ [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_RETURN | LBR_CALL_STACK,
};
/* core */
@@ -765,6 +928,20 @@ void __init intel_pmu_lbr_init_snb(void)
pr_cont("16-deep LBR, ");
}
+/* haswell */
+void intel_pmu_lbr_init_hsw(void)
+{
+ x86_pmu.lbr_nr = 16;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
+
+ pr_cont("16-deep LBR, ");
+}
+
/* atom */
void __init intel_pmu_lbr_init_atom(void)
{
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
new file mode 100644
index 000000000000..ffe666c2c6b5
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -0,0 +1,1100 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+
+#include <asm/perf_event.h>
+#include <asm/insn.h>
+#include <asm/io.h>
+
+#include "perf_event.h"
+#include "intel_pt.h"
+
+static DEFINE_PER_CPU(struct pt, pt_ctx);
+
+static struct pt_pmu pt_pmu;
+
+enum cpuid_regs {
+ CR_EAX = 0,
+ CR_ECX,
+ CR_EDX,
+ CR_EBX
+};
+
+/*
+ * Capabilities of Intel PT hardware, such as number of address bits or
+ * supported output schemes, are cached and exported to userspace as "caps"
+ * attribute group of pt pmu device
+ * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
+ * relevant bits together with intel_pt traces.
+ *
+ * These are necessary for both trace decoding (payloads_lip, contains address
+ * width encoded in IP-related packets), and event configuration (bitmasks with
+ * permitted values for certain bit fields).
+ */
+#define PT_CAP(_n, _l, _r, _m) \
+ [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \
+ .reg = _r, .mask = _m }
+
+static struct pt_cap_desc {
+ const char *name;
+ u32 leaf;
+ u8 reg;
+ u32 mask;
+} pt_caps[] = {
+ PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff),
+ PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)),
+ PT_CAP(topa_output, 0, CR_ECX, BIT(0)),
+ PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)),
+ PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)),
+};
+
+static u32 pt_cap_get(enum pt_capabilities cap)
+{
+ struct pt_cap_desc *cd = &pt_caps[cap];
+ u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg];
+ unsigned int shift = __ffs(cd->mask);
+
+ return (c & cd->mask) >> shift;
+}
+
+static ssize_t pt_cap_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct dev_ext_attribute *ea =
+ container_of(attr, struct dev_ext_attribute, attr);
+ enum pt_capabilities cap = (long)ea->var;
+
+ return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
+}
+
+static struct attribute_group pt_cap_group = {
+ .name = "caps",
+};
+
+PMU_FORMAT_ATTR(tsc, "config:10" );
+PMU_FORMAT_ATTR(noretcomp, "config:11" );
+
+static struct attribute *pt_formats_attr[] = {
+ &format_attr_tsc.attr,
+ &format_attr_noretcomp.attr,
+ NULL,
+};
+
+static struct attribute_group pt_format_group = {
+ .name = "format",
+ .attrs = pt_formats_attr,
+};
+
+static const struct attribute_group *pt_attr_groups[] = {
+ &pt_cap_group,
+ &pt_format_group,
+ NULL,
+};
+
+static int __init pt_pmu_hw_init(void)
+{
+ struct dev_ext_attribute *de_attrs;
+ struct attribute **attrs;
+ size_t size;
+ int ret;
+ long i;
+
+ attrs = NULL;
+ ret = -ENODEV;
+ if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
+ goto fail;
+
+ for (i = 0; i < PT_CPUID_LEAVES; i++) {
+ cpuid_count(20, i,
+ &pt_pmu.caps[CR_EAX + i*4],
+ &pt_pmu.caps[CR_EBX + i*4],
+ &pt_pmu.caps[CR_ECX + i*4],
+ &pt_pmu.caps[CR_EDX + i*4]);
+ }
+
+ ret = -ENOMEM;
+ size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
+ attrs = kzalloc(size, GFP_KERNEL);
+ if (!attrs)
+ goto fail;
+
+ size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
+ de_attrs = kzalloc(size, GFP_KERNEL);
+ if (!de_attrs)
+ goto fail;
+
+ for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
+ struct dev_ext_attribute *de_attr = de_attrs + i;
+
+ de_attr->attr.attr.name = pt_caps[i].name;
+
+ sysfs_attr_init(&de_attrs->attr.attr);
+
+ de_attr->attr.attr.mode = S_IRUGO;
+ de_attr->attr.show = pt_cap_show;
+ de_attr->var = (void *)i;
+
+ attrs[i] = &de_attr->attr.attr;
+ }
+
+ pt_cap_group.attrs = attrs;
+
+ return 0;
+
+fail:
+ kfree(attrs);
+
+ return ret;
+}
+
+#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC)
+
+static bool pt_event_valid(struct perf_event *event)
+{
+ u64 config = event->attr.config;
+
+ if ((config & PT_CONFIG_MASK) != config)
+ return false;
+
+ return true;
+}
+
+/*
+ * PT configuration helpers
+ * These all are cpu affine and operate on a local PT
+ */
+
+static bool pt_is_running(void)
+{
+ u64 ctl;
+
+ rdmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+ return !!(ctl & RTIT_CTL_TRACEEN);
+}
+
+static void pt_config(struct perf_event *event)
+{
+ u64 reg;
+
+ reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
+
+ if (!event->attr.exclude_kernel)
+ reg |= RTIT_CTL_OS;
+ if (!event->attr.exclude_user)
+ reg |= RTIT_CTL_USR;
+
+ reg |= (event->attr.config & PT_CONFIG_MASK);
+
+ wrmsrl(MSR_IA32_RTIT_CTL, reg);
+}
+
+static void pt_config_start(bool start)
+{
+ u64 ctl;
+
+ rdmsrl(MSR_IA32_RTIT_CTL, ctl);
+ if (start)
+ ctl |= RTIT_CTL_TRACEEN;
+ else
+ ctl &= ~RTIT_CTL_TRACEEN;
+ wrmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+ /*
+ * A wrmsr that disables trace generation serializes other PT
+ * registers and causes all data packets to be written to memory,
+ * but a fence is required for the data to become globally visible.
+ *
+ * The below WMB, separating data store and aux_head store matches
+ * the consumer's RMB that separates aux_head load and data load.
+ */
+ if (!start)
+ wmb();
+}
+
+static void pt_config_buffer(void *buf, unsigned int topa_idx,
+ unsigned int output_off)
+{
+ u64 reg;
+
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
+
+ reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
+
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+}
+
+/*
+ * Keep ToPA table-related metadata on the same page as the actual table,
+ * taking up a few words from the top
+ */
+
+#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
+
+/**
+ * struct topa - page-sized ToPA table with metadata at the top
+ * @table: actual ToPA table entries, as understood by PT hardware
+ * @list: linkage to struct pt_buffer's list of tables
+ * @phys: physical address of this page
+ * @offset: offset of the first entry in this table in the buffer
+ * @size: total size of all entries in this table
+ * @last: index of the last initialized entry in this table
+ */
+struct topa {
+ struct topa_entry table[TENTS_PER_PAGE];
+ struct list_head list;
+ u64 phys;
+ u64 offset;
+ size_t size;
+ int last;
+};
+
+/* make -1 stand for the last table entry */
+#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])
+
+/**
+ * topa_alloc() - allocate page-sized ToPA table
+ * @cpu: CPU on which to allocate.
+ * @gfp: Allocation flags.
+ *
+ * Return: On success, return the pointer to ToPA table page.
+ */
+static struct topa *topa_alloc(int cpu, gfp_t gfp)
+{
+ int node = cpu_to_node(cpu);
+ struct topa *topa;
+ struct page *p;
+
+ p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+ if (!p)
+ return NULL;
+
+ topa = page_address(p);
+ topa->last = 0;
+ topa->phys = page_to_phys(p);
+
+ /*
+ * In case of singe-entry ToPA, always put the self-referencing END
+ * link as the 2nd entry in the table
+ */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
+ TOPA_ENTRY(topa, 1)->end = 1;
+ }
+
+ return topa;
+}
+
+/**
+ * topa_free() - free a page-sized ToPA table
+ * @topa: Table to deallocate.
+ */
+static void topa_free(struct topa *topa)
+{
+ free_page((unsigned long)topa);
+}
+
+/**
+ * topa_insert_table() - insert a ToPA table into a buffer
+ * @buf: PT buffer that's being extended.
+ * @topa: New topa table to be inserted.
+ *
+ * If it's the first table in this buffer, set up buffer's pointers
+ * accordingly; otherwise, add a END=1 link entry to @topa to the current
+ * "last" table and adjust the last table pointer to @topa.
+ */
+static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
+{
+ struct topa *last = buf->last;
+
+ list_add_tail(&topa->list, &buf->tables);
+
+ if (!buf->first) {
+ buf->first = buf->last = buf->cur = topa;
+ return;
+ }
+
+ topa->offset = last->offset + last->size;
+ buf->last = topa;
+
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ return;
+
+ BUG_ON(last->last != TENTS_PER_PAGE - 1);
+
+ TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
+ TOPA_ENTRY(last, -1)->end = 1;
+}
+
+/**
+ * topa_table_full() - check if a ToPA table is filled up
+ * @topa: ToPA table.
+ */
+static bool topa_table_full(struct topa *topa)
+{
+ /* single-entry ToPA is a special case */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ return !!topa->last;
+
+ return topa->last == TENTS_PER_PAGE - 1;
+}
+
+/**
+ * topa_insert_pages() - create a list of ToPA tables
+ * @buf: PT buffer being initialized.
+ * @gfp: Allocation flags.
+ *
+ * This initializes a list of ToPA tables with entries from
+ * the data_pages provided by rb_alloc_aux().
+ *
+ * Return: 0 on success or error code.
+ */
+static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
+{
+ struct topa *topa = buf->last;
+ int order = 0;
+ struct page *p;
+
+ p = virt_to_page(buf->data_pages[buf->nr_pages]);
+ if (PagePrivate(p))
+ order = page_private(p);
+
+ if (topa_table_full(topa)) {
+ topa = topa_alloc(buf->cpu, gfp);
+ if (!topa)
+ return -ENOMEM;
+
+ topa_insert_table(buf, topa);
+ }
+
+ TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
+ TOPA_ENTRY(topa, -1)->size = order;
+ if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ TOPA_ENTRY(topa, -1)->intr = 1;
+ TOPA_ENTRY(topa, -1)->stop = 1;
+ }
+
+ topa->last++;
+ topa->size += sizes(order);
+
+ buf->nr_pages += 1ul << order;
+
+ return 0;
+}
+
+/**
+ * pt_topa_dump() - print ToPA tables and their entries
+ * @buf: PT buffer.
+ */
+static void pt_topa_dump(struct pt_buffer *buf)
+{
+ struct topa *topa;
+
+ list_for_each_entry(topa, &buf->tables, list) {
+ int i;
+
+ pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
+ topa->phys, topa->offset, topa->size);
+ for (i = 0; i < TENTS_PER_PAGE; i++) {
+ pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
+ &topa->table[i],
+ (unsigned long)topa->table[i].base << TOPA_SHIFT,
+ sizes(topa->table[i].size),
+ topa->table[i].end ? 'E' : ' ',
+ topa->table[i].intr ? 'I' : ' ',
+ topa->table[i].stop ? 'S' : ' ',
+ *(u64 *)&topa->table[i]);
+ if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
+ topa->table[i].stop) ||
+ topa->table[i].end)
+ break;
+ }
+ }
+}
+
+/**
+ * pt_buffer_advance() - advance to the next output region
+ * @buf: PT buffer.
+ *
+ * Advance the current pointers in the buffer to the next ToPA entry.
+ */
+static void pt_buffer_advance(struct pt_buffer *buf)
+{
+ buf->output_off = 0;
+ buf->cur_idx++;
+
+ if (buf->cur_idx == buf->cur->last) {
+ if (buf->cur == buf->last)
+ buf->cur = buf->first;
+ else
+ buf->cur = list_entry(buf->cur->list.next, struct topa,
+ list);
+ buf->cur_idx = 0;
+ }
+}
+
+/**
+ * pt_update_head() - calculate current offsets and sizes
+ * @pt: Per-cpu pt context.
+ *
+ * Update buffer's current write pointer position and data size.
+ */
+static void pt_update_head(struct pt *pt)
+{
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+ u64 topa_idx, base, old;
+
+ /* offset of the first region in this table from the beginning of buf */
+ base = buf->cur->offset + buf->output_off;
+
+ /* offset of the current output region within this table */
+ for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
+ base += sizes(buf->cur->table[topa_idx].size);
+
+ if (buf->snapshot) {
+ local_set(&buf->data_size, base);
+ } else {
+ old = (local64_xchg(&buf->head, base) &
+ ((buf->nr_pages << PAGE_SHIFT) - 1));
+ if (base < old)
+ base += buf->nr_pages << PAGE_SHIFT;
+
+ local_add(base - old, &buf->data_size);
+ }
+}
+
+/**
+ * pt_buffer_region() - obtain current output region's address
+ * @buf: PT buffer.
+ */
+static void *pt_buffer_region(struct pt_buffer *buf)
+{
+ return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
+}
+
+/**
+ * pt_buffer_region_size() - obtain current output region's size
+ * @buf: PT buffer.
+ */
+static size_t pt_buffer_region_size(struct pt_buffer *buf)
+{
+ return sizes(buf->cur->table[buf->cur_idx].size);
+}
+
+/**
+ * pt_handle_status() - take care of possible status conditions
+ * @pt: Per-cpu pt context.
+ */
+static void pt_handle_status(struct pt *pt)
+{
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+ int advance = 0;
+ u64 status;
+
+ rdmsrl(MSR_IA32_RTIT_STATUS, status);
+
+ if (status & RTIT_STATUS_ERROR) {
+ pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
+ pt_topa_dump(buf);
+ status &= ~RTIT_STATUS_ERROR;
+ }
+
+ if (status & RTIT_STATUS_STOPPED) {
+ status &= ~RTIT_STATUS_STOPPED;
+
+ /*
+ * On systems that only do single-entry ToPA, hitting STOP
+ * means we are already losing data; need to let the decoder
+ * know.
+ */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
+ buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
+ local_inc(&buf->lost);
+ advance++;
+ }
+ }
+
+ /*
+ * Also on single-entry ToPA implementations, interrupt will come
+ * before the output reaches its output region's boundary.
+ */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
+ pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
+ void *head = pt_buffer_region(buf);
+
+ /* everything within this margin needs to be zeroed out */
+ memset(head + buf->output_off, 0,
+ pt_buffer_region_size(buf) -
+ buf->output_off);
+ advance++;
+ }
+
+ if (advance)
+ pt_buffer_advance(buf);
+
+ wrmsrl(MSR_IA32_RTIT_STATUS, status);
+}
+
+/**
+ * pt_read_offset() - translate registers into buffer pointers
+ * @buf: PT buffer.
+ *
+ * Set buffer's output pointers from MSR values.
+ */
+static void pt_read_offset(struct pt_buffer *buf)
+{
+ u64 offset, base_topa;
+
+ rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
+ buf->cur = phys_to_virt(base_topa);
+
+ rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
+ /* offset within current output region */
+ buf->output_off = offset >> 32;
+ /* index of current output region within this table */
+ buf->cur_idx = (offset & 0xffffff80) >> 7;
+}
+
+/**
+ * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
+ * @buf: PT buffer.
+ * @pg: Page offset in the buffer.
+ *
+ * When advancing to the next output region (ToPA entry), given a page offset
+ * into the buffer, we need to find the offset of the first page in the next
+ * region.
+ */
+static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
+{
+ struct topa_entry *te = buf->topa_index[pg];
+
+ /* one region */
+ if (buf->first == buf->last && buf->first->last == 1)
+ return pg;
+
+ do {
+ pg++;
+ pg &= buf->nr_pages - 1;
+ } while (buf->topa_index[pg] == te);
+
+ return pg;
+}
+
+/**
+ * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
+ * @buf: PT buffer.
+ * @handle: Current output handle.
+ *
+ * Place INT and STOP marks to prevent overwriting old data that the consumer
+ * hasn't yet collected.
+ */
+static int pt_buffer_reset_markers(struct pt_buffer *buf,
+ struct perf_output_handle *handle)
+
+{
+ unsigned long idx, npages, end;
+
+ if (buf->snapshot)
+ return 0;
+
+ /* can't stop in the middle of an output region */
+ if (buf->output_off + handle->size + 1 <
+ sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
+ return -EINVAL;
+
+
+ /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ return 0;
+
+ /* clear STOP and INT from current entry */
+ buf->topa_index[buf->stop_pos]->stop = 0;
+ buf->topa_index[buf->intr_pos]->intr = 0;
+
+ if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ npages = (handle->size + 1) >> PAGE_SHIFT;
+ end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages;
+ /*if (end > handle->wakeup >> PAGE_SHIFT)
+ end = handle->wakeup >> PAGE_SHIFT;*/
+ idx = end & (buf->nr_pages - 1);
+ buf->stop_pos = idx;
+ idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1;
+ idx &= buf->nr_pages - 1;
+ buf->intr_pos = idx;
+ }
+
+ buf->topa_index[buf->stop_pos]->stop = 1;
+ buf->topa_index[buf->intr_pos]->intr = 1;
+
+ return 0;
+}
+
+/**
+ * pt_buffer_setup_topa_index() - build topa_index[] table of regions
+ * @buf: PT buffer.
+ *
+ * topa_index[] references output regions indexed by offset into the
+ * buffer for purposes of quick reverse lookup.
+ */
+static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
+{
+ struct topa *cur = buf->first, *prev = buf->last;
+ struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
+ *te_prev = TOPA_ENTRY(prev, prev->last - 1);
+ int pg = 0, idx = 0, ntopa = 0;
+
+ while (pg < buf->nr_pages) {
+ int tidx;
+
+ /* pages within one topa entry */
+ for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
+ buf->topa_index[pg] = te_prev;
+
+ te_prev = te_cur;
+
+ if (idx == cur->last - 1) {
+ /* advance to next topa table */
+ idx = 0;
+ cur = list_entry(cur->list.next, struct topa, list);
+ ntopa++;
+ } else
+ idx++;
+ te_cur = TOPA_ENTRY(cur, idx);
+ }
+
+}
+
+/**
+ * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
+ * @buf: PT buffer.
+ * @head: Write pointer (aux_head) from AUX buffer.
+ *
+ * Find the ToPA table and entry corresponding to given @head and set buffer's
+ * "current" pointers accordingly.
+ */
+static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
+{
+ int pg;
+
+ if (buf->snapshot)
+ head &= (buf->nr_pages << PAGE_SHIFT) - 1;
+
+ pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
+ pg = pt_topa_next_entry(buf, pg);
+
+ buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
+ buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
+ (unsigned long)buf->cur) / sizeof(struct topa_entry);
+ buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);
+
+ local64_set(&buf->head, head);
+ local_set(&buf->data_size, 0);
+}
+
+/**
+ * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
+ * @buf: PT buffer.
+ */
+static void pt_buffer_fini_topa(struct pt_buffer *buf)
+{
+ struct topa *topa, *iter;
+
+ list_for_each_entry_safe(topa, iter, &buf->tables, list) {
+ /*
+ * right now, this is in free_aux() path only, so
+ * no need to unlink this table from the list
+ */
+ topa_free(topa);
+ }
+}
+
+/**
+ * pt_buffer_init_topa() - initialize ToPA table for pt buffer
+ * @buf: PT buffer.
+ * @size: Total size of all regions within this ToPA.
+ * @gfp: Allocation flags.
+ */
+static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
+ gfp_t gfp)
+{
+ struct topa *topa;
+ int err;
+
+ topa = topa_alloc(buf->cpu, gfp);
+ if (!topa)
+ return -ENOMEM;
+
+ topa_insert_table(buf, topa);
+
+ while (buf->nr_pages < nr_pages) {
+ err = topa_insert_pages(buf, gfp);
+ if (err) {
+ pt_buffer_fini_topa(buf);
+ return -ENOMEM;
+ }
+ }
+
+ pt_buffer_setup_topa_index(buf);
+
+ /* link last table to the first one, unless we're double buffering */
+ if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
+ TOPA_ENTRY(buf->last, -1)->end = 1;
+ }
+
+ pt_topa_dump(buf);
+ return 0;
+}
+
+/**
+ * pt_buffer_setup_aux() - set up topa tables for a PT buffer
+ * @cpu: Cpu on which to allocate, -1 means current.
+ * @pages: Array of pointers to buffer pages passed from perf core.
+ * @nr_pages: Number of pages in the buffer.
+ * @snapshot: If this is a snapshot/overwrite counter.
+ *
+ * This is a pmu::setup_aux callback that sets up ToPA tables and all the
+ * bookkeeping for an AUX buffer.
+ *
+ * Return: Our private PT buffer structure.
+ */
+static void *
+pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
+{
+ struct pt_buffer *buf;
+ int node, ret;
+
+ if (!nr_pages)
+ return NULL;
+
+ if (cpu == -1)
+ cpu = raw_smp_processor_id();
+ node = cpu_to_node(cpu);
+
+ buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
+ GFP_KERNEL, node);
+ if (!buf)
+ return NULL;
+
+ buf->cpu = cpu;
+ buf->snapshot = snapshot;
+ buf->data_pages = pages;
+
+ INIT_LIST_HEAD(&buf->tables);
+
+ ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
+ if (ret) {
+ kfree(buf);
+ return NULL;
+ }
+
+ return buf;
+}
+
+/**
+ * pt_buffer_free_aux() - perf AUX deallocation path callback
+ * @data: PT buffer.
+ */
+static void pt_buffer_free_aux(void *data)
+{
+ struct pt_buffer *buf = data;
+
+ pt_buffer_fini_topa(buf);
+ kfree(buf);
+}
+
+/**
+ * pt_buffer_is_full() - check if the buffer is full
+ * @buf: PT buffer.
+ * @pt: Per-cpu pt handle.
+ *
+ * If the user hasn't read data from the output region that aux_head
+ * points to, the buffer is considered full: the user needs to read at
+ * least this region and update aux_tail to point past it.
+ */
+static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
+{
+ if (buf->snapshot)
+ return false;
+
+ if (local_read(&buf->data_size) >= pt->handle.size)
+ return true;
+
+ return false;
+}
+
+/**
+ * intel_pt_interrupt() - PT PMI handler
+ */
+void intel_pt_interrupt(void)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf;
+ struct perf_event *event = pt->handle.event;
+
+ /*
+ * There may be a dangling PT bit in the interrupt status register
+ * after PT has been disabled by pt_event_stop(). Make sure we don't
+ * do anything (particularly, re-enable) for this event here.
+ */
+ if (!ACCESS_ONCE(pt->handle_nmi))
+ return;
+
+ pt_config_start(false);
+
+ if (!event)
+ return;
+
+ buf = perf_get_aux(&pt->handle);
+ if (!buf)
+ return;
+
+ pt_read_offset(buf);
+
+ pt_handle_status(pt);
+
+ pt_update_head(pt);
+
+ perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+ local_xchg(&buf->lost, 0));
+
+ if (!event->hw.state) {
+ int ret;
+
+ buf = perf_aux_output_begin(&pt->handle, event);
+ if (!buf) {
+ event->hw.state = PERF_HES_STOPPED;
+ return;
+ }
+
+ pt_buffer_reset_offsets(buf, pt->handle.head);
+ ret = pt_buffer_reset_markers(buf, &pt->handle);
+ if (ret) {
+ perf_aux_output_end(&pt->handle, 0, true);
+ return;
+ }
+
+ pt_config_buffer(buf->cur->table, buf->cur_idx,
+ buf->output_off);
+ wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+ pt_config(event);
+ }
+}
+
+/*
+ * PMU callbacks
+ */
+
+static void pt_event_start(struct perf_event *event, int mode)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+ if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) {
+ event->hw.state = PERF_HES_STOPPED;
+ return;
+ }
+
+ ACCESS_ONCE(pt->handle_nmi) = 1;
+ event->hw.state = 0;
+
+ pt_config_buffer(buf->cur->table, buf->cur_idx,
+ buf->output_off);
+ wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+ pt_config(event);
+}
+
+static void pt_event_stop(struct perf_event *event, int mode)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+ /*
+ * Protect against the PMI racing with disabling wrmsr,
+ * see comment in intel_pt_interrupt().
+ */
+ ACCESS_ONCE(pt->handle_nmi) = 0;
+ pt_config_start(false);
+
+ if (event->hw.state == PERF_HES_STOPPED)
+ return;
+
+ event->hw.state = PERF_HES_STOPPED;
+
+ if (mode & PERF_EF_UPDATE) {
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+ if (!buf)
+ return;
+
+ if (WARN_ON_ONCE(pt->handle.event != event))
+ return;
+
+ pt_read_offset(buf);
+
+ pt_handle_status(pt);
+
+ pt_update_head(pt);
+ }
+}
+
+static void pt_event_del(struct perf_event *event, int mode)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf;
+
+ pt_event_stop(event, PERF_EF_UPDATE);
+
+ buf = perf_get_aux(&pt->handle);
+
+ if (buf) {
+ if (buf->snapshot)
+ pt->handle.head =
+ local_xchg(&buf->data_size,
+ buf->nr_pages << PAGE_SHIFT);
+ perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+ local_xchg(&buf->lost, 0));
+ }
+}
+
+static int pt_event_add(struct perf_event *event, int mode)
+{
+ struct pt_buffer *buf;
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct hw_perf_event *hwc = &event->hw;
+ int ret = -EBUSY;
+
+ if (pt->handle.event)
+ goto fail;
+
+ buf = perf_aux_output_begin(&pt->handle, event);
+ ret = -EINVAL;
+ if (!buf)
+ goto fail_stop;
+
+ pt_buffer_reset_offsets(buf, pt->handle.head);
+ if (!buf->snapshot) {
+ ret = pt_buffer_reset_markers(buf, &pt->handle);
+ if (ret)
+ goto fail_end_stop;
+ }
+
+ if (mode & PERF_EF_START) {
+ pt_event_start(event, 0);
+ ret = -EBUSY;
+ if (hwc->state == PERF_HES_STOPPED)
+ goto fail_end_stop;
+ } else {
+ hwc->state = PERF_HES_STOPPED;
+ }
+
+ return 0;
+
+fail_end_stop:
+ perf_aux_output_end(&pt->handle, 0, true);
+fail_stop:
+ hwc->state = PERF_HES_STOPPED;
+fail:
+ return ret;
+}
+
+static void pt_event_read(struct perf_event *event)
+{
+}
+
+static void pt_event_destroy(struct perf_event *event)
+{
+ x86_del_exclusive(x86_lbr_exclusive_pt);
+}
+
+static int pt_event_init(struct perf_event *event)
+{
+ if (event->attr.type != pt_pmu.pmu.type)
+ return -ENOENT;
+
+ if (!pt_event_valid(event))
+ return -EINVAL;
+
+ if (x86_add_exclusive(x86_lbr_exclusive_pt))
+ return -EBUSY;
+
+ event->destroy = pt_event_destroy;
+
+ return 0;
+}
+
+static __init int pt_init(void)
+{
+ int ret, cpu, prior_warn = 0;
+
+ BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ u64 ctl;
+
+ ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
+ if (!ret && (ctl & RTIT_CTL_TRACEEN))
+ prior_warn++;
+ }
+ put_online_cpus();
+
+ if (prior_warn) {
+ x86_add_exclusive(x86_lbr_exclusive_pt);
+ pr_warn("PT is enabled at boot time, doing nothing\n");
+
+ return -EBUSY;
+ }
+
+ ret = pt_pmu_hw_init();
+ if (ret)
+ return ret;
+
+ if (!pt_cap_get(PT_CAP_topa_output)) {
+ pr_warn("ToPA output is not supported on this CPU\n");
+ return -ENODEV;
+ }
+
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ pt_pmu.pmu.capabilities =
+ PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
+
+ pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
+ pt_pmu.pmu.attr_groups = pt_attr_groups;
+ pt_pmu.pmu.task_ctx_nr = perf_sw_context;
+ pt_pmu.pmu.event_init = pt_event_init;
+ pt_pmu.pmu.add = pt_event_add;
+ pt_pmu.pmu.del = pt_event_del;
+ pt_pmu.pmu.start = pt_event_start;
+ pt_pmu.pmu.stop = pt_event_stop;
+ pt_pmu.pmu.read = pt_event_read;
+ pt_pmu.pmu.setup_aux = pt_buffer_setup_aux;
+ pt_pmu.pmu.free_aux = pt_buffer_free_aux;
+ ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
+
+ return ret;
+}
+
+module_init(pt_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index c4bb8b8e5017..999289b94025 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -62,6 +62,14 @@
#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */
#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
+#define NR_RAPL_DOMAINS 0x4
+static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
+ "pp0-core",
+ "package",
+ "dram",
+ "pp1-gpu",
+};
+
/* Clients have PP0, PKG */
#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
1<<RAPL_IDX_PKG_NRG_STAT|\
@@ -112,7 +120,6 @@ static struct perf_pmu_events_attr event_attr_##v = { \
struct rapl_pmu {
spinlock_t lock;
- int hw_unit; /* 1/2^hw_unit Joule */
int n_active; /* number of active events */
struct list_head active_list;
struct pmu *pmu; /* pointer to rapl_pmu_class */
@@ -120,6 +127,7 @@ struct rapl_pmu {
struct hrtimer hrtimer;
};
+static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */
static struct pmu rapl_pmu_class;
static cpumask_t rapl_cpu_mask;
static int rapl_cntr_mask;
@@ -127,6 +135,7 @@ static int rapl_cntr_mask;
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
+static struct x86_pmu_quirk *rapl_quirks;
static inline u64 rapl_read_counter(struct perf_event *event)
{
u64 raw;
@@ -134,15 +143,28 @@ static inline u64 rapl_read_counter(struct perf_event *event)
return raw;
}
-static inline u64 rapl_scale(u64 v)
+#define rapl_add_quirk(func_) \
+do { \
+ static struct x86_pmu_quirk __quirk __initdata = { \
+ .func = func_, \
+ }; \
+ __quirk.next = rapl_quirks; \
+ rapl_quirks = &__quirk; \
+} while (0)
+
+static inline u64 rapl_scale(u64 v, int cfg)
{
+ if (cfg > NR_RAPL_DOMAINS) {
+ pr_warn("invalid domain %d, failed to scale data\n", cfg);
+ return v;
+ }
/*
* scale delta to smallest unit (1/2^32)
* users must then scale back: count * 1/(1e9*2^32) to get Joules
* or use ldexp(count, -32).
* Watts = Joules/Time delta
*/
- return v << (32 - __this_cpu_read(rapl_pmu)->hw_unit);
+ return v << (32 - rapl_hw_unit[cfg - 1]);
}
static u64 rapl_event_update(struct perf_event *event)
@@ -173,7 +195,7 @@ again:
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
- sdelta = rapl_scale(delta);
+ sdelta = rapl_scale(delta, event->hw.config);
local64_add(sdelta, &event->count);
@@ -546,12 +568,22 @@ static void rapl_cpu_init(int cpu)
cpumask_set_cpu(cpu, &rapl_cpu_mask);
}
+static __init void rapl_hsw_server_quirk(void)
+{
+ /*
+ * DRAM domain on HSW server has fixed energy unit which can be
+ * different than the unit from power unit MSR.
+ * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
+ * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
+ */
+ rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
+}
+
static int rapl_cpu_prepare(int cpu)
{
struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
int phys_id = topology_physical_package_id(cpu);
u64 ms;
- u64 msr_rapl_power_unit_bits;
if (pmu)
return 0;
@@ -559,24 +591,13 @@ static int rapl_cpu_prepare(int cpu)
if (phys_id < 0)
return -1;
- /* protect rdmsrl() to handle virtualization */
- if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
- return -1;
-
pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
if (!pmu)
return -1;
-
spin_lock_init(&pmu->lock);
INIT_LIST_HEAD(&pmu->active_list);
- /*
- * grab power unit as: 1/2^unit Joules
- *
- * we cache in local PMU instance
- */
- pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
pmu->pmu = &rapl_pmu_class;
/*
@@ -586,8 +607,8 @@ static int rapl_cpu_prepare(int cpu)
* divide interval by 2 to avoid lockstep (2 * 100)
* if hw unit is 32, then we use 2 ms 1/200/2
*/
- if (pmu->hw_unit < 32)
- ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1));
+ if (rapl_hw_unit[0] < 32)
+ ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1));
else
ms = 2;
@@ -655,6 +676,20 @@ static int rapl_cpu_notifier(struct notifier_block *self,
return NOTIFY_OK;
}
+static int rapl_check_hw_unit(void)
+{
+ u64 msr_rapl_power_unit_bits;
+ int i;
+
+ /* protect rdmsrl() to handle virtualization */
+ if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
+ return -1;
+ for (i = 0; i < NR_RAPL_DOMAINS; i++)
+ rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
+
+ return 0;
+}
+
static const struct x86_cpu_id rapl_cpu_match[] = {
[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
[1] = {},
@@ -664,6 +699,8 @@ static int __init rapl_pmu_init(void)
{
struct rapl_pmu *pmu;
int cpu, ret;
+ struct x86_pmu_quirk *quirk;
+ int i;
/*
* check for Intel processor family 6
@@ -678,6 +715,11 @@ static int __init rapl_pmu_init(void)
rapl_cntr_mask = RAPL_IDX_CLN;
rapl_pmu_events_group.attrs = rapl_events_cln_attr;
break;
+ case 63: /* Haswell-Server */
+ rapl_add_quirk(rapl_hsw_server_quirk);
+ rapl_cntr_mask = RAPL_IDX_SRV;
+ rapl_pmu_events_group.attrs = rapl_events_srv_attr;
+ break;
case 60: /* Haswell */
case 69: /* Haswell-Celeron */
rapl_cntr_mask = RAPL_IDX_HSW;
@@ -693,7 +735,13 @@ static int __init rapl_pmu_init(void)
/* unsupported */
return 0;
}
+ ret = rapl_check_hw_unit();
+ if (ret)
+ return ret;
+ /* run cpu model quirks */
+ for (quirk = rapl_quirks; quirk; quirk = quirk->next)
+ quirk->func();
cpu_notifier_register_begin();
for_each_online_cpu(cpu) {
@@ -714,14 +762,18 @@ static int __init rapl_pmu_init(void)
pmu = __this_cpu_read(rapl_pmu);
- pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
+ pr_info("RAPL PMU detected,"
" API unit is 2^-32 Joules,"
" %d fixed counters"
" %llu ms ovfl timer\n",
- pmu->hw_unit,
hweight32(rapl_cntr_mask),
ktime_to_ms(pmu->timer_interval));
-
+ for (i = 0; i < NR_RAPL_DOMAINS; i++) {
+ if (rapl_cntr_mask & (1 << i)) {
+ pr_info("hw unit of domain %s 2^-%d Joules\n",
+ rapl_domain_names[i], rapl_hw_unit[i]);
+ }
+ }
out:
cpu_notifier_register_done();
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 21af6149edf2..12d9548457e7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -1132,8 +1132,7 @@ static int snbep_pci2phy_map_init(int devid)
}
}
- if (ubox_dev)
- pci_dev_put(ubox_dev);
+ pci_dev_put(ubox_dev);
return err ? pcibios_err_to_errno(err) : 0;
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 60639093d536..3d423a101fae 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -41,6 +41,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 },
{ X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 },
{ X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 },
+ { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 },
{ X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },