summaryrefslogtreecommitdiff
path: root/arch/x86/kernel/cpu
diff options
context:
space:
mode:
authorDave Airlie <airlied@redhat.com>2015-04-20 04:32:26 +0300
committerDave Airlie <airlied@redhat.com>2015-04-20 06:05:20 +0300
commit2c33ce009ca2389dbf0535d0672214d09738e35e (patch)
tree6186a6458c3c160385d794a23eaf07c786a9e61b /arch/x86/kernel/cpu
parentcec32a47010647e8b0603726ebb75b990a4057a4 (diff)
parent09d51602cf84a1264946711dd4ea0dddbac599a1 (diff)
downloadlinux-2c33ce009ca2389dbf0535d0672214d09738e35e.tar.xz
Merge Linus master into drm-next
The merge is clean, but the arm build fails afterwards, due to API changes in the regulator tree. I've included the patch into the merge to fix the build. Signed-off-by: Dave Airlie <airlied@redhat.com>
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r--arch/x86/kernel/cpu/Makefile3
-rw-r--r--arch/x86/kernel/cpu/amd.c9
-rw-r--r--arch/x86/kernel/cpu/common.c126
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c715
-rw-r--r--arch/x86/kernel/cpu/intel_pt.h131
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h11
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c66
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c154
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c11
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c63
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c1
-rw-r--r--arch/x86/kernel/cpu/microcode/core_early.c75
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c4
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_early.c345
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_lib.c22
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh2
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c12
-rw-r--r--arch/x86/kernel/cpu/perf_event.c223
-rw-r--r--arch/x86/kernel/cpu/perf_event.h181
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c9
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c12
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c908
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_bts.c525
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_cqm.c1379
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c39
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c321
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_pt.c1100
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_rapl.c94
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c3
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
30 files changed, 5347 insertions, 1198 deletions
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 80091ae54c2b..9bff68798836 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -39,7 +39,8 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
endif
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o
obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
perf_event_intel_uncore_snb.o \
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a220239cea65..fd470ebf924e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
#include <linux/io.h>
#include <linux/sched.h>
+#include <linux/random.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cpu.h>
@@ -488,6 +489,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
va_align.mask = (upperbit - 1) & PAGE_MASK;
va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
+
+ /* A random value per boot for bit slice [12:upper_bit) */
+ va_align.bits = get_random_int() & va_align.mask;
}
}
@@ -711,6 +715,11 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
+ /* 3DNow or LM implies PREFETCHW */
+ if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
+ if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
+ set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2346c95c6ab1..a62cf04dac8a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -646,6 +646,30 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[10] = eax;
}
+ /* Additional Intel-defined flags: level 0x0000000F */
+ if (c->cpuid_level >= 0x0000000F) {
+ u32 eax, ebx, ecx, edx;
+
+ /* QoS sub-leaf, EAX=0Fh, ECX=0 */
+ cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[11] = edx;
+ if (cpu_has(c, X86_FEATURE_CQM_LLC)) {
+ /* will be overridden if occupancy monitoring exists */
+ c->x86_cache_max_rmid = ebx;
+
+ /* QoS sub-leaf, EAX=0Fh, ECX=1 */
+ cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[12] = edx;
+ if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
+ c->x86_cache_max_rmid = ecx;
+ c->x86_cache_occ_scale = ebx;
+ }
+ } else {
+ c->x86_cache_max_rmid = -1;
+ c->x86_cache_occ_scale = -1;
+ }
+ }
+
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
c->extended_cpuid_level = xlvl;
@@ -834,6 +858,20 @@ static void generic_identify(struct cpuinfo_x86 *c)
detect_nopl(c);
}
+static void x86_init_cache_qos(struct cpuinfo_x86 *c)
+{
+ /*
+ * The heavy lifting of max_rmid and cache_occ_scale are handled
+ * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu
+ * in case CQM bits really aren't there in this CPU.
+ */
+ if (c != &boot_cpu_data) {
+ boot_cpu_data.x86_cache_max_rmid =
+ min(boot_cpu_data.x86_cache_max_rmid,
+ c->x86_cache_max_rmid);
+ }
+}
+
/*
* This does the hard work of actually picking apart the CPU stuff...
*/
@@ -923,6 +961,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
init_hypervisor(c);
x86_init_rdrand(c);
+ x86_init_cache_qos(c);
/*
* Clear/Set all flags overriden by options, need do it
@@ -959,38 +998,37 @@ static void identify_cpu(struct cpuinfo_x86 *c)
#endif
}
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_IA32_EMULATION
-/* May not be __init: called during resume */
-static void syscall32_cpu_init(void)
-{
- /* Load these always in case some future AMD CPU supports
- SYSENTER from compat mode too. */
- wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
-
- wrmsrl(MSR_CSTAR, ia32_cstar_target);
-}
-#endif /* CONFIG_IA32_EMULATION */
-#endif /* CONFIG_X86_64 */
-
+/*
+ * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
+ * on 32-bit kernels:
+ */
#ifdef CONFIG_X86_32
void enable_sep_cpu(void)
{
- int cpu = get_cpu();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ struct tss_struct *tss;
+ int cpu;
- if (!boot_cpu_has(X86_FEATURE_SEP)) {
- put_cpu();
- return;
- }
+ cpu = get_cpu();
+ tss = &per_cpu(cpu_tss, cpu);
+
+ if (!boot_cpu_has(X86_FEATURE_SEP))
+ goto out;
+
+ /*
+ * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
+ * see the big comment in struct x86_hw_tss's definition.
+ */
tss->x86_tss.ss1 = __KERNEL_CS;
- tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss;
- wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
- wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
- wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
+ wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
+
+ wrmsr(MSR_IA32_SYSENTER_ESP,
+ (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
+ 0);
+
+ wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0);
+
+out:
put_cpu();
}
#endif
@@ -1118,7 +1156,7 @@ static __init int setup_disablecpuid(char *arg)
__setup("clearcpuid=", setup_disablecpuid);
DEFINE_PER_CPU(unsigned long, kernel_stack) =
- (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+ (unsigned long)&init_thread_union + THREAD_SIZE;
EXPORT_PER_CPU_SYMBOL(kernel_stack);
#ifdef CONFIG_X86_64
@@ -1130,8 +1168,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE) __visible;
/*
- * The following four percpu variables are hot. Align current_task to
- * cacheline size such that all four fall in the same cacheline.
+ * The following percpu variables are hot. Align current_task to
+ * cacheline size such that they fall in the same cacheline.
*/
DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
&init_task;
@@ -1171,10 +1209,23 @@ void syscall_init(void)
*/
wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
wrmsrl(MSR_LSTAR, system_call);
- wrmsrl(MSR_CSTAR, ignore_sysret);
#ifdef CONFIG_IA32_EMULATION
- syscall32_cpu_init();
+ wrmsrl(MSR_CSTAR, ia32_cstar_target);
+ /*
+ * This only works on Intel CPUs.
+ * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
+ * This does not cause SYSENTER to jump to the wrong location, because
+ * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
+ */
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+#else
+ wrmsrl(MSR_CSTAR, ignore_sysret);
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
#endif
/* Flags to clear on syscall */
@@ -1226,6 +1277,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+/*
+ * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
+ * the top of the kernel stack. Use an extra percpu variable to track the
+ * top of the kernel stack directly.
+ */
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
+ (unsigned long)&init_thread_union + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
#ifdef CONFIG_CC_STACKPROTECTOR
DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
#endif
@@ -1307,7 +1367,7 @@ void cpu_init(void)
*/
load_ucode_ap();
- t = &per_cpu(init_tss, cpu);
+ t = &per_cpu(cpu_tss, cpu);
oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
@@ -1391,7 +1451,7 @@ void cpu_init(void)
{
int cpu = smp_processor_id();
struct task_struct *curr = current;
- struct tss_struct *t = &per_cpu(init_tss, cpu);
+ struct tss_struct *t = &per_cpu(cpu_tss, cpu);
struct thread_struct *thread = &curr->thread;
wait_for_master_cpu(cpu);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 659643376dbf..edcb0e28c336 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -7,16 +7,14 @@
* Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
*/
-#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/device.h>
-#include <linux/compiler.h>
+#include <linux/cacheinfo.h>
#include <linux/cpu.h>
#include <linux/sched.h>
+#include <linux/sysfs.h>
#include <linux/pci.h>
#include <asm/processor.h>
-#include <linux/smp.h>
#include <asm/amd_nb.h>
#include <asm/smp.h>
@@ -116,10 +114,10 @@ static const struct _cache_table cache_table[] =
enum _cache_type {
- CACHE_TYPE_NULL = 0,
- CACHE_TYPE_DATA = 1,
- CACHE_TYPE_INST = 2,
- CACHE_TYPE_UNIFIED = 3
+ CTYPE_NULL = 0,
+ CTYPE_DATA = 1,
+ CTYPE_INST = 2,
+ CTYPE_UNIFIED = 3
};
union _cpuid4_leaf_eax {
@@ -159,11 +157,6 @@ struct _cpuid4_info_regs {
struct amd_northbridge *nb;
};
-struct _cpuid4_info {
- struct _cpuid4_info_regs base;
- DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
-};
-
unsigned short num_cache_leaves;
/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -220,6 +213,13 @@ static const unsigned short assocs[] = {
static const unsigned char levels[] = { 1, 1, 2, 3 };
static const unsigned char types[] = { 1, 2, 3, 3 };
+static const enum cache_type cache_type_map[] = {
+ [CTYPE_NULL] = CACHE_TYPE_NOCACHE,
+ [CTYPE_DATA] = CACHE_TYPE_DATA,
+ [CTYPE_INST] = CACHE_TYPE_INST,
+ [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED,
+};
+
static void
amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
union _cpuid4_leaf_ebx *ebx,
@@ -291,14 +291,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
(ebx->split.ways_of_associativity + 1) - 1;
}
-struct _cache_attr {
- struct attribute attr;
- ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int);
- ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count,
- unsigned int);
-};
-
#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
+
/*
* L3 cache descriptors
*/
@@ -325,20 +319,6 @@ static void amd_calc_l3_indices(struct amd_northbridge *nb)
l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
}
-static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
-{
- int node;
-
- /* only for L3, and not in virtualized environments */
- if (index < 3)
- return;
-
- node = amd_get_nb_id(smp_processor_id());
- this_leaf->nb = node_to_amd_nb(node);
- if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
- amd_calc_l3_indices(this_leaf->nb);
-}
-
/*
* check whether a slot used for disabling an L3 index is occupied.
* @l3: L3 cache descriptor
@@ -359,15 +339,13 @@ int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot)
return -1;
}
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
+static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf,
unsigned int slot)
{
int index;
+ struct amd_northbridge *nb = this_leaf->priv;
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- return -EINVAL;
-
- index = amd_get_l3_disable_slot(this_leaf->base.nb, slot);
+ index = amd_get_l3_disable_slot(nb, slot);
if (index >= 0)
return sprintf(buf, "%d\n", index);
@@ -376,9 +354,10 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
#define SHOW_CACHE_DISABLE(slot) \
static ssize_t \
-show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \
- unsigned int cpu) \
+cache_disable_##slot##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
{ \
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
return show_cache_disable(this_leaf, buf, slot); \
}
SHOW_CACHE_DISABLE(0)
@@ -446,25 +425,23 @@ int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot,
return 0;
}
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
- const char *buf, size_t count,
- unsigned int slot)
+static ssize_t store_cache_disable(struct cacheinfo *this_leaf,
+ const char *buf, size_t count,
+ unsigned int slot)
{
unsigned long val = 0;
int cpu, err = 0;
+ struct amd_northbridge *nb = this_leaf->priv;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- return -EINVAL;
-
- cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+ cpu = cpumask_first(&this_leaf->shared_cpu_map);
if (kstrtoul(buf, 10, &val) < 0)
return -EINVAL;
- err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val);
+ err = amd_set_l3_disable_slot(nb, cpu, slot, val);
if (err) {
if (err == -EEXIST)
pr_warning("L3 slot %d in use/index already disabled!\n",
@@ -476,41 +453,36 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
#define STORE_CACHE_DISABLE(slot) \
static ssize_t \
-store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
- const char *buf, size_t count, \
- unsigned int cpu) \
+cache_disable_##slot##_store(struct device *dev, \
+ struct device_attribute *attr, \
+ const char *buf, size_t count) \
{ \
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
return store_cache_disable(this_leaf, buf, count, slot); \
}
STORE_CACHE_DISABLE(0)
STORE_CACHE_DISABLE(1)
-static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
- show_cache_disable_0, store_cache_disable_0);
-static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
- show_cache_disable_1, store_cache_disable_1);
-
-static ssize_t
-show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu)
+static ssize_t subcaches_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- return -EINVAL;
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&this_leaf->shared_cpu_map);
return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
}
-static ssize_t
-store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
- unsigned int cpu)
+static ssize_t subcaches_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&this_leaf->shared_cpu_map);
unsigned long val;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- return -EINVAL;
-
if (kstrtoul(buf, 16, &val) < 0)
return -EINVAL;
@@ -520,9 +492,92 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
return count;
}
-static struct _cache_attr subcaches =
- __ATTR(subcaches, 0644, show_subcaches, store_subcaches);
+static DEVICE_ATTR_RW(cache_disable_0);
+static DEVICE_ATTR_RW(cache_disable_1);
+static DEVICE_ATTR_RW(subcaches);
+
+static umode_t
+cache_private_attrs_is_visible(struct kobject *kobj,
+ struct attribute *attr, int unused)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+ umode_t mode = attr->mode;
+
+ if (!this_leaf->priv)
+ return 0;
+
+ if ((attr == &dev_attr_subcaches.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ return mode;
+
+ if ((attr == &dev_attr_cache_disable_0.attr ||
+ attr == &dev_attr_cache_disable_1.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ return mode;
+
+ return 0;
+}
+
+static struct attribute_group cache_private_group = {
+ .is_visible = cache_private_attrs_is_visible,
+};
+
+static void init_amd_l3_attrs(void)
+{
+ int n = 1;
+ static struct attribute **amd_l3_attrs;
+
+ if (amd_l3_attrs) /* already initialized */
+ return;
+
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ n += 2;
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ n += 1;
+
+ amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL);
+ if (!amd_l3_attrs)
+ return;
+
+ n = 0;
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr;
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr;
+ }
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ amd_l3_attrs[n++] = &dev_attr_subcaches.attr;
+ cache_private_group.attrs = amd_l3_attrs;
+}
+
+const struct attribute_group *
+cache_get_priv_group(struct cacheinfo *this_leaf)
+{
+ struct amd_northbridge *nb = this_leaf->priv;
+
+ if (this_leaf->level < 3 || !nb)
+ return NULL;
+
+ if (nb && nb->l3_cache.indices)
+ init_amd_l3_attrs();
+
+ return &cache_private_group;
+}
+
+static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
+{
+ int node;
+
+ /* only for L3, and not in virtualized environments */
+ if (index < 3)
+ return;
+
+ node = amd_get_nb_id(smp_processor_id());
+ this_leaf->nb = node_to_amd_nb(node);
+ if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
+ amd_calc_l3_indices(this_leaf->nb);
+}
#else
#define amd_init_l3_cache(x, y)
#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */
@@ -546,7 +601,7 @@ cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf)
cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
}
- if (eax.split.type == CACHE_TYPE_NULL)
+ if (eax.split.type == CTYPE_NULL)
return -EIO; /* better error ? */
this_leaf->eax = eax;
@@ -575,7 +630,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
/* Do cpuid(op) loop to find out num_cache_leaves */
cpuid_count(op, i, &eax, &ebx, &ecx, &edx);
cache_eax.full = eax;
- } while (cache_eax.split.type != CACHE_TYPE_NULL);
+ } while (cache_eax.split.type != CTYPE_NULL);
return i;
}
@@ -626,9 +681,9 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
switch (this_leaf.eax.split.level) {
case 1:
- if (this_leaf.eax.split.type == CACHE_TYPE_DATA)
+ if (this_leaf.eax.split.type == CTYPE_DATA)
new_l1d = this_leaf.size/1024;
- else if (this_leaf.eax.split.type == CACHE_TYPE_INST)
+ else if (this_leaf.eax.split.type == CTYPE_INST)
new_l1i = this_leaf.size/1024;
break;
case 2:
@@ -747,55 +802,52 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
return l2;
}
-#ifdef CONFIG_SYSFS
-
-/* pointer to _cpuid4_info array (for each cache leaf) */
-static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
-#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
-
-#ifdef CONFIG_SMP
-
-static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
+static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
+ struct _cpuid4_info_regs *base)
{
- struct _cpuid4_info *this_leaf;
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *this_leaf;
int i, sibling;
if (cpu_has_topoext) {
unsigned int apicid, nshared, first, last;
- if (!per_cpu(ici_cpuid4_info, cpu))
- return 0;
-
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- nshared = this_leaf->base.eax.split.num_threads_sharing + 1;
+ this_leaf = this_cpu_ci->info_list + index;
+ nshared = base->eax.split.num_threads_sharing + 1;
apicid = cpu_data(cpu).apicid;
first = apicid - (apicid % nshared);
last = first + nshared - 1;
for_each_online_cpu(i) {
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
+ continue;
+
apicid = cpu_data(i).apicid;
if ((apicid < first) || (apicid > last))
continue;
- if (!per_cpu(ici_cpuid4_info, i))
- continue;
- this_leaf = CPUID4_INFO_IDX(i, index);
+
+ this_leaf = this_cpu_ci->info_list + index;
for_each_online_cpu(sibling) {
apicid = cpu_data(sibling).apicid;
if ((apicid < first) || (apicid > last))
continue;
- set_bit(sibling, this_leaf->shared_cpu_map);
+ cpumask_set_cpu(sibling,
+ &this_leaf->shared_cpu_map);
}
}
} else if (index == 3) {
for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
- if (!per_cpu(ici_cpuid4_info, i))
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
continue;
- this_leaf = CPUID4_INFO_IDX(i, index);
+ this_leaf = this_cpu_ci->info_list + index;
for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
if (!cpu_online(sibling))
continue;
- set_bit(sibling, this_leaf->shared_cpu_map);
+ cpumask_set_cpu(sibling,
+ &this_leaf->shared_cpu_map);
}
}
} else
@@ -804,457 +856,86 @@ static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
return 1;
}
-static void cache_shared_cpu_map_setup(unsigned int cpu, int index)
+static void __cache_cpumap_setup(unsigned int cpu, int index,
+ struct _cpuid4_info_regs *base)
{
- struct _cpuid4_info *this_leaf, *sibling_leaf;
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *this_leaf, *sibling_leaf;
unsigned long num_threads_sharing;
int index_msb, i;
struct cpuinfo_x86 *c = &cpu_data(cpu);
if (c->x86_vendor == X86_VENDOR_AMD) {
- if (cache_shared_amd_cpu_map_setup(cpu, index))
+ if (__cache_amd_cpumap_setup(cpu, index, base))
return;
}
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing;
+ this_leaf = this_cpu_ci->info_list + index;
+ num_threads_sharing = 1 + base->eax.split.num_threads_sharing;
+ cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map);
if (num_threads_sharing == 1)
- cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
- else {
- index_msb = get_count_order(num_threads_sharing);
-
- for_each_online_cpu(i) {
- if (cpu_data(i).apicid >> index_msb ==
- c->apicid >> index_msb) {
- cpumask_set_cpu(i,
- to_cpumask(this_leaf->shared_cpu_map));
- if (i != cpu && per_cpu(ici_cpuid4_info, i)) {
- sibling_leaf =
- CPUID4_INFO_IDX(i, index);
- cpumask_set_cpu(cpu, to_cpumask(
- sibling_leaf->shared_cpu_map));
- }
- }
- }
- }
-}
-static void cache_remove_shared_cpu_map(unsigned int cpu, int index)
-{
- struct _cpuid4_info *this_leaf, *sibling_leaf;
- int sibling;
-
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
- sibling_leaf = CPUID4_INFO_IDX(sibling, index);
- cpumask_clear_cpu(cpu,
- to_cpumask(sibling_leaf->shared_cpu_map));
- }
-}
-#else
-static void cache_shared_cpu_map_setup(unsigned int cpu, int index)
-{
-}
-
-static void cache_remove_shared_cpu_map(unsigned int cpu, int index)
-{
-}
-#endif
-
-static void free_cache_attributes(unsigned int cpu)
-{
- int i;
-
- for (i = 0; i < num_cache_leaves; i++)
- cache_remove_shared_cpu_map(cpu, i);
-
- kfree(per_cpu(ici_cpuid4_info, cpu));
- per_cpu(ici_cpuid4_info, cpu) = NULL;
-}
-
-static void get_cpu_leaves(void *_retval)
-{
- int j, *retval = _retval, cpu = smp_processor_id();
+ return;
- /* Do cpuid and store the results */
- for (j = 0; j < num_cache_leaves; j++) {
- struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j);
+ index_msb = get_count_order(num_threads_sharing);
- *retval = cpuid4_cache_lookup_regs(j, &this_leaf->base);
- if (unlikely(*retval < 0)) {
- int i;
+ for_each_online_cpu(i)
+ if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) {
+ struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
- for (i = 0; i < j; i++)
- cache_remove_shared_cpu_map(cpu, i);
- break;
+ if (i == cpu || !sib_cpu_ci->info_list)
+ continue;/* skip if itself or no cacheinfo */
+ sibling_leaf = sib_cpu_ci->info_list + index;
+ cpumask_set_cpu(i, &this_leaf->shared_cpu_map);
+ cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map);
}
- cache_shared_cpu_map_setup(cpu, j);
- }
}
-static int detect_cache_attributes(unsigned int cpu)
+static void ci_leaf_init(struct cacheinfo *this_leaf,
+ struct _cpuid4_info_regs *base)
{
- int retval;
-
- if (num_cache_leaves == 0)
- return -ENOENT;
-
- per_cpu(ici_cpuid4_info, cpu) = kzalloc(
- sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
- if (per_cpu(ici_cpuid4_info, cpu) == NULL)
- return -ENOMEM;
-
- smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
- if (retval) {
- kfree(per_cpu(ici_cpuid4_info, cpu));
- per_cpu(ici_cpuid4_info, cpu) = NULL;
- }
-
- return retval;
+ this_leaf->level = base->eax.split.level;
+ this_leaf->type = cache_type_map[base->eax.split.type];
+ this_leaf->coherency_line_size =
+ base->ebx.split.coherency_line_size + 1;
+ this_leaf->ways_of_associativity =
+ base->ebx.split.ways_of_associativity + 1;
+ this_leaf->size = base->size;
+ this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1;
+ this_leaf->physical_line_partition =
+ base->ebx.split.physical_line_partition + 1;
+ this_leaf->priv = base->nb;
}
-#include <linux/kobject.h>
-#include <linux/sysfs.h>
-#include <linux/cpu.h>
-
-/* pointer to kobject for cpuX/cache */
-static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
-
-struct _index_kobject {
- struct kobject kobj;
- unsigned int cpu;
- unsigned short index;
-};
-
-/* pointer to array of kobjects for cpuX/cache/indexY */
-static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
-#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
-
-#define show_one_plus(file_name, object, val) \
-static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \
- unsigned int cpu) \
-{ \
- return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
-}
-
-show_one_plus(level, base.eax.split.level, 0);
-show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1);
-show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1);
-show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1);
-show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1);
-
-static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
- unsigned int cpu)
-{
- return sprintf(buf, "%luK\n", this_leaf->base.size / 1024);
-}
-
-static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
- int type, char *buf)
-{
- const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
- int ret;
-
- if (type)
- ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl",
- cpumask_pr_args(mask));
- else
- ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb",
- cpumask_pr_args(mask));
- buf[ret++] = '\n';
- buf[ret] = '\0';
- return ret;
-}
-
-static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
- unsigned int cpu)
+static int __init_cache_level(unsigned int cpu)
{
- return show_shared_cpu_map_func(leaf, 0, buf);
-}
-
-static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf,
- unsigned int cpu)
-{
- return show_shared_cpu_map_func(leaf, 1, buf);
-}
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
-static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf,
- unsigned int cpu)
-{
- switch (this_leaf->base.eax.split.type) {
- case CACHE_TYPE_DATA:
- return sprintf(buf, "Data\n");
- case CACHE_TYPE_INST:
- return sprintf(buf, "Instruction\n");
- case CACHE_TYPE_UNIFIED:
- return sprintf(buf, "Unified\n");
- default:
- return sprintf(buf, "Unknown\n");
- }
-}
-
-#define to_object(k) container_of(k, struct _index_kobject, kobj)
-#define to_attr(a) container_of(a, struct _cache_attr, attr)
-
-#define define_one_ro(_name) \
-static struct _cache_attr _name = \
- __ATTR(_name, 0444, show_##_name, NULL)
-
-define_one_ro(level);
-define_one_ro(type);
-define_one_ro(coherency_line_size);
-define_one_ro(physical_line_partition);
-define_one_ro(ways_of_associativity);
-define_one_ro(number_of_sets);
-define_one_ro(size);
-define_one_ro(shared_cpu_map);
-define_one_ro(shared_cpu_list);
-
-static struct attribute *default_attrs[] = {
- &type.attr,
- &level.attr,
- &coherency_line_size.attr,
- &physical_line_partition.attr,
- &ways_of_associativity.attr,
- &number_of_sets.attr,
- &size.attr,
- &shared_cpu_map.attr,
- &shared_cpu_list.attr,
- NULL
-};
-
-#ifdef CONFIG_AMD_NB
-static struct attribute **amd_l3_attrs(void)
-{
- static struct attribute **attrs;
- int n;
-
- if (attrs)
- return attrs;
-
- n = ARRAY_SIZE(default_attrs);
-
- if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- n += 2;
-
- if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- n += 1;
-
- attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
- if (attrs == NULL)
- return attrs = default_attrs;
-
- for (n = 0; default_attrs[n]; n++)
- attrs[n] = default_attrs[n];
-
- if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
- attrs[n++] = &cache_disable_0.attr;
- attrs[n++] = &cache_disable_1.attr;
- }
-
- if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- attrs[n++] = &subcaches.attr;
-
- return attrs;
-}
-#endif
-
-static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
- struct _cache_attr *fattr = to_attr(attr);
- struct _index_kobject *this_leaf = to_object(kobj);
- ssize_t ret;
-
- ret = fattr->show ?
- fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
- buf, this_leaf->cpu) :
- 0;
- return ret;
-}
-
-static ssize_t store(struct kobject *kobj, struct attribute *attr,
- const char *buf, size_t count)
-{
- struct _cache_attr *fattr = to_attr(attr);
- struct _index_kobject *this_leaf = to_object(kobj);
- ssize_t ret;
-
- ret = fattr->store ?
- fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
- buf, count, this_leaf->cpu) :
- 0;
- return ret;
-}
-
-static const struct sysfs_ops sysfs_ops = {
- .show = show,
- .store = store,
-};
-
-static struct kobj_type ktype_cache = {
- .sysfs_ops = &sysfs_ops,
- .default_attrs = default_attrs,
-};
-
-static struct kobj_type ktype_percpu_entry = {
- .sysfs_ops = &sysfs_ops,
-};
-
-static void cpuid4_cache_sysfs_exit(unsigned int cpu)
-{
- kfree(per_cpu(ici_cache_kobject, cpu));
- kfree(per_cpu(ici_index_kobject, cpu));
- per_cpu(ici_cache_kobject, cpu) = NULL;
- per_cpu(ici_index_kobject, cpu) = NULL;
- free_cache_attributes(cpu);
-}
-
-static int cpuid4_cache_sysfs_init(unsigned int cpu)
-{
- int err;
-
- if (num_cache_leaves == 0)
+ if (!num_cache_leaves)
return -ENOENT;
-
- err = detect_cache_attributes(cpu);
- if (err)
- return err;
-
- /* Allocate all required memory */
- per_cpu(ici_cache_kobject, cpu) =
- kzalloc(sizeof(struct kobject), GFP_KERNEL);
- if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL))
- goto err_out;
-
- per_cpu(ici_index_kobject, cpu) = kzalloc(
- sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL);
- if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL))
- goto err_out;
-
+ if (!this_cpu_ci)
+ return -EINVAL;
+ this_cpu_ci->num_levels = 3;
+ this_cpu_ci->num_leaves = num_cache_leaves;
return 0;
-
-err_out:
- cpuid4_cache_sysfs_exit(cpu);
- return -ENOMEM;
}
-static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
-
-/* Add/Remove cache interface for CPU device */
-static int cache_add_dev(struct device *dev)
+static int __populate_cache_leaves(unsigned int cpu)
{
- unsigned int cpu = dev->id;
- unsigned long i, j;
- struct _index_kobject *this_object;
- struct _cpuid4_info *this_leaf;
- int retval;
-
- retval = cpuid4_cache_sysfs_init(cpu);
- if (unlikely(retval < 0))
- return retval;
-
- retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
- &ktype_percpu_entry,
- &dev->kobj, "%s", "cache");
- if (retval < 0) {
- cpuid4_cache_sysfs_exit(cpu);
- return retval;
- }
+ unsigned int idx, ret;
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *this_leaf = this_cpu_ci->info_list;
+ struct _cpuid4_info_regs id4_regs = {};
- for (i = 0; i < num_cache_leaves; i++) {
- this_object = INDEX_KOBJECT_PTR(cpu, i);
- this_object->cpu = cpu;
- this_object->index = i;
-
- this_leaf = CPUID4_INFO_IDX(cpu, i);
-
- ktype_cache.default_attrs = default_attrs;
-#ifdef CONFIG_AMD_NB
- if (this_leaf->base.nb)
- ktype_cache.default_attrs = amd_l3_attrs();
-#endif
- retval = kobject_init_and_add(&(this_object->kobj),
- &ktype_cache,
- per_cpu(ici_cache_kobject, cpu),
- "index%1lu", i);
- if (unlikely(retval)) {
- for (j = 0; j < i; j++)
- kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj));
- kobject_put(per_cpu(ici_cache_kobject, cpu));
- cpuid4_cache_sysfs_exit(cpu);
- return retval;
- }
- kobject_uevent(&(this_object->kobj), KOBJ_ADD);
+ for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) {
+ ret = cpuid4_cache_lookup_regs(idx, &id4_regs);
+ if (ret)
+ return ret;
+ ci_leaf_init(this_leaf++, &id4_regs);
+ __cache_cpumap_setup(cpu, idx, &id4_regs);
}
- cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
-
- kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD);
return 0;
}
-static void cache_remove_dev(struct device *dev)
-{
- unsigned int cpu = dev->id;
- unsigned long i;
-
- if (per_cpu(ici_cpuid4_info, cpu) == NULL)
- return;
- if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
- return;
- cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
-
- for (i = 0; i < num_cache_leaves; i++)
- kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj));
- kobject_put(per_cpu(ici_cache_kobject, cpu));
- cpuid4_cache_sysfs_exit(cpu);
-}
-
-static int cacheinfo_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- struct device *dev;
-
- dev = get_cpu_device(cpu);
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- cache_add_dev(dev);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- cache_remove_dev(dev);
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block cacheinfo_cpu_notifier = {
- .notifier_call = cacheinfo_cpu_callback,
-};
-
-static int __init cache_sysfs_init(void)
-{
- int i, err = 0;
-
- if (num_cache_leaves == 0)
- return 0;
-
- cpu_notifier_register_begin();
- for_each_online_cpu(i) {
- struct device *dev = get_cpu_device(i);
-
- err = cache_add_dev(dev);
- if (err)
- goto out;
- }
- __register_hotcpu_notifier(&cacheinfo_cpu_notifier);
-
-out:
- cpu_notifier_register_done();
- return err;
-}
-
-device_initcall(cache_sysfs_init);
-
-#endif
+DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level)
+DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves)
diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h
new file mode 100644
index 000000000000..1c338b0eba05
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_pt.h
@@ -0,0 +1,131 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#ifndef __INTEL_PT_H__
+#define __INTEL_PT_H__
+
+/*
+ * Single-entry ToPA: when this close to region boundary, switch
+ * buffers to avoid losing data.
+ */
+#define TOPA_PMI_MARGIN 512
+
+/*
+ * Table of Physical Addresses bits
+ */
+enum topa_sz {
+ TOPA_4K = 0,
+ TOPA_8K,
+ TOPA_16K,
+ TOPA_32K,
+ TOPA_64K,
+ TOPA_128K,
+ TOPA_256K,
+ TOPA_512K,
+ TOPA_1MB,
+ TOPA_2MB,
+ TOPA_4MB,
+ TOPA_8MB,
+ TOPA_16MB,
+ TOPA_32MB,
+ TOPA_64MB,
+ TOPA_128MB,
+ TOPA_SZ_END,
+};
+
+static inline unsigned int sizes(enum topa_sz tsz)
+{
+ return 1 << (tsz + 12);
+};
+
+struct topa_entry {
+ u64 end : 1;
+ u64 rsvd0 : 1;
+ u64 intr : 1;
+ u64 rsvd1 : 1;
+ u64 stop : 1;
+ u64 rsvd2 : 1;
+ u64 size : 4;
+ u64 rsvd3 : 2;
+ u64 base : 36;
+ u64 rsvd4 : 16;
+};
+
+#define TOPA_SHIFT 12
+#define PT_CPUID_LEAVES 2
+
+enum pt_capabilities {
+ PT_CAP_max_subleaf = 0,
+ PT_CAP_cr3_filtering,
+ PT_CAP_topa_output,
+ PT_CAP_topa_multiple_entries,
+ PT_CAP_payloads_lip,
+};
+
+struct pt_pmu {
+ struct pmu pmu;
+ u32 caps[4 * PT_CPUID_LEAVES];
+};
+
+/**
+ * struct pt_buffer - buffer configuration; one buffer per task_struct or
+ * cpu, depending on perf event configuration
+ * @cpu: cpu for per-cpu allocation
+ * @tables: list of ToPA tables in this buffer
+ * @first: shorthand for first topa table
+ * @last: shorthand for last topa table
+ * @cur: current topa table
+ * @nr_pages: buffer size in pages
+ * @cur_idx: current output region's index within @cur table
+ * @output_off: offset within the current output region
+ * @data_size: running total of the amount of data in this buffer
+ * @lost: if data was lost/truncated
+ * @head: logical write offset inside the buffer
+ * @snapshot: if this is for a snapshot/overwrite counter
+ * @stop_pos: STOP topa entry in the buffer
+ * @intr_pos: INT topa entry in the buffer
+ * @data_pages: array of pages from perf
+ * @topa_index: table of topa entries indexed by page offset
+ */
+struct pt_buffer {
+ int cpu;
+ struct list_head tables;
+ struct topa *first, *last, *cur;
+ unsigned int cur_idx;
+ size_t output_off;
+ unsigned long nr_pages;
+ local_t data_size;
+ local_t lost;
+ local64_t head;
+ bool snapshot;
+ unsigned long stop_pos, intr_pos;
+ void **data_pages;
+ struct topa_entry *topa_index[0];
+};
+
+/**
+ * struct pt - per-cpu pt context
+ * @handle: perf output handle
+ * @handle_nmi: do handle PT PMI on this cpu, there's an active event
+ */
+struct pt {
+ struct perf_output_handle handle;
+ int handle_nmi;
+};
+
+#endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 10b46906767f..fe32074b865b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -14,6 +14,7 @@ enum severity_level {
};
#define ATTR_LEN 16
+#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
/* One object for each MCE bank, shared by all CPUs */
struct mce_bank {
@@ -23,20 +24,20 @@ struct mce_bank {
char attrname[ATTR_LEN]; /* attribute name */
};
-int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
+extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);
extern struct mce_bank *mce_banks;
extern mce_banks_t mce_banks_ce_disabled;
#ifdef CONFIG_X86_MCE_INTEL
-unsigned long mce_intel_adjust_timer(unsigned long interval);
-void mce_intel_cmci_poll(void);
+unsigned long cmci_intel_adjust_timer(unsigned long interval);
+bool mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
void cmci_disable_bank(int bank);
#else
-# define mce_intel_adjust_timer mce_adjust_timer_default
-static inline void mce_intel_cmci_poll(void) { }
+# define cmci_intel_adjust_timer mce_adjust_timer_default
+static inline bool mce_intel_cmci_poll(void) { return false; }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
static inline void cmci_disable_bank(int bank) { }
#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 8bb433043a7f..9c682c222071 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -186,7 +186,61 @@ static int error_context(struct mce *m)
return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
}
-int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
+/*
+ * See AMD Error Scope Hierarchy table in a newer BKDG. For example
+ * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
+ */
+static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
+{
+ enum context ctx = error_context(m);
+
+ /* Processor Context Corrupt, no need to fumble too much, die! */
+ if (m->status & MCI_STATUS_PCC)
+ return MCE_PANIC_SEVERITY;
+
+ if (m->status & MCI_STATUS_UC) {
+
+ /*
+ * On older systems where overflow_recov flag is not present, we
+ * should simply panic if an error overflow occurs. If
+ * overflow_recov flag is present and set, then software can try
+ * to at least kill process to prolong system operation.
+ */
+ if (mce_flags.overflow_recov) {
+ /* software can try to contain */
+ if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL))
+ return MCE_PANIC_SEVERITY;
+
+ /* kill current process */
+ return MCE_AR_SEVERITY;
+ } else {
+ /* at least one error was not logged */
+ if (m->status & MCI_STATUS_OVER)
+ return MCE_PANIC_SEVERITY;
+ }
+
+ /*
+ * For any other case, return MCE_UC_SEVERITY so that we log the
+ * error and exit #MC handler.
+ */
+ return MCE_UC_SEVERITY;
+ }
+
+ /*
+ * deferred error: poll handler catches these and adds to mce_ring so
+ * memory-failure can take recovery actions.
+ */
+ if (m->status & MCI_STATUS_DEFERRED)
+ return MCE_DEFERRED_SEVERITY;
+
+ /*
+ * corrected error: poll handler catches these and passes responsibility
+ * of decoding the error to EDAC
+ */
+ return MCE_KEEP_SEVERITY;
+}
+
+static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
{
enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
enum context ctx = error_context(m);
@@ -216,6 +270,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
}
}
+/* Default to mce_severity_intel */
+int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
+ mce_severity_intel;
+
+void __init mcheck_vendor_init_severity(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ mce_severity = mce_severity_amd;
+}
+
#ifdef CONFIG_DEBUG_FS
static void *s_start(struct seq_file *f, loff_t *pos)
{
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3c036cb4a370..e535533d5ab8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -60,11 +60,12 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
-#define SPINUNIT 100 /* 100ns */
+#define SPINUNIT 100 /* 100ns */
DEFINE_PER_CPU(unsigned, mce_exception_count);
struct mce_bank *mce_banks __read_mostly;
+struct mce_vendor_flags mce_flags __read_mostly;
struct mca_config mca_cfg __read_mostly = {
.bootlog = -1,
@@ -89,9 +90,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
-/* CMCI storm detection filter */
-static DEFINE_PER_CPU(unsigned long, mce_polled_error);
-
/*
* MCA banks polled by the period polling timer for corrected events.
* With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
@@ -622,8 +620,9 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
* is already totally * confused. In this case it's likely it will
* not fully execute the machine check handler either.
*/
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
+ bool error_logged = false;
struct mce m;
int severity;
int i;
@@ -646,7 +645,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (!(m.status & MCI_STATUS_VAL))
continue;
- this_cpu_write(mce_polled_error, 1);
+
/*
* Uncorrected or signalled events are handled by the exception
* handler when it is enabled, so don't process those here.
@@ -679,8 +678,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
- if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
+ if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
+ error_logged = true;
mce_log(&m);
+ }
/*
* Clear state for this bank.
@@ -694,6 +695,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
*/
sync_core();
+
+ return error_logged;
}
EXPORT_SYMBOL_GPL(machine_check_poll);
@@ -813,7 +816,7 @@ static void mce_reign(void)
* other CPUs.
*/
if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
- mce_panic("Fatal Machine check", m, msg);
+ mce_panic("Fatal machine check", m, msg);
/*
* For UC somewhere we let the CPU who detects it handle it.
@@ -826,7 +829,7 @@ static void mce_reign(void)
* source or one CPU is hung. Panic.
*/
if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
- mce_panic("Machine check from unknown source", NULL, NULL);
+ mce_panic("Fatal machine check from unknown source", NULL, NULL);
/*
* Now clear all the mces_seen so that they don't reappear on
@@ -1258,7 +1261,7 @@ void mce_log_therm_throt_event(__u64 status)
* poller finds an MCE, poll 2x faster. When the poller finds no more
* errors, poll 2x slower (up to check_interval seconds).
*/
-static unsigned long check_interval = 5 * 60; /* 5 minutes */
+static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
@@ -1268,49 +1271,57 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
return interval;
}
-static unsigned long (*mce_adjust_timer)(unsigned long interval) =
- mce_adjust_timer_default;
+static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
-static int cmc_error_seen(void)
+static void __restart_timer(struct timer_list *t, unsigned long interval)
{
- unsigned long *v = this_cpu_ptr(&mce_polled_error);
+ unsigned long when = jiffies + interval;
+ unsigned long flags;
+
+ local_irq_save(flags);
- return test_and_clear_bit(0, v);
+ if (timer_pending(t)) {
+ if (time_before(when, t->expires))
+ mod_timer_pinned(t, when);
+ } else {
+ t->expires = round_jiffies(when);
+ add_timer_on(t, smp_processor_id());
+ }
+
+ local_irq_restore(flags);
}
static void mce_timer_fn(unsigned long data)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
+ int cpu = smp_processor_id();
unsigned long iv;
- int notify;
- WARN_ON(smp_processor_id() != data);
+ WARN_ON(cpu != data);
+
+ iv = __this_cpu_read(mce_next_interval);
if (mce_available(this_cpu_ptr(&cpu_info))) {
- machine_check_poll(MCP_TIMESTAMP,
- this_cpu_ptr(&mce_poll_banks));
- mce_intel_cmci_poll();
+ machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
+
+ if (mce_intel_cmci_poll()) {
+ iv = mce_adjust_timer(iv);
+ goto done;
+ }
}
/*
- * Alert userspace if needed. If we logged an MCE, reduce the
- * polling interval, otherwise increase the polling interval.
+ * Alert userspace if needed. If we logged an MCE, reduce the polling
+ * interval, otherwise increase the polling interval.
*/
- iv = __this_cpu_read(mce_next_interval);
- notify = mce_notify_irq();
- notify |= cmc_error_seen();
- if (notify) {
+ if (mce_notify_irq())
iv = max(iv / 2, (unsigned long) HZ/100);
- } else {
+ else
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
- iv = mce_adjust_timer(iv);
- }
+
+done:
__this_cpu_write(mce_next_interval, iv);
- /* Might have become 0 after CMCI storm subsided */
- if (iv) {
- t->expires = jiffies + iv;
- add_timer_on(t, smp_processor_id());
- }
+ __restart_timer(t, iv);
}
/*
@@ -1319,16 +1330,10 @@ static void mce_timer_fn(unsigned long data)
void mce_timer_kick(unsigned long interval)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
- unsigned long when = jiffies + interval;
unsigned long iv = __this_cpu_read(mce_next_interval);
- if (timer_pending(t)) {
- if (time_before(when, t->expires))
- mod_timer_pinned(t, when);
- } else {
- t->expires = round_jiffies(when);
- add_timer_on(t, smp_processor_id());
- }
+ __restart_timer(t, interval);
+
if (interval < iv)
__this_cpu_write(mce_next_interval, interval);
}
@@ -1525,45 +1530,46 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
* Various K7s with broken bank 0 around. Always disable
* by default.
*/
- if (c->x86 == 6 && cfg->banks > 0)
+ if (c->x86 == 6 && cfg->banks > 0)
mce_banks[0].ctl = 0;
- /*
- * Turn off MC4_MISC thresholding banks on those models since
- * they're not supported there.
- */
- if (c->x86 == 0x15 &&
- (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
- int i;
- u64 val, hwcr;
- bool need_toggle;
- u32 msrs[] = {
+ /*
+ * overflow_recov is supported for F15h Models 00h-0fh
+ * even though we don't have a CPUID bit for it.
+ */
+ if (c->x86 == 0x15 && c->x86_model <= 0xf)
+ mce_flags.overflow_recov = 1;
+
+ /*
+ * Turn off MC4_MISC thresholding banks on those models since
+ * they're not supported there.
+ */
+ if (c->x86 == 0x15 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
+ int i;
+ u64 hwcr;
+ bool need_toggle;
+ u32 msrs[] = {
0x00000413, /* MC4_MISC0 */
0xc0000408, /* MC4_MISC1 */
- };
+ };
- rdmsrl(MSR_K7_HWCR, hwcr);
+ rdmsrl(MSR_K7_HWCR, hwcr);
- /* McStatusWrEn has to be set */
- need_toggle = !(hwcr & BIT(18));
+ /* McStatusWrEn has to be set */
+ need_toggle = !(hwcr & BIT(18));
- if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
- for (i = 0; i < ARRAY_SIZE(msrs); i++) {
- rdmsrl(msrs[i], val);
+ /* Clear CntP bit safely */
+ for (i = 0; i < ARRAY_SIZE(msrs); i++)
+ msr_clear_bit(msrs[i], 62);
- /* CntP bit set? */
- if (val & BIT_64(62)) {
- val &= ~BIT_64(62);
- wrmsrl(msrs[i], val);
- }
- }
-
- /* restore old settings */
- if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr);
- }
+ /* restore old settings */
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr);
+ }
}
if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1629,10 +1635,11 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
- mce_adjust_timer = mce_intel_adjust_timer;
+ mce_adjust_timer = cmci_intel_adjust_timer;
break;
case X86_VENDOR_AMD:
mce_amd_feature_init(c);
+ mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
break;
default:
break;
@@ -2017,6 +2024,7 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
mcheck_intel_therm_init();
+ mcheck_vendor_init_severity();
return 0;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f1c3769bbd64..55ad9b37cae8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -79,7 +79,7 @@ static inline bool is_shared_bank(int bank)
return (bank == 4);
}
-static const char * const bank4_names(struct threshold_block *b)
+static const char *bank4_names(const struct threshold_block *b)
{
switch (b->address) {
/* MSR4_MISC0 */
@@ -250,6 +250,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
if (!b.interrupt_capable)
goto init;
+ b.interrupt_enable = 1;
new = (high & MASK_LVTOFF_HI) >> 20;
offset = setup_APIC_mce(offset, new);
@@ -322,6 +323,8 @@ static void amd_threshold_interrupt(void)
log:
mce_setup(&m);
rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status);
+ if (!(m.status & MCI_STATUS_VAL))
+ return;
m.misc = ((u64)high << 32) | low;
m.bank = bank;
mce_log(&m);
@@ -497,10 +500,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
b->interrupt_capable = lvt_interrupt_supported(bank, high);
b->threshold_limit = THRESHOLD_MAX;
- if (b->interrupt_capable)
+ if (b->interrupt_capable) {
threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
- else
+ b->interrupt_enable = 1;
+ } else {
threshold_ktype.default_attrs[2] = NULL;
+ }
INIT_LIST_HEAD(&b->miscj);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index b3c97bafc123..b4a41cf030ed 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -39,6 +39,15 @@
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
/*
+ * CMCI storm detection backoff counter
+ *
+ * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
+ * encountered an error. If not, we decrement it by one. We signal the end of
+ * the CMCI storm when it reaches 0.
+ */
+static DEFINE_PER_CPU(int, cmci_backoff_cnt);
+
+/*
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
@@ -46,7 +55,7 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
#define CMCI_THRESHOLD 1
#define CMCI_POLL_INTERVAL (30 * HZ)
-#define CMCI_STORM_INTERVAL (1 * HZ)
+#define CMCI_STORM_INTERVAL (HZ)
#define CMCI_STORM_THRESHOLD 15
static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
@@ -82,11 +91,21 @@ static int cmci_supported(int *banks)
return !!(cap & MCG_CMCI_P);
}
-void mce_intel_cmci_poll(void)
+bool mce_intel_cmci_poll(void)
{
if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
- return;
- machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+ return false;
+
+ /*
+ * Reset the counter if we've logged an error in the last poll
+ * during the storm.
+ */
+ if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)))
+ this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
+ else
+ this_cpu_dec(cmci_backoff_cnt);
+
+ return true;
}
void mce_intel_hcpu_update(unsigned long cpu)
@@ -97,31 +116,32 @@ void mce_intel_hcpu_update(unsigned long cpu)
per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
}
-unsigned long mce_intel_adjust_timer(unsigned long interval)
+unsigned long cmci_intel_adjust_timer(unsigned long interval)
{
- int r;
-
- if (interval < CMCI_POLL_INTERVAL)
- return interval;
+ if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
+ (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
+ mce_notify_irq();
+ return CMCI_STORM_INTERVAL;
+ }
switch (__this_cpu_read(cmci_storm_state)) {
case CMCI_STORM_ACTIVE:
+
/*
* We switch back to interrupt mode once the poll timer has
- * silenced itself. That means no events recorded and the
- * timer interval is back to our poll interval.
+ * silenced itself. That means no events recorded and the timer
+ * interval is back to our poll interval.
*/
__this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
- r = atomic_sub_return(1, &cmci_storm_on_cpus);
- if (r == 0)
+ if (!atomic_sub_return(1, &cmci_storm_on_cpus))
pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+
/* FALLTHROUGH */
case CMCI_STORM_SUBSIDED:
/*
- * We wait for all cpus to go back to SUBSIDED
- * state. When that happens we switch back to
- * interrupt mode.
+ * We wait for all CPUs to go back to SUBSIDED state. When that
+ * happens we switch back to interrupt mode.
*/
if (!atomic_read(&cmci_storm_on_cpus)) {
__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
@@ -130,10 +150,8 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
}
return CMCI_POLL_INTERVAL;
default:
- /*
- * We have shiny weather. Let the poll do whatever it
- * thinks.
- */
+
+ /* We have shiny weather. Let the poll do whatever it thinks. */
return interval;
}
}
@@ -178,7 +196,8 @@ static bool cmci_storm_detect(void)
cmci_storm_disable_banks();
__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
r = atomic_add_return(1, &cmci_storm_on_cpus);
- mce_timer_kick(CMCI_POLL_INTERVAL);
+ mce_timer_kick(CMCI_STORM_INTERVAL);
+ this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
if (r == 1)
pr_notice("CMCI storm detected: switching to poll mode\n");
@@ -195,6 +214,7 @@ static void intel_threshold_interrupt(void)
{
if (cmci_storm_detect())
return;
+
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
mce_notify_irq();
}
@@ -286,6 +306,7 @@ void cmci_recheck(void)
if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
return;
+
local_irq_save(flags);
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
local_irq_restore(flags);
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index bfbbe6195e2d..12829c3ced3c 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -21,7 +21,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/firmware.h>
-#include <linux/pci_ids.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/kernel.h>
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index d45df4bd16ab..a413a69cbd74 100644
--- a/arch/x86/kernel/cpu/microcode/core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -23,57 +23,6 @@
#include <asm/processor.h>
#include <asm/cmdline.h>
-#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
-#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
-#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
-#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
-#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
-#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
-#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
-
-#define CPUID_IS(a, b, c, ebx, ecx, edx) \
- (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
-
-/*
- * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
- * x86_vendor() gets vendor id for BSP.
- *
- * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
- * coding, we still use x86_vendor() to get vendor id for AP.
- *
- * x86_vendor() gets vendor information directly through cpuid.
- */
-static int x86_vendor(void)
-{
- u32 eax = 0x00000000;
- u32 ebx, ecx = 0, edx;
-
- native_cpuid(&eax, &ebx, &ecx, &edx);
-
- if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
- return X86_VENDOR_INTEL;
-
- if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
- return X86_VENDOR_AMD;
-
- return X86_VENDOR_UNKNOWN;
-}
-
-static int x86_family(void)
-{
- u32 eax = 0x00000001;
- u32 ebx, ecx = 0, edx;
- int x86;
-
- native_cpuid(&eax, &ebx, &ecx, &edx);
-
- x86 = (eax >> 8) & 0xf;
- if (x86 == 15)
- x86 += (eax >> 20) & 0xff;
-
- return x86;
-}
-
static bool __init check_loader_disabled_bsp(void)
{
#ifdef CONFIG_X86_32
@@ -96,7 +45,7 @@ static bool __init check_loader_disabled_bsp(void)
void __init load_ucode_bsp(void)
{
- int vendor, x86;
+ int vendor, family;
if (check_loader_disabled_bsp())
return;
@@ -105,15 +54,15 @@ void __init load_ucode_bsp(void)
return;
vendor = x86_vendor();
- x86 = x86_family();
+ family = x86_family();
switch (vendor) {
case X86_VENDOR_INTEL:
- if (x86 >= 6)
+ if (family >= 6)
load_ucode_intel_bsp();
break;
case X86_VENDOR_AMD:
- if (x86 >= 0x10)
+ if (family >= 0x10)
load_ucode_amd_bsp();
break;
default:
@@ -132,7 +81,7 @@ static bool check_loader_disabled_ap(void)
void load_ucode_ap(void)
{
- int vendor, x86;
+ int vendor, family;
if (check_loader_disabled_ap())
return;
@@ -141,15 +90,15 @@ void load_ucode_ap(void)
return;
vendor = x86_vendor();
- x86 = x86_family();
+ family = x86_family();
switch (vendor) {
case X86_VENDOR_INTEL:
- if (x86 >= 6)
+ if (family >= 6)
load_ucode_intel_ap();
break;
case X86_VENDOR_AMD:
- if (x86 >= 0x10)
+ if (family >= 0x10)
load_ucode_amd_ap();
break;
default:
@@ -179,18 +128,18 @@ int __init save_microcode_in_initrd(void)
void reload_early_microcode(void)
{
- int vendor, x86;
+ int vendor, family;
vendor = x86_vendor();
- x86 = x86_family();
+ family = x86_family();
switch (vendor) {
case X86_VENDOR_INTEL:
- if (x86 >= 6)
+ if (family >= 6)
reload_ucode_intel();
break;
case X86_VENDOR_AMD:
- if (x86 >= 0x10)
+ if (family >= 0x10)
reload_ucode_amd();
break;
default:
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 746e7fd08aad..a41beadb3db9 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -124,7 +124,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
cpf = cpu_sig.pf;
crev = cpu_sig.rev;
- return get_matching_microcode(csig, cpf, mc_intel, crev);
+ return get_matching_microcode(csig, cpf, crev, mc_intel);
}
static int apply_microcode_intel(int cpu)
@@ -226,7 +226,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
csig = uci->cpu_sig.sig;
cpf = uci->cpu_sig.pf;
- if (get_matching_microcode(csig, cpf, mc, new_rev)) {
+ if (get_matching_microcode(csig, cpf, new_rev, mc)) {
vfree(new_mc);
new_rev = mc_header.rev;
new_mc = mc;
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 420eb933189c..2f49ab4ac0ae 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -16,6 +16,14 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
+
+/*
+ * This needs to be before all headers so that pr_debug in printk.h doesn't turn
+ * printk calls into no_printk().
+ *
+ *#define DEBUG
+ */
+
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/slab.h>
@@ -28,6 +36,9 @@
#include <asm/tlbflush.h>
#include <asm/setup.h>
+#undef pr_fmt
+#define pr_fmt(fmt) "microcode: " fmt
+
static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
static struct mc_saved_data {
unsigned int mc_saved_count;
@@ -35,50 +46,45 @@ static struct mc_saved_data {
} mc_saved_data;
static enum ucode_state
-generic_load_microcode_early(struct microcode_intel **mc_saved_p,
- unsigned int mc_saved_count,
- struct ucode_cpu_info *uci)
+load_microcode_early(struct microcode_intel **saved,
+ unsigned int num_saved, struct ucode_cpu_info *uci)
{
struct microcode_intel *ucode_ptr, *new_mc = NULL;
- int new_rev = uci->cpu_sig.rev;
- enum ucode_state state = UCODE_OK;
- unsigned int mc_size;
- struct microcode_header_intel *mc_header;
- unsigned int csig = uci->cpu_sig.sig;
- unsigned int cpf = uci->cpu_sig.pf;
- int i;
+ struct microcode_header_intel *mc_hdr;
+ int new_rev, ret, i;
- for (i = 0; i < mc_saved_count; i++) {
- ucode_ptr = mc_saved_p[i];
+ new_rev = uci->cpu_sig.rev;
- mc_header = (struct microcode_header_intel *)ucode_ptr;
- mc_size = get_totalsize(mc_header);
- if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) {
- new_rev = mc_header->rev;
- new_mc = ucode_ptr;
- }
- }
+ for (i = 0; i < num_saved; i++) {
+ ucode_ptr = saved[i];
+ mc_hdr = (struct microcode_header_intel *)ucode_ptr;
- if (!new_mc) {
- state = UCODE_NFOUND;
- goto out;
+ ret = get_matching_microcode(uci->cpu_sig.sig,
+ uci->cpu_sig.pf,
+ new_rev,
+ ucode_ptr);
+ if (!ret)
+ continue;
+
+ new_rev = mc_hdr->rev;
+ new_mc = ucode_ptr;
}
+ if (!new_mc)
+ return UCODE_NFOUND;
+
uci->mc = (struct microcode_intel *)new_mc;
-out:
- return state;
+ return UCODE_OK;
}
-static void
-microcode_pointer(struct microcode_intel **mc_saved,
- unsigned long *mc_saved_in_initrd,
- unsigned long initrd_start, int mc_saved_count)
+static inline void
+copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd,
+ unsigned long off, int num_saved)
{
int i;
- for (i = 0; i < mc_saved_count; i++)
- mc_saved[i] = (struct microcode_intel *)
- (mc_saved_in_initrd[i] + initrd_start);
+ for (i = 0; i < num_saved; i++)
+ mc_saved[i] = (struct microcode_intel *)(initrd[i] + off);
}
#ifdef CONFIG_X86_32
@@ -102,55 +108,27 @@ microcode_phys(struct microcode_intel **mc_saved_tmp,
#endif
static enum ucode_state
-load_microcode(struct mc_saved_data *mc_saved_data,
- unsigned long *mc_saved_in_initrd,
- unsigned long initrd_start,
- struct ucode_cpu_info *uci)
+load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
+ unsigned long initrd_start, struct ucode_cpu_info *uci)
{
struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
unsigned int count = mc_saved_data->mc_saved_count;
if (!mc_saved_data->mc_saved) {
- microcode_pointer(mc_saved_tmp, mc_saved_in_initrd,
- initrd_start, count);
+ copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count);
- return generic_load_microcode_early(mc_saved_tmp, count, uci);
+ return load_microcode_early(mc_saved_tmp, count, uci);
} else {
#ifdef CONFIG_X86_32
microcode_phys(mc_saved_tmp, mc_saved_data);
- return generic_load_microcode_early(mc_saved_tmp, count, uci);
+ return load_microcode_early(mc_saved_tmp, count, uci);
#else
- return generic_load_microcode_early(mc_saved_data->mc_saved,
+ return load_microcode_early(mc_saved_data->mc_saved,
count, uci);
#endif
}
}
-static u8 get_x86_family(unsigned long sig)
-{
- u8 x86;
-
- x86 = (sig >> 8) & 0xf;
-
- if (x86 == 0xf)
- x86 += (sig >> 20) & 0xff;
-
- return x86;
-}
-
-static u8 get_x86_model(unsigned long sig)
-{
- u8 x86, x86_model;
-
- x86 = get_x86_family(sig);
- x86_model = (sig >> 4) & 0xf;
-
- if (x86 == 0x6 || x86 == 0xf)
- x86_model += ((sig >> 16) & 0xf) << 4;
-
- return x86_model;
-}
-
/*
* Given CPU signature and a microcode patch, this function finds if the
* microcode patch has matching family and model with the CPU.
@@ -159,42 +137,40 @@ static enum ucode_state
matching_model_microcode(struct microcode_header_intel *mc_header,
unsigned long sig)
{
- u8 x86, x86_model;
- u8 x86_ucode, x86_model_ucode;
+ unsigned int fam, model;
+ unsigned int fam_ucode, model_ucode;
struct extended_sigtable *ext_header;
unsigned long total_size = get_totalsize(mc_header);
unsigned long data_size = get_datasize(mc_header);
int ext_sigcount, i;
struct extended_signature *ext_sig;
- x86 = get_x86_family(sig);
- x86_model = get_x86_model(sig);
+ fam = __x86_family(sig);
+ model = x86_model(sig);
- x86_ucode = get_x86_family(mc_header->sig);
- x86_model_ucode = get_x86_model(mc_header->sig);
+ fam_ucode = __x86_family(mc_header->sig);
+ model_ucode = x86_model(mc_header->sig);
- if (x86 == x86_ucode && x86_model == x86_model_ucode)
+ if (fam == fam_ucode && model == model_ucode)
return UCODE_OK;
/* Look for ext. headers: */
if (total_size <= data_size + MC_HEADER_SIZE)
return UCODE_NFOUND;
- ext_header = (struct extended_sigtable *)
- mc_header + data_size + MC_HEADER_SIZE;
+ ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
ext_sigcount = ext_header->count;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
for (i = 0; i < ext_sigcount; i++) {
- x86_ucode = get_x86_family(ext_sig->sig);
- x86_model_ucode = get_x86_model(ext_sig->sig);
+ fam_ucode = __x86_family(ext_sig->sig);
+ model_ucode = x86_model(ext_sig->sig);
- if (x86 == x86_ucode && x86_model == x86_model_ucode)
+ if (fam == fam_ucode && model == model_ucode)
return UCODE_OK;
ext_sig++;
}
-
return UCODE_NFOUND;
}
@@ -204,7 +180,7 @@ save_microcode(struct mc_saved_data *mc_saved_data,
unsigned int mc_saved_count)
{
int i, j;
- struct microcode_intel **mc_saved_p;
+ struct microcode_intel **saved_ptr;
int ret;
if (!mc_saved_count)
@@ -213,39 +189,45 @@ save_microcode(struct mc_saved_data *mc_saved_data,
/*
* Copy new microcode data.
*/
- mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *),
- GFP_KERNEL);
- if (!mc_saved_p)
+ saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL);
+ if (!saved_ptr)
return -ENOMEM;
for (i = 0; i < mc_saved_count; i++) {
- struct microcode_intel *mc = mc_saved_src[i];
- struct microcode_header_intel *mc_header = &mc->hdr;
- unsigned long mc_size = get_totalsize(mc_header);
- mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL);
- if (!mc_saved_p[i]) {
- ret = -ENOMEM;
- goto err;
- }
+ struct microcode_header_intel *mc_hdr;
+ struct microcode_intel *mc;
+ unsigned long size;
+
if (!mc_saved_src[i]) {
ret = -EINVAL;
goto err;
}
- memcpy(mc_saved_p[i], mc, mc_size);
+
+ mc = mc_saved_src[i];
+ mc_hdr = &mc->hdr;
+ size = get_totalsize(mc_hdr);
+
+ saved_ptr[i] = kmalloc(size, GFP_KERNEL);
+ if (!saved_ptr[i]) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ memcpy(saved_ptr[i], mc, size);
}
/*
* Point to newly saved microcode.
*/
- mc_saved_data->mc_saved = mc_saved_p;
+ mc_saved_data->mc_saved = saved_ptr;
mc_saved_data->mc_saved_count = mc_saved_count;
return 0;
err:
for (j = 0; j <= i; j++)
- kfree(mc_saved_p[j]);
- kfree(mc_saved_p);
+ kfree(saved_ptr[j]);
+ kfree(saved_ptr);
return ret;
}
@@ -257,48 +239,45 @@ err:
* - or if it is a newly discovered microcode patch.
*
* The microcode patch should have matching model with CPU.
+ *
+ * Returns: The updated number @num_saved of saved microcode patches.
*/
-static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr,
- unsigned int *mc_saved_count_p)
+static unsigned int _save_mc(struct microcode_intel **mc_saved,
+ u8 *ucode_ptr, unsigned int num_saved)
{
- int i;
- int found = 0;
- unsigned int mc_saved_count = *mc_saved_count_p;
- struct microcode_header_intel *mc_header;
+ struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
+ unsigned int sig, pf, new_rev;
+ int found = 0, i;
+
+ mc_hdr = (struct microcode_header_intel *)ucode_ptr;
+
+ for (i = 0; i < num_saved; i++) {
+ mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
+ sig = mc_saved_hdr->sig;
+ pf = mc_saved_hdr->pf;
+ new_rev = mc_hdr->rev;
+
+ if (!get_matching_sig(sig, pf, new_rev, ucode_ptr))
+ continue;
+
+ found = 1;
+
+ if (!revision_is_newer(mc_hdr, new_rev))
+ continue;
- mc_header = (struct microcode_header_intel *)ucode_ptr;
- for (i = 0; i < mc_saved_count; i++) {
- unsigned int sig, pf;
- unsigned int new_rev;
- struct microcode_header_intel *mc_saved_header =
- (struct microcode_header_intel *)mc_saved[i];
- sig = mc_saved_header->sig;
- pf = mc_saved_header->pf;
- new_rev = mc_header->rev;
-
- if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) {
- found = 1;
- if (update_match_revision(mc_header, new_rev)) {
- /*
- * Found an older ucode saved before.
- * Replace the older one with this newer
- * one.
- */
- mc_saved[i] =
- (struct microcode_intel *)ucode_ptr;
- break;
- }
- }
- }
- if (i >= mc_saved_count && !found)
/*
- * This ucode is first time discovered in ucode file.
- * Save it to memory.
+ * Found an older ucode saved earlier. Replace it with
+ * this newer one.
*/
- mc_saved[mc_saved_count++] =
- (struct microcode_intel *)ucode_ptr;
+ mc_saved[i] = (struct microcode_intel *)ucode_ptr;
+ break;
+ }
+
+ /* Newly detected microcode, save it to memory. */
+ if (i >= num_saved && !found)
+ mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr;
- *mc_saved_count_p = mc_saved_count;
+ return num_saved;
}
/*
@@ -346,7 +325,7 @@ get_matching_model_microcode(int cpu, unsigned long start,
continue;
}
- _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count);
+ mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count);
ucode_ptr += mc_size;
}
@@ -372,7 +351,7 @@ out:
static int collect_cpu_info_early(struct ucode_cpu_info *uci)
{
unsigned int val[2];
- u8 x86, x86_model;
+ unsigned int family, model;
struct cpu_signature csig;
unsigned int eax, ebx, ecx, edx;
@@ -387,10 +366,10 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
native_cpuid(&eax, &ebx, &ecx, &edx);
csig.sig = eax;
- x86 = get_x86_family(csig.sig);
- x86_model = get_x86_model(csig.sig);
+ family = __x86_family(csig.sig);
+ model = x86_model(csig.sig);
- if ((x86_model >= 5) || (x86 > 6)) {
+ if ((model >= 5) || (family > 6)) {
/* get processor flags from MSR 0x17 */
native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
csig.pf = 1 << ((val[1] >> 18) & 7);
@@ -429,8 +408,7 @@ static void __ref show_saved_mc(void)
sig = uci.cpu_sig.sig;
pf = uci.cpu_sig.pf;
rev = uci.cpu_sig.rev;
- pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n",
- smp_processor_id(), sig, pf, rev);
+ pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
struct microcode_header_intel *mc_saved_header;
@@ -457,8 +435,7 @@ static void __ref show_saved_mc(void)
if (total_size <= data_size + MC_HEADER_SIZE)
continue;
- ext_header = (struct extended_sigtable *)
- mc_saved_header + data_size + MC_HEADER_SIZE;
+ ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE;
ext_sigcount = ext_header->count;
ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
@@ -515,8 +492,7 @@ int save_mc_for_early(u8 *mc)
* Save the microcode patch mc in mc_save_tmp structure if it's a newer
* version.
*/
-
- _save_mc(mc_saved_tmp, mc, &mc_saved_count);
+ mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count);
/*
* Save the mc_save_tmp in global mc_saved_data.
@@ -548,12 +524,10 @@ EXPORT_SYMBOL_GPL(save_mc_for_early);
static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
static __init enum ucode_state
-scan_microcode(unsigned long start, unsigned long end,
- struct mc_saved_data *mc_saved_data,
- unsigned long *mc_saved_in_initrd,
- struct ucode_cpu_info *uci)
+scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
+ unsigned long start, unsigned long size,
+ struct ucode_cpu_info *uci)
{
- unsigned int size = end - start + 1;
struct cpio_data cd;
long offset = 0;
#ifdef CONFIG_X86_32
@@ -569,10 +543,8 @@ scan_microcode(unsigned long start, unsigned long end,
if (!cd.data)
return UCODE_ERROR;
-
return get_matching_model_microcode(0, start, cd.data, cd.size,
- mc_saved_data, mc_saved_in_initrd,
- uci);
+ mc_saved_data, initrd, uci);
}
/*
@@ -704,7 +676,7 @@ int __init save_microcode_in_initrd_intel(void)
if (count == 0)
return ret;
- microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count);
+ copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count);
ret = save_microcode(&mc_saved_data, mc_saved, count);
if (ret)
pr_err("Cannot save microcode patches from initrd.\n");
@@ -716,52 +688,44 @@ int __init save_microcode_in_initrd_intel(void)
static void __init
_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
- unsigned long *mc_saved_in_initrd,
- unsigned long initrd_start_early,
- unsigned long initrd_end_early,
- struct ucode_cpu_info *uci)
+ unsigned long *initrd,
+ unsigned long start, unsigned long size)
{
+ struct ucode_cpu_info uci;
enum ucode_state ret;
- collect_cpu_info_early(uci);
- scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data,
- mc_saved_in_initrd, uci);
+ collect_cpu_info_early(&uci);
- ret = load_microcode(mc_saved_data, mc_saved_in_initrd,
- initrd_start_early, uci);
+ ret = scan_microcode(mc_saved_data, initrd, start, size, &uci);
+ if (ret != UCODE_OK)
+ return;
- if (ret == UCODE_OK)
- apply_microcode_early(uci, true);
+ ret = load_microcode(mc_saved_data, initrd, start, &uci);
+ if (ret != UCODE_OK)
+ return;
+
+ apply_microcode_early(&uci, true);
}
-void __init
-load_ucode_intel_bsp(void)
+void __init load_ucode_intel_bsp(void)
{
- u64 ramdisk_image, ramdisk_size;
- unsigned long initrd_start_early, initrd_end_early;
- struct ucode_cpu_info uci;
+ u64 start, size;
#ifdef CONFIG_X86_32
- struct boot_params *boot_params_p;
+ struct boot_params *p;
- boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params);
- ramdisk_image = boot_params_p->hdr.ramdisk_image;
- ramdisk_size = boot_params_p->hdr.ramdisk_size;
- initrd_start_early = ramdisk_image;
- initrd_end_early = initrd_start_early + ramdisk_size;
+ p = (struct boot_params *)__pa_nodebug(&boot_params);
+ start = p->hdr.ramdisk_image;
+ size = p->hdr.ramdisk_size;
_load_ucode_intel_bsp(
- (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
- (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
- initrd_start_early, initrd_end_early, &uci);
+ (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
+ (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
+ start, size);
#else
- ramdisk_image = boot_params.hdr.ramdisk_image;
- ramdisk_size = boot_params.hdr.ramdisk_size;
- initrd_start_early = ramdisk_image + PAGE_OFFSET;
- initrd_end_early = initrd_start_early + ramdisk_size;
-
- _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd,
- initrd_start_early, initrd_end_early,
- &uci);
+ start = boot_params.hdr.ramdisk_image + PAGE_OFFSET;
+ size = boot_params.hdr.ramdisk_size;
+
+ _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size);
#endif
}
@@ -771,6 +735,7 @@ void load_ucode_intel_ap(void)
struct ucode_cpu_info uci;
unsigned long *mc_saved_in_initrd_p;
unsigned long initrd_start_addr;
+ enum ucode_state ret;
#ifdef CONFIG_X86_32
unsigned long *initrd_start_p;
@@ -793,8 +758,12 @@ void load_ucode_intel_ap(void)
return;
collect_cpu_info_early(&uci);
- load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
- initrd_start_addr, &uci);
+ ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
+ initrd_start_addr, &uci);
+
+ if (ret != UCODE_OK)
+ return;
+
apply_microcode_early(&uci, true);
}
@@ -808,8 +777,8 @@ void reload_ucode_intel(void)
collect_cpu_info_early(&uci);
- ret = generic_load_microcode_early(mc_saved_data.mc_saved,
- mc_saved_data.mc_saved_count, &uci);
+ ret = load_microcode_early(mc_saved_data.mc_saved,
+ mc_saved_data.mc_saved_count, &uci);
if (ret != UCODE_OK)
return;
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index ce69320d0179..cd47a510a3f1 100644
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
@@ -38,12 +38,6 @@ update_match_cpu(unsigned int csig, unsigned int cpf,
return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
}
-int
-update_match_revision(struct microcode_header_intel *mc_header, int rev)
-{
- return (mc_header->rev <= rev) ? 0 : 1;
-}
-
int microcode_sanity_check(void *mc, int print_err)
{
unsigned long total_size, data_size, ext_table_size;
@@ -128,10 +122,9 @@ int microcode_sanity_check(void *mc, int print_err)
EXPORT_SYMBOL_GPL(microcode_sanity_check);
/*
- * return 0 - no update found
- * return 1 - found update
+ * Returns 1 if update has been found, 0 otherwise.
*/
-int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
+int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc)
{
struct microcode_header_intel *mc_header = mc;
struct extended_sigtable *ext_header;
@@ -159,16 +152,15 @@ int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
}
/*
- * return 0 - no update found
- * return 1 - found update
+ * Returns 1 if update has been found, 0 otherwise.
*/
-int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev)
+int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc)
{
- struct microcode_header_intel *mc_header = mc;
+ struct microcode_header_intel *mc_hdr = mc;
- if (!update_match_revision(mc_header, rev))
+ if (!revision_is_newer(mc_hdr, rev))
return 0;
- return get_matching_sig(csig, cpf, mc, rev);
+ return get_matching_sig(csig, cpf, rev, mc);
}
EXPORT_SYMBOL_GPL(get_matching_microcode);
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index 36d99a337b49..3f20710a5b23 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -6,7 +6,7 @@
IN=$1
OUT=$2
-function dump_array()
+dump_array()
{
ARRAY=$1
SIZE=$2
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index a041e094b8b9..d76f13d6d8d6 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -404,11 +404,10 @@ static const struct file_operations mtrr_fops = {
static int mtrr_seq_show(struct seq_file *seq, void *offset)
{
char factor;
- int i, max, len;
+ int i, max;
mtrr_type type;
unsigned long base, size;
- len = 0;
max = num_var_ranges;
for (i = 0; i < max; i++) {
mtrr_if->get(i, &base, &size, &type);
@@ -425,11 +424,10 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
size >>= 20 - PAGE_SHIFT;
}
/* Base can be > 32bit */
- len += seq_printf(seq, "reg%02i: base=0x%06lx000 "
- "(%5luMB), size=%5lu%cB, count=%d: %s\n",
- i, base, base >> (20 - PAGE_SHIFT), size,
- factor, mtrr_usage_table[i],
- mtrr_attrib_to_str(type));
+ seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
+ i, base, base >> (20 - PAGE_SHIFT),
+ size, factor,
+ mtrr_usage_table[i], mtrr_attrib_to_str(type));
}
return 0;
}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b71a7f86d68a..87848ebe2bb7 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -263,6 +263,14 @@ static void hw_perf_event_destroy(struct perf_event *event)
}
}
+void hw_perf_lbr_event_destroy(struct perf_event *event)
+{
+ hw_perf_event_destroy(event);
+
+ /* undo the lbr/bts event accounting */
+ x86_del_exclusive(x86_lbr_exclusive_lbr);
+}
+
static inline int x86_pmu_initialized(void)
{
return x86_pmu.handle_irq != NULL;
@@ -302,6 +310,35 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
return x86_pmu_extra_regs(val, event);
}
+/*
+ * Check if we can create event of a certain type (that no conflicting events
+ * are present).
+ */
+int x86_add_exclusive(unsigned int what)
+{
+ int ret = -EBUSY, i;
+
+ if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what]))
+ return 0;
+
+ mutex_lock(&pmc_reserve_mutex);
+ for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++)
+ if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
+ goto out;
+
+ atomic_inc(&x86_pmu.lbr_exclusive[what]);
+ ret = 0;
+
+out:
+ mutex_unlock(&pmc_reserve_mutex);
+ return ret;
+}
+
+void x86_del_exclusive(unsigned int what)
+{
+ atomic_dec(&x86_pmu.lbr_exclusive[what]);
+}
+
int x86_setup_perfctr(struct perf_event *event)
{
struct perf_event_attr *attr = &event->attr;
@@ -346,6 +383,12 @@ int x86_setup_perfctr(struct perf_event *event)
/* BTS is currently only allowed for user-mode. */
if (!attr->exclude_kernel)
return -EOPNOTSUPP;
+
+ /* disallow bts if conflicting events are present */
+ if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+ return -EBUSY;
+
+ event->destroy = hw_perf_lbr_event_destroy;
}
hwc->config |= config;
@@ -399,39 +442,41 @@ int x86_pmu_hw_config(struct perf_event *event)
if (event->attr.precise_ip > precise)
return -EOPNOTSUPP;
- /*
- * check that PEBS LBR correction does not conflict with
- * whatever the user is asking with attr->branch_sample_type
- */
- if (event->attr.precise_ip > 1 &&
- x86_pmu.intel_cap.pebs_format < 2) {
- u64 *br_type = &event->attr.branch_sample_type;
-
- if (has_branch_stack(event)) {
- if (!precise_br_compat(event))
- return -EOPNOTSUPP;
-
- /* branch_sample_type is compatible */
-
- } else {
- /*
- * user did not specify branch_sample_type
- *
- * For PEBS fixups, we capture all
- * the branches at the priv level of the
- * event.
- */
- *br_type = PERF_SAMPLE_BRANCH_ANY;
-
- if (!event->attr.exclude_user)
- *br_type |= PERF_SAMPLE_BRANCH_USER;
-
- if (!event->attr.exclude_kernel)
- *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
- }
+ }
+ /*
+ * check that PEBS LBR correction does not conflict with
+ * whatever the user is asking with attr->branch_sample_type
+ */
+ if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
+ u64 *br_type = &event->attr.branch_sample_type;
+
+ if (has_branch_stack(event)) {
+ if (!precise_br_compat(event))
+ return -EOPNOTSUPP;
+
+ /* branch_sample_type is compatible */
+
+ } else {
+ /*
+ * user did not specify branch_sample_type
+ *
+ * For PEBS fixups, we capture all
+ * the branches at the priv level of the
+ * event.
+ */
+ *br_type = PERF_SAMPLE_BRANCH_ANY;
+
+ if (!event->attr.exclude_user)
+ *br_type |= PERF_SAMPLE_BRANCH_USER;
+
+ if (!event->attr.exclude_kernel)
+ *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
}
}
+ if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
+ event->attach_state |= PERF_ATTACH_TASK_DATA;
+
/*
* Generate PMC IRQs:
* (keep 'enabled' bit clear for now)
@@ -449,6 +494,12 @@ int x86_pmu_hw_config(struct perf_event *event)
if (event->attr.type == PERF_TYPE_RAW)
event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
+ if (event->attr.sample_period && x86_pmu.limit_period) {
+ if (x86_pmu.limit_period(event, event->attr.sample_period) >
+ event->attr.sample_period)
+ return -EINVAL;
+ }
+
return x86_setup_perfctr(event);
}
@@ -728,14 +779,17 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
struct event_constraint *c;
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
struct perf_event *e;
- int i, wmin, wmax, num = 0;
+ int i, wmin, wmax, unsched = 0;
struct hw_perf_event *hwc;
bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+ if (x86_pmu.start_scheduling)
+ x86_pmu.start_scheduling(cpuc);
+
for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
hwc = &cpuc->event_list[i]->hw;
- c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
+ c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
hwc->constraint = c;
wmin = min(wmin, c->weight);
@@ -768,24 +822,30 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
/* slow path */
if (i != n)
- num = perf_assign_events(cpuc->event_list, n, wmin,
- wmax, assign);
+ unsched = perf_assign_events(cpuc->event_list, n, wmin,
+ wmax, assign);
/*
- * Mark the event as committed, so we do not put_constraint()
- * in case new events are added and fail scheduling.
+ * In case of success (unsched = 0), mark events as committed,
+ * so we do not put_constraint() in case new events are added
+ * and fail to be scheduled
+ *
+ * We invoke the lower level commit callback to lock the resource
+ *
+ * We do not need to do all of this in case we are called to
+ * validate an event group (assign == NULL)
*/
- if (!num && assign) {
+ if (!unsched && assign) {
for (i = 0; i < n; i++) {
e = cpuc->event_list[i];
e->hw.flags |= PERF_X86_EVENT_COMMITTED;
+ if (x86_pmu.commit_scheduling)
+ x86_pmu.commit_scheduling(cpuc, e, assign[i]);
}
}
- /*
- * scheduling failed or is just a simulation,
- * free resources if necessary
- */
- if (!assign || num) {
+
+ if (!assign || unsched) {
+
for (i = 0; i < n; i++) {
e = cpuc->event_list[i];
/*
@@ -795,11 +855,18 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
continue;
+ /*
+ * release events that failed scheduling
+ */
if (x86_pmu.put_event_constraints)
x86_pmu.put_event_constraints(cpuc, e);
}
}
- return num ? -EINVAL : 0;
+
+ if (x86_pmu.stop_scheduling)
+ x86_pmu.stop_scheduling(cpuc);
+
+ return unsched ? -EINVAL : 0;
}
/*
@@ -986,6 +1053,9 @@ int x86_perf_event_set_period(struct perf_event *event)
if (left > x86_pmu.max_period)
left = x86_pmu.max_period;
+ if (x86_pmu.limit_period)
+ left = x86_pmu.limit_period(event, left);
+
per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
/*
@@ -1033,7 +1103,6 @@ static int x86_pmu_add(struct perf_event *event, int flags)
hwc = &event->hw;
- perf_pmu_disable(event->pmu);
n0 = cpuc->n_events;
ret = n = collect_events(cpuc, event, false);
if (ret < 0)
@@ -1071,7 +1140,6 @@ done_collect:
ret = 0;
out:
- perf_pmu_enable(event->pmu);
return ret;
}
@@ -1103,7 +1171,7 @@ static void x86_pmu_start(struct perf_event *event, int flags)
void perf_event_print_debug(void)
{
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
- u64 pebs;
+ u64 pebs, debugctl;
struct cpu_hw_events *cpuc;
unsigned long flags;
int cpu, idx;
@@ -1121,14 +1189,20 @@ void perf_event_print_debug(void)
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
- rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
pr_info("\n");
pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
pr_info("CPU#%d: status: %016llx\n", cpu, status);
pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
- pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
+ if (x86_pmu.pebs_constraints) {
+ rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
+ pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
+ }
+ if (x86_pmu.lbr_nr) {
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl);
+ }
}
pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
@@ -1321,11 +1395,12 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
unsigned int cpu = (long)hcpu;
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- int ret = NOTIFY_OK;
+ int i, ret = NOTIFY_OK;
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- cpuc->kfree_on_online = NULL;
+ for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
+ cpuc->kfree_on_online[i] = NULL;
if (x86_pmu.cpu_prepare)
ret = x86_pmu.cpu_prepare(cpu);
break;
@@ -1336,7 +1411,10 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
break;
case CPU_ONLINE:
- kfree(cpuc->kfree_on_online);
+ for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
+ kfree(cpuc->kfree_on_online[i]);
+ cpuc->kfree_on_online[i] = NULL;
+ }
break;
case CPU_DYING:
@@ -1712,7 +1790,7 @@ static int validate_event(struct perf_event *event)
if (IS_ERR(fake_cpuc))
return PTR_ERR(fake_cpuc);
- c = x86_pmu.get_event_constraints(fake_cpuc, event);
+ c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
if (!c || !c->weight)
ret = -EINVAL;
@@ -1914,10 +1992,10 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
NULL,
};
-static void x86_pmu_flush_branch_stack(void)
+static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
- if (x86_pmu.flush_branch_stack)
- x86_pmu.flush_branch_stack();
+ if (x86_pmu.sched_task)
+ x86_pmu.sched_task(ctx, sched_in);
}
void perf_check_microcode(void)
@@ -1949,7 +2027,8 @@ static struct pmu pmu = {
.commit_txn = x86_pmu_commit_txn,
.event_idx = x86_pmu_event_idx,
- .flush_branch_stack = x86_pmu_flush_branch_stack,
+ .sched_task = x86_pmu_sched_task,
+ .task_ctx_size = sizeof(struct x86_perf_task_context),
};
void arch_perf_update_userpage(struct perf_event *event,
@@ -1968,13 +2047,23 @@ void arch_perf_update_userpage(struct perf_event *event,
data = cyc2ns_read_begin();
+ /*
+ * Internal timekeeping for enabled/running/stopped times
+ * is always in the local_clock domain.
+ */
userpg->cap_user_time = 1;
userpg->time_mult = data->cyc2ns_mul;
userpg->time_shift = data->cyc2ns_shift;
userpg->time_offset = data->cyc2ns_offset - now;
- userpg->cap_user_time_zero = 1;
- userpg->time_zero = data->cyc2ns_offset;
+ /*
+ * cap_user_time_zero doesn't make sense when we're using a different
+ * time base for the records.
+ */
+ if (event->clock == &local_clock) {
+ userpg->cap_user_time_zero = 1;
+ userpg->time_zero = data->cyc2ns_offset;
+ }
cyc2ns_read_end(data);
}
@@ -2147,24 +2236,24 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
static unsigned long code_segment_base(struct pt_regs *regs)
{
/*
+ * For IA32 we look at the GDT/LDT segment base to convert the
+ * effective IP to a linear address.
+ */
+
+#ifdef CONFIG_X86_32
+ /*
* If we are in VM86 mode, add the segment offset to convert to a
* linear address.
*/
if (regs->flags & X86_VM_MASK)
return 0x10 * regs->cs;
- /*
- * For IA32 we look at the GDT/LDT segment base to convert the
- * effective IP to a linear address.
- */
-#ifdef CONFIG_X86_32
if (user_mode(regs) && regs->cs != __USER_CS)
return get_segment_base(regs->cs);
#else
- if (test_thread_flag(TIF_IA32)) {
- if (user_mode(regs) && regs->cs != __USER32_CS)
- return get_segment_base(regs->cs);
- }
+ if (user_mode(regs) && !user_64bit_mode(regs) &&
+ regs->cs != __USER32_CS)
+ return get_segment_base(regs->cs);
#endif
return 0;
}
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index df525d2be1e8..6ac5cb7a9e14 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -65,13 +65,15 @@ struct event_constraint {
/*
* struct hw_perf_event.flags flags
*/
-#define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */
-#define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */
-#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style datala, store */
-#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */
-#define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */
-#define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */
-#define PERF_X86_EVENT_RDPMC_ALLOWED 0x40 /* grant rdpmc permission */
+#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */
+#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */
+#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */
+#define PERF_X86_EVENT_COMMITTED 0x0008 /* event passed commit_txn */
+#define PERF_X86_EVENT_PEBS_LD_HSW 0x0010 /* haswell style datala, load */
+#define PERF_X86_EVENT_PEBS_NA_HSW 0x0020 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */
+#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */
+#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */
struct amd_nb {
@@ -123,8 +125,37 @@ struct intel_shared_regs {
unsigned core_id; /* per-core: core id */
};
+enum intel_excl_state_type {
+ INTEL_EXCL_UNUSED = 0, /* counter is unused */
+ INTEL_EXCL_SHARED = 1, /* counter can be used by both threads */
+ INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */
+};
+
+struct intel_excl_states {
+ enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
+ enum intel_excl_state_type state[X86_PMC_IDX_MAX];
+ int num_alloc_cntrs;/* #counters allocated */
+ int max_alloc_cntrs;/* max #counters allowed */
+ bool sched_started; /* true if scheduling has started */
+};
+
+struct intel_excl_cntrs {
+ raw_spinlock_t lock;
+
+ struct intel_excl_states states[2];
+
+ int refcnt; /* per-core: #HT threads */
+ unsigned core_id; /* per-core: core id */
+};
+
#define MAX_LBR_ENTRIES 16
+enum {
+ X86_PERF_KFREE_SHARED = 0,
+ X86_PERF_KFREE_EXCL = 1,
+ X86_PERF_KFREE_MAX
+};
+
struct cpu_hw_events {
/*
* Generic x86 PMC bits
@@ -179,6 +210,12 @@ struct cpu_hw_events {
* used on Intel NHM/WSM/SNB
*/
struct intel_shared_regs *shared_regs;
+ /*
+ * manage exclusive counter access between hyperthread
+ */
+ struct event_constraint *constraint_list; /* in enable order */
+ struct intel_excl_cntrs *excl_cntrs;
+ int excl_thread_id; /* 0 or 1 */
/*
* AMD specific bits
@@ -187,7 +224,7 @@ struct cpu_hw_events {
/* Inverted mask of bits to clear in the perf_ctr ctrl registers */
u64 perf_ctr_virt_mask;
- void *kfree_on_online;
+ void *kfree_on_online[X86_PERF_KFREE_MAX];
};
#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
@@ -202,6 +239,10 @@ struct cpu_hw_events {
#define EVENT_CONSTRAINT(c, n, m) \
__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
+#define INTEL_EXCLEVT_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
+ 0, PERF_X86_EVENT_EXCL)
+
/*
* The overlap flag marks event constraints with overlapping counter
* masks. This is the case if the counter mask of such an event is not
@@ -259,6 +300,10 @@ struct cpu_hw_events {
#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+#define INTEL_EXCLUEVT_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_EXCL)
+
#define INTEL_PLD_CONSTRAINT(c, n) \
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
@@ -283,22 +328,40 @@ struct cpu_hw_events {
/* Check flags and event code, and set the HSW load flag */
#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
- __EVENT_CONSTRAINT(code, n, \
+ __EVENT_CONSTRAINT(code, n, \
ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, \
+ PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
/* Check flags and event code/umask, and set the HSW store flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
__EVENT_CONSTRAINT(code, n, \
INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, \
+ PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL)
+
/* Check flags and event code/umask, and set the HSW load flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
__EVENT_CONSTRAINT(code, n, \
INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, \
+ PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
/* Check flags and event code/umask, and set the HSW N/A flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
__EVENT_CONSTRAINT(code, n, \
@@ -408,6 +471,13 @@ union x86_pmu_config {
#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
+enum {
+ x86_lbr_exclusive_lbr,
+ x86_lbr_exclusive_bts,
+ x86_lbr_exclusive_pt,
+ x86_lbr_exclusive_max,
+};
+
/*
* struct x86_pmu - generic x86 pmu
*/
@@ -443,14 +513,25 @@ struct x86_pmu {
u64 max_period;
struct event_constraint *
(*get_event_constraints)(struct cpu_hw_events *cpuc,
+ int idx,
struct perf_event *event);
void (*put_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
+
+ void (*commit_scheduling)(struct cpu_hw_events *cpuc,
+ struct perf_event *event,
+ int cntr);
+
+ void (*start_scheduling)(struct cpu_hw_events *cpuc);
+
+ void (*stop_scheduling)(struct cpu_hw_events *cpuc);
+
struct event_constraint *event_constraints;
struct x86_pmu_quirk *quirks;
int perfctr_second_write;
bool late_ack;
+ unsigned (*limit_period)(struct perf_event *event, unsigned l);
/*
* sysfs attrs
@@ -472,7 +553,8 @@ struct x86_pmu {
void (*cpu_dead)(int cpu);
void (*check_microcode)(void);
- void (*flush_branch_stack)(void);
+ void (*sched_task)(struct perf_event_context *ctx,
+ bool sched_in);
/*
* Intel Arch Perfmon v2+
@@ -504,10 +586,15 @@ struct x86_pmu {
bool lbr_double_abort; /* duplicated lbr aborts */
/*
+ * Intel PT/LBR/BTS are exclusive
+ */
+ atomic_t lbr_exclusive[x86_lbr_exclusive_max];
+
+ /*
* Extra registers for events
*/
struct extra_reg *extra_regs;
- unsigned int er_flags;
+ unsigned int flags;
/*
* Intel host/guest support (KVM)
@@ -515,6 +602,13 @@ struct x86_pmu {
struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
};
+struct x86_perf_task_context {
+ u64 lbr_from[MAX_LBR_ENTRIES];
+ u64 lbr_to[MAX_LBR_ENTRIES];
+ int lbr_callstack_users;
+ int lbr_stack_state;
+};
+
#define x86_add_quirk(func_) \
do { \
static struct x86_pmu_quirk __quirk __initdata = { \
@@ -524,8 +618,13 @@ do { \
x86_pmu.quirks = &__quirk; \
} while (0)
-#define ERF_NO_HT_SHARING 1
-#define ERF_HAS_RSP_1 2
+/*
+ * x86_pmu flags
+ */
+#define PMU_FL_NO_HT_SHARING 0x1 /* no hyper-threading resource sharing */
+#define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */
+#define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */
+#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -546,6 +645,12 @@ static struct perf_pmu_events_attr event_attr_##v = { \
extern struct x86_pmu x86_pmu __read_mostly;
+static inline bool x86_pmu_has_lbr_callstack(void)
+{
+ return x86_pmu.lbr_sel_map &&
+ x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
+}
+
DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
int x86_perf_event_set_period(struct perf_event *event);
@@ -588,6 +693,12 @@ static inline int x86_pmu_rdpmc_index(int index)
return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
}
+int x86_add_exclusive(unsigned int what);
+
+void x86_del_exclusive(unsigned int what);
+
+void hw_perf_lbr_event_destroy(struct perf_event *event);
+
int x86_setup_perfctr(struct perf_event *event);
int x86_pmu_hw_config(struct perf_event *event);
@@ -674,10 +785,34 @@ static inline int amd_pmu_init(void)
#ifdef CONFIG_CPU_SUP_INTEL
+static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
+{
+ /* user explicitly requested branch sampling */
+ if (has_branch_stack(event))
+ return true;
+
+ /* implicit branch sampling to correct PEBS skid */
+ if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
+ x86_pmu.intel_cap.pebs_format < 2)
+ return true;
+
+ return false;
+}
+
+static inline bool intel_pmu_has_bts(struct perf_event *event)
+{
+ if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
+ !event->attr.freq && event->hw.sample_period == 1)
+ return true;
+
+ return false;
+}
+
int intel_pmu_save_and_restart(struct perf_event *event);
struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event);
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event);
struct intel_shared_regs *allocate_shared_regs(int cpu);
@@ -727,13 +862,15 @@ void intel_pmu_pebs_disable_all(void);
void intel_ds_init(void);
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+
void intel_pmu_lbr_reset(void);
void intel_pmu_lbr_enable(struct perf_event *event);
void intel_pmu_lbr_disable(struct perf_event *event);
-void intel_pmu_lbr_enable_all(void);
+void intel_pmu_lbr_enable_all(bool pmi);
void intel_pmu_lbr_disable_all(void);
@@ -747,8 +884,18 @@ void intel_pmu_lbr_init_atom(void);
void intel_pmu_lbr_init_snb(void);
+void intel_pmu_lbr_init_hsw(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
+void intel_pt_interrupt(void);
+
+int intel_bts_interrupt(void);
+
+void intel_bts_enable_local(void);
+
+void intel_bts_disable_local(void);
+
int p4_pmu_init(void);
int p6_pmu_init(void);
@@ -758,6 +905,10 @@ int knc_pmu_init(void);
ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
char *page);
+static inline int is_ht_workaround_enabled(void)
+{
+ return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
+}
#else /* CONFIG_CPU_SUP_INTEL */
static inline void reserve_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 28926311aac1..1cee5d2d7ece 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -382,6 +382,7 @@ static int amd_pmu_cpu_prepare(int cpu)
static void amd_pmu_cpu_starting(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
struct amd_nb *nb;
int i, nb_id;
@@ -399,7 +400,7 @@ static void amd_pmu_cpu_starting(int cpu)
continue;
if (nb->nb_id == nb_id) {
- cpuc->kfree_on_online = cpuc->amd_nb;
+ *onln = cpuc->amd_nb;
cpuc->amd_nb = nb;
break;
}
@@ -429,7 +430,8 @@ static void amd_pmu_cpu_dead(int cpu)
}
static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
/*
* if not NB event or no NB, then no constraints
@@ -537,7 +539,8 @@ static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
static struct event_constraint *
-amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
+amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
unsigned int event_code = amd_get_event_code(hwc);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index a61f5c6911da..989d3c215d2b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -796,7 +796,7 @@ static int setup_ibs_ctl(int ibs_eilvt_off)
* the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
* is using the new offset.
*/
-static int force_ibs_eilvt_setup(void)
+static void force_ibs_eilvt_setup(void)
{
int offset;
int ret;
@@ -811,26 +811,24 @@ static int force_ibs_eilvt_setup(void)
if (offset == APIC_EILVT_NR_MAX) {
printk(KERN_DEBUG "No EILVT entry available\n");
- return -EBUSY;
+ return;
}
ret = setup_ibs_ctl(offset);
if (ret)
goto out;
- if (!ibs_eilvt_valid()) {
- ret = -EFAULT;
+ if (!ibs_eilvt_valid())
goto out;
- }
pr_info("IBS: LVT offset %d assigned\n", offset);
- return 0;
+ return;
out:
preempt_disable();
put_eilvt(offset);
preempt_enable();
- return ret;
+ return;
}
static void ibs_eilvt_setup(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 258990688a5e..219d3fb423a1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,6 +12,7 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/watchdog.h>
#include <asm/cpufeature.h>
#include <asm/hardirq.h>
@@ -113,6 +114,12 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
EVENT_CONSTRAINT_END
};
@@ -131,15 +138,12 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
- /*
- * Errata BV98 -- MEM_*_RETIRED events can leak between counters of SMT
- * siblings; disable these events because they can corrupt unrelated
- * counters.
- */
- INTEL_EVENT_CONSTRAINT(0xd0, 0x0), /* MEM_UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd1, 0x0), /* MEM_LOAD_UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd2, 0x0), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd3, 0x0), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
EVENT_CONSTRAINT_END
};
@@ -217,6 +221,21 @@ static struct event_constraint intel_hsw_event_constraints[] = {
INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
/* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
+
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_bdw_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
+ INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */
EVENT_CONSTRAINT_END
};
@@ -415,6 +434,202 @@ static __initconst const u64 snb_hw_cache_event_ids
};
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ * reliably counted.
+ */
+
+#define HSW_DEMAND_DATA_RD BIT_ULL(0)
+#define HSW_DEMAND_RFO BIT_ULL(1)
+#define HSW_ANY_RESPONSE BIT_ULL(16)
+#define HSW_SUPPLIER_NONE BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29)
+#define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED BIT_ULL(32)
+#define HSW_SNOOP_MISS BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35)
+#define HSW_SNOOP_HITM BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM BIT_ULL(37)
+#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \
+ HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+ HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+ HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_SNOOP_DRAM (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\
+ HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_LLC_ACCESS HSW_ANY_RESPONSE
+
+#define BDW_L3_MISS_LOCAL BIT(26)
+#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
+
+
+static __initconst const u64 hsw_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+ HSW_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS|HSW_ANY_SNOOP,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+ HSW_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS|HSW_ANY_SNOOP,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS_LOCAL_DRAM|
+ HSW_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS_REMOTE|
+ HSW_SNOOP_DRAM,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS_LOCAL_DRAM|
+ HSW_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS_REMOTE|
+ HSW_SNOOP_DRAM,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
static __initconst const u64 westmere_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -1029,21 +1244,10 @@ static __initconst const u64 slm_hw_cache_event_ids
},
};
-static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
-{
- /* user explicitly requested branch sampling */
- if (has_branch_stack(event))
- return true;
-
- /* implicit branch sampling to correct PEBS skid */
- if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
- x86_pmu.intel_cap.pebs_format < 2)
- return true;
-
- return false;
-}
-
-static void intel_pmu_disable_all(void)
+/*
+ * Use from PMIs where the LBRs are already disabled.
+ */
+static void __intel_pmu_disable_all(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1051,17 +1255,24 @@ static void intel_pmu_disable_all(void)
if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
intel_pmu_disable_bts();
+ else
+ intel_bts_disable_local();
intel_pmu_pebs_disable_all();
+}
+
+static void intel_pmu_disable_all(void)
+{
+ __intel_pmu_disable_all();
intel_pmu_lbr_disable_all();
}
-static void intel_pmu_enable_all(int added)
+static void __intel_pmu_enable_all(int added, bool pmi)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
intel_pmu_pebs_enable_all();
- intel_pmu_lbr_enable_all();
+ intel_pmu_lbr_enable_all(pmi);
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
@@ -1073,7 +1284,13 @@ static void intel_pmu_enable_all(int added)
return;
intel_pmu_enable_bts(event->hw.config);
- }
+ } else
+ intel_bts_enable_local();
+}
+
+static void intel_pmu_enable_all(int added)
+{
+ __intel_pmu_enable_all(added, false);
}
/*
@@ -1207,7 +1424,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
* must disable before any actual event
* because any event may be combined with LBR
*/
- if (intel_pmu_needs_lbr_smpl(event))
+ if (needs_branch_stack(event))
intel_pmu_lbr_disable(event);
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@ -1268,7 +1485,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
* must enabled before any actual event
* because any event may be combined with LBR
*/
- if (intel_pmu_needs_lbr_smpl(event))
+ if (needs_branch_stack(event))
intel_pmu_lbr_enable(event);
if (event->attr.exclude_host)
@@ -1334,6 +1551,18 @@ static void intel_pmu_reset(void)
if (ds)
ds->bts_index = ds->bts_buffer_base;
+ /* Ack all overflows and disable fixed counters */
+ if (x86_pmu.version >= 2) {
+ intel_pmu_ack_status(intel_pmu_get_status());
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+ }
+
+ /* Reset LBRs and LBR freezing */
+ if (x86_pmu.lbr_nr) {
+ update_debugctlmsr(get_debugctlmsr() &
+ ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
+ }
+
local_irq_restore(flags);
}
@@ -1357,8 +1586,9 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
*/
if (!x86_pmu.late_ack)
apic_write(APIC_LVTPC, APIC_DM_NMI);
- intel_pmu_disable_all();
+ __intel_pmu_disable_all();
handled = intel_pmu_drain_bts_buffer();
+ handled += intel_bts_interrupt();
status = intel_pmu_get_status();
if (!status)
goto done;
@@ -1399,6 +1629,14 @@ again:
}
/*
+ * Intel PT
+ */
+ if (__test_and_clear_bit(55, (unsigned long *)&status)) {
+ handled++;
+ intel_pt_interrupt();
+ }
+
+ /*
* Checkpointed counters can lead to 'spurious' PMIs because the
* rollback caused by the PMI will have cleared the overflow status
* bit. Therefore always force probe these counters.
@@ -1433,7 +1671,7 @@ again:
goto again;
done:
- intel_pmu_enable_all(0);
+ __intel_pmu_enable_all(0, true);
/*
* Only unmask the NMI after the overflow counters
* have been reset. This avoids spurious NMIs on
@@ -1464,7 +1702,7 @@ intel_bts_constraints(struct perf_event *event)
static int intel_alt_er(int idx)
{
- if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
+ if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
return idx;
if (idx == EXTRA_REG_RSP_0)
@@ -1624,7 +1862,8 @@ intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
}
struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
struct event_constraint *c;
@@ -1641,7 +1880,8 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
}
static struct event_constraint *
-intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
struct event_constraint *c;
@@ -1657,7 +1897,278 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
if (c)
return c;
- return x86_get_event_constraints(cpuc, event);
+ return x86_get_event_constraints(cpuc, idx, event);
+}
+
+static void
+intel_start_scheduling(struct cpu_hw_events *cpuc)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xl, *xlo;
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid; /* sibling thread */
+
+ /*
+ * nothing needed if in group validation mode
+ */
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return;
+
+ /*
+ * no exclusion needed
+ */
+ if (!excl_cntrs)
+ return;
+
+ xlo = &excl_cntrs->states[o_tid];
+ xl = &excl_cntrs->states[tid];
+
+ xl->sched_started = true;
+ xl->num_alloc_cntrs = 0;
+ /*
+ * lock shared state until we are done scheduling
+ * in stop_event_scheduling()
+ * makes scheduling appear as a transaction
+ */
+ WARN_ON_ONCE(!irqs_disabled());
+ raw_spin_lock(&excl_cntrs->lock);
+
+ /*
+ * save initial state of sibling thread
+ */
+ memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state));
+}
+
+static void
+intel_stop_scheduling(struct cpu_hw_events *cpuc)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xl, *xlo;
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid; /* sibling thread */
+
+ /*
+ * nothing needed if in group validation mode
+ */
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return;
+ /*
+ * no exclusion needed
+ */
+ if (!excl_cntrs)
+ return;
+
+ xlo = &excl_cntrs->states[o_tid];
+ xl = &excl_cntrs->states[tid];
+
+ /*
+ * make new sibling thread state visible
+ */
+ memcpy(xlo->state, xlo->init_state, sizeof(xlo->state));
+
+ xl->sched_started = false;
+ /*
+ * release shared state lock (acquired in intel_start_scheduling())
+ */
+ raw_spin_unlock(&excl_cntrs->lock);
+}
+
+static struct event_constraint *
+intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+ int idx, struct event_constraint *c)
+{
+ struct event_constraint *cx;
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xl, *xlo;
+ int is_excl, i;
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid; /* alternate */
+
+ /*
+ * validating a group does not require
+ * enforcing cross-thread exclusion
+ */
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return c;
+
+ /*
+ * no exclusion needed
+ */
+ if (!excl_cntrs)
+ return c;
+ /*
+ * event requires exclusive counter access
+ * across HT threads
+ */
+ is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+ /*
+ * xl = state of current HT
+ * xlo = state of sibling HT
+ */
+ xl = &excl_cntrs->states[tid];
+ xlo = &excl_cntrs->states[o_tid];
+
+ /*
+ * do not allow scheduling of more than max_alloc_cntrs
+ * which is set to half the available generic counters.
+ * this helps avoid counter starvation of sibling thread
+ * by ensuring at most half the counters cannot be in
+ * exclusive mode. There is not designated counters for the
+ * limits. Any N/2 counters can be used. This helps with
+ * events with specifix counter constraints
+ */
+ if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
+ return &emptyconstraint;
+
+ cx = c;
+
+ /*
+ * because we modify the constraint, we need
+ * to make a copy. Static constraints come
+ * from static const tables.
+ *
+ * only needed when constraint has not yet
+ * been cloned (marked dynamic)
+ */
+ if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
+
+ /* sanity check */
+ if (idx < 0)
+ return &emptyconstraint;
+
+ /*
+ * grab pre-allocated constraint entry
+ */
+ cx = &cpuc->constraint_list[idx];
+
+ /*
+ * initialize dynamic constraint
+ * with static constraint
+ */
+ memcpy(cx, c, sizeof(*cx));
+
+ /*
+ * mark constraint as dynamic, so we
+ * can free it later on
+ */
+ cx->flags |= PERF_X86_EVENT_DYNAMIC;
+ }
+
+ /*
+ * From here on, the constraint is dynamic.
+ * Either it was just allocated above, or it
+ * was allocated during a earlier invocation
+ * of this function
+ */
+
+ /*
+ * Modify static constraint with current dynamic
+ * state of thread
+ *
+ * EXCLUSIVE: sibling counter measuring exclusive event
+ * SHARED : sibling counter measuring non-exclusive event
+ * UNUSED : sibling counter unused
+ */
+ for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) {
+ /*
+ * exclusive event in sibling counter
+ * our corresponding counter cannot be used
+ * regardless of our event
+ */
+ if (xl->state[i] == INTEL_EXCL_EXCLUSIVE)
+ __clear_bit(i, cx->idxmsk);
+ /*
+ * if measuring an exclusive event, sibling
+ * measuring non-exclusive, then counter cannot
+ * be used
+ */
+ if (is_excl && xl->state[i] == INTEL_EXCL_SHARED)
+ __clear_bit(i, cx->idxmsk);
+ }
+
+ /*
+ * recompute actual bit weight for scheduling algorithm
+ */
+ cx->weight = hweight64(cx->idxmsk64);
+
+ /*
+ * if we return an empty mask, then switch
+ * back to static empty constraint to avoid
+ * the cost of freeing later on
+ */
+ if (cx->weight == 0)
+ cx = &emptyconstraint;
+
+ return cx;
+}
+
+static struct event_constraint *
+intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c1 = event->hw.constraint;
+ struct event_constraint *c2;
+
+ /*
+ * first time only
+ * - static constraint: no change across incremental scheduling calls
+ * - dynamic constraint: handled by intel_get_excl_constraints()
+ */
+ c2 = __intel_get_event_constraints(cpuc, idx, event);
+ if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) {
+ bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
+ c1->weight = c2->weight;
+ c2 = c1;
+ }
+
+ if (cpuc->excl_cntrs)
+ return intel_get_excl_constraints(cpuc, event, idx, c2);
+
+ return c2;
+}
+
+static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xlo, *xl;
+ unsigned long flags = 0; /* keep compiler happy */
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid;
+
+ /*
+ * nothing needed if in group validation mode
+ */
+ if (cpuc->is_fake)
+ return;
+
+ WARN_ON_ONCE(!excl_cntrs);
+
+ if (!excl_cntrs)
+ return;
+
+ xl = &excl_cntrs->states[tid];
+ xlo = &excl_cntrs->states[o_tid];
+
+ /*
+ * put_constraint may be called from x86_schedule_events()
+ * which already has the lock held so here make locking
+ * conditional
+ */
+ if (!xl->sched_started)
+ raw_spin_lock_irqsave(&excl_cntrs->lock, flags);
+
+ /*
+ * if event was actually assigned, then mark the
+ * counter state as unused now
+ */
+ if (hwc->idx >= 0)
+ xlo->state[hwc->idx] = INTEL_EXCL_UNUSED;
+
+ if (!xl->sched_started)
+ raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags);
}
static void
@@ -1678,7 +2189,57 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
struct perf_event *event)
{
+ struct event_constraint *c = event->hw.constraint;
+
intel_put_shared_regs_event_constraints(cpuc, event);
+
+ /*
+ * is PMU has exclusive counter restrictions, then
+ * all events are subject to and must call the
+ * put_excl_constraints() routine
+ */
+ if (c && cpuc->excl_cntrs)
+ intel_put_excl_constraints(cpuc, event);
+
+ /* cleanup dynamic constraint */
+ if (c && (c->flags & PERF_X86_EVENT_DYNAMIC))
+ event->hw.constraint = NULL;
+}
+
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc,
+ struct perf_event *event, int cntr)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct event_constraint *c = event->hw.constraint;
+ struct intel_excl_states *xlo, *xl;
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid;
+ int is_excl;
+
+ if (cpuc->is_fake || !c)
+ return;
+
+ is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+ if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+ return;
+
+ WARN_ON_ONCE(!excl_cntrs);
+
+ if (!excl_cntrs)
+ return;
+
+ xl = &excl_cntrs->states[tid];
+ xlo = &excl_cntrs->states[o_tid];
+
+ WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock));
+
+ if (cntr >= 0) {
+ if (is_excl)
+ xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
+ else
+ xlo->init_state[cntr] = INTEL_EXCL_SHARED;
+ }
}
static void intel_pebs_aliases_core2(struct perf_event *event)
@@ -1747,10 +2308,21 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (event->attr.precise_ip && x86_pmu.pebs_aliases)
x86_pmu.pebs_aliases(event);
- if (intel_pmu_needs_lbr_smpl(event)) {
+ if (needs_branch_stack(event)) {
ret = intel_pmu_setup_lbr_filter(event);
if (ret)
return ret;
+
+ /*
+ * BTS is set up earlier in this path, so don't account twice
+ */
+ if (!intel_pmu_has_bts(event)) {
+ /* disallow lbr if conflicting events are present */
+ if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+ return -EBUSY;
+
+ event->destroy = hw_perf_lbr_event_destroy;
+ }
}
if (event->attr.type != PERF_TYPE_RAW)
@@ -1891,9 +2463,12 @@ static struct event_constraint counter2_constraint =
EVENT_CONSTRAINT(0, 0x4, 0);
static struct event_constraint *
-hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
- struct event_constraint *c = intel_get_event_constraints(cpuc, event);
+ struct event_constraint *c;
+
+ c = intel_get_event_constraints(cpuc, idx, event);
/* Handle special quirk on in_tx_checkpointed only in counter 2 */
if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
@@ -1905,6 +2480,32 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
return c;
}
+/*
+ * Broadwell:
+ *
+ * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
+ * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
+ * the two to enforce a minimum period of 128 (the smallest value that has bits
+ * 0-5 cleared and >= 100).
+ *
+ * Because of how the code in x86_perf_event_set_period() works, the truncation
+ * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
+ * to make up for the 'lost' events due to carrying the 'error' in period_left.
+ *
+ * Therefore the effective (average) period matches the requested period,
+ * despite coarser hardware granularity.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+ if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+ X86_CONFIG(.event=0xc0, .umask=0x01)) {
+ if (left < 128)
+ left = 128;
+ left &= ~0x3fu;
+ }
+ return left;
+}
+
PMU_FORMAT_ATTR(event, "config:0-7" );
PMU_FORMAT_ATTR(umask, "config:8-15" );
PMU_FORMAT_ATTR(edge, "config:18" );
@@ -1979,16 +2580,52 @@ struct intel_shared_regs *allocate_shared_regs(int cpu)
return regs;
}
+static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
+{
+ struct intel_excl_cntrs *c;
+ int i;
+
+ c = kzalloc_node(sizeof(struct intel_excl_cntrs),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (c) {
+ raw_spin_lock_init(&c->lock);
+ for (i = 0; i < X86_PMC_IDX_MAX; i++) {
+ c->states[0].state[i] = INTEL_EXCL_UNUSED;
+ c->states[0].init_state[i] = INTEL_EXCL_UNUSED;
+
+ c->states[1].state[i] = INTEL_EXCL_UNUSED;
+ c->states[1].init_state[i] = INTEL_EXCL_UNUSED;
+ }
+ c->core_id = -1;
+ }
+ return c;
+}
+
static int intel_pmu_cpu_prepare(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map))
- return NOTIFY_OK;
+ if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
+ cpuc->shared_regs = allocate_shared_regs(cpu);
+ if (!cpuc->shared_regs)
+ return NOTIFY_BAD;
+ }
- cpuc->shared_regs = allocate_shared_regs(cpu);
- if (!cpuc->shared_regs)
- return NOTIFY_BAD;
+ if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+ size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
+
+ cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
+ if (!cpuc->constraint_list)
+ return NOTIFY_BAD;
+
+ cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
+ if (!cpuc->excl_cntrs) {
+ kfree(cpuc->constraint_list);
+ kfree(cpuc->shared_regs);
+ return NOTIFY_BAD;
+ }
+ cpuc->excl_thread_id = 0;
+ }
return NOTIFY_OK;
}
@@ -2010,13 +2647,15 @@ static void intel_pmu_cpu_starting(int cpu)
if (!cpuc->shared_regs)
return;
- if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) {
+ if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
+ void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
+
for_each_cpu(i, topology_thread_cpumask(cpu)) {
struct intel_shared_regs *pc;
pc = per_cpu(cpu_hw_events, i).shared_regs;
if (pc && pc->core_id == core_id) {
- cpuc->kfree_on_online = cpuc->shared_regs;
+ *onln = cpuc->shared_regs;
cpuc->shared_regs = pc;
break;
}
@@ -2027,6 +2666,44 @@ static void intel_pmu_cpu_starting(int cpu)
if (x86_pmu.lbr_sel_map)
cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
+
+ if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+ int h = x86_pmu.num_counters >> 1;
+
+ for_each_cpu(i, topology_thread_cpumask(cpu)) {
+ struct intel_excl_cntrs *c;
+
+ c = per_cpu(cpu_hw_events, i).excl_cntrs;
+ if (c && c->core_id == core_id) {
+ cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
+ cpuc->excl_cntrs = c;
+ cpuc->excl_thread_id = 1;
+ break;
+ }
+ }
+ cpuc->excl_cntrs->core_id = core_id;
+ cpuc->excl_cntrs->refcnt++;
+ /*
+ * set hard limit to half the number of generic counters
+ */
+ cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
+ cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
+ }
+}
+
+static void free_excl_cntrs(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ struct intel_excl_cntrs *c;
+
+ c = cpuc->excl_cntrs;
+ if (c) {
+ if (c->core_id == -1 || --c->refcnt == 0)
+ kfree(c);
+ cpuc->excl_cntrs = NULL;
+ kfree(cpuc->constraint_list);
+ cpuc->constraint_list = NULL;
+ }
}
static void intel_pmu_cpu_dying(int cpu)
@@ -2041,19 +2718,9 @@ static void intel_pmu_cpu_dying(int cpu)
cpuc->shared_regs = NULL;
}
- fini_debug_store_on_cpu(cpu);
-}
+ free_excl_cntrs(cpu);
-static void intel_pmu_flush_branch_stack(void)
-{
- /*
- * Intel LBR does not tag entries with the
- * PID of the current task, then we need to
- * flush it on ctxsw
- * For now, we simply reset it
- */
- if (x86_pmu.lbr_nr)
- intel_pmu_lbr_reset();
+ fini_debug_store_on_cpu(cpu);
}
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
@@ -2107,7 +2774,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
.guest_get_msrs = intel_guest_get_msrs,
- .flush_branch_stack = intel_pmu_flush_branch_stack,
+ .sched_task = intel_pmu_lbr_sched_task,
};
static __init void intel_clovertown_quirk(void)
@@ -2264,6 +2931,27 @@ static __init void intel_nehalem_quirk(void)
}
}
+/*
+ * enable software workaround for errata:
+ * SNB: BJ122
+ * IVB: BV98
+ * HSW: HSD29
+ *
+ * Only needed when HT is enabled. However detecting
+ * if HT is enabled is difficult (model specific). So instead,
+ * we enable the workaround in the early boot, and verify if
+ * it is needed in a later initcall phase once we have valid
+ * topology information to check if HT is actually enabled
+ */
+static __init void intel_ht_bug(void)
+{
+ x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
+
+ x86_pmu.commit_scheduling = intel_commit_scheduling;
+ x86_pmu.start_scheduling = intel_start_scheduling;
+ x86_pmu.stop_scheduling = intel_stop_scheduling;
+}
+
EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3");
EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82")
@@ -2443,7 +3131,7 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_slm_event_constraints;
x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
x86_pmu.extra_regs = intel_slm_extra_regs;
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
pr_cont("Silvermont events, ");
break;
@@ -2461,7 +3149,7 @@ __init int intel_pmu_init(void)
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
x86_pmu.extra_regs = intel_westmere_extra_regs;
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.cpu_events = nhm_events_attrs;
@@ -2478,6 +3166,7 @@ __init int intel_pmu_init(void)
case 42: /* 32nm SandyBridge */
case 45: /* 32nm SandyBridge-E/EN/EP */
x86_add_quirk(intel_sandybridge_quirk);
+ x86_add_quirk(intel_ht_bug);
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
@@ -2492,9 +3181,11 @@ __init int intel_pmu_init(void)
x86_pmu.extra_regs = intel_snbep_extra_regs;
else
x86_pmu.extra_regs = intel_snb_extra_regs;
+
+
/* all extra regs are per-cpu when HT is on */
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
- x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
x86_pmu.cpu_events = snb_events_attrs;
@@ -2510,6 +3201,7 @@ __init int intel_pmu_init(void)
case 58: /* 22nm IvyBridge */
case 62: /* 22nm IvyBridge-EP/EX */
+ x86_add_quirk(intel_ht_bug);
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
/* dTLB-load-misses on IVB is different than SNB */
@@ -2528,8 +3220,8 @@ __init int intel_pmu_init(void)
else
x86_pmu.extra_regs = intel_snb_extra_regs;
/* all extra regs are per-cpu when HT is on */
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
- x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
x86_pmu.cpu_events = snb_events_attrs;
@@ -2545,19 +3237,20 @@ __init int intel_pmu_init(void)
case 63: /* 22nm Haswell Server */
case 69: /* 22nm Haswell ULT */
case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+ x86_add_quirk(intel_ht_bug);
x86_pmu.late_ack = true;
- memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
- memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
- intel_pmu_lbr_init_snb();
+ intel_pmu_lbr_init_hsw();
x86_pmu.event_constraints = intel_hsw_event_constraints;
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
x86_pmu.extra_regs = intel_snbep_extra_regs;
x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
/* all extra regs are per-cpu when HT is on */
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
- x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
@@ -2566,6 +3259,39 @@ __init int intel_pmu_init(void)
pr_cont("Haswell events, ");
break;
+ case 61: /* 14nm Broadwell Core-M */
+ case 86: /* 14nm Broadwell Xeon D */
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+ /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+ hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
+ BDW_L3_MISS|HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
+ HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
+ BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
+ BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+
+ intel_pmu_lbr_init_hsw();
+
+ x86_pmu.event_constraints = intel_bdw_event_constraints;
+ x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.cpu_events = hsw_events_attrs;
+ x86_pmu.limit_period = bdw_limit_period;
+ pr_cont("Broadwell events, ");
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
@@ -2651,3 +3377,47 @@ __init int intel_pmu_init(void)
return 0;
}
+
+/*
+ * HT bug: phase 2 init
+ * Called once we have valid topology information to check
+ * whether or not HT is enabled
+ * If HT is off, then we disable the workaround
+ */
+static __init int fixup_ht_bug(void)
+{
+ int cpu = smp_processor_id();
+ int w, c;
+ /*
+ * problem not present on this CPU model, nothing to do
+ */
+ if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
+ return 0;
+
+ w = cpumask_weight(topology_thread_cpumask(cpu));
+ if (w > 1) {
+ pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
+ return 0;
+ }
+
+ watchdog_nmi_disable_all();
+
+ x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
+
+ x86_pmu.commit_scheduling = NULL;
+ x86_pmu.start_scheduling = NULL;
+ x86_pmu.stop_scheduling = NULL;
+
+ watchdog_nmi_enable_all();
+
+ get_online_cpus();
+
+ for_each_online_cpu(c) {
+ free_excl_cntrs(c);
+ }
+
+ put_online_cpus();
+ pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
+ return 0;
+}
+subsys_initcall(fixup_ht_bug)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
new file mode 100644
index 000000000000..ac1f0c55f379
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -0,0 +1,525 @@
+/*
+ * BTS PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/coredump.h>
+
+#include <asm-generic/sizes.h>
+#include <asm/perf_event.h>
+
+#include "perf_event.h"
+
+struct bts_ctx {
+ struct perf_output_handle handle;
+ struct debug_store ds_back;
+ int started;
+};
+
+static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
+
+#define BTS_RECORD_SIZE 24
+#define BTS_SAFETY_MARGIN 4080
+
+struct bts_phys {
+ struct page *page;
+ unsigned long size;
+ unsigned long offset;
+ unsigned long displacement;
+};
+
+struct bts_buffer {
+ size_t real_size; /* multiple of BTS_RECORD_SIZE */
+ unsigned int nr_pages;
+ unsigned int nr_bufs;
+ unsigned int cur_buf;
+ bool snapshot;
+ local_t data_size;
+ local_t lost;
+ local_t head;
+ unsigned long end;
+ void **data_pages;
+ struct bts_phys buf[0];
+};
+
+struct pmu bts_pmu;
+
+void intel_pmu_enable_bts(u64 config);
+void intel_pmu_disable_bts(void);
+
+static size_t buf_size(struct page *page)
+{
+ return 1 << (PAGE_SHIFT + page_private(page));
+}
+
+static void *
+bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
+{
+ struct bts_buffer *buf;
+ struct page *page;
+ int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+ unsigned long offset;
+ size_t size = nr_pages << PAGE_SHIFT;
+ int pg, nbuf, pad;
+
+ /* count all the high order buffers */
+ for (pg = 0, nbuf = 0; pg < nr_pages;) {
+ page = virt_to_page(pages[pg]);
+ if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1))
+ return NULL;
+ pg += 1 << page_private(page);
+ nbuf++;
+ }
+
+ /*
+ * to avoid interrupts in overwrite mode, only allow one physical
+ */
+ if (overwrite && nbuf > 1)
+ return NULL;
+
+ buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
+ if (!buf)
+ return NULL;
+
+ buf->nr_pages = nr_pages;
+ buf->nr_bufs = nbuf;
+ buf->snapshot = overwrite;
+ buf->data_pages = pages;
+ buf->real_size = size - size % BTS_RECORD_SIZE;
+
+ for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
+ unsigned int __nr_pages;
+
+ page = virt_to_page(pages[pg]);
+ __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1;
+ buf->buf[nbuf].page = page;
+ buf->buf[nbuf].offset = offset;
+ buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
+ buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
+ pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
+ buf->buf[nbuf].size -= pad;
+
+ pg += __nr_pages;
+ offset += __nr_pages << PAGE_SHIFT;
+ }
+
+ return buf;
+}
+
+static void bts_buffer_free_aux(void *data)
+{
+ kfree(data);
+}
+
+static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
+{
+ return buf->buf[idx].offset + buf->buf[idx].displacement;
+}
+
+static void
+bts_config_buffer(struct bts_buffer *buf)
+{
+ int cpu = raw_smp_processor_id();
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ struct bts_phys *phys = &buf->buf[buf->cur_buf];
+ unsigned long index, thresh = 0, end = phys->size;
+ struct page *page = phys->page;
+
+ index = local_read(&buf->head);
+
+ if (!buf->snapshot) {
+ if (buf->end < phys->offset + buf_size(page))
+ end = buf->end - phys->offset - phys->displacement;
+
+ index -= phys->offset + phys->displacement;
+
+ if (end - index > BTS_SAFETY_MARGIN)
+ thresh = end - BTS_SAFETY_MARGIN;
+ else if (end - index > BTS_RECORD_SIZE)
+ thresh = end - BTS_RECORD_SIZE;
+ else
+ thresh = end;
+ }
+
+ ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
+ ds->bts_index = ds->bts_buffer_base + index;
+ ds->bts_absolute_maximum = ds->bts_buffer_base + end;
+ ds->bts_interrupt_threshold = !buf->snapshot
+ ? ds->bts_buffer_base + thresh
+ : ds->bts_absolute_maximum + BTS_RECORD_SIZE;
+}
+
+static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
+{
+ unsigned long index = head - phys->offset;
+
+ memset(page_address(phys->page) + index, 0, phys->size - index);
+}
+
+static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts)
+{
+ if (buf->snapshot)
+ return false;
+
+ if (local_read(&buf->data_size) >= bts->handle.size ||
+ bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE)
+ return true;
+
+ return false;
+}
+
+static void bts_update(struct bts_ctx *bts)
+{
+ int cpu = raw_smp_processor_id();
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ struct bts_buffer *buf = perf_get_aux(&bts->handle);
+ unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;
+
+ if (!buf)
+ return;
+
+ head = index + bts_buffer_offset(buf, buf->cur_buf);
+ old = local_xchg(&buf->head, head);
+
+ if (!buf->snapshot) {
+ if (old == head)
+ return;
+
+ if (ds->bts_index >= ds->bts_absolute_maximum)
+ local_inc(&buf->lost);
+
+ /*
+ * old and head are always in the same physical buffer, so we
+ * can subtract them to get the data size.
+ */
+ local_add(head - old, &buf->data_size);
+ } else {
+ local_set(&buf->data_size, head);
+ }
+}
+
+static void __bts_event_start(struct perf_event *event)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_buffer *buf = perf_get_aux(&bts->handle);
+ u64 config = 0;
+
+ if (!buf || bts_buffer_is_full(buf, bts))
+ return;
+
+ event->hw.state = 0;
+
+ if (!buf->snapshot)
+ config |= ARCH_PERFMON_EVENTSEL_INT;
+ if (!event->attr.exclude_kernel)
+ config |= ARCH_PERFMON_EVENTSEL_OS;
+ if (!event->attr.exclude_user)
+ config |= ARCH_PERFMON_EVENTSEL_USR;
+
+ bts_config_buffer(buf);
+
+ /*
+ * local barrier to make sure that ds configuration made it
+ * before we enable BTS
+ */
+ wmb();
+
+ intel_pmu_enable_bts(config);
+}
+
+static void bts_event_start(struct perf_event *event, int flags)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ __bts_event_start(event);
+
+ /* PMI handler: this counter is running and likely generating PMIs */
+ ACCESS_ONCE(bts->started) = 1;
+}
+
+static void __bts_event_stop(struct perf_event *event)
+{
+ /*
+ * No extra synchronization is mandated by the documentation to have
+ * BTS data stores globally visible.
+ */
+ intel_pmu_disable_bts();
+
+ if (event->hw.state & PERF_HES_STOPPED)
+ return;
+
+ ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED;
+}
+
+static void bts_event_stop(struct perf_event *event, int flags)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ /* PMI handler: don't restart this counter */
+ ACCESS_ONCE(bts->started) = 0;
+
+ __bts_event_stop(event);
+
+ if (flags & PERF_EF_UPDATE)
+ bts_update(bts);
+}
+
+void intel_bts_enable_local(void)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ if (bts->handle.event && bts->started)
+ __bts_event_start(bts->handle.event);
+}
+
+void intel_bts_disable_local(void)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ if (bts->handle.event)
+ __bts_event_stop(bts->handle.event);
+}
+
+static int
+bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
+{
+ unsigned long head, space, next_space, pad, gap, skip, wakeup;
+ unsigned int next_buf;
+ struct bts_phys *phys, *next_phys;
+ int ret;
+
+ if (buf->snapshot)
+ return 0;
+
+ head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
+ if (WARN_ON_ONCE(head != local_read(&buf->head)))
+ return -EINVAL;
+
+ phys = &buf->buf[buf->cur_buf];
+ space = phys->offset + phys->displacement + phys->size - head;
+ pad = space;
+ if (space > handle->size) {
+ space = handle->size;
+ space -= space % BTS_RECORD_SIZE;
+ }
+ if (space <= BTS_SAFETY_MARGIN) {
+ /* See if next phys buffer has more space */
+ next_buf = buf->cur_buf + 1;
+ if (next_buf >= buf->nr_bufs)
+ next_buf = 0;
+ next_phys = &buf->buf[next_buf];
+ gap = buf_size(phys->page) - phys->displacement - phys->size +
+ next_phys->displacement;
+ skip = pad + gap;
+ if (handle->size >= skip) {
+ next_space = next_phys->size;
+ if (next_space + skip > handle->size) {
+ next_space = handle->size - skip;
+ next_space -= next_space % BTS_RECORD_SIZE;
+ }
+ if (next_space > space || !space) {
+ if (pad)
+ bts_buffer_pad_out(phys, head);
+ ret = perf_aux_output_skip(handle, skip);
+ if (ret)
+ return ret;
+ /* Advance to next phys buffer */
+ phys = next_phys;
+ space = next_space;
+ head = phys->offset + phys->displacement;
+ /*
+ * After this, cur_buf and head won't match ds
+ * anymore, so we must not be racing with
+ * bts_update().
+ */
+ buf->cur_buf = next_buf;
+ local_set(&buf->head, head);
+ }
+ }
+ }
+
+ /* Don't go far beyond wakeup watermark */
+ wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
+ handle->head;
+ if (space > wakeup) {
+ space = wakeup;
+ space -= space % BTS_RECORD_SIZE;
+ }
+
+ buf->end = head + space;
+
+ /*
+ * If we have no space, the lost notification would have been sent when
+ * we hit absolute_maximum - see bts_update()
+ */
+ if (!space)
+ return -ENOSPC;
+
+ return 0;
+}
+
+int intel_bts_interrupt(void)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct perf_event *event = bts->handle.event;
+ struct bts_buffer *buf;
+ s64 old_head;
+ int err;
+
+ if (!event || !bts->started)
+ return 0;
+
+ buf = perf_get_aux(&bts->handle);
+ /*
+ * Skip snapshot counters: they don't use the interrupt, but
+ * there's no other way of telling, because the pointer will
+ * keep moving
+ */
+ if (!buf || buf->snapshot)
+ return 0;
+
+ old_head = local_read(&buf->head);
+ bts_update(bts);
+
+ /* no new data */
+ if (old_head == local_read(&buf->head))
+ return 0;
+
+ perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+ !!local_xchg(&buf->lost, 0));
+
+ buf = perf_aux_output_begin(&bts->handle, event);
+ if (!buf)
+ return 1;
+
+ err = bts_buffer_reset(buf, &bts->handle);
+ if (err)
+ perf_aux_output_end(&bts->handle, 0, false);
+
+ return 1;
+}
+
+static void bts_event_del(struct perf_event *event, int mode)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_buffer *buf = perf_get_aux(&bts->handle);
+
+ bts_event_stop(event, PERF_EF_UPDATE);
+
+ if (buf) {
+ if (buf->snapshot)
+ bts->handle.head =
+ local_xchg(&buf->data_size,
+ buf->nr_pages << PAGE_SHIFT);
+ perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+ !!local_xchg(&buf->lost, 0));
+ }
+
+ cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
+ cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
+ cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
+ cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
+}
+
+static int bts_event_add(struct perf_event *event, int mode)
+{
+ struct bts_buffer *buf;
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ int ret = -EBUSY;
+
+ event->hw.state = PERF_HES_STOPPED;
+
+ if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+ return -EBUSY;
+
+ if (bts->handle.event)
+ return -EBUSY;
+
+ buf = perf_aux_output_begin(&bts->handle, event);
+ if (!buf)
+ return -EINVAL;
+
+ ret = bts_buffer_reset(buf, &bts->handle);
+ if (ret) {
+ perf_aux_output_end(&bts->handle, 0, false);
+ return ret;
+ }
+
+ bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
+ bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
+ bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
+
+ if (mode & PERF_EF_START) {
+ bts_event_start(event, 0);
+ if (hwc->state & PERF_HES_STOPPED) {
+ bts_event_del(event, 0);
+ return -EBUSY;
+ }
+ }
+
+ return 0;
+}
+
+static void bts_event_destroy(struct perf_event *event)
+{
+ x86_del_exclusive(x86_lbr_exclusive_bts);
+}
+
+static int bts_event_init(struct perf_event *event)
+{
+ if (event->attr.type != bts_pmu.type)
+ return -ENOENT;
+
+ if (x86_add_exclusive(x86_lbr_exclusive_bts))
+ return -EBUSY;
+
+ event->destroy = bts_event_destroy;
+
+ return 0;
+}
+
+static void bts_event_read(struct perf_event *event)
+{
+}
+
+static __init int bts_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
+ return -ENODEV;
+
+ bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE;
+ bts_pmu.task_ctx_nr = perf_sw_context;
+ bts_pmu.event_init = bts_event_init;
+ bts_pmu.add = bts_event_add;
+ bts_pmu.del = bts_event_del;
+ bts_pmu.start = bts_event_start;
+ bts_pmu.stop = bts_event_stop;
+ bts_pmu.read = bts_event_read;
+ bts_pmu.setup_aux = bts_buffer_setup_aux;
+ bts_pmu.free_aux = bts_buffer_free_aux;
+
+ return perf_pmu_register(&bts_pmu, "intel_bts", -1);
+}
+
+module_init(bts_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
new file mode 100644
index 000000000000..e4d1b8b738fa
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -0,0 +1,1379 @@
+/*
+ * Intel Cache Quality-of-Service Monitoring (CQM) support.
+ *
+ * Based very, very heavily on work by Peter Zijlstra.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include <asm/cpu_device_id.h>
+#include "perf_event.h"
+
+#define MSR_IA32_PQR_ASSOC 0x0c8f
+#define MSR_IA32_QM_CTR 0x0c8e
+#define MSR_IA32_QM_EVTSEL 0x0c8d
+
+static unsigned int cqm_max_rmid = -1;
+static unsigned int cqm_l3_scale; /* supposedly cacheline size */
+
+struct intel_cqm_state {
+ raw_spinlock_t lock;
+ int rmid;
+ int cnt;
+};
+
+static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
+
+/*
+ * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
+ * Also protects event->hw.cqm_rmid
+ *
+ * Hold either for stability, both for modification of ->hw.cqm_rmid.
+ */
+static DEFINE_MUTEX(cache_mutex);
+static DEFINE_RAW_SPINLOCK(cache_lock);
+
+/*
+ * Groups of events that have the same target(s), one RMID per group.
+ */
+static LIST_HEAD(cache_groups);
+
+/*
+ * Mask of CPUs for reading CQM values. We only need one per-socket.
+ */
+static cpumask_t cqm_cpumask;
+
+#define RMID_VAL_ERROR (1ULL << 63)
+#define RMID_VAL_UNAVAIL (1ULL << 62)
+
+#define QOS_L3_OCCUP_EVENT_ID (1 << 0)
+
+#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
+
+/*
+ * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
+ *
+ * This rmid is always free and is guaranteed to have an associated
+ * near-zero occupancy value, i.e. no cachelines are tagged with this
+ * RMID, once __intel_cqm_rmid_rotate() returns.
+ */
+static unsigned int intel_cqm_rotation_rmid;
+
+#define INVALID_RMID (-1)
+
+/*
+ * Is @rmid valid for programming the hardware?
+ *
+ * rmid 0 is reserved by the hardware for all non-monitored tasks, which
+ * means that we should never come across an rmid with that value.
+ * Likewise, an rmid value of -1 is used to indicate "no rmid currently
+ * assigned" and is used as part of the rotation code.
+ */
+static inline bool __rmid_valid(unsigned int rmid)
+{
+ if (!rmid || rmid == INVALID_RMID)
+ return false;
+
+ return true;
+}
+
+static u64 __rmid_read(unsigned int rmid)
+{
+ u64 val;
+
+ /*
+ * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
+ * it just says that to increase confusion.
+ */
+ wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
+ rdmsrl(MSR_IA32_QM_CTR, val);
+
+ /*
+ * Aside from the ERROR and UNAVAIL bits, assume this thing returns
+ * the number of cachelines tagged with @rmid.
+ */
+ return val;
+}
+
+enum rmid_recycle_state {
+ RMID_YOUNG = 0,
+ RMID_AVAILABLE,
+ RMID_DIRTY,
+};
+
+struct cqm_rmid_entry {
+ unsigned int rmid;
+ enum rmid_recycle_state state;
+ struct list_head list;
+ unsigned long queue_time;
+};
+
+/*
+ * cqm_rmid_free_lru - A least recently used list of RMIDs.
+ *
+ * Oldest entry at the head, newest (most recently used) entry at the
+ * tail. This list is never traversed, it's only used to keep track of
+ * the lru order. That is, we only pick entries of the head or insert
+ * them on the tail.
+ *
+ * All entries on the list are 'free', and their RMIDs are not currently
+ * in use. To mark an RMID as in use, remove its entry from the lru
+ * list.
+ *
+ *
+ * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
+ *
+ * This list is contains RMIDs that no one is currently using but that
+ * may have a non-zero occupancy value associated with them. The
+ * rotation worker moves RMIDs from the limbo list to the free list once
+ * the occupancy value drops below __intel_cqm_threshold.
+ *
+ * Both lists are protected by cache_mutex.
+ */
+static LIST_HEAD(cqm_rmid_free_lru);
+static LIST_HEAD(cqm_rmid_limbo_lru);
+
+/*
+ * We use a simple array of pointers so that we can lookup a struct
+ * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
+ * and __put_rmid() from having to worry about dealing with struct
+ * cqm_rmid_entry - they just deal with rmids, i.e. integers.
+ *
+ * Once this array is initialized it is read-only. No locks are required
+ * to access it.
+ *
+ * All entries for all RMIDs can be looked up in the this array at all
+ * times.
+ */
+static struct cqm_rmid_entry **cqm_rmid_ptrs;
+
+static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
+{
+ struct cqm_rmid_entry *entry;
+
+ entry = cqm_rmid_ptrs[rmid];
+ WARN_ON(entry->rmid != rmid);
+
+ return entry;
+}
+
+/*
+ * Returns < 0 on fail.
+ *
+ * We expect to be called with cache_mutex held.
+ */
+static int __get_rmid(void)
+{
+ struct cqm_rmid_entry *entry;
+
+ lockdep_assert_held(&cache_mutex);
+
+ if (list_empty(&cqm_rmid_free_lru))
+ return INVALID_RMID;
+
+ entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
+ list_del(&entry->list);
+
+ return entry->rmid;
+}
+
+static void __put_rmid(unsigned int rmid)
+{
+ struct cqm_rmid_entry *entry;
+
+ lockdep_assert_held(&cache_mutex);
+
+ WARN_ON(!__rmid_valid(rmid));
+ entry = __rmid_entry(rmid);
+
+ entry->queue_time = jiffies;
+ entry->state = RMID_YOUNG;
+
+ list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
+}
+
+static int intel_cqm_setup_rmid_cache(void)
+{
+ struct cqm_rmid_entry *entry;
+ unsigned int nr_rmids;
+ int r = 0;
+
+ nr_rmids = cqm_max_rmid + 1;
+ cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
+ nr_rmids, GFP_KERNEL);
+ if (!cqm_rmid_ptrs)
+ return -ENOMEM;
+
+ for (; r <= cqm_max_rmid; r++) {
+ struct cqm_rmid_entry *entry;
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ goto fail;
+
+ INIT_LIST_HEAD(&entry->list);
+ entry->rmid = r;
+ cqm_rmid_ptrs[r] = entry;
+
+ list_add_tail(&entry->list, &cqm_rmid_free_lru);
+ }
+
+ /*
+ * RMID 0 is special and is always allocated. It's used for all
+ * tasks that are not monitored.
+ */
+ entry = __rmid_entry(0);
+ list_del(&entry->list);
+
+ mutex_lock(&cache_mutex);
+ intel_cqm_rotation_rmid = __get_rmid();
+ mutex_unlock(&cache_mutex);
+
+ return 0;
+fail:
+ while (r--)
+ kfree(cqm_rmid_ptrs[r]);
+
+ kfree(cqm_rmid_ptrs);
+ return -ENOMEM;
+}
+
+/*
+ * Determine if @a and @b measure the same set of tasks.
+ *
+ * If @a and @b measure the same set of tasks then we want to share a
+ * single RMID.
+ */
+static bool __match_event(struct perf_event *a, struct perf_event *b)
+{
+ /* Per-cpu and task events don't mix */
+ if ((a->attach_state & PERF_ATTACH_TASK) !=
+ (b->attach_state & PERF_ATTACH_TASK))
+ return false;
+
+#ifdef CONFIG_CGROUP_PERF
+ if (a->cgrp != b->cgrp)
+ return false;
+#endif
+
+ /* If not task event, we're machine wide */
+ if (!(b->attach_state & PERF_ATTACH_TASK))
+ return true;
+
+ /*
+ * Events that target same task are placed into the same cache group.
+ */
+ if (a->hw.target == b->hw.target)
+ return true;
+
+ /*
+ * Are we an inherited event?
+ */
+ if (b->parent == a)
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_CGROUP_PERF
+static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
+{
+ if (event->attach_state & PERF_ATTACH_TASK)
+ return perf_cgroup_from_task(event->hw.target);
+
+ return event->cgrp;
+}
+#endif
+
+/*
+ * Determine if @a's tasks intersect with @b's tasks
+ *
+ * There are combinations of events that we explicitly prohibit,
+ *
+ * PROHIBITS
+ * system-wide -> cgroup and task
+ * cgroup -> system-wide
+ * -> task in cgroup
+ * task -> system-wide
+ * -> task in cgroup
+ *
+ * Call this function before allocating an RMID.
+ */
+static bool __conflict_event(struct perf_event *a, struct perf_event *b)
+{
+#ifdef CONFIG_CGROUP_PERF
+ /*
+ * We can have any number of cgroups but only one system-wide
+ * event at a time.
+ */
+ if (a->cgrp && b->cgrp) {
+ struct perf_cgroup *ac = a->cgrp;
+ struct perf_cgroup *bc = b->cgrp;
+
+ /*
+ * This condition should have been caught in
+ * __match_event() and we should be sharing an RMID.
+ */
+ WARN_ON_ONCE(ac == bc);
+
+ if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+ cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+ return true;
+
+ return false;
+ }
+
+ if (a->cgrp || b->cgrp) {
+ struct perf_cgroup *ac, *bc;
+
+ /*
+ * cgroup and system-wide events are mutually exclusive
+ */
+ if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
+ (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
+ return true;
+
+ /*
+ * Ensure neither event is part of the other's cgroup
+ */
+ ac = event_to_cgroup(a);
+ bc = event_to_cgroup(b);
+ if (ac == bc)
+ return true;
+
+ /*
+ * Must have cgroup and non-intersecting task events.
+ */
+ if (!ac || !bc)
+ return false;
+
+ /*
+ * We have cgroup and task events, and the task belongs
+ * to a cgroup. Check for for overlap.
+ */
+ if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+ cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+ return true;
+
+ return false;
+ }
+#endif
+ /*
+ * If one of them is not a task, same story as above with cgroups.
+ */
+ if (!(a->attach_state & PERF_ATTACH_TASK) ||
+ !(b->attach_state & PERF_ATTACH_TASK))
+ return true;
+
+ /*
+ * Must be non-overlapping.
+ */
+ return false;
+}
+
+struct rmid_read {
+ unsigned int rmid;
+ atomic64_t value;
+};
+
+static void __intel_cqm_event_count(void *info);
+
+/*
+ * Exchange the RMID of a group of events.
+ */
+static unsigned int
+intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid)
+{
+ struct perf_event *event;
+ unsigned int old_rmid = group->hw.cqm_rmid;
+ struct list_head *head = &group->hw.cqm_group_entry;
+
+ lockdep_assert_held(&cache_mutex);
+
+ /*
+ * If our RMID is being deallocated, perform a read now.
+ */
+ if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
+ struct rmid_read rr = {
+ .value = ATOMIC64_INIT(0),
+ .rmid = old_rmid,
+ };
+
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
+ &rr, 1);
+ local64_set(&group->count, atomic64_read(&rr.value));
+ }
+
+ raw_spin_lock_irq(&cache_lock);
+
+ group->hw.cqm_rmid = rmid;
+ list_for_each_entry(event, head, hw.cqm_group_entry)
+ event->hw.cqm_rmid = rmid;
+
+ raw_spin_unlock_irq(&cache_lock);
+
+ return old_rmid;
+}
+
+/*
+ * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
+ * cachelines are still tagged with RMIDs in limbo, we progressively
+ * increment the threshold until we find an RMID in limbo with <=
+ * __intel_cqm_threshold lines tagged. This is designed to mitigate the
+ * problem where cachelines tagged with an RMID are not steadily being
+ * evicted.
+ *
+ * On successful rotations we decrease the threshold back towards zero.
+ *
+ * __intel_cqm_max_threshold provides an upper bound on the threshold,
+ * and is measured in bytes because it's exposed to userland.
+ */
+static unsigned int __intel_cqm_threshold;
+static unsigned int __intel_cqm_max_threshold;
+
+/*
+ * Test whether an RMID has a zero occupancy value on this cpu.
+ */
+static void intel_cqm_stable(void *arg)
+{
+ struct cqm_rmid_entry *entry;
+
+ list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+ if (entry->state != RMID_AVAILABLE)
+ break;
+
+ if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
+ entry->state = RMID_DIRTY;
+ }
+}
+
+/*
+ * If we have group events waiting for an RMID that don't conflict with
+ * events already running, assign @rmid.
+ */
+static bool intel_cqm_sched_in_event(unsigned int rmid)
+{
+ struct perf_event *leader, *event;
+
+ lockdep_assert_held(&cache_mutex);
+
+ leader = list_first_entry(&cache_groups, struct perf_event,
+ hw.cqm_groups_entry);
+ event = leader;
+
+ list_for_each_entry_continue(event, &cache_groups,
+ hw.cqm_groups_entry) {
+ if (__rmid_valid(event->hw.cqm_rmid))
+ continue;
+
+ if (__conflict_event(event, leader))
+ continue;
+
+ intel_cqm_xchg_rmid(event, rmid);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Initially use this constant for both the limbo queue time and the
+ * rotation timer interval, pmu::hrtimer_interval_ms.
+ *
+ * They don't need to be the same, but the two are related since if you
+ * rotate faster than you recycle RMIDs, you may run out of available
+ * RMIDs.
+ */
+#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */
+
+static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
+
+/*
+ * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
+ * @nr_available: number of freeable RMIDs on the limbo list
+ *
+ * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
+ * cachelines are tagged with those RMIDs. After this we can reuse them
+ * and know that the current set of active RMIDs is stable.
+ *
+ * Return %true or %false depending on whether stabilization needs to be
+ * reattempted.
+ *
+ * If we return %true then @nr_available is updated to indicate the
+ * number of RMIDs on the limbo list that have been queued for the
+ * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
+ * are above __intel_cqm_threshold.
+ */
+static bool intel_cqm_rmid_stabilize(unsigned int *available)
+{
+ struct cqm_rmid_entry *entry, *tmp;
+
+ lockdep_assert_held(&cache_mutex);
+
+ *available = 0;
+ list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+ unsigned long min_queue_time;
+ unsigned long now = jiffies;
+
+ /*
+ * We hold RMIDs placed into limbo for a minimum queue
+ * time. Before the minimum queue time has elapsed we do
+ * not recycle RMIDs.
+ *
+ * The reasoning is that until a sufficient time has
+ * passed since we stopped using an RMID, any RMID
+ * placed onto the limbo list will likely still have
+ * data tagged in the cache, which means we'll probably
+ * fail to recycle it anyway.
+ *
+ * We can save ourselves an expensive IPI by skipping
+ * any RMIDs that have not been queued for the minimum
+ * time.
+ */
+ min_queue_time = entry->queue_time +
+ msecs_to_jiffies(__rmid_queue_time_ms);
+
+ if (time_after(min_queue_time, now))
+ break;
+
+ entry->state = RMID_AVAILABLE;
+ (*available)++;
+ }
+
+ /*
+ * Fast return if none of the RMIDs on the limbo list have been
+ * sitting on the queue for the minimum queue time.
+ */
+ if (!*available)
+ return false;
+
+ /*
+ * Test whether an RMID is free for each package.
+ */
+ on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
+
+ list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
+ /*
+ * Exhausted all RMIDs that have waited min queue time.
+ */
+ if (entry->state == RMID_YOUNG)
+ break;
+
+ if (entry->state == RMID_DIRTY)
+ continue;
+
+ list_del(&entry->list); /* remove from limbo */
+
+ /*
+ * The rotation RMID gets priority if it's
+ * currently invalid. In which case, skip adding
+ * the RMID to the the free lru.
+ */
+ if (!__rmid_valid(intel_cqm_rotation_rmid)) {
+ intel_cqm_rotation_rmid = entry->rmid;
+ continue;
+ }
+
+ /*
+ * If we have groups waiting for RMIDs, hand
+ * them one now provided they don't conflict.
+ */
+ if (intel_cqm_sched_in_event(entry->rmid))
+ continue;
+
+ /*
+ * Otherwise place it onto the free list.
+ */
+ list_add_tail(&entry->list, &cqm_rmid_free_lru);
+ }
+
+
+ return __rmid_valid(intel_cqm_rotation_rmid);
+}
+
+/*
+ * Pick a victim group and move it to the tail of the group list.
+ * @next: The first group without an RMID
+ */
+static void __intel_cqm_pick_and_rotate(struct perf_event *next)
+{
+ struct perf_event *rotor;
+ unsigned int rmid;
+
+ lockdep_assert_held(&cache_mutex);
+
+ rotor = list_first_entry(&cache_groups, struct perf_event,
+ hw.cqm_groups_entry);
+
+ /*
+ * The group at the front of the list should always have a valid
+ * RMID. If it doesn't then no groups have RMIDs assigned and we
+ * don't need to rotate the list.
+ */
+ if (next == rotor)
+ return;
+
+ rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
+ __put_rmid(rmid);
+
+ list_rotate_left(&cache_groups);
+}
+
+/*
+ * Deallocate the RMIDs from any events that conflict with @event, and
+ * place them on the back of the group list.
+ */
+static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
+{
+ struct perf_event *group, *g;
+ unsigned int rmid;
+
+ lockdep_assert_held(&cache_mutex);
+
+ list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
+ if (group == event)
+ continue;
+
+ rmid = group->hw.cqm_rmid;
+
+ /*
+ * Skip events that don't have a valid RMID.
+ */
+ if (!__rmid_valid(rmid))
+ continue;
+
+ /*
+ * No conflict? No problem! Leave the event alone.
+ */
+ if (!__conflict_event(group, event))
+ continue;
+
+ intel_cqm_xchg_rmid(group, INVALID_RMID);
+ __put_rmid(rmid);
+ }
+}
+
+/*
+ * Attempt to rotate the groups and assign new RMIDs.
+ *
+ * We rotate for two reasons,
+ * 1. To handle the scheduling of conflicting events
+ * 2. To recycle RMIDs
+ *
+ * Rotating RMIDs is complicated because the hardware doesn't give us
+ * any clues.
+ *
+ * There's problems with the hardware interface; when you change the
+ * task:RMID map cachelines retain their 'old' tags, giving a skewed
+ * picture. In order to work around this, we must always keep one free
+ * RMID - intel_cqm_rotation_rmid.
+ *
+ * Rotation works by taking away an RMID from a group (the old RMID),
+ * and assigning the free RMID to another group (the new RMID). We must
+ * then wait for the old RMID to not be used (no cachelines tagged).
+ * This ensure that all cachelines are tagged with 'active' RMIDs. At
+ * this point we can start reading values for the new RMID and treat the
+ * old RMID as the free RMID for the next rotation.
+ *
+ * Return %true or %false depending on whether we did any rotating.
+ */
+static bool __intel_cqm_rmid_rotate(void)
+{
+ struct perf_event *group, *start = NULL;
+ unsigned int threshold_limit;
+ unsigned int nr_needed = 0;
+ unsigned int nr_available;
+ bool rotated = false;
+
+ mutex_lock(&cache_mutex);
+
+again:
+ /*
+ * Fast path through this function if there are no groups and no
+ * RMIDs that need cleaning.
+ */
+ if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
+ goto out;
+
+ list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
+ if (!__rmid_valid(group->hw.cqm_rmid)) {
+ if (!start)
+ start = group;
+ nr_needed++;
+ }
+ }
+
+ /*
+ * We have some event groups, but they all have RMIDs assigned
+ * and no RMIDs need cleaning.
+ */
+ if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
+ goto out;
+
+ if (!nr_needed)
+ goto stabilize;
+
+ /*
+ * We have more event groups without RMIDs than available RMIDs,
+ * or we have event groups that conflict with the ones currently
+ * scheduled.
+ *
+ * We force deallocate the rmid of the group at the head of
+ * cache_groups. The first event group without an RMID then gets
+ * assigned intel_cqm_rotation_rmid. This ensures we always make
+ * forward progress.
+ *
+ * Rotate the cache_groups list so the previous head is now the
+ * tail.
+ */
+ __intel_cqm_pick_and_rotate(start);
+
+ /*
+ * If the rotation is going to succeed, reduce the threshold so
+ * that we don't needlessly reuse dirty RMIDs.
+ */
+ if (__rmid_valid(intel_cqm_rotation_rmid)) {
+ intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
+ intel_cqm_rotation_rmid = __get_rmid();
+
+ intel_cqm_sched_out_conflicting_events(start);
+
+ if (__intel_cqm_threshold)
+ __intel_cqm_threshold--;
+ }
+
+ rotated = true;
+
+stabilize:
+ /*
+ * We now need to stablize the RMID we freed above (if any) to
+ * ensure that the next time we rotate we have an RMID with zero
+ * occupancy value.
+ *
+ * Alternatively, if we didn't need to perform any rotation,
+ * we'll have a bunch of RMIDs in limbo that need stabilizing.
+ */
+ threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
+
+ while (intel_cqm_rmid_stabilize(&nr_available) &&
+ __intel_cqm_threshold < threshold_limit) {
+ unsigned int steal_limit;
+
+ /*
+ * Don't spin if nobody is actively waiting for an RMID,
+ * the rotation worker will be kicked as soon as an
+ * event needs an RMID anyway.
+ */
+ if (!nr_needed)
+ break;
+
+ /* Allow max 25% of RMIDs to be in limbo. */
+ steal_limit = (cqm_max_rmid + 1) / 4;
+
+ /*
+ * We failed to stabilize any RMIDs so our rotation
+ * logic is now stuck. In order to make forward progress
+ * we have a few options:
+ *
+ * 1. rotate ("steal") another RMID
+ * 2. increase the threshold
+ * 3. do nothing
+ *
+ * We do both of 1. and 2. until we hit the steal limit.
+ *
+ * The steal limit prevents all RMIDs ending up on the
+ * limbo list. This can happen if every RMID has a
+ * non-zero occupancy above threshold_limit, and the
+ * occupancy values aren't dropping fast enough.
+ *
+ * Note that there is prioritisation at work here - we'd
+ * rather increase the number of RMIDs on the limbo list
+ * than increase the threshold, because increasing the
+ * threshold skews the event data (because we reuse
+ * dirty RMIDs) - threshold bumps are a last resort.
+ */
+ if (nr_available < steal_limit)
+ goto again;
+
+ __intel_cqm_threshold++;
+ }
+
+out:
+ mutex_unlock(&cache_mutex);
+ return rotated;
+}
+
+static void intel_cqm_rmid_rotate(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
+
+static struct pmu intel_cqm_pmu;
+
+static void intel_cqm_rmid_rotate(struct work_struct *work)
+{
+ unsigned long delay;
+
+ __intel_cqm_rmid_rotate();
+
+ delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
+ schedule_delayed_work(&intel_cqm_rmid_work, delay);
+}
+
+/*
+ * Find a group and setup RMID.
+ *
+ * If we're part of a group, we use the group's RMID.
+ */
+static void intel_cqm_setup_event(struct perf_event *event,
+ struct perf_event **group)
+{
+ struct perf_event *iter;
+ unsigned int rmid;
+ bool conflict = false;
+
+ list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
+ rmid = iter->hw.cqm_rmid;
+
+ if (__match_event(iter, event)) {
+ /* All tasks in a group share an RMID */
+ event->hw.cqm_rmid = rmid;
+ *group = iter;
+ return;
+ }
+
+ /*
+ * We only care about conflicts for events that are
+ * actually scheduled in (and hence have a valid RMID).
+ */
+ if (__conflict_event(iter, event) && __rmid_valid(rmid))
+ conflict = true;
+ }
+
+ if (conflict)
+ rmid = INVALID_RMID;
+ else
+ rmid = __get_rmid();
+
+ event->hw.cqm_rmid = rmid;
+}
+
+static void intel_cqm_event_read(struct perf_event *event)
+{
+ unsigned long flags;
+ unsigned int rmid;
+ u64 val;
+
+ /*
+ * Task events are handled by intel_cqm_event_count().
+ */
+ if (event->cpu == -1)
+ return;
+
+ raw_spin_lock_irqsave(&cache_lock, flags);
+ rmid = event->hw.cqm_rmid;
+
+ if (!__rmid_valid(rmid))
+ goto out;
+
+ val = __rmid_read(rmid);
+
+ /*
+ * Ignore this reading on error states and do not update the value.
+ */
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ goto out;
+
+ local64_set(&event->count, val);
+out:
+ raw_spin_unlock_irqrestore(&cache_lock, flags);
+}
+
+static void __intel_cqm_event_count(void *info)
+{
+ struct rmid_read *rr = info;
+ u64 val;
+
+ val = __rmid_read(rr->rmid);
+
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+
+ atomic64_add(val, &rr->value);
+}
+
+static inline bool cqm_group_leader(struct perf_event *event)
+{
+ return !list_empty(&event->hw.cqm_groups_entry);
+}
+
+static u64 intel_cqm_event_count(struct perf_event *event)
+{
+ unsigned long flags;
+ struct rmid_read rr = {
+ .value = ATOMIC64_INIT(0),
+ };
+
+ /*
+ * We only need to worry about task events. System-wide events
+ * are handled like usual, i.e. entirely with
+ * intel_cqm_event_read().
+ */
+ if (event->cpu != -1)
+ return __perf_event_count(event);
+
+ /*
+ * Only the group leader gets to report values. This stops us
+ * reporting duplicate values to userspace, and gives us a clear
+ * rule for which task gets to report the values.
+ *
+ * Note that it is impossible to attribute these values to
+ * specific packages - we forfeit that ability when we create
+ * task events.
+ */
+ if (!cqm_group_leader(event))
+ return 0;
+
+ /*
+ * Notice that we don't perform the reading of an RMID
+ * atomically, because we can't hold a spin lock across the
+ * IPIs.
+ *
+ * Speculatively perform the read, since @event might be
+ * assigned a different (possibly invalid) RMID while we're
+ * busying performing the IPI calls. It's therefore necessary to
+ * check @event's RMID afterwards, and if it has changed,
+ * discard the result of the read.
+ */
+ rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
+
+ if (!__rmid_valid(rr.rmid))
+ goto out;
+
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+
+ raw_spin_lock_irqsave(&cache_lock, flags);
+ if (event->hw.cqm_rmid == rr.rmid)
+ local64_set(&event->count, atomic64_read(&rr.value));
+ raw_spin_unlock_irqrestore(&cache_lock, flags);
+out:
+ return __perf_event_count(event);
+}
+
+static void intel_cqm_event_start(struct perf_event *event, int mode)
+{
+ struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
+ unsigned int rmid = event->hw.cqm_rmid;
+ unsigned long flags;
+
+ if (!(event->hw.cqm_state & PERF_HES_STOPPED))
+ return;
+
+ event->hw.cqm_state &= ~PERF_HES_STOPPED;
+
+ raw_spin_lock_irqsave(&state->lock, flags);
+
+ if (state->cnt++)
+ WARN_ON_ONCE(state->rmid != rmid);
+ else
+ WARN_ON_ONCE(state->rmid);
+
+ state->rmid = rmid;
+ wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);
+
+ raw_spin_unlock_irqrestore(&state->lock, flags);
+}
+
+static void intel_cqm_event_stop(struct perf_event *event, int mode)
+{
+ struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
+ unsigned long flags;
+
+ if (event->hw.cqm_state & PERF_HES_STOPPED)
+ return;
+
+ event->hw.cqm_state |= PERF_HES_STOPPED;
+
+ raw_spin_lock_irqsave(&state->lock, flags);
+ intel_cqm_event_read(event);
+
+ if (!--state->cnt) {
+ state->rmid = 0;
+ wrmsrl(MSR_IA32_PQR_ASSOC, 0);
+ } else {
+ WARN_ON_ONCE(!state->rmid);
+ }
+
+ raw_spin_unlock_irqrestore(&state->lock, flags);
+}
+
+static int intel_cqm_event_add(struct perf_event *event, int mode)
+{
+ unsigned long flags;
+ unsigned int rmid;
+
+ raw_spin_lock_irqsave(&cache_lock, flags);
+
+ event->hw.cqm_state = PERF_HES_STOPPED;
+ rmid = event->hw.cqm_rmid;
+
+ if (__rmid_valid(rmid) && (mode & PERF_EF_START))
+ intel_cqm_event_start(event, mode);
+
+ raw_spin_unlock_irqrestore(&cache_lock, flags);
+
+ return 0;
+}
+
+static void intel_cqm_event_del(struct perf_event *event, int mode)
+{
+ intel_cqm_event_stop(event, mode);
+}
+
+static void intel_cqm_event_destroy(struct perf_event *event)
+{
+ struct perf_event *group_other = NULL;
+
+ mutex_lock(&cache_mutex);
+
+ /*
+ * If there's another event in this group...
+ */
+ if (!list_empty(&event->hw.cqm_group_entry)) {
+ group_other = list_first_entry(&event->hw.cqm_group_entry,
+ struct perf_event,
+ hw.cqm_group_entry);
+ list_del(&event->hw.cqm_group_entry);
+ }
+
+ /*
+ * And we're the group leader..
+ */
+ if (cqm_group_leader(event)) {
+ /*
+ * If there was a group_other, make that leader, otherwise
+ * destroy the group and return the RMID.
+ */
+ if (group_other) {
+ list_replace(&event->hw.cqm_groups_entry,
+ &group_other->hw.cqm_groups_entry);
+ } else {
+ unsigned int rmid = event->hw.cqm_rmid;
+
+ if (__rmid_valid(rmid))
+ __put_rmid(rmid);
+ list_del(&event->hw.cqm_groups_entry);
+ }
+ }
+
+ mutex_unlock(&cache_mutex);
+}
+
+static int intel_cqm_event_init(struct perf_event *event)
+{
+ struct perf_event *group = NULL;
+ bool rotate = false;
+
+ if (event->attr.type != intel_cqm_pmu.type)
+ return -ENOENT;
+
+ if (event->attr.config & ~QOS_EVENT_MASK)
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ INIT_LIST_HEAD(&event->hw.cqm_group_entry);
+ INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
+
+ event->destroy = intel_cqm_event_destroy;
+
+ mutex_lock(&cache_mutex);
+
+ /* Will also set rmid */
+ intel_cqm_setup_event(event, &group);
+
+ if (group) {
+ list_add_tail(&event->hw.cqm_group_entry,
+ &group->hw.cqm_group_entry);
+ } else {
+ list_add_tail(&event->hw.cqm_groups_entry,
+ &cache_groups);
+
+ /*
+ * All RMIDs are either in use or have recently been
+ * used. Kick the rotation worker to clean/free some.
+ *
+ * We only do this for the group leader, rather than for
+ * every event in a group to save on needless work.
+ */
+ if (!__rmid_valid(event->hw.cqm_rmid))
+ rotate = true;
+ }
+
+ mutex_unlock(&cache_mutex);
+
+ if (rotate)
+ schedule_delayed_work(&intel_cqm_rmid_work, 0);
+
+ return 0;
+}
+
+EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
+EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
+EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
+EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
+EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
+
+static struct attribute *intel_cqm_events_attr[] = {
+ EVENT_PTR(intel_cqm_llc),
+ EVENT_PTR(intel_cqm_llc_pkg),
+ EVENT_PTR(intel_cqm_llc_unit),
+ EVENT_PTR(intel_cqm_llc_scale),
+ EVENT_PTR(intel_cqm_llc_snapshot),
+ NULL,
+};
+
+static struct attribute_group intel_cqm_events_group = {
+ .name = "events",
+ .attrs = intel_cqm_events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *intel_cqm_formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group intel_cqm_format_group = {
+ .name = "format",
+ .attrs = intel_cqm_formats_attr,
+};
+
+static ssize_t
+max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ ssize_t rv;
+
+ mutex_lock(&cache_mutex);
+ rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
+ mutex_unlock(&cache_mutex);
+
+ return rv;
+}
+
+static ssize_t
+max_recycle_threshold_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned int bytes, cachelines;
+ int ret;
+
+ ret = kstrtouint(buf, 0, &bytes);
+ if (ret)
+ return ret;
+
+ mutex_lock(&cache_mutex);
+
+ __intel_cqm_max_threshold = bytes;
+ cachelines = bytes / cqm_l3_scale;
+
+ /*
+ * The new maximum takes effect immediately.
+ */
+ if (__intel_cqm_threshold > cachelines)
+ __intel_cqm_threshold = cachelines;
+
+ mutex_unlock(&cache_mutex);
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(max_recycle_threshold);
+
+static struct attribute *intel_cqm_attrs[] = {
+ &dev_attr_max_recycle_threshold.attr,
+ NULL,
+};
+
+static const struct attribute_group intel_cqm_group = {
+ .attrs = intel_cqm_attrs,
+};
+
+static const struct attribute_group *intel_cqm_attr_groups[] = {
+ &intel_cqm_events_group,
+ &intel_cqm_format_group,
+ &intel_cqm_group,
+ NULL,
+};
+
+static struct pmu intel_cqm_pmu = {
+ .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
+ .attr_groups = intel_cqm_attr_groups,
+ .task_ctx_nr = perf_sw_context,
+ .event_init = intel_cqm_event_init,
+ .add = intel_cqm_event_add,
+ .del = intel_cqm_event_del,
+ .start = intel_cqm_event_start,
+ .stop = intel_cqm_event_stop,
+ .read = intel_cqm_event_read,
+ .count = intel_cqm_event_count,
+};
+
+static inline void cqm_pick_event_reader(int cpu)
+{
+ int phys_id = topology_physical_package_id(cpu);
+ int i;
+
+ for_each_cpu(i, &cqm_cpumask) {
+ if (phys_id == topology_physical_package_id(i))
+ return; /* already got reader for this socket */
+ }
+
+ cpumask_set_cpu(cpu, &cqm_cpumask);
+}
+
+static void intel_cqm_cpu_prepare(unsigned int cpu)
+{
+ struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ raw_spin_lock_init(&state->lock);
+ state->rmid = 0;
+ state->cnt = 0;
+
+ WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
+ WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
+}
+
+static void intel_cqm_cpu_exit(unsigned int cpu)
+{
+ int phys_id = topology_physical_package_id(cpu);
+ int i;
+
+ /*
+ * Is @cpu a designated cqm reader?
+ */
+ if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
+ return;
+
+ for_each_online_cpu(i) {
+ if (i == cpu)
+ continue;
+
+ if (phys_id == topology_physical_package_id(i)) {
+ cpumask_set_cpu(i, &cqm_cpumask);
+ break;
+ }
+ }
+}
+
+static int intel_cqm_cpu_notifier(struct notifier_block *nb,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ intel_cqm_cpu_prepare(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ intel_cqm_cpu_exit(cpu);
+ break;
+ case CPU_STARTING:
+ cqm_pick_event_reader(cpu);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static const struct x86_cpu_id intel_cqm_match[] = {
+ { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
+ {}
+};
+
+static int __init intel_cqm_init(void)
+{
+ char *str, scale[20];
+ int i, cpu, ret;
+
+ if (!x86_match_cpu(intel_cqm_match))
+ return -ENODEV;
+
+ cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+
+ /*
+ * It's possible that not all resources support the same number
+ * of RMIDs. Instead of making scheduling much more complicated
+ * (where we have to match a task's RMID to a cpu that supports
+ * that many RMIDs) just find the minimum RMIDs supported across
+ * all cpus.
+ *
+ * Also, check that the scales match on all cpus.
+ */
+ cpu_notifier_register_begin();
+
+ for_each_online_cpu(cpu) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->x86_cache_max_rmid < cqm_max_rmid)
+ cqm_max_rmid = c->x86_cache_max_rmid;
+
+ if (c->x86_cache_occ_scale != cqm_l3_scale) {
+ pr_err("Multiple LLC scale values, disabling\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /*
+ * A reasonable upper limit on the max threshold is the number
+ * of lines tagged per RMID if all RMIDs have the same number of
+ * lines tagged in the LLC.
+ *
+ * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+ */
+ __intel_cqm_max_threshold =
+ boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
+
+ snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
+ str = kstrdup(scale, GFP_KERNEL);
+ if (!str) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ event_attr_intel_cqm_llc_scale.event_str = str;
+
+ ret = intel_cqm_setup_rmid_cache();
+ if (ret)
+ goto out;
+
+ for_each_online_cpu(i) {
+ intel_cqm_cpu_prepare(i);
+ cqm_pick_event_reader(i);
+ }
+
+ __perf_cpu_notifier(intel_cqm_cpu_notifier);
+
+ ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
+ if (ret)
+ pr_err("Intel CQM perf registration failed: %d\n", ret);
+ else
+ pr_info("Intel CQM monitoring enabled\n");
+
+out:
+ cpu_notifier_register_done();
+
+ return ret;
+}
+device_initcall(intel_cqm_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 073983398364..813f75d71175 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -461,7 +461,8 @@ void intel_pmu_enable_bts(u64 config)
debugctlmsr |= DEBUGCTLMSR_TR;
debugctlmsr |= DEBUGCTLMSR_BTS;
- debugctlmsr |= DEBUGCTLMSR_BTINT;
+ if (config & ARCH_PERFMON_EVENTSEL_INT)
+ debugctlmsr |= DEBUGCTLMSR_BTINT;
if (!(config & ARCH_PERFMON_EVENTSEL_OS))
debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
@@ -557,6 +558,8 @@ struct event_constraint intel_core2_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
EVENT_CONSTRAINT_END
};
@@ -564,6 +567,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
EVENT_CONSTRAINT_END
};
@@ -587,6 +592,8 @@ struct event_constraint intel_nehalem_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
EVENT_CONSTRAINT_END
};
@@ -602,6 +609,8 @@ struct event_constraint intel_westmere_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
EVENT_CONSTRAINT_END
};
@@ -611,6 +620,10 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
/* Allow all events as PEBS with no flags */
INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
EVENT_CONSTRAINT_END
@@ -622,6 +635,10 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = {
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
/* Allow all events as PEBS with no flags */
INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
EVENT_CONSTRAINT_END
@@ -633,16 +650,16 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
- INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
- INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
- INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
/* Allow all events as PEBS with no flags */
INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
EVENT_CONSTRAINT_END
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 58f1a94beaf0..94e5b506caa6 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -39,6 +39,7 @@ static enum {
#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
#define LBR_FAR_BIT 8 /* do not capture far branches */
+#define LBR_CALL_STACK_BIT 9 /* enable call stack */
#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
#define LBR_USER (1 << LBR_USER_BIT)
@@ -49,6 +50,7 @@ static enum {
#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
#define LBR_FAR (1 << LBR_FAR_BIT)
+#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
#define LBR_PLM (LBR_KERNEL | LBR_USER)
@@ -69,33 +71,31 @@ static enum {
#define LBR_FROM_FLAG_IN_TX (1ULL << 62)
#define LBR_FROM_FLAG_ABORT (1ULL << 61)
-#define for_each_branch_sample_type(x) \
- for ((x) = PERF_SAMPLE_BRANCH_USER; \
- (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
-
/*
* x86control flow change classification
* x86control flow changes include branches, interrupts, traps, faults
*/
enum {
- X86_BR_NONE = 0, /* unknown */
-
- X86_BR_USER = 1 << 0, /* branch target is user */
- X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
-
- X86_BR_CALL = 1 << 2, /* call */
- X86_BR_RET = 1 << 3, /* return */
- X86_BR_SYSCALL = 1 << 4, /* syscall */
- X86_BR_SYSRET = 1 << 5, /* syscall return */
- X86_BR_INT = 1 << 6, /* sw interrupt */
- X86_BR_IRET = 1 << 7, /* return from interrupt */
- X86_BR_JCC = 1 << 8, /* conditional */
- X86_BR_JMP = 1 << 9, /* jump */
- X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
- X86_BR_IND_CALL = 1 << 11,/* indirect calls */
- X86_BR_ABORT = 1 << 12,/* transaction abort */
- X86_BR_IN_TX = 1 << 13,/* in transaction */
- X86_BR_NO_TX = 1 << 14,/* not in transaction */
+ X86_BR_NONE = 0, /* unknown */
+
+ X86_BR_USER = 1 << 0, /* branch target is user */
+ X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
+
+ X86_BR_CALL = 1 << 2, /* call */
+ X86_BR_RET = 1 << 3, /* return */
+ X86_BR_SYSCALL = 1 << 4, /* syscall */
+ X86_BR_SYSRET = 1 << 5, /* syscall return */
+ X86_BR_INT = 1 << 6, /* sw interrupt */
+ X86_BR_IRET = 1 << 7, /* return from interrupt */
+ X86_BR_JCC = 1 << 8, /* conditional */
+ X86_BR_JMP = 1 << 9, /* jump */
+ X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
+ X86_BR_IND_CALL = 1 << 11,/* indirect calls */
+ X86_BR_ABORT = 1 << 12,/* transaction abort */
+ X86_BR_IN_TX = 1 << 13,/* in transaction */
+ X86_BR_NO_TX = 1 << 14,/* not in transaction */
+ X86_BR_ZERO_CALL = 1 << 15,/* zero length call */
+ X86_BR_CALL_STACK = 1 << 16,/* call stack */
};
#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -112,13 +112,15 @@ enum {
X86_BR_JMP |\
X86_BR_IRQ |\
X86_BR_ABORT |\
- X86_BR_IND_CALL)
+ X86_BR_IND_CALL |\
+ X86_BR_ZERO_CALL)
#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
#define X86_BR_ANY_CALL \
(X86_BR_CALL |\
X86_BR_IND_CALL |\
+ X86_BR_ZERO_CALL |\
X86_BR_SYSCALL |\
X86_BR_IRQ |\
X86_BR_INT)
@@ -130,17 +132,32 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
* otherwise it becomes near impossible to get a reliable stack.
*/
-static void __intel_pmu_lbr_enable(void)
+static void __intel_pmu_lbr_enable(bool pmi)
{
- u64 debugctl;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 debugctl, lbr_select = 0, orig_debugctl;
- if (cpuc->lbr_sel)
- wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
+ /*
+ * No need to reprogram LBR_SELECT in a PMI, as it
+ * did not change.
+ */
+ if (cpuc->lbr_sel && !pmi) {
+ lbr_select = cpuc->lbr_sel->config;
+ wrmsrl(MSR_LBR_SELECT, lbr_select);
+ }
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
- debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ orig_debugctl = debugctl;
+ debugctl |= DEBUGCTLMSR_LBR;
+ /*
+ * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
+ * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
+ * may cause superfluous increase/decrease of LBR_TOS.
+ */
+ if (!(lbr_select & LBR_CALL_STACK))
+ debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+ if (orig_debugctl != debugctl)
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
}
static void __intel_pmu_lbr_disable(void)
@@ -181,9 +198,116 @@ void intel_pmu_lbr_reset(void)
intel_pmu_lbr_reset_64();
}
+/*
+ * TOS = most recently recorded branch
+ */
+static inline u64 intel_pmu_lbr_tos(void)
+{
+ u64 tos;
+
+ rdmsrl(x86_pmu.lbr_tos, tos);
+ return tos;
+}
+
+enum {
+ LBR_NONE,
+ LBR_VALID,
+};
+
+static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
+{
+ int i;
+ unsigned lbr_idx, mask;
+ u64 tos;
+
+ if (task_ctx->lbr_callstack_users == 0 ||
+ task_ctx->lbr_stack_state == LBR_NONE) {
+ intel_pmu_lbr_reset();
+ return;
+ }
+
+ mask = x86_pmu.lbr_nr - 1;
+ tos = intel_pmu_lbr_tos();
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ lbr_idx = (tos - i) & mask;
+ wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+ wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ }
+ task_ctx->lbr_stack_state = LBR_NONE;
+}
+
+static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
+{
+ int i;
+ unsigned lbr_idx, mask;
+ u64 tos;
+
+ if (task_ctx->lbr_callstack_users == 0) {
+ task_ctx->lbr_stack_state = LBR_NONE;
+ return;
+ }
+
+ mask = x86_pmu.lbr_nr - 1;
+ tos = intel_pmu_lbr_tos();
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ lbr_idx = (tos - i) & mask;
+ rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+ rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ }
+ task_ctx->lbr_stack_state = LBR_VALID;
+}
+
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_task_context *task_ctx;
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ /*
+ * If LBR callstack feature is enabled and the stack was saved when
+ * the task was scheduled out, restore the stack. Otherwise flush
+ * the LBR stack.
+ */
+ task_ctx = ctx ? ctx->task_ctx_data : NULL;
+ if (task_ctx) {
+ if (sched_in) {
+ __intel_pmu_lbr_restore(task_ctx);
+ cpuc->lbr_context = ctx;
+ } else {
+ __intel_pmu_lbr_save(task_ctx);
+ }
+ return;
+ }
+
+ /*
+ * When sampling the branck stack in system-wide, it may be
+ * necessary to flush the stack on context switch. This happens
+ * when the branch stack does not tag its entries with the pid
+ * of the current task. Otherwise it becomes impossible to
+ * associate a branch entry with a task. This ambiguity is more
+ * likely to appear when the branch stack supports priv level
+ * filtering and the user sets it to monitor only at the user
+ * level (which could be a useful measurement in system-wide
+ * mode). In that case, the risk is high of having a branch
+ * stack with branch from multiple tasks.
+ */
+ if (sched_in) {
+ intel_pmu_lbr_reset();
+ cpuc->lbr_context = ctx;
+ }
+}
+
+static inline bool branch_user_callstack(unsigned br_sel)
+{
+ return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
+}
+
void intel_pmu_lbr_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_task_context *task_ctx;
if (!x86_pmu.lbr_nr)
return;
@@ -198,18 +322,33 @@ void intel_pmu_lbr_enable(struct perf_event *event)
}
cpuc->br_sel = event->hw.branch_reg.reg;
+ if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+ event->ctx->task_ctx_data) {
+ task_ctx = event->ctx->task_ctx_data;
+ task_ctx->lbr_callstack_users++;
+ }
+
cpuc->lbr_users++;
+ perf_sched_cb_inc(event->ctx->pmu);
}
void intel_pmu_lbr_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_task_context *task_ctx;
if (!x86_pmu.lbr_nr)
return;
+ if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+ event->ctx->task_ctx_data) {
+ task_ctx = event->ctx->task_ctx_data;
+ task_ctx->lbr_callstack_users--;
+ }
+
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
+ perf_sched_cb_dec(event->ctx->pmu);
if (cpuc->enabled && !cpuc->lbr_users) {
__intel_pmu_lbr_disable();
@@ -218,12 +357,12 @@ void intel_pmu_lbr_disable(struct perf_event *event)
}
}
-void intel_pmu_lbr_enable_all(void)
+void intel_pmu_lbr_enable_all(bool pmi)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (cpuc->lbr_users)
- __intel_pmu_lbr_enable();
+ __intel_pmu_lbr_enable(pmi);
}
void intel_pmu_lbr_disable_all(void)
@@ -234,18 +373,6 @@ void intel_pmu_lbr_disable_all(void)
__intel_pmu_lbr_disable();
}
-/*
- * TOS = most recently recorded branch
- */
-static inline u64 intel_pmu_lbr_tos(void)
-{
- u64 tos;
-
- rdmsrl(x86_pmu.lbr_tos, tos);
-
- return tos;
-}
-
static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
{
unsigned long mask = x86_pmu.lbr_nr - 1;
@@ -350,7 +477,7 @@ void intel_pmu_lbr_read(void)
* - in case there is no HW filter
* - in case the HW filter has errata or limitations
*/
-static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
+static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
{
u64 br_type = event->attr.branch_sample_type;
int mask = 0;
@@ -387,11 +514,21 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
if (br_type & PERF_SAMPLE_BRANCH_COND)
mask |= X86_BR_JCC;
+ if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
+ if (!x86_pmu_has_lbr_callstack())
+ return -EOPNOTSUPP;
+ if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
+ return -EINVAL;
+ mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
+ X86_BR_CALL_STACK;
+ }
+
/*
* stash actual user request into reg, it may
* be used by fixup code for some CPU
*/
event->hw.branch_reg.reg = mask;
+ return 0;
}
/*
@@ -403,14 +540,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
{
struct hw_perf_event_extra *reg;
u64 br_type = event->attr.branch_sample_type;
- u64 mask = 0, m;
- u64 v;
+ u64 mask = 0, v;
+ int i;
- for_each_branch_sample_type(m) {
- if (!(br_type & m))
+ for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
+ if (!(br_type & (1ULL << i)))
continue;
- v = x86_pmu.lbr_sel_map[m];
+ v = x86_pmu.lbr_sel_map[i];
if (v == LBR_NOT_SUPP)
return -EOPNOTSUPP;
@@ -420,8 +557,12 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
reg = &event->hw.branch_reg;
reg->idx = EXTRA_REG_LBR;
- /* LBR_SELECT operates in suppress mode so invert mask */
- reg->config = ~mask & x86_pmu.lbr_sel_mask;
+ /*
+ * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
+ * in suppress mode. So LBR_SELECT should be set to
+ * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
+ */
+ reg->config = mask ^ x86_pmu.lbr_sel_mask;
return 0;
}
@@ -439,7 +580,9 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
/*
* setup SW LBR filter
*/
- intel_pmu_setup_sw_lbr_filter(event);
+ ret = intel_pmu_setup_sw_lbr_filter(event);
+ if (ret)
+ return ret;
/*
* setup HW LBR filter, if any
@@ -568,6 +711,12 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
ret = X86_BR_INT;
break;
case 0xe8: /* call near rel */
+ insn_get_immediate(&insn);
+ if (insn.immediate1.value == 0) {
+ /* zero length call */
+ ret = X86_BR_ZERO_CALL;
+ break;
+ }
case 0x9a: /* call far absolute */
ret = X86_BR_CALL;
break;
@@ -678,35 +827,49 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
/*
* Map interface branch filters onto LBR filters
*/
-static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
- [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
- [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
- [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
- [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
- [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP
- | LBR_IND_JMP | LBR_FAR,
+static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP
+ | LBR_IND_JMP | LBR_FAR,
/*
* NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
*/
- [PERF_SAMPLE_BRANCH_ANY_CALL] =
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
/*
* NHM/WSM erratum: must include IND_JMP to capture IND_CALL
*/
- [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
- [PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
};
-static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
- [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
- [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
- [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
- [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
- [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR,
- [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
- | LBR_FAR,
- [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
- [PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
+static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+};
+
+static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+ [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_RETURN | LBR_CALL_STACK,
};
/* core */
@@ -765,6 +928,20 @@ void __init intel_pmu_lbr_init_snb(void)
pr_cont("16-deep LBR, ");
}
+/* haswell */
+void intel_pmu_lbr_init_hsw(void)
+{
+ x86_pmu.lbr_nr = 16;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
+
+ pr_cont("16-deep LBR, ");
+}
+
/* atom */
void __init intel_pmu_lbr_init_atom(void)
{
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
new file mode 100644
index 000000000000..ffe666c2c6b5
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -0,0 +1,1100 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+
+#include <asm/perf_event.h>
+#include <asm/insn.h>
+#include <asm/io.h>
+
+#include "perf_event.h"
+#include "intel_pt.h"
+
+static DEFINE_PER_CPU(struct pt, pt_ctx);
+
+static struct pt_pmu pt_pmu;
+
+enum cpuid_regs {
+ CR_EAX = 0,
+ CR_ECX,
+ CR_EDX,
+ CR_EBX
+};
+
+/*
+ * Capabilities of Intel PT hardware, such as number of address bits or
+ * supported output schemes, are cached and exported to userspace as "caps"
+ * attribute group of pt pmu device
+ * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
+ * relevant bits together with intel_pt traces.
+ *
+ * These are necessary for both trace decoding (payloads_lip, contains address
+ * width encoded in IP-related packets), and event configuration (bitmasks with
+ * permitted values for certain bit fields).
+ */
+#define PT_CAP(_n, _l, _r, _m) \
+ [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \
+ .reg = _r, .mask = _m }
+
+static struct pt_cap_desc {
+ const char *name;
+ u32 leaf;
+ u8 reg;
+ u32 mask;
+} pt_caps[] = {
+ PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff),
+ PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)),
+ PT_CAP(topa_output, 0, CR_ECX, BIT(0)),
+ PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)),
+ PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)),
+};
+
+static u32 pt_cap_get(enum pt_capabilities cap)
+{
+ struct pt_cap_desc *cd = &pt_caps[cap];
+ u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg];
+ unsigned int shift = __ffs(cd->mask);
+
+ return (c & cd->mask) >> shift;
+}
+
+static ssize_t pt_cap_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct dev_ext_attribute *ea =
+ container_of(attr, struct dev_ext_attribute, attr);
+ enum pt_capabilities cap = (long)ea->var;
+
+ return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
+}
+
+static struct attribute_group pt_cap_group = {
+ .name = "caps",
+};
+
+PMU_FORMAT_ATTR(tsc, "config:10" );
+PMU_FORMAT_ATTR(noretcomp, "config:11" );
+
+static struct attribute *pt_formats_attr[] = {
+ &format_attr_tsc.attr,
+ &format_attr_noretcomp.attr,
+ NULL,
+};
+
+static struct attribute_group pt_format_group = {
+ .name = "format",
+ .attrs = pt_formats_attr,
+};
+
+static const struct attribute_group *pt_attr_groups[] = {
+ &pt_cap_group,
+ &pt_format_group,
+ NULL,
+};
+
+static int __init pt_pmu_hw_init(void)
+{
+ struct dev_ext_attribute *de_attrs;
+ struct attribute **attrs;
+ size_t size;
+ int ret;
+ long i;
+
+ attrs = NULL;
+ ret = -ENODEV;
+ if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
+ goto fail;
+
+ for (i = 0; i < PT_CPUID_LEAVES; i++) {
+ cpuid_count(20, i,
+ &pt_pmu.caps[CR_EAX + i*4],
+ &pt_pmu.caps[CR_EBX + i*4],
+ &pt_pmu.caps[CR_ECX + i*4],
+ &pt_pmu.caps[CR_EDX + i*4]);
+ }
+
+ ret = -ENOMEM;
+ size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
+ attrs = kzalloc(size, GFP_KERNEL);
+ if (!attrs)
+ goto fail;
+
+ size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
+ de_attrs = kzalloc(size, GFP_KERNEL);
+ if (!de_attrs)
+ goto fail;
+
+ for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
+ struct dev_ext_attribute *de_attr = de_attrs + i;
+
+ de_attr->attr.attr.name = pt_caps[i].name;
+
+ sysfs_attr_init(&de_attrs->attr.attr);
+
+ de_attr->attr.attr.mode = S_IRUGO;
+ de_attr->attr.show = pt_cap_show;
+ de_attr->var = (void *)i;
+
+ attrs[i] = &de_attr->attr.attr;
+ }
+
+ pt_cap_group.attrs = attrs;
+
+ return 0;
+
+fail:
+ kfree(attrs);
+
+ return ret;
+}
+
+#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC)
+
+static bool pt_event_valid(struct perf_event *event)
+{
+ u64 config = event->attr.config;
+
+ if ((config & PT_CONFIG_MASK) != config)
+ return false;
+
+ return true;
+}
+
+/*
+ * PT configuration helpers
+ * These all are cpu affine and operate on a local PT
+ */
+
+static bool pt_is_running(void)
+{
+ u64 ctl;
+
+ rdmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+ return !!(ctl & RTIT_CTL_TRACEEN);
+}
+
+static void pt_config(struct perf_event *event)
+{
+ u64 reg;
+
+ reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
+
+ if (!event->attr.exclude_kernel)
+ reg |= RTIT_CTL_OS;
+ if (!event->attr.exclude_user)
+ reg |= RTIT_CTL_USR;
+
+ reg |= (event->attr.config & PT_CONFIG_MASK);
+
+ wrmsrl(MSR_IA32_RTIT_CTL, reg);
+}
+
+static void pt_config_start(bool start)
+{
+ u64 ctl;
+
+ rdmsrl(MSR_IA32_RTIT_CTL, ctl);
+ if (start)
+ ctl |= RTIT_CTL_TRACEEN;
+ else
+ ctl &= ~RTIT_CTL_TRACEEN;
+ wrmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+ /*
+ * A wrmsr that disables trace generation serializes other PT
+ * registers and causes all data packets to be written to memory,
+ * but a fence is required for the data to become globally visible.
+ *
+ * The below WMB, separating data store and aux_head store matches
+ * the consumer's RMB that separates aux_head load and data load.
+ */
+ if (!start)
+ wmb();
+}
+
+static void pt_config_buffer(void *buf, unsigned int topa_idx,
+ unsigned int output_off)
+{
+ u64 reg;
+
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
+
+ reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
+
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+}
+
+/*
+ * Keep ToPA table-related metadata on the same page as the actual table,
+ * taking up a few words from the top
+ */
+
+#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
+
+/**
+ * struct topa - page-sized ToPA table with metadata at the top
+ * @table: actual ToPA table entries, as understood by PT hardware
+ * @list: linkage to struct pt_buffer's list of tables
+ * @phys: physical address of this page
+ * @offset: offset of the first entry in this table in the buffer
+ * @size: total size of all entries in this table
+ * @last: index of the last initialized entry in this table
+ */
+struct topa {
+ struct topa_entry table[TENTS_PER_PAGE];
+ struct list_head list;
+ u64 phys;
+ u64 offset;
+ size_t size;
+ int last;
+};
+
+/* make -1 stand for the last table entry */
+#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])
+
+/**
+ * topa_alloc() - allocate page-sized ToPA table
+ * @cpu: CPU on which to allocate.
+ * @gfp: Allocation flags.
+ *
+ * Return: On success, return the pointer to ToPA table page.
+ */
+static struct topa *topa_alloc(int cpu, gfp_t gfp)
+{
+ int node = cpu_to_node(cpu);
+ struct topa *topa;
+ struct page *p;
+
+ p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+ if (!p)
+ return NULL;
+
+ topa = page_address(p);
+ topa->last = 0;
+ topa->phys = page_to_phys(p);
+
+ /*
+ * In case of singe-entry ToPA, always put the self-referencing END
+ * link as the 2nd entry in the table
+ */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
+ TOPA_ENTRY(topa, 1)->end = 1;
+ }
+
+ return topa;
+}
+
+/**
+ * topa_free() - free a page-sized ToPA table
+ * @topa: Table to deallocate.
+ */
+static void topa_free(struct topa *topa)
+{
+ free_page((unsigned long)topa);
+}
+
+/**
+ * topa_insert_table() - insert a ToPA table into a buffer
+ * @buf: PT buffer that's being extended.
+ * @topa: New topa table to be inserted.
+ *
+ * If it's the first table in this buffer, set up buffer's pointers
+ * accordingly; otherwise, add a END=1 link entry to @topa to the current
+ * "last" table and adjust the last table pointer to @topa.
+ */
+static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
+{
+ struct topa *last = buf->last;
+
+ list_add_tail(&topa->list, &buf->tables);
+
+ if (!buf->first) {
+ buf->first = buf->last = buf->cur = topa;
+ return;
+ }
+
+ topa->offset = last->offset + last->size;
+ buf->last = topa;
+
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ return;
+
+ BUG_ON(last->last != TENTS_PER_PAGE - 1);
+
+ TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
+ TOPA_ENTRY(last, -1)->end = 1;
+}
+
+/**
+ * topa_table_full() - check if a ToPA table is filled up
+ * @topa: ToPA table.
+ */
+static bool topa_table_full(struct topa *topa)
+{
+ /* single-entry ToPA is a special case */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ return !!topa->last;
+
+ return topa->last == TENTS_PER_PAGE - 1;
+}
+
+/**
+ * topa_insert_pages() - create a list of ToPA tables
+ * @buf: PT buffer being initialized.
+ * @gfp: Allocation flags.
+ *
+ * This initializes a list of ToPA tables with entries from
+ * the data_pages provided by rb_alloc_aux().
+ *
+ * Return: 0 on success or error code.
+ */
+static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
+{
+ struct topa *topa = buf->last;
+ int order = 0;
+ struct page *p;
+
+ p = virt_to_page(buf->data_pages[buf->nr_pages]);
+ if (PagePrivate(p))
+ order = page_private(p);
+
+ if (topa_table_full(topa)) {
+ topa = topa_alloc(buf->cpu, gfp);
+ if (!topa)
+ return -ENOMEM;
+
+ topa_insert_table(buf, topa);
+ }
+
+ TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
+ TOPA_ENTRY(topa, -1)->size = order;
+ if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ TOPA_ENTRY(topa, -1)->intr = 1;
+ TOPA_ENTRY(topa, -1)->stop = 1;
+ }
+
+ topa->last++;
+ topa->size += sizes(order);
+
+ buf->nr_pages += 1ul << order;
+
+ return 0;
+}
+
+/**
+ * pt_topa_dump() - print ToPA tables and their entries
+ * @buf: PT buffer.
+ */
+static void pt_topa_dump(struct pt_buffer *buf)
+{
+ struct topa *topa;
+
+ list_for_each_entry(topa, &buf->tables, list) {
+ int i;
+
+ pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
+ topa->phys, topa->offset, topa->size);
+ for (i = 0; i < TENTS_PER_PAGE; i++) {
+ pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
+ &topa->table[i],
+ (unsigned long)topa->table[i].base << TOPA_SHIFT,
+ sizes(topa->table[i].size),
+ topa->table[i].end ? 'E' : ' ',
+ topa->table[i].intr ? 'I' : ' ',
+ topa->table[i].stop ? 'S' : ' ',
+ *(u64 *)&topa->table[i]);
+ if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
+ topa->table[i].stop) ||
+ topa->table[i].end)
+ break;
+ }
+ }
+}
+
+/**
+ * pt_buffer_advance() - advance to the next output region
+ * @buf: PT buffer.
+ *
+ * Advance the current pointers in the buffer to the next ToPA entry.
+ */
+static void pt_buffer_advance(struct pt_buffer *buf)
+{
+ buf->output_off = 0;
+ buf->cur_idx++;
+
+ if (buf->cur_idx == buf->cur->last) {
+ if (buf->cur == buf->last)
+ buf->cur = buf->first;
+ else
+ buf->cur = list_entry(buf->cur->list.next, struct topa,
+ list);
+ buf->cur_idx = 0;
+ }
+}
+
+/**
+ * pt_update_head() - calculate current offsets and sizes
+ * @pt: Per-cpu pt context.
+ *
+ * Update buffer's current write pointer position and data size.
+ */
+static void pt_update_head(struct pt *pt)
+{
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+ u64 topa_idx, base, old;
+
+ /* offset of the first region in this table from the beginning of buf */
+ base = buf->cur->offset + buf->output_off;
+
+ /* offset of the current output region within this table */
+ for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
+ base += sizes(buf->cur->table[topa_idx].size);
+
+ if (buf->snapshot) {
+ local_set(&buf->data_size, base);
+ } else {
+ old = (local64_xchg(&buf->head, base) &
+ ((buf->nr_pages << PAGE_SHIFT) - 1));
+ if (base < old)
+ base += buf->nr_pages << PAGE_SHIFT;
+
+ local_add(base - old, &buf->data_size);
+ }
+}
+
+/**
+ * pt_buffer_region() - obtain current output region's address
+ * @buf: PT buffer.
+ */
+static void *pt_buffer_region(struct pt_buffer *buf)
+{
+ return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
+}
+
+/**
+ * pt_buffer_region_size() - obtain current output region's size
+ * @buf: PT buffer.
+ */
+static size_t pt_buffer_region_size(struct pt_buffer *buf)
+{
+ return sizes(buf->cur->table[buf->cur_idx].size);
+}
+
+/**
+ * pt_handle_status() - take care of possible status conditions
+ * @pt: Per-cpu pt context.
+ */
+static void pt_handle_status(struct pt *pt)
+{
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+ int advance = 0;
+ u64 status;
+
+ rdmsrl(MSR_IA32_RTIT_STATUS, status);
+
+ if (status & RTIT_STATUS_ERROR) {
+ pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
+ pt_topa_dump(buf);
+ status &= ~RTIT_STATUS_ERROR;
+ }
+
+ if (status & RTIT_STATUS_STOPPED) {
+ status &= ~RTIT_STATUS_STOPPED;
+
+ /*
+ * On systems that only do single-entry ToPA, hitting STOP
+ * means we are already losing data; need to let the decoder
+ * know.
+ */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
+ buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
+ local_inc(&buf->lost);
+ advance++;
+ }
+ }
+
+ /*
+ * Also on single-entry ToPA implementations, interrupt will come
+ * before the output reaches its output region's boundary.
+ */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
+ pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
+ void *head = pt_buffer_region(buf);
+
+ /* everything within this margin needs to be zeroed out */
+ memset(head + buf->output_off, 0,
+ pt_buffer_region_size(buf) -
+ buf->output_off);
+ advance++;
+ }
+
+ if (advance)
+ pt_buffer_advance(buf);
+
+ wrmsrl(MSR_IA32_RTIT_STATUS, status);
+}
+
+/**
+ * pt_read_offset() - translate registers into buffer pointers
+ * @buf: PT buffer.
+ *
+ * Set buffer's output pointers from MSR values.
+ */
+static void pt_read_offset(struct pt_buffer *buf)
+{
+ u64 offset, base_topa;
+
+ rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
+ buf->cur = phys_to_virt(base_topa);
+
+ rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
+ /* offset within current output region */
+ buf->output_off = offset >> 32;
+ /* index of current output region within this table */
+ buf->cur_idx = (offset & 0xffffff80) >> 7;
+}
+
+/**
+ * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
+ * @buf: PT buffer.
+ * @pg: Page offset in the buffer.
+ *
+ * When advancing to the next output region (ToPA entry), given a page offset
+ * into the buffer, we need to find the offset of the first page in the next
+ * region.
+ */
+static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
+{
+ struct topa_entry *te = buf->topa_index[pg];
+
+ /* one region */
+ if (buf->first == buf->last && buf->first->last == 1)
+ return pg;
+
+ do {
+ pg++;
+ pg &= buf->nr_pages - 1;
+ } while (buf->topa_index[pg] == te);
+
+ return pg;
+}
+
+/**
+ * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
+ * @buf: PT buffer.
+ * @handle: Current output handle.
+ *
+ * Place INT and STOP marks to prevent overwriting old data that the consumer
+ * hasn't yet collected.
+ */
+static int pt_buffer_reset_markers(struct pt_buffer *buf,
+ struct perf_output_handle *handle)
+
+{
+ unsigned long idx, npages, end;
+
+ if (buf->snapshot)
+ return 0;
+
+ /* can't stop in the middle of an output region */
+ if (buf->output_off + handle->size + 1 <
+ sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
+ return -EINVAL;
+
+
+ /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ return 0;
+
+ /* clear STOP and INT from current entry */
+ buf->topa_index[buf->stop_pos]->stop = 0;
+ buf->topa_index[buf->intr_pos]->intr = 0;
+
+ if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ npages = (handle->size + 1) >> PAGE_SHIFT;
+ end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages;
+ /*if (end > handle->wakeup >> PAGE_SHIFT)
+ end = handle->wakeup >> PAGE_SHIFT;*/
+ idx = end & (buf->nr_pages - 1);
+ buf->stop_pos = idx;
+ idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1;
+ idx &= buf->nr_pages - 1;
+ buf->intr_pos = idx;
+ }
+
+ buf->topa_index[buf->stop_pos]->stop = 1;
+ buf->topa_index[buf->intr_pos]->intr = 1;
+
+ return 0;
+}
+
+/**
+ * pt_buffer_setup_topa_index() - build topa_index[] table of regions
+ * @buf: PT buffer.
+ *
+ * topa_index[] references output regions indexed by offset into the
+ * buffer for purposes of quick reverse lookup.
+ */
+static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
+{
+ struct topa *cur = buf->first, *prev = buf->last;
+ struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
+ *te_prev = TOPA_ENTRY(prev, prev->last - 1);
+ int pg = 0, idx = 0, ntopa = 0;
+
+ while (pg < buf->nr_pages) {
+ int tidx;
+
+ /* pages within one topa entry */
+ for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
+ buf->topa_index[pg] = te_prev;
+
+ te_prev = te_cur;
+
+ if (idx == cur->last - 1) {
+ /* advance to next topa table */
+ idx = 0;
+ cur = list_entry(cur->list.next, struct topa, list);
+ ntopa++;
+ } else
+ idx++;
+ te_cur = TOPA_ENTRY(cur, idx);
+ }
+
+}
+
+/**
+ * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
+ * @buf: PT buffer.
+ * @head: Write pointer (aux_head) from AUX buffer.
+ *
+ * Find the ToPA table and entry corresponding to given @head and set buffer's
+ * "current" pointers accordingly.
+ */
+static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
+{
+ int pg;
+
+ if (buf->snapshot)
+ head &= (buf->nr_pages << PAGE_SHIFT) - 1;
+
+ pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
+ pg = pt_topa_next_entry(buf, pg);
+
+ buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
+ buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
+ (unsigned long)buf->cur) / sizeof(struct topa_entry);
+ buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);
+
+ local64_set(&buf->head, head);
+ local_set(&buf->data_size, 0);
+}
+
+/**
+ * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
+ * @buf: PT buffer.
+ */
+static void pt_buffer_fini_topa(struct pt_buffer *buf)
+{
+ struct topa *topa, *iter;
+
+ list_for_each_entry_safe(topa, iter, &buf->tables, list) {
+ /*
+ * right now, this is in free_aux() path only, so
+ * no need to unlink this table from the list
+ */
+ topa_free(topa);
+ }
+}
+
+/**
+ * pt_buffer_init_topa() - initialize ToPA table for pt buffer
+ * @buf: PT buffer.
+ * @size: Total size of all regions within this ToPA.
+ * @gfp: Allocation flags.
+ */
+static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
+ gfp_t gfp)
+{
+ struct topa *topa;
+ int err;
+
+ topa = topa_alloc(buf->cpu, gfp);
+ if (!topa)
+ return -ENOMEM;
+
+ topa_insert_table(buf, topa);
+
+ while (buf->nr_pages < nr_pages) {
+ err = topa_insert_pages(buf, gfp);
+ if (err) {
+ pt_buffer_fini_topa(buf);
+ return -ENOMEM;
+ }
+ }
+
+ pt_buffer_setup_topa_index(buf);
+
+ /* link last table to the first one, unless we're double buffering */
+ if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
+ TOPA_ENTRY(buf->last, -1)->end = 1;
+ }
+
+ pt_topa_dump(buf);
+ return 0;
+}
+
+/**
+ * pt_buffer_setup_aux() - set up topa tables for a PT buffer
+ * @cpu: Cpu on which to allocate, -1 means current.
+ * @pages: Array of pointers to buffer pages passed from perf core.
+ * @nr_pages: Number of pages in the buffer.
+ * @snapshot: If this is a snapshot/overwrite counter.
+ *
+ * This is a pmu::setup_aux callback that sets up ToPA tables and all the
+ * bookkeeping for an AUX buffer.
+ *
+ * Return: Our private PT buffer structure.
+ */
+static void *
+pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
+{
+ struct pt_buffer *buf;
+ int node, ret;
+
+ if (!nr_pages)
+ return NULL;
+
+ if (cpu == -1)
+ cpu = raw_smp_processor_id();
+ node = cpu_to_node(cpu);
+
+ buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
+ GFP_KERNEL, node);
+ if (!buf)
+ return NULL;
+
+ buf->cpu = cpu;
+ buf->snapshot = snapshot;
+ buf->data_pages = pages;
+
+ INIT_LIST_HEAD(&buf->tables);
+
+ ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
+ if (ret) {
+ kfree(buf);
+ return NULL;
+ }
+
+ return buf;
+}
+
+/**
+ * pt_buffer_free_aux() - perf AUX deallocation path callback
+ * @data: PT buffer.
+ */
+static void pt_buffer_free_aux(void *data)
+{
+ struct pt_buffer *buf = data;
+
+ pt_buffer_fini_topa(buf);
+ kfree(buf);
+}
+
+/**
+ * pt_buffer_is_full() - check if the buffer is full
+ * @buf: PT buffer.
+ * @pt: Per-cpu pt handle.
+ *
+ * If the user hasn't read data from the output region that aux_head
+ * points to, the buffer is considered full: the user needs to read at
+ * least this region and update aux_tail to point past it.
+ */
+static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
+{
+ if (buf->snapshot)
+ return false;
+
+ if (local_read(&buf->data_size) >= pt->handle.size)
+ return true;
+
+ return false;
+}
+
+/**
+ * intel_pt_interrupt() - PT PMI handler
+ */
+void intel_pt_interrupt(void)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf;
+ struct perf_event *event = pt->handle.event;
+
+ /*
+ * There may be a dangling PT bit in the interrupt status register
+ * after PT has been disabled by pt_event_stop(). Make sure we don't
+ * do anything (particularly, re-enable) for this event here.
+ */
+ if (!ACCESS_ONCE(pt->handle_nmi))
+ return;
+
+ pt_config_start(false);
+
+ if (!event)
+ return;
+
+ buf = perf_get_aux(&pt->handle);
+ if (!buf)
+ return;
+
+ pt_read_offset(buf);
+
+ pt_handle_status(pt);
+
+ pt_update_head(pt);
+
+ perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+ local_xchg(&buf->lost, 0));
+
+ if (!event->hw.state) {
+ int ret;
+
+ buf = perf_aux_output_begin(&pt->handle, event);
+ if (!buf) {
+ event->hw.state = PERF_HES_STOPPED;
+ return;
+ }
+
+ pt_buffer_reset_offsets(buf, pt->handle.head);
+ ret = pt_buffer_reset_markers(buf, &pt->handle);
+ if (ret) {
+ perf_aux_output_end(&pt->handle, 0, true);
+ return;
+ }
+
+ pt_config_buffer(buf->cur->table, buf->cur_idx,
+ buf->output_off);
+ wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+ pt_config(event);
+ }
+}
+
+/*
+ * PMU callbacks
+ */
+
+static void pt_event_start(struct perf_event *event, int mode)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+ if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) {
+ event->hw.state = PERF_HES_STOPPED;
+ return;
+ }
+
+ ACCESS_ONCE(pt->handle_nmi) = 1;
+ event->hw.state = 0;
+
+ pt_config_buffer(buf->cur->table, buf->cur_idx,
+ buf->output_off);
+ wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+ pt_config(event);
+}
+
+static void pt_event_stop(struct perf_event *event, int mode)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+ /*
+ * Protect against the PMI racing with disabling wrmsr,
+ * see comment in intel_pt_interrupt().
+ */
+ ACCESS_ONCE(pt->handle_nmi) = 0;
+ pt_config_start(false);
+
+ if (event->hw.state == PERF_HES_STOPPED)
+ return;
+
+ event->hw.state = PERF_HES_STOPPED;
+
+ if (mode & PERF_EF_UPDATE) {
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+ if (!buf)
+ return;
+
+ if (WARN_ON_ONCE(pt->handle.event != event))
+ return;
+
+ pt_read_offset(buf);
+
+ pt_handle_status(pt);
+
+ pt_update_head(pt);
+ }
+}
+
+static void pt_event_del(struct perf_event *event, int mode)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf;
+
+ pt_event_stop(event, PERF_EF_UPDATE);
+
+ buf = perf_get_aux(&pt->handle);
+
+ if (buf) {
+ if (buf->snapshot)
+ pt->handle.head =
+ local_xchg(&buf->data_size,
+ buf->nr_pages << PAGE_SHIFT);
+ perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+ local_xchg(&buf->lost, 0));
+ }
+}
+
+static int pt_event_add(struct perf_event *event, int mode)
+{
+ struct pt_buffer *buf;
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct hw_perf_event *hwc = &event->hw;
+ int ret = -EBUSY;
+
+ if (pt->handle.event)
+ goto fail;
+
+ buf = perf_aux_output_begin(&pt->handle, event);
+ ret = -EINVAL;
+ if (!buf)
+ goto fail_stop;
+
+ pt_buffer_reset_offsets(buf, pt->handle.head);
+ if (!buf->snapshot) {
+ ret = pt_buffer_reset_markers(buf, &pt->handle);
+ if (ret)
+ goto fail_end_stop;
+ }
+
+ if (mode & PERF_EF_START) {
+ pt_event_start(event, 0);
+ ret = -EBUSY;
+ if (hwc->state == PERF_HES_STOPPED)
+ goto fail_end_stop;
+ } else {
+ hwc->state = PERF_HES_STOPPED;
+ }
+
+ return 0;
+
+fail_end_stop:
+ perf_aux_output_end(&pt->handle, 0, true);
+fail_stop:
+ hwc->state = PERF_HES_STOPPED;
+fail:
+ return ret;
+}
+
+static void pt_event_read(struct perf_event *event)
+{
+}
+
+static void pt_event_destroy(struct perf_event *event)
+{
+ x86_del_exclusive(x86_lbr_exclusive_pt);
+}
+
+static int pt_event_init(struct perf_event *event)
+{
+ if (event->attr.type != pt_pmu.pmu.type)
+ return -ENOENT;
+
+ if (!pt_event_valid(event))
+ return -EINVAL;
+
+ if (x86_add_exclusive(x86_lbr_exclusive_pt))
+ return -EBUSY;
+
+ event->destroy = pt_event_destroy;
+
+ return 0;
+}
+
+static __init int pt_init(void)
+{
+ int ret, cpu, prior_warn = 0;
+
+ BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ u64 ctl;
+
+ ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
+ if (!ret && (ctl & RTIT_CTL_TRACEEN))
+ prior_warn++;
+ }
+ put_online_cpus();
+
+ if (prior_warn) {
+ x86_add_exclusive(x86_lbr_exclusive_pt);
+ pr_warn("PT is enabled at boot time, doing nothing\n");
+
+ return -EBUSY;
+ }
+
+ ret = pt_pmu_hw_init();
+ if (ret)
+ return ret;
+
+ if (!pt_cap_get(PT_CAP_topa_output)) {
+ pr_warn("ToPA output is not supported on this CPU\n");
+ return -ENODEV;
+ }
+
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ pt_pmu.pmu.capabilities =
+ PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
+
+ pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
+ pt_pmu.pmu.attr_groups = pt_attr_groups;
+ pt_pmu.pmu.task_ctx_nr = perf_sw_context;
+ pt_pmu.pmu.event_init = pt_event_init;
+ pt_pmu.pmu.add = pt_event_add;
+ pt_pmu.pmu.del = pt_event_del;
+ pt_pmu.pmu.start = pt_event_start;
+ pt_pmu.pmu.stop = pt_event_stop;
+ pt_pmu.pmu.read = pt_event_read;
+ pt_pmu.pmu.setup_aux = pt_buffer_setup_aux;
+ pt_pmu.pmu.free_aux = pt_buffer_free_aux;
+ ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
+
+ return ret;
+}
+
+module_init(pt_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index c4bb8b8e5017..999289b94025 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -62,6 +62,14 @@
#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */
#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
+#define NR_RAPL_DOMAINS 0x4
+static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
+ "pp0-core",
+ "package",
+ "dram",
+ "pp1-gpu",
+};
+
/* Clients have PP0, PKG */
#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
1<<RAPL_IDX_PKG_NRG_STAT|\
@@ -112,7 +120,6 @@ static struct perf_pmu_events_attr event_attr_##v = { \
struct rapl_pmu {
spinlock_t lock;
- int hw_unit; /* 1/2^hw_unit Joule */
int n_active; /* number of active events */
struct list_head active_list;
struct pmu *pmu; /* pointer to rapl_pmu_class */
@@ -120,6 +127,7 @@ struct rapl_pmu {
struct hrtimer hrtimer;
};
+static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */
static struct pmu rapl_pmu_class;
static cpumask_t rapl_cpu_mask;
static int rapl_cntr_mask;
@@ -127,6 +135,7 @@ static int rapl_cntr_mask;
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
+static struct x86_pmu_quirk *rapl_quirks;
static inline u64 rapl_read_counter(struct perf_event *event)
{
u64 raw;
@@ -134,15 +143,28 @@ static inline u64 rapl_read_counter(struct perf_event *event)
return raw;
}
-static inline u64 rapl_scale(u64 v)
+#define rapl_add_quirk(func_) \
+do { \
+ static struct x86_pmu_quirk __quirk __initdata = { \
+ .func = func_, \
+ }; \
+ __quirk.next = rapl_quirks; \
+ rapl_quirks = &__quirk; \
+} while (0)
+
+static inline u64 rapl_scale(u64 v, int cfg)
{
+ if (cfg > NR_RAPL_DOMAINS) {
+ pr_warn("invalid domain %d, failed to scale data\n", cfg);
+ return v;
+ }
/*
* scale delta to smallest unit (1/2^32)
* users must then scale back: count * 1/(1e9*2^32) to get Joules
* or use ldexp(count, -32).
* Watts = Joules/Time delta
*/
- return v << (32 - __this_cpu_read(rapl_pmu)->hw_unit);
+ return v << (32 - rapl_hw_unit[cfg - 1]);
}
static u64 rapl_event_update(struct perf_event *event)
@@ -173,7 +195,7 @@ again:
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
- sdelta = rapl_scale(delta);
+ sdelta = rapl_scale(delta, event->hw.config);
local64_add(sdelta, &event->count);
@@ -546,12 +568,22 @@ static void rapl_cpu_init(int cpu)
cpumask_set_cpu(cpu, &rapl_cpu_mask);
}
+static __init void rapl_hsw_server_quirk(void)
+{
+ /*
+ * DRAM domain on HSW server has fixed energy unit which can be
+ * different than the unit from power unit MSR.
+ * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
+ * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
+ */
+ rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
+}
+
static int rapl_cpu_prepare(int cpu)
{
struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
int phys_id = topology_physical_package_id(cpu);
u64 ms;
- u64 msr_rapl_power_unit_bits;
if (pmu)
return 0;
@@ -559,24 +591,13 @@ static int rapl_cpu_prepare(int cpu)
if (phys_id < 0)
return -1;
- /* protect rdmsrl() to handle virtualization */
- if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
- return -1;
-
pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
if (!pmu)
return -1;
-
spin_lock_init(&pmu->lock);
INIT_LIST_HEAD(&pmu->active_list);
- /*
- * grab power unit as: 1/2^unit Joules
- *
- * we cache in local PMU instance
- */
- pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
pmu->pmu = &rapl_pmu_class;
/*
@@ -586,8 +607,8 @@ static int rapl_cpu_prepare(int cpu)
* divide interval by 2 to avoid lockstep (2 * 100)
* if hw unit is 32, then we use 2 ms 1/200/2
*/
- if (pmu->hw_unit < 32)
- ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1));
+ if (rapl_hw_unit[0] < 32)
+ ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1));
else
ms = 2;
@@ -655,6 +676,20 @@ static int rapl_cpu_notifier(struct notifier_block *self,
return NOTIFY_OK;
}
+static int rapl_check_hw_unit(void)
+{
+ u64 msr_rapl_power_unit_bits;
+ int i;
+
+ /* protect rdmsrl() to handle virtualization */
+ if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
+ return -1;
+ for (i = 0; i < NR_RAPL_DOMAINS; i++)
+ rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
+
+ return 0;
+}
+
static const struct x86_cpu_id rapl_cpu_match[] = {
[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
[1] = {},
@@ -664,6 +699,8 @@ static int __init rapl_pmu_init(void)
{
struct rapl_pmu *pmu;
int cpu, ret;
+ struct x86_pmu_quirk *quirk;
+ int i;
/*
* check for Intel processor family 6
@@ -678,6 +715,11 @@ static int __init rapl_pmu_init(void)
rapl_cntr_mask = RAPL_IDX_CLN;
rapl_pmu_events_group.attrs = rapl_events_cln_attr;
break;
+ case 63: /* Haswell-Server */
+ rapl_add_quirk(rapl_hsw_server_quirk);
+ rapl_cntr_mask = RAPL_IDX_SRV;
+ rapl_pmu_events_group.attrs = rapl_events_srv_attr;
+ break;
case 60: /* Haswell */
case 69: /* Haswell-Celeron */
rapl_cntr_mask = RAPL_IDX_HSW;
@@ -693,7 +735,13 @@ static int __init rapl_pmu_init(void)
/* unsupported */
return 0;
}
+ ret = rapl_check_hw_unit();
+ if (ret)
+ return ret;
+ /* run cpu model quirks */
+ for (quirk = rapl_quirks; quirk; quirk = quirk->next)
+ quirk->func();
cpu_notifier_register_begin();
for_each_online_cpu(cpu) {
@@ -714,14 +762,18 @@ static int __init rapl_pmu_init(void)
pmu = __this_cpu_read(rapl_pmu);
- pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
+ pr_info("RAPL PMU detected,"
" API unit is 2^-32 Joules,"
" %d fixed counters"
" %llu ms ovfl timer\n",
- pmu->hw_unit,
hweight32(rapl_cntr_mask),
ktime_to_ms(pmu->timer_interval));
-
+ for (i = 0; i < NR_RAPL_DOMAINS; i++) {
+ if (rapl_cntr_mask & (1 << i)) {
+ pr_info("hw unit of domain %s 2^-%d Joules\n",
+ rapl_domain_names[i], rapl_hw_unit[i]);
+ }
+ }
out:
cpu_notifier_register_done();
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 21af6149edf2..12d9548457e7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -1132,8 +1132,7 @@ static int snbep_pci2phy_map_init(int devid)
}
}
- if (ubox_dev)
- pci_dev_put(ubox_dev);
+ pci_dev_put(ubox_dev);
return err ? pcibios_err_to_errno(err) : 0;
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 60639093d536..3d423a101fae 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -41,6 +41,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 },
{ X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 },
{ X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 },
+ { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 },
{ X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },