diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-23 15:32:18 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-23 15:32:18 +0300 |
commit | c05f3642f4304dd081876e77a68555b6aba4483f (patch) | |
tree | 915baf10c3518d162c00e62b5d855cf92282ed79 /arch/x86/kernel | |
parent | 0200fbdd431519d730b5d399a12840ec832b27cc (diff) | |
parent | dda93b45389f025fd3422d22cc31cc1ea6040305 (diff) | |
download | linux-c05f3642f4304dd081876e77a68555b6aba4483f.tar.xz |
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf updates from Ingo Molnar:
"The main updates in this cycle were:
- Lots of perf tooling changes too voluminous to list (big perf trace
and perf stat improvements, lots of libtraceevent reorganization,
etc.), so I'll list the authors and refer to the changelog for
details:
Benjamin Peterson, Jérémie Galarneau, Kim Phillips, Peter
Zijlstra, Ravi Bangoria, Sangwon Hong, Sean V Kelley, Steven
Rostedt, Thomas Gleixner, Ding Xiang, Eduardo Habkost, Thomas
Richter, Andi Kleen, Sanskriti Sharma, Adrian Hunter, Tzvetomir
Stoyanov, Arnaldo Carvalho de Melo, Jiri Olsa.
... with the bulk of the changes written by Jiri Olsa, Tzvetomir
Stoyanov and Arnaldo Carvalho de Melo.
- Continued intel_rdt work with a focus on playing well with perf
events. This also imported some non-perf RDT work due to
dependencies. (Reinette Chatre)
- Implement counter freezing for Arch Perfmon v4 (Skylake and newer).
This allows to speed up the PMI handler by avoiding unnecessary MSR
writes and make it more accurate. (Andi Kleen)
- kprobes cleanups and simplification (Masami Hiramatsu)
- Intel Goldmont PMU updates (Kan Liang)
- ... plus misc other fixes and updates"
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (155 commits)
kprobes/x86: Use preempt_enable() in optimized_callback()
x86/intel_rdt: Prevent pseudo-locking from using stale pointers
kprobes, x86/ptrace.h: Make regs_get_kernel_stack_nth() not fault on bad stack
perf/x86/intel: Export mem events only if there's PEBS support
x86/cpu: Drop pointless static qualifier in punit_dev_state_show()
x86/intel_rdt: Fix initial allocation to consider CDP
x86/intel_rdt: CBM overlap should also check for overlap with CDP peer
x86/intel_rdt: Introduce utility to obtain CDP peer
tools lib traceevent, perf tools: Move struct tep_handler definition in a local header file
tools lib traceevent: Separate out tep_strerror() for strerror_r() issues
perf python: More portable way to make CFLAGS work with clang
perf python: Make clang_has_option() work on Python 3
perf tools: Free temporary 'sys' string in read_event_files()
perf tools: Avoid double free in read_event_file()
perf tools: Free 'printk' string in parse_ftrace_printk()
perf tools: Cleanup trace-event-info 'tdata' leak
perf strbuf: Match va_{add,copy} with va_end
perf test: S390 does not support watchpoints in test 22
perf auxtrace: Include missing asm/bitsperlong.h to get BITS_PER_LONG
tools include: Adopt linux/bits.h
...
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 28 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/intel_rdt.c | 17 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 12 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 385 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 176 | ||||
-rw-r--r-- | arch/x86/kernel/kprobes/opt.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/tsc.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/tsc_msr.c | 10 |
8 files changed, 432 insertions, 200 deletions
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 44c4ef3d989b..10e5ccfa9278 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -949,11 +949,11 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) } static const __initconst struct x86_cpu_id cpu_no_speculation[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL_TABLET, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_BONNELL_MID, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL_MID, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_BONNELL, X86_FEATURE_ANY }, { X86_VENDOR_CENTAUR, 5 }, { X86_VENDOR_INTEL, 5 }, { X86_VENDOR_NSC, 5 }, @@ -968,10 +968,10 @@ static const __initconst struct x86_cpu_id cpu_no_meltdown[] = { /* Only list CPUs which speculate but are non susceptible to SSB */ static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_X }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_MID }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, @@ -984,14 +984,14 @@ static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { static const __initconst struct x86_cpu_id cpu_no_l1tf[] = { /* in addition to cpu_no_speculation */ - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_X }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_MID }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT_MID }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_X }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_PLUS }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, {} diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index abb71ac70443..44272b7107ad 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -485,9 +485,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) size_t tsize; if (is_llc_occupancy_enabled()) { - d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid), - sizeof(unsigned long), - GFP_KERNEL); + d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); @@ -496,7 +494,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) tsize = sizeof(*d->mbm_total); d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); if (!d->mbm_total) { - kfree(d->rmid_busy_llc); + bitmap_free(d->rmid_busy_llc); return -ENOMEM; } } @@ -504,7 +502,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) tsize = sizeof(*d->mbm_local); d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); if (!d->mbm_local) { - kfree(d->rmid_busy_llc); + bitmap_free(d->rmid_busy_llc); kfree(d->mbm_total); return -ENOMEM; } @@ -610,9 +608,16 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cancel_delayed_work(&d->cqm_limbo); } + /* + * rdt_domain "d" is going to be freed below, so clear + * its pointer from pseudo_lock_region struct. + */ + if (d->plr) + d->plr->d = NULL; + kfree(d->ctrl_val); kfree(d->mbps_val); - kfree(d->rmid_busy_llc); + bitmap_free(d->rmid_busy_llc); kfree(d->mbm_total); kfree(d->mbm_local); kfree(d); diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 0f53049719cd..27937458c231 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -404,8 +404,16 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, for_each_alloc_enabled_rdt_resource(r) seq_printf(s, "%s:uninitialized\n", r->name); } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - seq_printf(s, "%s:%d=%x\n", rdtgrp->plr->r->name, - rdtgrp->plr->d->id, rdtgrp->plr->cbm); + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + seq_printf(s, "%s:%d=%x\n", + rdtgrp->plr->r->name, + rdtgrp->plr->d->id, + rdtgrp->plr->cbm); + } } else { closid = rdtgrp->closid; for_each_alloc_enabled_rdt_resource(r) { diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index f8c260d522ca..815b4e92522c 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -17,6 +17,7 @@ #include <linux/debugfs.h> #include <linux/kthread.h> #include <linux/mman.h> +#include <linux/perf_event.h> #include <linux/pm_qos.h> #include <linux/slab.h> #include <linux/uaccess.h> @@ -26,6 +27,7 @@ #include <asm/intel_rdt_sched.h> #include <asm/perf_event.h> +#include "../../events/perf_event.h" /* For X86_CONFIG() */ #include "intel_rdt.h" #define CREATE_TRACE_POINTS @@ -91,7 +93,7 @@ static u64 get_prefetch_disable_bits(void) */ return 0xF; case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_GEMINI_LAKE: + case INTEL_FAM6_ATOM_GOLDMONT_PLUS: /* * SDM defines bits of MSR_MISC_FEATURE_CONTROL register * as: @@ -106,16 +108,6 @@ static u64 get_prefetch_disable_bits(void) return 0; } -/* - * Helper to write 64bit value to MSR without tracing. Used when - * use of the cache should be restricted and use of registers used - * for local variables avoided. - */ -static inline void pseudo_wrmsrl_notrace(unsigned int msr, u64 val) -{ - __wrmsr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32)); -} - /** * pseudo_lock_minor_get - Obtain available minor number * @minor: Pointer to where new minor number will be stored @@ -888,31 +880,14 @@ static int measure_cycles_lat_fn(void *_plr) struct pseudo_lock_region *plr = _plr; unsigned long i; u64 start, end; -#ifdef CONFIG_KASAN - /* - * The registers used for local register variables are also used - * when KASAN is active. When KASAN is active we use a regular - * variable to ensure we always use a valid pointer to access memory. - * The cost is that accessing this pointer, which could be in - * cache, will be included in the measurement of memory read latency. - */ void *mem_r; -#else -#ifdef CONFIG_X86_64 - register void *mem_r asm("rbx"); -#else - register void *mem_r asm("ebx"); -#endif /* CONFIG_X86_64 */ -#endif /* CONFIG_KASAN */ local_irq_disable(); /* - * The wrmsr call may be reordered with the assignment below it. - * Call wrmsr as directly as possible to avoid tracing clobbering - * local register variable used for memory pointer. + * Disable hardware prefetchers. */ - __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); - mem_r = plr->kmem; + wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + mem_r = READ_ONCE(plr->kmem); /* * Dummy execute of the time measurement to load the needed * instructions into the L1 instruction cache. @@ -934,157 +909,240 @@ static int measure_cycles_lat_fn(void *_plr) return 0; } -static int measure_cycles_perf_fn(void *_plr) +/* + * Create a perf_event_attr for the hit and miss perf events that will + * be used during the performance measurement. A perf_event maintains + * a pointer to its perf_event_attr so a unique attribute structure is + * created for each perf_event. + * + * The actual configuration of the event is set right before use in order + * to use the X86_CONFIG macro. + */ +static struct perf_event_attr perf_miss_attr = { + .type = PERF_TYPE_RAW, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 0, + .exclude_user = 1, +}; + +static struct perf_event_attr perf_hit_attr = { + .type = PERF_TYPE_RAW, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 0, + .exclude_user = 1, +}; + +struct residency_counts { + u64 miss_before, hits_before; + u64 miss_after, hits_after; +}; + +static int measure_residency_fn(struct perf_event_attr *miss_attr, + struct perf_event_attr *hit_attr, + struct pseudo_lock_region *plr, + struct residency_counts *counts) { - unsigned long long l3_hits = 0, l3_miss = 0; - u64 l3_hit_bits = 0, l3_miss_bits = 0; - struct pseudo_lock_region *plr = _plr; - unsigned long long l2_hits, l2_miss; - u64 l2_hit_bits, l2_miss_bits; - unsigned long i; -#ifdef CONFIG_KASAN - /* - * The registers used for local register variables are also used - * when KASAN is active. When KASAN is active we use regular variables - * at the cost of including cache access latency to these variables - * in the measurements. - */ + u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0; + struct perf_event *miss_event, *hit_event; + int hit_pmcnum, miss_pmcnum; unsigned int line_size; unsigned int size; + unsigned long i; void *mem_r; -#else - register unsigned int line_size asm("esi"); - register unsigned int size asm("edi"); -#ifdef CONFIG_X86_64 - register void *mem_r asm("rbx"); -#else - register void *mem_r asm("ebx"); -#endif /* CONFIG_X86_64 */ -#endif /* CONFIG_KASAN */ + u64 tmp; + + miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu, + NULL, NULL, NULL); + if (IS_ERR(miss_event)) + goto out; + + hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu, + NULL, NULL, NULL); + if (IS_ERR(hit_event)) + goto out_miss; + + local_irq_disable(); + /* + * Check any possible error state of events used by performing + * one local read. + */ + if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) { + local_irq_enable(); + goto out_hit; + } + if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) { + local_irq_enable(); + goto out_hit; + } + + /* + * Disable hardware prefetchers. + */ + wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + + /* Initialize rest of local variables */ + /* + * Performance event has been validated right before this with + * interrupts disabled - it is thus safe to read the counter index. + */ + miss_pmcnum = x86_perf_rdpmc_index(miss_event); + hit_pmcnum = x86_perf_rdpmc_index(hit_event); + line_size = READ_ONCE(plr->line_size); + mem_r = READ_ONCE(plr->kmem); + size = READ_ONCE(plr->size); + + /* + * Read counter variables twice - first to load the instructions + * used in L1 cache, second to capture accurate value that does not + * include cache misses incurred because of instruction loads. + */ + rdpmcl(hit_pmcnum, hits_before); + rdpmcl(miss_pmcnum, miss_before); + /* + * From SDM: Performing back-to-back fast reads are not guaranteed + * to be monotonic. + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + rdpmcl(hit_pmcnum, hits_before); + rdpmcl(miss_pmcnum, miss_before); + /* + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + for (i = 0; i < size; i += line_size) { + /* + * Add a barrier to prevent speculative execution of this + * loop reading beyond the end of the buffer. + */ + rmb(); + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + } + /* + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + rdpmcl(hit_pmcnum, hits_after); + rdpmcl(miss_pmcnum, miss_after); + /* + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + /* Re-enable hardware prefetchers */ + wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + local_irq_enable(); +out_hit: + perf_event_release_kernel(hit_event); +out_miss: + perf_event_release_kernel(miss_event); +out: + /* + * All counts will be zero on failure. + */ + counts->miss_before = miss_before; + counts->hits_before = hits_before; + counts->miss_after = miss_after; + counts->hits_after = hits_after; + return 0; +} + +static int measure_l2_residency(void *_plr) +{ + struct pseudo_lock_region *plr = _plr; + struct residency_counts counts = {0}; /* * Non-architectural event for the Goldmont Microarchitecture * from Intel x86 Architecture Software Developer Manual (SDM): * MEM_LOAD_UOPS_RETIRED D1H (event number) * Umask values: - * L1_HIT 01H * L2_HIT 02H - * L1_MISS 08H * L2_MISS 10H - * - * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event - * has two "no fix" errata associated with it: BDM35 and BDM100. On - * this platform we use the following events instead: - * L2_RQSTS 24H (Documented in https://download.01.org/perfmon/BDW/) - * REFERENCES FFH - * MISS 3FH - * LONGEST_LAT_CACHE 2EH (Documented in SDM) - * REFERENCE 4FH - * MISS 41H */ - - /* - * Start by setting flags for IA32_PERFEVTSELx: - * OS (Operating system mode) 0x2 - * INT (APIC interrupt enable) 0x10 - * EN (Enable counter) 0x40 - * - * Then add the Umask value and event number to select performance - * event. - */ - switch (boot_cpu_data.x86_model) { case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_GEMINI_LAKE: - l2_hit_bits = (0x52ULL << 16) | (0x2 << 8) | 0xd1; - l2_miss_bits = (0x52ULL << 16) | (0x10 << 8) | 0xd1; - break; - case INTEL_FAM6_BROADWELL_X: - /* On BDW the l2_hit_bits count references, not hits */ - l2_hit_bits = (0x52ULL << 16) | (0xff << 8) | 0x24; - l2_miss_bits = (0x52ULL << 16) | (0x3f << 8) | 0x24; - /* On BDW the l3_hit_bits count references, not hits */ - l3_hit_bits = (0x52ULL << 16) | (0x4f << 8) | 0x2e; - l3_miss_bits = (0x52ULL << 16) | (0x41 << 8) | 0x2e; + case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + perf_miss_attr.config = X86_CONFIG(.event = 0xd1, + .umask = 0x10); + perf_hit_attr.config = X86_CONFIG(.event = 0xd1, + .umask = 0x2); break; default: goto out; } - local_irq_disable(); + measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts); /* - * Call wrmsr direcly to avoid the local register variables from - * being overwritten due to reordering of their assignment with - * the wrmsr calls. + * If a failure prevented the measurements from succeeding + * tracepoints will still be written and all counts will be zero. */ - __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); - /* Disable events and reset counters */ - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, 0x0); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x0); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0, 0x0); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 1, 0x0); - if (l3_hit_bits > 0) { - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x0); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, 0x0); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 2, 0x0); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 3, 0x0); - } - /* Set and enable the L2 counters */ - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, l2_hit_bits); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, l2_miss_bits); - if (l3_hit_bits > 0) { - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, - l3_hit_bits); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, - l3_miss_bits); - } - mem_r = plr->kmem; - size = plr->size; - line_size = plr->line_size; - for (i = 0; i < size; i += line_size) { - asm volatile("mov (%0,%1,1), %%eax\n\t" - : - : "r" (mem_r), "r" (i) - : "%eax", "memory"); - } + trace_pseudo_lock_l2(counts.hits_after - counts.hits_before, + counts.miss_after - counts.miss_before); +out: + plr->thread_done = 1; + wake_up_interruptible(&plr->lock_thread_wq); + return 0; +} + +static int measure_l3_residency(void *_plr) +{ + struct pseudo_lock_region *plr = _plr; + struct residency_counts counts = {0}; + /* - * Call wrmsr directly (no tracing) to not influence - * the cache access counters as they are disabled. + * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event + * has two "no fix" errata associated with it: BDM35 and BDM100. On + * this platform the following events are used instead: + * LONGEST_LAT_CACHE 2EH (Documented in SDM) + * REFERENCE 4FH + * MISS 41H */ - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, - l2_hit_bits & ~(0x40ULL << 16)); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, - l2_miss_bits & ~(0x40ULL << 16)); - if (l3_hit_bits > 0) { - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, - l3_hit_bits & ~(0x40ULL << 16)); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, - l3_miss_bits & ~(0x40ULL << 16)); - } - l2_hits = native_read_pmc(0); - l2_miss = native_read_pmc(1); - if (l3_hit_bits > 0) { - l3_hits = native_read_pmc(2); - l3_miss = native_read_pmc(3); + + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_BROADWELL_X: + /* On BDW the hit event counts references, not hits */ + perf_hit_attr.config = X86_CONFIG(.event = 0x2e, + .umask = 0x4f); + perf_miss_attr.config = X86_CONFIG(.event = 0x2e, + .umask = 0x41); + break; + default: + goto out; } - wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); - local_irq_enable(); + + measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts); /* - * On BDW we count references and misses, need to adjust. Sometimes - * the "hits" counter is a bit more than the references, for - * example, x references but x + 1 hits. To not report invalid - * hit values in this case we treat that as misses eaqual to - * references. + * If a failure prevented the measurements from succeeding + * tracepoints will still be written and all counts will be zero. */ - if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) - l2_hits -= (l2_miss > l2_hits ? l2_hits : l2_miss); - trace_pseudo_lock_l2(l2_hits, l2_miss); - if (l3_hit_bits > 0) { - if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) - l3_hits -= (l3_miss > l3_hits ? l3_hits : l3_miss); - trace_pseudo_lock_l3(l3_hits, l3_miss); + + counts.miss_after -= counts.miss_before; + if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) { + /* + * On BDW references and misses are counted, need to adjust. + * Sometimes the "hits" counter is a bit more than the + * references, for example, x references but x + 1 hits. + * To not report invalid hit values in this case we treat + * that as misses equal to references. + */ + /* First compute the number of cache references measured */ + counts.hits_after -= counts.hits_before; + /* Next convert references to cache hits */ + counts.hits_after -= min(counts.miss_after, counts.hits_after); + } else { + counts.hits_after -= counts.hits_before; } + trace_pseudo_lock_l3(counts.hits_after, counts.miss_after); out: plr->thread_done = 1; wake_up_interruptible(&plr->lock_thread_wq); @@ -1116,6 +1174,11 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) goto out; } + if (!plr->d) { + ret = -ENODEV; + goto out; + } + plr->thread_done = 0; cpu = cpumask_first(&plr->d->cpu_mask); if (!cpu_online(cpu)) { @@ -1123,13 +1186,20 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) goto out; } + plr->cpu = cpu; + if (sel == 1) thread = kthread_create_on_node(measure_cycles_lat_fn, plr, cpu_to_node(cpu), "pseudo_lock_measure/%u", cpu); else if (sel == 2) - thread = kthread_create_on_node(measure_cycles_perf_fn, plr, + thread = kthread_create_on_node(measure_l2_residency, plr, + cpu_to_node(cpu), + "pseudo_lock_measure/%u", + cpu); + else if (sel == 3) + thread = kthread_create_on_node(measure_l3_residency, plr, cpu_to_node(cpu), "pseudo_lock_measure/%u", cpu); @@ -1173,7 +1243,7 @@ static ssize_t pseudo_lock_measure_trigger(struct file *file, buf[buf_size] = '\0'; ret = kstrtoint(buf, 10, &sel); if (ret == 0) { - if (sel != 1) + if (sel != 1 && sel != 2 && sel != 3) return -EINVAL; ret = debugfs_file_get(file->f_path.dentry); if (ret) @@ -1429,6 +1499,11 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) plr = rdtgrp->plr; + if (!plr->d) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + /* * Task is required to run with affinity to the cpus associated * with the pseudo-locked region. If this is not the case the task diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index b140c68bc14b..f27b8115ffa2 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -268,17 +268,27 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct rdtgroup *rdtgrp; + struct cpumask *mask; int ret = 0; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (rdtgrp) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) - seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", - cpumask_pr_args(&rdtgrp->plr->d->cpu_mask)); - else + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + mask = &rdtgrp->plr->d->cpu_mask; + seq_printf(s, is_cpu_list(of) ? + "%*pbl\n" : "%*pb\n", + cpumask_pr_args(mask)); + } + } else { seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask)); + } } else { ret = -ENOENT; } @@ -961,7 +971,78 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of, } /** - * rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other + * rdt_cdp_peer_get - Retrieve CDP peer if it exists + * @r: RDT resource to which RDT domain @d belongs + * @d: Cache instance for which a CDP peer is requested + * @r_cdp: RDT resource that shares hardware with @r (RDT resource peer) + * Used to return the result. + * @d_cdp: RDT domain that shares hardware with @d (RDT domain peer) + * Used to return the result. + * + * RDT resources are managed independently and by extension the RDT domains + * (RDT resource instances) are managed independently also. The Code and + * Data Prioritization (CDP) RDT resources, while managed independently, + * could refer to the same underlying hardware. For example, + * RDT_RESOURCE_L2CODE and RDT_RESOURCE_L2DATA both refer to the L2 cache. + * + * When provided with an RDT resource @r and an instance of that RDT + * resource @d rdt_cdp_peer_get() will return if there is a peer RDT + * resource and the exact instance that shares the same hardware. + * + * Return: 0 if a CDP peer was found, <0 on error or if no CDP peer exists. + * If a CDP peer was found, @r_cdp will point to the peer RDT resource + * and @d_cdp will point to the peer RDT domain. + */ +static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d, + struct rdt_resource **r_cdp, + struct rdt_domain **d_cdp) +{ + struct rdt_resource *_r_cdp = NULL; + struct rdt_domain *_d_cdp = NULL; + int ret = 0; + + switch (r->rid) { + case RDT_RESOURCE_L3DATA: + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3CODE]; + break; + case RDT_RESOURCE_L3CODE: + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3DATA]; + break; + case RDT_RESOURCE_L2DATA: + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2CODE]; + break; + case RDT_RESOURCE_L2CODE: + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2DATA]; + break; + default: + ret = -ENOENT; + goto out; + } + + /* + * When a new CPU comes online and CDP is enabled then the new + * RDT domains (if any) associated with both CDP RDT resources + * are added in the same CPU online routine while the + * rdtgroup_mutex is held. It should thus not happen for one + * RDT domain to exist and be associated with its RDT CDP + * resource but there is no RDT domain associated with the + * peer RDT CDP resource. Hence the WARN. + */ + _d_cdp = rdt_find_domain(_r_cdp, d->id, NULL); + if (WARN_ON(!_d_cdp)) { + _r_cdp = NULL; + ret = -EINVAL; + } + +out: + *r_cdp = _r_cdp; + *d_cdp = _d_cdp; + + return ret; +} + +/** + * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other * @r: Resource to which domain instance @d belongs. * @d: The domain instance for which @closid is being tested. * @cbm: Capacity bitmask being tested. @@ -980,8 +1061,8 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of, * * Return: false if CBM does not overlap, true if it does. */ -bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, - unsigned long cbm, int closid, bool exclusive) +static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, + unsigned long cbm, int closid, bool exclusive) { enum rdtgrp_mode mode; unsigned long ctrl_b; @@ -1017,6 +1098,41 @@ bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, } /** + * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware + * @r: Resource to which domain instance @d belongs. + * @d: The domain instance for which @closid is being tested. + * @cbm: Capacity bitmask being tested. + * @closid: Intended closid for @cbm. + * @exclusive: Only check if overlaps with exclusive resource groups + * + * Resources that can be allocated using a CBM can use the CBM to control + * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test + * for overlap. Overlap test is not limited to the specific resource for + * which the CBM is intended though - when dealing with CDP resources that + * share the underlying hardware the overlap check should be performed on + * the CDP resource sharing the hardware also. + * + * Refer to description of __rdtgroup_cbm_overlaps() for the details of the + * overlap test. + * + * Return: true if CBM overlap detected, false if there is no overlap + */ +bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, + unsigned long cbm, int closid, bool exclusive) +{ + struct rdt_resource *r_cdp; + struct rdt_domain *d_cdp; + + if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, exclusive)) + return true; + + if (rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp) < 0) + return false; + + return __rdtgroup_cbm_overlaps(r_cdp, d_cdp, cbm, closid, exclusive); +} + +/** * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive * * An exclusive resource group implies that there should be no sharing of @@ -1176,6 +1292,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, struct rdt_resource *r; struct rdt_domain *d; unsigned int size; + int ret = 0; bool sep; u32 ctrl; @@ -1186,11 +1303,18 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, } if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - seq_printf(s, "%*s:", max_name_width, rdtgrp->plr->r->name); - size = rdtgroup_cbm_to_size(rdtgrp->plr->r, - rdtgrp->plr->d, - rdtgrp->plr->cbm); - seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size); + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + seq_printf(s, "%*s:", max_name_width, + rdtgrp->plr->r->name); + size = rdtgroup_cbm_to_size(rdtgrp->plr->r, + rdtgrp->plr->d, + rdtgrp->plr->cbm); + seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size); + } goto out; } @@ -1220,7 +1344,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, out: rdtgroup_kn_unlock(of->kn); - return 0; + return ret; } /* rdtgroup information files for one cache resource. */ @@ -2354,14 +2478,16 @@ static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r) */ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) { + struct rdt_resource *r_cdp = NULL; + struct rdt_domain *d_cdp = NULL; u32 used_b = 0, unused_b = 0; u32 closid = rdtgrp->closid; struct rdt_resource *r; unsigned long tmp_cbm; enum rdtgrp_mode mode; struct rdt_domain *d; + u32 peer_ctl, *ctrl; int i, ret; - u32 *ctrl; for_each_alloc_enabled_rdt_resource(r) { /* @@ -2371,6 +2497,7 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) if (r->rid == RDT_RESOURCE_MBA) continue; list_for_each_entry(d, &r->domains, list) { + rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); d->have_new_ctrl = false; d->new_ctrl = r->cache.shareable_bits; used_b = r->cache.shareable_bits; @@ -2380,9 +2507,19 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) mode = rdtgroup_mode_by_closid(i); if (mode == RDT_MODE_PSEUDO_LOCKSETUP) break; - used_b |= *ctrl; + /* + * If CDP is active include peer + * domain's usage to ensure there + * is no overlap with an exclusive + * group. + */ + if (d_cdp) + peer_ctl = d_cdp->ctrl_val[i]; + else + peer_ctl = 0; + used_b |= *ctrl | peer_ctl; if (mode == RDT_MODE_SHAREABLE) - d->new_ctrl |= *ctrl; + d->new_ctrl |= *ctrl | peer_ctl; } } if (d->plr && d->plr->cbm > 0) @@ -2805,6 +2942,13 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) { if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) seq_puts(seq, ",cdp"); + + if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) + seq_puts(seq, ",cdpl2"); + + if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA])) + seq_puts(seq, ",mba_MBps"); + return 0; } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index eaf02f2e7300..40b16b270656 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -179,7 +179,7 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) opt_pre_handler(&op->kp, regs); __this_cpu_write(current_kprobe, NULL); } - preempt_enable_no_resched(); + preempt_enable(); } NOKPROBE_SYMBOL(optimized_callback); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6d5dc5dabfd7..03b7529333a6 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -636,7 +636,7 @@ unsigned long native_calibrate_tsc(void) case INTEL_FAM6_KABYLAKE_DESKTOP: crystal_khz = 24000; /* 24.0 MHz */ break; - case INTEL_FAM6_ATOM_DENVERTON: + case INTEL_FAM6_ATOM_GOLDMONT_X: crystal_khz = 25000; /* 25.0 MHz */ break; case INTEL_FAM6_ATOM_GOLDMONT: diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 27ef714d886c..3d0e9aeea7c8 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -59,12 +59,12 @@ static const struct freq_desc freq_desc_ann = { }; static const struct x86_cpu_id tsc_msr_cpu_ids[] = { - INTEL_CPU_FAM6(ATOM_PENWELL, freq_desc_pnw), - INTEL_CPU_FAM6(ATOM_CLOVERVIEW, freq_desc_clv), - INTEL_CPU_FAM6(ATOM_SILVERMONT1, freq_desc_byt), + INTEL_CPU_FAM6(ATOM_SALTWELL_MID, freq_desc_pnw), + INTEL_CPU_FAM6(ATOM_SALTWELL_TABLET, freq_desc_clv), + INTEL_CPU_FAM6(ATOM_SILVERMONT, freq_desc_byt), + INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, freq_desc_tng), INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht), - INTEL_CPU_FAM6(ATOM_MERRIFIELD, freq_desc_tng), - INTEL_CPU_FAM6(ATOM_MOOREFIELD, freq_desc_ann), + INTEL_CPU_FAM6(ATOM_AIRMONT_MID, freq_desc_ann), {} }; |