summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig10
-rw-r--r--arch/x86/Makefile37
-rw-r--r--arch/x86/entry/entry_64.S26
-rw-r--r--arch/x86/events/core.c28
-rw-r--r--arch/x86/events/intel/core.c547
-rw-r--r--arch/x86/events/intel/ds.c131
-rw-r--r--arch/x86/events/intel/uncore.c58
-rw-r--r--arch/x86/events/intel/uncore.h5
-rw-r--r--arch/x86/events/intel/uncore_snb.c2
-rw-r--r--arch/x86/events/intel/uncore_snbep.c114
-rw-r--r--arch/x86/events/perf_event.h21
-rw-r--r--arch/x86/events/probe.c7
-rw-r--r--arch/x86/events/probe.h7
-rw-r--r--arch/x86/events/rapl.c34
-rw-r--r--arch/x86/include/asm/compat.h11
-rw-r--r--arch/x86/include/asm/cpufeature.h7
-rw-r--r--arch/x86/include/asm/cpufeatures.h17
-rw-r--r--arch/x86/include/asm/disabled-features.h3
-rw-r--r--arch/x86/include/asm/efi.h46
-rw-r--r--arch/x86/include/asm/elf.h2
-rw-r--r--arch/x86/include/asm/elfcore-compat.h31
-rw-r--r--arch/x86/include/asm/fpu/api.h12
-rw-r--r--arch/x86/include/asm/idtentry.h6
-rw-r--r--arch/x86/include/asm/irq.h2
-rw-r--r--arch/x86/include/asm/irqflags.h46
-rw-r--r--arch/x86/include/asm/kprobes.h11
-rw-r--r--arch/x86/include/asm/mce.h22
-rw-r--r--arch/x86/include/asm/microcode.h2
-rw-r--r--arch/x86/include/asm/nmi.h1
-rw-r--r--arch/x86/include/asm/page_64_types.h2
-rw-r--r--arch/x86/include/asm/paravirt.h30
-rw-r--r--arch/x86/include/asm/paravirt_types.h17
-rw-r--r--arch/x86/include/asm/perf_event.h24
-rw-r--r--arch/x86/include/asm/pgtable_types.h2
-rw-r--r--arch/x86/include/asm/preempt.h48
-rw-r--r--arch/x86/include/asm/required-features.h3
-rw-r--r--arch/x86/include/asm/resctrl.h11
-rw-r--r--arch/x86/include/asm/special_insns.h6
-rw-r--r--arch/x86/include/asm/static_call.h7
-rw-r--r--arch/x86/include/asm/thermal.h13
-rw-r--r--arch/x86/include/asm/tlb.h1
-rw-r--r--arch/x86/include/asm/vm86.h1
-rw-r--r--arch/x86/include/uapi/asm/vm86.h4
-rw-r--r--arch/x86/kernel/asm-offsets_64.c3
-rw-r--r--arch/x86/kernel/cpu/common.c3
-rw-r--r--arch/x86/kernel/cpu/intel.c3
-rw-r--r--arch/x86/kernel/cpu/mce/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mce/core.c16
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c1
-rw-r--r--arch/x86/kernel/cpu/mce/therm_throt.c739
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c2
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c11
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h1
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c24
-rw-r--r--arch/x86/kernel/cpu/scattered.c5
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.c3
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c13
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c14
-rw-r--r--arch/x86/kernel/fpu/xstate.c4
-rw-r--r--arch/x86/kernel/irq.c21
-rw-r--r--arch/x86/kernel/irqflags.S11
-rw-r--r--arch/x86/kernel/kprobes/core.c168
-rw-r--r--arch/x86/kernel/ldt.c10
-rw-r--r--arch/x86/kernel/module.c1
-rw-r--r--arch/x86/kernel/msr.c7
-rw-r--r--arch/x86/kernel/paravirt.c7
-rw-r--r--arch/x86/kernel/paravirt_patch.c10
-rw-r--r--arch/x86/kernel/pci-iommu_table.c3
-rw-r--r--arch/x86/kernel/ptrace.c46
-rw-r--r--arch/x86/kernel/reboot.c9
-rw-r--r--arch/x86/kernel/smpboot.c1
-rw-r--r--arch/x86/kernel/static_call.c17
-rw-r--r--arch/x86/kernel/sys_x86_64.c8
-rw-r--r--arch/x86/kernel/vm86_32.c62
-rw-r--r--arch/x86/kvm/x86.c1
-rw-r--r--arch/x86/mm/fault.c403
-rw-r--r--arch/x86/mm/init.c19
-rw-r--r--arch/x86/mm/mem_encrypt.c5
-rw-r--r--arch/x86/mm/mmio-mod.c2
-rw-r--r--arch/x86/net/bpf_jit_comp.c422
-rw-r--r--arch/x86/net/bpf_jit_comp32.c6
-rw-r--r--arch/x86/oprofile/Makefile12
-rw-r--r--arch/x86/oprofile/backtrace.c127
-rw-r--r--arch/x86/oprofile/init.c38
-rw-r--r--arch/x86/oprofile/nmi_int.c780
-rw-r--r--arch/x86/oprofile/op_counter.h30
-rw-r--r--arch/x86/oprofile/op_model_amd.c542
-rw-r--r--arch/x86/oprofile/op_model_p4.c723
-rw-r--r--arch/x86/oprofile/op_model_ppro.c245
-rw-r--r--arch/x86/oprofile/op_x86_model.h90
-rw-r--r--arch/x86/pci/init.c15
-rw-r--r--arch/x86/platform/Makefile1
-rw-r--r--arch/x86/platform/efi/efi_64.c33
-rw-r--r--arch/x86/platform/efi/efi_thunk_64.S6
-rw-r--r--arch/x86/platform/efi/quirks.c16
-rw-r--r--arch/x86/platform/geode/alix.c19
-rw-r--r--arch/x86/platform/geode/geos.c19
-rw-r--r--arch/x86/platform/geode/net5501.c13
-rw-r--r--arch/x86/platform/goldfish/Makefile2
-rw-r--r--arch/x86/platform/goldfish/goldfish.c54
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_bt.c4
-rw-r--r--arch/x86/tools/relocs.c12
-rw-r--r--arch/x86/xen/enlighten_pv.c32
-rw-r--r--arch/x86/xen/irq.c23
-rw-r--r--arch/x86/xen/p2m.c15
-rw-r--r--arch/x86/xen/xen-asm.S52
-rw-r--r--arch/x86/xen/xen-ops.h3
110 files changed, 1800 insertions, 4611 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 21f851179ff0..595193bc2d31 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -32,6 +32,7 @@ config X86_64
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE
select SWIOTLB
+ select ARCH_HAS_ELFCORE_COMPAT
config FORCE_DYNAMIC_FTRACE
def_bool y
@@ -206,7 +207,6 @@ config X86
select HAVE_MOVE_PMD
select HAVE_MOVE_PUD
select HAVE_NMI
- select HAVE_OPROFILE
select HAVE_OPTPROBES
select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
@@ -224,6 +224,7 @@ config X86
select HAVE_STACK_VALIDATION if X86_64
select HAVE_STATIC_CALL
select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION
+ select HAVE_PREEMPT_DYNAMIC
select HAVE_RSEQ
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
@@ -890,7 +891,7 @@ config HPET_TIMER
config HPET_EMULATE_RTC
def_bool y
- depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
+ depends on HPET_TIMER && (RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
config APB_TIMER
def_bool y if X86_INTEL_MID
@@ -1158,10 +1159,6 @@ config X86_MCE_INJECT
If you don't know what a machine check is and you don't do kernel
QA it is safe to say n.
-config X86_THERMAL_VECTOR
- def_bool y
- depends on X86_MCE_INTEL
-
source "arch/x86/events/Kconfig"
config X86_LEGACY_VM86
@@ -2864,7 +2861,6 @@ config IA32_EMULATION
depends on X86_64
select ARCH_WANT_OLD_COMPAT_IPC
select BINFMT_ELF
- select COMPAT_BINFMT_ELF
select COMPAT_OLD_SIGACTION
help
Include code to run legacy 32-bit programs under a
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5857917f83ee..b797f1561943 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -50,6 +50,9 @@ export BITS
KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
+# Intel CET isn't enabled in the kernel
+KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
+
ifeq ($(CONFIG_X86_32),y)
BITS := 32
UTS_MACHINE := i386
@@ -120,9 +123,6 @@ else
KBUILD_CFLAGS += -mno-red-zone
KBUILD_CFLAGS += -mcmodel=kernel
-
- # Intel CET isn't enabled in the kernel
- KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
endif
ifdef CONFIG_X86_X32
@@ -232,9 +232,6 @@ core-y += arch/x86/
drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
drivers-$(CONFIG_PCI) += arch/x86/pci/
-# must be linked after kernel/
-drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
-
# suspend and hibernation support
drivers-$(CONFIG_PM) += arch/x86/power/
@@ -295,16 +292,20 @@ archclean:
$(Q)$(MAKE) $(clean)=arch/x86/tools
define archhelp
- echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
- echo ' install - Install kernel using'
- echo ' (your) ~/bin/$(INSTALLKERNEL) or'
- echo ' (distribution) /sbin/$(INSTALLKERNEL) or'
- echo ' install to $$(INSTALL_PATH) and run lilo'
- echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
- echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
- echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)'
- echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
- echo ' bzdisk/fdimage*/isoimage also accept:'
- echo ' FDARGS="..." arguments for the booted kernel'
- echo ' FDINITRD=file initrd for the booted kernel'
+ echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
+ echo ' install - Install kernel using (your) ~/bin/$(INSTALLKERNEL) or'
+ echo ' (distribution) /sbin/$(INSTALLKERNEL) or install to '
+ echo ' $$(INSTALL_PATH) and run lilo'
+ echo ''
+ echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
+ echo ' bzdisk/fdimage*/isoimage also accept:'
+ echo ' FDARGS="..." arguments for the booted kernel'
+ echo ' FDINITRD=file initrd for the booted kernel'
+ echo ''
+ echo ' kvm_guest.config - Enable Kconfig items for running this kernel as a KVM guest'
+ echo ' xen.config - Enable Kconfig items for running this kernel as a Xen guest'
+
endef
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index cad08703c4ad..ce0464d630a2 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -46,14 +46,6 @@
.code64
.section .entry.text, "ax"
-#ifdef CONFIG_PARAVIRT_XXL
-SYM_CODE_START(native_usergs_sysret64)
- UNWIND_HINT_EMPTY
- swapgs
- sysretq
-SYM_CODE_END(native_usergs_sysret64)
-#endif /* CONFIG_PARAVIRT_XXL */
-
/*
* 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
@@ -123,7 +115,12 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
* Try to use SYSRET instead of IRET if we're returning to
* a completely clean 64-bit userspace context. If we're not,
* go to the slow exit path.
+ * In the Xen PV case we must use iret anyway.
*/
+
+ ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
+ X86_FEATURE_XENPV
+
movq RCX(%rsp), %rcx
movq RIP(%rsp), %r11
@@ -215,7 +212,8 @@ syscall_return_via_sysret:
popq %rdi
popq %rsp
- USERGS_SYSRET64
+ swapgs
+ sysretq
SYM_CODE_END(entry_SYSCALL_64)
/*
@@ -669,7 +667,7 @@ native_irq_return_ldt:
*/
pushq %rdi /* Stash user RDI */
- SWAPGS /* to kernel GS */
+ swapgs /* to kernel GS */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
movq PER_CPU_VAR(espfix_waddr), %rdi
@@ -699,7 +697,7 @@ native_irq_return_ldt:
orq PER_CPU_VAR(espfix_stack), %rax
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
- SWAPGS /* to user GS */
+ swapgs /* to user GS */
popq %rdi /* Restore user RDI */
movq %rax, %rsp
@@ -943,7 +941,7 @@ SYM_CODE_START_LOCAL(paranoid_entry)
ret
.Lparanoid_entry_swapgs:
- SWAPGS
+ swapgs
/*
* The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
@@ -1001,7 +999,7 @@ SYM_CODE_START_LOCAL(paranoid_exit)
jnz restore_regs_and_return_to_kernel
/* We are returning to a context with user GSBASE */
- SWAPGS_UNSAFE_STACK
+ swapgs
jmp restore_regs_and_return_to_kernel
SYM_CODE_END(paranoid_exit)
@@ -1426,7 +1424,7 @@ nmi_no_fsgsbase:
jnz nmi_restore
nmi_swapgs:
- SWAPGS_UNSAFE_STACK
+ swapgs
nmi_restore:
POP_REGS
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index e37de298a495..6ddeed3cd2ac 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -81,6 +81,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);
DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
+DEFINE_STATIC_CALL_NULL(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs);
+
u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -253,6 +255,8 @@ static bool check_hw_exists(void)
if (ret)
goto msr_fail;
for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
+ if (fixed_counter_disabled(i))
+ continue;
if (val & (0x03 << i*4)) {
bios_fail = 1;
val_fail = val;
@@ -665,6 +669,12 @@ void x86_pmu_disable_all(void)
}
}
+struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
+{
+ return static_call(x86_pmu_guest_get_msrs)(nr);
+}
+EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
+
/*
* There may be PMI landing after enabled=0. The PMI hitting could be before or
* after disable_all.
@@ -1523,6 +1533,8 @@ void perf_event_print_debug(void)
cpu, idx, prev_left);
}
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ if (fixed_counter_disabled(idx))
+ continue;
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1923,6 +1935,8 @@ static void x86_pmu_static_call_update(void)
static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs);
static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
+
+ static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs);
}
static void _x86_pmu_read(struct perf_event *event)
@@ -1930,6 +1944,13 @@ static void _x86_pmu_read(struct perf_event *event)
x86_perf_event_update(event);
}
+static inline struct perf_guest_switch_msr *
+perf_guest_get_msrs_nop(int *nr)
+{
+ *nr = 0;
+ return NULL;
+}
+
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
@@ -1995,12 +2016,17 @@ static int __init init_hw_perf_events(void)
pr_info("... generic registers: %d\n", x86_pmu.num_counters);
pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
pr_info("... max period: %016Lx\n", x86_pmu.max_period);
- pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
+ pr_info("... fixed-purpose events: %lu\n",
+ hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1)
+ << INTEL_PMC_IDX_FIXED) & x86_pmu.intel_ctrl));
pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
if (!x86_pmu.read)
x86_pmu.read = _x86_pmu_read;
+ if (!x86_pmu.guest_get_msrs)
+ x86_pmu.guest_get_msrs = perf_guest_get_msrs_nop;
+
x86_pmu_static_call_update();
/*
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index d4569bfa83e3..5bac48d5c18e 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -275,6 +275,55 @@ static struct extra_reg intel_icl_extra_regs[] __read_mostly = {
EVENT_EXTRA_END
};
+static struct extra_reg intel_spr_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
+ EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_spr_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_HEAVY_OPS, 4),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BR_MISPREDICT, 5),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7),
+
+ INTEL_EVENT_CONSTRAINT(0x2e, 0xff),
+ INTEL_EVENT_CONSTRAINT(0x3c, 0xff),
+ /*
+ * Generally event codes < 0x90 are restricted to counters 0-3.
+ * The 0x2E and 0x3C are exception, which has no restriction.
+ */
+ INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf),
+
+ INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x08a3, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1),
+ INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1),
+ INTEL_UEVENT_CONSTRAINT(0x02cd, 0x1),
+ INTEL_EVENT_CONSTRAINT(0xce, 0x1),
+ INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf),
+ /*
+ * Generally event codes >= 0x90 are likely to have no restrictions.
+ * The exception are defined as above.
+ */
+ INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0xff),
+
+ EVENT_CONSTRAINT_END
+};
+
+
EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
@@ -314,11 +363,15 @@ EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles,
EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale,
"4", "2");
-EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4");
-EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80");
-EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81");
-EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82");
-EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83");
+EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4");
+EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80");
+EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81");
+EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82");
+EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83");
+EVENT_ATTR_STR(topdown-heavy-ops, td_heavy_ops, "event=0x00,umask=0x84");
+EVENT_ATTR_STR(topdown-br-mispredict, td_br_mispredict, "event=0x00,umask=0x85");
+EVENT_ATTR_STR(topdown-fetch-lat, td_fetch_lat, "event=0x00,umask=0x86");
+EVENT_ATTR_STR(topdown-mem-bound, td_mem_bound, "event=0x00,umask=0x87");
static struct attribute *snb_events_attrs[] = {
EVENT_PTR(td_slots_issued),
@@ -384,6 +437,108 @@ static u64 intel_pmu_event_map(int hw_event)
return intel_perfmon_event_map[hw_event];
}
+static __initconst const u64 spr_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0,
+ [ C(RESULT_MISS) ] = 0xe124,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_MISS) ] = 0xe424,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x12a,
+ [ C(RESULT_MISS) ] = 0x12a,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x12a,
+ [ C(RESULT_MISS) ] = 0x12a,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0,
+ [ C(RESULT_MISS) ] = 0xe12,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0,
+ [ C(RESULT_MISS) ] = 0xe13,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = 0xe11,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4c4,
+ [ C(RESULT_MISS) ] = 0x4c5,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x12a,
+ [ C(RESULT_MISS) ] = 0x12a,
+ },
+ },
+};
+
+static __initconst const u64 spr_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x10001,
+ [ C(RESULT_MISS) ] = 0x3fbfc00001,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x3f3ffc0002,
+ [ C(RESULT_MISS) ] = 0x3f3fc00002,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x10c000001,
+ [ C(RESULT_MISS) ] = 0x3fb3000001,
+ },
+ },
+};
+
/*
* Notes on the events:
* - data reads do not include code reads (comparable to earlier tables)
@@ -2134,18 +2289,6 @@ static void intel_tfa_pmu_enable_all(int added)
intel_pmu_enable_all(added);
}
-static void enable_counter_freeze(void)
-{
- update_debugctlmsr(get_debugctlmsr() |
- DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI);
-}
-
-static void disable_counter_freeze(void)
-{
- update_debugctlmsr(get_debugctlmsr() &
- ~DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI);
-}
-
static inline u64 intel_pmu_get_status(void)
{
u64 status;
@@ -2337,8 +2480,8 @@ static void __icl_update_topdown_event(struct perf_event *event,
}
}
-static void update_saved_topdown_regs(struct perf_event *event,
- u64 slots, u64 metrics)
+static void update_saved_topdown_regs(struct perf_event *event, u64 slots,
+ u64 metrics, int metric_end)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_event *other;
@@ -2347,7 +2490,7 @@ static void update_saved_topdown_regs(struct perf_event *event,
event->hw.saved_slots = slots;
event->hw.saved_metric = metrics;
- for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) {
+ for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) {
if (!is_topdown_idx(idx))
continue;
other = cpuc->events[idx];
@@ -2362,7 +2505,8 @@ static void update_saved_topdown_regs(struct perf_event *event,
* The PERF_METRICS and Fixed counter 3 are read separately. The values may be
* modify by a NMI. PMU has to be disabled before calling this function.
*/
-static u64 icl_update_topdown_event(struct perf_event *event)
+
+static u64 intel_update_topdown_event(struct perf_event *event, int metric_end)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_event *other;
@@ -2378,7 +2522,7 @@ static u64 icl_update_topdown_event(struct perf_event *event)
/* read PERF_METRICS */
rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics);
- for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) {
+ for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) {
if (!is_topdown_idx(idx))
continue;
other = cpuc->events[idx];
@@ -2404,7 +2548,7 @@ static u64 icl_update_topdown_event(struct perf_event *event)
* Don't need to reset the PERF_METRICS and Fixed counter 3.
* Because the values will be restored in next schedule in.
*/
- update_saved_topdown_regs(event, slots, metrics);
+ update_saved_topdown_regs(event, slots, metrics, metric_end);
reset = false;
}
@@ -2413,12 +2557,18 @@ static u64 icl_update_topdown_event(struct perf_event *event)
wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0);
wrmsrl(MSR_PERF_METRICS, 0);
if (event)
- update_saved_topdown_regs(event, 0, 0);
+ update_saved_topdown_regs(event, 0, 0, metric_end);
}
return slots;
}
+static u64 icl_update_topdown_event(struct perf_event *event)
+{
+ return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE +
+ x86_pmu.num_topdown_events - 1);
+}
+
static void intel_pmu_read_topdown_event(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -2573,8 +2723,11 @@ static void intel_pmu_reset(void)
wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
wrmsrl_safe(x86_pmu_event_addr(idx), 0ull);
}
- for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ if (fixed_counter_disabled(idx))
+ continue;
wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+ }
if (ds)
ds->bts_index = ds->bts_buffer_base;
@@ -2709,95 +2862,6 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
return handled;
}
-static bool disable_counter_freezing = true;
-static int __init intel_perf_counter_freezing_setup(char *s)
-{
- bool res;
-
- if (kstrtobool(s, &res))
- return -EINVAL;
-
- disable_counter_freezing = !res;
- return 1;
-}
-__setup("perf_v4_pmi=", intel_perf_counter_freezing_setup);
-
-/*
- * Simplified handler for Arch Perfmon v4:
- * - We rely on counter freezing/unfreezing to enable/disable the PMU.
- * This is done automatically on PMU ack.
- * - Ack the PMU only after the APIC.
- */
-
-static int intel_pmu_handle_irq_v4(struct pt_regs *regs)
-{
- struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- int handled = 0;
- bool bts = false;
- u64 status;
- int pmu_enabled = cpuc->enabled;
- int loops = 0;
-
- /* PMU has been disabled because of counter freezing */
- cpuc->enabled = 0;
- if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
- bts = true;
- intel_bts_disable_local();
- handled = intel_pmu_drain_bts_buffer();
- handled += intel_bts_interrupt();
- }
- status = intel_pmu_get_status();
- if (!status)
- goto done;
-again:
- intel_pmu_lbr_read();
- if (++loops > 100) {
- static bool warned;
-
- if (!warned) {
- WARN(1, "perfevents: irq loop stuck!\n");
- perf_event_print_debug();
- warned = true;
- }
- intel_pmu_reset();
- goto done;
- }
-
-
- handled += handle_pmi_common(regs, status);
-done:
- /* Ack the PMI in the APIC */
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-
- /*
- * The counters start counting immediately while ack the status.
- * Make it as close as possible to IRET. This avoids bogus
- * freezing on Skylake CPUs.
- */
- if (status) {
- intel_pmu_ack_status(status);
- } else {
- /*
- * CPU may issues two PMIs very close to each other.
- * When the PMI handler services the first one, the
- * GLOBAL_STATUS is already updated to reflect both.
- * When it IRETs, the second PMI is immediately
- * handled and it sees clear status. At the meantime,
- * there may be a third PMI, because the freezing bit
- * isn't set since the ack in first PMI handlers.
- * Double check if there is more work to be done.
- */
- status = intel_pmu_get_status();
- if (status)
- goto again;
- }
-
- if (bts)
- intel_bts_enable_local();
- cpuc->enabled = pmu_enabled;
- return handled;
-}
-
/*
* This handler is triggered by the local APIC, so the APIC IRQ handling
* rules apply:
@@ -3563,6 +3627,26 @@ static int core_pmu_hw_config(struct perf_event *event)
return intel_pmu_bts_config(event);
}
+#define INTEL_TD_METRIC_AVAILABLE_MAX (INTEL_TD_METRIC_RETIRING + \
+ ((x86_pmu.num_topdown_events - 1) << 8))
+
+static bool is_available_metric_event(struct perf_event *event)
+{
+ return is_metric_event(event) &&
+ event->attr.config <= INTEL_TD_METRIC_AVAILABLE_MAX;
+}
+
+static inline bool is_mem_loads_event(struct perf_event *event)
+{
+ return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0xcd, .umask=0x01);
+}
+
+static inline bool is_mem_loads_aux_event(struct perf_event *event)
+{
+ return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0x03, .umask=0x82);
+}
+
+
static int intel_pmu_hw_config(struct perf_event *event)
{
int ret = x86_pmu_hw_config(event);
@@ -3636,7 +3720,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (event->attr.config & X86_ALL_EVENT_FLAGS)
return -EINVAL;
- if (is_metric_event(event)) {
+ if (is_available_metric_event(event)) {
struct perf_event *leader = event->group_leader;
/* The metric events don't support sampling. */
@@ -3665,6 +3749,33 @@ static int intel_pmu_hw_config(struct perf_event *event)
}
}
+ /*
+ * The load latency event X86_CONFIG(.event=0xcd, .umask=0x01) on SPR
+ * doesn't function quite right. As a work-around it needs to always be
+ * co-scheduled with a auxiliary event X86_CONFIG(.event=0x03, .umask=0x82).
+ * The actual count of this second event is irrelevant it just needs
+ * to be active to make the first event function correctly.
+ *
+ * In a group, the auxiliary event must be in front of the load latency
+ * event. The rule is to simplify the implementation of the check.
+ * That's because perf cannot have a complete group at the moment.
+ */
+ if (x86_pmu.flags & PMU_FL_MEM_LOADS_AUX &&
+ (event->attr.sample_type & PERF_SAMPLE_DATA_SRC) &&
+ is_mem_loads_event(event)) {
+ struct perf_event *leader = event->group_leader;
+ struct perf_event *sibling = NULL;
+
+ if (!is_mem_loads_aux_event(leader)) {
+ for_each_sibling_event(sibling, leader) {
+ if (is_mem_loads_aux_event(sibling))
+ break;
+ }
+ if (list_entry_is_head(sibling, &leader->sibling_list, sibling_list))
+ return -ENODATA;
+ }
+ }
+
if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
return 0;
@@ -3680,26 +3791,6 @@ static int intel_pmu_hw_config(struct perf_event *event)
return 0;
}
-#ifdef CONFIG_RETPOLINE
-static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr);
-static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr);
-#endif
-
-struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
-{
-#ifdef CONFIG_RETPOLINE
- if (x86_pmu.guest_get_msrs == intel_guest_get_msrs)
- return intel_guest_get_msrs(nr);
- else if (x86_pmu.guest_get_msrs == core_guest_get_msrs)
- return core_guest_get_msrs(nr);
-#endif
- if (x86_pmu.guest_get_msrs)
- return x86_pmu.guest_get_msrs(nr);
- *nr = 0;
- return NULL;
-}
-EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
-
static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -3865,6 +3956,29 @@ icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
}
static struct event_constraint *
+spr_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = icl_get_event_constraints(cpuc, idx, event);
+
+ /*
+ * The :ppp indicates the Precise Distribution (PDist) facility, which
+ * is only supported on the GP counter 0. If a :ppp event which is not
+ * available on the GP counter 0, error out.
+ */
+ if (event->attr.precise_ip == 3) {
+ if (c->idxmsk64 & BIT_ULL(0))
+ return &counter0_constraint;
+
+ return &emptyconstraint;
+ }
+
+ return c;
+}
+
+static struct event_constraint *
glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
struct perf_event *event)
{
@@ -3953,6 +4067,14 @@ static u64 nhm_limit_period(struct perf_event *event, u64 left)
return max(left, 32ULL);
}
+static u64 spr_limit_period(struct perf_event *event, u64 left)
+{
+ if (event->attr.precise_ip == 3)
+ return max(left, 128ULL);
+
+ return left;
+}
+
PMU_FORMAT_ATTR(event, "config:0-7" );
PMU_FORMAT_ATTR(umask, "config:8-15" );
PMU_FORMAT_ATTR(edge, "config:18" );
@@ -4094,9 +4216,6 @@ static void intel_pmu_cpu_starting(int cpu)
if (x86_pmu.version > 1)
flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
- if (x86_pmu.counter_freezing)
- enable_counter_freeze();
-
/* Disable perf metrics if any added CPU doesn't support it. */
if (x86_pmu.intel_cap.perf_metrics) {
union perf_capabilities perf_cap;
@@ -4167,9 +4286,6 @@ static void free_excl_cntrs(struct cpu_hw_events *cpuc)
static void intel_pmu_cpu_dying(int cpu)
{
fini_debug_store_on_cpu(cpu);
-
- if (x86_pmu.counter_freezing)
- disable_counter_freeze();
}
void intel_cpuc_finish(struct cpu_hw_events *cpuc)
@@ -4397,6 +4513,9 @@ static const struct x86_cpu_desc isolation_ucodes[] = {
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000),
+ INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 5, 0x00000000),
+ INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 6, 0x00000000),
+ INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 7, 0x00000000),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_L, 3, 0x0000007c),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE, 3, 0x0000007c),
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 9, 0x0000004e),
@@ -4561,39 +4680,6 @@ static __init void intel_nehalem_quirk(void)
}
}
-static const struct x86_cpu_desc counter_freezing_ucodes[] = {
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 2, 0x0000000e),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 9, 0x0000002e),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 10, 0x00000008),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_D, 1, 0x00000028),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 1, 0x00000028),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 8, 0x00000006),
- {}
-};
-
-static bool intel_counter_freezing_broken(void)
-{
- return !x86_cpu_has_min_microcode_rev(counter_freezing_ucodes);
-}
-
-static __init void intel_counter_freezing_quirk(void)
-{
- /* Check if it's already disabled */
- if (disable_counter_freezing)
- return;
-
- /*
- * If the system starts with the wrong ucode, leave the
- * counter-freezing feature permanently disabled.
- */
- if (intel_counter_freezing_broken()) {
- pr_info("PMU counter freezing disabled due to CPU errata,"
- "please upgrade microcode\n");
- x86_pmu.counter_freezing = false;
- x86_pmu.handle_irq = intel_pmu_handle_irq;
- }
-}
-
/*
* enable software workaround for errata:
* SNB: BJ122
@@ -4703,6 +4789,42 @@ static struct attribute *icl_tsx_events_attrs[] = {
NULL,
};
+
+EVENT_ATTR_STR(mem-stores, mem_st_spr, "event=0xcd,umask=0x2");
+EVENT_ATTR_STR(mem-loads-aux, mem_ld_aux, "event=0x03,umask=0x82");
+
+static struct attribute *spr_events_attrs[] = {
+ EVENT_PTR(mem_ld_hsw),
+ EVENT_PTR(mem_st_spr),
+ EVENT_PTR(mem_ld_aux),
+ NULL,
+};
+
+static struct attribute *spr_td_events_attrs[] = {
+ EVENT_PTR(slots),
+ EVENT_PTR(td_retiring),
+ EVENT_PTR(td_bad_spec),
+ EVENT_PTR(td_fe_bound),
+ EVENT_PTR(td_be_bound),
+ EVENT_PTR(td_heavy_ops),
+ EVENT_PTR(td_br_mispredict),
+ EVENT_PTR(td_fetch_lat),
+ EVENT_PTR(td_mem_bound),
+ NULL,
+};
+
+static struct attribute *spr_tsx_events_attrs[] = {
+ EVENT_PTR(tx_start),
+ EVENT_PTR(tx_abort),
+ EVENT_PTR(tx_commit),
+ EVENT_PTR(tx_capacity_read),
+ EVENT_PTR(tx_capacity_write),
+ EVENT_PTR(tx_conflict),
+ EVENT_PTR(cycles_t),
+ EVENT_PTR(cycles_ct),
+ NULL,
+};
+
static ssize_t freeze_on_smi_show(struct device *cdev,
struct device_attribute *attr,
char *buf)
@@ -4926,7 +5048,7 @@ __init int intel_pmu_init(void)
union cpuid10_eax eax;
union cpuid10_ebx ebx;
struct event_constraint *c;
- unsigned int unused;
+ unsigned int fixed_mask;
struct extra_reg *er;
bool pmem = false;
int version, i;
@@ -4948,7 +5070,7 @@ __init int intel_pmu_init(void)
* Check whether the Architectural PerfMon supports
* Branch Misses Retired hw_event or not.
*/
- cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+ cpuid(10, &eax.full, &ebx.full, &fixed_mask, &edx.full);
if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
return -ENODEV;
@@ -4972,15 +5094,15 @@ __init int intel_pmu_init(void)
* Quirk: v2 perfmon does not report fixed-purpose events, so
* assume at least 3 events, when not running in a hypervisor:
*/
- if (version > 1) {
+ if (version > 1 && version < 5) {
int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR);
x86_pmu.num_counters_fixed =
max((int)edx.split.num_counters_fixed, assume);
- }
- if (version >= 4)
- x86_pmu.counter_freezing = !disable_counter_freezing;
+ fixed_mask = (1L << x86_pmu.num_counters_fixed) - 1;
+ } else if (version >= 5)
+ x86_pmu.num_counters_fixed = fls(fixed_mask);
if (boot_cpu_has(X86_FEATURE_PDCM)) {
u64 capabilities;
@@ -5109,7 +5231,6 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_ATOM_GOLDMONT:
case INTEL_FAM6_ATOM_GOLDMONT_D:
- x86_add_quirk(intel_counter_freezing_quirk);
memcpy(hw_cache_event_ids, glm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs,
@@ -5136,7 +5257,6 @@ __init int intel_pmu_init(void)
break;
case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
- x86_add_quirk(intel_counter_freezing_quirk);
memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs,
@@ -5483,12 +5603,50 @@ __init int intel_pmu_init(void)
x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
x86_pmu.lbr_pt_coexist = true;
intel_pmu_pebs_data_source_skl(pmem);
+ x86_pmu.num_topdown_events = 4;
x86_pmu.update_topdown_event = icl_update_topdown_event;
x86_pmu.set_topdown_event_period = icl_set_topdown_event_period;
pr_cont("Icelake events, ");
name = "icelake";
break;
+ case INTEL_FAM6_SAPPHIRERAPIDS_X:
+ pmem = true;
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+ x86_pmu.event_constraints = intel_spr_event_constraints;
+ x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_spr_extra_regs;
+ x86_pmu.limit_period = spr_limit_period;
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.pebs_block = true;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_PEBS_ALL;
+ x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
+ x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = spr_get_event_constraints;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
+ extra_skl_attr = skl_format_attr;
+ mem_attr = spr_events_attrs;
+ td_attr = spr_td_events_attrs;
+ tsx_attr = spr_tsx_events_attrs;
+ x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
+ x86_pmu.lbr_pt_coexist = true;
+ intel_pmu_pebs_data_source_skl(pmem);
+ x86_pmu.num_topdown_events = 8;
+ x86_pmu.update_topdown_event = icl_update_topdown_event;
+ x86_pmu.set_topdown_event_period = icl_set_topdown_event_period;
+ pr_cont("Sapphire Rapids events, ");
+ name = "sapphire_rapids";
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
@@ -5531,8 +5689,7 @@ __init int intel_pmu_init(void)
x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
}
- x86_pmu.intel_ctrl |=
- ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+ x86_pmu.intel_ctrl |= (u64)fixed_mask << INTEL_PMC_IDX_FIXED;
/* AnyThread may be deprecated on arch perfmon v5 or later */
if (x86_pmu.intel_cap.anythread_deprecated)
@@ -5549,13 +5706,22 @@ __init int intel_pmu_init(void)
* events to the generic counters.
*/
if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) {
+ /*
+ * Disable topdown slots and metrics events,
+ * if slots event is not in CPUID.
+ */
+ if (!(INTEL_PMC_MSK_FIXED_SLOTS & x86_pmu.intel_ctrl))
+ c->idxmsk64 = 0;
c->weight = hweight64(c->idxmsk64);
continue;
}
- if (c->cmask == FIXED_EVENT_FLAGS
- && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
- c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+ if (c->cmask == FIXED_EVENT_FLAGS) {
+ /* Disabled fixed counters which are not in CPUID */
+ c->idxmsk64 &= x86_pmu.intel_ctrl;
+
+ if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES)
+ c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
}
c->idxmsk64 &=
~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
@@ -5601,13 +5767,6 @@ __init int intel_pmu_init(void)
pr_cont("full-width counters, ");
}
- /*
- * For arch perfmon 4 use counter freezing to avoid
- * several MSR accesses in the PMI.
- */
- if (x86_pmu.counter_freezing)
- x86_pmu.handle_irq = intel_pmu_handle_irq_v4;
-
if (x86_pmu.intel_cap.perf_metrics)
x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 67dbc91bccfe..7ebae1826403 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -36,7 +36,9 @@ union intel_x86_pebs_dse {
unsigned int ld_dse:4;
unsigned int ld_stlb_miss:1;
unsigned int ld_locked:1;
- unsigned int ld_reserved:26;
+ unsigned int ld_data_blk:1;
+ unsigned int ld_addr_blk:1;
+ unsigned int ld_reserved:24;
};
struct {
unsigned int st_l1d_hit:1;
@@ -45,6 +47,12 @@ union intel_x86_pebs_dse {
unsigned int st_locked:1;
unsigned int st_reserved2:26;
};
+ struct {
+ unsigned int st_lat_dse:4;
+ unsigned int st_lat_stlb_miss:1;
+ unsigned int st_lat_locked:1;
+ unsigned int ld_reserved3:26;
+ };
};
@@ -198,6 +206,63 @@ static u64 load_latency_data(u64 status)
if (dse.ld_locked)
val |= P(LOCK, LOCKED);
+ /*
+ * Ice Lake and earlier models do not support block infos.
+ */
+ if (!x86_pmu.pebs_block) {
+ val |= P(BLK, NA);
+ return val;
+ }
+ /*
+ * bit 6: load was blocked since its data could not be forwarded
+ * from a preceding store
+ */
+ if (dse.ld_data_blk)
+ val |= P(BLK, DATA);
+
+ /*
+ * bit 7: load was blocked due to potential address conflict with
+ * a preceding store
+ */
+ if (dse.ld_addr_blk)
+ val |= P(BLK, ADDR);
+
+ if (!dse.ld_data_blk && !dse.ld_addr_blk)
+ val |= P(BLK, NA);
+
+ return val;
+}
+
+static u64 store_latency_data(u64 status)
+{
+ union intel_x86_pebs_dse dse;
+ u64 val;
+
+ dse.val = status;
+
+ /*
+ * use the mapping table for bit 0-3
+ */
+ val = pebs_data_source[dse.st_lat_dse];
+
+ /*
+ * bit 4: TLB access
+ * 0 = did not miss 2nd level TLB
+ * 1 = missed 2nd level TLB
+ */
+ if (dse.st_lat_stlb_miss)
+ val |= P(TLB, MISS) | P(TLB, L2);
+ else
+ val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
+
+ /*
+ * bit 5: locked prefix
+ */
+ if (dse.st_lat_locked)
+ val |= P(LOCK, LOCKED);
+
+ val |= P(BLK, NA);
+
return val;
}
@@ -870,6 +935,28 @@ struct event_constraint intel_icl_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_spr_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL),
+
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe),
+ INTEL_PLD_CONSTRAINT(0x1cd, 0xfe),
+ INTEL_PSD_CONSTRAINT(0x2cd, 0x1),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf),
+
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf),
+
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf),
+
+ /*
+ * Everything else is handled by PMU_FL_PEBS_ALL, because we
+ * need the full constraints from the main table.
+ */
+
+ EVENT_CONSTRAINT_END
+};
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event)
{
struct event_constraint *c;
@@ -960,7 +1047,8 @@ static void adaptive_pebs_record_size_update(void)
}
#define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \
- PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \
+ PERF_SAMPLE_PHYS_ADDR | \
+ PERF_SAMPLE_WEIGHT_TYPE | \
PERF_SAMPLE_TRANSACTION | \
PERF_SAMPLE_DATA_PAGE_SIZE)
@@ -987,7 +1075,7 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event)
gprs = (sample_type & PERF_SAMPLE_REGS_INTR) &&
(attr->sample_regs_intr & PEBS_GP_REGS);
- tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) &&
+ tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT_TYPE) &&
((attr->config & INTEL_ARCH_EVENT_MASK) ==
x86_pmu.rtm_abort_event);
@@ -1331,6 +1419,8 @@ static u64 get_data_src(struct perf_event *event, u64 aux)
if (fl & PERF_X86_EVENT_PEBS_LDLAT)
val = load_latency_data(aux);
+ else if (fl & PERF_X86_EVENT_PEBS_STLAT)
+ val = store_latency_data(aux);
else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
val = precise_datala_hsw(event, aux);
else if (fst)
@@ -1369,8 +1459,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
/*
* Use latency for weight (only avail with PEBS-LL)
*/
- if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
- data->weight = pebs->lat;
+ if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE))
+ data->weight.full = pebs->lat;
/*
* data.data_src encodes the data source
@@ -1462,8 +1552,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
if (x86_pmu.intel_cap.pebs_format >= 2) {
/* Only set the TSX weight when no memory weight. */
- if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
- data->weight = intel_get_tsx_weight(pebs->tsx_tuning);
+ if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll)
+ data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning);
if (sample_type & PERF_SAMPLE_TRANSACTION)
data->txn = intel_get_tsx_transaction(pebs->tsx_tuning,
@@ -1507,6 +1597,9 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs,
#endif
}
+#define PEBS_LATENCY_MASK 0xffff
+#define PEBS_CACHE_LATENCY_OFFSET 32
+
/*
* With adaptive PEBS the layout depends on what fields are configured.
*/
@@ -1577,9 +1670,27 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
}
if (format_size & PEBS_DATACFG_MEMINFO) {
- if (sample_type & PERF_SAMPLE_WEIGHT)
- data->weight = meminfo->latency ?:
- intel_get_tsx_weight(meminfo->tsx_tuning);
+ if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
+ u64 weight = meminfo->latency;
+
+ if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) {
+ data->weight.var2_w = weight & PEBS_LATENCY_MASK;
+ weight >>= PEBS_CACHE_LATENCY_OFFSET;
+ }
+
+ /*
+ * Although meminfo::latency is defined as a u64,
+ * only the lower 32 bits include the valid data
+ * in practice on Ice Lake and earlier platforms.
+ */
+ if (sample_type & PERF_SAMPLE_WEIGHT) {
+ data->weight.full = weight ?:
+ intel_get_tsx_weight(meminfo->tsx_tuning);
+ } else {
+ data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?:
+ intel_get_tsx_weight(meminfo->tsx_tuning);
+ }
+ }
if (sample_type & PERF_SAMPLE_DATA_SRC)
data->data_src.val = get_data_src(event, meminfo->aux);
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 357258f82dc8..33c8180d5a87 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -31,21 +31,21 @@ struct event_constraint uncore_constraint_empty =
MODULE_LICENSE("GPL");
-int uncore_pcibus_to_physid(struct pci_bus *bus)
+int uncore_pcibus_to_dieid(struct pci_bus *bus)
{
struct pci2phy_map *map;
- int phys_id = -1;
+ int die_id = -1;
raw_spin_lock(&pci2phy_map_lock);
list_for_each_entry(map, &pci2phy_map_head, list) {
if (map->segment == pci_domain_nr(bus)) {
- phys_id = map->pbus_to_physid[bus->number];
+ die_id = map->pbus_to_dieid[bus->number];
break;
}
}
raw_spin_unlock(&pci2phy_map_lock);
- return phys_id;
+ return die_id;
}
static void uncore_free_pcibus_map(void)
@@ -86,7 +86,7 @@ lookup:
alloc = NULL;
map->segment = segment;
for (i = 0; i < 256; i++)
- map->pbus_to_physid[i] = -1;
+ map->pbus_to_dieid[i] = -1;
list_add_tail(&map->list, &pci2phy_map_head);
end:
@@ -332,7 +332,6 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
uncore_pmu_init_hrtimer(box);
box->cpu = -1;
- box->pci_phys_id = -1;
box->dieid = -1;
/* set default hrtimer timeout */
@@ -993,18 +992,11 @@ uncore_types_init(struct intel_uncore_type **types, bool setid)
/*
* Get the die information of a PCI device.
* @pdev: The PCI device.
- * @phys_id: The physical socket id which the device maps to.
* @die: The die id which the device maps to.
*/
-static int uncore_pci_get_dev_die_info(struct pci_dev *pdev,
- int *phys_id, int *die)
+static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die)
{
- *phys_id = uncore_pcibus_to_physid(pdev->bus);
- if (*phys_id < 0)
- return -ENODEV;
-
- *die = (topology_max_die_per_package() > 1) ? *phys_id :
- topology_phys_to_logical_pkg(*phys_id);
+ *die = uncore_pcibus_to_dieid(pdev->bus);
if (*die < 0)
return -EINVAL;
@@ -1046,13 +1038,12 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids)
* @pdev: The PCI device.
* @type: The corresponding PMU type of the device.
* @pmu: The corresponding PMU of the device.
- * @phys_id: The physical socket id which the device maps to.
* @die: The die id which the device maps to.
*/
static int uncore_pci_pmu_register(struct pci_dev *pdev,
struct intel_uncore_type *type,
struct intel_uncore_pmu *pmu,
- int phys_id, int die)
+ int die)
{
struct intel_uncore_box *box;
int ret;
@@ -1070,7 +1061,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev,
WARN_ON_ONCE(pmu->func_id != pdev->devfn);
atomic_inc(&box->refcnt);
- box->pci_phys_id = phys_id;
box->dieid = die;
box->pci_dev = pdev;
box->pmu = pmu;
@@ -1097,9 +1087,9 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
{
struct intel_uncore_type *type;
struct intel_uncore_pmu *pmu = NULL;
- int phys_id, die, ret;
+ int die, ret;
- ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die);
+ ret = uncore_pci_get_dev_die_info(pdev, &die);
if (ret)
return ret;
@@ -1132,7 +1122,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
}
- ret = uncore_pci_pmu_register(pdev, type, pmu, phys_id, die);
+ ret = uncore_pci_pmu_register(pdev, type, pmu, die);
pci_set_drvdata(pdev, pmu->boxes[die]);
@@ -1142,17 +1132,12 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
/*
* Unregister the PMU of a PCI device
* @pmu: The corresponding PMU is unregistered.
- * @phys_id: The physical socket id which the device maps to.
* @die: The die id which the device maps to.
*/
-static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu,
- int phys_id, int die)
+static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, int die)
{
struct intel_uncore_box *box = pmu->boxes[die];
- if (WARN_ON_ONCE(phys_id != box->pci_phys_id))
- return;
-
pmu->boxes[die] = NULL;
if (atomic_dec_return(&pmu->activeboxes) == 0)
uncore_pmu_unregister(pmu);
@@ -1164,9 +1149,9 @@ static void uncore_pci_remove(struct pci_dev *pdev)
{
struct intel_uncore_box *box;
struct intel_uncore_pmu *pmu;
- int i, phys_id, die;
+ int i, die;
- if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die))
+ if (uncore_pci_get_dev_die_info(pdev, &die))
return;
box = pci_get_drvdata(pdev);
@@ -1185,7 +1170,7 @@ static void uncore_pci_remove(struct pci_dev *pdev)
pci_set_drvdata(pdev, NULL);
- uncore_pci_pmu_unregister(pmu, phys_id, die);
+ uncore_pci_pmu_unregister(pmu, die);
}
static int uncore_bus_notify(struct notifier_block *nb,
@@ -1194,7 +1179,7 @@ static int uncore_bus_notify(struct notifier_block *nb,
struct device *dev = data;
struct pci_dev *pdev = to_pci_dev(dev);
struct intel_uncore_pmu *pmu;
- int phys_id, die;
+ int die;
/* Unregister the PMU when the device is going to be deleted. */
if (action != BUS_NOTIFY_DEL_DEVICE)
@@ -1204,10 +1189,10 @@ static int uncore_bus_notify(struct notifier_block *nb,
if (!pmu)
return NOTIFY_DONE;
- if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die))
+ if (uncore_pci_get_dev_die_info(pdev, &die))
return NOTIFY_DONE;
- uncore_pci_pmu_unregister(pmu, phys_id, die);
+ uncore_pci_pmu_unregister(pmu, die);
return NOTIFY_OK;
}
@@ -1224,7 +1209,7 @@ static void uncore_pci_sub_driver_init(void)
struct pci_dev *pci_sub_dev;
bool notify = false;
unsigned int devfn;
- int phys_id, die;
+ int die;
while (ids && ids->vendor) {
pci_sub_dev = NULL;
@@ -1244,12 +1229,11 @@ static void uncore_pci_sub_driver_init(void)
if (!pmu)
continue;
- if (uncore_pci_get_dev_die_info(pci_sub_dev,
- &phys_id, &die))
+ if (uncore_pci_get_dev_die_info(pci_sub_dev, &die))
continue;
if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu,
- phys_id, die))
+ die))
notify = true;
}
ids++;
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 9efea154349d..a3c6e1643ad2 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -124,7 +124,6 @@ struct intel_uncore_extra_reg {
};
struct intel_uncore_box {
- int pci_phys_id;
int dieid; /* Logical die ID */
int n_active; /* number of active events */
int n_events;
@@ -173,11 +172,11 @@ struct freerunning_counters {
struct pci2phy_map {
struct list_head list;
int segment;
- int pbus_to_physid[256];
+ int pbus_to_dieid[256];
};
struct pci2phy_map *__find_pci2phy_map(int segment);
-int uncore_pcibus_to_physid(struct pci_bus *bus);
+int uncore_pcibus_to_dieid(struct pci_bus *bus);
ssize_t uncore_event_show(struct device *dev,
struct device_attribute *attr, char *buf);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 098f893e2e22..51271288499e 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -657,7 +657,7 @@ int snb_pci2phy_map_init(int devid)
pci_dev_put(dev);
return -ENOMEM;
}
- map->pbus_to_physid[bus] = 0;
+ map->pbus_to_dieid[bus] = 0;
raw_spin_unlock(&pci2phy_map_lock);
pci_dev_put(dev);
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 7bdb1821215d..b79951d0707c 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1359,7 +1359,7 @@ static struct pci_driver snbep_uncore_pci_driver = {
static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse)
{
struct pci_dev *ubox_dev = NULL;
- int i, bus, nodeid, segment;
+ int i, bus, nodeid, segment, die_id;
struct pci2phy_map *map;
int err = 0;
u32 config = 0;
@@ -1370,36 +1370,77 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
if (!ubox_dev)
break;
bus = ubox_dev->bus->number;
- /* get the Node ID of the local register */
- err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
- if (err)
- break;
- nodeid = config & NODE_ID_MASK;
- /* get the Node ID mapping */
- err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
- if (err)
- break;
+ /*
+ * The nodeid and idmap registers only contain enough
+ * information to handle 8 nodes. On systems with more
+ * than 8 nodes, we need to rely on NUMA information,
+ * filled in from BIOS supplied information, to determine
+ * the topology.
+ */
+ if (nr_node_ids <= 8) {
+ /* get the Node ID of the local register */
+ err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
+ if (err)
+ break;
+ nodeid = config & NODE_ID_MASK;
+ /* get the Node ID mapping */
+ err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
+ if (err)
+ break;
- segment = pci_domain_nr(ubox_dev->bus);
- raw_spin_lock(&pci2phy_map_lock);
- map = __find_pci2phy_map(segment);
- if (!map) {
+ segment = pci_domain_nr(ubox_dev->bus);
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ err = -ENOMEM;
+ break;
+ }
+
+ /*
+ * every three bits in the Node ID mapping register maps
+ * to a particular node.
+ */
+ for (i = 0; i < 8; i++) {
+ if (nodeid == ((config >> (3 * i)) & 0x7)) {
+ if (topology_max_die_per_package() > 1)
+ die_id = i;
+ else
+ die_id = topology_phys_to_logical_pkg(i);
+ map->pbus_to_dieid[bus] = die_id;
+ break;
+ }
+ }
raw_spin_unlock(&pci2phy_map_lock);
- err = -ENOMEM;
- break;
- }
+ } else {
+ int node = pcibus_to_node(ubox_dev->bus);
+ int cpu;
+
+ segment = pci_domain_nr(ubox_dev->bus);
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ err = -ENOMEM;
+ break;
+ }
- /*
- * every three bits in the Node ID mapping register maps
- * to a particular node.
- */
- for (i = 0; i < 8; i++) {
- if (nodeid == ((config >> (3 * i)) & 0x7)) {
- map->pbus_to_physid[bus] = i;
+ die_id = -1;
+ for_each_cpu(cpu, cpumask_of_pcibus(ubox_dev->bus)) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->initialized && cpu_to_node(cpu) == node) {
+ map->pbus_to_dieid[bus] = die_id = c->logical_die_id;
+ break;
+ }
+ }
+ raw_spin_unlock(&pci2phy_map_lock);
+
+ if (WARN_ON_ONCE(die_id == -1)) {
+ err = -EINVAL;
break;
}
}
- raw_spin_unlock(&pci2phy_map_lock);
}
if (!err) {
@@ -1412,17 +1453,17 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
i = -1;
if (reverse) {
for (bus = 255; bus >= 0; bus--) {
- if (map->pbus_to_physid[bus] >= 0)
- i = map->pbus_to_physid[bus];
+ if (map->pbus_to_dieid[bus] >= 0)
+ i = map->pbus_to_dieid[bus];
else
- map->pbus_to_physid[bus] = i;
+ map->pbus_to_dieid[bus] = i;
}
} else {
for (bus = 0; bus <= 255; bus++) {
- if (map->pbus_to_physid[bus] >= 0)
- i = map->pbus_to_physid[bus];
+ if (map->pbus_to_dieid[bus] >= 0)
+ i = map->pbus_to_dieid[bus];
else
- map->pbus_to_physid[bus] = i;
+ map->pbus_to_dieid[bus] = i;
}
}
}
@@ -4646,19 +4687,14 @@ int snr_uncore_pci_init(void)
static struct pci_dev *snr_uncore_get_mc_dev(int id)
{
struct pci_dev *mc_dev = NULL;
- int phys_id, pkg;
+ int pkg;
while (1) {
mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3451, mc_dev);
if (!mc_dev)
break;
- phys_id = uncore_pcibus_to_physid(mc_dev->bus);
- if (phys_id < 0)
- continue;
- pkg = topology_phys_to_logical_pkg(phys_id);
- if (pkg < 0)
- continue;
- else if (pkg == id)
+ pkg = uncore_pcibus_to_dieid(mc_dev->bus);
+ if (pkg == id)
break;
}
return mc_dev;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 7895cf4c59a7..53b2b5fc23bc 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -80,6 +80,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
#define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */
#define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */
#define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */
+#define PERF_X86_EVENT_PEBS_STLAT 0x8000 /* st+stlat data address sampling */
static inline bool is_topdown_count(struct perf_event *event)
{
@@ -443,6 +444,10 @@ struct cpu_hw_events {
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
+#define INTEL_PSD_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_STLAT)
+
#define INTEL_PST_CONSTRAINT(c, n) \
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
@@ -682,8 +687,7 @@ struct x86_pmu {
/* PMI handler bits */
unsigned int late_ack :1,
- enabled_ack :1,
- counter_freezing :1;
+ enabled_ack :1;
/*
* sysfs attrs
*/
@@ -724,7 +728,8 @@ struct x86_pmu {
pebs_broken :1,
pebs_prec_dist :1,
pebs_no_tlb :1,
- pebs_no_isolation :1;
+ pebs_no_isolation :1,
+ pebs_block :1;
int pebs_record_size;
int pebs_buffer_size;
int max_pebs_events;
@@ -776,6 +781,7 @@ struct x86_pmu {
/*
* Intel perf metrics
*/
+ int num_topdown_events;
u64 (*update_topdown_event)(struct perf_event *event);
int (*set_topdown_event_period)(struct perf_event *event);
@@ -871,6 +877,8 @@ do { \
#define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */
#define PMU_FL_TFA 0x20 /* deal with TSX force abort */
#define PMU_FL_PAIR 0x40 /* merge counters for large incr. events */
+#define PMU_FL_INSTR_LATENCY 0x80 /* Support Instruction Latency in PEBS Memory Info Record */
+#define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -1060,6 +1068,11 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
char *page);
+static inline bool fixed_counter_disabled(int i)
+{
+ return !(x86_pmu.intel_ctrl >> (i + INTEL_PMC_IDX_FIXED));
+}
+
#ifdef CONFIG_CPU_SUP_AMD
int amd_pmu_init(void);
@@ -1157,6 +1170,8 @@ extern struct event_constraint intel_skl_pebs_event_constraints[];
extern struct event_constraint intel_icl_pebs_event_constraints[];
+extern struct event_constraint intel_spr_pebs_event_constraints[];
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
void intel_pmu_pebs_add(struct perf_event *event);
diff --git a/arch/x86/events/probe.c b/arch/x86/events/probe.c
index 136a1e847254..600bf8d15c0c 100644
--- a/arch/x86/events/probe.c
+++ b/arch/x86/events/probe.c
@@ -28,6 +28,7 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data)
for (bit = 0; bit < cnt; bit++) {
if (!msr[bit].no_check) {
struct attribute_group *grp = msr[bit].grp;
+ u64 mask;
/* skip entry with no group */
if (!grp)
@@ -44,8 +45,12 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data)
/* Virt sucks; you cannot tell if a R/O MSR is present :/ */
if (rdmsrl_safe(msr[bit].msr, &val))
continue;
+
+ mask = msr[bit].mask;
+ if (!mask)
+ mask = ~0ULL;
/* Disable zero counters if requested. */
- if (!zero && !val)
+ if (!zero && !(val & mask))
continue;
grp->is_visible = NULL;
diff --git a/arch/x86/events/probe.h b/arch/x86/events/probe.h
index 4c8e0afc5fb5..261b9bda24e3 100644
--- a/arch/x86/events/probe.h
+++ b/arch/x86/events/probe.h
@@ -4,10 +4,11 @@
#include <linux/sysfs.h>
struct perf_msr {
- u64 msr;
- struct attribute_group *grp;
+ u64 msr;
+ struct attribute_group *grp;
bool (*test)(int idx, void *data);
- bool no_check;
+ bool no_check;
+ u64 mask;
};
unsigned long
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index 7dbbeaacd995..f42a70496a24 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -454,16 +454,9 @@ static struct attribute *rapl_events_cores[] = {
NULL,
};
-static umode_t
-rapl_not_visible(struct kobject *kobj, struct attribute *attr, int i)
-{
- return 0;
-}
-
static struct attribute_group rapl_events_cores_group = {
.name = "events",
.attrs = rapl_events_cores,
- .is_visible = rapl_not_visible,
};
static struct attribute *rapl_events_pkg[] = {
@@ -476,7 +469,6 @@ static struct attribute *rapl_events_pkg[] = {
static struct attribute_group rapl_events_pkg_group = {
.name = "events",
.attrs = rapl_events_pkg,
- .is_visible = rapl_not_visible,
};
static struct attribute *rapl_events_ram[] = {
@@ -489,7 +481,6 @@ static struct attribute *rapl_events_ram[] = {
static struct attribute_group rapl_events_ram_group = {
.name = "events",
.attrs = rapl_events_ram,
- .is_visible = rapl_not_visible,
};
static struct attribute *rapl_events_gpu[] = {
@@ -502,7 +493,6 @@ static struct attribute *rapl_events_gpu[] = {
static struct attribute_group rapl_events_gpu_group = {
.name = "events",
.attrs = rapl_events_gpu,
- .is_visible = rapl_not_visible,
};
static struct attribute *rapl_events_psys[] = {
@@ -515,7 +505,6 @@ static struct attribute *rapl_events_psys[] = {
static struct attribute_group rapl_events_psys_group = {
.name = "events",
.attrs = rapl_events_psys,
- .is_visible = rapl_not_visible,
};
static bool test_msr(int idx, void *data)
@@ -523,12 +512,23 @@ static bool test_msr(int idx, void *data)
return test_bit(idx, (unsigned long *) data);
}
+/* Only lower 32bits of the MSR represents the energy counter */
+#define RAPL_MSR_MASK 0xFFFFFFFF
+
static struct perf_msr intel_rapl_msrs[] = {
- [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr },
- [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr },
- [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr },
- [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr },
- [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr },
+ [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, false, RAPL_MSR_MASK },
+};
+
+static struct perf_msr intel_rapl_spr_msrs[] = {
+ [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, true, RAPL_MSR_MASK },
};
/*
@@ -761,7 +761,7 @@ static struct rapl_model model_spr = {
BIT(PERF_RAPL_PSYS),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR,
.msr_power_unit = MSR_RAPL_POWER_UNIT,
- .rapl_msrs = intel_rapl_msrs,
+ .rapl_msrs = intel_rapl_spr_msrs,
};
static struct rapl_model model_amd_fam17h = {
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index f145e3326c6d..be09c7eac89f 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -159,17 +159,6 @@ struct compat_shmid64_ds {
compat_ulong_t __unused5;
};
-/*
- * The type of struct elf_prstatus.pr_reg in compatible core dumps.
- */
-typedef struct user_regs_struct compat_elf_gregset_t;
-
-/* Full regset -- prstatus on x32, otherwise on ia32 */
-#define PRSTATUS_SIZE(S, R) (R != sizeof(S.pr_reg) ? 144 : 296)
-#define SET_PR_FPVALID(S, V, R) \
- do { *(int *) (((void *) &((S)->pr_reg)) + R) = (V); } \
- while (0)
-
#ifdef CONFIG_X86_X32_ABI
#define COMPAT_USE_64BIT_TIME \
(!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT))
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 59bf91c57aa8..1728d4ce5730 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -30,6 +30,7 @@ enum cpuid_leafs
CPUID_7_ECX,
CPUID_8000_0007_EBX,
CPUID_7_EDX,
+ CPUID_8000_001F_EAX,
};
#ifdef CONFIG_X86_FEATURE_NAMES
@@ -88,8 +89,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \
REQUIRED_MASK_CHECK || \
- BUILD_BUG_ON_ZERO(NCAPINTS != 19))
+ BUILD_BUG_ON_ZERO(NCAPINTS != 20))
#define DISABLED_MASK_BIT_SET(feature_bit) \
( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \
@@ -111,8 +113,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \
DISABLED_MASK_CHECK || \
- BUILD_BUG_ON_ZERO(NCAPINTS != 19))
+ BUILD_BUG_ON_ZERO(NCAPINTS != 20))
#define cpu_has(c, bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 84b887825f12..1feb6c089ba2 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -13,7 +13,7 @@
/*
* Defines x86 CPU feature bits
*/
-#define NCAPINTS 19 /* N 32-bit words worth of info */
+#define NCAPINTS 20 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
@@ -96,7 +96,7 @@
#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */
#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */
#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */
-#define X86_FEATURE_SME_COHERENT ( 3*32+17) /* "" AMD hardware-enforced cache coherency */
+/* FREE! ( 3*32+17) */
#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */
#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
@@ -201,7 +201,7 @@
#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
-#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
+/* FREE! ( 7*32+10) */
#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
@@ -211,7 +211,7 @@
#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */
#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
-#define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */
+/* FREE! ( 7*32+20) */
#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */
#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */
@@ -236,8 +236,6 @@
#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */
#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */
#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
-#define X86_FEATURE_SEV_ES ( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */
-#define X86_FEATURE_VM_PAGE_FLUSH ( 8*32+21) /* "" VM Page Flush MSR is supported */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
@@ -385,6 +383,13 @@
#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */
#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */
+/* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */
+#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */
+#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */
+#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */
+#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */
+#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */
+
/*
* BUG word(s)
*/
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 7947cb1782da..b7dd944dc867 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -91,6 +91,7 @@
DISABLE_ENQCMD)
#define DISABLED_MASK17 0
#define DISABLED_MASK18 0
-#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
+#define DISABLED_MASK19 0
+#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)
#endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index c98f78330b09..4d0b126835b8 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -12,6 +12,7 @@
#include <linux/pgtable.h>
extern unsigned long efi_fw_vendor, efi_config_table;
+extern unsigned long efi_mixed_mode_stack_pa;
/*
* We map the EFI regions needed for runtime services non-contiguously,
@@ -68,17 +69,33 @@ extern unsigned long efi_fw_vendor, efi_config_table;
#f " called with too many arguments (" #p ">" #n ")"); \
})
+static inline void efi_fpu_begin(void)
+{
+ /*
+ * The UEFI calling convention (UEFI spec 2.3.2 and 2.3.4) requires
+ * that FCW and MXCSR (64-bit) must be initialized prior to calling
+ * UEFI code. (Oddly the spec does not require that the FPU stack
+ * be empty.)
+ */
+ kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
+}
+
+static inline void efi_fpu_end(void)
+{
+ kernel_fpu_end();
+}
+
#ifdef CONFIG_X86_32
#define arch_efi_call_virt_setup() \
({ \
- kernel_fpu_begin(); \
+ efi_fpu_begin(); \
firmware_restrict_branch_speculation_start(); \
})
#define arch_efi_call_virt_teardown() \
({ \
firmware_restrict_branch_speculation_end(); \
- kernel_fpu_end(); \
+ efi_fpu_end(); \
})
#define arch_efi_call_virt(p, f, args...) p->f(args)
@@ -94,22 +111,12 @@ extern asmlinkage u64 __efi_call(void *fp, ...);
__efi_call(__VA_ARGS__); \
})
-/*
- * struct efi_scratch - Scratch space used while switching to/from efi_mm
- * @phys_stack: stack used during EFI Mixed Mode
- * @prev_mm: store/restore stolen mm_struct while switching to/from efi_mm
- */
-struct efi_scratch {
- u64 phys_stack;
- struct mm_struct *prev_mm;
-} __packed;
-
#define arch_efi_call_virt_setup() \
({ \
efi_sync_low_kernel_mappings(); \
- kernel_fpu_begin(); \
+ efi_fpu_begin(); \
firmware_restrict_branch_speculation_start(); \
- efi_switch_mm(&efi_mm); \
+ efi_enter_mm(); \
})
#define arch_efi_call_virt(p, f, args...) \
@@ -117,9 +124,9 @@ struct efi_scratch {
#define arch_efi_call_virt_teardown() \
({ \
- efi_switch_mm(efi_scratch.prev_mm); \
+ efi_leave_mm(); \
firmware_restrict_branch_speculation_end(); \
- kernel_fpu_end(); \
+ efi_fpu_end(); \
})
#ifdef CONFIG_KASAN
@@ -136,7 +143,6 @@ struct efi_scratch {
#endif /* CONFIG_X86_32 */
-extern struct efi_scratch efi_scratch;
extern int __init efi_memblock_x86_reserve_range(void);
extern void __init efi_print_memmap(void);
extern void __init efi_map_region(efi_memory_desc_t *md);
@@ -149,10 +155,12 @@ extern void __init efi_dump_pagetable(void);
extern void __init efi_apply_memmap_quirks(void);
extern int __init efi_reuse_config(u64 tables, int nr_tables);
extern void efi_delete_dummy_variable(void);
-extern void efi_switch_mm(struct mm_struct *mm);
-extern void efi_recover_from_page_fault(unsigned long phys_addr);
+extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr);
extern void efi_free_boot_services(void);
+void efi_enter_mm(void);
+void efi_leave_mm(void);
+
/* kexec external ABI */
struct efi_setup_data {
u64 fw_vendor;
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 66bdfe838d61..9224d40cdefe 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -364,7 +364,7 @@ do { \
#define COMPAT_ARCH_DLINFO \
if (exec->e_machine == EM_X86_64) \
ARCH_DLINFO_X32; \
-else \
+else if (IS_ENABLED(CONFIG_IA32_EMULATION)) \
ARCH_DLINFO_IA32
#define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
diff --git a/arch/x86/include/asm/elfcore-compat.h b/arch/x86/include/asm/elfcore-compat.h
new file mode 100644
index 000000000000..f1b6c7a8d8fc
--- /dev/null
+++ b/arch/x86/include/asm/elfcore-compat.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_X86_ELFCORE_COMPAT_H
+#define _ASM_X86_ELFCORE_COMPAT_H
+
+#include <asm/user32.h>
+
+/*
+ * On amd64 we have two 32bit ABIs - i386 and x32. The latter
+ * has bigger registers, so we use it for compat_elf_regset_t.
+ * The former uses i386_elf_prstatus and PRSTATUS_SIZE/SET_PR_FPVALID
+ * are used to choose the size and location of ->pr_fpvalid of
+ * the layout actually used.
+ */
+typedef struct user_regs_struct compat_elf_gregset_t;
+
+struct i386_elf_prstatus
+{
+ struct compat_elf_prstatus_common common;
+ struct user_regs_struct32 pr_reg;
+ compat_int_t pr_fpvalid;
+};
+
+#define PRSTATUS_SIZE \
+ (user_64bit_mode(task_pt_regs(current)) \
+ ? sizeof(struct compat_elf_prstatus) \
+ : sizeof(struct i386_elf_prstatus))
+#define SET_PR_FPVALID(S) \
+ (*(user_64bit_mode(task_pt_regs(current)) \
+ ? &(S)->pr_fpvalid \
+ : &((struct i386_elf_prstatus *)(S))->pr_fpvalid) = 1)
+
+#endif
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 67a4f1cb2aac..ed33a14188f6 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -32,7 +32,19 @@ extern void fpregs_mark_activate(void);
/* Code that is unaware of kernel_fpu_begin_mask() can use this */
static inline void kernel_fpu_begin(void)
{
+#ifdef CONFIG_X86_64
+ /*
+ * Any 64-bit code that uses 387 instructions must explicitly request
+ * KFPU_387.
+ */
+ kernel_fpu_begin_mask(KFPU_MXCSR);
+#else
+ /*
+ * 32-bit kernel code may use 387 operations as well as SSE2, etc,
+ * as long as it checks that the CPU has the required capability.
+ */
kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
+#endif
}
/*
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index f656aabd1545..41e2e2e1b439 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -585,6 +585,9 @@ DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check);
#else
DECLARE_IDTENTRY_RAW(X86_TRAP_MC, exc_machine_check);
#endif
+#ifdef CONFIG_XEN_PV
+DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check);
+#endif
#endif
/* NMI */
@@ -605,6 +608,9 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug);
/* #DF */
DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault);
+#ifdef CONFIG_XEN_PV
+DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault);
+#endif
/* #VC */
#ifdef CONFIG_AMD_MEM_ENCRYPT
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 528c8a71fe7f..76d389691b5b 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -40,8 +40,6 @@ extern void native_init_IRQ(void);
extern void __handle_irq(struct irq_desc *desc, struct pt_regs *regs);
-extern __visible void do_IRQ(struct pt_regs *regs, unsigned long vector);
-
extern void init_ISA_irqs(void);
extern void __init init_IRQ(void);
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 2dfc8d380dab..144d70ea4393 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -35,15 +35,6 @@ extern __always_inline unsigned long native_save_fl(void)
return flags;
}
-extern inline void native_restore_fl(unsigned long flags);
-extern inline void native_restore_fl(unsigned long flags)
-{
- asm volatile("push %0 ; popf"
- : /* no output */
- :"g" (flags)
- :"memory", "cc");
-}
-
static __always_inline void native_irq_disable(void)
{
asm volatile("cli": : :"memory");
@@ -79,11 +70,6 @@ static __always_inline unsigned long arch_local_save_flags(void)
return native_save_fl();
}
-static __always_inline void arch_local_irq_restore(unsigned long flags)
-{
- native_restore_fl(flags);
-}
-
static __always_inline void arch_local_irq_disable(void)
{
native_irq_disable();
@@ -131,25 +117,7 @@ static __always_inline unsigned long arch_local_irq_save(void)
#define SAVE_FLAGS(x) pushfq; popq %rax
#endif
-#define SWAPGS swapgs
-/*
- * Currently paravirt can't handle swapgs nicely when we
- * don't have a stack we can rely on (such as a user space
- * stack). So we either find a way around these or just fault
- * and emulate if a guest tries to call swapgs directly.
- *
- * Either way, this is a good way to document that we don't
- * have a reliable stack. x86_64 only.
- */
-#define SWAPGS_UNSAFE_STACK swapgs
-
#define INTERRUPT_RETURN jmp native_iret
-#define USERGS_SYSRET64 \
- swapgs; \
- sysretq;
-#define USERGS_SYSRET32 \
- swapgs; \
- sysretl
#else
#define INTERRUPT_RETURN iret
@@ -170,6 +138,20 @@ static __always_inline int arch_irqs_disabled(void)
return arch_irqs_disabled_flags(flags);
}
+
+static __always_inline void arch_local_irq_restore(unsigned long flags)
+{
+ if (!arch_irqs_disabled_flags(flags))
+ arch_local_irq_enable();
+}
+#else
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_XEN_PV
+#define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV
+#else
+#define SWAPGS swapgs
+#endif
+#endif
#endif /* !__ASSEMBLY__ */
#endif
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 991a7ad540c7..d20a3d6be36e 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -58,14 +58,17 @@ struct arch_specific_insn {
/* copy of the original instruction */
kprobe_opcode_t *insn;
/*
- * boostable = false: This instruction type is not boostable.
- * boostable = true: This instruction has been boosted: we have
+ * boostable = 0: This instruction type is not boostable.
+ * boostable = 1: This instruction has been boosted: we have
* added a relative jump after the instruction copy in insn,
* so no single-step and fixup are needed (unless there's
* a post_handler).
*/
- bool boostable;
- bool if_modifier;
+ unsigned boostable:1;
+ unsigned if_modifier:1;
+ unsigned is_call:1;
+ unsigned is_pushf:1;
+ unsigned is_abs_ip:1;
/* Number of bytes of text poked */
int tp_len;
};
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 56cdeaac76a0..ddfb3cad8dff 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -289,28 +289,6 @@ extern void (*mce_threshold_vector)(void);
extern void (*deferred_error_int_vector)(void);
/*
- * Thermal handler
- */
-
-void intel_init_thermal(struct cpuinfo_x86 *c);
-
-/* Interrupt Handler for core thermal thresholds */
-extern int (*platform_thermal_notify)(__u64 msr_val);
-
-/* Interrupt Handler for package thermal thresholds */
-extern int (*platform_thermal_package_notify)(__u64 msr_val);
-
-/* Callback support of rate control, return true, if
- * callback has rate control */
-extern bool (*platform_thermal_package_rate_control)(void);
-
-#ifdef CONFIG_X86_THERMAL_VECTOR
-extern void mcheck_intel_therm_init(void);
-#else
-static inline void mcheck_intel_therm_init(void) { }
-#endif
-
-/*
* Used by APEI to report memory error via /dev/mcelog
*/
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 2b7cc5397f80..ab45a220fac4 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -127,14 +127,12 @@ static inline unsigned int x86_cpuid_family(void)
}
#ifdef CONFIG_MICROCODE
-int __init microcode_init(void);
extern void __init load_ucode_bsp(void);
extern void load_ucode_ap(void);
void reload_early_microcode(void);
extern bool get_builtin_firmware(struct cpio_data *cd, const char *name);
extern bool initrd_gone;
#else
-static inline int __init microcode_init(void) { return 0; };
static inline void __init load_ucode_bsp(void) { }
static inline void load_ucode_ap(void) { }
static inline void reload_early_microcode(void) { }
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 9d5d949e662e..1cb9c17a4cb4 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -9,7 +9,6 @@
#ifdef CONFIG_X86_LOCAL_APIC
-extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
extern int reserve_perfctr_nmi(unsigned int);
extern void release_perfctr_nmi(unsigned int);
extern int reserve_evntsel_nmi(unsigned int);
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 645bd1d0ee07..64297eabad63 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -66,7 +66,7 @@
* On Intel CPUs, if a SYSCALL instruction is at the highest canonical
* address, then that syscall will enter the kernel with a
* non-canonical return address, and SYSRET will explode dangerously.
- * We avoid this particular problem by preventing anything executable
+ * We avoid this particular problem by preventing anything
* from being mapped at the maximum canonical address.
*
* On AMD CPUs in the Ryzen family, there's a nasty bug in which the
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index f8dce11d2bc1..4abf110e2243 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -648,11 +648,6 @@ static inline notrace unsigned long arch_local_save_flags(void)
return PVOP_CALLEE0(unsigned long, irq.save_fl);
}
-static inline notrace void arch_local_irq_restore(unsigned long f)
-{
- PVOP_VCALLEE1(irq.restore_fl, f);
-}
-
static inline notrace void arch_local_irq_disable(void)
{
PVOP_VCALLEE0(irq.irq_disable);
@@ -776,31 +771,6 @@ extern void default_banner(void);
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT_XXL
-/*
- * If swapgs is used while the userspace stack is still current,
- * there's no way to call a pvop. The PV replacement *must* be
- * inlined, or the swapgs instruction must be trapped and emulated.
- */
-#define SWAPGS_UNSAFE_STACK \
- PARA_SITE(PARA_PATCH(PV_CPU_swapgs), swapgs)
-
-/*
- * Note: swapgs is very special, and in practise is either going to be
- * implemented with a single "swapgs" instruction or something very
- * special. Either way, we don't need to save any registers for
- * it.
- */
-#define SWAPGS \
- PARA_SITE(PARA_PATCH(PV_CPU_swapgs), \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_ops+PV_CPU_swapgs); \
- )
-
-#define USERGS_SYSRET64 \
- PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64), \
- ANNOTATE_RETPOLINE_SAFE; \
- jmp PARA_INDIRECT(pv_ops+PV_CPU_usergs_sysret64);)
-
#ifdef CONFIG_DEBUG_ENTRY
#define SAVE_FLAGS(clobbers) \
PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), \
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b6b02b7c19cc..de87087d3bde 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -156,20 +156,10 @@ struct pv_cpu_ops {
u64 (*read_pmc)(int counter);
- /*
- * Switch to usermode gs and return to 64-bit usermode using
- * sysret. Only used in 64-bit kernels to return to 64-bit
- * processes. Usermode register state, including %rsp, must
- * already be restored.
- */
- void (*usergs_sysret64)(void);
-
/* Normal iret. Jump to this with the standard iret stack
frame set up. */
void (*iret)(void);
- void (*swapgs)(void);
-
void (*start_context_switch)(struct task_struct *prev);
void (*end_context_switch)(struct task_struct *next);
#endif
@@ -178,16 +168,13 @@ struct pv_cpu_ops {
struct pv_irq_ops {
#ifdef CONFIG_PARAVIRT_XXL
/*
- * Get/set interrupt state. save_fl and restore_fl are only
- * expected to use X86_EFLAGS_IF; all other bits
- * returned from save_fl are undefined, and may be ignored by
- * restore_fl.
+ * Get/set interrupt state. save_fl is expected to use X86_EFLAGS_IF;
+ * all other bits returned from save_fl are undefined.
*
* NOTE: These functions callers expect the callee to preserve
* more registers than the standard C calling convention.
*/
struct paravirt_callee_save save_fl;
- struct paravirt_callee_save restore_fl;
struct paravirt_callee_save irq_disable;
struct paravirt_callee_save irq_enable;
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index b9a7fd0a27e2..544f41a179fb 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -261,8 +261,12 @@ struct x86_pmu_capability {
#define INTEL_PMC_IDX_TD_BAD_SPEC (INTEL_PMC_IDX_METRIC_BASE + 1)
#define INTEL_PMC_IDX_TD_FE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 2)
#define INTEL_PMC_IDX_TD_BE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 3)
-#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_BE_BOUND
-#define INTEL_PMC_MSK_TOPDOWN ((0xfull << INTEL_PMC_IDX_METRIC_BASE) | \
+#define INTEL_PMC_IDX_TD_HEAVY_OPS (INTEL_PMC_IDX_METRIC_BASE + 4)
+#define INTEL_PMC_IDX_TD_BR_MISPREDICT (INTEL_PMC_IDX_METRIC_BASE + 5)
+#define INTEL_PMC_IDX_TD_FETCH_LAT (INTEL_PMC_IDX_METRIC_BASE + 6)
+#define INTEL_PMC_IDX_TD_MEM_BOUND (INTEL_PMC_IDX_METRIC_BASE + 7)
+#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_MEM_BOUND
+#define INTEL_PMC_MSK_TOPDOWN ((0xffull << INTEL_PMC_IDX_METRIC_BASE) | \
INTEL_PMC_MSK_FIXED_SLOTS)
/*
@@ -280,8 +284,14 @@ struct x86_pmu_capability {
#define INTEL_TD_METRIC_BAD_SPEC 0x8100 /* Bad speculation metric */
#define INTEL_TD_METRIC_FE_BOUND 0x8200 /* FE bound metric */
#define INTEL_TD_METRIC_BE_BOUND 0x8300 /* BE bound metric */
-#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_BE_BOUND
-#define INTEL_TD_METRIC_NUM 4
+/* Level 2 metrics */
+#define INTEL_TD_METRIC_HEAVY_OPS 0x8400 /* Heavy Operations metric */
+#define INTEL_TD_METRIC_BR_MISPREDICT 0x8500 /* Branch Mispredict metric */
+#define INTEL_TD_METRIC_FETCH_LAT 0x8600 /* Fetch Latency metric */
+#define INTEL_TD_METRIC_MEM_BOUND 0x8700 /* Memory bound metric */
+
+#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND
+#define INTEL_TD_METRIC_NUM 8
static inline bool is_metric_idx(int idx)
{
@@ -483,11 +493,7 @@ static inline void perf_check_microcode(void) { }
extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr);
#else
-static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
-{
- *nr = 0;
- return NULL;
-}
+struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
{
return -1;
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 394757ee030a..f24d7ef8fffa 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -177,8 +177,6 @@ enum page_cache_mode {
#define __pgprot(x) ((pgprot_t) { (x) } )
#define __pg(x) __pgprot(x)
-#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
-
#define PAGE_NONE __pg( 0| 0| 0|___A| 0| 0| 0|___G)
#define PAGE_SHARED __pg(__PP|__RW|_USR|___A|__NX| 0| 0| 0)
#define PAGE_SHARED_EXEC __pg(__PP|__RW|_USR|___A| 0| 0| 0| 0)
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 69485ca13665..f8cb8af4de5c 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -5,6 +5,7 @@
#include <asm/rmwcc.h>
#include <asm/percpu.h>
#include <linux/thread_info.h>
+#include <linux/static_call_types.h>
DECLARE_PER_CPU(int, __preempt_count);
@@ -103,16 +104,45 @@ static __always_inline bool should_resched(int preempt_offset)
}
#ifdef CONFIG_PREEMPTION
- extern asmlinkage void preempt_schedule_thunk(void);
-# define __preempt_schedule() \
- asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT)
- extern asmlinkage void preempt_schedule(void);
- extern asmlinkage void preempt_schedule_notrace_thunk(void);
-# define __preempt_schedule_notrace() \
- asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT)
+extern asmlinkage void preempt_schedule(void);
+extern asmlinkage void preempt_schedule_thunk(void);
- extern asmlinkage void preempt_schedule_notrace(void);
-#endif
+#define __preempt_schedule_func preempt_schedule_thunk
+
+extern asmlinkage void preempt_schedule_notrace(void);
+extern asmlinkage void preempt_schedule_notrace_thunk(void);
+
+#define __preempt_schedule_notrace_func preempt_schedule_notrace_thunk
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+DECLARE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
+
+#define __preempt_schedule() \
+do { \
+ __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule); \
+ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \
+} while (0)
+
+DECLARE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+
+#define __preempt_schedule_notrace() \
+do { \
+ __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule_notrace); \
+ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule_notrace) : ASM_CALL_CONSTRAINT); \
+} while (0)
+
+#else /* PREEMPT_DYNAMIC */
+
+#define __preempt_schedule() \
+ asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT);
+
+#define __preempt_schedule_notrace() \
+ asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT);
+
+#endif /* PREEMPT_DYNAMIC */
+
+#endif /* PREEMPTION */
#endif /* __ASM_PREEMPT_H */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 3ff0d48469f2..b2d504f11937 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -101,6 +101,7 @@
#define REQUIRED_MASK16 0
#define REQUIRED_MASK17 0
#define REQUIRED_MASK18 0
-#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
+#define REQUIRED_MASK19 0
+#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)
#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index 07603064df8f..d60ed0668a59 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -56,19 +56,22 @@ static void __resctrl_sched_in(void)
struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
u32 closid = state->default_closid;
u32 rmid = state->default_rmid;
+ u32 tmp;
/*
* If this task has a closid/rmid assigned, use it.
* Else use the closid/rmid assigned to this cpu.
*/
if (static_branch_likely(&rdt_alloc_enable_key)) {
- if (current->closid)
- closid = current->closid;
+ tmp = READ_ONCE(current->closid);
+ if (tmp)
+ closid = tmp;
}
if (static_branch_likely(&rdt_mon_enable_key)) {
- if (current->rmid)
- rmid = current->rmid;
+ tmp = READ_ONCE(current->rmid);
+ if (tmp)
+ rmid = tmp;
}
if (closid != state->cur_closid || rmid != state->cur_rmid) {
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index cc177b4431ae..1d3cbaef4bb7 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -243,10 +243,10 @@ static inline void serialize(void)
}
/* The dst parameter must be 64-bytes aligned */
-static inline void movdir64b(void *dst, const void *src)
+static inline void movdir64b(void __iomem *dst, const void *src)
{
const struct { char _[64]; } *__src = src;
- struct { char _[64]; } *__dst = dst;
+ struct { char _[64]; } __iomem *__dst = dst;
/*
* MOVDIR64B %(rdx), rax.
@@ -286,7 +286,7 @@ static inline void movdir64b(void *dst, const void *src)
static inline int enqcmds(void __iomem *dst, const void *src)
{
const struct { char _[64]; } *__src = src;
- struct { char _[64]; } *__dst = dst;
+ struct { char _[64]; } __iomem *__dst = dst;
int zf;
/*
diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h
index c37f11999d0c..cbb67b6030f9 100644
--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -37,4 +37,11 @@
#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop")
+
+#define ARCH_ADD_TRAMP_KEY(name) \
+ asm(".pushsection .static_call_tramp_key, \"a\" \n" \
+ ".long " STATIC_CALL_TRAMP_STR(name) " - . \n" \
+ ".long " STATIC_CALL_KEY_STR(name) " - . \n" \
+ ".popsection \n")
+
#endif /* _ASM_STATIC_CALL_H */
diff --git a/arch/x86/include/asm/thermal.h b/arch/x86/include/asm/thermal.h
new file mode 100644
index 000000000000..ddbdefd5b94f
--- /dev/null
+++ b/arch/x86/include/asm/thermal.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_THERMAL_H
+#define _ASM_X86_THERMAL_H
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+void intel_init_thermal(struct cpuinfo_x86 *c);
+bool x86_thermal_enabled(void);
+void intel_thermal_interrupt(void);
+#else
+static inline void intel_init_thermal(struct cpuinfo_x86 *c) { }
+#endif
+
+#endif /* _ASM_X86_THERMAL_H */
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 820082bd6880..1bfe979bb9bc 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -4,7 +4,6 @@
#define tlb_start_vma(tlb, vma) do { } while (0)
#define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
#define tlb_flush tlb_flush
static inline void tlb_flush(struct mmu_gather *tlb);
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 26efbec94448..9e8ac5073ecb 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -36,7 +36,6 @@ struct vm86 {
unsigned long saved_sp0;
unsigned long flags;
- unsigned long screen_bitmap;
unsigned long cpu_type;
struct revectored_struct int_revectored;
struct revectored_struct int21_revectored;
diff --git a/arch/x86/include/uapi/asm/vm86.h b/arch/x86/include/uapi/asm/vm86.h
index d2ee4e307ef8..18909b8050bc 100644
--- a/arch/x86/include/uapi/asm/vm86.h
+++ b/arch/x86/include/uapi/asm/vm86.h
@@ -97,7 +97,7 @@ struct revectored_struct {
struct vm86_struct {
struct vm86_regs regs;
unsigned long flags;
- unsigned long screen_bitmap;
+ unsigned long screen_bitmap; /* unused, preserved by vm86() */
unsigned long cpu_type;
struct revectored_struct int_revectored;
struct revectored_struct int21_revectored;
@@ -106,7 +106,7 @@ struct vm86_struct {
/*
* flags masks
*/
-#define VM86_SCREEN_BITMAP 0x0001
+#define VM86_SCREEN_BITMAP 0x0001 /* no longer supported */
struct vm86plus_info_struct {
unsigned long force_return_for_pic:1;
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 828be792231e..b14533af7676 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -13,9 +13,6 @@ int main(void)
{
#ifdef CONFIG_PARAVIRT
#ifdef CONFIG_PARAVIRT_XXL
- OFFSET(PV_CPU_usergs_sysret64, paravirt_patch_template,
- cpu.usergs_sysret64);
- OFFSET(PV_CPU_swapgs, paravirt_patch_template, cpu.swapgs);
#ifdef CONFIG_DEBUG_ENTRY
OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl);
#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..9215b91bc044 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -960,6 +960,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
if (c->extended_cpuid_level >= 0x8000000a)
c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
+ if (c->extended_cpuid_level >= 0x8000001f)
+ c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f);
+
init_scattered_cpuid_features(c);
init_speculation_control(c);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 816fdbec795a..0e422a544835 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -24,6 +24,7 @@
#include <asm/traps.h>
#include <asm/resctrl.h>
#include <asm/numa.h>
+#include <asm/thermal.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
@@ -719,6 +720,8 @@ static void init_intel(struct cpuinfo_x86 *c)
tsx_disable();
split_lock_init();
+
+ intel_init_thermal(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile
index 9f020c994154..015856abdbb1 100644
--- a/arch/x86/kernel/cpu/mce/Makefile
+++ b/arch/x86/kernel/cpu/mce/Makefile
@@ -9,8 +9,6 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
mce-inject-y := inject.o
obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
-obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
-
obj-$(CONFIG_ACPI_APEI) += apei.o
obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index e133ce1e562b..7962355436da 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -878,6 +878,12 @@ static atomic_t mce_executing;
static atomic_t mce_callin;
/*
+ * Track which CPUs entered the MCA broadcast synchronization and which not in
+ * order to print holdouts.
+ */
+static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
+
+/*
* Check if a timeout waiting for other CPUs happened.
*/
static int mce_timed_out(u64 *t, const char *msg)
@@ -894,8 +900,12 @@ static int mce_timed_out(u64 *t, const char *msg)
if (!mca_cfg.monarch_timeout)
goto out;
if ((s64)*t < SPINUNIT) {
- if (mca_cfg.tolerant <= 1)
+ if (mca_cfg.tolerant <= 1) {
+ if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
+ pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
+ cpumask_pr_args(&mce_missing_cpus));
mce_panic(msg, NULL, NULL);
+ }
cpu_missing = 1;
return 1;
}
@@ -1006,6 +1016,7 @@ static int mce_start(int *no_way_out)
* is updated before mce_callin.
*/
order = atomic_inc_return(&mce_callin);
+ cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
/*
* Wait for everyone.
@@ -1114,6 +1125,7 @@ static int mce_end(int order)
reset:
atomic_set(&global_nwo, 0);
atomic_set(&mce_callin, 0);
+ cpumask_setall(&mce_missing_cpus);
barrier();
/*
@@ -2178,7 +2190,6 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
- mcheck_intel_therm_init();
mce_register_decode_chain(&early_nb);
mce_register_decode_chain(&mce_uc_nb);
mce_register_decode_chain(&mce_default_nb);
@@ -2713,6 +2724,7 @@ static void mce_reset(void)
atomic_set(&mce_executing, 0);
atomic_set(&mce_callin, 0);
atomic_set(&global_nwo, 0);
+ cpumask_setall(&mce_missing_cpus);
}
static int fake_panic_get(void *data, u64 *val)
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index c2476fe0682e..e309476743b7 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -531,7 +531,6 @@ static void intel_imc_init(struct cpuinfo_x86 *c)
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
- intel_init_thermal(c);
intel_init_cmci();
intel_init_lmce();
intel_ppin_init(c);
diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c
deleted file mode 100644
index a7cd2d203ced..000000000000
--- a/arch/x86/kernel/cpu/mce/therm_throt.c
+++ /dev/null
@@ -1,739 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Thermal throttle event support code (such as syslog messaging and rate
- * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
- *
- * This allows consistent reporting of CPU thermal throttle events.
- *
- * Maintains a counter in /sys that keeps track of the number of thermal
- * events, such that the user knows how bad the thermal problem might be
- * (since the logging to syslog is rate limited).
- *
- * Author: Dmitriy Zavin (dmitriyz@google.com)
- *
- * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
- * Inspired by Ross Biro's and Al Borchers' counter code.
- */
-#include <linux/interrupt.h>
-#include <linux/notifier.h>
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/percpu.h>
-#include <linux/export.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
-
-#include <asm/processor.h>
-#include <asm/traps.h>
-#include <asm/apic.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-#include <asm/trace/irq_vectors.h>
-
-#include "internal.h"
-
-/* How long to wait between reporting thermal events */
-#define CHECK_INTERVAL (300 * HZ)
-
-#define THERMAL_THROTTLING_EVENT 0
-#define POWER_LIMIT_EVENT 1
-
-/**
- * struct _thermal_state - Represent the current thermal event state
- * @next_check: Stores the next timestamp, when it is allowed
- * to log the next warning message.
- * @last_interrupt_time: Stores the timestamp for the last threshold
- * high event.
- * @therm_work: Delayed workqueue structure
- * @count: Stores the current running count for thermal
- * or power threshold interrupts.
- * @last_count: Stores the previous running count for thermal
- * or power threshold interrupts.
- * @max_time_ms: This shows the maximum amount of time CPU was
- * in throttled state for a single thermal
- * threshold high to low state.
- * @total_time_ms: This is a cumulative time during which CPU was
- * in the throttled state.
- * @rate_control_active: Set when a throttling message is logged.
- * This is used for the purpose of rate-control.
- * @new_event: Stores the last high/low status of the
- * THERM_STATUS_PROCHOT or
- * THERM_STATUS_POWER_LIMIT.
- * @level: Stores whether this _thermal_state instance is
- * for a CORE level or for PACKAGE level.
- * @sample_index: Index for storing the next sample in the buffer
- * temp_samples[].
- * @sample_count: Total number of samples collected in the buffer
- * temp_samples[].
- * @average: The last moving average of temperature samples
- * @baseline_temp: Temperature at which thermal threshold high
- * interrupt was generated.
- * @temp_samples: Storage for temperature samples to calculate
- * moving average.
- *
- * This structure is used to represent data related to thermal state for a CPU.
- * There is a separate storage for core and package level for each CPU.
- */
-struct _thermal_state {
- u64 next_check;
- u64 last_interrupt_time;
- struct delayed_work therm_work;
- unsigned long count;
- unsigned long last_count;
- unsigned long max_time_ms;
- unsigned long total_time_ms;
- bool rate_control_active;
- bool new_event;
- u8 level;
- u8 sample_index;
- u8 sample_count;
- u8 average;
- u8 baseline_temp;
- u8 temp_samples[3];
-};
-
-struct thermal_state {
- struct _thermal_state core_throttle;
- struct _thermal_state core_power_limit;
- struct _thermal_state package_throttle;
- struct _thermal_state package_power_limit;
- struct _thermal_state core_thresh0;
- struct _thermal_state core_thresh1;
- struct _thermal_state pkg_thresh0;
- struct _thermal_state pkg_thresh1;
-};
-
-/* Callback to handle core threshold interrupts */
-int (*platform_thermal_notify)(__u64 msr_val);
-EXPORT_SYMBOL(platform_thermal_notify);
-
-/* Callback to handle core package threshold_interrupts */
-int (*platform_thermal_package_notify)(__u64 msr_val);
-EXPORT_SYMBOL_GPL(platform_thermal_package_notify);
-
-/* Callback support of rate control, return true, if
- * callback has rate control */
-bool (*platform_thermal_package_rate_control)(void);
-EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control);
-
-
-static DEFINE_PER_CPU(struct thermal_state, thermal_state);
-
-static atomic_t therm_throt_en = ATOMIC_INIT(0);
-
-static u32 lvtthmr_init __read_mostly;
-
-#ifdef CONFIG_SYSFS
-#define define_therm_throt_device_one_ro(_name) \
- static DEVICE_ATTR(_name, 0444, \
- therm_throt_device_show_##_name, \
- NULL) \
-
-#define define_therm_throt_device_show_func(event, name) \
- \
-static ssize_t therm_throt_device_show_##event##_##name( \
- struct device *dev, \
- struct device_attribute *attr, \
- char *buf) \
-{ \
- unsigned int cpu = dev->id; \
- ssize_t ret; \
- \
- preempt_disable(); /* CPU hotplug */ \
- if (cpu_online(cpu)) { \
- ret = sprintf(buf, "%lu\n", \
- per_cpu(thermal_state, cpu).event.name); \
- } else \
- ret = 0; \
- preempt_enable(); \
- \
- return ret; \
-}
-
-define_therm_throt_device_show_func(core_throttle, count);
-define_therm_throt_device_one_ro(core_throttle_count);
-
-define_therm_throt_device_show_func(core_power_limit, count);
-define_therm_throt_device_one_ro(core_power_limit_count);
-
-define_therm_throt_device_show_func(package_throttle, count);
-define_therm_throt_device_one_ro(package_throttle_count);
-
-define_therm_throt_device_show_func(package_power_limit, count);
-define_therm_throt_device_one_ro(package_power_limit_count);
-
-define_therm_throt_device_show_func(core_throttle, max_time_ms);
-define_therm_throt_device_one_ro(core_throttle_max_time_ms);
-
-define_therm_throt_device_show_func(package_throttle, max_time_ms);
-define_therm_throt_device_one_ro(package_throttle_max_time_ms);
-
-define_therm_throt_device_show_func(core_throttle, total_time_ms);
-define_therm_throt_device_one_ro(core_throttle_total_time_ms);
-
-define_therm_throt_device_show_func(package_throttle, total_time_ms);
-define_therm_throt_device_one_ro(package_throttle_total_time_ms);
-
-static struct attribute *thermal_throttle_attrs[] = {
- &dev_attr_core_throttle_count.attr,
- &dev_attr_core_throttle_max_time_ms.attr,
- &dev_attr_core_throttle_total_time_ms.attr,
- NULL
-};
-
-static const struct attribute_group thermal_attr_group = {
- .attrs = thermal_throttle_attrs,
- .name = "thermal_throttle"
-};
-#endif /* CONFIG_SYSFS */
-
-#define CORE_LEVEL 0
-#define PACKAGE_LEVEL 1
-
-#define THERM_THROT_POLL_INTERVAL HZ
-#define THERM_STATUS_PROCHOT_LOG BIT(1)
-
-#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15))
-#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11))
-
-static void clear_therm_status_log(int level)
-{
- int msr;
- u64 mask, msr_val;
-
- if (level == CORE_LEVEL) {
- msr = MSR_IA32_THERM_STATUS;
- mask = THERM_STATUS_CLEAR_CORE_MASK;
- } else {
- msr = MSR_IA32_PACKAGE_THERM_STATUS;
- mask = THERM_STATUS_CLEAR_PKG_MASK;
- }
-
- rdmsrl(msr, msr_val);
- msr_val &= mask;
- wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG);
-}
-
-static void get_therm_status(int level, bool *proc_hot, u8 *temp)
-{
- int msr;
- u64 msr_val;
-
- if (level == CORE_LEVEL)
- msr = MSR_IA32_THERM_STATUS;
- else
- msr = MSR_IA32_PACKAGE_THERM_STATUS;
-
- rdmsrl(msr, msr_val);
- if (msr_val & THERM_STATUS_PROCHOT_LOG)
- *proc_hot = true;
- else
- *proc_hot = false;
-
- *temp = (msr_val >> 16) & 0x7F;
-}
-
-static void __maybe_unused throttle_active_work(struct work_struct *work)
-{
- struct _thermal_state *state = container_of(to_delayed_work(work),
- struct _thermal_state, therm_work);
- unsigned int i, avg, this_cpu = smp_processor_id();
- u64 now = get_jiffies_64();
- bool hot;
- u8 temp;
-
- get_therm_status(state->level, &hot, &temp);
- /* temperature value is offset from the max so lesser means hotter */
- if (!hot && temp > state->baseline_temp) {
- if (state->rate_control_active)
- pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n",
- this_cpu,
- state->level == CORE_LEVEL ? "Core" : "Package",
- state->count);
-
- state->rate_control_active = false;
- return;
- }
-
- if (time_before64(now, state->next_check) &&
- state->rate_control_active)
- goto re_arm;
-
- state->next_check = now + CHECK_INTERVAL;
-
- if (state->count != state->last_count) {
- /* There was one new thermal interrupt */
- state->last_count = state->count;
- state->average = 0;
- state->sample_count = 0;
- state->sample_index = 0;
- }
-
- state->temp_samples[state->sample_index] = temp;
- state->sample_count++;
- state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples);
- if (state->sample_count < ARRAY_SIZE(state->temp_samples))
- goto re_arm;
-
- avg = 0;
- for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i)
- avg += state->temp_samples[i];
-
- avg /= ARRAY_SIZE(state->temp_samples);
-
- if (state->average > avg) {
- pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n",
- this_cpu,
- state->level == CORE_LEVEL ? "Core" : "Package",
- state->count);
- state->rate_control_active = true;
- }
-
- state->average = avg;
-
-re_arm:
- clear_therm_status_log(state->level);
- schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
-}
-
-/***
- * therm_throt_process - Process thermal throttling event from interrupt
- * @curr: Whether the condition is current or not (boolean), since the
- * thermal interrupt normally gets called both when the thermal
- * event begins and once the event has ended.
- *
- * This function is called by the thermal interrupt after the
- * IRQ has been acknowledged.
- *
- * It will take care of rate limiting and printing messages to the syslog.
- */
-static void therm_throt_process(bool new_event, int event, int level)
-{
- struct _thermal_state *state;
- unsigned int this_cpu = smp_processor_id();
- bool old_event;
- u64 now;
- struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
-
- now = get_jiffies_64();
- if (level == CORE_LEVEL) {
- if (event == THERMAL_THROTTLING_EVENT)
- state = &pstate->core_throttle;
- else if (event == POWER_LIMIT_EVENT)
- state = &pstate->core_power_limit;
- else
- return;
- } else if (level == PACKAGE_LEVEL) {
- if (event == THERMAL_THROTTLING_EVENT)
- state = &pstate->package_throttle;
- else if (event == POWER_LIMIT_EVENT)
- state = &pstate->package_power_limit;
- else
- return;
- } else
- return;
-
- old_event = state->new_event;
- state->new_event = new_event;
-
- if (new_event)
- state->count++;
-
- if (event != THERMAL_THROTTLING_EVENT)
- return;
-
- if (new_event && !state->last_interrupt_time) {
- bool hot;
- u8 temp;
-
- get_therm_status(state->level, &hot, &temp);
- /*
- * Ignore short temperature spike as the system is not close
- * to PROCHOT. 10C offset is large enough to ignore. It is
- * already dropped from the high threshold temperature.
- */
- if (temp > 10)
- return;
-
- state->baseline_temp = temp;
- state->last_interrupt_time = now;
- schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
- } else if (old_event && state->last_interrupt_time) {
- unsigned long throttle_time;
-
- throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time);
- if (throttle_time > state->max_time_ms)
- state->max_time_ms = throttle_time;
- state->total_time_ms += throttle_time;
- state->last_interrupt_time = 0;
- }
-}
-
-static int thresh_event_valid(int level, int event)
-{
- struct _thermal_state *state;
- unsigned int this_cpu = smp_processor_id();
- struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
- u64 now = get_jiffies_64();
-
- if (level == PACKAGE_LEVEL)
- state = (event == 0) ? &pstate->pkg_thresh0 :
- &pstate->pkg_thresh1;
- else
- state = (event == 0) ? &pstate->core_thresh0 :
- &pstate->core_thresh1;
-
- if (time_before64(now, state->next_check))
- return 0;
-
- state->next_check = now + CHECK_INTERVAL;
-
- return 1;
-}
-
-static bool int_pln_enable;
-static int __init int_pln_enable_setup(char *s)
-{
- int_pln_enable = true;
-
- return 1;
-}
-__setup("int_pln_enable", int_pln_enable_setup);
-
-#ifdef CONFIG_SYSFS
-/* Add/Remove thermal_throttle interface for CPU device: */
-static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
-{
- int err;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
- if (err)
- return err;
-
- if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
- err = sysfs_add_file_to_group(&dev->kobj,
- &dev_attr_core_power_limit_count.attr,
- thermal_attr_group.name);
- if (err)
- goto del_group;
- }
-
- if (cpu_has(c, X86_FEATURE_PTS)) {
- err = sysfs_add_file_to_group(&dev->kobj,
- &dev_attr_package_throttle_count.attr,
- thermal_attr_group.name);
- if (err)
- goto del_group;
-
- err = sysfs_add_file_to_group(&dev->kobj,
- &dev_attr_package_throttle_max_time_ms.attr,
- thermal_attr_group.name);
- if (err)
- goto del_group;
-
- err = sysfs_add_file_to_group(&dev->kobj,
- &dev_attr_package_throttle_total_time_ms.attr,
- thermal_attr_group.name);
- if (err)
- goto del_group;
-
- if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
- err = sysfs_add_file_to_group(&dev->kobj,
- &dev_attr_package_power_limit_count.attr,
- thermal_attr_group.name);
- if (err)
- goto del_group;
- }
- }
-
- return 0;
-
-del_group:
- sysfs_remove_group(&dev->kobj, &thermal_attr_group);
-
- return err;
-}
-
-static void thermal_throttle_remove_dev(struct device *dev)
-{
- sysfs_remove_group(&dev->kobj, &thermal_attr_group);
-}
-
-/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int thermal_throttle_online(unsigned int cpu)
-{
- struct thermal_state *state = &per_cpu(thermal_state, cpu);
- struct device *dev = get_cpu_device(cpu);
- u32 l;
-
- state->package_throttle.level = PACKAGE_LEVEL;
- state->core_throttle.level = CORE_LEVEL;
-
- INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work);
- INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work);
-
- /* Unmask the thermal vector after the above workqueues are initialized. */
- l = apic_read(APIC_LVTTHMR);
- apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
-
- return thermal_throttle_add_dev(dev, cpu);
-}
-
-static int thermal_throttle_offline(unsigned int cpu)
-{
- struct thermal_state *state = &per_cpu(thermal_state, cpu);
- struct device *dev = get_cpu_device(cpu);
- u32 l;
-
- /* Mask the thermal vector before draining evtl. pending work */
- l = apic_read(APIC_LVTTHMR);
- apic_write(APIC_LVTTHMR, l | APIC_LVT_MASKED);
-
- cancel_delayed_work_sync(&state->package_throttle.therm_work);
- cancel_delayed_work_sync(&state->core_throttle.therm_work);
-
- state->package_throttle.rate_control_active = false;
- state->core_throttle.rate_control_active = false;
-
- thermal_throttle_remove_dev(dev);
- return 0;
-}
-
-static __init int thermal_throttle_init_device(void)
-{
- int ret;
-
- if (!atomic_read(&therm_throt_en))
- return 0;
-
- ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online",
- thermal_throttle_online,
- thermal_throttle_offline);
- return ret < 0 ? ret : 0;
-}
-device_initcall(thermal_throttle_init_device);
-
-#endif /* CONFIG_SYSFS */
-
-static void notify_package_thresholds(__u64 msr_val)
-{
- bool notify_thres_0 = false;
- bool notify_thres_1 = false;
-
- if (!platform_thermal_package_notify)
- return;
-
- /* lower threshold check */
- if (msr_val & THERM_LOG_THRESHOLD0)
- notify_thres_0 = true;
- /* higher threshold check */
- if (msr_val & THERM_LOG_THRESHOLD1)
- notify_thres_1 = true;
-
- if (!notify_thres_0 && !notify_thres_1)
- return;
-
- if (platform_thermal_package_rate_control &&
- platform_thermal_package_rate_control()) {
- /* Rate control is implemented in callback */
- platform_thermal_package_notify(msr_val);
- return;
- }
-
- /* lower threshold reached */
- if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0))
- platform_thermal_package_notify(msr_val);
- /* higher threshold reached */
- if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1))
- platform_thermal_package_notify(msr_val);
-}
-
-static void notify_thresholds(__u64 msr_val)
-{
- /* check whether the interrupt handler is defined;
- * otherwise simply return
- */
- if (!platform_thermal_notify)
- return;
-
- /* lower threshold reached */
- if ((msr_val & THERM_LOG_THRESHOLD0) &&
- thresh_event_valid(CORE_LEVEL, 0))
- platform_thermal_notify(msr_val);
- /* higher threshold reached */
- if ((msr_val & THERM_LOG_THRESHOLD1) &&
- thresh_event_valid(CORE_LEVEL, 1))
- platform_thermal_notify(msr_val);
-}
-
-/* Thermal transition interrupt handler */
-static void intel_thermal_interrupt(void)
-{
- __u64 msr_val;
-
- if (static_cpu_has(X86_FEATURE_HWP))
- wrmsrl_safe(MSR_HWP_STATUS, 0);
-
- rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
-
- /* Check for violation of core thermal thresholds*/
- notify_thresholds(msr_val);
-
- therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
- THERMAL_THROTTLING_EVENT,
- CORE_LEVEL);
-
- if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
- therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
- POWER_LIMIT_EVENT,
- CORE_LEVEL);
-
- if (this_cpu_has(X86_FEATURE_PTS)) {
- rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
- /* check violations of package thermal thresholds */
- notify_package_thresholds(msr_val);
- therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
- THERMAL_THROTTLING_EVENT,
- PACKAGE_LEVEL);
- if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
- therm_throt_process(msr_val &
- PACKAGE_THERM_STATUS_POWER_LIMIT,
- POWER_LIMIT_EVENT,
- PACKAGE_LEVEL);
- }
-}
-
-static void unexpected_thermal_interrupt(void)
-{
- pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
- smp_processor_id());
-}
-
-static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
-
-DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
-{
- trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
- inc_irq_stat(irq_thermal_count);
- smp_thermal_vector();
- trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
- ack_APIC_irq();
-}
-
-/* Thermal monitoring depends on APIC, ACPI and clock modulation */
-static int intel_thermal_supported(struct cpuinfo_x86 *c)
-{
- if (!boot_cpu_has(X86_FEATURE_APIC))
- return 0;
- if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
- return 0;
- return 1;
-}
-
-void __init mcheck_intel_therm_init(void)
-{
- /*
- * This function is only called on boot CPU. Save the init thermal
- * LVT value on BSP and use that value to restore APs' thermal LVT
- * entry BIOS programmed later
- */
- if (intel_thermal_supported(&boot_cpu_data))
- lvtthmr_init = apic_read(APIC_LVTTHMR);
-}
-
-void intel_init_thermal(struct cpuinfo_x86 *c)
-{
- unsigned int cpu = smp_processor_id();
- int tm2 = 0;
- u32 l, h;
-
- if (!intel_thermal_supported(c))
- return;
-
- /*
- * First check if its enabled already, in which case there might
- * be some SMM goo which handles it, so we can't even put a handler
- * since it might be delivered via SMI already:
- */
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-
- h = lvtthmr_init;
- /*
- * The initial value of thermal LVT entries on all APs always reads
- * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
- * sequence to them and LVT registers are reset to 0s except for
- * the mask bits which are set to 1s when APs receive INIT IPI.
- * If BIOS takes over the thermal interrupt and sets its interrupt
- * delivery mode to SMI (not fixed), it restores the value that the
- * BIOS has programmed on AP based on BSP's info we saved since BIOS
- * is always setting the same value for all threads/cores.
- */
- if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
- apic_write(APIC_LVTTHMR, lvtthmr_init);
-
-
- if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
- if (system_state == SYSTEM_BOOTING)
- pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu);
- return;
- }
-
- /* early Pentium M models use different method for enabling TM2 */
- if (cpu_has(c, X86_FEATURE_TM2)) {
- if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
- rdmsr(MSR_THERM2_CTL, l, h);
- if (l & MSR_THERM2_CTL_TM_SELECT)
- tm2 = 1;
- } else if (l & MSR_IA32_MISC_ENABLE_TM2)
- tm2 = 1;
- }
-
- /* We'll mask the thermal vector in the lapic till we're ready: */
- h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
- apic_write(APIC_LVTTHMR, h);
-
- rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
- if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- (l | (THERM_INT_LOW_ENABLE
- | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h);
- else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- l | (THERM_INT_LOW_ENABLE
- | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
- else
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
-
- if (cpu_has(c, X86_FEATURE_PTS)) {
- rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
- if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
- wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
- (l | (PACKAGE_THERM_INT_LOW_ENABLE
- | PACKAGE_THERM_INT_HIGH_ENABLE))
- & ~PACKAGE_THERM_INT_PLN_ENABLE, h);
- else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
- wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
- l | (PACKAGE_THERM_INT_LOW_ENABLE
- | PACKAGE_THERM_INT_HIGH_ENABLE
- | PACKAGE_THERM_INT_PLN_ENABLE), h);
- else
- wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
- l | (PACKAGE_THERM_INT_LOW_ENABLE
- | PACKAGE_THERM_INT_HIGH_ENABLE), h);
- }
-
- smp_thermal_vector = intel_thermal_interrupt;
-
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
-
- pr_info_once("CPU0: Thermal monitoring enabled (%s)\n",
- tm2 ? "TM2" : "TM1");
-
- /* enable thermal throttle processing */
- atomic_set(&therm_throt_en, 1);
-}
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index ec6f0415bc6d..b935e1b5f115 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -830,7 +830,7 @@ static const struct attribute_group cpu_root_microcode_group = {
.attrs = cpu_root_microcode_attrs,
};
-int __init microcode_init(void)
+static int __init microcode_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
int error;
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 5bd011737272..9231640782fa 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -537,9 +537,9 @@ static void __init print_out_mtrr_range_state(void)
if (!size_base)
continue;
- size_base = to_size_factor(size_base, &size_factor),
+ size_base = to_size_factor(size_base, &size_factor);
start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
- start_base = to_size_factor(start_base, &start_factor),
+ start_base = to_size_factor(start_base, &start_factor);
type = range_state[i].type;
pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index a29997e6cf9e..b90f3f437765 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -3,7 +3,6 @@
* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
* because MTRRs can span up to 40 bits (36bits on most modern x86)
*/
-#define DEBUG
#include <linux/export.h>
#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 61eb26edc6d2..28c8a23aa42e 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -31,8 +31,6 @@
System Programming Guide; Section 9.11. (1997 edition - PPro).
*/
-#define DEBUG
-
#include <linux/types.h> /* FIXME: kvm_para.h needs this */
#include <linux/stop_machine.h>
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index a5ee607a3b89..3ef5868ac588 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -3,7 +3,7 @@
* local apic based NMI watchdog for various CPUs.
*
* This file also handles reservation of performance counters for coordination
- * with other users (like oprofile).
+ * with other users.
*
* Note that these events normally don't tick when the CPU idles. This means
* the frequency varies with CPU load.
@@ -105,15 +105,6 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
}
-/* checks for a bit availability (hack for oprofile) */
-int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
-{
- BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
- return !test_bit(counter, perfctr_nmi_owner);
-}
-EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
-
int reserve_perfctr_nmi(unsigned int msr)
{
unsigned int counter;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index ee71c47844cb..c4d320d02fd5 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -572,6 +572,7 @@ union cpuid_0x10_x_edx {
void rdt_last_cmd_clear(void);
void rdt_last_cmd_puts(const char *s);
+__printf(1, 2)
void rdt_last_cmd_printf(const char *fmt, ...);
void rdt_ctrl_update(void *arg);
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 460f3e0df106..f9190adc52cb 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -563,11 +563,11 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
*/
if (rdtgrp->type == RDTCTRL_GROUP) {
- tsk->closid = rdtgrp->closid;
- tsk->rmid = rdtgrp->mon.rmid;
+ WRITE_ONCE(tsk->closid, rdtgrp->closid);
+ WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid);
} else if (rdtgrp->type == RDTMON_GROUP) {
if (rdtgrp->mon.parent->closid == tsk->closid) {
- tsk->rmid = rdtgrp->mon.rmid;
+ WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid);
} else {
rdt_last_cmd_puts("Can't move task to different control group\n");
return -EINVAL;
@@ -2310,22 +2310,18 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
for_each_process_thread(p, t) {
if (!from || is_closid_match(t, from) ||
is_rmid_match(t, from)) {
- t->closid = to->closid;
- t->rmid = to->mon.rmid;
+ WRITE_ONCE(t->closid, to->closid);
+ WRITE_ONCE(t->rmid, to->mon.rmid);
-#ifdef CONFIG_SMP
/*
- * This is safe on x86 w/o barriers as the ordering
- * of writing to task_cpu() and t->on_cpu is
- * reverse to the reading here. The detection is
- * inaccurate as tasks might move or schedule
- * before the smp function call takes place. In
- * such a case the function call is pointless, but
+ * If the task is on a CPU, set the CPU in the mask.
+ * The detection is inaccurate as tasks might move or
+ * schedule before the smp function call takes place.
+ * In such a case the function call is pointless, but
* there is no other side effect.
*/
- if (mask && t->on_cpu)
+ if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
cpumask_set_cpu(task_cpu(t), mask);
-#endif
}
}
read_unlock(&tasklist_lock);
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 236924930bf0..972ec3bfa9c0 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -40,11 +40,6 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
- { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 },
- { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 },
- { X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 },
- { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 },
- { X86_FEATURE_VM_PAGE_FLUSH, CPUID_EAX, 2, 0x8000001f, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index f2eac41bb4ff..8ce6d8371cfb 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -72,6 +72,9 @@ static int sgx_release(struct inode *inode, struct file *file)
synchronize_srcu(&encl->srcu);
mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm);
kfree(encl_mm);
+
+ /* 'encl_mm' is gone, put encl_mm->encl reference: */
+ kref_put(&encl->refcount, sgx_encl_release);
}
kref_put(&encl->refcount, sgx_encl_release);
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index ee50a5010277..7449ef33f081 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -141,7 +141,6 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
struct sgx_encl_page *entry;
unsigned long phys_addr;
struct sgx_encl *encl;
- unsigned long pfn;
vm_fault_t ret;
encl = vma->vm_private_data;
@@ -168,13 +167,6 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
phys_addr = sgx_get_epc_phys_addr(entry->epc_page);
- /* Check if another thread got here first to insert the PTE. */
- if (!follow_pfn(vma, addr, &pfn)) {
- mutex_unlock(&encl->lock);
-
- return VM_FAULT_NOPAGE;
- }
-
ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
if (ret != VM_FAULT_NOPAGE) {
mutex_unlock(&encl->lock);
@@ -481,6 +473,9 @@ static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
{
struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
+ /* 'encl_mm' is going away, put encl_mm->encl reference: */
+ kref_put(&encl_mm->encl->refcount, sgx_encl_release);
+
kfree(encl_mm);
}
@@ -534,6 +529,8 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
if (!encl_mm)
return -ENOMEM;
+ /* Grab a refcount for the encl_mm->encl reference: */
+ kref_get(&encl->refcount);
encl_mm->encl = encl;
encl_mm->mm = mm;
encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index c519fc5f6948..8df81a3ed945 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -700,25 +700,27 @@ static bool __init sgx_page_cache_init(void)
return true;
}
-static void __init sgx_init(void)
+static int __init sgx_init(void)
{
int ret;
int i;
if (!cpu_feature_enabled(X86_FEATURE_SGX))
- return;
+ return -ENODEV;
if (!sgx_page_cache_init())
- return;
+ return -ENOMEM;
- if (!sgx_page_reclaimer_init())
+ if (!sgx_page_reclaimer_init()) {
+ ret = -ENOMEM;
goto err_page_cache;
+ }
ret = sgx_drv_init();
if (ret)
goto err_kthread;
- return;
+ return 0;
err_kthread:
kthread_stop(ksgxd_tsk);
@@ -728,6 +730,8 @@ err_page_cache:
vfree(sgx_epc_sections[i].pages);
memunmap(sgx_epc_sections[i].virt_addr);
}
+
+ return ret;
}
device_initcall(sgx_init);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 5d8047441a0a..683749b80ae2 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -167,14 +167,14 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
fx->fop = 0;
fx->rip = 0;
fx->rdp = 0;
- memset(&fx->st_space[0], 0, 128);
+ memset(fx->st_space, 0, sizeof(fx->st_space));
}
/*
* SSE is in init state
*/
if (!(xfeatures & XFEATURE_MASK_SSE))
- memset(&fx->xmm_space[0], 0, 256);
+ memset(fx->xmm_space, 0, sizeof(fx->xmm_space));
/*
* First two features are FPU and SSE, which above we handled
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c5dd50369e2f..d4ad344e80bf 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -21,6 +21,7 @@
#include <asm/hw_irq.h>
#include <asm/desc.h>
#include <asm/traps.h>
+#include <asm/thermal.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h>
@@ -374,3 +375,23 @@ void fixup_irqs(void)
}
}
#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+static void smp_thermal_vector(void)
+{
+ if (x86_thermal_enabled())
+ intel_thermal_interrupt();
+ else
+ pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
+ smp_processor_id());
+}
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
+{
+ trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
+ inc_irq_stat(irq_thermal_count);
+ smp_thermal_vector();
+ trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
+ ack_APIC_irq();
+}
+#endif
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
index 0db0375235b4..8ef35063964b 100644
--- a/arch/x86/kernel/irqflags.S
+++ b/arch/x86/kernel/irqflags.S
@@ -13,14 +13,3 @@ SYM_FUNC_START(native_save_fl)
ret
SYM_FUNC_END(native_save_fl)
EXPORT_SYMBOL(native_save_fl)
-
-/*
- * void native_restore_fl(unsigned long flags)
- * %eax/%rdi: flags
- */
-SYM_FUNC_START(native_restore_fl)
- push %_ASM_ARG1
- popf
- ret
-SYM_FUNC_END(native_restore_fl)
-EXPORT_SYMBOL(native_restore_fl)
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index a65e9e97857f..df776cdca327 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -133,26 +133,6 @@ void synthesize_relcall(void *dest, void *from, void *to)
NOKPROBE_SYMBOL(synthesize_relcall);
/*
- * Skip the prefixes of the instruction.
- */
-static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn)
-{
- insn_attr_t attr;
-
- attr = inat_get_opcode_attribute((insn_byte_t)*insn);
- while (inat_is_legacy_prefix(attr)) {
- insn++;
- attr = inat_get_opcode_attribute((insn_byte_t)*insn);
- }
-#ifdef CONFIG_X86_64
- if (inat_is_rex_prefix(attr))
- insn++;
-#endif
- return insn;
-}
-NOKPROBE_SYMBOL(skip_prefixes);
-
-/*
* Returns non-zero if INSN is boostable.
* RIP relative instructions are adjusted at copying time in 64 bits mode
*/
@@ -312,25 +292,6 @@ static int can_probe(unsigned long paddr)
}
/*
- * Returns non-zero if opcode modifies the interrupt flag.
- */
-static int is_IF_modifier(kprobe_opcode_t *insn)
-{
- /* Skip prefixes */
- insn = skip_prefixes(insn);
-
- switch (*insn) {
- case 0xfa: /* cli */
- case 0xfb: /* sti */
- case 0xcf: /* iret/iretd */
- case 0x9d: /* popf/popfd */
- return 1;
- }
-
- return 0;
-}
-
-/*
* Copy an instruction with recovering modified instruction by kprobes
* and adjust the displacement if the instruction uses the %rip-relative
* addressing mode. Note that since @real will be the final place of copied
@@ -411,9 +372,9 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p,
synthesize_reljump(buf + len, p->ainsn.insn + len,
p->addr + insn->length);
len += JMP32_INSN_SIZE;
- p->ainsn.boostable = true;
+ p->ainsn.boostable = 1;
} else {
- p->ainsn.boostable = false;
+ p->ainsn.boostable = 0;
}
return len;
@@ -450,6 +411,67 @@ void free_insn_page(void *page)
module_memfree(page);
}
+static void set_resume_flags(struct kprobe *p, struct insn *insn)
+{
+ insn_byte_t opcode = insn->opcode.bytes[0];
+
+ switch (opcode) {
+ case 0xfa: /* cli */
+ case 0xfb: /* sti */
+ case 0x9d: /* popf/popfd */
+ /* Check whether the instruction modifies Interrupt Flag or not */
+ p->ainsn.if_modifier = 1;
+ break;
+ case 0x9c: /* pushfl */
+ p->ainsn.is_pushf = 1;
+ break;
+ case 0xcf: /* iret */
+ p->ainsn.if_modifier = 1;
+ fallthrough;
+ case 0xc2: /* ret/lret */
+ case 0xc3:
+ case 0xca:
+ case 0xcb:
+ case 0xea: /* jmp absolute -- ip is correct */
+ /* ip is already adjusted, no more changes required */
+ p->ainsn.is_abs_ip = 1;
+ /* Without resume jump, this is boostable */
+ p->ainsn.boostable = 1;
+ break;
+ case 0xe8: /* call relative - Fix return addr */
+ p->ainsn.is_call = 1;
+ break;
+#ifdef CONFIG_X86_32
+ case 0x9a: /* call absolute -- same as call absolute, indirect */
+ p->ainsn.is_call = 1;
+ p->ainsn.is_abs_ip = 1;
+ break;
+#endif
+ case 0xff:
+ opcode = insn->opcode.bytes[1];
+ if ((opcode & 0x30) == 0x10) {
+ /*
+ * call absolute, indirect
+ * Fix return addr; ip is correct.
+ * But this is not boostable
+ */
+ p->ainsn.is_call = 1;
+ p->ainsn.is_abs_ip = 1;
+ break;
+ } else if (((opcode & 0x31) == 0x20) ||
+ ((opcode & 0x31) == 0x21)) {
+ /*
+ * jmp near and far, absolute indirect
+ * ip is correct.
+ */
+ p->ainsn.is_abs_ip = 1;
+ /* Without resume jump, this is boostable */
+ p->ainsn.boostable = 1;
+ }
+ break;
+ }
+}
+
static int arch_copy_kprobe(struct kprobe *p)
{
struct insn insn;
@@ -467,8 +489,8 @@ static int arch_copy_kprobe(struct kprobe *p)
*/
len = prepare_boost(buf, p, &insn);
- /* Check whether the instruction modifies Interrupt Flag or not */
- p->ainsn.if_modifier = is_IF_modifier(buf);
+ /* Analyze the opcode and set resume flags */
+ set_resume_flags(p, &insn);
/* Also, displacement change doesn't affect the first byte */
p->opcode = buf[0];
@@ -491,6 +513,9 @@ int arch_prepare_kprobe(struct kprobe *p)
if (!can_probe((unsigned long)p->addr))
return -EILSEQ;
+
+ memset(&p->ainsn, 0, sizeof(p->ainsn));
+
/* insn: must be on special executable page on x86. */
p->ainsn.insn = get_insn_slot();
if (!p->ainsn.insn)
@@ -806,11 +831,6 @@ NOKPROBE_SYMBOL(trampoline_handler);
* 2) If the single-stepped instruction was a call, the return address
* that is atop the stack is the address following the copied instruction.
* We need to make it the address following the original instruction.
- *
- * If this is the first time we've single-stepped the instruction at
- * this probepoint, and the instruction is boostable, boost it: add a
- * jump instruction after the copied instruction, that jumps to the next
- * instruction after the probepoint.
*/
static void resume_execution(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb)
@@ -818,60 +838,20 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs,
unsigned long *tos = stack_addr(regs);
unsigned long copy_ip = (unsigned long)p->ainsn.insn;
unsigned long orig_ip = (unsigned long)p->addr;
- kprobe_opcode_t *insn = p->ainsn.insn;
-
- /* Skip prefixes */
- insn = skip_prefixes(insn);
regs->flags &= ~X86_EFLAGS_TF;
- switch (*insn) {
- case 0x9c: /* pushfl */
+
+ /* Fixup the contents of top of stack */
+ if (p->ainsn.is_pushf) {
*tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF);
*tos |= kcb->kprobe_old_flags;
- break;
- case 0xc2: /* iret/ret/lret */
- case 0xc3:
- case 0xca:
- case 0xcb:
- case 0xcf:
- case 0xea: /* jmp absolute -- ip is correct */
- /* ip is already adjusted, no more changes required */
- p->ainsn.boostable = true;
- goto no_change;
- case 0xe8: /* call relative - Fix return addr */
+ } else if (p->ainsn.is_call) {
*tos = orig_ip + (*tos - copy_ip);
- break;
-#ifdef CONFIG_X86_32
- case 0x9a: /* call absolute -- same as call absolute, indirect */
- *tos = orig_ip + (*tos - copy_ip);
- goto no_change;
-#endif
- case 0xff:
- if ((insn[1] & 0x30) == 0x10) {
- /*
- * call absolute, indirect
- * Fix return addr; ip is correct.
- * But this is not boostable
- */
- *tos = orig_ip + (*tos - copy_ip);
- goto no_change;
- } else if (((insn[1] & 0x31) == 0x20) ||
- ((insn[1] & 0x31) == 0x21)) {
- /*
- * jmp near and far, absolute indirect
- * ip is correct. And this is boostable
- */
- p->ainsn.boostable = true;
- goto no_change;
- }
- break;
- default:
- break;
}
- regs->ip += orig_ip - copy_ip;
+ if (!p->ainsn.is_abs_ip)
+ regs->ip += orig_ip - copy_ip;
-no_change:
restore_btf();
}
NOKPROBE_SYMBOL(resume_execution);
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index b8aee71840ae..aa15132228da 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -398,9 +398,15 @@ static void free_ldt_pgtables(struct mm_struct *mm)
if (!boot_cpu_has(X86_FEATURE_PTI))
return;
- tlb_gather_mmu(&tlb, mm, start, end);
+ /*
+ * Although free_pgd_range() is intended for freeing user
+ * page-tables, it also works out for kernel mappings on x86.
+ * We use tlb_gather_mmu_fullmm() to avoid confusing the
+ * range-tracking logic in __tlb_adjust_range().
+ */
+ tlb_gather_mmu_fullmm(&tlb, mm);
free_pgd_range(&tlb, start, end, start, end);
- tlb_finish_mmu(&tlb, start, end);
+ tlb_finish_mmu(&tlb);
#endif
}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 34b153cbd4ac..5e9a34b5bd74 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -114,6 +114,7 @@ int apply_relocate(Elf32_Shdr *sechdrs,
*location += sym->st_value;
break;
case R_386_PC32:
+ case R_386_PLT32:
/* Add the value, subtract its position */
*location += sym->st_value - (uint32_t)location;
break;
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 8a67d1fa8dc5..ed8ac6bcbafb 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -182,6 +182,13 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
err = security_locked_down(LOCKDOWN_MSR);
if (err)
break;
+
+ err = filter_write(regs[1]);
+ if (err)
+ return err;
+
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
err = wrmsr_safe_regs_on_cpu(cpu, regs);
if (err)
break;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 6c3407ba6ee9..c60222ab8ab9 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -135,8 +135,7 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff,
else if (opfunc == _paravirt_ident_64)
ret = paravirt_patch_ident_64(insn_buff, len);
- else if (type == PARAVIRT_PATCH(cpu.iret) ||
- type == PARAVIRT_PATCH(cpu.usergs_sysret64))
+ else if (type == PARAVIRT_PATCH(cpu.iret))
/* If operation requires a jmp, then jmp */
ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len);
#endif
@@ -170,7 +169,6 @@ static u64 native_steal_clock(int cpu)
/* These are in entry.S */
extern void native_iret(void);
-extern void native_usergs_sysret64(void);
static struct resource reserve_ioports = {
.start = 0,
@@ -310,9 +308,7 @@ struct paravirt_patch_template pv_ops = {
.cpu.load_sp0 = native_load_sp0,
- .cpu.usergs_sysret64 = native_usergs_sysret64,
.cpu.iret = native_iret,
- .cpu.swapgs = native_swapgs,
#ifdef CONFIG_X86_IOPL_IOPERM
.cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap,
@@ -324,7 +320,6 @@ struct paravirt_patch_template pv_ops = {
/* Irq ops. */
.irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
- .irq.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
.irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
.irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
.irq.safe_halt = native_safe_halt,
diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c
index ace6e334cb39..abd27ec67397 100644
--- a/arch/x86/kernel/paravirt_patch.c
+++ b/arch/x86/kernel/paravirt_patch.c
@@ -25,10 +25,7 @@ struct patch_xxl {
const unsigned char mmu_read_cr2[3];
const unsigned char mmu_read_cr3[3];
const unsigned char mmu_write_cr3[3];
- const unsigned char irq_restore_fl[2];
const unsigned char cpu_wbinvd[2];
- const unsigned char cpu_usergs_sysret64[6];
- const unsigned char cpu_swapgs[3];
const unsigned char mov64[3];
};
@@ -39,11 +36,7 @@ static const struct patch_xxl patch_data_xxl = {
.mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax
.mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax
.mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3
- .irq_restore_fl = { 0x57, 0x9d }, // push %rdi; popfq
.cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd
- .cpu_usergs_sysret64 = { 0x0f, 0x01, 0xf8,
- 0x48, 0x0f, 0x07 }, // swapgs; sysretq
- .cpu_swapgs = { 0x0f, 0x01, 0xf8 }, // swapgs
.mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax
};
@@ -76,7 +69,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
switch (type) {
#ifdef CONFIG_PARAVIRT_XXL
- PATCH_CASE(irq, restore_fl, xxl, insn_buff, len);
PATCH_CASE(irq, save_fl, xxl, insn_buff, len);
PATCH_CASE(irq, irq_enable, xxl, insn_buff, len);
PATCH_CASE(irq, irq_disable, xxl, insn_buff, len);
@@ -85,8 +77,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len);
PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len);
- PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len);
- PATCH_CASE(cpu, swapgs, xxl, insn_buff, len);
PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len);
#endif
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
index 2e9006c1e240..42e92ec62973 100644
--- a/arch/x86/kernel/pci-iommu_table.c
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -4,9 +4,6 @@
#include <linux/string.h>
#include <linux/kallsyms.h>
-
-#define DEBUG 1
-
static struct iommu_table_entry * __init
find_dependents_of(struct iommu_table_entry *start,
struct iommu_table_entry *finish,
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index bedca011459c..87a4143aa7d7 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -704,6 +704,9 @@ void ptrace_disable(struct task_struct *child)
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
static const struct user_regset_view user_x86_32_view; /* Initialized below. */
#endif
+#ifdef CONFIG_X86_64
+static const struct user_regset_view user_x86_64_view; /* Initialized below. */
+#endif
long arch_ptrace(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
@@ -711,6 +714,14 @@ long arch_ptrace(struct task_struct *child, long request,
int ret;
unsigned long __user *datap = (unsigned long __user *)data;
+#ifdef CONFIG_X86_64
+ /* This is native 64-bit ptrace() */
+ const struct user_regset_view *regset_view = &user_x86_64_view;
+#else
+ /* This is native 32-bit ptrace() */
+ const struct user_regset_view *regset_view = &user_x86_32_view;
+#endif
+
switch (request) {
/* read the word at location addr in the USER area. */
case PTRACE_PEEKUSR: {
@@ -749,28 +760,28 @@ long arch_ptrace(struct task_struct *child, long request,
case PTRACE_GETREGS: /* Get all gp regs from the child. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_SETREGS: /* Set all gp regs in the child. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_GETFPREGS: /* Get the child FPU state. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
case PTRACE_SETFPREGS: /* Set the child FPU state. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
@@ -1152,28 +1163,28 @@ static long x32_arch_ptrace(struct task_struct *child,
case PTRACE_GETREGS: /* Get all gp regs from the child. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ &user_x86_64_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_SETREGS: /* Set all gp regs in the child. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ &user_x86_64_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_GETFPREGS: /* Get the child FPU state. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ &user_x86_64_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
case PTRACE_SETFPREGS: /* Set the child FPU state. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ &user_x86_64_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
@@ -1309,6 +1320,25 @@ void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask)
xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
}
+/*
+ * This is used by the core dump code to decide which regset to dump. The
+ * core dump code writes out the resulting .e_machine and the corresponding
+ * regsets. This is suboptimal if the task is messing around with its CS.L
+ * field, but at worst the core dump will end up missing some information.
+ *
+ * Unfortunately, it is also used by the broken PTRACE_GETREGSET and
+ * PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have
+ * no way to make sure that the e_machine they use matches the caller's
+ * expectations. The result is that the data format returned by
+ * PTRACE_GETREGSET depends on the returned CS field (and even the offset
+ * of the returned CS field depends on its value!) and the data format
+ * accepted by PTRACE_SETREGSET is determined by the old CS value. The
+ * upshot is that it is basically impossible to use these APIs correctly.
+ *
+ * The best way to fix it in the long run would probably be to add new
+ * improved ptrace() APIs to read and write registers reliably, possibly by
+ * allowing userspace to select the ELF e_machine variant that they expect.
+ */
const struct user_regset_view *task_user_regset_view(struct task_struct *task)
{
#ifdef CONFIG_IA32_EMULATION
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index db115943e8bd..9991c5920aac 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -477,6 +477,15 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = {
},
},
+ { /* PCIe Wifi card isn't detected after reboot otherwise */
+ .callback = set_pci_reboot,
+ .ident = "Zotac ZBOX CI327 nano",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "NA"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ZBOX-CI327NANO-GS-01"),
+ },
+ },
+
/* Sony */
{ /* Handle problems with rebooting on Sony VGN-Z540N */
.callback = set_bios_reboot,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 117e24fbfd8a..02813a7f3a7c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1833,6 +1833,7 @@ void arch_set_max_freq_ratio(bool turbo_disabled)
arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
arch_turbo_freq_ratio;
}
+EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
static bool turbo_disabled(void)
{
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index ca9a380d9c0b..9442c4136c38 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -11,14 +11,26 @@ enum insn_type {
RET = 3, /* tramp / site cond-tail-call */
};
+/*
+ * data16 data16 xorq %rax, %rax - a single 5 byte instruction that clears %rax
+ * The REX.W cancels the effect of any data16.
+ */
+static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 };
+
static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
{
+ const void *emulate = NULL;
int size = CALL_INSN_SIZE;
const void *code;
switch (type) {
case CALL:
code = text_gen_insn(CALL_INSN_OPCODE, insn, func);
+ if (func == &__static_call_return0) {
+ emulate = code;
+ code = &xor5rax;
+ }
+
break;
case NOP:
@@ -41,7 +53,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void
if (unlikely(system_state == SYSTEM_BOOTING))
return text_poke_early(insn, code, size);
- text_poke_bp(insn, code, size, NULL);
+ text_poke_bp(insn, code, size, emulate);
}
static void __static_call_validate(void *insn, bool tail)
@@ -54,7 +66,8 @@ static void __static_call_validate(void *insn, bool tail)
return;
} else {
if (opcode == CALL_INSN_OPCODE ||
- !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5))
+ !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5) ||
+ !memcmp(insn, xor5rax, 5))
return;
}
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 504fa5425bce..660b78827638 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -90,14 +90,10 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, off)
{
- long error;
- error = -EINVAL;
if (off & ~PAGE_MASK)
- goto out;
+ return -EINVAL;
- error = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
-out:
- return error;
+ return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
}
static void find_start_end(unsigned long addr, unsigned long flags,
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 764573de3996..e5a7a10a0164 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -134,7 +134,11 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
unsafe_put_user(regs->ds, &user->regs.ds, Efault_end);
unsafe_put_user(regs->fs, &user->regs.fs, Efault_end);
unsafe_put_user(regs->gs, &user->regs.gs, Efault_end);
- unsafe_put_user(vm86->screen_bitmap, &user->screen_bitmap, Efault_end);
+
+ /*
+ * Don't write screen_bitmap in case some user had a value there
+ * and expected it to remain unchanged.
+ */
user_access_end();
@@ -160,49 +164,6 @@ Efault:
do_exit(SIGSEGV);
}
-static void mark_screen_rdonly(struct mm_struct *mm)
-{
- struct vm_area_struct *vma;
- spinlock_t *ptl;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- int i;
-
- mmap_write_lock(mm);
- pgd = pgd_offset(mm, 0xA0000);
- if (pgd_none_or_clear_bad(pgd))
- goto out;
- p4d = p4d_offset(pgd, 0xA0000);
- if (p4d_none_or_clear_bad(p4d))
- goto out;
- pud = pud_offset(p4d, 0xA0000);
- if (pud_none_or_clear_bad(pud))
- goto out;
- pmd = pmd_offset(pud, 0xA0000);
-
- if (pmd_trans_huge(*pmd)) {
- vma = find_vma(mm, 0xA0000);
- split_huge_pmd(vma, pmd, 0xA0000);
- }
- if (pmd_none_or_clear_bad(pmd))
- goto out;
- pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
- for (i = 0; i < 32; i++) {
- if (pte_present(*pte))
- set_pte(pte, pte_wrprotect(*pte));
- pte++;
- }
- pte_unmap_unlock(pte, ptl);
-out:
- mmap_write_unlock(mm);
- flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
-}
-
-
-
static int do_vm86_irq_handling(int subfunction, int irqnumber);
static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
@@ -282,6 +243,15 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
offsetof(struct vm86_struct, int_revectored)))
return -EFAULT;
+
+ /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */
+ if (v.flags & VM86_SCREEN_BITMAP) {
+ char comm[TASK_COMM_LEN];
+
+ pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", get_task_comm(comm, current));
+ return -EINVAL;
+ }
+
memset(&vm86regs, 0, sizeof(vm86regs));
vm86regs.pt.bx = v.regs.ebx;
@@ -302,7 +272,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
vm86regs.gs = v.regs.gs;
vm86->flags = v.flags;
- vm86->screen_bitmap = v.screen_bitmap;
vm86->cpu_type = v.cpu_type;
if (copy_from_user(&vm86->int_revectored,
@@ -370,9 +339,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
update_task_stack(tsk);
preempt_enable();
- if (vm86->flags & VM86_SCREEN_BITMAP)
- mark_screen_rdonly(tsk->mm);
-
memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
return regs->ax;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1b404e4d7dd8..b967c1c774a1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1782,6 +1782,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
{
+ xfer_to_guest_mode_prepare();
return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
xfer_to_guest_mode_work_pending();
}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f1f1b5a0956a..525197381baa 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -16,7 +16,7 @@
#include <linux/prefetch.h> /* prefetchw */
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
-#include <linux/efi.h> /* efi_recover_from_page_fault()*/
+#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <linux/mm_types.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
@@ -25,7 +25,7 @@
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
#include <asm/mmu_context.h> /* vma_pkey() */
-#include <asm/efi.h> /* efi_recover_from_page_fault()*/
+#include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <asm/desc.h> /* store_idt(), ... */
#include <asm/cpu_entry_area.h> /* exception stack */
#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */
@@ -54,7 +54,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
* 32-bit mode:
*
* Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
- * Check that here and ignore it.
+ * Check that here and ignore it. This is AMD erratum #91.
*
* 64-bit mode:
*
@@ -83,11 +83,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
#ifdef CONFIG_X86_64
case 0x40:
/*
- * In AMD64 long mode 0x40..0x4F are valid REX prefixes
- * Need to figure out under what instruction mode the
- * instruction was issued. Could check the LDT for lm,
- * but for now it's good enough to assume that long
- * mode only uses well known segments or kernel.
+ * In 64-bit mode 0x40..0x4F are valid REX prefixes
*/
return (!user_mode(regs) || user_64bit_mode(regs));
#endif
@@ -110,6 +106,15 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
}
}
+static bool is_amd_k8_pre_npt(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) &&
+ c->x86_vendor == X86_VENDOR_AMD &&
+ c->x86 == 0xf && c->x86_model < 0x40);
+}
+
static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
@@ -117,6 +122,10 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
unsigned char *instr;
int prefetch = 0;
+ /* Erratum #91 affects AMD K8, pre-NPT CPUs */
+ if (!is_amd_k8_pre_npt())
+ return 0;
+
/*
* If it was a exec (instruction fetch) fault on NX page, then
* do not ignore the fault:
@@ -127,20 +136,31 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
instr = (void *)convert_ip_to_linear(current, regs);
max_instr = instr + 15;
- if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
- return 0;
+ /*
+ * This code has historically always bailed out if IP points to a
+ * not-present page (e.g. due to a race). No one has ever
+ * complained about this.
+ */
+ pagefault_disable();
while (instr < max_instr) {
unsigned char opcode;
- if (get_kernel_nofault(opcode, instr))
- break;
+ if (user_mode(regs)) {
+ if (get_user(opcode, instr))
+ break;
+ } else {
+ if (get_kernel_nofault(opcode, instr))
+ break;
+ }
instr++;
if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
break;
}
+
+ pagefault_enable();
return prefetch;
}
@@ -262,25 +282,6 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
}
}
-/*
- * Did it hit the DOS screen memory VA from vm86 mode?
- */
-static inline void
-check_v8086_mode(struct pt_regs *regs, unsigned long address,
- struct task_struct *tsk)
-{
-#ifdef CONFIG_VM86
- unsigned long bit;
-
- if (!v8086_mode(regs) || !tsk->thread.vm86)
- return;
-
- bit = (address - 0xA0000) >> PAGE_SHIFT;
- if (bit < 32)
- tsk->thread.vm86->screen_bitmap |= 1 << bit;
-#endif
-}
-
static bool low_pfn(unsigned long pfn)
{
return pfn < max_low_pfn;
@@ -335,15 +336,6 @@ KERN_ERR
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif
-/*
- * No vm86 mode in 64-bit mode:
- */
-static inline void
-check_v8086_mode(struct pt_regs *regs, unsigned long address,
- struct task_struct *tsk)
-{
-}
-
static int bad_address(void *p)
{
unsigned long dummy;
@@ -427,6 +419,9 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
|| boot_cpu_data.x86 != 0xf)
return 0;
+ if (user_mode(regs))
+ return 0;
+
if (address != regs->ip)
return 0;
@@ -462,10 +457,12 @@ static int is_errata100(struct pt_regs *regs, unsigned long address)
}
/* Pentium F0 0F C7 C8 bug workaround: */
-static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
+static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
- if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) {
+ if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) &&
+ idt_is_f00f_address(address)) {
handle_invalid_op(regs);
return 1;
}
@@ -630,53 +627,20 @@ static void set_signal_archinfo(unsigned long address,
}
static noinline void
-no_context(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, int signal, int si_code)
+page_fault_oops(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
- struct task_struct *tsk = current;
unsigned long flags;
int sig;
if (user_mode(regs)) {
/*
- * This is an implicit supervisor-mode access from user
- * mode. Bypass all the kernel-mode recovery code and just
- * OOPS.
+ * Implicit kernel access from user mode? Skip the stack
+ * overflow and EFI special cases.
*/
goto oops;
}
- /* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
- /*
- * Any interrupt that takes a fault gets the fixup. This makes
- * the below recursive fault logic only apply to a faults from
- * task context.
- */
- if (in_interrupt())
- return;
-
- /*
- * Per the above we're !in_interrupt(), aka. task context.
- *
- * In this case we need to make sure we're not recursively
- * faulting through the emulate_vsyscall() logic.
- */
- if (current->thread.sig_on_uaccess_err && signal) {
- sanitize_error_code(address, &error_code);
-
- set_signal_archinfo(address, error_code);
-
- /* XXX: hwpoison faults will set the wrong code. */
- force_sig_fault(signal, si_code, (void __user *)address);
- }
-
- /*
- * Barring that, we can do the fixup and be happy.
- */
- return;
- }
-
#ifdef CONFIG_VMAP_STACK
/*
* Stack overflow? During boot, we can fault near the initial
@@ -684,8 +648,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
* that we're in vmalloc space to avoid this.
*/
if (is_vmalloc_addr((void *)address) &&
- (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
- address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
+ (((unsigned long)current->stack - 1 - address < PAGE_SIZE) ||
+ address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) {
unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
/*
* We're likely to be running with very little stack space
@@ -709,28 +673,12 @@ no_context(struct pt_regs *regs, unsigned long error_code,
#endif
/*
- * 32-bit:
- *
- * Valid to do another page fault here, because if this fault
- * had been triggered by is_prefetch fixup_exception would have
- * handled it.
- *
- * 64-bit:
- *
- * Hall of shame of CPU/BIOS bugs.
- */
- if (is_prefetch(regs, error_code, address))
- return;
-
- if (is_errata93(regs, address))
- return;
-
- /*
- * Buggy firmware could access regions which might page fault, try to
- * recover from such faults.
+ * Buggy firmware could access regions which might page fault. If
+ * this happens, EFI has a special OOPS path that will try to
+ * avoid hanging the system.
*/
if (IS_ENABLED(CONFIG_EFI))
- efi_recover_from_page_fault(address);
+ efi_crash_gracefully_on_page_fault(address);
oops:
/*
@@ -741,7 +689,7 @@ oops:
show_fault_oops(regs, error_code, address);
- if (task_stack_end_corrupted(tsk))
+ if (task_stack_end_corrupted(current))
printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
sig = SIGKILL;
@@ -754,6 +702,53 @@ oops:
oops_end(flags, regs, sig);
}
+static noinline void
+kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, int signal, int si_code)
+{
+ WARN_ON_ONCE(user_mode(regs));
+
+ /* Are we prepared to handle this kernel fault? */
+ if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
+ /*
+ * Any interrupt that takes a fault gets the fixup. This makes
+ * the below recursive fault logic only apply to a faults from
+ * task context.
+ */
+ if (in_interrupt())
+ return;
+
+ /*
+ * Per the above we're !in_interrupt(), aka. task context.
+ *
+ * In this case we need to make sure we're not recursively
+ * faulting through the emulate_vsyscall() logic.
+ */
+ if (current->thread.sig_on_uaccess_err && signal) {
+ sanitize_error_code(address, &error_code);
+
+ set_signal_archinfo(address, error_code);
+
+ /* XXX: hwpoison faults will set the wrong code. */
+ force_sig_fault(signal, si_code, (void __user *)address);
+ }
+
+ /*
+ * Barring that, we can do the fixup and be happy.
+ */
+ return;
+ }
+
+ /*
+ * AMD erratum #91 manifests as a spurious page fault on a PREFETCH
+ * instruction.
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ page_fault_oops(regs, error_code, address);
+}
+
/*
* Print out info about fatal segfaults, if the show_unhandled_signals
* sysctl is set:
@@ -796,47 +791,49 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
{
struct task_struct *tsk = current;
- /* User mode accesses just cause a SIGSEGV */
- if (user_mode(regs) && (error_code & X86_PF_USER)) {
- /*
- * It's possible to have interrupts off here:
- */
- local_irq_enable();
-
- /*
- * Valid to do another page fault here because this one came
- * from user space:
- */
- if (is_prefetch(regs, error_code, address))
- return;
+ if (!user_mode(regs)) {
+ kernelmode_fixup_or_oops(regs, error_code, address, pkey, si_code);
+ return;
+ }
- if (is_errata100(regs, address))
- return;
+ if (!(error_code & X86_PF_USER)) {
+ /* Implicit user access to kernel memory -- just oops */
+ page_fault_oops(regs, error_code, address);
+ return;
+ }
- sanitize_error_code(address, &error_code);
+ /*
+ * User mode accesses just cause a SIGSEGV.
+ * It's possible to have interrupts off here:
+ */
+ local_irq_enable();
- if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
- return;
+ /*
+ * Valid to do another page fault here because this one came
+ * from user space:
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
- if (likely(show_unhandled_signals))
- show_signal_msg(regs, error_code, address, tsk);
+ if (is_errata100(regs, address))
+ return;
- set_signal_archinfo(address, error_code);
+ sanitize_error_code(address, &error_code);
- if (si_code == SEGV_PKUERR)
- force_sig_pkuerr((void __user *)address, pkey);
+ if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
+ return;
- force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+ if (likely(show_unhandled_signals))
+ show_signal_msg(regs, error_code, address, tsk);
- local_irq_disable();
+ set_signal_archinfo(address, error_code);
- return;
- }
+ if (si_code == SEGV_PKUERR)
+ force_sig_pkuerr((void __user *)address, pkey);
- if (is_f00f_bug(regs, address))
- return;
+ force_sig_fault(SIGSEGV, si_code, (void __user *)address);
- no_context(regs, error_code, address, SIGSEGV, si_code);
+ local_irq_disable();
}
static noinline void
@@ -926,8 +923,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
vm_fault_t fault)
{
/* Kernel mode? Handle exceptions or die: */
- if (!(error_code & X86_PF_USER)) {
- no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+ if (!user_mode(regs)) {
+ kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
}
@@ -961,40 +958,6 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
}
-static noinline void
-mm_fault_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, vm_fault_t fault)
-{
- if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
- no_context(regs, error_code, address, 0, 0);
- return;
- }
-
- if (fault & VM_FAULT_OOM) {
- /* Kernel mode? Handle exceptions or die: */
- if (!(error_code & X86_PF_USER)) {
- no_context(regs, error_code, address,
- SIGSEGV, SEGV_MAPERR);
- return;
- }
-
- /*
- * We ran out of memory, call the OOM killer, and return the
- * userspace (which will retry the fault, or kill us if we got
- * oom-killed):
- */
- pagefault_out_of_memory();
- } else {
- if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
- VM_FAULT_HWPOISON_LARGE))
- do_sigbus(regs, error_code, address, fault);
- else if (fault & VM_FAULT_SIGSEGV)
- bad_area_nosemaphore(regs, error_code, address);
- else
- BUG();
- }
-}
-
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
@@ -1209,6 +1172,9 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
}
#endif
+ if (is_f00f_bug(regs, hw_error_code, address))
+ return;
+
/* Was the fault spurious, caused by lazy TLB invalidation? */
if (spurious_kernel_fault(hw_error_code, address))
return;
@@ -1229,10 +1195,17 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
}
NOKPROBE_SYMBOL(do_kern_addr_fault);
-/* Handle faults in the user portion of the address space */
+/*
+ * Handle faults in the user portion of the address space. Nothing in here
+ * should check X86_PF_USER without a specific justification: for almost
+ * all purposes, we should treat a normal kernel access to user memory
+ * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction.
+ * The one exception is AC flag handling, which is, per the x86
+ * architecture, special for WRUSS.
+ */
static inline
void do_user_addr_fault(struct pt_regs *regs,
- unsigned long hw_error_code,
+ unsigned long error_code,
unsigned long address)
{
struct vm_area_struct *vma;
@@ -1244,6 +1217,21 @@ void do_user_addr_fault(struct pt_regs *regs,
tsk = current;
mm = tsk->mm;
+ if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) {
+ /*
+ * Whoops, this is kernel mode code trying to execute from
+ * user memory. Unless this is AMD erratum #93, which
+ * corrupts RIP such that it looks like a user address,
+ * this is unrecoverable. Don't even try to look up the
+ * VMA or look for extable entries.
+ */
+ if (is_errata93(regs, address))
+ return;
+
+ page_fault_oops(regs, error_code, address);
+ return;
+ }
+
/* kprobes don't want to hook the spurious faults: */
if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
return;
@@ -1252,8 +1240,8 @@ void do_user_addr_fault(struct pt_regs *regs,
* Reserved bits are never expected to be set on
* entries in the user portion of the page tables.
*/
- if (unlikely(hw_error_code & X86_PF_RSVD))
- pgtable_bad(regs, hw_error_code, address);
+ if (unlikely(error_code & X86_PF_RSVD))
+ pgtable_bad(regs, error_code, address);
/*
* If SMAP is on, check for invalid kernel (supervisor) access to user
@@ -1263,10 +1251,13 @@ void do_user_addr_fault(struct pt_regs *regs,
* enforcement appears to be consistent with the USER bit.
*/
if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
- !(hw_error_code & X86_PF_USER) &&
- !(regs->flags & X86_EFLAGS_AC)))
- {
- bad_area_nosemaphore(regs, hw_error_code, address);
+ !(error_code & X86_PF_USER) &&
+ !(regs->flags & X86_EFLAGS_AC))) {
+ /*
+ * No extable entry here. This was a kernel access to an
+ * invalid pointer. get_kernel_nofault() will not get here.
+ */
+ page_fault_oops(regs, error_code, address);
return;
}
@@ -1275,7 +1266,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) {
- bad_area_nosemaphore(regs, hw_error_code, address);
+ bad_area_nosemaphore(regs, error_code, address);
return;
}
@@ -1296,9 +1287,9 @@ void do_user_addr_fault(struct pt_regs *regs,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- if (hw_error_code & X86_PF_WRITE)
+ if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
- if (hw_error_code & X86_PF_INSTR)
+ if (error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
#ifdef CONFIG_X86_64
@@ -1314,7 +1305,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* to consider the PF_PK bit.
*/
if (is_vsyscall_vaddr(address)) {
- if (emulate_vsyscall(hw_error_code, regs, address))
+ if (emulate_vsyscall(error_code, regs, address))
return;
}
#endif
@@ -1337,7 +1328,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* Fault from code in kernel from
* which we do not expect faults.
*/
- bad_area_nosemaphore(regs, hw_error_code, address);
+ bad_area_nosemaphore(regs, error_code, address);
return;
}
retry:
@@ -1353,17 +1344,17 @@ retry:
vma = find_vma(mm, address);
if (unlikely(!vma)) {
- bad_area(regs, hw_error_code, address);
+ bad_area(regs, error_code, address);
return;
}
if (likely(vma->vm_start <= address))
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
- bad_area(regs, hw_error_code, address);
+ bad_area(regs, error_code, address);
return;
}
if (unlikely(expand_stack(vma, address))) {
- bad_area(regs, hw_error_code, address);
+ bad_area(regs, error_code, address);
return;
}
@@ -1372,8 +1363,8 @@ retry:
* we can handle it..
*/
good_area:
- if (unlikely(access_error(hw_error_code, vma))) {
- bad_area_access_error(regs, hw_error_code, address, vma);
+ if (unlikely(access_error(error_code, vma))) {
+ bad_area_access_error(regs, error_code, address, vma);
return;
}
@@ -1392,11 +1383,14 @@ good_area:
*/
fault = handle_mm_fault(vma, address, flags, regs);
- /* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
+ /*
+ * Quick path to respond to signals. The core mm code
+ * has unlocked the mm for us if we get here.
+ */
if (!user_mode(regs))
- no_context(regs, hw_error_code, address, SIGBUS,
- BUS_ADRERR);
+ kernelmode_fixup_or_oops(regs, error_code, address,
+ SIGBUS, BUS_ADRERR);
return;
}
@@ -1412,12 +1406,37 @@ good_area:
}
mmap_read_unlock(mm);
- if (unlikely(fault & VM_FAULT_ERROR)) {
- mm_fault_error(regs, hw_error_code, address, fault);
+ if (likely(!(fault & VM_FAULT_ERROR)))
+ return;
+
+ if (fatal_signal_pending(current) && !user_mode(regs)) {
+ kernelmode_fixup_or_oops(regs, error_code, address, 0, 0);
return;
}
- check_v8086_mode(regs, address, tsk);
+ if (fault & VM_FAULT_OOM) {
+ /* Kernel mode? Handle exceptions or die: */
+ if (!user_mode(regs)) {
+ kernelmode_fixup_or_oops(regs, error_code, address,
+ SIGSEGV, SEGV_MAPERR);
+ return;
+ }
+
+ /*
+ * We ran out of memory, call the OOM killer, and return the
+ * userspace (which will retry the fault, or kill us if we got
+ * oom-killed):
+ */
+ pagefault_out_of_memory();
+ } else {
+ if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+ VM_FAULT_HWPOISON_LARGE))
+ do_sigbus(regs, error_code, address, fault);
+ else if (fault & VM_FAULT_SIGSEGV)
+ bad_area_nosemaphore(regs, error_code, address);
+ else
+ BUG();
+ }
}
NOKPROBE_SYMBOL(do_user_addr_fault);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index e26f5c5c6565..dd694fb93916 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -157,16 +157,25 @@ __ref void *alloc_low_pages(unsigned int num)
}
/*
- * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS.
- * With KASLR memory randomization, depending on the machine e820 memory
- * and the PUD alignment. We may need twice more pages when KASLR memory
+ * By default need to be able to allocate page tables below PGD firstly for
+ * the 0-ISA_END_ADDRESS range and secondly for the initial PMD_SIZE mapping.
+ * With KASLR memory randomization, depending on the machine e820 memory and the
+ * PUD alignment, twice that many pages may be needed when KASLR memory
* randomization is enabled.
*/
+
+#ifndef CONFIG_X86_5LEVEL
+#define INIT_PGD_PAGE_TABLES 3
+#else
+#define INIT_PGD_PAGE_TABLES 4
+#endif
+
#ifndef CONFIG_RANDOMIZE_MEMORY
-#define INIT_PGD_PAGE_COUNT 6
+#define INIT_PGD_PAGE_COUNT (2 * INIT_PGD_PAGE_TABLES)
#else
-#define INIT_PGD_PAGE_COUNT 12
+#define INIT_PGD_PAGE_COUNT (4 * INIT_PGD_PAGE_TABLES)
#endif
+
#define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE)
RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
void __init early_alloc_pgt_buf(void)
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index c3d5f0236f35..4b01f7dbaf30 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -475,9 +475,10 @@ void __init mem_encrypt_init(void)
swiotlb_update_mem_attributes();
/*
- * With SEV, we need to unroll the rep string I/O instructions.
+ * With SEV, we need to unroll the rep string I/O instructions,
+ * but SEV-ES supports them through the #VC handler.
*/
- if (sev_active())
+ if (sev_active() && !sev_es_active())
static_branch_enable(&sev_enable_key);
print_mem_encrypt_feature_info();
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index bd7aff5c51f7..cd768dafca9e 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -10,8 +10,6 @@
#define pr_fmt(fmt) "mmiotrace: " fmt
-#define DEBUG 1
-
#include <linux/moduleparam.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 796506dcfc42..79e7a0ec1da5 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -205,6 +205,18 @@ static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3);
}
+/* Some 1-byte opcodes for binary ALU operations */
+static u8 simple_alu_opcodes[] = {
+ [BPF_ADD] = 0x01,
+ [BPF_SUB] = 0x29,
+ [BPF_AND] = 0x21,
+ [BPF_OR] = 0x09,
+ [BPF_XOR] = 0x31,
+ [BPF_LSH] = 0xE0,
+ [BPF_RSH] = 0xE8,
+ [BPF_ARSH] = 0xF8,
+};
+
static void jit_fill_hole(void *area, unsigned int size)
{
/* Fill whole space with INT3 instructions */
@@ -681,6 +693,42 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
*pprog = prog;
}
+/* Emit the suffix (ModR/M etc) for addressing *(ptr_reg + off) and val_reg */
+static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ if (is_imm8(off)) {
+ /* 1-byte signed displacement.
+ *
+ * If off == 0 we could skip this and save one extra byte, but
+ * special case of x86 R13 which always needs an offset is not
+ * worth the hassle
+ */
+ EMIT2(add_2reg(0x40, ptr_reg, val_reg), off);
+ } else {
+ /* 4-byte signed displacement */
+ EMIT1_off32(add_2reg(0x80, ptr_reg, val_reg), off);
+ }
+ *pprog = prog;
+}
+
+/*
+ * Emit a REX byte if it will be necessary to address these registers
+ */
+static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ if (is64)
+ EMIT1(add_2mod(0x48, dst_reg, src_reg));
+ else if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ *pprog = prog;
+}
+
/* LDX: dst_reg = *(u8*)(src_reg + off) */
static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
@@ -708,15 +756,7 @@ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
break;
}
- /*
- * If insn->off == 0 we can save one extra byte, but
- * special case of x86 R13 which always needs an offset
- * is not worth the hassle
- */
- if (is_imm8(off))
- EMIT2(add_2reg(0x40, src_reg, dst_reg), off);
- else
- EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), off);
+ emit_insn_suffix(&prog, src_reg, dst_reg, off);
*pprog = prog;
}
@@ -751,11 +791,51 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89);
break;
}
- if (is_imm8(off))
- EMIT2(add_2reg(0x40, dst_reg, src_reg), off);
- else
- EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), off);
+ emit_insn_suffix(&prog, dst_reg, src_reg, off);
+ *pprog = prog;
+}
+
+static int emit_atomic(u8 **pprog, u8 atomic_op,
+ u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ EMIT1(0xF0); /* lock prefix */
+
+ maybe_emit_mod(&prog, dst_reg, src_reg, bpf_size == BPF_DW);
+
+ /* emit opcode */
+ switch (atomic_op) {
+ case BPF_ADD:
+ case BPF_SUB:
+ case BPF_AND:
+ case BPF_OR:
+ case BPF_XOR:
+ /* lock *(u32/u64*)(dst_reg + off) <op>= src_reg */
+ EMIT1(simple_alu_opcodes[atomic_op]);
+ break;
+ case BPF_ADD | BPF_FETCH:
+ /* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */
+ EMIT2(0x0F, 0xC1);
+ break;
+ case BPF_XCHG:
+ /* src_reg = atomic_xchg(dst_reg + off, src_reg); */
+ EMIT1(0x87);
+ break;
+ case BPF_CMPXCHG:
+ /* r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg); */
+ EMIT2(0x0F, 0xB1);
+ break;
+ default:
+ pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op);
+ return -EFAULT;
+ }
+
+ emit_insn_suffix(&prog, dst_reg, src_reg, off);
+
*pprog = prog;
+ return 0;
}
static bool ex_handler_bpf(const struct exception_table_entry *x,
@@ -789,8 +869,31 @@ static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt,
}
}
+static int emit_nops(u8 **pprog, int len)
+{
+ u8 *prog = *pprog;
+ int i, noplen, cnt = 0;
+
+ while (len > 0) {
+ noplen = len;
+
+ if (noplen > ASM_NOP_MAX)
+ noplen = ASM_NOP_MAX;
+
+ for (i = 0; i < noplen; i++)
+ EMIT1(ideal_nops[noplen][i]);
+ len -= noplen;
+ }
+
+ *pprog = prog;
+
+ return cnt;
+}
+
+#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
+
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
- int oldproglen, struct jit_context *ctx)
+ int oldproglen, struct jit_context *ctx, bool jmp_padding)
{
bool tail_call_reachable = bpf_prog->aux->tail_call_reachable;
struct bpf_insn *insn = bpf_prog->insnsi;
@@ -800,8 +903,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
bool seen_exit = false;
u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
int i, cnt = 0, excnt = 0;
- int proglen = 0;
+ int ilen, proglen = 0;
u8 *prog = temp;
+ int err;
detect_reg_usage(insn, insn_cnt, callee_regs_used,
&tail_call_seen);
@@ -813,17 +917,24 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
bpf_prog_was_classic(bpf_prog), tail_call_reachable,
bpf_prog->aux->func_idx != 0);
push_callee_regs(&prog, callee_regs_used);
- addrs[0] = prog - temp;
+
+ ilen = prog - temp;
+ if (image)
+ memcpy(image + proglen, temp, ilen);
+ proglen += ilen;
+ addrs[0] = proglen;
+ prog = temp;
for (i = 1; i <= insn_cnt; i++, insn++) {
const s32 imm32 = insn->imm;
u32 dst_reg = insn->dst_reg;
u32 src_reg = insn->src_reg;
u8 b2 = 0, b3 = 0;
+ u8 *start_of_ldx;
s64 jmp_offset;
u8 jmp_cond;
- int ilen;
u8 *func;
+ int nops;
switch (insn->code) {
/* ALU */
@@ -837,17 +948,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_ALU64 | BPF_AND | BPF_X:
case BPF_ALU64 | BPF_OR | BPF_X:
case BPF_ALU64 | BPF_XOR | BPF_X:
- switch (BPF_OP(insn->code)) {
- case BPF_ADD: b2 = 0x01; break;
- case BPF_SUB: b2 = 0x29; break;
- case BPF_AND: b2 = 0x21; break;
- case BPF_OR: b2 = 0x09; break;
- case BPF_XOR: b2 = 0x31; break;
- }
- if (BPF_CLASS(insn->code) == BPF_ALU64)
- EMIT1(add_2mod(0x48, dst_reg, src_reg));
- else if (is_ereg(dst_reg) || is_ereg(src_reg))
- EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ maybe_emit_mod(&prog, dst_reg, src_reg,
+ BPF_CLASS(insn->code) == BPF_ALU64);
+ b2 = simple_alu_opcodes[BPF_OP(insn->code)];
EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg));
break;
@@ -1027,12 +1130,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
else if (is_ereg(dst_reg))
EMIT1(add_1mod(0x40, dst_reg));
- switch (BPF_OP(insn->code)) {
- case BPF_LSH: b3 = 0xE0; break;
- case BPF_RSH: b3 = 0xE8; break;
- case BPF_ARSH: b3 = 0xF8; break;
- }
-
+ b3 = simple_alu_opcodes[BPF_OP(insn->code)];
if (imm32 == 1)
EMIT2(0xD1, add_1reg(b3, dst_reg));
else
@@ -1066,11 +1164,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
else if (is_ereg(dst_reg))
EMIT1(add_1mod(0x40, dst_reg));
- switch (BPF_OP(insn->code)) {
- case BPF_LSH: b3 = 0xE0; break;
- case BPF_RSH: b3 = 0xE8; break;
- case BPF_ARSH: b3 = 0xF8; break;
- }
+ b3 = simple_alu_opcodes[BPF_OP(insn->code)];
EMIT2(0xD3, add_1reg(b3, dst_reg));
if (src_reg != BPF_REG_4)
@@ -1185,12 +1279,30 @@ st: if (is_imm8(insn->off))
case BPF_LDX | BPF_PROBE_MEM | BPF_W:
case BPF_LDX | BPF_MEM | BPF_DW:
case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
+ if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
+ /* test src_reg, src_reg */
+ maybe_emit_mod(&prog, src_reg, src_reg, true); /* always 1 byte */
+ EMIT2(0x85, add_2reg(0xC0, src_reg, src_reg));
+ /* jne start_of_ldx */
+ EMIT2(X86_JNE, 0);
+ /* xor dst_reg, dst_reg */
+ emit_mov_imm32(&prog, false, dst_reg, 0);
+ /* jmp byte_after_ldx */
+ EMIT2(0xEB, 0);
+
+ /* populate jmp_offset for JNE above */
+ temp[4] = prog - temp - 5 /* sizeof(test + jne) */;
+ start_of_ldx = prog;
+ }
emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
struct exception_table_entry *ex;
u8 *_insn = image + proglen;
s64 delta;
+ /* populate jmp_offset for JMP above */
+ start_of_ldx[-1] = prog - start_of_ldx;
+
if (!bpf_prog->aux->extable)
break;
@@ -1230,21 +1342,56 @@ st: if (is_imm8(insn->off))
}
break;
- /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */
- case BPF_STX | BPF_XADD | BPF_W:
- /* Emit 'lock add dword ptr [rax + off], eax' */
- if (is_ereg(dst_reg) || is_ereg(src_reg))
- EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01);
- else
- EMIT2(0xF0, 0x01);
- goto xadd;
- case BPF_STX | BPF_XADD | BPF_DW:
- EMIT3(0xF0, add_2mod(0x48, dst_reg, src_reg), 0x01);
-xadd: if (is_imm8(insn->off))
- EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off);
- else
- EMIT1_off32(add_2reg(0x80, dst_reg, src_reg),
- insn->off);
+ case BPF_STX | BPF_ATOMIC | BPF_W:
+ case BPF_STX | BPF_ATOMIC | BPF_DW:
+ if (insn->imm == (BPF_AND | BPF_FETCH) ||
+ insn->imm == (BPF_OR | BPF_FETCH) ||
+ insn->imm == (BPF_XOR | BPF_FETCH)) {
+ u8 *branch_target;
+ bool is64 = BPF_SIZE(insn->code) == BPF_DW;
+
+ /*
+ * Can't be implemented with a single x86 insn.
+ * Need to do a CMPXCHG loop.
+ */
+
+ /* Will need RAX as a CMPXCHG operand so save R0 */
+ emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0);
+ branch_target = prog;
+ /* Load old value */
+ emit_ldx(&prog, BPF_SIZE(insn->code),
+ BPF_REG_0, dst_reg, insn->off);
+ /*
+ * Perform the (commutative) operation locally,
+ * put the result in the AUX_REG.
+ */
+ emit_mov_reg(&prog, is64, AUX_REG, BPF_REG_0);
+ maybe_emit_mod(&prog, AUX_REG, src_reg, is64);
+ EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)],
+ add_2reg(0xC0, AUX_REG, src_reg));
+ /* Attempt to swap in new value */
+ err = emit_atomic(&prog, BPF_CMPXCHG,
+ dst_reg, AUX_REG, insn->off,
+ BPF_SIZE(insn->code));
+ if (WARN_ON(err))
+ return err;
+ /*
+ * ZF tells us whether we won the race. If it's
+ * cleared we need to try again.
+ */
+ EMIT2(X86_JNE, -(prog - branch_target) - 2);
+ /* Return the pre-modification value */
+ emit_mov_reg(&prog, is64, src_reg, BPF_REG_0);
+ /* Restore R0 after clobbering RAX */
+ emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX);
+ break;
+
+ }
+
+ err = emit_atomic(&prog, insn->imm, dst_reg, src_reg,
+ insn->off, BPF_SIZE(insn->code));
+ if (err)
+ return err;
break;
/* call */
@@ -1295,20 +1442,16 @@ xadd: if (is_imm8(insn->off))
case BPF_JMP32 | BPF_JSGE | BPF_X:
case BPF_JMP32 | BPF_JSLE | BPF_X:
/* cmp dst_reg, src_reg */
- if (BPF_CLASS(insn->code) == BPF_JMP)
- EMIT1(add_2mod(0x48, dst_reg, src_reg));
- else if (is_ereg(dst_reg) || is_ereg(src_reg))
- EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ maybe_emit_mod(&prog, dst_reg, src_reg,
+ BPF_CLASS(insn->code) == BPF_JMP);
EMIT2(0x39, add_2reg(0xC0, dst_reg, src_reg));
goto emit_cond_jmp;
case BPF_JMP | BPF_JSET | BPF_X:
case BPF_JMP32 | BPF_JSET | BPF_X:
/* test dst_reg, src_reg */
- if (BPF_CLASS(insn->code) == BPF_JMP)
- EMIT1(add_2mod(0x48, dst_reg, src_reg));
- else if (is_ereg(dst_reg) || is_ereg(src_reg))
- EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ maybe_emit_mod(&prog, dst_reg, src_reg,
+ BPF_CLASS(insn->code) == BPF_JMP);
EMIT2(0x85, add_2reg(0xC0, dst_reg, src_reg));
goto emit_cond_jmp;
@@ -1344,10 +1487,8 @@ xadd: if (is_imm8(insn->off))
case BPF_JMP32 | BPF_JSLE | BPF_K:
/* test dst_reg, dst_reg to save one extra byte */
if (imm32 == 0) {
- if (BPF_CLASS(insn->code) == BPF_JMP)
- EMIT1(add_2mod(0x48, dst_reg, dst_reg));
- else if (is_ereg(dst_reg))
- EMIT1(add_2mod(0x40, dst_reg, dst_reg));
+ maybe_emit_mod(&prog, dst_reg, dst_reg,
+ BPF_CLASS(insn->code) == BPF_JMP);
EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg));
goto emit_cond_jmp;
}
@@ -1409,6 +1550,30 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */
}
jmp_offset = addrs[i + insn->off] - addrs[i];
if (is_imm8(jmp_offset)) {
+ if (jmp_padding) {
+ /* To keep the jmp_offset valid, the extra bytes are
+ * padded before the jump insn, so we substract the
+ * 2 bytes of jmp_cond insn from INSN_SZ_DIFF.
+ *
+ * If the previous pass already emits an imm8
+ * jmp_cond, then this BPF insn won't shrink, so
+ * "nops" is 0.
+ *
+ * On the other hand, if the previous pass emits an
+ * imm32 jmp_cond, the extra 4 bytes(*) is padded to
+ * keep the image from shrinking further.
+ *
+ * (*) imm32 jmp_cond is 6 bytes, and imm8 jmp_cond
+ * is 2 bytes, so the size difference is 4 bytes.
+ */
+ nops = INSN_SZ_DIFF - 2;
+ if (nops != 0 && nops != 4) {
+ pr_err("unexpected jmp_cond padding: %d bytes\n",
+ nops);
+ return -EFAULT;
+ }
+ cnt += emit_nops(&prog, nops);
+ }
EMIT2(jmp_cond, jmp_offset);
} else if (is_simm32(jmp_offset)) {
EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
@@ -1431,11 +1596,55 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */
else
jmp_offset = addrs[i + insn->off] - addrs[i];
- if (!jmp_offset)
- /* Optimize out nop jumps */
+ if (!jmp_offset) {
+ /*
+ * If jmp_padding is enabled, the extra nops will
+ * be inserted. Otherwise, optimize out nop jumps.
+ */
+ if (jmp_padding) {
+ /* There are 3 possible conditions.
+ * (1) This BPF_JA is already optimized out in
+ * the previous run, so there is no need
+ * to pad any extra byte (0 byte).
+ * (2) The previous pass emits an imm8 jmp,
+ * so we pad 2 bytes to match the previous
+ * insn size.
+ * (3) Similarly, the previous pass emits an
+ * imm32 jmp, and 5 bytes is padded.
+ */
+ nops = INSN_SZ_DIFF;
+ if (nops != 0 && nops != 2 && nops != 5) {
+ pr_err("unexpected nop jump padding: %d bytes\n",
+ nops);
+ return -EFAULT;
+ }
+ cnt += emit_nops(&prog, nops);
+ }
break;
+ }
emit_jmp:
if (is_imm8(jmp_offset)) {
+ if (jmp_padding) {
+ /* To avoid breaking jmp_offset, the extra bytes
+ * are padded before the actual jmp insn, so
+ * 2 bytes is substracted from INSN_SZ_DIFF.
+ *
+ * If the previous pass already emits an imm8
+ * jmp, there is nothing to pad (0 byte).
+ *
+ * If it emits an imm32 jmp (5 bytes) previously
+ * and now an imm8 jmp (2 bytes), then we pad
+ * (5 - 2 = 3) bytes to stop the image from
+ * shrinking further.
+ */
+ nops = INSN_SZ_DIFF - 2;
+ if (nops != 0 && nops != 3) {
+ pr_err("unexpected jump padding: %d bytes\n",
+ nops);
+ return -EFAULT;
+ }
+ cnt += emit_nops(&prog, INSN_SZ_DIFF - 2);
+ }
EMIT2(0xEB, jmp_offset);
} else if (is_simm32(jmp_offset)) {
EMIT1_off32(0xE9, jmp_offset);
@@ -1531,17 +1740,25 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
struct bpf_prog *p, int stack_size, bool mod_ret)
{
u8 *prog = *pprog;
+ u8 *jmp_insn;
int cnt = 0;
- if (p->aux->sleepable) {
- if (emit_call(&prog, __bpf_prog_enter_sleepable, prog))
- return -EINVAL;
- } else {
- if (emit_call(&prog, __bpf_prog_enter, prog))
+ /* arg1: mov rdi, progs[i] */
+ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
+ if (emit_call(&prog,
+ p->aux->sleepable ? __bpf_prog_enter_sleepable :
+ __bpf_prog_enter, prog))
return -EINVAL;
- /* remember prog start time returned by __bpf_prog_enter */
- emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
- }
+ /* remember prog start time returned by __bpf_prog_enter */
+ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
+
+ /* if (__bpf_prog_enter*(prog) == 0)
+ * goto skip_exec_of_prog;
+ */
+ EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */
+ /* emit 2 nops that will be replaced with JE insn */
+ jmp_insn = prog;
+ emit_nops(&prog, 2);
/* arg1: lea rdi, [rbp - stack_size] */
EMIT4(0x48, 0x8D, 0x7D, -stack_size);
@@ -1561,43 +1778,23 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
if (mod_ret)
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
- if (p->aux->sleepable) {
- if (emit_call(&prog, __bpf_prog_exit_sleepable, prog))
+ /* replace 2 nops with JE insn, since jmp target is known */
+ jmp_insn[0] = X86_JE;
+ jmp_insn[1] = prog - jmp_insn - 2;
+
+ /* arg1: mov rdi, progs[i] */
+ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
+ /* arg2: mov rsi, rbx <- start time in nsec */
+ emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
+ if (emit_call(&prog,
+ p->aux->sleepable ? __bpf_prog_exit_sleepable :
+ __bpf_prog_exit, prog))
return -EINVAL;
- } else {
- /* arg1: mov rdi, progs[i] */
- emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32,
- (u32) (long) p);
- /* arg2: mov rsi, rbx <- start time in nsec */
- emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
- if (emit_call(&prog, __bpf_prog_exit, prog))
- return -EINVAL;
- }
*pprog = prog;
return 0;
}
-static void emit_nops(u8 **pprog, unsigned int len)
-{
- unsigned int i, noplen;
- u8 *prog = *pprog;
- int cnt = 0;
-
- while (len > 0) {
- noplen = len;
-
- if (noplen > ASM_NOP_MAX)
- noplen = ASM_NOP_MAX;
-
- for (i = 0; i < noplen; i++)
- EMIT1(ideal_nops[noplen][i]);
- len -= noplen;
- }
-
- *pprog = prog;
-}
-
static void emit_align(u8 **pprog, u32 align)
{
u8 *target, *prog = *pprog;
@@ -1972,6 +2169,9 @@ struct x64_jit_data {
struct jit_context ctx;
};
+#define MAX_PASSES 20
+#define PADDING_PASSES (MAX_PASSES - 5)
+
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{
struct bpf_binary_header *header = NULL;
@@ -1981,6 +2181,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
struct jit_context ctx = {};
bool tmp_blinded = false;
bool extra_pass = false;
+ bool padding = false;
u8 *image = NULL;
int *addrs;
int pass;
@@ -2017,6 +2218,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
image = jit_data->image;
header = jit_data->header;
extra_pass = true;
+ padding = true;
goto skip_init_addrs;
}
addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL);
@@ -2042,8 +2244,10 @@ skip_init_addrs:
* may converge on the last pass. In such case do one more
* pass to emit the final image.
*/
- for (pass = 0; pass < 20 || image; pass++) {
- proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
+ for (pass = 0; pass < MAX_PASSES || image; pass++) {
+ if (!padding && pass >= PADDING_PASSES)
+ padding = true;
+ proglen = do_jit(prog, addrs, image, oldproglen, &ctx, padding);
if (proglen <= 0) {
out_image:
image = NULL;
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 96fde03aa987..d17b67c69f89 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -2243,10 +2243,8 @@ emit_jmp:
return -EFAULT;
}
break;
- /* STX XADD: lock *(u32 *)(dst + off) += src */
- case BPF_STX | BPF_XADD | BPF_W:
- /* STX XADD: lock *(u64 *)(dst + off) += src */
- case BPF_STX | BPF_XADD | BPF_DW:
+ case BPF_STX | BPF_ATOMIC | BPF_W:
+ case BPF_STX | BPF_ATOMIC | BPF_DW:
goto notyet;
case BPF_JMP | BPF_EXIT:
if (seen_exit) {
diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile
deleted file mode 100644
index 4d49b5a27025..000000000000
--- a/arch/x86/oprofile/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_OPROFILE) += oprofile.o
-
-DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
- oprof.o cpu_buffer.o buffer_sync.o \
- event_buffer.o oprofile_files.o \
- oprofilefs.o oprofile_stats.o \
- timer_int.o nmi_timer_int.o )
-
-oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
-oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \
- op_model_ppro.o op_model_p4.o
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
deleted file mode 100644
index 1d8391fcca68..000000000000
--- a/arch/x86/oprofile/backtrace.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/**
- * @file backtrace.c
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon
- * @author David Smith
- */
-
-#include <linux/oprofile.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/compat.h>
-#include <linux/uaccess.h>
-
-#include <asm/ptrace.h>
-#include <asm/stacktrace.h>
-#include <asm/unwind.h>
-
-#ifdef CONFIG_COMPAT
-static struct stack_frame_ia32 *
-dump_user_backtrace_32(struct stack_frame_ia32 *head)
-{
- /* Also check accessibility of one struct frame_head beyond: */
- struct stack_frame_ia32 bufhead[2];
- struct stack_frame_ia32 *fp;
- unsigned long bytes;
-
- bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead));
- if (bytes != 0)
- return NULL;
-
- fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame);
-
- oprofile_add_trace(bufhead[0].return_address);
-
- /* frame pointers should strictly progress back up the stack
- * (towards higher addresses) */
- if (head >= fp)
- return NULL;
-
- return fp;
-}
-
-static inline int
-x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
-{
- struct stack_frame_ia32 *head;
-
- /* User process is IA32 */
- if (!current || user_64bit_mode(regs))
- return 0;
-
- head = (struct stack_frame_ia32 *) regs->bp;
- while (depth-- && head)
- head = dump_user_backtrace_32(head);
-
- return 1;
-}
-
-#else
-static inline int
-x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
-{
- return 0;
-}
-#endif /* CONFIG_COMPAT */
-
-static struct stack_frame *dump_user_backtrace(struct stack_frame *head)
-{
- /* Also check accessibility of one struct frame_head beyond: */
- struct stack_frame bufhead[2];
- unsigned long bytes;
-
- bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead));
- if (bytes != 0)
- return NULL;
-
- oprofile_add_trace(bufhead[0].return_address);
-
- /* frame pointers should strictly progress back up the stack
- * (towards higher addresses) */
- if (head >= bufhead[0].next_frame)
- return NULL;
-
- return bufhead[0].next_frame;
-}
-
-void
-x86_backtrace(struct pt_regs * const regs, unsigned int depth)
-{
- struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
-
- if (!user_mode(regs)) {
- struct unwind_state state;
- unsigned long addr;
-
- if (!depth)
- return;
-
- oprofile_add_trace(regs->ip);
-
- if (!--depth)
- return;
-
- for (unwind_start(&state, current, regs, NULL);
- !unwind_done(&state); unwind_next_frame(&state)) {
- addr = unwind_get_return_address(&state);
- if (!addr)
- break;
-
- oprofile_add_trace(addr);
-
- if (!--depth)
- break;
- }
-
- return;
- }
-
- if (x86_backtrace_32(regs, depth))
- return;
-
- while (depth-- && head)
- head = dump_user_backtrace(head);
-}
diff --git a/arch/x86/oprofile/init.c b/arch/x86/oprofile/init.c
deleted file mode 100644
index 9e138d00ad36..000000000000
--- a/arch/x86/oprofile/init.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * @file init.c
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon <levon@movementarian.org>
- */
-
-#include <linux/oprofile.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-
-/*
- * We support CPUs that have performance counters like the Pentium Pro
- * with the NMI mode driver.
- */
-
-#ifdef CONFIG_X86_LOCAL_APIC
-extern int op_nmi_init(struct oprofile_operations *ops);
-extern void op_nmi_exit(void);
-#else
-static int op_nmi_init(struct oprofile_operations *ops) { return -ENODEV; }
-static void op_nmi_exit(void) { }
-#endif
-
-extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth);
-
-int __init oprofile_arch_init(struct oprofile_operations *ops)
-{
- ops->backtrace = x86_backtrace;
- return op_nmi_init(ops);
-}
-
-void oprofile_arch_exit(void)
-{
- op_nmi_exit();
-}
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
deleted file mode 100644
index a7a7677265b6..000000000000
--- a/arch/x86/oprofile/nmi_int.c
+++ /dev/null
@@ -1,780 +0,0 @@
-/**
- * @file nmi_int.c
- *
- * @remark Copyright 2002-2009 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon <levon@movementarian.org>
- * @author Robert Richter <robert.richter@amd.com>
- * @author Barry Kasindorf <barry.kasindorf@amd.com>
- * @author Jason Yeh <jason.yeh@amd.com>
- * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
- */
-
-#include <linux/init.h>
-#include <linux/notifier.h>
-#include <linux/smp.h>
-#include <linux/oprofile.h>
-#include <linux/syscore_ops.h>
-#include <linux/slab.h>
-#include <linux/moduleparam.h>
-#include <linux/kdebug.h>
-#include <linux/cpu.h>
-#include <asm/nmi.h>
-#include <asm/msr.h>
-#include <asm/apic.h>
-
-#include "op_counter.h"
-#include "op_x86_model.h"
-
-static struct op_x86_model_spec *model;
-static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
-static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
-
-/* must be protected with get_online_cpus()/put_online_cpus(): */
-static int nmi_enabled;
-static int ctr_running;
-
-struct op_counter_config counter_config[OP_MAX_COUNTER];
-
-/* common functions */
-
-u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
- struct op_counter_config *counter_config)
-{
- u64 val = 0;
- u16 event = (u16)counter_config->event;
-
- val |= ARCH_PERFMON_EVENTSEL_INT;
- val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0;
- val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0;
- val |= (counter_config->unit_mask & 0xFF) << 8;
- counter_config->extra &= (ARCH_PERFMON_EVENTSEL_INV |
- ARCH_PERFMON_EVENTSEL_EDGE |
- ARCH_PERFMON_EVENTSEL_CMASK);
- val |= counter_config->extra;
- event &= model->event_mask ? model->event_mask : 0xFF;
- val |= event & 0xFF;
- val |= (u64)(event & 0x0F00) << 24;
-
- return val;
-}
-
-
-static int profile_exceptions_notify(unsigned int val, struct pt_regs *regs)
-{
- if (ctr_running)
- model->check_ctrs(regs, this_cpu_ptr(&cpu_msrs));
- else if (!nmi_enabled)
- return NMI_DONE;
- else
- model->stop(this_cpu_ptr(&cpu_msrs));
- return NMI_HANDLED;
-}
-
-static void nmi_cpu_save_registers(struct op_msrs *msrs)
-{
- struct op_msr *counters = msrs->counters;
- struct op_msr *controls = msrs->controls;
- unsigned int i;
-
- for (i = 0; i < model->num_counters; ++i) {
- if (counters[i].addr)
- rdmsrl(counters[i].addr, counters[i].saved);
- }
-
- for (i = 0; i < model->num_controls; ++i) {
- if (controls[i].addr)
- rdmsrl(controls[i].addr, controls[i].saved);
- }
-}
-
-static void nmi_cpu_start(void *dummy)
-{
- struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs);
- if (!msrs->controls)
- WARN_ON_ONCE(1);
- else
- model->start(msrs);
-}
-
-static int nmi_start(void)
-{
- get_online_cpus();
- ctr_running = 1;
- /* make ctr_running visible to the nmi handler: */
- smp_mb();
- on_each_cpu(nmi_cpu_start, NULL, 1);
- put_online_cpus();
- return 0;
-}
-
-static void nmi_cpu_stop(void *dummy)
-{
- struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs);
- if (!msrs->controls)
- WARN_ON_ONCE(1);
- else
- model->stop(msrs);
-}
-
-static void nmi_stop(void)
-{
- get_online_cpus();
- on_each_cpu(nmi_cpu_stop, NULL, 1);
- ctr_running = 0;
- put_online_cpus();
-}
-
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-
-static DEFINE_PER_CPU(int, switch_index);
-
-static inline int has_mux(void)
-{
- return !!model->switch_ctrl;
-}
-
-inline int op_x86_phys_to_virt(int phys)
-{
- return __this_cpu_read(switch_index) + phys;
-}
-
-inline int op_x86_virt_to_phys(int virt)
-{
- return virt % model->num_counters;
-}
-
-static void nmi_shutdown_mux(void)
-{
- int i;
-
- if (!has_mux())
- return;
-
- for_each_possible_cpu(i) {
- kfree(per_cpu(cpu_msrs, i).multiplex);
- per_cpu(cpu_msrs, i).multiplex = NULL;
- per_cpu(switch_index, i) = 0;
- }
-}
-
-static int nmi_setup_mux(void)
-{
- size_t multiplex_size =
- sizeof(struct op_msr) * model->num_virt_counters;
- int i;
-
- if (!has_mux())
- return 1;
-
- for_each_possible_cpu(i) {
- per_cpu(cpu_msrs, i).multiplex =
- kzalloc(multiplex_size, GFP_KERNEL);
- if (!per_cpu(cpu_msrs, i).multiplex)
- return 0;
- }
-
- return 1;
-}
-
-static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs)
-{
- int i;
- struct op_msr *multiplex = msrs->multiplex;
-
- if (!has_mux())
- return;
-
- for (i = 0; i < model->num_virt_counters; ++i) {
- if (counter_config[i].enabled) {
- multiplex[i].saved = -(u64)counter_config[i].count;
- } else {
- multiplex[i].saved = 0;
- }
- }
-
- per_cpu(switch_index, cpu) = 0;
-}
-
-static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs)
-{
- struct op_msr *counters = msrs->counters;
- struct op_msr *multiplex = msrs->multiplex;
- int i;
-
- for (i = 0; i < model->num_counters; ++i) {
- int virt = op_x86_phys_to_virt(i);
- if (counters[i].addr)
- rdmsrl(counters[i].addr, multiplex[virt].saved);
- }
-}
-
-static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs)
-{
- struct op_msr *counters = msrs->counters;
- struct op_msr *multiplex = msrs->multiplex;
- int i;
-
- for (i = 0; i < model->num_counters; ++i) {
- int virt = op_x86_phys_to_virt(i);
- if (counters[i].addr)
- wrmsrl(counters[i].addr, multiplex[virt].saved);
- }
-}
-
-static void nmi_cpu_switch(void *dummy)
-{
- int cpu = smp_processor_id();
- int si = per_cpu(switch_index, cpu);
- struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
-
- nmi_cpu_stop(NULL);
- nmi_cpu_save_mpx_registers(msrs);
-
- /* move to next set */
- si += model->num_counters;
- if ((si >= model->num_virt_counters) || (counter_config[si].count == 0))
- per_cpu(switch_index, cpu) = 0;
- else
- per_cpu(switch_index, cpu) = si;
-
- model->switch_ctrl(model, msrs);
- nmi_cpu_restore_mpx_registers(msrs);
-
- nmi_cpu_start(NULL);
-}
-
-
-/*
- * Quick check to see if multiplexing is necessary.
- * The check should be sufficient since counters are used
- * in ordre.
- */
-static int nmi_multiplex_on(void)
-{
- return counter_config[model->num_counters].count ? 0 : -EINVAL;
-}
-
-static int nmi_switch_event(void)
-{
- if (!has_mux())
- return -ENOSYS; /* not implemented */
- if (nmi_multiplex_on() < 0)
- return -EINVAL; /* not necessary */
-
- get_online_cpus();
- if (ctr_running)
- on_each_cpu(nmi_cpu_switch, NULL, 1);
- put_online_cpus();
-
- return 0;
-}
-
-static inline void mux_init(struct oprofile_operations *ops)
-{
- if (has_mux())
- ops->switch_events = nmi_switch_event;
-}
-
-static void mux_clone(int cpu)
-{
- if (!has_mux())
- return;
-
- memcpy(per_cpu(cpu_msrs, cpu).multiplex,
- per_cpu(cpu_msrs, 0).multiplex,
- sizeof(struct op_msr) * model->num_virt_counters);
-}
-
-#else
-
-inline int op_x86_phys_to_virt(int phys) { return phys; }
-inline int op_x86_virt_to_phys(int virt) { return virt; }
-static inline void nmi_shutdown_mux(void) { }
-static inline int nmi_setup_mux(void) { return 1; }
-static inline void
-nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { }
-static inline void mux_init(struct oprofile_operations *ops) { }
-static void mux_clone(int cpu) { }
-
-#endif
-
-static void free_msrs(void)
-{
- int i;
- for_each_possible_cpu(i) {
- kfree(per_cpu(cpu_msrs, i).counters);
- per_cpu(cpu_msrs, i).counters = NULL;
- kfree(per_cpu(cpu_msrs, i).controls);
- per_cpu(cpu_msrs, i).controls = NULL;
- }
- nmi_shutdown_mux();
-}
-
-static int allocate_msrs(void)
-{
- size_t controls_size = sizeof(struct op_msr) * model->num_controls;
- size_t counters_size = sizeof(struct op_msr) * model->num_counters;
-
- int i;
- for_each_possible_cpu(i) {
- per_cpu(cpu_msrs, i).counters = kzalloc(counters_size,
- GFP_KERNEL);
- if (!per_cpu(cpu_msrs, i).counters)
- goto fail;
- per_cpu(cpu_msrs, i).controls = kzalloc(controls_size,
- GFP_KERNEL);
- if (!per_cpu(cpu_msrs, i).controls)
- goto fail;
- }
-
- if (!nmi_setup_mux())
- goto fail;
-
- return 1;
-
-fail:
- free_msrs();
- return 0;
-}
-
-static void nmi_cpu_setup(void)
-{
- int cpu = smp_processor_id();
- struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
-
- nmi_cpu_save_registers(msrs);
- raw_spin_lock(&oprofilefs_lock);
- model->setup_ctrs(model, msrs);
- nmi_cpu_setup_mux(cpu, msrs);
- raw_spin_unlock(&oprofilefs_lock);
- per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-}
-
-static void nmi_cpu_restore_registers(struct op_msrs *msrs)
-{
- struct op_msr *counters = msrs->counters;
- struct op_msr *controls = msrs->controls;
- unsigned int i;
-
- for (i = 0; i < model->num_controls; ++i) {
- if (controls[i].addr)
- wrmsrl(controls[i].addr, controls[i].saved);
- }
-
- for (i = 0; i < model->num_counters; ++i) {
- if (counters[i].addr)
- wrmsrl(counters[i].addr, counters[i].saved);
- }
-}
-
-static void nmi_cpu_shutdown(void)
-{
- unsigned int v;
- int cpu = smp_processor_id();
- struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
-
- /* restoring APIC_LVTPC can trigger an apic error because the delivery
- * mode and vector nr combination can be illegal. That's by design: on
- * power on apic lvt contain a zero vector nr which are legal only for
- * NMI delivery mode. So inhibit apic err before restoring lvtpc
- */
- v = apic_read(APIC_LVTERR);
- apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
- apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu));
- apic_write(APIC_LVTERR, v);
- nmi_cpu_restore_registers(msrs);
-}
-
-static int nmi_cpu_online(unsigned int cpu)
-{
- local_irq_disable();
- if (nmi_enabled)
- nmi_cpu_setup();
- if (ctr_running)
- nmi_cpu_start(NULL);
- local_irq_enable();
- return 0;
-}
-
-static int nmi_cpu_down_prep(unsigned int cpu)
-{
- local_irq_disable();
- if (ctr_running)
- nmi_cpu_stop(NULL);
- if (nmi_enabled)
- nmi_cpu_shutdown();
- local_irq_enable();
- return 0;
-}
-
-static int nmi_create_files(struct dentry *root)
-{
- unsigned int i;
-
- for (i = 0; i < model->num_virt_counters; ++i) {
- struct dentry *dir;
- char buf[4];
-
- /* quick little hack to _not_ expose a counter if it is not
- * available for use. This should protect userspace app.
- * NOTE: assumes 1:1 mapping here (that counters are organized
- * sequentially in their struct assignment).
- */
- if (!avail_to_resrv_perfctr_nmi_bit(op_x86_virt_to_phys(i)))
- continue;
-
- snprintf(buf, sizeof(buf), "%d", i);
- dir = oprofilefs_mkdir(root, buf);
- oprofilefs_create_ulong(dir, "enabled", &counter_config[i].enabled);
- oprofilefs_create_ulong(dir, "event", &counter_config[i].event);
- oprofilefs_create_ulong(dir, "count", &counter_config[i].count);
- oprofilefs_create_ulong(dir, "unit_mask", &counter_config[i].unit_mask);
- oprofilefs_create_ulong(dir, "kernel", &counter_config[i].kernel);
- oprofilefs_create_ulong(dir, "user", &counter_config[i].user);
- oprofilefs_create_ulong(dir, "extra", &counter_config[i].extra);
- }
-
- return 0;
-}
-
-static enum cpuhp_state cpuhp_nmi_online;
-
-static int nmi_setup(void)
-{
- int err = 0;
- int cpu;
-
- if (!allocate_msrs())
- return -ENOMEM;
-
- /* We need to serialize save and setup for HT because the subset
- * of msrs are distinct for save and setup operations
- */
-
- /* Assume saved/restored counters are the same on all CPUs */
- err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
- if (err)
- goto fail;
-
- for_each_possible_cpu(cpu) {
- if (!IS_ENABLED(CONFIG_SMP) || !cpu)
- continue;
-
- memcpy(per_cpu(cpu_msrs, cpu).counters,
- per_cpu(cpu_msrs, 0).counters,
- sizeof(struct op_msr) * model->num_counters);
-
- memcpy(per_cpu(cpu_msrs, cpu).controls,
- per_cpu(cpu_msrs, 0).controls,
- sizeof(struct op_msr) * model->num_controls);
-
- mux_clone(cpu);
- }
-
- nmi_enabled = 0;
- ctr_running = 0;
- /* make variables visible to the nmi handler: */
- smp_mb();
- err = register_nmi_handler(NMI_LOCAL, profile_exceptions_notify,
- 0, "oprofile");
- if (err)
- goto fail;
-
- nmi_enabled = 1;
- /* make nmi_enabled visible to the nmi handler: */
- smp_mb();
- err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/oprofile:online",
- nmi_cpu_online, nmi_cpu_down_prep);
- if (err < 0)
- goto fail_nmi;
- cpuhp_nmi_online = err;
- return 0;
-fail_nmi:
- unregister_nmi_handler(NMI_LOCAL, "oprofile");
-fail:
- free_msrs();
- return err;
-}
-
-static void nmi_shutdown(void)
-{
- struct op_msrs *msrs;
-
- cpuhp_remove_state(cpuhp_nmi_online);
- nmi_enabled = 0;
- ctr_running = 0;
-
- /* make variables visible to the nmi handler: */
- smp_mb();
- unregister_nmi_handler(NMI_LOCAL, "oprofile");
- msrs = &get_cpu_var(cpu_msrs);
- model->shutdown(msrs);
- free_msrs();
- put_cpu_var(cpu_msrs);
-}
-
-#ifdef CONFIG_PM
-
-static int nmi_suspend(void)
-{
- /* Only one CPU left, just stop that one */
- if (nmi_enabled == 1)
- nmi_cpu_stop(NULL);
- return 0;
-}
-
-static void nmi_resume(void)
-{
- if (nmi_enabled == 1)
- nmi_cpu_start(NULL);
-}
-
-static struct syscore_ops oprofile_syscore_ops = {
- .resume = nmi_resume,
- .suspend = nmi_suspend,
-};
-
-static void __init init_suspend_resume(void)
-{
- register_syscore_ops(&oprofile_syscore_ops);
-}
-
-static void exit_suspend_resume(void)
-{
- unregister_syscore_ops(&oprofile_syscore_ops);
-}
-
-#else
-
-static inline void init_suspend_resume(void) { }
-static inline void exit_suspend_resume(void) { }
-
-#endif /* CONFIG_PM */
-
-static int __init p4_init(char **cpu_type)
-{
- __u8 cpu_model = boot_cpu_data.x86_model;
-
- if (cpu_model > 6 || cpu_model == 5)
- return 0;
-
-#ifndef CONFIG_SMP
- *cpu_type = "i386/p4";
- model = &op_p4_spec;
- return 1;
-#else
- switch (smp_num_siblings) {
- case 1:
- *cpu_type = "i386/p4";
- model = &op_p4_spec;
- return 1;
-
- case 2:
- *cpu_type = "i386/p4-ht";
- model = &op_p4_ht2_spec;
- return 1;
- }
-#endif
-
- printk(KERN_INFO "oprofile: P4 HyperThreading detected with > 2 threads\n");
- printk(KERN_INFO "oprofile: Reverting to timer mode.\n");
- return 0;
-}
-
-enum __force_cpu_type {
- reserved = 0, /* do not force */
- timer,
- arch_perfmon,
-};
-
-static int force_cpu_type;
-
-static int set_cpu_type(const char *str, const struct kernel_param *kp)
-{
- if (!strcmp(str, "timer")) {
- force_cpu_type = timer;
- printk(KERN_INFO "oprofile: forcing NMI timer mode\n");
- } else if (!strcmp(str, "arch_perfmon")) {
- force_cpu_type = arch_perfmon;
- printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
- } else {
- force_cpu_type = 0;
- }
-
- return 0;
-}
-module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0);
-
-static int __init ppro_init(char **cpu_type)
-{
- __u8 cpu_model = boot_cpu_data.x86_model;
- struct op_x86_model_spec *spec = &op_ppro_spec; /* default */
-
- if (force_cpu_type == arch_perfmon && boot_cpu_has(X86_FEATURE_ARCH_PERFMON))
- return 0;
-
- /*
- * Documentation on identifying Intel processors by CPU family
- * and model can be found in the Intel Software Developer's
- * Manuals (SDM):
- *
- * http://www.intel.com/products/processor/manuals/
- *
- * As of May 2010 the documentation for this was in the:
- * "Intel 64 and IA-32 Architectures Software Developer's
- * Manual Volume 3B: System Programming Guide", "Table B-1
- * CPUID Signature Values of DisplayFamily_DisplayModel".
- */
- switch (cpu_model) {
- case 0 ... 2:
- *cpu_type = "i386/ppro";
- break;
- case 3 ... 5:
- *cpu_type = "i386/pii";
- break;
- case 6 ... 8:
- case 10 ... 11:
- *cpu_type = "i386/piii";
- break;
- case 9:
- case 13:
- *cpu_type = "i386/p6_mobile";
- break;
- case 14:
- *cpu_type = "i386/core";
- break;
- case 0x0f:
- case 0x16:
- case 0x17:
- case 0x1d:
- *cpu_type = "i386/core_2";
- break;
- case 0x1a:
- case 0x1e:
- case 0x2e:
- spec = &op_arch_perfmon_spec;
- *cpu_type = "i386/core_i7";
- break;
- case 0x1c:
- *cpu_type = "i386/atom";
- break;
- default:
- /* Unknown */
- return 0;
- }
-
- model = spec;
- return 1;
-}
-
-int __init op_nmi_init(struct oprofile_operations *ops)
-{
- __u8 vendor = boot_cpu_data.x86_vendor;
- __u8 family = boot_cpu_data.x86;
- char *cpu_type = NULL;
- int ret = 0;
-
- if (!boot_cpu_has(X86_FEATURE_APIC))
- return -ENODEV;
-
- if (force_cpu_type == timer)
- return -ENODEV;
-
- switch (vendor) {
- case X86_VENDOR_AMD:
- /* Needs to be at least an Athlon (or hammer in 32bit mode) */
-
- switch (family) {
- case 6:
- cpu_type = "i386/athlon";
- break;
- case 0xf:
- /*
- * Actually it could be i386/hammer too, but
- * give user space an consistent name.
- */
- cpu_type = "x86-64/hammer";
- break;
- case 0x10:
- cpu_type = "x86-64/family10";
- break;
- case 0x11:
- cpu_type = "x86-64/family11h";
- break;
- case 0x12:
- cpu_type = "x86-64/family12h";
- break;
- case 0x14:
- cpu_type = "x86-64/family14h";
- break;
- case 0x15:
- cpu_type = "x86-64/family15h";
- break;
- default:
- return -ENODEV;
- }
- model = &op_amd_spec;
- break;
-
- case X86_VENDOR_INTEL:
- switch (family) {
- /* Pentium IV */
- case 0xf:
- p4_init(&cpu_type);
- break;
-
- /* A P6-class processor */
- case 6:
- ppro_init(&cpu_type);
- break;
-
- default:
- break;
- }
-
- if (cpu_type)
- break;
-
- if (!boot_cpu_has(X86_FEATURE_ARCH_PERFMON))
- return -ENODEV;
-
- /* use arch perfmon as fallback */
- cpu_type = "i386/arch_perfmon";
- model = &op_arch_perfmon_spec;
- break;
-
- default:
- return -ENODEV;
- }
-
- /* default values, can be overwritten by model */
- ops->create_files = nmi_create_files;
- ops->setup = nmi_setup;
- ops->shutdown = nmi_shutdown;
- ops->start = nmi_start;
- ops->stop = nmi_stop;
- ops->cpu_type = cpu_type;
-
- if (model->init)
- ret = model->init(ops);
- if (ret)
- return ret;
-
- if (!model->num_virt_counters)
- model->num_virt_counters = model->num_counters;
-
- mux_init(ops);
-
- init_suspend_resume();
-
- printk(KERN_INFO "oprofile: using NMI interrupt.\n");
- return 0;
-}
-
-void op_nmi_exit(void)
-{
- exit_suspend_resume();
-}
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
deleted file mode 100644
index 0b7b7b179cbe..000000000000
--- a/arch/x86/oprofile/op_counter.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * @file op_counter.h
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon
- */
-
-#ifndef OP_COUNTER_H
-#define OP_COUNTER_H
-
-#define OP_MAX_COUNTER 32
-
-/* Per-perfctr configuration as set via
- * oprofilefs.
- */
-struct op_counter_config {
- unsigned long count;
- unsigned long enabled;
- unsigned long event;
- unsigned long kernel;
- unsigned long user;
- unsigned long unit_mask;
- unsigned long extra;
-};
-
-extern struct op_counter_config counter_config[];
-
-#endif /* OP_COUNTER_H */
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
deleted file mode 100644
index 660a83c8287b..000000000000
--- a/arch/x86/oprofile/op_model_amd.c
+++ /dev/null
@@ -1,542 +0,0 @@
-/*
- * @file op_model_amd.c
- * athlon / K7 / K8 / Family 10h model-specific MSR operations
- *
- * @remark Copyright 2002-2009 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon
- * @author Philippe Elie
- * @author Graydon Hoare
- * @author Robert Richter <robert.richter@amd.com>
- * @author Barry Kasindorf <barry.kasindorf@amd.com>
- * @author Jason Yeh <jason.yeh@amd.com>
- * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
- */
-
-#include <linux/oprofile.h>
-#include <linux/device.h>
-#include <linux/pci.h>
-#include <linux/percpu.h>
-
-#include <asm/ptrace.h>
-#include <asm/msr.h>
-#include <asm/nmi.h>
-#include <asm/apic.h>
-#include <asm/processor.h>
-
-#include "op_x86_model.h"
-#include "op_counter.h"
-
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-#define NUM_VIRT_COUNTERS 32
-#else
-#define NUM_VIRT_COUNTERS 0
-#endif
-
-#define OP_EVENT_MASK 0x0FFF
-#define OP_CTR_OVERFLOW (1ULL<<31)
-
-#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21))
-
-static int num_counters;
-static unsigned long reset_value[OP_MAX_COUNTER];
-
-#define IBS_FETCH_SIZE 6
-#define IBS_OP_SIZE 12
-
-static u32 ibs_caps;
-
-struct ibs_config {
- unsigned long op_enabled;
- unsigned long fetch_enabled;
- unsigned long max_cnt_fetch;
- unsigned long max_cnt_op;
- unsigned long rand_en;
- unsigned long dispatched_ops;
- unsigned long branch_target;
-};
-
-struct ibs_state {
- u64 ibs_op_ctl;
- int branch_target;
- unsigned long sample_size;
-};
-
-static struct ibs_config ibs_config;
-static struct ibs_state ibs_state;
-
-/*
- * IBS randomization macros
- */
-#define IBS_RANDOM_BITS 12
-#define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1)
-#define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5))
-
-/*
- * 16-bit Linear Feedback Shift Register (LFSR)
- *
- * 16 14 13 11
- * Feedback polynomial = X + X + X + X + 1
- */
-static unsigned int lfsr_random(void)
-{
- static unsigned int lfsr_value = 0xF00D;
- unsigned int bit;
-
- /* Compute next bit to shift in */
- bit = ((lfsr_value >> 0) ^
- (lfsr_value >> 2) ^
- (lfsr_value >> 3) ^
- (lfsr_value >> 5)) & 0x0001;
-
- /* Advance to next register value */
- lfsr_value = (lfsr_value >> 1) | (bit << 15);
-
- return lfsr_value;
-}
-
-/*
- * IBS software randomization
- *
- * The IBS periodic op counter is randomized in software. The lower 12
- * bits of the 20 bit counter are randomized. IbsOpCurCnt is
- * initialized with a 12 bit random value.
- */
-static inline u64 op_amd_randomize_ibs_op(u64 val)
-{
- unsigned int random = lfsr_random();
-
- if (!(ibs_caps & IBS_CAPS_RDWROPCNT))
- /*
- * Work around if the hw can not write to IbsOpCurCnt
- *
- * Randomize the lower 8 bits of the 16 bit
- * IbsOpMaxCnt [15:0] value in the range of -128 to
- * +127 by adding/subtracting an offset to the
- * maximum count (IbsOpMaxCnt).
- *
- * To avoid over or underflows and protect upper bits
- * starting at bit 16, the initial value for
- * IbsOpMaxCnt must fit in the range from 0x0081 to
- * 0xff80.
- */
- val += (s8)(random >> 4);
- else
- val |= (u64)(random & IBS_RANDOM_MASK) << 32;
-
- return val;
-}
-
-static inline void
-op_amd_handle_ibs(struct pt_regs * const regs,
- struct op_msrs const * const msrs)
-{
- u64 val, ctl;
- struct op_entry entry;
-
- if (!ibs_caps)
- return;
-
- if (ibs_config.fetch_enabled) {
- rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
- if (ctl & IBS_FETCH_VAL) {
- rdmsrl(MSR_AMD64_IBSFETCHLINAD, val);
- oprofile_write_reserve(&entry, regs, val,
- IBS_FETCH_CODE, IBS_FETCH_SIZE);
- oprofile_add_data64(&entry, val);
- oprofile_add_data64(&entry, ctl);
- rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val);
- oprofile_add_data64(&entry, val);
- oprofile_write_commit(&entry);
-
- /* reenable the IRQ */
- ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT);
- ctl |= IBS_FETCH_ENABLE;
- wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
- }
- }
-
- if (ibs_config.op_enabled) {
- rdmsrl(MSR_AMD64_IBSOPCTL, ctl);
- if (ctl & IBS_OP_VAL) {
- rdmsrl(MSR_AMD64_IBSOPRIP, val);
- oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE,
- ibs_state.sample_size);
- oprofile_add_data64(&entry, val);
- rdmsrl(MSR_AMD64_IBSOPDATA, val);
- oprofile_add_data64(&entry, val);
- rdmsrl(MSR_AMD64_IBSOPDATA2, val);
- oprofile_add_data64(&entry, val);
- rdmsrl(MSR_AMD64_IBSOPDATA3, val);
- oprofile_add_data64(&entry, val);
- rdmsrl(MSR_AMD64_IBSDCLINAD, val);
- oprofile_add_data64(&entry, val);
- rdmsrl(MSR_AMD64_IBSDCPHYSAD, val);
- oprofile_add_data64(&entry, val);
- if (ibs_state.branch_target) {
- rdmsrl(MSR_AMD64_IBSBRTARGET, val);
- oprofile_add_data(&entry, (unsigned long)val);
- }
- oprofile_write_commit(&entry);
-
- /* reenable the IRQ */
- ctl = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl);
- wrmsrl(MSR_AMD64_IBSOPCTL, ctl);
- }
- }
-}
-
-static inline void op_amd_start_ibs(void)
-{
- u64 val;
-
- if (!ibs_caps)
- return;
-
- memset(&ibs_state, 0, sizeof(ibs_state));
-
- /*
- * Note: Since the max count settings may out of range we
- * write back the actual used values so that userland can read
- * it.
- */
-
- if (ibs_config.fetch_enabled) {
- val = ibs_config.max_cnt_fetch >> 4;
- val = min(val, IBS_FETCH_MAX_CNT);
- ibs_config.max_cnt_fetch = val << 4;
- val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0;
- val |= IBS_FETCH_ENABLE;
- wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
- }
-
- if (ibs_config.op_enabled) {
- val = ibs_config.max_cnt_op >> 4;
- if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) {
- /*
- * IbsOpCurCnt not supported. See
- * op_amd_randomize_ibs_op() for details.
- */
- val = clamp(val, 0x0081ULL, 0xFF80ULL);
- ibs_config.max_cnt_op = val << 4;
- } else {
- /*
- * The start value is randomized with a
- * positive offset, we need to compensate it
- * with the half of the randomized range. Also
- * avoid underflows.
- */
- val += IBS_RANDOM_MAXCNT_OFFSET;
- if (ibs_caps & IBS_CAPS_OPCNTEXT)
- val = min(val, IBS_OP_MAX_CNT_EXT);
- else
- val = min(val, IBS_OP_MAX_CNT);
- ibs_config.max_cnt_op =
- (val - IBS_RANDOM_MAXCNT_OFFSET) << 4;
- }
- val = ((val & ~IBS_OP_MAX_CNT) << 4) | (val & IBS_OP_MAX_CNT);
- val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0;
- val |= IBS_OP_ENABLE;
- ibs_state.ibs_op_ctl = val;
- ibs_state.sample_size = IBS_OP_SIZE;
- if (ibs_config.branch_target) {
- ibs_state.branch_target = 1;
- ibs_state.sample_size++;
- }
- val = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl);
- wrmsrl(MSR_AMD64_IBSOPCTL, val);
- }
-}
-
-static void op_amd_stop_ibs(void)
-{
- if (!ibs_caps)
- return;
-
- if (ibs_config.fetch_enabled)
- /* clear max count and enable */
- wrmsrl(MSR_AMD64_IBSFETCHCTL, 0);
-
- if (ibs_config.op_enabled)
- /* clear max count and enable */
- wrmsrl(MSR_AMD64_IBSOPCTL, 0);
-}
-
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-
-static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- /* enable active counters */
- for (i = 0; i < num_counters; ++i) {
- int virt = op_x86_phys_to_virt(i);
- if (!reset_value[virt])
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- val &= model->reserved;
- val |= op_x86_get_ctrl(model, &counter_config[virt]);
- wrmsrl(msrs->controls[i].addr, val);
- }
-}
-
-#endif
-
-/* functions for op_amd_spec */
-
-static void op_amd_shutdown(struct op_msrs const * const msrs)
-{
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (!msrs->counters[i].addr)
- continue;
- release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
- }
-}
-
-static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
-{
- int i;
-
- for (i = 0; i < num_counters; i++) {
- if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
- goto fail;
- if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
- release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- goto fail;
- }
- /* both registers must be reserved */
- if (num_counters == AMD64_NUM_COUNTERS_CORE) {
- msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1);
- msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1);
- } else {
- msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
- msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
- }
- continue;
- fail:
- if (!counter_config[i].enabled)
- continue;
- op_x86_warn_reserved(i);
- op_amd_shutdown(msrs);
- return -EBUSY;
- }
-
- return 0;
-}
-
-static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- /* setup reset_value */
- for (i = 0; i < OP_MAX_COUNTER; ++i) {
- if (counter_config[i].enabled
- && msrs->counters[op_x86_virt_to_phys(i)].addr)
- reset_value[i] = counter_config[i].count;
- else
- reset_value[i] = 0;
- }
-
- /* clear all counters */
- for (i = 0; i < num_counters; ++i) {
- if (!msrs->controls[i].addr)
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
- op_x86_warn_in_use(i);
- val &= model->reserved;
- wrmsrl(msrs->controls[i].addr, val);
- /*
- * avoid a false detection of ctr overflows in NMI
- * handler
- */
- wrmsrl(msrs->counters[i].addr, -1LL);
- }
-
- /* enable active counters */
- for (i = 0; i < num_counters; ++i) {
- int virt = op_x86_phys_to_virt(i);
- if (!reset_value[virt])
- continue;
-
- /* setup counter registers */
- wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
-
- /* setup control registers */
- rdmsrl(msrs->controls[i].addr, val);
- val &= model->reserved;
- val |= op_x86_get_ctrl(model, &counter_config[virt]);
- wrmsrl(msrs->controls[i].addr, val);
- }
-}
-
-static int op_amd_check_ctrs(struct pt_regs * const regs,
- struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- int virt = op_x86_phys_to_virt(i);
- if (!reset_value[virt])
- continue;
- rdmsrl(msrs->counters[i].addr, val);
- /* bit is clear if overflowed: */
- if (val & OP_CTR_OVERFLOW)
- continue;
- oprofile_add_sample(regs, virt);
- wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
- }
-
- op_amd_handle_ibs(regs, msrs);
-
- /* See op_model_ppro.c */
- return 1;
-}
-
-static void op_amd_start(struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (!reset_value[op_x86_phys_to_virt(i)])
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- val |= ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(msrs->controls[i].addr, val);
- }
-
- op_amd_start_ibs();
-}
-
-static void op_amd_stop(struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- /*
- * Subtle: stop on all counters to avoid race with setting our
- * pm callback
- */
- for (i = 0; i < num_counters; ++i) {
- if (!reset_value[op_x86_phys_to_virt(i)])
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(msrs->controls[i].addr, val);
- }
-
- op_amd_stop_ibs();
-}
-
-/*
- * check and reserve APIC extended interrupt LVT offset for IBS if
- * available
- */
-
-static void init_ibs(void)
-{
- ibs_caps = get_ibs_caps();
-
- if (!ibs_caps)
- return;
-
- printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
-}
-
-static int (*create_arch_files)(struct dentry *root);
-
-static int setup_ibs_files(struct dentry *root)
-{
- struct dentry *dir;
- int ret = 0;
-
- /* architecture specific files */
- if (create_arch_files)
- ret = create_arch_files(root);
-
- if (ret)
- return ret;
-
- if (!ibs_caps)
- return ret;
-
- /* model specific files */
-
- /* setup some reasonable defaults */
- memset(&ibs_config, 0, sizeof(ibs_config));
- ibs_config.max_cnt_fetch = 250000;
- ibs_config.max_cnt_op = 250000;
-
- if (ibs_caps & IBS_CAPS_FETCHSAM) {
- dir = oprofilefs_mkdir(root, "ibs_fetch");
- oprofilefs_create_ulong(dir, "enable",
- &ibs_config.fetch_enabled);
- oprofilefs_create_ulong(dir, "max_count",
- &ibs_config.max_cnt_fetch);
- oprofilefs_create_ulong(dir, "rand_enable",
- &ibs_config.rand_en);
- }
-
- if (ibs_caps & IBS_CAPS_OPSAM) {
- dir = oprofilefs_mkdir(root, "ibs_op");
- oprofilefs_create_ulong(dir, "enable",
- &ibs_config.op_enabled);
- oprofilefs_create_ulong(dir, "max_count",
- &ibs_config.max_cnt_op);
- if (ibs_caps & IBS_CAPS_OPCNT)
- oprofilefs_create_ulong(dir, "dispatched_ops",
- &ibs_config.dispatched_ops);
- if (ibs_caps & IBS_CAPS_BRNTRGT)
- oprofilefs_create_ulong(dir, "branch_target",
- &ibs_config.branch_target);
- }
-
- return 0;
-}
-
-struct op_x86_model_spec op_amd_spec;
-
-static int op_amd_init(struct oprofile_operations *ops)
-{
- init_ibs();
- create_arch_files = ops->create_files;
- ops->create_files = setup_ibs_files;
-
- if (boot_cpu_data.x86 == 0x15) {
- num_counters = AMD64_NUM_COUNTERS_CORE;
- } else {
- num_counters = AMD64_NUM_COUNTERS;
- }
-
- op_amd_spec.num_counters = num_counters;
- op_amd_spec.num_controls = num_counters;
- op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS);
-
- return 0;
-}
-
-struct op_x86_model_spec op_amd_spec = {
- /* num_counters/num_controls filled in at runtime */
- .reserved = MSR_AMD_EVENTSEL_RESERVED,
- .event_mask = OP_EVENT_MASK,
- .init = op_amd_init,
- .fill_in_addresses = &op_amd_fill_in_addresses,
- .setup_ctrs = &op_amd_setup_ctrs,
- .check_ctrs = &op_amd_check_ctrs,
- .start = &op_amd_start,
- .stop = &op_amd_stop,
- .shutdown = &op_amd_shutdown,
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
- .switch_ctrl = &op_mux_switch_ctrl,
-#endif
-};
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
deleted file mode 100644
index ad1d91f475ab..000000000000
--- a/arch/x86/oprofile/op_model_p4.c
+++ /dev/null
@@ -1,723 +0,0 @@
-/**
- * @file op_model_p4.c
- * P4 model-specific MSR operations
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author Graydon Hoare
- */
-
-#include <linux/oprofile.h>
-#include <linux/smp.h>
-#include <linux/ptrace.h>
-#include <asm/nmi.h>
-#include <asm/msr.h>
-#include <asm/fixmap.h>
-#include <asm/apic.h>
-
-
-#include "op_x86_model.h"
-#include "op_counter.h"
-
-#define NUM_EVENTS 39
-
-#define NUM_COUNTERS_NON_HT 8
-#define NUM_ESCRS_NON_HT 45
-#define NUM_CCCRS_NON_HT 18
-#define NUM_CONTROLS_NON_HT (NUM_ESCRS_NON_HT + NUM_CCCRS_NON_HT)
-
-#define NUM_COUNTERS_HT2 4
-#define NUM_ESCRS_HT2 23
-#define NUM_CCCRS_HT2 9
-#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2)
-
-#define OP_CTR_OVERFLOW (1ULL<<31)
-
-static unsigned int num_counters = NUM_COUNTERS_NON_HT;
-static unsigned int num_controls = NUM_CONTROLS_NON_HT;
-
-/* this has to be checked dynamically since the
- hyper-threadedness of a chip is discovered at
- kernel boot-time. */
-static inline void setup_num_counters(void)
-{
-#ifdef CONFIG_SMP
- if (smp_num_siblings == 2) {
- num_counters = NUM_COUNTERS_HT2;
- num_controls = NUM_CONTROLS_HT2;
- }
-#endif
-}
-
-static inline int addr_increment(void)
-{
-#ifdef CONFIG_SMP
- return smp_num_siblings == 2 ? 2 : 1;
-#else
- return 1;
-#endif
-}
-
-
-/* tables to simulate simplified hardware view of p4 registers */
-struct p4_counter_binding {
- int virt_counter;
- int counter_address;
- int cccr_address;
-};
-
-struct p4_event_binding {
- int escr_select; /* value to put in CCCR */
- int event_select; /* value to put in ESCR */
- struct {
- int virt_counter; /* for this counter... */
- int escr_address; /* use this ESCR */
- } bindings[2];
-};
-
-/* nb: these CTR_* defines are a duplicate of defines in
- event/i386.p4*events. */
-
-
-#define CTR_BPU_0 (1 << 0)
-#define CTR_MS_0 (1 << 1)
-#define CTR_FLAME_0 (1 << 2)
-#define CTR_IQ_4 (1 << 3)
-#define CTR_BPU_2 (1 << 4)
-#define CTR_MS_2 (1 << 5)
-#define CTR_FLAME_2 (1 << 6)
-#define CTR_IQ_5 (1 << 7)
-
-static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = {
- { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 },
- { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 },
- { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 },
- { CTR_IQ_4, MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_CCCR4 },
- { CTR_BPU_2, MSR_P4_BPU_PERFCTR2, MSR_P4_BPU_CCCR2 },
- { CTR_MS_2, MSR_P4_MS_PERFCTR2, MSR_P4_MS_CCCR2 },
- { CTR_FLAME_2, MSR_P4_FLAME_PERFCTR2, MSR_P4_FLAME_CCCR2 },
- { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 }
-};
-
-#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT)
-
-/* p4 event codes in libop/op_event.h are indices into this table. */
-
-static struct p4_event_binding p4_events[NUM_EVENTS] = {
-
- { /* BRANCH_RETIRED */
- 0x05, 0x06,
- { {CTR_IQ_4, MSR_P4_CRU_ESCR2},
- {CTR_IQ_5, MSR_P4_CRU_ESCR3} }
- },
-
- { /* MISPRED_BRANCH_RETIRED */
- 0x04, 0x03,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
- { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
- },
-
- { /* TC_DELIVER_MODE */
- 0x01, 0x01,
- { { CTR_MS_0, MSR_P4_TC_ESCR0},
- { CTR_MS_2, MSR_P4_TC_ESCR1} }
- },
-
- { /* BPU_FETCH_REQUEST */
- 0x00, 0x03,
- { { CTR_BPU_0, MSR_P4_BPU_ESCR0},
- { CTR_BPU_2, MSR_P4_BPU_ESCR1} }
- },
-
- { /* ITLB_REFERENCE */
- 0x03, 0x18,
- { { CTR_BPU_0, MSR_P4_ITLB_ESCR0},
- { CTR_BPU_2, MSR_P4_ITLB_ESCR1} }
- },
-
- { /* MEMORY_CANCEL */
- 0x05, 0x02,
- { { CTR_FLAME_0, MSR_P4_DAC_ESCR0},
- { CTR_FLAME_2, MSR_P4_DAC_ESCR1} }
- },
-
- { /* MEMORY_COMPLETE */
- 0x02, 0x08,
- { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
- { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
- },
-
- { /* LOAD_PORT_REPLAY */
- 0x02, 0x04,
- { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
- { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
- },
-
- { /* STORE_PORT_REPLAY */
- 0x02, 0x05,
- { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
- { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
- },
-
- { /* MOB_LOAD_REPLAY */
- 0x02, 0x03,
- { { CTR_BPU_0, MSR_P4_MOB_ESCR0},
- { CTR_BPU_2, MSR_P4_MOB_ESCR1} }
- },
-
- { /* PAGE_WALK_TYPE */
- 0x04, 0x01,
- { { CTR_BPU_0, MSR_P4_PMH_ESCR0},
- { CTR_BPU_2, MSR_P4_PMH_ESCR1} }
- },
-
- { /* BSQ_CACHE_REFERENCE */
- 0x07, 0x0c,
- { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
- { CTR_BPU_2, MSR_P4_BSU_ESCR1} }
- },
-
- { /* IOQ_ALLOCATION */
- 0x06, 0x03,
- { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
- { 0, 0 } }
- },
-
- { /* IOQ_ACTIVE_ENTRIES */
- 0x06, 0x1a,
- { { CTR_BPU_2, MSR_P4_FSB_ESCR1},
- { 0, 0 } }
- },
-
- { /* FSB_DATA_ACTIVITY */
- 0x06, 0x17,
- { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
- { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
- },
-
- { /* BSQ_ALLOCATION */
- 0x07, 0x05,
- { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
- { 0, 0 } }
- },
-
- { /* BSQ_ACTIVE_ENTRIES */
- 0x07, 0x06,
- { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
- { 0, 0 } }
- },
-
- { /* X87_ASSIST */
- 0x05, 0x03,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
- { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
- },
-
- { /* SSE_INPUT_ASSIST */
- 0x01, 0x34,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* PACKED_SP_UOP */
- 0x01, 0x08,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* PACKED_DP_UOP */
- 0x01, 0x0c,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* SCALAR_SP_UOP */
- 0x01, 0x0a,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* SCALAR_DP_UOP */
- 0x01, 0x0e,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* 64BIT_MMX_UOP */
- 0x01, 0x02,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* 128BIT_MMX_UOP */
- 0x01, 0x1a,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* X87_FP_UOP */
- 0x01, 0x04,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* X87_SIMD_MOVES_UOP */
- 0x01, 0x2e,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* MACHINE_CLEAR */
- 0x05, 0x02,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
- { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
- },
-
- { /* GLOBAL_POWER_EVENTS */
- 0x06, 0x13 /* older manual says 0x05, newer 0x13 */,
- { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
- { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
- },
-
- { /* TC_MS_XFER */
- 0x00, 0x05,
- { { CTR_MS_0, MSR_P4_MS_ESCR0},
- { CTR_MS_2, MSR_P4_MS_ESCR1} }
- },
-
- { /* UOP_QUEUE_WRITES */
- 0x00, 0x09,
- { { CTR_MS_0, MSR_P4_MS_ESCR0},
- { CTR_MS_2, MSR_P4_MS_ESCR1} }
- },
-
- { /* FRONT_END_EVENT */
- 0x05, 0x08,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
- { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
- },
-
- { /* EXECUTION_EVENT */
- 0x05, 0x0c,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
- { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
- },
-
- { /* REPLAY_EVENT */
- 0x05, 0x09,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
- { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
- },
-
- { /* INSTR_RETIRED */
- 0x04, 0x02,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
- { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
- },
-
- { /* UOPS_RETIRED */
- 0x04, 0x01,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
- { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
- },
-
- { /* UOP_TYPE */
- 0x02, 0x02,
- { { CTR_IQ_4, MSR_P4_RAT_ESCR0},
- { CTR_IQ_5, MSR_P4_RAT_ESCR1} }
- },
-
- { /* RETIRED_MISPRED_BRANCH_TYPE */
- 0x02, 0x05,
- { { CTR_MS_0, MSR_P4_TBPU_ESCR0},
- { CTR_MS_2, MSR_P4_TBPU_ESCR1} }
- },
-
- { /* RETIRED_BRANCH_TYPE */
- 0x02, 0x04,
- { { CTR_MS_0, MSR_P4_TBPU_ESCR0},
- { CTR_MS_2, MSR_P4_TBPU_ESCR1} }
- }
-};
-
-
-#define MISC_PMC_ENABLED_P(x) ((x) & 1 << 7)
-
-#define ESCR_RESERVED_BITS 0x80000003
-#define ESCR_CLEAR(escr) ((escr) &= ESCR_RESERVED_BITS)
-#define ESCR_SET_USR_0(escr, usr) ((escr) |= (((usr) & 1) << 2))
-#define ESCR_SET_OS_0(escr, os) ((escr) |= (((os) & 1) << 3))
-#define ESCR_SET_USR_1(escr, usr) ((escr) |= (((usr) & 1)))
-#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1))
-#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25))
-#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9))
-
-#define CCCR_RESERVED_BITS 0x38030FFF
-#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS)
-#define CCCR_SET_REQUIRED_BITS(cccr) ((cccr) |= 0x00030000)
-#define CCCR_SET_ESCR_SELECT(cccr, sel) ((cccr) |= (((sel) & 0x07) << 13))
-#define CCCR_SET_PMI_OVF_0(cccr) ((cccr) |= (1<<26))
-#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27))
-#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12))
-#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12))
-#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31))
-#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31)))
-
-
-/* this assigns a "stagger" to the current CPU, which is used throughout
- the code in this module as an extra array offset, to select the "even"
- or "odd" part of all the divided resources. */
-static unsigned int get_stagger(void)
-{
-#ifdef CONFIG_SMP
- int cpu = smp_processor_id();
- return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map));
-#endif
- return 0;
-}
-
-
-/* finally, mediate access to a real hardware counter
- by passing a "virtual" counter numer to this macro,
- along with your stagger setting. */
-#define VIRT_CTR(stagger, i) ((i) + ((num_counters) * (stagger)))
-
-static unsigned long reset_value[NUM_COUNTERS_NON_HT];
-
-static void p4_shutdown(struct op_msrs const * const msrs)
-{
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (msrs->counters[i].addr)
- release_perfctr_nmi(msrs->counters[i].addr);
- }
- /*
- * some of the control registers are specially reserved in
- * conjunction with the counter registers (hence the starting offset).
- * This saves a few bits.
- */
- for (i = num_counters; i < num_controls; ++i) {
- if (msrs->controls[i].addr)
- release_evntsel_nmi(msrs->controls[i].addr);
- }
-}
-
-static int p4_fill_in_addresses(struct op_msrs * const msrs)
-{
- unsigned int i;
- unsigned int addr, cccraddr, stag;
-
- setup_num_counters();
- stag = get_stagger();
-
- /* the counter & cccr registers we pay attention to */
- for (i = 0; i < num_counters; ++i) {
- addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
- cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address;
- if (reserve_perfctr_nmi(addr)) {
- msrs->counters[i].addr = addr;
- msrs->controls[i].addr = cccraddr;
- }
- }
-
- /* 43 ESCR registers in three or four discontiguous group */
- for (addr = MSR_P4_BSU_ESCR0 + stag;
- addr < MSR_P4_IQ_ESCR0; ++i, addr += addr_increment()) {
- if (reserve_evntsel_nmi(addr))
- msrs->controls[i].addr = addr;
- }
-
- /* no IQ_ESCR0/1 on some models, we save a seconde time BSU_ESCR0/1
- * to avoid special case in nmi_{save|restore}_registers() */
- if (boot_cpu_data.x86_model >= 0x3) {
- for (addr = MSR_P4_BSU_ESCR0 + stag;
- addr <= MSR_P4_BSU_ESCR1; ++i, addr += addr_increment()) {
- if (reserve_evntsel_nmi(addr))
- msrs->controls[i].addr = addr;
- }
- } else {
- for (addr = MSR_P4_IQ_ESCR0 + stag;
- addr <= MSR_P4_IQ_ESCR1; ++i, addr += addr_increment()) {
- if (reserve_evntsel_nmi(addr))
- msrs->controls[i].addr = addr;
- }
- }
-
- for (addr = MSR_P4_RAT_ESCR0 + stag;
- addr <= MSR_P4_SSU_ESCR0; ++i, addr += addr_increment()) {
- if (reserve_evntsel_nmi(addr))
- msrs->controls[i].addr = addr;
- }
-
- for (addr = MSR_P4_MS_ESCR0 + stag;
- addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
- if (reserve_evntsel_nmi(addr))
- msrs->controls[i].addr = addr;
- }
-
- for (addr = MSR_P4_IX_ESCR0 + stag;
- addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
- if (reserve_evntsel_nmi(addr))
- msrs->controls[i].addr = addr;
- }
-
- /* there are 2 remaining non-contiguously located ESCRs */
-
- if (num_counters == NUM_COUNTERS_NON_HT) {
- /* standard non-HT CPUs handle both remaining ESCRs*/
- if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5))
- msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
- if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4))
- msrs->controls[i++].addr = MSR_P4_CRU_ESCR4;
-
- } else if (stag == 0) {
- /* HT CPUs give the first remainder to the even thread, as
- the 32nd control register */
- if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4))
- msrs->controls[i++].addr = MSR_P4_CRU_ESCR4;
-
- } else {
- /* and two copies of the second to the odd thread,
- for the 22st and 23nd control registers */
- if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) {
- msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
- msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
- }
- }
-
- for (i = 0; i < num_counters; ++i) {
- if (!counter_config[i].enabled)
- continue;
- if (msrs->controls[i].addr)
- continue;
- op_x86_warn_reserved(i);
- p4_shutdown(msrs);
- return -EBUSY;
- }
-
- return 0;
-}
-
-
-static void pmc_setup_one_p4_counter(unsigned int ctr)
-{
- int i;
- int const maxbind = 2;
- unsigned int cccr = 0;
- unsigned int escr = 0;
- unsigned int high = 0;
- unsigned int counter_bit;
- struct p4_event_binding *ev = NULL;
- unsigned int stag;
-
- stag = get_stagger();
-
- /* convert from counter *number* to counter *bit* */
- counter_bit = 1 << VIRT_CTR(stag, ctr);
-
- /* find our event binding structure. */
- if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) {
- printk(KERN_ERR
- "oprofile: P4 event code 0x%lx out of range\n",
- counter_config[ctr].event);
- return;
- }
-
- ev = &(p4_events[counter_config[ctr].event - 1]);
-
- for (i = 0; i < maxbind; i++) {
- if (ev->bindings[i].virt_counter & counter_bit) {
-
- /* modify ESCR */
- rdmsr(ev->bindings[i].escr_address, escr, high);
- ESCR_CLEAR(escr);
- if (stag == 0) {
- ESCR_SET_USR_0(escr, counter_config[ctr].user);
- ESCR_SET_OS_0(escr, counter_config[ctr].kernel);
- } else {
- ESCR_SET_USR_1(escr, counter_config[ctr].user);
- ESCR_SET_OS_1(escr, counter_config[ctr].kernel);
- }
- ESCR_SET_EVENT_SELECT(escr, ev->event_select);
- ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
- wrmsr(ev->bindings[i].escr_address, escr, high);
-
- /* modify CCCR */
- rdmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address,
- cccr, high);
- CCCR_CLEAR(cccr);
- CCCR_SET_REQUIRED_BITS(cccr);
- CCCR_SET_ESCR_SELECT(cccr, ev->escr_select);
- if (stag == 0)
- CCCR_SET_PMI_OVF_0(cccr);
- else
- CCCR_SET_PMI_OVF_1(cccr);
- wrmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address,
- cccr, high);
- return;
- }
- }
-
- printk(KERN_ERR
- "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n",
- counter_config[ctr].event, stag, ctr);
-}
-
-
-static void p4_setup_ctrs(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs)
-{
- unsigned int i;
- unsigned int low, high;
- unsigned int stag;
-
- stag = get_stagger();
-
- rdmsr(MSR_IA32_MISC_ENABLE, low, high);
- if (!MISC_PMC_ENABLED_P(low)) {
- printk(KERN_ERR "oprofile: P4 PMC not available\n");
- return;
- }
-
- /* clear the cccrs we will use */
- for (i = 0; i < num_counters; i++) {
- if (unlikely(!msrs->controls[i].addr))
- continue;
- rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
- CCCR_CLEAR(low);
- CCCR_SET_REQUIRED_BITS(low);
- wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
- }
-
- /* clear all escrs (including those outside our concern) */
- for (i = num_counters; i < num_controls; i++) {
- if (unlikely(!msrs->controls[i].addr))
- continue;
- wrmsr(msrs->controls[i].addr, 0, 0);
- }
-
- /* setup all counters */
- for (i = 0; i < num_counters; ++i) {
- if (counter_config[i].enabled && msrs->controls[i].addr) {
- reset_value[i] = counter_config[i].count;
- pmc_setup_one_p4_counter(i);
- wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address,
- -(u64)counter_config[i].count);
- } else {
- reset_value[i] = 0;
- }
- }
-}
-
-
-static int p4_check_ctrs(struct pt_regs * const regs,
- struct op_msrs const * const msrs)
-{
- unsigned long ctr, low, high, stag, real;
- int i;
-
- stag = get_stagger();
-
- for (i = 0; i < num_counters; ++i) {
-
- if (!reset_value[i])
- continue;
-
- /*
- * there is some eccentricity in the hardware which
- * requires that we perform 2 extra corrections:
- *
- * - check both the CCCR:OVF flag for overflow and the
- * counter high bit for un-flagged overflows.
- *
- * - write the counter back twice to ensure it gets
- * updated properly.
- *
- * the former seems to be related to extra NMIs happening
- * during the current NMI; the latter is reported as errata
- * N15 in intel doc 249199-029, pentium 4 specification
- * update, though their suggested work-around does not
- * appear to solve the problem.
- */
-
- real = VIRT_CTR(stag, i);
-
- rdmsr(p4_counters[real].cccr_address, low, high);
- rdmsr(p4_counters[real].counter_address, ctr, high);
- if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) {
- oprofile_add_sample(regs, i);
- wrmsrl(p4_counters[real].counter_address,
- -(u64)reset_value[i]);
- CCCR_CLEAR_OVF(low);
- wrmsr(p4_counters[real].cccr_address, low, high);
- wrmsrl(p4_counters[real].counter_address,
- -(u64)reset_value[i]);
- }
- }
-
- /* P4 quirk: you have to re-unmask the apic vector */
- apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
-
- /* See op_model_ppro.c */
- return 1;
-}
-
-
-static void p4_start(struct op_msrs const * const msrs)
-{
- unsigned int low, high, stag;
- int i;
-
- stag = get_stagger();
-
- for (i = 0; i < num_counters; ++i) {
- if (!reset_value[i])
- continue;
- rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
- CCCR_SET_ENABLE(low);
- wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
- }
-}
-
-
-static void p4_stop(struct op_msrs const * const msrs)
-{
- unsigned int low, high, stag;
- int i;
-
- stag = get_stagger();
-
- for (i = 0; i < num_counters; ++i) {
- if (!reset_value[i])
- continue;
- rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
- CCCR_SET_DISABLE(low);
- wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
- }
-}
-
-#ifdef CONFIG_SMP
-struct op_x86_model_spec op_p4_ht2_spec = {
- .num_counters = NUM_COUNTERS_HT2,
- .num_controls = NUM_CONTROLS_HT2,
- .fill_in_addresses = &p4_fill_in_addresses,
- .setup_ctrs = &p4_setup_ctrs,
- .check_ctrs = &p4_check_ctrs,
- .start = &p4_start,
- .stop = &p4_stop,
- .shutdown = &p4_shutdown
-};
-#endif
-
-struct op_x86_model_spec op_p4_spec = {
- .num_counters = NUM_COUNTERS_NON_HT,
- .num_controls = NUM_CONTROLS_NON_HT,
- .fill_in_addresses = &p4_fill_in_addresses,
- .setup_ctrs = &p4_setup_ctrs,
- .check_ctrs = &p4_check_ctrs,
- .start = &p4_start,
- .stop = &p4_stop,
- .shutdown = &p4_shutdown
-};
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
deleted file mode 100644
index 7913b6921959..000000000000
--- a/arch/x86/oprofile/op_model_ppro.c
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * @file op_model_ppro.h
- * Family 6 perfmon and architectural perfmon MSR operations
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Copyright 2008 Intel Corporation
- * @remark Read the file COPYING
- *
- * @author John Levon
- * @author Philippe Elie
- * @author Graydon Hoare
- * @author Andi Kleen
- * @author Robert Richter <robert.richter@amd.com>
- */
-
-#include <linux/oprofile.h>
-#include <linux/slab.h>
-#include <asm/ptrace.h>
-#include <asm/msr.h>
-#include <asm/apic.h>
-#include <asm/nmi.h>
-
-#include "op_x86_model.h"
-#include "op_counter.h"
-
-static int num_counters = 2;
-static int counter_width = 32;
-
-#define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21))
-
-static u64 reset_value[OP_MAX_COUNTER];
-
-static void ppro_shutdown(struct op_msrs const * const msrs)
-{
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (!msrs->counters[i].addr)
- continue;
- release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
- release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
- }
-}
-
-static int ppro_fill_in_addresses(struct op_msrs * const msrs)
-{
- int i;
-
- for (i = 0; i < num_counters; i++) {
- if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
- goto fail;
- if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) {
- release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
- goto fail;
- }
- /* both registers must be reserved */
- msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
- msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
- continue;
- fail:
- if (!counter_config[i].enabled)
- continue;
- op_x86_warn_reserved(i);
- ppro_shutdown(msrs);
- return -EBUSY;
- }
-
- return 0;
-}
-
-
-static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- if (boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
- union cpuid10_eax eax;
- eax.full = cpuid_eax(0xa);
-
- /*
- * For Core2 (family 6, model 15), don't reset the
- * counter width:
- */
- if (!(eax.split.version_id == 0 &&
- __this_cpu_read(cpu_info.x86) == 6 &&
- __this_cpu_read(cpu_info.x86_model) == 15)) {
-
- if (counter_width < eax.split.bit_width)
- counter_width = eax.split.bit_width;
- }
- }
-
- /* clear all counters */
- for (i = 0; i < num_counters; ++i) {
- if (!msrs->controls[i].addr)
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
- op_x86_warn_in_use(i);
- val &= model->reserved;
- wrmsrl(msrs->controls[i].addr, val);
- /*
- * avoid a false detection of ctr overflows in NMI *
- * handler
- */
- wrmsrl(msrs->counters[i].addr, -1LL);
- }
-
- /* enable active counters */
- for (i = 0; i < num_counters; ++i) {
- if (counter_config[i].enabled && msrs->counters[i].addr) {
- reset_value[i] = counter_config[i].count;
- wrmsrl(msrs->counters[i].addr, -reset_value[i]);
- rdmsrl(msrs->controls[i].addr, val);
- val &= model->reserved;
- val |= op_x86_get_ctrl(model, &counter_config[i]);
- wrmsrl(msrs->controls[i].addr, val);
- } else {
- reset_value[i] = 0;
- }
- }
-}
-
-
-static int ppro_check_ctrs(struct pt_regs * const regs,
- struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (!reset_value[i])
- continue;
- rdmsrl(msrs->counters[i].addr, val);
- if (val & (1ULL << (counter_width - 1)))
- continue;
- oprofile_add_sample(regs, i);
- wrmsrl(msrs->counters[i].addr, -reset_value[i]);
- }
-
- /* Only P6 based Pentium M need to re-unmask the apic vector but it
- * doesn't hurt other P6 variant */
- apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
-
- /* We can't work out if we really handled an interrupt. We
- * might have caught a *second* counter just after overflowing
- * the interrupt for this counter then arrives
- * and we don't find a counter that's overflowed, so we
- * would return 0 and get dazed + confused. Instead we always
- * assume we found an overflow. This sucks.
- */
- return 1;
-}
-
-
-static void ppro_start(struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (reset_value[i]) {
- rdmsrl(msrs->controls[i].addr, val);
- val |= ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(msrs->controls[i].addr, val);
- }
- }
-}
-
-
-static void ppro_stop(struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (!reset_value[i])
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(msrs->controls[i].addr, val);
- }
-}
-
-struct op_x86_model_spec op_ppro_spec = {
- .num_counters = 2,
- .num_controls = 2,
- .reserved = MSR_PPRO_EVENTSEL_RESERVED,
- .fill_in_addresses = &ppro_fill_in_addresses,
- .setup_ctrs = &ppro_setup_ctrs,
- .check_ctrs = &ppro_check_ctrs,
- .start = &ppro_start,
- .stop = &ppro_stop,
- .shutdown = &ppro_shutdown
-};
-
-/*
- * Architectural performance monitoring.
- *
- * Newer Intel CPUs (Core1+) have support for architectural
- * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details.
- * The advantage of this is that it can be done without knowing about
- * the specific CPU.
- */
-
-static void arch_perfmon_setup_counters(void)
-{
- union cpuid10_eax eax;
-
- eax.full = cpuid_eax(0xa);
-
- /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */
- if (eax.split.version_id == 0 && boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == 15) {
- eax.split.version_id = 2;
- eax.split.num_counters = 2;
- eax.split.bit_width = 40;
- }
-
- num_counters = min((int)eax.split.num_counters, OP_MAX_COUNTER);
-
- op_arch_perfmon_spec.num_counters = num_counters;
- op_arch_perfmon_spec.num_controls = num_counters;
-}
-
-static int arch_perfmon_init(struct oprofile_operations *ignore)
-{
- arch_perfmon_setup_counters();
- return 0;
-}
-
-struct op_x86_model_spec op_arch_perfmon_spec = {
- .reserved = MSR_PPRO_EVENTSEL_RESERVED,
- .init = &arch_perfmon_init,
- /* num_counters/num_controls filled in at runtime */
- .fill_in_addresses = &ppro_fill_in_addresses,
- /* user space does the cpuid check for available events */
- .setup_ctrs = &ppro_setup_ctrs,
- .check_ctrs = &ppro_check_ctrs,
- .start = &ppro_start,
- .stop = &ppro_stop,
- .shutdown = &ppro_shutdown
-};
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
deleted file mode 100644
index 276cf79b5d24..000000000000
--- a/arch/x86/oprofile/op_x86_model.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/**
- * @file op_x86_model.h
- * interface to x86 model-specific MSR operations
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author Graydon Hoare
- * @author Robert Richter <robert.richter@amd.com>
- */
-
-#ifndef OP_X86_MODEL_H
-#define OP_X86_MODEL_H
-
-#include <asm/types.h>
-#include <asm/perf_event.h>
-
-struct op_msr {
- unsigned long addr;
- u64 saved;
-};
-
-struct op_msrs {
- struct op_msr *counters;
- struct op_msr *controls;
- struct op_msr *multiplex;
-};
-
-struct pt_regs;
-
-struct oprofile_operations;
-
-/* The model vtable abstracts the differences between
- * various x86 CPU models' perfctr support.
- */
-struct op_x86_model_spec {
- unsigned int num_counters;
- unsigned int num_controls;
- unsigned int num_virt_counters;
- u64 reserved;
- u16 event_mask;
- int (*init)(struct oprofile_operations *ops);
- int (*fill_in_addresses)(struct op_msrs * const msrs);
- void (*setup_ctrs)(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs);
- int (*check_ctrs)(struct pt_regs * const regs,
- struct op_msrs const * const msrs);
- void (*start)(struct op_msrs const * const msrs);
- void (*stop)(struct op_msrs const * const msrs);
- void (*shutdown)(struct op_msrs const * const msrs);
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
- void (*switch_ctrl)(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs);
-#endif
-};
-
-struct op_counter_config;
-
-static inline void op_x86_warn_in_use(int counter)
-{
- /*
- * The warning indicates an already running counter. If
- * oprofile doesn't collect data, then try using a different
- * performance counter on your platform to monitor the desired
- * event. Delete counter #%d from the desired event by editing
- * the /usr/share/oprofile/%s/<cpu>/events file. If the event
- * cannot be monitored by any other counter, contact your
- * hardware or BIOS vendor.
- */
- pr_warn("oprofile: counter #%d on cpu #%d may already be used\n",
- counter, smp_processor_id());
-}
-
-static inline void op_x86_warn_reserved(int counter)
-{
- pr_warn("oprofile: counter #%d is already reserved\n", counter);
-}
-
-extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
- struct op_counter_config *counter_config);
-extern int op_x86_phys_to_virt(int phys);
-extern int op_x86_virt_to_phys(int virt);
-
-extern struct op_x86_model_spec op_ppro_spec;
-extern struct op_x86_model_spec op_p4_spec;
-extern struct op_x86_model_spec op_p4_ht2_spec;
-extern struct op_x86_model_spec op_amd_spec;
-extern struct op_x86_model_spec op_arch_perfmon_spec;
-
-#endif /* OP_X86_MODEL_H */
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
index 00bfa1ebad6c..0bb3b8b44e4e 100644
--- a/arch/x86/pci/init.c
+++ b/arch/x86/pci/init.c
@@ -9,16 +9,23 @@
in the right sequence from here. */
static __init int pci_arch_init(void)
{
- int type;
-
- x86_create_pci_msi_domain();
+ int type, pcbios = 1;
type = pci_direct_probe();
if (!(pci_probe & PCI_PROBE_NOEARLY))
pci_mmcfg_early_init();
- if (x86_init.pci.arch_init && !x86_init.pci.arch_init())
+ if (x86_init.pci.arch_init)
+ pcbios = x86_init.pci.arch_init();
+
+ /*
+ * Must happen after x86_init.pci.arch_init(). Xen sets up the
+ * x86_init.irqs.create_pci_msi_domain there.
+ */
+ x86_create_pci_msi_domain();
+
+ if (!pcbios)
return 0;
pci_pcbios_init();
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index d0e835470d01..b2f90a1a89f1 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -4,7 +4,6 @@ obj-y += atom/
obj-y += ce4100/
obj-y += efi/
obj-y += geode/
-obj-y += goldfish/
obj-y += iris/
obj-y += intel/
obj-y += intel-mid/
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 8efd003540ca..1b82d77019b1 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -54,10 +54,7 @@
* 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
*/
static u64 efi_va = EFI_VA_START;
-
-struct efi_scratch efi_scratch;
-
-EXPORT_SYMBOL_GPL(efi_mm);
+static struct mm_struct *efi_prev_mm;
/*
* We need our own copy of the higher levels of the page tables
@@ -237,7 +234,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
return 1;
}
- efi_scratch.phys_stack = page_to_phys(page + 1); /* stack grows down */
+ efi_mixed_mode_stack_pa = page_to_phys(page + 1); /* stack grows down */
npages = (_etext - _text) >> PAGE_SHIFT;
text = __pa(_text);
@@ -462,11 +459,17 @@ void __init efi_dump_pagetable(void)
* can not change under us.
* It should be ensured that there are no concurent calls to this function.
*/
-void efi_switch_mm(struct mm_struct *mm)
+void efi_enter_mm(void)
+{
+ efi_prev_mm = current->active_mm;
+ current->active_mm = &efi_mm;
+ switch_mm(efi_prev_mm, &efi_mm, NULL);
+}
+
+void efi_leave_mm(void)
{
- efi_scratch.prev_mm = current->active_mm;
- current->active_mm = mm;
- switch_mm(efi_scratch.prev_mm, mm, NULL);
+ current->active_mm = efi_prev_mm;
+ switch_mm(&efi_mm, efi_prev_mm, NULL);
}
static DEFINE_SPINLOCK(efi_runtime_lock);
@@ -530,12 +533,12 @@ efi_thunk_set_virtual_address_map(unsigned long memory_map_size,
efi_sync_low_kernel_mappings();
local_irq_save(flags);
- efi_switch_mm(&efi_mm);
+ efi_enter_mm();
status = __efi_thunk(set_virtual_address_map, memory_map_size,
descriptor_size, descriptor_version, virtual_map);
- efi_switch_mm(efi_scratch.prev_mm);
+ efi_leave_mm();
local_irq_restore(flags);
return status;
@@ -829,9 +832,9 @@ efi_set_virtual_address_map(unsigned long memory_map_size,
descriptor_size,
descriptor_version,
virtual_map);
- efi_switch_mm(&efi_mm);
+ efi_enter_mm();
- kernel_fpu_begin();
+ efi_fpu_begin();
/* Disable interrupts around EFI calls: */
local_irq_save(flags);
@@ -840,12 +843,12 @@ efi_set_virtual_address_map(unsigned long memory_map_size,
descriptor_version, virtual_map);
local_irq_restore(flags);
- kernel_fpu_end();
+ efi_fpu_end();
/* grab the virtually remapped EFI runtime services table pointer */
efi.runtime = READ_ONCE(systab->runtime);
- efi_switch_mm(efi_scratch.prev_mm);
+ efi_leave_mm();
return status;
}
diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S
index 26f0da238c1c..fd3dd1708eba 100644
--- a/arch/x86/platform/efi/efi_thunk_64.S
+++ b/arch/x86/platform/efi/efi_thunk_64.S
@@ -33,7 +33,7 @@ SYM_CODE_START(__efi64_thunk)
* Switch to 1:1 mapped 32-bit stack pointer.
*/
movq %rsp, %rax
- movq efi_scratch(%rip), %rsp
+ movq efi_mixed_mode_stack_pa(%rip), %rsp
push %rax
/*
@@ -70,3 +70,7 @@ SYM_CODE_START(__efi64_thunk)
pushl %ebp
lret
SYM_CODE_END(__efi64_thunk)
+
+ .bss
+ .balign 8
+SYM_DATA(efi_mixed_mode_stack_pa, .quad 0)
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 5a40fe411ebd..67d93a243c35 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -687,15 +687,25 @@ int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff,
* @return: Returns, if the page fault is not handled. This function
* will never return if the page fault is handled successfully.
*/
-void efi_recover_from_page_fault(unsigned long phys_addr)
+void efi_crash_gracefully_on_page_fault(unsigned long phys_addr)
{
if (!IS_ENABLED(CONFIG_X86_64))
return;
/*
+ * If we get an interrupt/NMI while processing an EFI runtime service
+ * then this is a regular OOPS, not an EFI failure.
+ */
+ if (in_interrupt())
+ return;
+
+ /*
* Make sure that an efi runtime service caused the page fault.
+ * READ_ONCE() because we might be OOPSing in a different thread,
+ * and we don't want to trip KTSAN while trying to OOPS.
*/
- if (efi_rts_work.efi_rts_id == EFI_NONE)
+ if (READ_ONCE(efi_rts_work.efi_rts_id) == EFI_NONE ||
+ current_work() != &efi_rts_work.work)
return;
/*
@@ -747,6 +757,4 @@ void efi_recover_from_page_fault(unsigned long phys_addr)
set_current_state(TASK_IDLE);
schedule();
}
-
- return;
}
diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c
index c33f744b5388..b39bf3b5e108 100644
--- a/arch/x86/platform/geode/alix.c
+++ b/arch/x86/platform/geode/alix.c
@@ -22,6 +22,7 @@
#include <linux/platform_device.h>
#include <linux/input.h>
#include <linux/gpio_keys.h>
+#include <linux/gpio/machine.h>
#include <linux/dmi.h>
#include <asm/geode.h>
@@ -69,21 +70,15 @@ static struct platform_device alix_buttons_dev = {
static struct gpio_led alix_leds[] = {
{
.name = "alix:1",
- .gpio = 6,
.default_trigger = "default-on",
- .active_low = 1,
},
{
.name = "alix:2",
- .gpio = 25,
.default_trigger = "default-off",
- .active_low = 1,
},
{
.name = "alix:3",
- .gpio = 27,
.default_trigger = "default-off",
- .active_low = 1,
},
};
@@ -92,6 +87,17 @@ static struct gpio_led_platform_data alix_leds_data = {
.leds = alix_leds,
};
+static struct gpiod_lookup_table alix_leds_gpio_table = {
+ .dev_id = "leds-gpio",
+ .table = {
+ /* The Geode GPIOs should be on the CS5535 companion chip */
+ GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_LOW),
+ GPIO_LOOKUP_IDX("cs5535-gpio", 25, NULL, 1, GPIO_ACTIVE_LOW),
+ GPIO_LOOKUP_IDX("cs5535-gpio", 27, NULL, 2, GPIO_ACTIVE_LOW),
+ { }
+ },
+};
+
static struct platform_device alix_leds_dev = {
.name = "leds-gpio",
.id = -1,
@@ -106,6 +112,7 @@ static struct platform_device *alix_devs[] __initdata = {
static void __init register_alix(void)
{
/* Setup LED control through leds-gpio driver */
+ gpiod_add_lookup_table(&alix_leds_gpio_table);
platform_add_devices(alix_devs, ARRAY_SIZE(alix_devs));
}
diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c
index 73a3f49b4eb6..d263528c90bb 100644
--- a/arch/x86/platform/geode/geos.c
+++ b/arch/x86/platform/geode/geos.c
@@ -20,6 +20,7 @@
#include <linux/platform_device.h>
#include <linux/input.h>
#include <linux/gpio_keys.h>
+#include <linux/gpio/machine.h>
#include <linux/dmi.h>
#include <asm/geode.h>
@@ -53,21 +54,15 @@ static struct platform_device geos_buttons_dev = {
static struct gpio_led geos_leds[] = {
{
.name = "geos:1",
- .gpio = 6,
.default_trigger = "default-on",
- .active_low = 1,
},
{
.name = "geos:2",
- .gpio = 25,
.default_trigger = "default-off",
- .active_low = 1,
},
{
.name = "geos:3",
- .gpio = 27,
.default_trigger = "default-off",
- .active_low = 1,
},
};
@@ -76,6 +71,17 @@ static struct gpio_led_platform_data geos_leds_data = {
.leds = geos_leds,
};
+static struct gpiod_lookup_table geos_leds_gpio_table = {
+ .dev_id = "leds-gpio",
+ .table = {
+ /* The Geode GPIOs should be on the CS5535 companion chip */
+ GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_LOW),
+ GPIO_LOOKUP_IDX("cs5535-gpio", 25, NULL, 1, GPIO_ACTIVE_LOW),
+ GPIO_LOOKUP_IDX("cs5535-gpio", 27, NULL, 2, GPIO_ACTIVE_LOW),
+ { }
+ },
+};
+
static struct platform_device geos_leds_dev = {
.name = "leds-gpio",
.id = -1,
@@ -90,6 +96,7 @@ static struct platform_device *geos_devs[] __initdata = {
static void __init register_geos(void)
{
/* Setup LED control through leds-gpio driver */
+ gpiod_add_lookup_table(&geos_leds_gpio_table);
platform_add_devices(geos_devs, ARRAY_SIZE(geos_devs));
}
diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c
index 163e1b545517..558384acd777 100644
--- a/arch/x86/platform/geode/net5501.c
+++ b/arch/x86/platform/geode/net5501.c
@@ -20,6 +20,7 @@
#include <linux/platform_device.h>
#include <linux/input.h>
#include <linux/gpio_keys.h>
+#include <linux/gpio/machine.h>
#include <asm/geode.h>
@@ -55,9 +56,7 @@ static struct platform_device net5501_buttons_dev = {
static struct gpio_led net5501_leds[] = {
{
.name = "net5501:1",
- .gpio = 6,
.default_trigger = "default-on",
- .active_low = 0,
},
};
@@ -66,6 +65,15 @@ static struct gpio_led_platform_data net5501_leds_data = {
.leds = net5501_leds,
};
+static struct gpiod_lookup_table net5501_leds_gpio_table = {
+ .dev_id = "leds-gpio",
+ .table = {
+ /* The Geode GPIOs should be on the CS5535 companion chip */
+ GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_HIGH),
+ { }
+ },
+};
+
static struct platform_device net5501_leds_dev = {
.name = "leds-gpio",
.id = -1,
@@ -80,6 +88,7 @@ static struct platform_device *net5501_devs[] __initdata = {
static void __init register_net5501(void)
{
/* Setup LED control through leds-gpio driver */
+ gpiod_add_lookup_table(&net5501_leds_gpio_table);
platform_add_devices(net5501_devs, ARRAY_SIZE(net5501_devs));
}
diff --git a/arch/x86/platform/goldfish/Makefile b/arch/x86/platform/goldfish/Makefile
deleted file mode 100644
index 072c395379ac..000000000000
--- a/arch/x86/platform/goldfish/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_GOLDFISH) += goldfish.o
diff --git a/arch/x86/platform/goldfish/goldfish.c b/arch/x86/platform/goldfish/goldfish.c
deleted file mode 100644
index 6b6f8b4360dd..000000000000
--- a/arch/x86/platform/goldfish/goldfish.c
+++ /dev/null
@@ -1,54 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2007 Google, Inc.
- * Copyright (C) 2011 Intel, Inc.
- * Copyright (C) 2013 Intel, Inc.
- */
-
-#include <linux/kernel.h>
-#include <linux/irq.h>
-#include <linux/platform_device.h>
-
-/*
- * Where in virtual device memory the IO devices (timers, system controllers
- * and so on)
- */
-
-#define GOLDFISH_PDEV_BUS_BASE (0xff001000)
-#define GOLDFISH_PDEV_BUS_END (0xff7fffff)
-#define GOLDFISH_PDEV_BUS_IRQ (4)
-
-#define GOLDFISH_TTY_BASE (0x2000)
-
-static struct resource goldfish_pdev_bus_resources[] = {
- {
- .start = GOLDFISH_PDEV_BUS_BASE,
- .end = GOLDFISH_PDEV_BUS_END,
- .flags = IORESOURCE_MEM,
- },
- {
- .start = GOLDFISH_PDEV_BUS_IRQ,
- .end = GOLDFISH_PDEV_BUS_IRQ,
- .flags = IORESOURCE_IRQ,
- }
-};
-
-static bool goldfish_enable __initdata;
-
-static int __init goldfish_setup(char *str)
-{
- goldfish_enable = true;
- return 0;
-}
-__setup("goldfish", goldfish_setup);
-
-static int __init goldfish_init(void)
-{
- if (!goldfish_enable)
- return -ENODEV;
-
- platform_device_register_simple("goldfish_pdev_bus", -1,
- goldfish_pdev_bus_resources, 2);
- return 0;
-}
-device_initcall(goldfish_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
index 31dda18bb370..2930b6e9473e 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
@@ -88,8 +88,8 @@ static int __init bt_sfi_init(void)
memset(&info, 0, sizeof(info));
info.fwnode = ddata->dev->fwnode;
info.parent = ddata->dev;
- info.name = ddata->name,
- info.id = PLATFORM_DEVID_NONE,
+ info.name = ddata->name;
+ info.id = PLATFORM_DEVID_NONE;
pdev = platform_device_register_full(&info);
if (IS_ERR(pdev))
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index ce7188cbdae5..1c3a1962cade 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -867,9 +867,11 @@ static int do_reloc32(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
case R_386_PC32:
case R_386_PC16:
case R_386_PC8:
+ case R_386_PLT32:
/*
- * NONE can be ignored and PC relative relocations don't
- * need to be adjusted.
+ * NONE can be ignored and PC relative relocations don't need
+ * to be adjusted. Because sym must be defined, R_386_PLT32 can
+ * be treated the same way as R_386_PC32.
*/
break;
@@ -910,9 +912,11 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
case R_386_PC32:
case R_386_PC16:
case R_386_PC8:
+ case R_386_PLT32:
/*
- * NONE can be ignored and PC relative relocations don't
- * need to be adjusted.
+ * NONE can be ignored and PC relative relocations don't need
+ * to be adjusted. Because sym must be defined, R_386_PLT32 can
+ * be treated the same way as R_386_PC32.
*/
break;
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 9a5a50cdaab5..dc0a337f985b 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -567,10 +567,16 @@ void noist_exc_debug(struct pt_regs *regs);
DEFINE_IDTENTRY_RAW(xenpv_exc_nmi)
{
- /* On Xen PV, NMI doesn't use IST. The C part is the sane as native. */
+ /* On Xen PV, NMI doesn't use IST. The C part is the same as native. */
exc_nmi(regs);
}
+DEFINE_IDTENTRY_RAW_ERRORCODE(xenpv_exc_double_fault)
+{
+ /* On Xen PV, DF doesn't use IST. The C part is the same as native. */
+ exc_double_fault(regs, error_code);
+}
+
DEFINE_IDTENTRY_RAW(xenpv_exc_debug)
{
/*
@@ -590,6 +596,20 @@ DEFINE_IDTENTRY_RAW(exc_xen_unknown_trap)
BUG();
}
+#ifdef CONFIG_X86_MCE
+DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check)
+{
+ /*
+ * There's no IST on Xen PV, but we still need to dispatch
+ * to the correct handler.
+ */
+ if (user_mode(regs))
+ noist_exc_machine_check(regs);
+ else
+ exc_machine_check(regs);
+}
+#endif
+
struct trap_array_entry {
void (*orig)(void);
void (*xen)(void);
@@ -608,9 +628,9 @@ struct trap_array_entry {
static struct trap_array_entry trap_array[] = {
TRAP_ENTRY_REDIR(exc_debug, true ),
- TRAP_ENTRY(exc_double_fault, true ),
+ TRAP_ENTRY_REDIR(exc_double_fault, true ),
#ifdef CONFIG_X86_MCE
- TRAP_ENTRY(exc_machine_check, true ),
+ TRAP_ENTRY_REDIR(exc_machine_check, true ),
#endif
TRAP_ENTRY_REDIR(exc_nmi, true ),
TRAP_ENTRY(exc_int3, false ),
@@ -1015,8 +1035,6 @@ void __init xen_setup_vcpu_info_placement(void)
*/
if (xen_have_vcpu_info_placement) {
pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
- pv_ops.irq.restore_fl =
- __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
pv_ops.irq.irq_disable =
__PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
pv_ops.irq.irq_enable =
@@ -1053,7 +1071,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_pmc = xen_read_pmc,
.iret = xen_iret,
- .usergs_sysret64 = xen_sysret64,
.load_tr_desc = paravirt_nop,
.set_ldt = xen_set_ldt,
@@ -1078,9 +1095,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
#endif
.io_delay = xen_io_delay,
- /* Xen takes care of %gs when switching to usermode for us */
- .swapgs = paravirt_nop,
-
.start_context_switch = paravirt_start_context_switch,
.end_context_switch = xen_end_context_switch,
};
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 850c93f346c7..dfa091d79c2e 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -42,28 +42,6 @@ asmlinkage __visible unsigned long xen_save_fl(void)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
-__visible void xen_restore_fl(unsigned long flags)
-{
- struct vcpu_info *vcpu;
-
- /* convert from IF type flag */
- flags = !(flags & X86_EFLAGS_IF);
-
- /* See xen_irq_enable() for why preemption must be disabled. */
- preempt_disable();
- vcpu = this_cpu_read(xen_vcpu);
- vcpu->evtchn_upcall_mask = flags;
-
- if (flags == 0) {
- barrier(); /* unmask then check (avoid races) */
- if (unlikely(vcpu->evtchn_upcall_pending))
- xen_force_evtchn_callback();
- preempt_enable();
- } else
- preempt_enable_no_resched();
-}
-PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
-
asmlinkage __visible void xen_irq_disable(void)
{
/* There's a one instruction preempt window here. We need to
@@ -118,7 +96,6 @@ static void xen_halt(void)
static const struct pv_irq_ops xen_irq_ops __initconst = {
.save_fl = PV_CALLEE_SAVE(xen_save_fl),
- .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
.irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
.irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 3301875dd196..b5949e5a83ec 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -712,7 +712,8 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
unsigned long mfn, pfn;
/* Do not add to override if the map failed. */
- if (map_ops[i].status)
+ if (map_ops[i].status != GNTST_okay ||
+ (kmap_ops && kmap_ops[i].status != GNTST_okay))
continue;
if (map_ops[i].flags & GNTMAP_contains_pte) {
@@ -750,17 +751,15 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i]));
unsigned long pfn = page_to_pfn(pages[i]);
- if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) {
+ if (mfn != INVALID_P2M_ENTRY && (mfn & FOREIGN_FRAME_BIT))
+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ else
ret = -EINVAL;
- goto out;
- }
-
- set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}
if (kunmap_ops)
ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
- kunmap_ops, count);
-out:
+ kunmap_ops, count) ?: ret;
+
return ret;
}
EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping);
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 53cf8aa35032..02f31341e435 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -72,34 +72,6 @@ SYM_FUNC_START(xen_save_fl_direct)
ret
SYM_FUNC_END(xen_save_fl_direct)
-
-/*
- * In principle the caller should be passing us a value return from
- * xen_save_fl_direct, but for robustness sake we test only the
- * X86_EFLAGS_IF flag rather than the whole byte. After setting the
- * interrupt mask state, it checks for unmasked pending events and
- * enters the hypervisor to get them delivered if so.
- */
-SYM_FUNC_START(xen_restore_fl_direct)
- FRAME_BEGIN
- testw $X86_EFLAGS_IF, %di
- setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
- /*
- * Preempt here doesn't matter because that will deal with any
- * pending interrupts. The pending check may end up being run
- * on the wrong CPU, but that doesn't hurt.
- */
-
- /* check for unmasked and pending */
- cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
- jnz 1f
- call check_events
-1:
- FRAME_END
- ret
-SYM_FUNC_END(xen_restore_fl_direct)
-
-
/*
* Force an event check by making a hypercall, but preserve regs
* before making the call.
@@ -161,7 +133,7 @@ xen_pv_trap asm_exc_overflow
xen_pv_trap asm_exc_bounds
xen_pv_trap asm_exc_invalid_op
xen_pv_trap asm_exc_device_not_available
-xen_pv_trap asm_exc_double_fault
+xen_pv_trap asm_xenpv_exc_double_fault
xen_pv_trap asm_exc_coproc_segment_overrun
xen_pv_trap asm_exc_invalid_tss
xen_pv_trap asm_exc_segment_not_present
@@ -172,7 +144,7 @@ xen_pv_trap asm_exc_spurious_interrupt_bug
xen_pv_trap asm_exc_coprocessor_error
xen_pv_trap asm_exc_alignment_check
#ifdef CONFIG_X86_MCE
-xen_pv_trap asm_exc_machine_check
+xen_pv_trap asm_xenpv_exc_machine_check
#endif /* CONFIG_X86_MCE */
xen_pv_trap asm_exc_simd_coprocessor_error
#ifdef CONFIG_IA32_EMULATION
@@ -215,26 +187,6 @@ SYM_CODE_START(xen_iret)
jmp hypercall_iret
SYM_CODE_END(xen_iret)
-SYM_CODE_START(xen_sysret64)
- /*
- * We're already on the usermode stack at this point, but
- * still with the kernel gs, so we can easily switch back.
- *
- * tss.sp2 is scratch space.
- */
- movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
-
- pushq $__USER_DS
- pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
- pushq %r11
- pushq $__USER_CS
- pushq %rcx
-
- pushq $VGCF_in_syscall
- jmp hypercall_iret
-SYM_CODE_END(xen_sysret64)
-
/*
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9546c3384c75..8d7ec49a35fb 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -131,15 +131,12 @@ static inline void __init xen_efi_init(struct boot_params *boot_params)
__visible void xen_irq_enable_direct(void);
__visible void xen_irq_disable_direct(void);
__visible unsigned long xen_save_fl_direct(void);
-__visible void xen_restore_fl_direct(unsigned long);
__visible unsigned long xen_read_cr2(void);
__visible unsigned long xen_read_cr2_direct(void);
/* These are not functions, and cannot be called normally */
__visible void xen_iret(void);
-__visible void xen_sysret32(void);
-__visible void xen_sysret64(void);
extern int xen_panic_handler_init(void);