summaryrefslogtreecommitdiff
path: root/arch/powerpc/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/kernel')
-rw-r--r--arch/powerpc/kernel/Makefile2
-rw-r--r--arch/powerpc/kernel/asm-offsets.c9
-rw-r--r--arch/powerpc/kernel/cputable.c24
-rw-r--r--arch/powerpc/kernel/dt_cpu_ftrs.c4
-rw-r--r--arch/powerpc/kernel/eeh.c46
-rw-r--r--arch/powerpc/kernel/eeh_driver.c6
-rw-r--r--arch/powerpc/kernel/eeh_pe.c8
-rw-r--r--arch/powerpc/kernel/entry_64.S4
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S49
-rw-r--r--arch/powerpc/kernel/fadump.c17
-rw-r--r--arch/powerpc/kernel/head_32.S2
-rw-r--r--arch/powerpc/kernel/head_64.S16
-rw-r--r--arch/powerpc/kernel/idle_book3s.S70
-rw-r--r--arch/powerpc/kernel/irq.c51
-rw-r--r--arch/powerpc/kernel/kprobes-ftrace.c34
-rw-r--r--arch/powerpc/kernel/kprobes.c92
-rw-r--r--arch/powerpc/kernel/machine_kexec_64.c4
-rw-r--r--arch/powerpc/kernel/mce.c147
-rw-r--r--arch/powerpc/kernel/mce_power.c115
-rw-r--r--arch/powerpc/kernel/module_64.c3
-rw-r--r--arch/powerpc/kernel/optprobes.c15
-rw-r--r--arch/powerpc/kernel/paca.c16
-rw-r--r--arch/powerpc/kernel/pci-common.c12
-rw-r--r--arch/powerpc/kernel/pci_64.c4
-rw-r--r--arch/powerpc/kernel/process.c225
-rw-r--r--arch/powerpc/kernel/prom.c37
-rw-r--r--arch/powerpc/kernel/setup-common.c7
-rw-r--r--arch/powerpc/kernel/setup.h6
-rw-r--r--arch/powerpc/kernel/setup_64.c19
-rw-r--r--arch/powerpc/kernel/signal.c2
-rw-r--r--arch/powerpc/kernel/signal_32.c37
-rw-r--r--arch/powerpc/kernel/signal_64.c7
-rw-r--r--arch/powerpc/kernel/sysfs.c11
-rw-r--r--arch/powerpc/kernel/tau_6xx.c3
-rw-r--r--arch/powerpc/kernel/tm.S59
-rw-r--r--arch/powerpc/kernel/trace/ftrace_64_mprofile.S4
-rw-r--r--arch/powerpc/kernel/traps.c256
-rw-r--r--arch/powerpc/kernel/watchdog.c29
38 files changed, 1011 insertions, 441 deletions
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 6c6cce937dd8..1b6bc7fba996 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -129,7 +129,7 @@ obj64-$(CONFIG_PPC_TRANSACTIONAL_MEM) += tm.o
obj-$(CONFIG_PPC64) += $(obj64-y)
obj-$(CONFIG_PPC32) += $(obj32-y)
-ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC_CORE),)
+ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC_CORE)$(CONFIG_PPC_BOOK3S),)
obj-y += ppc_save_regs.o
endif
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 8cfb20e38cfe..6b958414b4e0 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -185,7 +185,7 @@ int main(void)
#ifdef CONFIG_PPC_MM_SLICES
OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize);
OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize);
- DEFINE(PACA_ADDR_LIMIT, offsetof(struct paca_struct, addr_limit));
+ OFFSET(PACA_SLB_ADDR_LIMIT, paca_struct, mm_ctx_slb_addr_limit);
DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def));
#endif /* CONFIG_PPC_MM_SLICES */
#endif
@@ -208,7 +208,7 @@ int main(void)
OFFSET(TCD_ESEL_FIRST, tlb_core_data, esel_first);
#endif /* CONFIG_PPC_BOOK3E */
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
OFFSET(PACASLBCACHE, paca_struct, slb_cache);
OFFSET(PACASLBCACHEPTR, paca_struct, slb_cache_ptr);
OFFSET(PACAVMALLOCSLLP, paca_struct, vmalloc_sllp);
@@ -230,7 +230,7 @@ int main(void)
OFFSET(LPPACA_DTLIDX, lppaca, dtl_idx);
OFFSET(LPPACA_YIELDCOUNT, lppaca, yield_count);
OFFSET(PACA_DTL_RIDX, paca_struct, dtl_ridx);
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
OFFSET(PACAEMERGSP, paca_struct, emergency_sp);
#ifdef CONFIG_PPC_BOOK3S_64
OFFSET(PACAMCEMERGSP, paca_struct, mc_emergency_sp);
@@ -642,6 +642,7 @@ int main(void)
HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
HSTATE_FIELD(HSTATE_PTID, ptid);
+ HSTATE_FIELD(HSTATE_TID, tid);
HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]);
HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]);
HSTATE_FIELD(HSTATE_MMCRA, host_mmcr[2]);
@@ -667,6 +668,8 @@ int main(void)
OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar);
OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap);
OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped);
+ OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set);
+ OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore);
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 760872916013..1350f49d81a8 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -547,11 +547,31 @@ static struct cpu_spec __initdata cpu_specs[] = {
.machine_check_early = __machine_check_early_realmode_p9,
.platform = "power9",
},
- { /* Power9 */
+ { /* Power9 DD2.0 */
+ .pvr_mask = 0xffffefff,
+ .pvr_value = 0x004e0200,
+ .cpu_name = "POWER9 (raw)",
+ .cpu_features = CPU_FTRS_POWER9_DD2_0,
+ .cpu_user_features = COMMON_USER_POWER9,
+ .cpu_user_features2 = COMMON_USER2_POWER9,
+ .mmu_features = MMU_FTRS_POWER9,
+ .icache_bsize = 128,
+ .dcache_bsize = 128,
+ .num_pmcs = 6,
+ .pmc_type = PPC_PMC_IBM,
+ .oprofile_cpu_type = "ppc64/power9",
+ .oprofile_type = PPC_OPROFILE_INVALID,
+ .cpu_setup = __setup_cpu_power9,
+ .cpu_restore = __restore_cpu_power9,
+ .flush_tlb = __flush_tlb_power9,
+ .machine_check_early = __machine_check_early_realmode_p9,
+ .platform = "power9",
+ },
+ { /* Power9 DD2.1 or later (see DD2.0 above) */
.pvr_mask = 0xffff0000,
.pvr_value = 0x004e0000,
.cpu_name = "POWER9 (raw)",
- .cpu_features = CPU_FTRS_POWER9,
+ .cpu_features = CPU_FTRS_POWER9_DD2_1,
.cpu_user_features = COMMON_USER_POWER9,
.cpu_user_features2 = COMMON_USER2_POWER9,
.mmu_features = MMU_FTRS_POWER9,
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 7275fed271af..602e0fde19b4 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -634,7 +634,7 @@ static struct dt_cpu_feature_match __initdata
{"no-execute", feat_enable, 0},
{"strong-access-ordering", feat_enable, CPU_FTR_SAO},
{"cache-inhibited-large-page", feat_enable_large_ci, 0},
- {"coprocessor-icswx", feat_enable, CPU_FTR_ICSWX},
+ {"coprocessor-icswx", feat_enable, 0},
{"hypervisor-virtualization-interrupt", feat_enable_hvi, 0},
{"program-priority-register", feat_enable, CPU_FTR_HAS_PPR},
{"wait", feat_enable, 0},
@@ -735,6 +735,8 @@ static __init void cpufeatures_cpu_quirks(void)
*/
if ((version & 0xffffff00) == 0x004e0100)
cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD1;
+ else if ((version & 0xffffefff) == 0x004e0200)
+ cur_cpu_spec->cpu_features &= ~CPU_FTR_POWER9_DD2_1;
}
static void __init cpufeatures_setup_finished(void)
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 116000b45531..cbca0a667682 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -972,6 +972,18 @@ static struct notifier_block eeh_reboot_nb = {
.notifier_call = eeh_reboot_notifier,
};
+void eeh_probe_devices(void)
+{
+ struct pci_controller *hose, *tmp;
+ struct pci_dn *pdn;
+
+ /* Enable EEH for all adapters */
+ list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+ pdn = hose->pci_data;
+ traverse_pci_dn(pdn, eeh_ops->probe, NULL);
+ }
+}
+
/**
* eeh_init - EEH initialization
*
@@ -987,22 +999,11 @@ static struct notifier_block eeh_reboot_nb = {
* Even if force-off is set, the EEH hardware is still enabled, so that
* newer systems can boot.
*/
-int eeh_init(void)
+static int eeh_init(void)
{
struct pci_controller *hose, *tmp;
- struct pci_dn *pdn;
- static int cnt = 0;
int ret = 0;
- /*
- * We have to delay the initialization on PowerNV after
- * the PCI hierarchy tree has been built because the PEs
- * are figured out based on PCI devices instead of device
- * tree nodes
- */
- if (machine_is(powernv) && cnt++ <= 0)
- return ret;
-
/* Register reboot notifier */
ret = register_reboot_notifier(&eeh_reboot_nb);
if (ret) {
@@ -1028,22 +1029,7 @@ int eeh_init(void)
if (ret)
return ret;
- /* Enable EEH for all adapters */
- list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
- pdn = hose->pci_data;
- traverse_pci_dn(pdn, eeh_ops->probe, NULL);
- }
-
- /*
- * Call platform post-initialization. Actually, It's good chance
- * to inform platform that EEH is ready to supply service if the
- * I/O cache stuff has been built up.
- */
- if (eeh_ops->post_init) {
- ret = eeh_ops->post_init();
- if (ret)
- return ret;
- }
+ eeh_probe_devices();
if (eeh_enabled())
pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
@@ -1757,10 +1743,6 @@ static int eeh_enable_dbgfs_set(void *data, u64 val)
else
eeh_add_flag(EEH_FORCE_DISABLED);
- /* Notify the backend */
- if (eeh_ops->post_init)
- eeh_ops->post_init();
-
return 0;
}
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 8b840191df59..4f71e4c9beb7 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -441,7 +441,7 @@ static void *eeh_add_virt_device(void *data, void *userdata)
}
#ifdef CONFIG_PPC_POWERNV
- pci_iov_add_virtfn(edev->physfn, pdn->vf_index, 0);
+ pci_iov_add_virtfn(edev->physfn, pdn->vf_index);
#endif
return NULL;
}
@@ -499,7 +499,7 @@ static void *eeh_rmv_device(void *data, void *userdata)
#ifdef CONFIG_PPC_POWERNV
struct pci_dn *pdn = eeh_dev_to_pdn(edev);
- pci_iov_remove_virtfn(edev->physfn, pdn->vf_index, 0);
+ pci_iov_remove_virtfn(edev->physfn, pdn->vf_index);
edev->pdev = NULL;
/*
@@ -623,7 +623,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
struct eeh_rmv_data *rmv_data)
{
struct pci_bus *frozen_bus = eeh_pe_bus_get(pe);
- struct timeval tstamp;
+ time64_t tstamp;
int cnt, rc;
struct eeh_dev *edev;
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 2e8d1b2b5af4..2d4956e97aa9 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -526,16 +526,16 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
*/
void eeh_pe_update_time_stamp(struct eeh_pe *pe)
{
- struct timeval tstamp;
+ time64_t tstamp;
if (!pe) return;
if (pe->freeze_count <= 0) {
pe->freeze_count = 0;
- do_gettimeofday(&pe->tstamp);
+ pe->tstamp = ktime_get_seconds();
} else {
- do_gettimeofday(&tstamp);
- if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) {
+ tstamp = ktime_get_seconds();
+ if (tstamp - pe->tstamp > 3600) {
pe->tstamp = tstamp;
pe->freeze_count = 0;
}
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 4a0fd4f40245..3320bcac7192 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -539,7 +539,7 @@ _GLOBAL(_switch)
std r6,PACACURRENT(r13) /* Set new 'current' */
ld r8,KSP(r4) /* new stack pointer */
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
BEGIN_MMU_FTR_SECTION
b 2f
END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
@@ -588,7 +588,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
slbmte r7,r0
isync
2:
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
CURRENT_THREAD_INFO(r7, r8) /* base of new stack */
/* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 1c80bd292e48..e441b469dc8f 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -114,6 +114,7 @@ EXC_VIRT_NONE(0x4000, 0x100)
cmpwi cr3,r10,2 ; \
BRANCH_TO_C000(r10, system_reset_idle_common) ; \
1: \
+ KVMTEST_PR(n) ; \
END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
#else
#define IDLETEST NOTEST
@@ -130,6 +131,7 @@ EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
EXC_REAL_END(system_reset, 0x100, 0x100)
EXC_VIRT_NONE(0x4100, 0x100)
+TRAMP_KVM(PACA_EXNMI, 0x100)
#ifdef CONFIG_PPC_P7_NAP
EXC_COMMON_BEGIN(system_reset_idle_common)
@@ -233,7 +235,7 @@ BEGIN_FTR_SECTION
addi r10,r10,1 /* increment paca->in_mce */
sth r10,PACA_IN_MCE(r13)
/* Limit nested MCE to level 4 to avoid stack overflow */
- cmpwi r10,4
+ cmpwi r10,MAX_MCE_DEPTH
bgt 2f /* Check if we hit limit of 4 */
std r11,GPR1(r1) /* Save r1 on the stack. */
std r11,0(r1) /* make stack chain pointer */
@@ -542,7 +544,7 @@ EXC_COMMON_BEGIN(instruction_access_common)
RECONCILE_IRQ_STATE(r10, r11)
ld r12,_MSR(r1)
ld r3,_NIP(r1)
- andis. r4,r12,DSISR_BAD_FAULT_64S@h
+ andis. r4,r12,DSISR_SRR1_MATCH_64S@h
li r5,0x400
std r3,_DAR(r1)
std r4,_DSISR(r1)
@@ -606,7 +608,7 @@ EXC_COMMON_BEGIN(slb_miss_common)
cmpdi cr5,r11,MSR_RI
crset 4*cr0+eq
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
BEGIN_MMU_FTR_SECTION
bl slb_allocate
END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
@@ -888,12 +890,6 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
#define LOAD_SYSCALL_HANDLER(reg) \
__LOAD_HANDLER(reg, system_call_common)
-#define SYSCALL_FASTENDIAN_TEST \
-BEGIN_FTR_SECTION \
- cmpdi r0,0x1ebe ; \
- beq- 1f ; \
-END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \
-
/*
* After SYSCALL_KVMTEST, we reach here with PACA in r13, r13 in r9,
* and HMT_MEDIUM.
@@ -908,6 +904,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \
rfid ; \
b . ; /* prevent speculative execution */
+#ifdef CONFIG_PPC_FAST_ENDIAN_SWITCH
+#define SYSCALL_FASTENDIAN_TEST \
+BEGIN_FTR_SECTION \
+ cmpdi r0,0x1ebe ; \
+ beq- 1f ; \
+END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \
+
#define SYSCALL_FASTENDIAN \
/* Fast LE/BE switch system call */ \
1: mfspr r12,SPRN_SRR1 ; \
@@ -916,6 +919,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \
mr r13,r9 ; \
rfid ; /* return to userspace */ \
b . ; /* prevent speculative execution */
+#else
+#define SYSCALL_FASTENDIAN_TEST
+#define SYSCALL_FASTENDIAN
+#endif /* CONFIG_PPC_FAST_ENDIAN_SWITCH */
#if defined(CONFIG_RELOCATABLE)
/*
@@ -1033,6 +1040,8 @@ TRAMP_REAL_BEGIN(hmi_exception_early)
EXCEPTION_PROLOG_COMMON_3(0xe60)
addi r3,r1,STACK_FRAME_OVERHEAD
BRANCH_LINK_TO_FAR(hmi_exception_realmode) /* Function call ABI */
+ cmpdi cr0,r3,0
+
/* Windup the stack. */
/* Move original HSRR0 and HSRR1 into the respective regs */
ld r9,_MSR(r1)
@@ -1049,10 +1058,15 @@ TRAMP_REAL_BEGIN(hmi_exception_early)
REST_8GPRS(2, r1)
REST_GPR(10, r1)
ld r11,_CCR(r1)
+ REST_2GPRS(12, r1)
+ bne 1f
mtcr r11
REST_GPR(11, r1)
- REST_2GPRS(12, r1)
- /* restore original r1. */
+ ld r1,GPR1(r1)
+ hrfid
+
+1: mtcr r11
+ REST_GPR(11, r1)
ld r1,GPR1(r1)
/*
@@ -1065,8 +1079,9 @@ hmi_exception_after_realmode:
EXCEPTION_PROLOG_0(PACA_EXGEN)
b tramp_real_hmi_exception
-EXC_COMMON_ASYNC(hmi_exception_common, 0xe60, handle_hmi_exception)
-
+EXC_COMMON_BEGIN(hmi_exception_common)
+EXCEPTION_COMMON(PACA_EXGEN, 0xe60, hmi_exception_common, handle_hmi_exception,
+ ret_from_except, FINISH_NAP;ADD_NVGPRS;ADD_RECONCILE;RUNLATCH_ON)
EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0x20)
EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x20, 0xe80)
@@ -1505,8 +1520,8 @@ USE_TEXT_SECTION()
*/
.balign IFETCH_ALIGN_BYTES
do_hash_page:
- #ifdef CONFIG_PPC_STD_MMU_64
- lis r0,DSISR_BAD_FAULT_64S@h
+#ifdef CONFIG_PPC_BOOK3S_64
+ lis r0,(DSISR_BAD_FAULT_64S|DSISR_DABRMATCH)@h
ori r0,r0,DSISR_BAD_FAULT_64S@l
and. r0,r4,r0 /* weird error? */
bne- handle_page_fault /* if not, try to insert a HPTE */
@@ -1536,7 +1551,7 @@ do_hash_page:
/* Reload DSISR into r4 for the DABR check below */
ld r4,_DSISR(r1)
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
/* Here we have a page fault that hash_page can't handle. */
handle_page_fault:
@@ -1565,7 +1580,7 @@ handle_dabr_fault:
12: b ret_from_except_lite
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
/* We have a page fault that hash_page could handle but HV refused
* the PTE insertion
*/
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index e1431800bfb9..04ea5c04fd24 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -1270,10 +1270,15 @@ static ssize_t fadump_release_memory_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
+ int input = -1;
+
if (!fw_dump.dump_active)
return -EPERM;
- if (buf[0] == '1') {
+ if (kstrtoint(buf, 0, &input))
+ return -EINVAL;
+
+ if (input == 1) {
/*
* Take away the '/proc/vmcore'. We are releasing the dump
* memory, hence it will not be valid anymore.
@@ -1307,21 +1312,25 @@ static ssize_t fadump_register_store(struct kobject *kobj,
const char *buf, size_t count)
{
int ret = 0;
+ int input = -1;
if (!fw_dump.fadump_enabled || fdm_active)
return -EPERM;
+ if (kstrtoint(buf, 0, &input))
+ return -EINVAL;
+
mutex_lock(&fadump_mutex);
- switch (buf[0]) {
- case '0':
+ switch (input) {
+ case 0:
if (fw_dump.dump_registered == 0) {
goto unlock_out;
}
/* Un-register Firmware-assisted dump */
fadump_unregister_dump(&fdm);
break;
- case '1':
+ case 1:
if (fw_dump.dump_registered == 1) {
ret = -EEXIST;
goto unlock_out;
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 8c54166491e7..29b2fed93289 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -388,7 +388,7 @@ DataAccess:
EXCEPTION_PROLOG
mfspr r10,SPRN_DSISR
stw r10,_DSISR(r11)
- andis. r0,r10,DSISR_BAD_FAULT_32S@h
+ andis. r0,r10,(DSISR_BAD_FAULT_32S|DSISR_DABRMATCH)@h
bne 1f /* if not, try to put a PTE */
mfspr r4,SPRN_DAR /* into the hash table */
rlwinm r3,r10,32-15,21,21 /* DSISR_STORE -> _PAGE_RW */
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index ff8511d6d8ea..aa71a90f5222 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -55,12 +55,18 @@
*
* For pSeries or server processors:
* 1. The MMU is off & open firmware is running in real mode.
- * 2. The kernel is entered at __start
+ * 2. The primary CPU enters at __start.
+ * 3. If the RTAS supports "query-cpu-stopped-state", then secondary
+ * CPUs will enter as directed by the "start-cpu" RTAS call, which is
+ * at generic_secondary_smp_init, with PIR in r3.
+ * 4. Else the secondary CPUs will enter at secondary_hold (0x60) as
+ * directed by the "start-cpu" RTAS call, with PIR in r3.
* -or- For OPAL entry:
- * 1. The MMU is off, processor in HV mode, primary CPU enters at 0
- * with device-tree in gpr3. We also get OPAL base in r8 and
- * entry in r9 for debugging purposes
- * 2. Secondary processors enter at 0x60 with PIR in gpr3
+ * 1. The MMU is off, processor in HV mode.
+ * 2. The primary CPU enters at 0 with device-tree in r3, OPAL base
+ * in r8, and entry in r9 for debugging purposes.
+ * 3. Secondary CPUs enter as directed by OPAL_START_CPU call, which
+ * is at generic_secondary_smp_init, with PIR in r3.
*
* For Book3E processors:
* 1. The MMU is on running in AS0 in a state defined in ePAPR
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 1125c9be9e06..01e1c1997893 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -112,12 +112,14 @@ power9_save_additional_sprs:
std r4, STOP_HFSCR(r13)
mfspr r3, SPRN_MMCRA
- mfspr r4, SPRN_MMCR1
+ mfspr r4, SPRN_MMCR0
std r3, STOP_MMCRA(r13)
- std r4, STOP_MMCR1(r13)
+ std r4, _MMCR0(r1)
- mfspr r3, SPRN_MMCR2
- std r3, STOP_MMCR2(r13)
+ mfspr r3, SPRN_MMCR1
+ mfspr r4, SPRN_MMCR2
+ std r3, STOP_MMCR1(r13)
+ std r4, STOP_MMCR2(r13)
blr
power9_restore_additional_sprs:
@@ -135,11 +137,14 @@ power9_restore_additional_sprs:
ld r4, STOP_MMCRA(r13)
mtspr SPRN_HFSCR, r3
mtspr SPRN_MMCRA, r4
- /* We have already restored PACA_MMCR0 */
- ld r3, STOP_MMCR1(r13)
- ld r4, STOP_MMCR2(r13)
- mtspr SPRN_MMCR1, r3
- mtspr SPRN_MMCR2, r4
+
+ ld r3, _MMCR0(r1)
+ ld r4, STOP_MMCR1(r13)
+ mtspr SPRN_MMCR0, r3
+ mtspr SPRN_MMCR1, r4
+
+ ld r3, STOP_MMCR2(r13)
+ mtspr SPRN_MMCR2, r3
blr
/*
@@ -319,20 +324,13 @@ enter_winkle:
/*
* r3 - PSSCR value corresponding to the requested stop state.
*/
+power_enter_stop:
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-power_enter_stop_kvm_rm:
- /*
- * This is currently unused because POWER9 KVM does not have to
- * gather secondary threads into sibling mode, but the code is
- * here in case that function is required.
- *
- * Tell KVM we're entering idle.
- */
+ /* Tell KVM we're entering idle */
li r4,KVM_HWTHREAD_IN_IDLE
/* DO THIS IN REAL MODE! See comment above. */
stb r4,HSTATE_HWTHREAD_STATE(r13)
#endif
-power_enter_stop:
/*
* Check if we are executing the lite variant with ESL=EC=0
*/
@@ -357,13 +355,15 @@ power_enter_stop:
b pnv_wakeup_noloss
.Lhandle_esl_ec_set:
+BEGIN_FTR_SECTION
/*
- * POWER9 DD2 can incorrectly set PMAO when waking up after a
- * state-loss idle. Saving and restoring MMCR0 over idle is a
+ * POWER9 DD2.0 or earlier can incorrectly set PMAO when waking up after
+ * a state-loss idle. Saving and restoring MMCR0 over idle is a
* workaround.
*/
mfspr r4,SPRN_MMCR0
std r4,_MMCR0(r1)
+END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1)
/*
* Check if the requested state is a deep idle state.
@@ -496,18 +496,6 @@ pnv_powersave_wakeup_mce:
b pnv_powersave_wakeup
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-kvm_start_guest_check:
- li r0,KVM_HWTHREAD_IN_KERNEL
- stb r0,HSTATE_HWTHREAD_STATE(r13)
- /* Order setting hwthread_state vs. testing hwthread_req */
- sync
- lbz r0,HSTATE_HWTHREAD_REQ(r13)
- cmpwi r0,0
- beqlr
- b kvm_start_guest
-#endif
-
/*
* Called from reset vector for powersave wakeups.
* cr3 - set to gt if waking up with partial/complete hypervisor state loss
@@ -532,9 +520,15 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
mr r3,r12
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-BEGIN_FTR_SECTION
- bl kvm_start_guest_check
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+ li r0,KVM_HWTHREAD_IN_KERNEL
+ stb r0,HSTATE_HWTHREAD_STATE(r13)
+ /* Order setting hwthread_state vs. testing hwthread_req */
+ sync
+ lbz r0,HSTATE_HWTHREAD_REQ(r13)
+ cmpwi r0,0
+ beq 1f
+ b kvm_start_guest
+1:
#endif
/* Return SRR1 from power7_nap() */
@@ -555,15 +549,17 @@ pnv_restore_hyp_resource_arch300:
* then clear bit 60 in MMCRA to ensure the PMU starts running.
*/
blt cr3,1f
+BEGIN_FTR_SECTION
PPC_INVALIDATE_ERAT
ld r1,PACAR1(r13)
+ ld r4,_MMCR0(r1)
+ mtspr SPRN_MMCR0,r4
+END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1)
mfspr r4,SPRN_MMCRA
ori r4,r4,(1 << (63-60))
mtspr SPRN_MMCRA,r4
xori r4,r4,(1 << (63-60))
mtspr SPRN_MMCRA,r4
- ld r4,_MMCR0(r1)
- mtspr SPRN_MMCR0,r4
1:
/*
* POWER ISA 3. Use PSSCR to determine if we
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 4e65bf82f5e0..b7a84522e652 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -143,6 +143,13 @@ notrace unsigned int __check_irq_replay(void)
*/
unsigned char happened = local_paca->irq_happened;
+ /*
+ * We are responding to the next interrupt, so interrupt-off
+ * latencies should be reset here.
+ */
+ trace_hardirqs_on();
+ trace_hardirqs_off();
+
if (happened & PACA_IRQ_HARD_DIS) {
/* Clear bit 0 which we wouldn't clear otherwise */
local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
@@ -270,6 +277,7 @@ notrace void arch_local_irq_restore(unsigned long en)
#endif /* CONFIG_TRACE_IRQFLAGS */
set_soft_enabled(0);
+ trace_hardirqs_off();
/*
* Check if anything needs to be re-emitted. We haven't
@@ -279,6 +287,7 @@ notrace void arch_local_irq_restore(unsigned long en)
replay = __check_irq_replay();
/* We can soft-enable now */
+ trace_hardirqs_on();
set_soft_enabled(1);
/*
@@ -394,11 +403,19 @@ bool prep_irq_for_idle_irqsoff(void)
/*
* Take the SRR1 wakeup reason, index into this table to find the
* appropriate irq_happened bit.
+ *
+ * System reset exceptions taken in idle state also come through here,
+ * but they are NMI interrupts so do not need to wait for IRQs to be
+ * restored, and should be taken as early as practical. These are marked
+ * with 0xff in the table. The Power ISA specifies 0100b as the system
+ * reset interrupt reason.
*/
+#define IRQ_SYSTEM_RESET 0xff
+
static const u8 srr1_to_lazyirq[0x10] = {
0, 0, 0,
PACA_IRQ_DBELL,
- 0,
+ IRQ_SYSTEM_RESET,
PACA_IRQ_DBELL,
PACA_IRQ_DEC,
0,
@@ -407,15 +424,43 @@ static const u8 srr1_to_lazyirq[0x10] = {
PACA_IRQ_HMI,
0, 0, 0, 0, 0 };
+void replay_system_reset(void)
+{
+ struct pt_regs regs;
+
+ ppc_save_regs(&regs);
+ regs.trap = 0x100;
+ get_paca()->in_nmi = 1;
+ system_reset_exception(&regs);
+ get_paca()->in_nmi = 0;
+}
+EXPORT_SYMBOL_GPL(replay_system_reset);
+
void irq_set_pending_from_srr1(unsigned long srr1)
{
unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18;
+ u8 reason = srr1_to_lazyirq[idx];
+
+ /*
+ * Take the system reset now, which is immediately after registers
+ * are restored from idle. It's an NMI, so interrupts need not be
+ * re-enabled before it is taken.
+ */
+ if (unlikely(reason == IRQ_SYSTEM_RESET)) {
+ replay_system_reset();
+ return;
+ }
/*
* The 0 index (SRR1[42:45]=b0000) must always evaluate to 0,
- * so this can be called unconditionally with srr1 wake reason.
+ * so this can be called unconditionally with the SRR1 wake
+ * reason as returned by the idle code, which uses 0 to mean no
+ * interrupt.
+ *
+ * If a future CPU were to designate this as an interrupt reason,
+ * then a new index for no interrupt must be assigned.
*/
- local_paca->irq_happened |= srr1_to_lazyirq[idx];
+ local_paca->irq_happened |= reason;
}
#endif /* CONFIG_PPC_BOOK3S */
diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c
index 6c089d9757c9..7a1f99f1b47f 100644
--- a/arch/powerpc/kernel/kprobes-ftrace.c
+++ b/arch/powerpc/kernel/kprobes-ftrace.c
@@ -25,6 +25,21 @@
#include <linux/preempt.h>
#include <linux/ftrace.h>
+/*
+ * This is called from ftrace code after invoking registered handlers to
+ * disambiguate regs->nip changes done by jprobes and livepatch. We check if
+ * there is an active jprobe at the provided address (mcount location).
+ */
+int __is_active_jprobe(unsigned long addr)
+{
+ if (!preemptible()) {
+ struct kprobe *p = raw_cpu_read(current_kprobe);
+ return (p && (unsigned long)p->addr == addr) ? 1 : 0;
+ }
+
+ return 0;
+}
+
static nokprobe_inline
int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb, unsigned long orig_nip)
@@ -60,11 +75,8 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip,
{
struct kprobe *p;
struct kprobe_ctlblk *kcb;
- unsigned long flags;
- /* Disable irq for emulating a breakpoint and avoiding preempt */
- local_irq_save(flags);
- hard_irq_disable();
+ preempt_disable();
p = get_kprobe((kprobe_opcode_t *)nip);
if (unlikely(!p) || kprobe_disabled(p))
@@ -86,13 +98,17 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip,
kcb->kprobe_status = KPROBE_HIT_ACTIVE;
if (!p->pre_handler || !p->pre_handler(p, regs))
__skip_singlestep(p, regs, kcb, orig_nip);
- /*
- * If pre_handler returns !0, it sets regs->nip and
- * resets current kprobe.
- */
+ else {
+ /*
+ * If pre_handler returns !0, it sets regs->nip and
+ * resets current kprobe. In this case, we should not
+ * re-enable preemption.
+ */
+ return;
+ }
}
end:
- local_irq_restore(flags);
+ preempt_enable_no_resched();
}
NOKPROBE_SYMBOL(kprobe_ftrace_handler);
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index bebc3007a793..ca5d5a081e75 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -43,12 +43,6 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}};
-int is_current_kprobe_addr(unsigned long addr)
-{
- struct kprobe *p = kprobe_running();
- return (p && (unsigned long)p->addr == addr) ? 1 : 0;
-}
-
bool arch_within_kprobe_blacklist(unsigned long addr)
{
return (addr >= (unsigned long)__kprobes_text_start &&
@@ -59,7 +53,7 @@ bool arch_within_kprobe_blacklist(unsigned long addr)
kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset)
{
- kprobe_opcode_t *addr;
+ kprobe_opcode_t *addr = NULL;
#ifdef PPC64_ELF_ABI_v2
/* PPC64 ABIv2 needs local entry point */
@@ -91,36 +85,29 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset)
* Also handle <module:symbol> format.
*/
char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN];
- const char *modsym;
bool dot_appended = false;
- if ((modsym = strchr(name, ':')) != NULL) {
- modsym++;
- if (*modsym != '\0' && *modsym != '.') {
- /* Convert to <module:.symbol> */
- strncpy(dot_name, name, modsym - name);
- dot_name[modsym - name] = '.';
- dot_name[modsym - name + 1] = '\0';
- strncat(dot_name, modsym,
- sizeof(dot_name) - (modsym - name) - 2);
- dot_appended = true;
- } else {
- dot_name[0] = '\0';
- strncat(dot_name, name, sizeof(dot_name) - 1);
- }
- } else if (name[0] != '.') {
- dot_name[0] = '.';
- dot_name[1] = '\0';
- strncat(dot_name, name, KSYM_NAME_LEN - 2);
+ const char *c;
+ ssize_t ret = 0;
+ int len = 0;
+
+ if ((c = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
+ c++;
+ len = c - name;
+ memcpy(dot_name, name, len);
+ } else
+ c = name;
+
+ if (*c != '\0' && *c != '.') {
+ dot_name[len++] = '.';
dot_appended = true;
- } else {
- dot_name[0] = '\0';
- strncat(dot_name, name, KSYM_NAME_LEN - 1);
}
- addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name);
- if (!addr && dot_appended) {
- /* Let's try the original non-dot symbol lookup */
+ ret = strscpy(dot_name + len, c, KSYM_NAME_LEN);
+ if (ret > 0)
+ addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name);
+
+ /* Fallback to the original non-dot symbol lookup */
+ if (!addr && dot_appended)
addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
- }
#else
addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
#endif
@@ -239,7 +226,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
}
NOKPROBE_SYMBOL(arch_prepare_kretprobe);
-int try_to_emulate(struct kprobe *p, struct pt_regs *regs)
+static int try_to_emulate(struct kprobe *p, struct pt_regs *regs)
{
int ret;
unsigned int insn = *p->ainsn.insn;
@@ -261,9 +248,20 @@ int try_to_emulate(struct kprobe *p, struct pt_regs *regs)
*/
printk("Can't step on instruction %x\n", insn);
BUG();
- } else if (ret == 0)
- /* This instruction can't be boosted */
- p->ainsn.boostable = -1;
+ } else {
+ /*
+ * If we haven't previously emulated this instruction, then it
+ * can't be boosted. Note it down so we don't try to do so again.
+ *
+ * If, however, we had emulated this instruction in the past,
+ * then this is just an error with the current run (for
+ * instance, exceptions due to a load/store). We return 0 so
+ * that this is now single-stepped, but continue to try
+ * emulating it in subsequent probe hits.
+ */
+ if (unlikely(p->ainsn.boostable != 1))
+ p->ainsn.boostable = -1;
+ }
return ret;
}
@@ -639,24 +637,22 @@ NOKPROBE_SYMBOL(setjmp_pre_handler);
void __used jprobe_return(void)
{
- asm volatile("trap" ::: "memory");
+ asm volatile("jprobe_return_trap:\n"
+ "trap\n"
+ ::: "memory");
}
NOKPROBE_SYMBOL(jprobe_return);
-static void __used jprobe_return_end(void)
-{
-}
-NOKPROBE_SYMBOL(jprobe_return_end);
-
int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
- /*
- * FIXME - we should ideally be validating that we got here 'cos
- * of the "trap" in jprobe_return() above, before restoring the
- * saved regs...
- */
+ if (regs->nip != ppc_kallsyms_lookup_name("jprobe_return_trap")) {
+ pr_debug("longjmp_break_handler NIP (0x%lx) does not match jprobe_return_trap (0x%lx)\n",
+ regs->nip, ppc_kallsyms_lookup_name("jprobe_return_trap"));
+ return 0;
+ }
+
memcpy(regs, &kcb->jprobe_saved_regs, sizeof(struct pt_regs));
/* It's OK to start function graph tracing again */
unpause_graph_tracing();
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index 5c12e21d0d1a..49d34d7271e7 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -360,7 +360,7 @@ void default_machine_kexec(struct kimage *image)
/* NOTREACHED */
}
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
/* Values we need to export to the second kernel via the device tree. */
static unsigned long htab_base;
static unsigned long htab_size;
@@ -402,4 +402,4 @@ static int __init export_htab_values(void)
return 0;
}
late_initcall(export_htab_values);
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 9b2ea7e71c06..742e4658c5dc 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -39,11 +39,21 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
static DEFINE_PER_CPU(int, mce_queue_count);
static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
+/* Queue for delayed MCE UE events. */
+static DEFINE_PER_CPU(int, mce_ue_count);
+static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
+ mce_ue_event_queue);
+
static void machine_check_process_queued_event(struct irq_work *work);
+void machine_check_ue_event(struct machine_check_event *evt);
+static void machine_process_ue_event(struct work_struct *work);
+
static struct irq_work mce_event_process_work = {
.func = machine_check_process_queued_event,
};
+DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
+
static void mce_set_error_info(struct machine_check_event *mce,
struct mce_error_info *mce_err)
{
@@ -82,7 +92,7 @@ static void mce_set_error_info(struct machine_check_event *mce,
*/
void save_mce_event(struct pt_regs *regs, long handled,
struct mce_error_info *mce_err,
- uint64_t nip, uint64_t addr)
+ uint64_t nip, uint64_t addr, uint64_t phys_addr)
{
int index = __this_cpu_inc_return(mce_nest_count) - 1;
struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
@@ -140,6 +150,11 @@ void save_mce_event(struct pt_regs *regs, long handled,
} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
mce->u.ue_error.effective_address_provided = true;
mce->u.ue_error.effective_address = addr;
+ if (phys_addr != ULONG_MAX) {
+ mce->u.ue_error.physical_address_provided = true;
+ mce->u.ue_error.physical_address = phys_addr;
+ machine_check_ue_event(mce);
+ }
}
return;
}
@@ -193,6 +208,26 @@ void release_mce_event(void)
get_mce_event(NULL, true);
}
+
+/*
+ * Copy a UE event into this CPU's mce_ue_event_queue for deferred handling.
+ */
+void machine_check_ue_event(struct machine_check_event *evt)
+{
+ int index;
+
+ index = __this_cpu_inc_return(mce_ue_count) - 1;
+ /* If queue is full, drop the event: undo the count bump and return. */
+ if (index >= MAX_MC_EVT) {
+ __this_cpu_dec(mce_ue_count);
+ return;
+ }
+ memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt));
+
+ /* Kick mce_ue_event_work, whose handler drains the per-CPU UE queue. */
+ schedule_work(&mce_ue_event_work);
+}
+
/*
* Queue up the MCE event which then can be handled later.
*/
@@ -215,7 +250,39 @@ void machine_check_queue_event(void)
/* Queue irq work to process this event later. */
irq_work_queue(&mce_event_process_work);
}
-
+/*
+ * Process pending UE events from this CPU's mce_ue_event_queue. This is the
+ * handler of mce_ue_event_work, scheduled by machine_check_ue_event().
+ */
+static void machine_process_ue_event(struct work_struct *work)
+{
+ int index;
+ struct machine_check_event *evt;
+
+ while (__this_cpu_read(mce_ue_count) > 0) {
+ index = __this_cpu_read(mce_ue_count) - 1;
+ evt = this_cpu_ptr(&mce_ue_event_queue[index]);
+#ifdef CONFIG_MEMORY_FAILURE
+ /*
+ * Report the bad page to memory_failure(). This should
+ * probably be queued elsewhere, but oh well.
+ */
+ if (evt->error_type == MCE_ERROR_TYPE_UE) {
+ if (evt->u.ue_error.physical_address_provided) {
+ unsigned long pfn;
+
+ pfn = evt->u.ue_error.physical_address >>
+ PAGE_SHIFT;
+ memory_failure(pfn, SIGBUS, 0);
+ } else
+ pr_warn("Failed to identify bad address from "
+ "where the uncorrectable error (UE) "
+ "was generated\n");
+ }
+#endif
+ __this_cpu_dec(mce_ue_count);
+ }
+}
/*
* process pending MCE event from the mce event queue. This function will be
* called during syscall exit.
@@ -223,6 +290,7 @@ void machine_check_queue_event(void)
static void machine_check_process_queued_event(struct irq_work *work)
{
int index;
+ struct machine_check_event *evt;
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
@@ -232,8 +300,8 @@ static void machine_check_process_queued_event(struct irq_work *work)
*/
while (__this_cpu_read(mce_queue_count) > 0) {
index = __this_cpu_read(mce_queue_count) - 1;
- machine_check_print_event_info(
- this_cpu_ptr(&mce_event_queue[index]), false);
+ evt = this_cpu_ptr(&mce_event_queue[index]);
+ machine_check_print_event_info(evt, false);
__this_cpu_dec(mce_queue_count);
}
}
@@ -340,7 +408,7 @@ void machine_check_print_event_info(struct machine_check_event *evt,
printk("%s Effective address: %016llx\n",
level, evt->u.ue_error.effective_address);
if (evt->u.ue_error.physical_address_provided)
- printk("%s Physical address: %016llx\n",
+ printk("%s Physical address: %016llx\n",
level, evt->u.ue_error.physical_address);
break;
case MCE_ERROR_TYPE_SLB:
@@ -411,45 +479,6 @@ void machine_check_print_event_info(struct machine_check_event *evt,
}
EXPORT_SYMBOL_GPL(machine_check_print_event_info);
-uint64_t get_mce_fault_addr(struct machine_check_event *evt)
-{
- switch (evt->error_type) {
- case MCE_ERROR_TYPE_UE:
- if (evt->u.ue_error.effective_address_provided)
- return evt->u.ue_error.effective_address;
- break;
- case MCE_ERROR_TYPE_SLB:
- if (evt->u.slb_error.effective_address_provided)
- return evt->u.slb_error.effective_address;
- break;
- case MCE_ERROR_TYPE_ERAT:
- if (evt->u.erat_error.effective_address_provided)
- return evt->u.erat_error.effective_address;
- break;
- case MCE_ERROR_TYPE_TLB:
- if (evt->u.tlb_error.effective_address_provided)
- return evt->u.tlb_error.effective_address;
- break;
- case MCE_ERROR_TYPE_USER:
- if (evt->u.user_error.effective_address_provided)
- return evt->u.user_error.effective_address;
- break;
- case MCE_ERROR_TYPE_RA:
- if (evt->u.ra_error.effective_address_provided)
- return evt->u.ra_error.effective_address;
- break;
- case MCE_ERROR_TYPE_LINK:
- if (evt->u.link_error.effective_address_provided)
- return evt->u.link_error.effective_address;
- break;
- default:
- case MCE_ERROR_TYPE_UNKNOWN:
- break;
- }
- return 0;
-}
-EXPORT_SYMBOL(get_mce_fault_addr);
-
/*
* This function is called in real mode. Strictly no printk's please.
*
@@ -470,6 +499,34 @@ long hmi_exception_realmode(struct pt_regs *regs)
{
__this_cpu_inc(irq_stat.hmi_exceptions);
+#ifdef CONFIG_PPC_BOOK3S_64
+ /* Workaround for P9 vector CI loads (see p9_hmi_special_emu) */
+ if (pvr_version_is(PVR_POWER9)) {
+ unsigned long hmer = mfspr(SPRN_HMER);
+
+ /* Do we have the debug bit set */
+ if (hmer & PPC_BIT(17)) {
+ hmer &= ~PPC_BIT(17);
+ mtspr(SPRN_HMER, hmer);
+
+ /*
+ * Now to avoid problems with soft-disable we
+ * only do the emulation if we are coming from
+ * user space
+ */
+ if (user_mode(regs))
+ local_paca->hmi_p9_special_emu = 1;
+
+ /*
+ * Don't bother going to OPAL if that's the
+ * only relevant bit.
+ */
+ if (!(hmer & mfspr(SPRN_HMEER)))
+ return local_paca->hmi_p9_special_emu;
+ }
+ }
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
wait_for_subcore_guest_exit();
if (ppc_md.hmi_exception_early)
@@ -477,5 +534,5 @@ long hmi_exception_realmode(struct pt_regs *regs)
wait_for_tb_resync();
- return 0;
+ return 1;
}
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 72f153c6f3fa..644f7040b91c 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -27,6 +27,36 @@
#include <asm/mmu.h>
#include <asm/mce.h>
#include <asm/machdep.h>
+#include <asm/pgtable.h>
+#include <asm/pte-walk.h>
+#include <asm/sstep.h>
+#include <asm/exception-64s.h>
+
+/*
+ * Convert an address related to an mm to a PFN (ULONG_MAX if no PTE or special
+ * PTE). NOTE: we are in real mode, we could race with page table updates.
+ */
+static unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
+{
+ pte_t *ptep;
+ unsigned long flags;
+ struct mm_struct *mm;
+
+ if (user_mode(regs))
+ mm = current->mm;
+ else
+ mm = &init_mm;
+
+ local_irq_save(flags);
+ if (mm == current->mm)
+ ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
+ else
+ ptep = find_init_mm_pte(addr, NULL);
+ local_irq_restore(flags);
+ if (!ptep || pte_special(*ptep))
+ return ULONG_MAX;
+ return pte_pfn(*ptep);
+}
static void flush_tlb_206(unsigned int num_sets, unsigned int action)
{
@@ -128,7 +158,7 @@ void __flush_tlb_power9(unsigned int action)
{
unsigned int num_sets;
- if (radix_enabled())
+ if (early_radix_enabled())
num_sets = POWER9_TLB_SETS_RADIX;
else
num_sets = POWER9_TLB_SETS_HASH;
@@ -138,7 +168,7 @@ void __flush_tlb_power9(unsigned int action)
/* flush SLBs and reload */
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
static void flush_and_reload_slb(void)
{
struct slb_shadow *slb;
@@ -185,7 +215,7 @@ static void flush_erat(void)
static int mce_flush(int what)
{
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
if (what == MCE_FLUSH_SLB) {
flush_and_reload_slb();
return 1;
@@ -421,9 +451,45 @@ static const struct mce_derror_table mce_p9_derror_table[] = {
MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
{ 0, false, 0, 0, 0, 0 } };
+static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr,
+ uint64_t *phys_addr)
+{
+ /*
+ * Carefully look at the NIP to determine the instruction to
+ * analyse. Reading the NIP in real-mode is tricky and can lead
+ * to recursive faults. Returns 0 on success, -1 otherwise.
+ */
+ int instr;
+ unsigned long pfn, instr_addr;
+ struct instruction_op op;
+ struct pt_regs tmp = *regs;
+
+ pfn = addr_to_pfn(regs, regs->nip);
+ if (pfn != ULONG_MAX) {
+ instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK);
+ instr = *(unsigned int *)(instr_addr);
+ if (!analyse_instr(&op, &tmp, instr)) {
+ pfn = addr_to_pfn(regs, op.ea);
+ *addr = op.ea;
+ /* Keep the ULONG_MAX "no address" sentinel intact. */
+ *phys_addr = (pfn == ULONG_MAX) ? ULONG_MAX :
+ (pfn << PAGE_SHIFT);
+ return 0;
+ }
+ /*
+ * analyse_instr() might fail if the instruction is not a
+ * load/store (unexpected here) or if we got the NIP wrong.
+ */
+ }
+ *addr = 0;
+ *phys_addr = ULONG_MAX;
+ return -1;
+}
+
static int mce_handle_ierror(struct pt_regs *regs,
const struct mce_ierror_table table[],
- struct mce_error_info *mce_err, uint64_t *addr)
+ struct mce_error_info *mce_err, uint64_t *addr,
+ uint64_t *phys_addr)
{
uint64_t srr1 = regs->msr;
int handled = 0;
@@ -475,8 +541,22 @@ static int mce_handle_ierror(struct pt_regs *regs,
}
mce_err->severity = table[i].severity;
mce_err->initiator = table[i].initiator;
- if (table[i].nip_valid)
+ if (table[i].nip_valid) {
*addr = regs->nip;
+ if (mce_err->severity == MCE_SEV_ERROR_SYNC &&
+ table[i].error_type == MCE_ERROR_TYPE_UE) {
+ unsigned long pfn;
+
+ if (get_paca()->in_mce < MAX_MCE_DEPTH) {
+ pfn = addr_to_pfn(regs, regs->nip);
+ if (pfn != ULONG_MAX) {
+ *phys_addr =
+ (pfn << PAGE_SHIFT);
+ handled = 1;
+ }
+ }
+ }
+ }
return handled;
}
@@ -489,7 +569,8 @@ static int mce_handle_ierror(struct pt_regs *regs,
static int mce_handle_derror(struct pt_regs *regs,
const struct mce_derror_table table[],
- struct mce_error_info *mce_err, uint64_t *addr)
+ struct mce_error_info *mce_err, uint64_t *addr,
+ uint64_t *phys_addr)
{
uint64_t dsisr = regs->dsisr;
int handled = 0;
@@ -555,7 +636,17 @@ static int mce_handle_derror(struct pt_regs *regs,
mce_err->initiator = table[i].initiator;
if (table[i].dar_valid)
*addr = regs->dar;
-
+ else if (mce_err->severity == MCE_SEV_ERROR_SYNC &&
+ table[i].error_type == MCE_ERROR_TYPE_UE) {
+ /*
+ * We do a maximum of 4 nested MCE calls, see
+ * kernel/exception-64s.h
+ */
+ if (get_paca()->in_mce < MAX_MCE_DEPTH)
+ if (!mce_find_instr_ea_and_pfn(regs, addr,
+ phys_addr))
+ handled = 1;
+ }
found = 1;
}
@@ -592,19 +683,21 @@ static long mce_handle_error(struct pt_regs *regs,
const struct mce_ierror_table itable[])
{
struct mce_error_info mce_err = { 0 };
- uint64_t addr;
+ uint64_t addr, phys_addr = ULONG_MAX; /* ULONG_MAX: no physical address */
uint64_t srr1 = regs->msr;
long handled;
if (SRR1_MC_LOADSTORE(srr1))
- handled = mce_handle_derror(regs, dtable, &mce_err, &addr);
+ handled = mce_handle_derror(regs, dtable, &mce_err, &addr,
+ &phys_addr);
else
- handled = mce_handle_ierror(regs, itable, &mce_err, &addr);
+ handled = mce_handle_ierror(regs, itable, &mce_err, &addr,
+ &phys_addr);
if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
handled = mce_handle_ue_error(regs);
- save_mce_event(regs, handled, &mce_err, regs->nip, addr);
+ save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr);
return handled;
}
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 0b0f89685b67..759104b99f9f 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -429,7 +429,8 @@ static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs,
/* Find this stub, or if that fails, the next avail. entry */
stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr;
for (i = 0; stub_func_addr(stubs[i].funcdata); i++) {
- BUG_ON(i >= num_stubs);
+ if (WARN_ON(i >= num_stubs))
+ return 0;
if (stub_func_addr(stubs[i].funcdata) == func_addr(addr))
return (unsigned long)&stubs[i];
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index 91e037ab20a1..8237884ca389 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -115,32 +115,23 @@ static unsigned long can_optimize(struct kprobe *p)
static void optimized_callback(struct optimized_kprobe *op,
struct pt_regs *regs)
{
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
- unsigned long flags;
-
/* This is possible if op is under delayed unoptimizing */
if (kprobe_disabled(&op->kp))
return;
- local_irq_save(flags);
- hard_irq_disable();
+ preempt_disable();
if (kprobe_running()) {
kprobes_inc_nmissed_count(&op->kp);
} else {
__this_cpu_write(current_kprobe, &op->kp);
regs->nip = (unsigned long)op->kp.addr;
- kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
opt_pre_handler(&op->kp, regs);
__this_cpu_write(current_kprobe, NULL);
}
- /*
- * No need for an explicit __hard_irq_enable() here.
- * local_irq_restore() will re-enable interrupts,
- * if they were hard disabled.
- */
- local_irq_restore(flags);
+ preempt_enable_no_resched();
}
NOKPROBE_SYMBOL(optimized_callback);
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 2ff2b8a19f71..d6597038931d 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -90,7 +90,7 @@ static inline void free_lppacas(void) { }
#endif /* CONFIG_PPC_BOOK3S */
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
/*
* 3 persistent SLBs are registered here. The buffer will be zero
@@ -135,11 +135,11 @@ static struct slb_shadow * __init init_slb_shadow(int cpu)
return s;
}
-#else /* CONFIG_PPC_STD_MMU_64 */
+#else /* !CONFIG_PPC_BOOK3S_64 */
static void __init allocate_slb_shadows(int nr_cpus, int limit) { }
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
/* The Paca is an array with one entry per processor. Each contains an
* lppaca, which contains the information shared between the
@@ -170,9 +170,9 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu)
new_paca->kexec_state = KEXEC_STATE_NONE;
new_paca->__current = &init_task;
new_paca->data_offset = 0xfeeeeeeeeeeeeeeeULL;
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
new_paca->slb_shadow_ptr = init_slb_shadow(cpu);
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif
#ifdef CONFIG_PPC_BOOK3E
/* For now -- if we have threads this will be adjusted later */
@@ -262,8 +262,8 @@ void copy_mm_to_paca(struct mm_struct *mm)
get_paca()->mm_ctx_id = context->id;
#ifdef CONFIG_PPC_MM_SLICES
- VM_BUG_ON(!mm->context.addr_limit);
- get_paca()->addr_limit = mm->context.addr_limit;
+ VM_BUG_ON(!mm->context.slb_addr_limit);
+ get_paca()->mm_ctx_slb_addr_limit = mm->context.slb_addr_limit;
get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
memcpy(&get_paca()->mm_ctx_high_slices_psize,
&context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm));
@@ -271,7 +271,7 @@ void copy_mm_to_paca(struct mm_struct *mm)
get_paca()->mm_ctx_user_psize = context->user_psize;
get_paca()->mm_ctx_sllp = context->sllp;
#endif
-#else /* CONFIG_PPC_BOOK3S */
+#else /* !CONFIG_PPC_BOOK3S */
return;
#endif
}
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 02831a396419..0ac7aa346c69 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1740,15 +1740,3 @@ static void fixup_hide_host_resource_fsl(struct pci_dev *dev)
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MOTOROLA, PCI_ANY_ID, fixup_hide_host_resource_fsl);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, PCI_ANY_ID, fixup_hide_host_resource_fsl);
-
-static void fixup_vga(struct pci_dev *pdev)
-{
- u16 cmd;
-
- pci_read_config_word(pdev, PCI_COMMAND, &cmd);
- if ((cmd & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) || !vga_default_device())
- vga_set_default_device(pdev);
-
-}
-DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID,
- PCI_CLASS_DISPLAY_VGA, 8, fixup_vga);
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index 932b9741aa8f..15ce0306b092 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -90,14 +90,14 @@ int pcibios_unmap_io_space(struct pci_bus *bus)
* to do an appropriate TLB flush here too
*/
if (bus->self) {
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
struct resource *res = bus->resource[0];
#endif
pr_debug("IO unmapping for PCI-PCI bridge %s\n",
pci_name(bus->self));
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
__flush_hash_table_range(&init_mm, res->start + _IO_BASE,
res->end + _IO_BASE + 1);
#endif
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index a0c74bbf3454..bfdd783e3916 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -77,6 +77,13 @@
extern unsigned long _get_SP(void);
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/*
+ * Are we running in "Suspend disabled" mode? If so we have to block any
+ * sigreturn that would get us into suspended state, and we also warn in some
+ * other paths that we should never reach with suspend disabled.
+ */
+bool tm_suspend_disabled __ro_after_init = false;
+
static void check_if_tm_restore_required(struct task_struct *tsk)
{
/*
@@ -97,9 +104,23 @@ static inline bool msr_tm_active(unsigned long msr)
{
return MSR_TM_ACTIVE(msr);
}
+
+static bool tm_active_with_fp(struct task_struct *tsk)
+{
+ return msr_tm_active(tsk->thread.regs->msr) &&
+ (tsk->thread.ckpt_regs.msr & MSR_FP);
+}
+
+static bool tm_active_with_altivec(struct task_struct *tsk)
+{
+ return msr_tm_active(tsk->thread.regs->msr) &&
+ (tsk->thread.ckpt_regs.msr & MSR_VEC);
+}
#else
static inline bool msr_tm_active(unsigned long msr) { return false; }
static inline void check_if_tm_restore_required(struct task_struct *tsk) { }
+static inline bool tm_active_with_fp(struct task_struct *tsk) { return false; }
+static inline bool tm_active_with_altivec(struct task_struct *tsk) { return false; }
#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
bool strict_msr_control;
@@ -232,7 +253,7 @@ EXPORT_SYMBOL(enable_kernel_fp);
static int restore_fp(struct task_struct *tsk)
{
- if (tsk->thread.load_fp || msr_tm_active(tsk->thread.regs->msr)) {
+ if (tsk->thread.load_fp || tm_active_with_fp(tsk)) {
load_fp_state(&current->thread.fp_state);
current->thread.load_fp++;
return 1;
@@ -314,7 +335,7 @@ EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
static int restore_altivec(struct task_struct *tsk)
{
if (cpu_has_feature(CPU_FTR_ALTIVEC) &&
- (tsk->thread.load_vec || msr_tm_active(tsk->thread.regs->msr))) {
+ (tsk->thread.load_vec || tm_active_with_altivec(tsk))) {
load_vr_state(&tsk->thread.vr_state);
tsk->thread.used_vr = 1;
tsk->thread.load_vec++;
@@ -853,6 +874,10 @@ static void tm_reclaim_thread(struct thread_struct *thr,
if (!MSR_TM_SUSPENDED(mfmsr()))
return;
+ giveup_all(container_of(thr, struct task_struct, thread));
+
+ tm_reclaim(thr, cause);
+
/*
* If we are in a transaction and FP is off then we can't have
* used FP inside that transaction. Hence the checkpointed
@@ -871,10 +896,6 @@ static void tm_reclaim_thread(struct thread_struct *thr,
if ((thr->ckpt_regs.msr & MSR_VEC) == 0)
memcpy(&thr->ckvr_state, &thr->vr_state,
sizeof(struct thread_vr_state));
-
- giveup_all(container_of(thr, struct task_struct, thread));
-
- tm_reclaim(thr, thr->ckpt_regs.msr, cause);
}
void tm_reclaim_current(uint8_t cause)
@@ -903,6 +924,8 @@ static inline void tm_reclaim_task(struct task_struct *tsk)
if (!MSR_TM_ACTIVE(thr->regs->msr))
goto out_and_saveregs;
+ WARN_ON(tm_suspend_disabled);
+
TM_DEBUG("--- tm_reclaim on pid %d (NIP=%lx, "
"ccr=%lx, msr=%lx, trap=%lx)\n",
tsk->pid, thr->regs->nip,
@@ -923,11 +946,9 @@ out_and_saveregs:
tm_save_sprs(thr);
}
-extern void __tm_recheckpoint(struct thread_struct *thread,
- unsigned long orig_msr);
+extern void __tm_recheckpoint(struct thread_struct *thread);
-void tm_recheckpoint(struct thread_struct *thread,
- unsigned long orig_msr)
+void tm_recheckpoint(struct thread_struct *thread)
{
unsigned long flags;
@@ -946,15 +967,13 @@ void tm_recheckpoint(struct thread_struct *thread,
*/
tm_restore_sprs(thread);
- __tm_recheckpoint(thread, orig_msr);
+ __tm_recheckpoint(thread);
local_irq_restore(flags);
}
static inline void tm_recheckpoint_new_task(struct task_struct *new)
{
- unsigned long msr;
-
if (!cpu_has_feature(CPU_FTR_TM))
return;
@@ -973,13 +992,11 @@ static inline void tm_recheckpoint_new_task(struct task_struct *new)
tm_restore_sprs(&new->thread);
return;
}
- msr = new->thread.ckpt_regs.msr;
/* Recheckpoint to restore original checkpointed register state. */
- TM_DEBUG("*** tm_recheckpoint of pid %d "
- "(new->msr 0x%lx, new->origmsr 0x%lx)\n",
- new->pid, new->thread.regs->msr, msr);
+ TM_DEBUG("*** tm_recheckpoint of pid %d (new->msr 0x%lx)\n",
+ new->pid, new->thread.regs->msr);
- tm_recheckpoint(&new->thread, msr);
+ tm_recheckpoint(&new->thread);
/*
* The checkpointed state has been restored but the live state has
@@ -1119,6 +1136,10 @@ static inline void restore_sprs(struct thread_struct *old_thread,
if (old_thread->tar != new_thread->tar)
mtspr(SPRN_TAR, new_thread->tar);
}
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+ old_thread->tidr != new_thread->tidr)
+ mtspr(SPRN_TIDR, new_thread->tidr);
#endif
}
@@ -1155,7 +1176,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
}
#endif /* CONFIG_PPC64 */
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
batch = this_cpu_ptr(&ppc64_tlb_batch);
if (batch->active) {
current_thread_info()->local_flags |= _TLF_LAZY_MMU;
@@ -1163,7 +1184,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
__flush_tlb_pending(batch);
batch->active = 0;
}
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
switch_booke_debug_regs(&new->thread.debug);
@@ -1209,7 +1230,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
last = _switch(old_thread, new_thread);
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
if (current_thread_info()->local_flags & _TLF_LAZY_MMU) {
current_thread_info()->local_flags &= ~_TLF_LAZY_MMU;
batch = this_cpu_ptr(&ppc64_tlb_batch);
@@ -1223,22 +1244,22 @@ struct task_struct *__switch_to(struct task_struct *prev,
* The copy-paste buffer can only store into foreign real
* addresses, so unprivileged processes can not see the
* data or use it in any way unless they have foreign real
- * mappings. We don't have a VAS driver that allocates those
- * yet, so no cpabort is required.
+ * mappings. If the new process has the foreign real address
+ * mappings, we must issue a cp_abort to clear any state and
+ * prevent snooping, corruption or a covert channel.
+ *
+ * DD1 allows paste into normal system memory so we do an
+ * unpaired copy, rather than cp_abort, to clear the buffer,
+ * since cp_abort is quite expensive.
*/
- if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
- /*
- * DD1 allows paste into normal system memory, so we
- * do an unpaired copy here to clear the buffer and
- * prevent a covert channel being set up.
- *
- * cpabort is not used because it is quite expensive.
- */
+ if (current_thread_info()->task->thread.used_vas) {
+ asm volatile(PPC_CP_ABORT);
+ } else if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
asm volatile(PPC_COPY(%0, %1)
: : "r"(dummy_copy_buffer), "r"(0));
}
}
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
return last;
}
@@ -1434,6 +1455,137 @@ void flush_thread(void)
#endif /* CONFIG_HAVE_HW_BREAKPOINT */
}
+int set_thread_uses_vas(void)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ return -EINVAL;
+
+ current->thread.used_vas = 1;
+
+ /*
+ * Even a process that has no foreign real address mapping can use
+ * an unpaired COPY instruction (to no real effect). Issue CP_ABORT
+ * to clear any pending COPY and prevent a covert channel.
+ *
+ * __switch_to() will issue CP_ABORT on future context switches.
+ */
+ asm volatile(PPC_CP_ABORT);
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+ return 0;
+}
+
+#ifdef CONFIG_PPC64
+static DEFINE_SPINLOCK(vas_thread_id_lock);
+static DEFINE_IDA(vas_thread_ida);
+
+/*
+ * We need to assign a unique thread id to each thread in a process.
+ *
+ * This thread id, referred to as TIDR, and separate from Linux's tgid,
+ * is intended to be used to direct an ASB_Notify from the hardware to the
+ * thread, when a suitable event occurs in the system.
+ *
+ * One such event is a "paste" instruction in the context of Fast Thread
+ * Wakeup (aka Core-to-core wake up) in the Virtual Accelerator Switchboard
+ * (VAS) in POWER9.
+ *
+ * To get a unique TIDR per process we could simply reuse task_pid_nr() but
+ * the problem is that task_pid_nr() is not yet available when copy_thread()
+ * is called. Fixing that would require more intrusive changes to the
+ * arch-neutral code path in copy_process().
+ *
+ * Further, to assign unique TIDRs within each process, we need an atomic
+ * field (or an IDR) in task_struct, which again intrudes into the arch-
+ * neutral code. So try to assign globally unique TIDRs for now.
+ *
+ * NOTE: TIDR 0 indicates that the thread does not need a TIDR value.
+ * For now, only threads that expect to be notified by the VAS
+ * hardware need a TIDR value and we assign values > 0 for those.
+ */
+#define MAX_THREAD_CONTEXT ((1 << 16) - 1)
+static int assign_thread_tidr(void)
+{
+ int index;
+ int err;
+
+again:
+ if (!ida_pre_get(&vas_thread_ida, GFP_KERNEL))
+ return -ENOMEM;
+
+ spin_lock(&vas_thread_id_lock);
+ err = ida_get_new_above(&vas_thread_ida, 1, &index);
+ spin_unlock(&vas_thread_id_lock);
+
+ if (err == -EAGAIN)
+ goto again;
+ else if (err)
+ return err;
+
+ if (index > MAX_THREAD_CONTEXT) {
+ spin_lock(&vas_thread_id_lock);
+ ida_remove(&vas_thread_ida, index);
+ spin_unlock(&vas_thread_id_lock);
+ return -ENOMEM;
+ }
+
+ return index;
+}
+
+static void free_thread_tidr(int id)
+{
+ spin_lock(&vas_thread_id_lock);
+ ida_remove(&vas_thread_ida, id);
+ spin_unlock(&vas_thread_id_lock);
+}
+
+/*
+ * Clear any TIDR value assigned to this thread.
+ */
+void clear_thread_tidr(struct task_struct *t)
+{
+ if (!t->thread.tidr)
+ return;
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ mtspr(SPRN_TIDR, 0);
+ free_thread_tidr(t->thread.tidr);
+ t->thread.tidr = 0;
+}
+
+void arch_release_task_struct(struct task_struct *t)
+{
+ clear_thread_tidr(t);
+}
+
+/*
+ * Assign a unique TIDR (thread id) to task @t (must be 'current') and load
+ * it into SPRN_TIDR. Returns 0 on success or a negative errno on failure.
+ */
+int set_thread_tidr(struct task_struct *t)
+{
+ int rc;
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_300) || t != current)
+ return -EINVAL;
+
+ /* Check the error in a signed local; never store it in thread.tidr. */
+ rc = assign_thread_tidr();
+ if (rc < 0)
+ return rc;
+
+ t->thread.tidr = rc;
+ mtspr(SPRN_TIDR, t->thread.tidr);
+ return 0;
+}
+
+#endif /* CONFIG_PPC64 */
+
void
release_thread(struct task_struct *t)
{
@@ -1467,7 +1619,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
static void setup_ksp_vsid(struct task_struct *p, unsigned long sp)
{
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
unsigned long sp_vsid;
unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp;
@@ -1580,6 +1732,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
}
if (cpu_has_feature(CPU_FTR_HAS_PPR))
p->thread.ppr = INIT_PPR;
+
+ p->thread.tidr = 0;
#endif
kregs->nip = ppc_function_entry(f);
return 0;
@@ -1898,7 +2052,8 @@ unsigned long get_wchan(struct task_struct *p)
do {
sp = *(unsigned long *)sp;
- if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD))
+ if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD) ||
+ p->state == TASK_RUNNING)
return 0;
if (count > 0) {
ip = ((unsigned long *)sp)[STACK_FRAME_LR_SAVE];
@@ -2046,7 +2201,7 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
unsigned long base = mm->brk;
unsigned long ret;
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
/*
* If we are using 1TB segments and we are allowed to randomise
* the heap, we can put it above 1TB so it is backed by a 1TB
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index f83056297441..b15bae265c90 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -47,6 +47,7 @@
#include <asm/mmu.h>
#include <asm/paca.h>
#include <asm/pgtable.h>
+#include <asm/powernv.h>
#include <asm/iommu.h>
#include <asm/btext.h>
#include <asm/sections.h>
@@ -228,7 +229,7 @@ static void __init check_cpu_pa_features(unsigned long node)
ibm_pa_features, ARRAY_SIZE(ibm_pa_features));
}
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
static void __init init_mmu_slb_size(unsigned long node)
{
const __be32 *slb_size_ptr;
@@ -658,6 +659,38 @@ static void __init early_reserve_mem(void)
#endif
}
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static bool tm_disabled __initdata;
+
+static int __init parse_ppc_tm(char *str)
+{
+ bool res;
+
+ if (kstrtobool(str, &res))
+ return -EINVAL;
+
+ tm_disabled = !res;
+
+ return 0;
+}
+early_param("ppc_tm", parse_ppc_tm);
+
+static void __init tm_init(void)
+{
+ if (tm_disabled) {
+ pr_info("Disabling hardware transactional memory (HTM)\n");
+ cur_cpu_spec->cpu_user_features2 &=
+ ~(PPC_FEATURE2_HTM_NOSC | PPC_FEATURE2_HTM);
+ cur_cpu_spec->cpu_features &= ~CPU_FTR_TM;
+ return;
+ }
+
+ pnv_tm_init();
+}
+#else
+static void tm_init(void) { }
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
void __init early_init_devtree(void *params)
{
phys_addr_t limit;
@@ -767,6 +800,8 @@ void __init early_init_devtree(void *params)
powerpc_firmware_features |= FW_FEATURE_PS3_POSSIBLE;
#endif
+ tm_init();
+
DBG(" <- early_init_devtree()\n");
}
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 2e3bc16d02b2..2075322cd225 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -773,7 +773,7 @@ void arch_setup_pdev_archdata(struct platform_device *pdev)
static __init void print_system_info(void)
{
pr_info("-----------------------------------------------------\n");
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size);
#endif
#ifdef CONFIG_PPC_STD_MMU_32
@@ -800,7 +800,7 @@ static __init void print_system_info(void)
pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features);
#endif
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
if (htab_address)
pr_info("htab_address = 0x%p\n", htab_address);
if (htab_hash_mask)
@@ -898,7 +898,8 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_PPC_MM_SLICES
#ifdef CONFIG_PPC64
- init_mm.context.addr_limit = DEFAULT_MAP_WINDOW_USER64;
+ if (!radix_enabled())
+ init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64;
#else
#error "context.addr_limit not initialized."
#endif
diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h
index cfba134b3024..21c18071d9d5 100644
--- a/arch/powerpc/kernel/setup.h
+++ b/arch/powerpc/kernel/setup.h
@@ -45,6 +45,12 @@ void emergency_stack_init(void);
static inline void emergency_stack_init(void) { };
#endif
+#ifdef CONFIG_PPC64
+void record_spr_defaults(void);
+#else
+static inline void record_spr_defaults(void) { };
+#endif
+
/*
* Having this in kvm_ppc.h makes include dependencies too
* tricky to solve for setup-common.c so have it here.
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index b89c6aac48c9..8956a9856604 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -69,6 +69,8 @@
#include <asm/opal.h>
#include <asm/cputhreads.h>
+#include "setup.h"
+
#ifdef DEBUG
#define DBG(fmt...) udbg_printf(fmt)
#else
@@ -317,6 +319,13 @@ void __init early_setup(unsigned long dt_ptr)
early_init_mmu();
/*
+ * After firmware and early platform setup code has set things up,
+ * we note the SPR values for configurable control/performance
+ * registers, and use those as initial defaults.
+ */
+ record_spr_defaults();
+
+ /*
* At this point, we can let interrupts switch to virtual mode
* (the MMU has been setup), so adjust the MSR in the PACA to
* have IR and DR set and enable AIL if it exists
@@ -360,8 +369,16 @@ void early_setup_secondary(void)
#if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE)
static bool use_spinloop(void)
{
- if (!IS_ENABLED(CONFIG_PPC_BOOK3E))
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S)) {
+ /*
+ * See comments in head_64.S -- not all platforms insert
+ * secondaries at __secondary_hold and wait at the spin
+ * loop.
+ */
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ return false;
return true;
+ }
/*
* When book3e boots from kexec, the ePAPR spin table does
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index e9436c5e1e09..3d7539b90010 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -103,7 +103,7 @@ static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka,
static void do_signal(struct task_struct *tsk)
{
sigset_t *oldset = sigmask_to_save();
- struct ksignal ksig;
+ struct ksignal ksig = { .sig = 0 };
int ret;
int is32 = is_32bit_task();
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 92fb1c8dbbd8..9ffd73296f64 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -94,40 +94,13 @@
*/
static inline int put_sigset_t(compat_sigset_t __user *uset, sigset_t *set)
{
- compat_sigset_t cset;
-
- switch (_NSIG_WORDS) {
- case 4: cset.sig[6] = set->sig[3] & 0xffffffffull;
- cset.sig[7] = set->sig[3] >> 32;
- case 3: cset.sig[4] = set->sig[2] & 0xffffffffull;
- cset.sig[5] = set->sig[2] >> 32;
- case 2: cset.sig[2] = set->sig[1] & 0xffffffffull;
- cset.sig[3] = set->sig[1] >> 32;
- case 1: cset.sig[0] = set->sig[0] & 0xffffffffull;
- cset.sig[1] = set->sig[0] >> 32;
- }
- return copy_to_user(uset, &cset, sizeof(*uset));
+ return put_compat_sigset(uset, set, sizeof(*uset));
}
static inline int get_sigset_t(sigset_t *set,
const compat_sigset_t __user *uset)
{
- compat_sigset_t s32;
-
- if (copy_from_user(&s32, uset, sizeof(*uset)))
- return -EFAULT;
-
- /*
- * Swap the 2 words of the 64-bit sigset_t (they are stored
- * in the "wrong" endian in 32-bit user storage).
- */
- switch (_NSIG_WORDS) {
- case 4: set->sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32);
- case 3: set->sig[2] = s32.sig[4] | (((long)s32.sig[5]) << 32);
- case 2: set->sig[1] = s32.sig[2] | (((long)s32.sig[3]) << 32);
- case 1: set->sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32);
- }
- return 0;
+ return get_compat_sigset(set, uset);
}
#define to_user_ptr(p) ptr_to_compat(p)
@@ -519,6 +492,8 @@ static int save_tm_user_regs(struct pt_regs *regs,
{
unsigned long msr = regs->msr;
+ WARN_ON(tm_suspend_disabled);
+
/* Remove TM bits from thread's MSR. The MSR in the sigcontext
* just indicates to userland that we were doing a transaction, but we
* don't want to return in transactional state. This also ensures
@@ -769,6 +744,8 @@ static long restore_tm_user_regs(struct pt_regs *regs,
int i;
#endif
+ if (tm_suspend_disabled)
+ return 1;
/*
* restore general registers but not including MSR or SOFTE. Also
* take care of keeping r2 (TLS) intact if not a signal.
@@ -876,7 +853,7 @@ static long restore_tm_user_regs(struct pt_regs *regs,
/* Make sure the transaction is marked as failed */
current->thread.tm_texasr |= TEXASR_FS;
/* This loads the checkpointed FP/VEC state, if used */
- tm_recheckpoint(&current->thread, msr);
+ tm_recheckpoint(&current->thread);
/* This loads the speculative FP/VEC state, if used */
msr_check_and_set(msr & (MSR_FP | MSR_VEC));
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index b2c002993d78..4b9ca3570344 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -214,6 +214,8 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc,
BUG_ON(!MSR_TM_ACTIVE(regs->msr));
+ WARN_ON(tm_suspend_disabled);
+
/* Remove TM bits from thread's MSR. The MSR in the sigcontext
* just indicates to userland that we were doing a transaction, but we
* don't want to return in transactional state. This also ensures
@@ -430,6 +432,9 @@ static long restore_tm_sigcontexts(struct task_struct *tsk,
BUG_ON(tsk != current);
+ if (tm_suspend_disabled)
+ return -EINVAL;
+
/* copy the GPRs */
err |= __copy_from_user(regs->gpr, tm_sc->gp_regs, sizeof(regs->gpr));
err |= __copy_from_user(&tsk->thread.ckpt_regs, sc->gp_regs,
@@ -558,7 +563,7 @@ static long restore_tm_sigcontexts(struct task_struct *tsk,
/* Make sure the transaction is marked as failed */
tsk->thread.tm_texasr |= TEXASR_FS;
/* This loads the checkpointed FP/VEC state, if used */
- tm_recheckpoint(&tsk->thread, msr);
+ tm_recheckpoint(&tsk->thread);
msr_check_and_set(msr & (MSR_FP | MSR_VEC));
if (msr & MSR_FP) {
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 4437c70c7c2b..b8d4a1dac39f 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -590,6 +590,17 @@ static void sysfs_create_dscr_default(void)
if (cpu_has_feature(CPU_FTR_DSCR))
err = device_create_file(cpu_subsys.dev_root, &dev_attr_dscr_default);
}
+
+void __init record_spr_defaults(void)
+{
+ int cpu;
+
+ if (cpu_has_feature(CPU_FTR_DSCR)) {
+ dscr_default = mfspr(SPRN_DSCR);
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+ paca[cpu].dscr_default = dscr_default;
+ }
+}
#endif /* CONFIG_PPC64 */
#ifdef HAS_PPC_PMC_PA6T
diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c
index a3374e8a258c..e3c5f75d137c 100644
--- a/arch/powerpc/kernel/tau_6xx.c
+++ b/arch/powerpc/kernel/tau_6xx.c
@@ -230,8 +230,7 @@ int __init TAU_init(void)
/* first, set up the window shrinking timer */
- init_timer(&tau_timer);
- tau_timer.function = tau_timeout_smp;
+ setup_timer(&tau_timer, tau_timeout_smp, 0UL);
tau_timer.expires = jiffies + shrink_timer;
add_timer(&tau_timer);
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 1da12f521cb7..b92ac8e711db 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -80,15 +80,12 @@ _GLOBAL(tm_abort)
blr
/* void tm_reclaim(struct thread_struct *thread,
- * unsigned long orig_msr,
* uint8_t cause)
*
* - Performs a full reclaim. This destroys outstanding
* transactions and updates thread->regs.tm_ckpt_* with the
* original checkpointed state. Note that thread->regs is
* unchanged.
- * - FP regs are written back to thread->transact_fpr before
- * reclaiming. These are the transactional (current) versions.
*
* Purpose is to both abort transactions of, and preserve the state of,
* a transactions at a context switch. We preserve/restore both sets of process
@@ -99,9 +96,9 @@ _GLOBAL(tm_abort)
* Call with IRQs off, stacks get all out of sync for some periods in here!
*/
_GLOBAL(tm_reclaim)
- mfcr r6
+ mfcr r5
mflr r0
- stw r6, 8(r1)
+ stw r5, 8(r1)
std r0, 16(r1)
std r2, STK_GOT(r1)
stdu r1, -TM_FRAME_SIZE(r1)
@@ -109,7 +106,6 @@ _GLOBAL(tm_reclaim)
/* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. */
std r3, STK_PARAM(R3)(r1)
- std r4, STK_PARAM(R4)(r1)
SAVE_NVGPRS(r1)
/* We need to setup MSR for VSX register save instructions. */
@@ -139,8 +135,8 @@ _GLOBAL(tm_reclaim)
std r1, PACAR1(r13)
/* Clear MSR RI since we are about to change r1, EE is already off. */
- li r4, 0
- mtmsrd r4, 1
+ li r5, 0
+ mtmsrd r5, 1
/*
* BE CAREFUL HERE:
@@ -152,7 +148,7 @@ _GLOBAL(tm_reclaim)
* to user register state. (FPRs, CCR etc. also!)
* Use an sprg and a tm_scratch in the PACA to shuffle.
*/
- TRECLAIM(R5) /* Cause in r5 */
+ TRECLAIM(R4) /* Cause in r4 */
/* ******************** GPRs ******************** */
/* Stash the checkpointed r13 away in the scratch SPR and get the real
@@ -243,40 +239,30 @@ _GLOBAL(tm_reclaim)
/* ******************** FPR/VR/VSRs ************
- * After reclaiming, capture the checkpointed FPRs/VRs /if used/.
- *
- * (If VSX used, FP and VMX are implied. Or, we don't need to look
- * at MSR.VSX as copying FP regs if .FP, vector regs if .VMX covers it.)
- *
- * We're passed the thread's MSR as the second parameter
+ * After reclaiming, capture the checkpointed FPRs/VRs.
*
* We enabled VEC/FP/VSX in the msr above, so we can execute these
* instructions!
*/
- ld r4, STK_PARAM(R4)(r1) /* Second parameter, MSR * */
mr r3, r12
- andis. r0, r4, MSR_VEC@h
- beq dont_backup_vec
+ /* Altivec (VEC/VMX/VR) */
addi r7, r3, THREAD_CKVRSTATE
SAVE_32VRS(0, r6, r7) /* r6 scratch, r7 transact vr state */
mfvscr v0
li r6, VRSTATE_VSCR
stvx v0, r7, r6
-dont_backup_vec:
+
+ /* VRSAVE */
mfspr r0, SPRN_VRSAVE
std r0, THREAD_CKVRSAVE(r3)
- andi. r0, r4, MSR_FP
- beq dont_backup_fp
-
+ /* Floating Point (FP) */
addi r7, r3, THREAD_CKFPSTATE
SAVE_32FPRS_VSRS(0, R6, R7) /* r6 scratch, r7 transact fp state */
-
mffs fr0
stfd fr0,FPSTATE_FPSCR(r7)
-dont_backup_fp:
/* TM regs, incl TEXASR -- these live in thread_struct. Note they've
* been updated by the treclaim, to explain to userland the failure
@@ -344,22 +330,19 @@ _GLOBAL(__tm_recheckpoint)
*/
subi r7, r7, STACK_FRAME_OVERHEAD
+ /* We need to set up the MSR for the FP/VMX/VSX register restore instructions. */
mfmsr r6
- /* R4 = original MSR to indicate whether thread used FP/Vector etc. */
-
- /* Enable FP/vec in MSR if necessary! */
- lis r5, MSR_VEC@h
+ mr r5, r6
ori r5, r5, MSR_FP
- and. r5, r4, r5
- beq restore_gprs /* if neither, skip both */
-
+#ifdef CONFIG_ALTIVEC
+ oris r5, r5, MSR_VEC@h
+#endif
#ifdef CONFIG_VSX
BEGIN_FTR_SECTION
- oris r5, r5, MSR_VSX@h
+ oris r5,r5, MSR_VSX@h
END_FTR_SECTION_IFSET(CPU_FTR_VSX)
#endif
- or r5, r6, r5 /* Set MSR.FP+.VSX/.VEC */
- mtmsr r5
+ mtmsrd r5
#ifdef CONFIG_ALTIVEC
/*
@@ -368,28 +351,20 @@ _GLOBAL(__tm_recheckpoint)
* thread.fp_state[] version holds the 'live' (transactional)
* and will be loaded subsequently by any FPUnavailable trap.
*/
- andis. r0, r4, MSR_VEC@h
- beq dont_restore_vec
-
addi r8, r3, THREAD_CKVRSTATE
li r5, VRSTATE_VSCR
lvx v0, r8, r5
mtvscr v0
REST_32VRS(0, r5, r8) /* r5 scratch, r8 ptr */
-dont_restore_vec:
ld r5, THREAD_CKVRSAVE(r3)
mtspr SPRN_VRSAVE, r5
#endif
- andi. r0, r4, MSR_FP
- beq dont_restore_fp
-
addi r8, r3, THREAD_CKFPSTATE
lfd fr0, FPSTATE_FPSCR(r8)
MTFSF_L(fr0)
REST_32FPRS_VSRS(0, R4, R8)
-dont_restore_fp:
mtmsr r6 /* FP/Vec off again! */
restore_gprs:
diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
index b4e2b7165f79..3f3e81852422 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
+++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
@@ -110,9 +110,9 @@ ftrace_call:
/* NIP has not been altered, skip over further checks */
beq 1f
- /* Check if there is an active kprobe on us */
+ /* Check if there is an active jprobe on us */
subi r3, r14, 4
- bl is_current_kprobe_addr
+ bl __is_active_jprobe
nop
/*
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 13c9dcdcba69..f3eb61be0d30 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -37,6 +37,7 @@
#include <linux/kdebug.h>
#include <linux/ratelimit.h>
#include <linux/context_tracking.h>
+#include <linux/smp.h>
#include <asm/emulated_ops.h>
#include <asm/pgtable.h>
@@ -699,6 +700,187 @@ void SMIException(struct pt_regs *regs)
die("System Management Interrupt", regs, SIGABRT);
}
+#ifdef CONFIG_VSX
+static void p9_hmi_special_emu(struct pt_regs *regs)
+{
+ unsigned int ra, rb, t, i, sel, instr, rc;
+ const void __user *addr;
+ u8 vbuf[16], *vdst;
+ unsigned long ea, msr, msr_mask;
+ bool swap;
+
+ if (__get_user_inatomic(instr, (unsigned int __user *)regs->nip))
+ return;
+
+ /*
+ * lxvb16x opcode: 0x7c0006d8
+ * lxvd2x opcode: 0x7c000698
+ * lxvh8x opcode: 0x7c000658
+ * lxvw4x opcode: 0x7c000618
+ */
+ if ((instr & 0xfc00073e) != 0x7c000618) {
+ pr_devel("HMI vec emu: not vector CI %i:%s[%d] nip=%016lx"
+ " instr=%08x\n",
+ smp_processor_id(), current->comm, current->pid,
+ regs->nip, instr);
+ return;
+ }
+
+ /* Grab vector registers into the task struct */
+ msr = regs->msr; /* Grab msr before we flush the bits */
+ flush_vsx_to_thread(current);
+ enable_kernel_altivec();
+
+ /*
+ * Is userspace running with a different endianness? (This is rare but
+ * not impossible.)
+ */
+ swap = (msr & MSR_LE) != (MSR_KERNEL & MSR_LE);
+
+ /* Decode the instruction */
+ ra = (instr >> 16) & 0x1f;
+ rb = (instr >> 11) & 0x1f;
+ t = (instr >> 21) & 0x1f;
+ if (instr & 1)
+ vdst = (u8 *)&current->thread.vr_state.vr[t];
+ else
+ vdst = (u8 *)&current->thread.fp_state.fpr[t][0];
+
+ /* Grab the vector address */
+ ea = regs->gpr[rb] + (ra ? regs->gpr[ra] : 0);
+ if (is_32bit_task())
+ ea &= 0xfffffffful;
+ addr = (__force const void __user *)ea;
+
+ /* Check it */
+ if (!access_ok(VERIFY_READ, addr, 16)) {
+ pr_devel("HMI vec emu: bad access %i:%s[%d] nip=%016lx"
+ " instr=%08x addr=%016lx\n",
+ smp_processor_id(), current->comm, current->pid,
+ regs->nip, instr, (unsigned long)addr);
+ return;
+ }
+
+ /* Read the vector */
+ rc = 0;
+ if ((unsigned long)addr & 0xfUL)
+ /* unaligned case */
+ rc = __copy_from_user_inatomic(vbuf, addr, 16);
+ else
+ __get_user_atomic_128_aligned(vbuf, addr, rc);
+ if (rc) {
+ pr_devel("HMI vec emu: page fault %i:%s[%d] nip=%016lx"
+ " instr=%08x addr=%016lx\n",
+ smp_processor_id(), current->comm, current->pid,
+ regs->nip, instr, (unsigned long)addr);
+ return;
+ }
+
+ pr_devel("HMI vec emu: emulated vector CI %i:%s[%d] nip=%016lx"
+ " instr=%08x addr=%016lx\n",
+ smp_processor_id(), current->comm, current->pid, regs->nip,
+ instr, (unsigned long) addr);
+
+ /* Grab instruction "selector" */
+ sel = (instr >> 6) & 3;
+
+ /*
+ * Check to make sure the facility is actually enabled. This
+ * could happen if we get a false positive hit.
+ *
+ * lxvd2x/lxvw4x always check MSR VSX sel = 0,2
+ * lxvh8x/lxvb16x check MSR VSX or VEC depending on VSR used sel = 1,3
+ */
+ msr_mask = MSR_VSX;
+ if ((sel & 1) && (instr & 1)) /* lxvh8x & lxvb16x + VSR >= 32 */
+ msr_mask = MSR_VEC;
+ if (!(msr & msr_mask)) {
+ pr_devel("HMI vec emu: MSR fac clear %i:%s[%d] nip=%016lx"
+ " instr=%08x msr:%016lx\n",
+ smp_processor_id(), current->comm, current->pid,
+ regs->nip, instr, msr);
+ return;
+ }
+
+ /* Do logging here before we modify sel based on endian */
+ switch (sel) {
+ case 0: /* lxvw4x */
+ PPC_WARN_EMULATED(lxvw4x, regs);
+ break;
+ case 1: /* lxvh8x */
+ PPC_WARN_EMULATED(lxvh8x, regs);
+ break;
+ case 2: /* lxvd2x */
+ PPC_WARN_EMULATED(lxvd2x, regs);
+ break;
+ case 3: /* lxvb16x */
+ PPC_WARN_EMULATED(lxvb16x, regs);
+ break;
+ }
+
+#ifdef __LITTLE_ENDIAN__
+ /*
+ * An LE kernel stores the vector in the task struct as an LE
+ * byte array (effectively swapping both the components and
+ * the content of the components). Those instructions expect
+ * the components to remain in ascending address order, so we
+ * swap them back.
+ *
+ * If we are running a BE user space, the expectation is that
+ * of a simple memcpy, so forcing the emulation to look like
+ * a lxvb16x should do the trick.
+ */
+ if (swap)
+ sel = 3;
+
+ switch (sel) {
+ case 0: /* lxvw4x */
+ for (i = 0; i < 4; i++)
+ ((u32 *)vdst)[i] = ((u32 *)vbuf)[3-i];
+ break;
+ case 1: /* lxvh8x */
+ for (i = 0; i < 8; i++)
+ ((u16 *)vdst)[i] = ((u16 *)vbuf)[7-i];
+ break;
+ case 2: /* lxvd2x */
+ for (i = 0; i < 2; i++)
+ ((u64 *)vdst)[i] = ((u64 *)vbuf)[1-i];
+ break;
+ case 3: /* lxvb16x */
+ for (i = 0; i < 16; i++)
+ vdst[i] = vbuf[15-i];
+ break;
+ }
+#else /* __LITTLE_ENDIAN__ */
+ /* On a big endian kernel, a BE userspace only needs a memcpy */
+ if (!swap)
+ sel = 3;
+
+ /* Otherwise, we need to swap the content of the components */
+ switch (sel) {
+ case 0: /* lxvw4x */
+ for (i = 0; i < 4; i++)
+ ((u32 *)vdst)[i] = cpu_to_le32(((u32 *)vbuf)[i]);
+ break;
+ case 1: /* lxvh8x */
+ for (i = 0; i < 8; i++)
+ ((u16 *)vdst)[i] = cpu_to_le16(((u16 *)vbuf)[i]);
+ break;
+ case 2: /* lxvd2x */
+ for (i = 0; i < 2; i++)
+ ((u64 *)vdst)[i] = cpu_to_le64(((u64 *)vbuf)[i]);
+ break;
+ case 3: /* lxvb16x */
+ memcpy(vdst, vbuf, 16);
+ break;
+ }
+#endif /* !__LITTLE_ENDIAN__ */
+
+ /* Go to next instruction */
+ regs->nip += 4;
+}
+#endif /* CONFIG_VSX */
+
void handle_hmi_exception(struct pt_regs *regs)
{
struct pt_regs *old_regs;
@@ -706,6 +888,21 @@ void handle_hmi_exception(struct pt_regs *regs)
old_regs = set_irq_regs(regs);
irq_enter();
+#ifdef CONFIG_VSX
+ /* Real mode flagged P9 special emu is needed */
+ if (local_paca->hmi_p9_special_emu) {
+ local_paca->hmi_p9_special_emu = 0;
+
+ /*
+ * We don't want to take page faults while doing the
+ * emulation, we just replay the instruction if necessary.
+ */
+ pagefault_disable();
+ p9_hmi_special_emu(regs);
+ pagefault_enable();
+ }
+#endif /* CONFIG_VSX */
+
if (ppc_md.handle_hmi_exception)
ppc_md.handle_hmi_exception(regs);
@@ -1140,13 +1337,8 @@ void program_check_exception(struct pt_regs *regs)
* - A treclaim is attempted when non transactional.
* - A tend is illegally attempted.
* - writing a TM SPR when transactional.
- */
- if (!user_mode(regs) &&
- report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) {
- regs->nip += 4;
- goto bail;
- }
- /* If usermode caused this, it's done something illegal and
+ *
+ * If usermode caused this, it's done something illegal and
* gets a SIGILL slap on the wrist. We call it an illegal
* operand to distinguish from the instruction just being bad
* (e.g. executing a 'tend' on a CPU without TM!); it's an
@@ -1487,7 +1679,7 @@ void fp_unavailable_tm(struct pt_regs *regs)
/* Reclaim didn't save out any FPRs to transact_fprs. */
/* Enable FP for the task: */
- regs->msr |= (MSR_FP | current->thread.fpexc_mode);
+ current->thread.load_fp = 1;
/* This loads and recheckpoints the FP registers from
* thread.fpr[]. They will remain in registers after the
@@ -1495,15 +1687,7 @@ void fp_unavailable_tm(struct pt_regs *regs)
* If VMX is in use, the VRs now hold checkpointed values,
* so we don't want to load the VRs from the thread_struct.
*/
- tm_recheckpoint(&current->thread, MSR_FP);
-
- /* If VMX is in use, get the transactional values back */
- if (regs->msr & MSR_VEC) {
- msr_check_and_set(MSR_VEC);
- load_vr_state(&current->thread.vr_state);
- /* At this point all the VSX state is loaded, so enable it */
- regs->msr |= MSR_VSX;
- }
+ tm_recheckpoint(&current->thread);
}
void altivec_unavailable_tm(struct pt_regs *regs)
@@ -1516,21 +1700,13 @@ void altivec_unavailable_tm(struct pt_regs *regs)
"MSR=%lx\n",
regs->nip, regs->msr);
tm_reclaim_current(TM_CAUSE_FAC_UNAV);
- regs->msr |= MSR_VEC;
- tm_recheckpoint(&current->thread, MSR_VEC);
+ current->thread.load_vec = 1;
+ tm_recheckpoint(&current->thread);
current->thread.used_vr = 1;
-
- if (regs->msr & MSR_FP) {
- msr_check_and_set(MSR_FP);
- load_fp_state(&current->thread.fp_state);
- regs->msr |= MSR_VSX;
- }
}
void vsx_unavailable_tm(struct pt_regs *regs)
{
- unsigned long orig_msr = regs->msr;
-
/* See the comments in fp_unavailable_tm(). This works similarly,
* though we're loading both FP and VEC registers in here.
*
@@ -1544,29 +1720,13 @@ void vsx_unavailable_tm(struct pt_regs *regs)
current->thread.used_vsr = 1;
- /* If FP and VMX are already loaded, we have all the state we need */
- if ((orig_msr & (MSR_FP | MSR_VEC)) == (MSR_FP | MSR_VEC)) {
- regs->msr |= MSR_VSX;
- return;
- }
-
/* This reclaims FP and/or VR regs if they're already enabled */
tm_reclaim_current(TM_CAUSE_FAC_UNAV);
- regs->msr |= MSR_VEC | MSR_FP | current->thread.fpexc_mode |
- MSR_VSX;
-
- /* This loads & recheckpoints FP and VRs; but we have
- * to be sure not to overwrite previously-valid state.
- */
- tm_recheckpoint(&current->thread, regs->msr & ~orig_msr);
-
- msr_check_and_set(orig_msr & (MSR_FP | MSR_VEC));
+ current->thread.load_vec = 1;
+ current->thread.load_fp = 1;
- if (orig_msr & MSR_FP)
- load_fp_state(&current->thread.fp_state);
- if (orig_msr & MSR_VEC)
- load_vr_state(&current->thread.vr_state);
+ tm_recheckpoint(&current->thread);
}
#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
@@ -1924,6 +2084,10 @@ struct ppc_emulated ppc_emulated = {
WARN_EMULATED_SETUP(mfdscr),
WARN_EMULATED_SETUP(mtdscr),
WARN_EMULATED_SETUP(lq_stq),
+ WARN_EMULATED_SETUP(lxvw4x),
+ WARN_EMULATED_SETUP(lxvh8x),
+ WARN_EMULATED_SETUP(lxvd2x),
+ WARN_EMULATED_SETUP(lxvb16x),
#endif
};
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 1d89163d67f2..87da80ccced1 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -98,8 +98,7 @@ static void wd_lockup_ipi(struct pt_regs *regs)
else
dump_stack();
- if (hardlockup_panic)
- nmi_panic(regs, "Hard LOCKUP");
+ /* Do not panic from here because that can recurse into NMI IPI layer */
}
static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
@@ -135,15 +134,18 @@ static void watchdog_smp_panic(int cpu, u64 tb)
pr_emerg("Watchdog CPU:%d detected Hard LOCKUP other CPUS:%*pbl\n",
cpu, cpumask_pr_args(&wd_smp_cpus_pending));
- /*
- * Try to trigger the stuck CPUs.
- */
- for_each_cpu(c, &wd_smp_cpus_pending) {
- if (c == cpu)
- continue;
- smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
+ if (!sysctl_hardlockup_all_cpu_backtrace) {
+ /*
+ * Try to trigger the stuck CPUs, unless we are going to
+ * get a backtrace on all of them anyway.
+ */
+ for_each_cpu(c, &wd_smp_cpus_pending) {
+ if (c == cpu)
+ continue;
+ smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
+ }
+ smp_flush_nmi_ipi(1000000);
}
- smp_flush_nmi_ipi(1000000);
/* Take the stuck CPUs out of the watch group */
set_cpumask_stuck(&wd_smp_cpus_pending, tb);
@@ -275,9 +277,12 @@ void arch_touch_nmi_watchdog(void)
{
unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
int cpu = smp_processor_id();
+ u64 tb = get_tb();
- if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks)
- watchdog_timer_interrupt(cpu);
+ if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
+ per_cpu(wd_timer_tb, cpu) = tb;
+ wd_smp_clear_cpu_pending(cpu, tb);
+ }
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);