diff options
Diffstat (limited to 'arch/powerpc/kernel')
38 files changed, 1011 insertions, 441 deletions
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 6c6cce937dd8..1b6bc7fba996 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -129,7 +129,7 @@ obj64-$(CONFIG_PPC_TRANSACTIONAL_MEM) += tm.o obj-$(CONFIG_PPC64) += $(obj64-y) obj-$(CONFIG_PPC32) += $(obj32-y) -ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC_CORE),) +ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC_CORE)(CONFIG_PPC_BOOK3S),) obj-y += ppc_save_regs.o endif diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 8cfb20e38cfe..6b958414b4e0 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -185,7 +185,7 @@ int main(void) #ifdef CONFIG_PPC_MM_SLICES OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize); OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize); - DEFINE(PACA_ADDR_LIMIT, offsetof(struct paca_struct, addr_limit)); + OFFSET(PACA_SLB_ADDR_LIMIT, paca_struct, mm_ctx_slb_addr_limit); DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def)); #endif /* CONFIG_PPC_MM_SLICES */ #endif @@ -208,7 +208,7 @@ int main(void) OFFSET(TCD_ESEL_FIRST, tlb_core_data, esel_first); #endif /* CONFIG_PPC_BOOK3E */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 OFFSET(PACASLBCACHE, paca_struct, slb_cache); OFFSET(PACASLBCACHEPTR, paca_struct, slb_cache_ptr); OFFSET(PACAVMALLOCSLLP, paca_struct, vmalloc_sllp); @@ -230,7 +230,7 @@ int main(void) OFFSET(LPPACA_DTLIDX, lppaca, dtl_idx); OFFSET(LPPACA_YIELDCOUNT, lppaca, yield_count); OFFSET(PACA_DTL_RIDX, paca_struct, dtl_ridx); -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ OFFSET(PACAEMERGSP, paca_struct, emergency_sp); #ifdef CONFIG_PPC_BOOK3S_64 OFFSET(PACAMCEMERGSP, paca_struct, mc_emergency_sp); @@ -642,6 +642,7 @@ int main(void) HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr); HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi); HSTATE_FIELD(HSTATE_PTID, ptid); + HSTATE_FIELD(HSTATE_TID, tid); HSTATE_FIELD(HSTATE_MMCR0, 
host_mmcr[0]); HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]); HSTATE_FIELD(HSTATE_MMCRA, host_mmcr[2]); @@ -667,6 +668,8 @@ int main(void) OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar); OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap); OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped); + OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set); + OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 760872916013..1350f49d81a8 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -547,11 +547,31 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check_early = __machine_check_early_realmode_p9, .platform = "power9", }, - { /* Power9 */ + { /* Power9 DD2.0 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0200, + .cpu_name = "POWER9 (raw)", + .cpu_features = CPU_FTRS_POWER9_DD2_0, + .cpu_user_features = COMMON_USER_POWER9, + .cpu_user_features2 = COMMON_USER2_POWER9, + .mmu_features = MMU_FTRS_POWER9, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power9", + .oprofile_type = PPC_OPROFILE_INVALID, + .cpu_setup = __setup_cpu_power9, + .cpu_restore = __restore_cpu_power9, + .flush_tlb = __flush_tlb_power9, + .machine_check_early = __machine_check_early_realmode_p9, + .platform = "power9", + }, + { /* Power9 DD 2.1 or later (see DD2.0 above) */ .pvr_mask = 0xffff0000, .pvr_value = 0x004e0000, .cpu_name = "POWER9 (raw)", - .cpu_features = CPU_FTRS_POWER9, + .cpu_features = CPU_FTRS_POWER9_DD2_1, .cpu_user_features = COMMON_USER_POWER9, .cpu_user_features2 = COMMON_USER2_POWER9, .mmu_features = MMU_FTRS_POWER9, diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 7275fed271af..602e0fde19b4 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -634,7 
+634,7 @@ static struct dt_cpu_feature_match __initdata {"no-execute", feat_enable, 0}, {"strong-access-ordering", feat_enable, CPU_FTR_SAO}, {"cache-inhibited-large-page", feat_enable_large_ci, 0}, - {"coprocessor-icswx", feat_enable, CPU_FTR_ICSWX}, + {"coprocessor-icswx", feat_enable, 0}, {"hypervisor-virtualization-interrupt", feat_enable_hvi, 0}, {"program-priority-register", feat_enable, CPU_FTR_HAS_PPR}, {"wait", feat_enable, 0}, @@ -735,6 +735,8 @@ static __init void cpufeatures_cpu_quirks(void) */ if ((version & 0xffffff00) == 0x004e0100) cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD1; + else if ((version & 0xffffefff) == 0x004e0200) + cur_cpu_spec->cpu_features &= ~CPU_FTR_POWER9_DD2_1; } static void __init cpufeatures_setup_finished(void) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 116000b45531..cbca0a667682 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -972,6 +972,18 @@ static struct notifier_block eeh_reboot_nb = { .notifier_call = eeh_reboot_notifier, }; +void eeh_probe_devices(void) +{ + struct pci_controller *hose, *tmp; + struct pci_dn *pdn; + + /* Enable EEH for all adapters */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + pdn = hose->pci_data; + traverse_pci_dn(pdn, eeh_ops->probe, NULL); + } +} + /** * eeh_init - EEH initialization * @@ -987,22 +999,11 @@ static struct notifier_block eeh_reboot_nb = { * Even if force-off is set, the EEH hardware is still enabled, so that * newer systems can boot. 
*/ -int eeh_init(void) +static int eeh_init(void) { struct pci_controller *hose, *tmp; - struct pci_dn *pdn; - static int cnt = 0; int ret = 0; - /* - * We have to delay the initialization on PowerNV after - * the PCI hierarchy tree has been built because the PEs - * are figured out based on PCI devices instead of device - * tree nodes - */ - if (machine_is(powernv) && cnt++ <= 0) - return ret; - /* Register reboot notifier */ ret = register_reboot_notifier(&eeh_reboot_nb); if (ret) { @@ -1028,22 +1029,7 @@ int eeh_init(void) if (ret) return ret; - /* Enable EEH for all adapters */ - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { - pdn = hose->pci_data; - traverse_pci_dn(pdn, eeh_ops->probe, NULL); - } - - /* - * Call platform post-initialization. Actually, It's good chance - * to inform platform that EEH is ready to supply service if the - * I/O cache stuff has been built up. - */ - if (eeh_ops->post_init) { - ret = eeh_ops->post_init(); - if (ret) - return ret; - } + eeh_probe_devices(); if (eeh_enabled()) pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); @@ -1757,10 +1743,6 @@ static int eeh_enable_dbgfs_set(void *data, u64 val) else eeh_add_flag(EEH_FORCE_DISABLED); - /* Notify the backend */ - if (eeh_ops->post_init) - eeh_ops->post_init(); - return 0; } diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 8b840191df59..4f71e4c9beb7 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -441,7 +441,7 @@ static void *eeh_add_virt_device(void *data, void *userdata) } #ifdef CONFIG_PPC_POWERNV - pci_iov_add_virtfn(edev->physfn, pdn->vf_index, 0); + pci_iov_add_virtfn(edev->physfn, pdn->vf_index); #endif return NULL; } @@ -499,7 +499,7 @@ static void *eeh_rmv_device(void *data, void *userdata) #ifdef CONFIG_PPC_POWERNV struct pci_dn *pdn = eeh_dev_to_pdn(edev); - pci_iov_remove_virtfn(edev->physfn, pdn->vf_index, 0); + pci_iov_remove_virtfn(edev->physfn, pdn->vf_index); 
edev->pdev = NULL; /* @@ -623,7 +623,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, struct eeh_rmv_data *rmv_data) { struct pci_bus *frozen_bus = eeh_pe_bus_get(pe); - struct timeval tstamp; + time64_t tstamp; int cnt, rc; struct eeh_dev *edev; diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 2e8d1b2b5af4..2d4956e97aa9 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -526,16 +526,16 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) */ void eeh_pe_update_time_stamp(struct eeh_pe *pe) { - struct timeval tstamp; + time64_t tstamp; if (!pe) return; if (pe->freeze_count <= 0) { pe->freeze_count = 0; - do_gettimeofday(&pe->tstamp); + pe->tstamp = ktime_get_seconds(); } else { - do_gettimeofday(&tstamp); - if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) { + tstamp = ktime_get_seconds(); + if (tstamp - pe->tstamp > 3600) { pe->tstamp = tstamp; pe->freeze_count = 0; } diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 4a0fd4f40245..3320bcac7192 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -539,7 +539,7 @@ _GLOBAL(_switch) std r6,PACACURRENT(r13) /* Set new 'current' */ ld r8,KSP(r4) /* new stack pointer */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 BEGIN_MMU_FTR_SECTION b 2f END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) @@ -588,7 +588,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) slbmte r7,r0 isync 2: -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ CURRENT_THREAD_INFO(r7, r8) /* base of new stack */ /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 1c80bd292e48..e441b469dc8f 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -114,6 +114,7 @@ EXC_VIRT_NONE(0x4000, 0x100) cmpwi cr3,r10,2 ; \ BRANCH_TO_C000(r10, 
system_reset_idle_common) ; \ 1: \ + KVMTEST_PR(n) ; \ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #else #define IDLETEST NOTEST @@ -130,6 +131,7 @@ EXC_REAL_BEGIN(system_reset, 0x100, 0x100) EXC_REAL_END(system_reset, 0x100, 0x100) EXC_VIRT_NONE(0x4100, 0x100) +TRAMP_KVM(PACA_EXNMI, 0x100) #ifdef CONFIG_PPC_P7_NAP EXC_COMMON_BEGIN(system_reset_idle_common) @@ -233,7 +235,7 @@ BEGIN_FTR_SECTION addi r10,r10,1 /* increment paca->in_mce */ sth r10,PACA_IN_MCE(r13) /* Limit nested MCE to level 4 to avoid stack overflow */ - cmpwi r10,4 + cmpwi r10,MAX_MCE_DEPTH bgt 2f /* Check if we hit limit of 4 */ std r11,GPR1(r1) /* Save r1 on the stack. */ std r11,0(r1) /* make stack chain pointer */ @@ -542,7 +544,7 @@ EXC_COMMON_BEGIN(instruction_access_common) RECONCILE_IRQ_STATE(r10, r11) ld r12,_MSR(r1) ld r3,_NIP(r1) - andis. r4,r12,DSISR_BAD_FAULT_64S@h + andis. r4,r12,DSISR_SRR1_MATCH_64S@h li r5,0x400 std r3,_DAR(r1) std r4,_DSISR(r1) @@ -606,7 +608,7 @@ EXC_COMMON_BEGIN(slb_miss_common) cmpdi cr5,r11,MSR_RI crset 4*cr0+eq -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 BEGIN_MMU_FTR_SECTION bl slb_allocate END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) @@ -888,12 +890,6 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) #define LOAD_SYSCALL_HANDLER(reg) \ __LOAD_HANDLER(reg, system_call_common) -#define SYSCALL_FASTENDIAN_TEST \ -BEGIN_FTR_SECTION \ - cmpdi r0,0x1ebe ; \ - beq- 1f ; \ -END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ - /* * After SYSCALL_KVMTEST, we reach here with PACA in r13, r13 in r9, * and HMT_MEDIUM. @@ -908,6 +904,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ rfid ; \ b . 
; /* prevent speculative execution */ +#ifdef CONFIG_PPC_FAST_ENDIAN_SWITCH +#define SYSCALL_FASTENDIAN_TEST \ +BEGIN_FTR_SECTION \ + cmpdi r0,0x1ebe ; \ + beq- 1f ; \ +END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ + #define SYSCALL_FASTENDIAN \ /* Fast LE/BE switch system call */ \ 1: mfspr r12,SPRN_SRR1 ; \ @@ -916,6 +919,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ mr r13,r9 ; \ rfid ; /* return to userspace */ \ b . ; /* prevent speculative execution */ +#else +#define SYSCALL_FASTENDIAN_TEST +#define SYSCALL_FASTENDIAN +#endif /* CONFIG_PPC_FAST_ENDIAN_SWITCH */ #if defined(CONFIG_RELOCATABLE) /* @@ -1033,6 +1040,8 @@ TRAMP_REAL_BEGIN(hmi_exception_early) EXCEPTION_PROLOG_COMMON_3(0xe60) addi r3,r1,STACK_FRAME_OVERHEAD BRANCH_LINK_TO_FAR(hmi_exception_realmode) /* Function call ABI */ + cmpdi cr0,r3,0 + /* Windup the stack. */ /* Move original HSRR0 and HSRR1 into the respective regs */ ld r9,_MSR(r1) @@ -1049,10 +1058,15 @@ TRAMP_REAL_BEGIN(hmi_exception_early) REST_8GPRS(2, r1) REST_GPR(10, r1) ld r11,_CCR(r1) + REST_2GPRS(12, r1) + bne 1f mtcr r11 REST_GPR(11, r1) - REST_2GPRS(12, r1) - /* restore original r1. */ + ld r1,GPR1(r1) + hrfid + +1: mtcr r11 + REST_GPR(11, r1) ld r1,GPR1(r1) /* @@ -1065,8 +1079,9 @@ hmi_exception_after_realmode: EXCEPTION_PROLOG_0(PACA_EXGEN) b tramp_real_hmi_exception -EXC_COMMON_ASYNC(hmi_exception_common, 0xe60, handle_hmi_exception) - +EXC_COMMON_BEGIN(hmi_exception_common) +EXCEPTION_COMMON(PACA_EXGEN, 0xe60, hmi_exception_common, handle_hmi_exception, + ret_from_except, FINISH_NAP;ADD_NVGPRS;ADD_RECONCILE;RUNLATCH_ON) EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0x20) EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x20, 0xe80) @@ -1505,8 +1520,8 @@ USE_TEXT_SECTION() */ .balign IFETCH_ALIGN_BYTES do_hash_page: - #ifdef CONFIG_PPC_STD_MMU_64 - lis r0,DSISR_BAD_FAULT_64S@h +#ifdef CONFIG_PPC_BOOK3S_64 + lis r0,(DSISR_BAD_FAULT_64S|DSISR_DABRMATCH)@h ori r0,r0,DSISR_BAD_FAULT_64S@l and. r0,r4,r0 /* weird error? 
*/ bne- handle_page_fault /* if not, try to insert a HPTE */ @@ -1536,7 +1551,7 @@ do_hash_page: /* Reload DSISR into r4 for the DABR check below */ ld r4,_DSISR(r1) -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ /* Here we have a page fault that hash_page can't handle. */ handle_page_fault: @@ -1565,7 +1580,7 @@ handle_dabr_fault: 12: b ret_from_except_lite -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* We have a page fault that hash_page could handle but HV refused * the PTE insertion */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index e1431800bfb9..04ea5c04fd24 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -1270,10 +1270,15 @@ static ssize_t fadump_release_memory_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + int input = -1; + if (!fw_dump.dump_active) return -EPERM; - if (buf[0] == '1') { + if (kstrtoint(buf, 0, &input)) + return -EINVAL; + + if (input == 1) { /* * Take away the '/proc/vmcore'. We are releasing the dump * memory, hence it will not be valid anymore. @@ -1307,21 +1312,25 @@ static ssize_t fadump_register_store(struct kobject *kobj, const char *buf, size_t count) { int ret = 0; + int input = -1; if (!fw_dump.fadump_enabled || fdm_active) return -EPERM; + if (kstrtoint(buf, 0, &input)) + return -EINVAL; + mutex_lock(&fadump_mutex); - switch (buf[0]) { - case '0': + switch (input) { + case 0: if (fw_dump.dump_registered == 0) { goto unlock_out; } /* Un-register Firmware-assisted dump */ fadump_unregister_dump(&fdm); break; - case '1': + case 1: if (fw_dump.dump_registered == 1) { ret = -EEXIST; goto unlock_out; diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 8c54166491e7..29b2fed93289 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -388,7 +388,7 @@ DataAccess: EXCEPTION_PROLOG mfspr r10,SPRN_DSISR stw r10,_DSISR(r11) - andis. 
r0,r10,DSISR_BAD_FAULT_32S@h + andis. r0,r10,(DSISR_BAD_FAULT_32S|DSISR_DABRMATCH)@h bne 1f /* if not, try to put a PTE */ mfspr r4,SPRN_DAR /* into the hash table */ rlwinm r3,r10,32-15,21,21 /* DSISR_STORE -> _PAGE_RW */ diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index ff8511d6d8ea..aa71a90f5222 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -55,12 +55,18 @@ * * For pSeries or server processors: * 1. The MMU is off & open firmware is running in real mode. - * 2. The kernel is entered at __start + * 2. The primary CPU enters at __start. + * 3. If the RTAS supports "query-cpu-stopped-state", then secondary + * CPUs will enter as directed by "start-cpu" RTAS call, which is + * generic_secondary_smp_init, with PIR in r3. + * 4. Else the secondary CPUs will enter at secondary_hold (0x60) as + * directed by the "start-cpu" RTAS call, with PIR in r3. * -or- For OPAL entry: - * 1. The MMU is off, processor in HV mode, primary CPU enters at 0 - * with device-tree in gpr3. We also get OPAL base in r8 and - * entry in r9 for debugging purposes - * 2. Secondary processors enter at 0x60 with PIR in gpr3 + * 1. The MMU is off, processor in HV mode. + * 2. The primary CPU enters at 0 with device-tree in r3, OPAL base + * in r8, and entry in r9 for debugging purposes. + * 3. Secondary CPUs enter as directed by OPAL_START_CPU call, which + * is at generic_secondary_smp_init, with PIR in r3. * * For Book3E processors: * 1. 
The MMU is on running in AS0 in a state defined in ePAPR diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 1125c9be9e06..01e1c1997893 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -112,12 +112,14 @@ power9_save_additional_sprs: std r4, STOP_HFSCR(r13) mfspr r3, SPRN_MMCRA - mfspr r4, SPRN_MMCR1 + mfspr r4, SPRN_MMCR0 std r3, STOP_MMCRA(r13) - std r4, STOP_MMCR1(r13) + std r4, _MMCR0(r1) - mfspr r3, SPRN_MMCR2 - std r3, STOP_MMCR2(r13) + mfspr r3, SPRN_MMCR1 + mfspr r4, SPRN_MMCR2 + std r3, STOP_MMCR1(r13) + std r4, STOP_MMCR2(r13) blr power9_restore_additional_sprs: @@ -135,11 +137,14 @@ power9_restore_additional_sprs: ld r4, STOP_MMCRA(r13) mtspr SPRN_HFSCR, r3 mtspr SPRN_MMCRA, r4 - /* We have already restored PACA_MMCR0 */ - ld r3, STOP_MMCR1(r13) - ld r4, STOP_MMCR2(r13) - mtspr SPRN_MMCR1, r3 - mtspr SPRN_MMCR2, r4 + + ld r3, _MMCR0(r1) + ld r4, STOP_MMCR1(r13) + mtspr SPRN_MMCR0, r3 + mtspr SPRN_MMCR1, r4 + + ld r3, STOP_MMCR2(r13) + mtspr SPRN_MMCR2, r3 blr /* @@ -319,20 +324,13 @@ enter_winkle: /* * r3 - PSSCR value corresponding to the requested stop state. */ +power_enter_stop: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -power_enter_stop_kvm_rm: - /* - * This is currently unused because POWER9 KVM does not have to - * gather secondary threads into sibling mode, but the code is - * here in case that function is required. - * - * Tell KVM we're entering idle. - */ + /* Tell KVM we're entering idle */ li r4,KVM_HWTHREAD_IN_IDLE /* DO THIS IN REAL MODE! See comment above. */ stb r4,HSTATE_HWTHREAD_STATE(r13) #endif -power_enter_stop: /* * Check if we are executing the lite variant with ESL=EC=0 */ @@ -357,13 +355,15 @@ power_enter_stop: b pnv_wakeup_noloss .Lhandle_esl_ec_set: +BEGIN_FTR_SECTION /* - * POWER9 DD2 can incorrectly set PMAO when waking up after a - * state-loss idle. 
Saving and restoring MMCR0 over idle is a + * POWER9 DD2.0 or earlier can incorrectly set PMAO when waking up after + * a state-loss idle. Saving and restoring MMCR0 over idle is a * workaround. */ mfspr r4,SPRN_MMCR0 std r4,_MMCR0(r1) +END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1) /* * Check if the requested state is a deep idle state. @@ -496,18 +496,6 @@ pnv_powersave_wakeup_mce: b pnv_powersave_wakeup -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -kvm_start_guest_check: - li r0,KVM_HWTHREAD_IN_KERNEL - stb r0,HSTATE_HWTHREAD_STATE(r13) - /* Order setting hwthread_state vs. testing hwthread_req */ - sync - lbz r0,HSTATE_HWTHREAD_REQ(r13) - cmpwi r0,0 - beqlr - b kvm_start_guest -#endif - /* * Called from reset vector for powersave wakeups. * cr3 - set to gt if waking up with partial/complete hypervisor state loss @@ -532,9 +520,15 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) mr r3,r12 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -BEGIN_FTR_SECTION - bl kvm_start_guest_check -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + li r0,KVM_HWTHREAD_IN_KERNEL + stb r0,HSTATE_HWTHREAD_STATE(r13) + /* Order setting hwthread_state vs. testing hwthread_req */ + sync + lbz r0,HSTATE_HWTHREAD_REQ(r13) + cmpwi r0,0 + beq 1f + b kvm_start_guest +1: #endif /* Return SRR1 from power7_nap() */ @@ -555,15 +549,17 @@ pnv_restore_hyp_resource_arch300: * then clear bit 60 in MMCRA to ensure the PMU starts running. */ blt cr3,1f +BEGIN_FTR_SECTION PPC_INVALIDATE_ERAT ld r1,PACAR1(r13) + ld r4,_MMCR0(r1) + mtspr SPRN_MMCR0,r4 +END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1) mfspr r4,SPRN_MMCRA ori r4,r4,(1 << (63-60)) mtspr SPRN_MMCRA,r4 xori r4,r4,(1 << (63-60)) mtspr SPRN_MMCRA,r4 - ld r4,_MMCR0(r1) - mtspr SPRN_MMCR0,r4 1: /* * POWER ISA 3. 
Use PSSCR to determine if we diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 4e65bf82f5e0..b7a84522e652 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -143,6 +143,13 @@ notrace unsigned int __check_irq_replay(void) */ unsigned char happened = local_paca->irq_happened; + /* + * We are responding to the next interrupt, so interrupt-off + * latencies should be reset here. + */ + trace_hardirqs_on(); + trace_hardirqs_off(); + if (happened & PACA_IRQ_HARD_DIS) { /* Clear bit 0 which we wouldn't clear otherwise */ local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; @@ -270,6 +277,7 @@ notrace void arch_local_irq_restore(unsigned long en) #endif /* CONFIG_TRACE_IRQFLAGS */ set_soft_enabled(0); + trace_hardirqs_off(); /* * Check if anything needs to be re-emitted. We haven't @@ -279,6 +287,7 @@ notrace void arch_local_irq_restore(unsigned long en) replay = __check_irq_replay(); /* We can soft-enable now */ + trace_hardirqs_on(); set_soft_enabled(1); /* @@ -394,11 +403,19 @@ bool prep_irq_for_idle_irqsoff(void) /* * Take the SRR1 wakeup reason, index into this table to find the * appropriate irq_happened bit. + * + * System reset exceptions taken in idle state also come through here, + * but they are NMI interrupts so do not need to wait for IRQs to be + * restored, and should be taken as early as practical. These are marked + * with 0xff in the table. The Power ISA specifies 0100b as the system + * reset interrupt reason. 
*/ +#define IRQ_SYSTEM_RESET 0xff + static const u8 srr1_to_lazyirq[0x10] = { 0, 0, 0, PACA_IRQ_DBELL, - 0, + IRQ_SYSTEM_RESET, PACA_IRQ_DBELL, PACA_IRQ_DEC, 0, @@ -407,15 +424,43 @@ static const u8 srr1_to_lazyirq[0x10] = { PACA_IRQ_HMI, 0, 0, 0, 0, 0 }; +void replay_system_reset(void) +{ + struct pt_regs regs; + + ppc_save_regs(&regs); + regs.trap = 0x100; + get_paca()->in_nmi = 1; + system_reset_exception(&regs); + get_paca()->in_nmi = 0; +} +EXPORT_SYMBOL_GPL(replay_system_reset); + void irq_set_pending_from_srr1(unsigned long srr1) { unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18; + u8 reason = srr1_to_lazyirq[idx]; + + /* + * Take the system reset now, which is immediately after registers + * are restored from idle. It's an NMI, so interrupts need not be + * re-enabled before it is taken. + */ + if (unlikely(reason == IRQ_SYSTEM_RESET)) { + replay_system_reset(); + return; + } /* * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0, - * so this can be called unconditionally with srr1 wake reason. + * so this can be called unconditionally with the SRR1 wake + * reason as returned by the idle code, which uses 0 to mean no + * interrupt. + * + * If a future CPU was to designate this as an interrupt reason, + * then a new index for no interrupt must be assigned. */ - local_paca->irq_happened |= srr1_to_lazyirq[idx]; + local_paca->irq_happened |= reason; } #endif /* CONFIG_PPC_BOOK3S */ diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 6c089d9757c9..7a1f99f1b47f 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -25,6 +25,21 @@ #include <linux/preempt.h> #include <linux/ftrace.h> +/* + * This is called from ftrace code after invoking registered handlers to + * disambiguate regs->nip changes done by jprobes and livepatch. We check if + * there is an active jprobe at the provided address (mcount location). 
+ */ +int __is_active_jprobe(unsigned long addr) +{ + if (!preemptible()) { + struct kprobe *p = raw_cpu_read(current_kprobe); + return (p && (unsigned long)p->addr == addr) ? 1 : 0; + } + + return 0; +} + static nokprobe_inline int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, unsigned long orig_nip) @@ -60,11 +75,8 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, { struct kprobe *p; struct kprobe_ctlblk *kcb; - unsigned long flags; - /* Disable irq for emulating a breakpoint and avoiding preempt */ - local_irq_save(flags); - hard_irq_disable(); + preempt_disable(); p = get_kprobe((kprobe_opcode_t *)nip); if (unlikely(!p) || kprobe_disabled(p)) @@ -86,13 +98,17 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) __skip_singlestep(p, regs, kcb, orig_nip); - /* - * If pre_handler returns !0, it sets regs->nip and - * resets current kprobe. - */ + else { + /* + * If pre_handler returns !0, it sets regs->nip and + * resets current kprobe. In this case, we should not + * re-enable preemption. + */ + return; + } } end: - local_irq_restore(flags); + preempt_enable_no_resched(); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index bebc3007a793..ca5d5a081e75 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -43,12 +43,6 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}}; -int is_current_kprobe_addr(unsigned long addr) -{ - struct kprobe *p = kprobe_running(); - return (p && (unsigned long)p->addr == addr) ? 
1 : 0; -} - bool arch_within_kprobe_blacklist(unsigned long addr) { return (addr >= (unsigned long)__kprobes_text_start && @@ -59,7 +53,7 @@ bool arch_within_kprobe_blacklist(unsigned long addr) kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset) { - kprobe_opcode_t *addr; + kprobe_opcode_t *addr = NULL; #ifdef PPC64_ELF_ABI_v2 /* PPC64 ABIv2 needs local entry point */ @@ -91,36 +85,29 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset) * Also handle <module:symbol> format. */ char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN]; - const char *modsym; bool dot_appended = false; - if ((modsym = strchr(name, ':')) != NULL) { - modsym++; - if (*modsym != '\0' && *modsym != '.') { - /* Convert to <module:.symbol> */ - strncpy(dot_name, name, modsym - name); - dot_name[modsym - name] = '.'; - dot_name[modsym - name + 1] = '\0'; - strncat(dot_name, modsym, - sizeof(dot_name) - (modsym - name) - 2); - dot_appended = true; - } else { - dot_name[0] = '\0'; - strncat(dot_name, name, sizeof(dot_name) - 1); - } - } else if (name[0] != '.') { - dot_name[0] = '.'; - dot_name[1] = '\0'; - strncat(dot_name, name, KSYM_NAME_LEN - 2); + const char *c; + ssize_t ret = 0; + int len = 0; + + if ((c = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) { + c++; + len = c - name; + memcpy(dot_name, name, len); + } else + c = name; + + if (*c != '\0' && *c != '.') { + dot_name[len++] = '.'; dot_appended = true; - } else { - dot_name[0] = '\0'; - strncat(dot_name, name, KSYM_NAME_LEN - 1); } - addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name); - if (!addr && dot_appended) { - /* Let's try the original non-dot symbol lookup */ + ret = strscpy(dot_name + len, c, KSYM_NAME_LEN); + if (ret > 0) + addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name); + + /* Fallback to the original non-dot symbol lookup */ + if (!addr && dot_appended) addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); - } #else addr = (kprobe_opcode_t 
*)kallsyms_lookup_name(name); #endif @@ -239,7 +226,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) } NOKPROBE_SYMBOL(arch_prepare_kretprobe); -int try_to_emulate(struct kprobe *p, struct pt_regs *regs) +static int try_to_emulate(struct kprobe *p, struct pt_regs *regs) { int ret; unsigned int insn = *p->ainsn.insn; @@ -261,9 +248,20 @@ int try_to_emulate(struct kprobe *p, struct pt_regs *regs) */ printk("Can't step on instruction %x\n", insn); BUG(); - } else if (ret == 0) - /* This instruction can't be boosted */ - p->ainsn.boostable = -1; + } else { + /* + * If we haven't previously emulated this instruction, then it + * can't be boosted. Note it down so we don't try to do so again. + * + * If, however, we had emulated this instruction in the past, + * then this is just an error with the current run (for + * instance, exceptions due to a load/store). We return 0 so + * that this is now single-stepped, but continue to try + * emulating it in subsequent probe hits. + */ + if (unlikely(p->ainsn.boostable != 1)) + p->ainsn.boostable = -1; + } return ret; } @@ -639,24 +637,22 @@ NOKPROBE_SYMBOL(setjmp_pre_handler); void __used jprobe_return(void) { - asm volatile("trap" ::: "memory"); + asm volatile("jprobe_return_trap:\n" + "trap\n" + ::: "memory"); } NOKPROBE_SYMBOL(jprobe_return); -static void __used jprobe_return_end(void) -{ -} -NOKPROBE_SYMBOL(jprobe_return_end); - int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - /* - * FIXME - we should ideally be validating that we got here 'cos - * of the "trap" in jprobe_return() above, before restoring the - * saved regs... 
- */ + if (regs->nip != ppc_kallsyms_lookup_name("jprobe_return_trap")) { + pr_debug("longjmp_break_handler NIP (0x%lx) does not match jprobe_return_trap (0x%lx)\n", + regs->nip, ppc_kallsyms_lookup_name("jprobe_return_trap")); + return 0; + } + memcpy(regs, &kcb->jprobe_saved_regs, sizeof(struct pt_regs)); /* It's OK to start function graph tracing again */ unpause_graph_tracing(); diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 5c12e21d0d1a..49d34d7271e7 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -360,7 +360,7 @@ void default_machine_kexec(struct kimage *image) /* NOTREACHED */ } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* Values we need to export to the second kernel via the device tree. */ static unsigned long htab_base; static unsigned long htab_size; @@ -402,4 +402,4 @@ static int __init export_htab_values(void) return 0; } late_initcall(export_htab_values); -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 9b2ea7e71c06..742e4658c5dc 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -39,11 +39,21 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); static DEFINE_PER_CPU(int, mce_queue_count); static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); +/* Queue for delayed MCE UE events. 
*/ +static DEFINE_PER_CPU(int, mce_ue_count); +static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], + mce_ue_event_queue); + static void machine_check_process_queued_event(struct irq_work *work); +void machine_check_ue_event(struct machine_check_event *evt); +static void machine_process_ue_event(struct work_struct *work); + static struct irq_work mce_event_process_work = { .func = machine_check_process_queued_event, }; +DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); + static void mce_set_error_info(struct machine_check_event *mce, struct mce_error_info *mce_err) { @@ -82,7 +92,7 @@ static void mce_set_error_info(struct machine_check_event *mce, */ void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, - uint64_t nip, uint64_t addr) + uint64_t nip, uint64_t addr, uint64_t phys_addr) { int index = __this_cpu_inc_return(mce_nest_count) - 1; struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]); @@ -140,6 +150,11 @@ void save_mce_event(struct pt_regs *regs, long handled, } else if (mce->error_type == MCE_ERROR_TYPE_UE) { mce->u.ue_error.effective_address_provided = true; mce->u.ue_error.effective_address = addr; + if (phys_addr != ULONG_MAX) { + mce->u.ue_error.physical_address_provided = true; + mce->u.ue_error.physical_address = phys_addr; + machine_check_ue_event(mce); + } } return; } @@ -193,6 +208,26 @@ void release_mce_event(void) get_mce_event(NULL, true); } + +/* + * Queue up the MCE event which then can be handled later. + */ +void machine_check_ue_event(struct machine_check_event *evt) +{ + int index; + + index = __this_cpu_inc_return(mce_ue_count) - 1; + /* If queue is full, just return for now. */ + if (index >= MAX_MC_EVT) { + __this_cpu_dec(mce_ue_count); + return; + } + memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt)); + + /* Queue work to process this event later. */ + schedule_work(&mce_ue_event_work); +} + /* * Queue up the MCE event which then can be handled later. 
*/ @@ -215,7 +250,39 @@ void machine_check_queue_event(void) /* Queue irq work to process this event later. */ irq_work_queue(&mce_event_process_work); } - +/* + * process pending MCE event from the mce event queue. This function will be + * called during syscall exit. + */ +static void machine_process_ue_event(struct work_struct *work) +{ + int index; + struct machine_check_event *evt; + + while (__this_cpu_read(mce_ue_count) > 0) { + index = __this_cpu_read(mce_ue_count) - 1; + evt = this_cpu_ptr(&mce_ue_event_queue[index]); +#ifdef CONFIG_MEMORY_FAILURE + /* + * This should probably queued elsewhere, but + * oh! well + */ + if (evt->error_type == MCE_ERROR_TYPE_UE) { + if (evt->u.ue_error.physical_address_provided) { + unsigned long pfn; + + pfn = evt->u.ue_error.physical_address >> + PAGE_SHIFT; + memory_failure(pfn, SIGBUS, 0); + } else + pr_warn("Failed to identify bad address from " + "where the uncorrectable error (UE) " + "was generated\n"); + } +#endif + __this_cpu_dec(mce_ue_count); + } +} /* * process pending MCE event from the mce event queue. This function will be * called during syscall exit. 
@@ -223,6 +290,7 @@ void machine_check_queue_event(void) static void machine_check_process_queued_event(struct irq_work *work) { int index; + struct machine_check_event *evt; add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); @@ -232,8 +300,8 @@ static void machine_check_process_queued_event(struct irq_work *work) */ while (__this_cpu_read(mce_queue_count) > 0) { index = __this_cpu_read(mce_queue_count) - 1; - machine_check_print_event_info( - this_cpu_ptr(&mce_event_queue[index]), false); + evt = this_cpu_ptr(&mce_event_queue[index]); + machine_check_print_event_info(evt, false); __this_cpu_dec(mce_queue_count); } } @@ -340,7 +408,7 @@ void machine_check_print_event_info(struct machine_check_event *evt, printk("%s Effective address: %016llx\n", level, evt->u.ue_error.effective_address); if (evt->u.ue_error.physical_address_provided) - printk("%s Physical address: %016llx\n", + printk("%s Physical address: %016llx\n", level, evt->u.ue_error.physical_address); break; case MCE_ERROR_TYPE_SLB: @@ -411,45 +479,6 @@ void machine_check_print_event_info(struct machine_check_event *evt, } EXPORT_SYMBOL_GPL(machine_check_print_event_info); -uint64_t get_mce_fault_addr(struct machine_check_event *evt) -{ - switch (evt->error_type) { - case MCE_ERROR_TYPE_UE: - if (evt->u.ue_error.effective_address_provided) - return evt->u.ue_error.effective_address; - break; - case MCE_ERROR_TYPE_SLB: - if (evt->u.slb_error.effective_address_provided) - return evt->u.slb_error.effective_address; - break; - case MCE_ERROR_TYPE_ERAT: - if (evt->u.erat_error.effective_address_provided) - return evt->u.erat_error.effective_address; - break; - case MCE_ERROR_TYPE_TLB: - if (evt->u.tlb_error.effective_address_provided) - return evt->u.tlb_error.effective_address; - break; - case MCE_ERROR_TYPE_USER: - if (evt->u.user_error.effective_address_provided) - return evt->u.user_error.effective_address; - break; - case MCE_ERROR_TYPE_RA: - if (evt->u.ra_error.effective_address_provided) - return 
evt->u.ra_error.effective_address; - break; - case MCE_ERROR_TYPE_LINK: - if (evt->u.link_error.effective_address_provided) - return evt->u.link_error.effective_address; - break; - default: - case MCE_ERROR_TYPE_UNKNOWN: - break; - } - return 0; -} -EXPORT_SYMBOL(get_mce_fault_addr); - /* * This function is called in real mode. Strictly no printk's please. * @@ -470,6 +499,34 @@ long hmi_exception_realmode(struct pt_regs *regs) { __this_cpu_inc(irq_stat.hmi_exceptions); +#ifdef CONFIG_PPC_BOOK3S_64 + /* Workaround for P9 vector CI loads (see p9_hmi_special_emu) */ + if (pvr_version_is(PVR_POWER9)) { + unsigned long hmer = mfspr(SPRN_HMER); + + /* Do we have the debug bit set */ + if (hmer & PPC_BIT(17)) { + hmer &= ~PPC_BIT(17); + mtspr(SPRN_HMER, hmer); + + /* + * Now to avoid problems with soft-disable we + * only do the emulation if we are coming from + * user space + */ + if (user_mode(regs)) + local_paca->hmi_p9_special_emu = 1; + + /* + * Don't bother going to OPAL if that's the + * only relevant bit. + */ + if (!(hmer & mfspr(SPRN_HMEER))) + return local_paca->hmi_p9_special_emu; + } + } +#endif /* CONFIG_PPC_BOOK3S_64 */ + wait_for_subcore_guest_exit(); if (ppc_md.hmi_exception_early) @@ -477,5 +534,5 @@ long hmi_exception_realmode(struct pt_regs *regs) wait_for_tb_resync(); - return 0; + return 1; } diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 72f153c6f3fa..644f7040b91c 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -27,6 +27,36 @@ #include <asm/mmu.h> #include <asm/mce.h> #include <asm/machdep.h> +#include <asm/pgtable.h> +#include <asm/pte-walk.h> +#include <asm/sstep.h> +#include <asm/exception-64s.h> + +/* + * Convert an address related to an mm to a PFN. NOTE: we are in real + * mode, we could potentially race with page table updates. 
+ */ +static unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) +{ + pte_t *ptep; + unsigned long flags; + struct mm_struct *mm; + + if (user_mode(regs)) + mm = current->mm; + else + mm = &init_mm; + + local_irq_save(flags); + if (mm == current->mm) + ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL); + else + ptep = find_init_mm_pte(addr, NULL); + local_irq_restore(flags); + if (!ptep || pte_special(*ptep)) + return ULONG_MAX; + return pte_pfn(*ptep); +} static void flush_tlb_206(unsigned int num_sets, unsigned int action) { @@ -128,7 +158,7 @@ void __flush_tlb_power9(unsigned int action) { unsigned int num_sets; - if (radix_enabled()) + if (early_radix_enabled()) num_sets = POWER9_TLB_SETS_RADIX; else num_sets = POWER9_TLB_SETS_HASH; @@ -138,7 +168,7 @@ void __flush_tlb_power9(unsigned int action) /* flush SLBs and reload */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 static void flush_and_reload_slb(void) { struct slb_shadow *slb; @@ -185,7 +215,7 @@ static void flush_erat(void) static int mce_flush(int what) { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 if (what == MCE_FLUSH_SLB) { flush_and_reload_slb(); return 1; @@ -421,9 +451,45 @@ static const struct mce_derror_table mce_p9_derror_table[] = { MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0, false, 0, 0, 0, 0 } }; +static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr, + uint64_t *phys_addr) +{ + /* + * Carefully look at the NIP to determine + * the instruction to analyse. 
Reading the NIP + * in real-mode is tricky and can lead to recursive + * faults + */ + int instr; + unsigned long pfn, instr_addr; + struct instruction_op op; + struct pt_regs tmp = *regs; + + pfn = addr_to_pfn(regs, regs->nip); + if (pfn != ULONG_MAX) { + instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK); + instr = *(unsigned int *)(instr_addr); + if (!analyse_instr(&op, &tmp, instr)) { + pfn = addr_to_pfn(regs, op.ea); + *addr = op.ea; + *phys_addr = (pfn << PAGE_SHIFT); + return 0; + } + /* + * analyse_instr() might fail if the instruction + * is not a load/store, although this is unexpected + * for load/store errors or if we got the NIP + * wrong + */ + } + *addr = 0; + return -1; +} + static int mce_handle_ierror(struct pt_regs *regs, const struct mce_ierror_table table[], - struct mce_error_info *mce_err, uint64_t *addr) + struct mce_error_info *mce_err, uint64_t *addr, + uint64_t *phys_addr) { uint64_t srr1 = regs->msr; int handled = 0; @@ -475,8 +541,22 @@ static int mce_handle_ierror(struct pt_regs *regs, } mce_err->severity = table[i].severity; mce_err->initiator = table[i].initiator; - if (table[i].nip_valid) + if (table[i].nip_valid) { *addr = regs->nip; + if (mce_err->severity == MCE_SEV_ERROR_SYNC && + table[i].error_type == MCE_ERROR_TYPE_UE) { + unsigned long pfn; + + if (get_paca()->in_mce < MAX_MCE_DEPTH) { + pfn = addr_to_pfn(regs, regs->nip); + if (pfn != ULONG_MAX) { + *phys_addr = + (pfn << PAGE_SHIFT); + handled = 1; + } + } + } + } return handled; } @@ -489,7 +569,8 @@ static int mce_handle_ierror(struct pt_regs *regs, static int mce_handle_derror(struct pt_regs *regs, const struct mce_derror_table table[], - struct mce_error_info *mce_err, uint64_t *addr) + struct mce_error_info *mce_err, uint64_t *addr, + uint64_t *phys_addr) { uint64_t dsisr = regs->dsisr; int handled = 0; @@ -555,7 +636,17 @@ static int mce_handle_derror(struct pt_regs *regs, mce_err->initiator = table[i].initiator; if (table[i].dar_valid) *addr = regs->dar; - 
+ else if (mce_err->severity == MCE_SEV_ERROR_SYNC && + table[i].error_type == MCE_ERROR_TYPE_UE) { + /* + * We do a maximum of 4 nested MCE calls, see + * kernel/exception-64s.h + */ + if (get_paca()->in_mce < MAX_MCE_DEPTH) + if (!mce_find_instr_ea_and_pfn(regs, addr, + phys_addr)) + handled = 1; + } found = 1; } @@ -592,19 +683,21 @@ static long mce_handle_error(struct pt_regs *regs, const struct mce_ierror_table itable[]) { struct mce_error_info mce_err = { 0 }; - uint64_t addr; + uint64_t addr, phys_addr; uint64_t srr1 = regs->msr; long handled; if (SRR1_MC_LOADSTORE(srr1)) - handled = mce_handle_derror(regs, dtable, &mce_err, &addr); + handled = mce_handle_derror(regs, dtable, &mce_err, &addr, + &phys_addr); else - handled = mce_handle_ierror(regs, itable, &mce_err, &addr); + handled = mce_handle_ierror(regs, itable, &mce_err, &addr, + &phys_addr); if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE) handled = mce_handle_ue_error(regs); - save_mce_event(regs, handled, &mce_err, regs->nip, addr); + save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr); return handled; } diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 0b0f89685b67..759104b99f9f 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -429,7 +429,8 @@ static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs, /* Find this stub, or if that fails, the next avail. 
entry */ stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr; for (i = 0; stub_func_addr(stubs[i].funcdata); i++) { - BUG_ON(i >= num_stubs); + if (WARN_ON(i >= num_stubs)) + return 0; if (stub_func_addr(stubs[i].funcdata) == func_addr(addr)) return (unsigned long)&stubs[i]; diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index 91e037ab20a1..8237884ca389 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -115,32 +115,23 @@ static unsigned long can_optimize(struct kprobe *p) static void optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) { - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - unsigned long flags; - /* This is possible if op is under delayed unoptimizing */ if (kprobe_disabled(&op->kp)) return; - local_irq_save(flags); - hard_irq_disable(); + preempt_disable(); if (kprobe_running()) { kprobes_inc_nmissed_count(&op->kp); } else { __this_cpu_write(current_kprobe, &op->kp); regs->nip = (unsigned long)op->kp.addr; - kcb->kprobe_status = KPROBE_HIT_ACTIVE; + get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; opt_pre_handler(&op->kp, regs); __this_cpu_write(current_kprobe, NULL); } - /* - * No need for an explicit __hard_irq_enable() here. - * local_irq_restore() will re-enable interrupts, - * if they were hard disabled. - */ - local_irq_restore(flags); + preempt_enable_no_resched(); } NOKPROBE_SYMBOL(optimized_callback); diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 2ff2b8a19f71..d6597038931d 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -90,7 +90,7 @@ static inline void free_lppacas(void) { } #endif /* CONFIG_PPC_BOOK3S */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* * 3 persistent SLBs are registered here. 
The buffer will be zero @@ -135,11 +135,11 @@ static struct slb_shadow * __init init_slb_shadow(int cpu) return s; } -#else /* CONFIG_PPC_STD_MMU_64 */ +#else /* !CONFIG_PPC_BOOK3S_64 */ static void __init allocate_slb_shadows(int nr_cpus, int limit) { } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ /* The Paca is an array with one entry per processor. Each contains an * lppaca, which contains the information shared between the @@ -170,9 +170,9 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu) new_paca->kexec_state = KEXEC_STATE_NONE; new_paca->__current = &init_task; new_paca->data_offset = 0xfeeeeeeeeeeeeeeeULL; -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 new_paca->slb_shadow_ptr = init_slb_shadow(cpu); -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif #ifdef CONFIG_PPC_BOOK3E /* For now -- if we have threads this will be adjusted later */ @@ -262,8 +262,8 @@ void copy_mm_to_paca(struct mm_struct *mm) get_paca()->mm_ctx_id = context->id; #ifdef CONFIG_PPC_MM_SLICES - VM_BUG_ON(!mm->context.addr_limit); - get_paca()->addr_limit = mm->context.addr_limit; + VM_BUG_ON(!mm->context.slb_addr_limit); + get_paca()->mm_ctx_slb_addr_limit = mm->context.slb_addr_limit; get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize; memcpy(&get_paca()->mm_ctx_high_slices_psize, &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm)); @@ -271,7 +271,7 @@ void copy_mm_to_paca(struct mm_struct *mm) get_paca()->mm_ctx_user_psize = context->user_psize; get_paca()->mm_ctx_sllp = context->sllp; #endif -#else /* CONFIG_PPC_BOOK3S */ +#else /* !CONFIG_PPC_BOOK3S */ return; #endif } diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 02831a396419..0ac7aa346c69 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1740,15 +1740,3 @@ static void fixup_hide_host_resource_fsl(struct pci_dev *dev) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MOTOROLA, PCI_ANY_ID, 
fixup_hide_host_resource_fsl); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, PCI_ANY_ID, fixup_hide_host_resource_fsl); - -static void fixup_vga(struct pci_dev *pdev) -{ - u16 cmd; - - pci_read_config_word(pdev, PCI_COMMAND, &cmd); - if ((cmd & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) || !vga_default_device()) - vga_set_default_device(pdev); - -} -DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID, - PCI_CLASS_DISPLAY_VGA, 8, fixup_vga); diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 932b9741aa8f..15ce0306b092 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -90,14 +90,14 @@ int pcibios_unmap_io_space(struct pci_bus *bus) * to do an appropriate TLB flush here too */ if (bus->self) { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 struct resource *res = bus->resource[0]; #endif pr_debug("IO unmapping for PCI-PCI bridge %s\n", pci_name(bus->self)); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 __flush_hash_table_range(&init_mm, res->start + _IO_BASE, res->end + _IO_BASE + 1); #endif diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index a0c74bbf3454..bfdd783e3916 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -77,6 +77,13 @@ extern unsigned long _get_SP(void); #ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* + * Are we running in "Suspend disabled" mode? If so we have to block any + * sigreturn that would get us into suspended state, and we also warn in some + * other paths that we should never reach with suspend disabled. 
+ */ +bool tm_suspend_disabled __ro_after_init = false; + static void check_if_tm_restore_required(struct task_struct *tsk) { /* @@ -97,9 +104,23 @@ static inline bool msr_tm_active(unsigned long msr) { return MSR_TM_ACTIVE(msr); } + +static bool tm_active_with_fp(struct task_struct *tsk) +{ + return msr_tm_active(tsk->thread.regs->msr) && + (tsk->thread.ckpt_regs.msr & MSR_FP); +} + +static bool tm_active_with_altivec(struct task_struct *tsk) +{ + return msr_tm_active(tsk->thread.regs->msr) && + (tsk->thread.ckpt_regs.msr & MSR_VEC); +} #else static inline bool msr_tm_active(unsigned long msr) { return false; } static inline void check_if_tm_restore_required(struct task_struct *tsk) { } +static inline bool tm_active_with_fp(struct task_struct *tsk) { return false; } +static inline bool tm_active_with_altivec(struct task_struct *tsk) { return false; } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ bool strict_msr_control; @@ -232,7 +253,7 @@ EXPORT_SYMBOL(enable_kernel_fp); static int restore_fp(struct task_struct *tsk) { - if (tsk->thread.load_fp || msr_tm_active(tsk->thread.regs->msr)) { + if (tsk->thread.load_fp || tm_active_with_fp(tsk)) { load_fp_state(¤t->thread.fp_state); current->thread.load_fp++; return 1; @@ -314,7 +335,7 @@ EXPORT_SYMBOL_GPL(flush_altivec_to_thread); static int restore_altivec(struct task_struct *tsk) { if (cpu_has_feature(CPU_FTR_ALTIVEC) && - (tsk->thread.load_vec || msr_tm_active(tsk->thread.regs->msr))) { + (tsk->thread.load_vec || tm_active_with_altivec(tsk))) { load_vr_state(&tsk->thread.vr_state); tsk->thread.used_vr = 1; tsk->thread.load_vec++; @@ -853,6 +874,10 @@ static void tm_reclaim_thread(struct thread_struct *thr, if (!MSR_TM_SUSPENDED(mfmsr())) return; + giveup_all(container_of(thr, struct task_struct, thread)); + + tm_reclaim(thr, cause); + /* * If we are in a transaction and FP is off then we can't have * used FP inside that transaction. 
Hence the checkpointed @@ -871,10 +896,6 @@ static void tm_reclaim_thread(struct thread_struct *thr, if ((thr->ckpt_regs.msr & MSR_VEC) == 0) memcpy(&thr->ckvr_state, &thr->vr_state, sizeof(struct thread_vr_state)); - - giveup_all(container_of(thr, struct task_struct, thread)); - - tm_reclaim(thr, thr->ckpt_regs.msr, cause); } void tm_reclaim_current(uint8_t cause) @@ -903,6 +924,8 @@ static inline void tm_reclaim_task(struct task_struct *tsk) if (!MSR_TM_ACTIVE(thr->regs->msr)) goto out_and_saveregs; + WARN_ON(tm_suspend_disabled); + TM_DEBUG("--- tm_reclaim on pid %d (NIP=%lx, " "ccr=%lx, msr=%lx, trap=%lx)\n", tsk->pid, thr->regs->nip, @@ -923,11 +946,9 @@ out_and_saveregs: tm_save_sprs(thr); } -extern void __tm_recheckpoint(struct thread_struct *thread, - unsigned long orig_msr); +extern void __tm_recheckpoint(struct thread_struct *thread); -void tm_recheckpoint(struct thread_struct *thread, - unsigned long orig_msr) +void tm_recheckpoint(struct thread_struct *thread) { unsigned long flags; @@ -946,15 +967,13 @@ void tm_recheckpoint(struct thread_struct *thread, */ tm_restore_sprs(thread); - __tm_recheckpoint(thread, orig_msr); + __tm_recheckpoint(thread); local_irq_restore(flags); } static inline void tm_recheckpoint_new_task(struct task_struct *new) { - unsigned long msr; - if (!cpu_has_feature(CPU_FTR_TM)) return; @@ -973,13 +992,11 @@ static inline void tm_recheckpoint_new_task(struct task_struct *new) tm_restore_sprs(&new->thread); return; } - msr = new->thread.ckpt_regs.msr; /* Recheckpoint to restore original checkpointed register state. 
*/ - TM_DEBUG("*** tm_recheckpoint of pid %d " - "(new->msr 0x%lx, new->origmsr 0x%lx)\n", - new->pid, new->thread.regs->msr, msr); + TM_DEBUG("*** tm_recheckpoint of pid %d (new->msr 0x%lx)\n", + new->pid, new->thread.regs->msr); - tm_recheckpoint(&new->thread, msr); + tm_recheckpoint(&new->thread); /* * The checkpointed state has been restored but the live state has @@ -1119,6 +1136,10 @@ static inline void restore_sprs(struct thread_struct *old_thread, if (old_thread->tar != new_thread->tar) mtspr(SPRN_TAR, new_thread->tar); } + + if (cpu_has_feature(CPU_FTR_ARCH_300) && + old_thread->tidr != new_thread->tidr) + mtspr(SPRN_TIDR, new_thread->tidr); #endif } @@ -1155,7 +1176,7 @@ struct task_struct *__switch_to(struct task_struct *prev, } #endif /* CONFIG_PPC64 */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 batch = this_cpu_ptr(&ppc64_tlb_batch); if (batch->active) { current_thread_info()->local_flags |= _TLF_LAZY_MMU; @@ -1163,7 +1184,7 @@ struct task_struct *__switch_to(struct task_struct *prev, __flush_tlb_pending(batch); batch->active = 0; } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_ADV_DEBUG_REGS switch_booke_debug_regs(&new->thread.debug); @@ -1209,7 +1230,7 @@ struct task_struct *__switch_to(struct task_struct *prev, last = _switch(old_thread, new_thread); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 if (current_thread_info()->local_flags & _TLF_LAZY_MMU) { current_thread_info()->local_flags &= ~_TLF_LAZY_MMU; batch = this_cpu_ptr(&ppc64_tlb_batch); @@ -1223,22 +1244,22 @@ struct task_struct *__switch_to(struct task_struct *prev, * The copy-paste buffer can only store into foreign real * addresses, so unprivileged processes can not see the * data or use it in any way unless they have foreign real - * mappings. We don't have a VAS driver that allocates those - * yet, so no cpabort is required. + * mappings. 
If the new process has the foreign real address + * mappings, we must issue a cp_abort to clear any state and + * prevent snooping, corruption or a covert channel. + * + * DD1 allows paste into normal system memory so we do an + * unpaired copy, rather than cp_abort, to clear the buffer, + * since cp_abort is quite expensive. */ - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - /* - * DD1 allows paste into normal system memory, so we - * do an unpaired copy here to clear the buffer and - * prevent a covert channel being set up. - * - * cpabort is not used because it is quite expensive. - */ + if (current_thread_info()->task->thread.used_vas) { + asm volatile(PPC_CP_ABORT); + } else if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { asm volatile(PPC_COPY(%0, %1) : : "r"(dummy_copy_buffer), "r"(0)); } } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ return last; } @@ -1434,6 +1455,137 @@ void flush_thread(void) #endif /* CONFIG_HAVE_HW_BREAKPOINT */ } +int set_thread_uses_vas(void) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -EINVAL; + + current->thread.used_vas = 1; + + /* + * Even a process that has no foreign real address mapping can use + * an unpaired COPY instruction (to no real effect). Issue CP_ABORT + * to clear any pending COPY and prevent a covert channel. + * + * __switch_to() will issue CP_ABORT on future context switches. + */ + asm volatile(PPC_CP_ABORT); + +#endif /* CONFIG_PPC_BOOK3S_64 */ + return 0; +} + +#ifdef CONFIG_PPC64 +static DEFINE_SPINLOCK(vas_thread_id_lock); +static DEFINE_IDA(vas_thread_ida); + +/* + * We need to assign a unique thread id to each thread in a process. + * + * This thread id, referred to as TIDR, and separate from the Linux's tgid, + * is intended to be used to direct an ASB_Notify from the hardware to the + * thread, when a suitable event occurs in the system. 
+ * + * One such event is a "paste" instruction in the context of Fast Thread + * Wakeup (aka Core-to-core wake up in the Virtual Accelerator Switchboard + * (VAS) in POWER9. + * + * To get a unique TIDR per process we could simply reuse task_pid_nr() but + * the problem is that task_pid_nr() is not yet available copy_thread() is + * called. Fixing that would require changing more intrusive arch-neutral + * code in code path in copy_process()?. + * + * Further, to assign unique TIDRs within each process, we need an atomic + * field (or an IDR) in task_struct, which again intrudes into the arch- + * neutral code. So try to assign globally unique TIDRs for now. + * + * NOTE: TIDR 0 indicates that the thread does not need a TIDR value. + * For now, only threads that expect to be notified by the VAS + * hardware need a TIDR value and we assign values > 0 for those. + */ +#define MAX_THREAD_CONTEXT ((1 << 16) - 1) +static int assign_thread_tidr(void) +{ + int index; + int err; + +again: + if (!ida_pre_get(&vas_thread_ida, GFP_KERNEL)) + return -ENOMEM; + + spin_lock(&vas_thread_id_lock); + err = ida_get_new_above(&vas_thread_ida, 1, &index); + spin_unlock(&vas_thread_id_lock); + + if (err == -EAGAIN) + goto again; + else if (err) + return err; + + if (index > MAX_THREAD_CONTEXT) { + spin_lock(&vas_thread_id_lock); + ida_remove(&vas_thread_ida, index); + spin_unlock(&vas_thread_id_lock); + return -ENOMEM; + } + + return index; +} + +static void free_thread_tidr(int id) +{ + spin_lock(&vas_thread_id_lock); + ida_remove(&vas_thread_ida, id); + spin_unlock(&vas_thread_id_lock); +} + +/* + * Clear any TIDR value assigned to this thread. 
+ */ +void clear_thread_tidr(struct task_struct *t) +{ + if (!t->thread.tidr) + return; + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + WARN_ON_ONCE(1); + return; + } + + mtspr(SPRN_TIDR, 0); + free_thread_tidr(t->thread.tidr); + t->thread.tidr = 0; +} + +void arch_release_task_struct(struct task_struct *t) +{ + clear_thread_tidr(t); +} + +/* + * Assign a unique TIDR (thread id) for task @t and set it in the thread + * structure. For now, we only support setting TIDR for 'current' task. + */ +int set_thread_tidr(struct task_struct *t) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -EINVAL; + + if (t != current) + return -EINVAL; + + t->thread.tidr = assign_thread_tidr(); + if (t->thread.tidr < 0) + return t->thread.tidr; + + mtspr(SPRN_TIDR, t->thread.tidr); + + return 0; +} + +#endif /* CONFIG_PPC64 */ + void release_thread(struct task_struct *t) { @@ -1467,7 +1619,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) static void setup_ksp_vsid(struct task_struct *p, unsigned long sp) { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 unsigned long sp_vsid; unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp; @@ -1580,6 +1732,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, } if (cpu_has_feature(CPU_FTR_HAS_PPR)) p->thread.ppr = INIT_PPR; + + p->thread.tidr = 0; #endif kregs->nip = ppc_function_entry(f); return 0; @@ -1898,7 +2052,8 @@ unsigned long get_wchan(struct task_struct *p) do { sp = *(unsigned long *)sp; - if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD)) + if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD) || + p->state == TASK_RUNNING) return 0; if (count > 0) { ip = ((unsigned long *)sp)[STACK_FRAME_LR_SAVE]; @@ -2046,7 +2201,7 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) unsigned long base = mm->brk; unsigned long ret; -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* * If we are using 1TB segments and we are allowed to randomise * the heap, we can put it above 1TB so 
it is backed by a 1TB diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index f83056297441..b15bae265c90 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -47,6 +47,7 @@ #include <asm/mmu.h> #include <asm/paca.h> #include <asm/pgtable.h> +#include <asm/powernv.h> #include <asm/iommu.h> #include <asm/btext.h> #include <asm/sections.h> @@ -228,7 +229,7 @@ static void __init check_cpu_pa_features(unsigned long node) ibm_pa_features, ARRAY_SIZE(ibm_pa_features)); } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 static void __init init_mmu_slb_size(unsigned long node) { const __be32 *slb_size_ptr; @@ -658,6 +659,38 @@ static void __init early_reserve_mem(void) #endif } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static bool tm_disabled __initdata; + +static int __init parse_ppc_tm(char *str) +{ + bool res; + + if (kstrtobool(str, &res)) + return -EINVAL; + + tm_disabled = !res; + + return 0; +} +early_param("ppc_tm", parse_ppc_tm); + +static void __init tm_init(void) +{ + if (tm_disabled) { + pr_info("Disabling hardware transactional memory (HTM)\n"); + cur_cpu_spec->cpu_user_features2 &= + ~(PPC_FEATURE2_HTM_NOSC | PPC_FEATURE2_HTM); + cur_cpu_spec->cpu_features &= ~CPU_FTR_TM; + return; + } + + pnv_tm_init(); +} +#else +static void tm_init(void) { } +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + void __init early_init_devtree(void *params) { phys_addr_t limit; @@ -767,6 +800,8 @@ void __init early_init_devtree(void *params) powerpc_firmware_features |= FW_FEATURE_PS3_POSSIBLE; #endif + tm_init(); + DBG(" <- early_init_devtree()\n"); } diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 2e3bc16d02b2..2075322cd225 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -773,7 +773,7 @@ void arch_setup_pdev_archdata(struct platform_device *pdev) static __init void print_system_info(void) { 
pr_info("-----------------------------------------------------\n"); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size); #endif #ifdef CONFIG_PPC_STD_MMU_32 @@ -800,7 +800,7 @@ static __init void print_system_info(void) pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features); #endif -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 if (htab_address) pr_info("htab_address = 0x%p\n", htab_address); if (htab_hash_mask) @@ -898,7 +898,8 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_PPC_MM_SLICES #ifdef CONFIG_PPC64 - init_mm.context.addr_limit = DEFAULT_MAP_WINDOW_USER64; + if (!radix_enabled()) + init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; #else #error "context.addr_limit not initialized." #endif diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h index cfba134b3024..21c18071d9d5 100644 --- a/arch/powerpc/kernel/setup.h +++ b/arch/powerpc/kernel/setup.h @@ -45,6 +45,12 @@ void emergency_stack_init(void); static inline void emergency_stack_init(void) { }; #endif +#ifdef CONFIG_PPC64 +void record_spr_defaults(void); +#else +static inline void record_spr_defaults(void) { }; +#endif + /* * Having this in kvm_ppc.h makes include dependencies too * tricky to solve for setup-common.c so have it here. diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index b89c6aac48c9..8956a9856604 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -69,6 +69,8 @@ #include <asm/opal.h> #include <asm/cputhreads.h> +#include "setup.h" + #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) #else @@ -317,6 +319,13 @@ void __init early_setup(unsigned long dt_ptr) early_init_mmu(); /* + * After firmware and early platform setup code has set things up, + * we note the SPR values for configurable control/performance + * registers, and use those as initial defaults. 
+ */ + record_spr_defaults(); + + /* * At this point, we can let interrupts switch to virtual mode * (the MMU has been setup), so adjust the MSR in the PACA to * have IR and DR set and enable AIL if it exists @@ -360,8 +369,16 @@ void early_setup_secondary(void) #if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE) static bool use_spinloop(void) { - if (!IS_ENABLED(CONFIG_PPC_BOOK3E)) + if (IS_ENABLED(CONFIG_PPC_BOOK3S)) { + /* + * See comments in head_64.S -- not all platforms insert + * secondaries at __secondary_hold and wait at the spin + * loop. + */ + if (firmware_has_feature(FW_FEATURE_OPAL)) + return false; return true; + } /* * When book3e boots from kexec, the ePAPR spin table does diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index e9436c5e1e09..3d7539b90010 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -103,7 +103,7 @@ static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka, static void do_signal(struct task_struct *tsk) { sigset_t *oldset = sigmask_to_save(); - struct ksignal ksig; + struct ksignal ksig = { .sig = 0 }; int ret; int is32 = is_32bit_task(); diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 92fb1c8dbbd8..9ffd73296f64 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -94,40 +94,13 @@ */ static inline int put_sigset_t(compat_sigset_t __user *uset, sigset_t *set) { - compat_sigset_t cset; - - switch (_NSIG_WORDS) { - case 4: cset.sig[6] = set->sig[3] & 0xffffffffull; - cset.sig[7] = set->sig[3] >> 32; - case 3: cset.sig[4] = set->sig[2] & 0xffffffffull; - cset.sig[5] = set->sig[2] >> 32; - case 2: cset.sig[2] = set->sig[1] & 0xffffffffull; - cset.sig[3] = set->sig[1] >> 32; - case 1: cset.sig[0] = set->sig[0] & 0xffffffffull; - cset.sig[1] = set->sig[0] >> 32; - } - return copy_to_user(uset, &cset, sizeof(*uset)); + return put_compat_sigset(uset, set, sizeof(*uset)); } static inline int 
get_sigset_t(sigset_t *set, const compat_sigset_t __user *uset) { - compat_sigset_t s32; - - if (copy_from_user(&s32, uset, sizeof(*uset))) - return -EFAULT; - - /* - * Swap the 2 words of the 64-bit sigset_t (they are stored - * in the "wrong" endian in 32-bit user storage). - */ - switch (_NSIG_WORDS) { - case 4: set->sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32); - case 3: set->sig[2] = s32.sig[4] | (((long)s32.sig[5]) << 32); - case 2: set->sig[1] = s32.sig[2] | (((long)s32.sig[3]) << 32); - case 1: set->sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32); - } - return 0; + return get_compat_sigset(set, uset); } #define to_user_ptr(p) ptr_to_compat(p) @@ -519,6 +492,8 @@ static int save_tm_user_regs(struct pt_regs *regs, { unsigned long msr = regs->msr; + WARN_ON(tm_suspend_disabled); + /* Remove TM bits from thread's MSR. The MSR in the sigcontext * just indicates to userland that we were doing a transaction, but we * don't want to return in transactional state. This also ensures @@ -769,6 +744,8 @@ static long restore_tm_user_regs(struct pt_regs *regs, int i; #endif + if (tm_suspend_disabled) + return 1; /* * restore general registers but not including MSR or SOFTE. Also * take care of keeping r2 (TLS) intact if not a signal. 
@@ -876,7 +853,7 @@ static long restore_tm_user_regs(struct pt_regs *regs, /* Make sure the transaction is marked as failed */ current->thread.tm_texasr |= TEXASR_FS; /* This loads the checkpointed FP/VEC state, if used */ - tm_recheckpoint(&current->thread, msr); + tm_recheckpoint(&current->thread); /* This loads the speculative FP/VEC state, if used */ msr_check_and_set(msr & (MSR_FP | MSR_VEC)); diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index b2c002993d78..4b9ca3570344 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -214,6 +214,8 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc, BUG_ON(!MSR_TM_ACTIVE(regs->msr)); + WARN_ON(tm_suspend_disabled); + /* Remove TM bits from thread's MSR. The MSR in the sigcontext * just indicates to userland that we were doing a transaction, but we * don't want to return in transactional state. This also ensures @@ -430,6 +432,9 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, BUG_ON(tsk != current); + if (tm_suspend_disabled) + return -EINVAL; + /* copy the GPRs */ err |= __copy_from_user(regs->gpr, tm_sc->gp_regs, sizeof(regs->gpr)); err |= __copy_from_user(&tsk->thread.ckpt_regs, sc->gp_regs, @@ -558,7 +563,7 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, /* Make sure the transaction is marked as failed */ tsk->thread.tm_texasr |= TEXASR_FS; /* This loads the checkpointed FP/VEC state, if used */ - tm_recheckpoint(&tsk->thread, msr); + tm_recheckpoint(&tsk->thread); msr_check_and_set(msr & (MSR_FP | MSR_VEC)); if (msr & MSR_FP) { diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 4437c70c7c2b..b8d4a1dac39f 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -590,6 +590,17 @@ static void sysfs_create_dscr_default(void) if (cpu_has_feature(CPU_FTR_DSCR)) err = device_create_file(cpu_subsys.dev_root, &dev_attr_dscr_default); } + +void __init record_spr_defaults(void) +{ + int 
cpu; + + if (cpu_has_feature(CPU_FTR_DSCR)) { + dscr_default = mfspr(SPRN_DSCR); + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + paca[cpu].dscr_default = dscr_default; + } +} #endif /* CONFIG_PPC64 */ #ifdef HAS_PPC_PMC_PA6T diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index a3374e8a258c..e3c5f75d137c 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -230,8 +230,7 @@ int __init TAU_init(void) /* first, set up the window shrinking timer */ - init_timer(&tau_timer); - tau_timer.function = tau_timeout_smp; + setup_timer(&tau_timer, tau_timeout_smp, 0UL); tau_timer.expires = jiffies + shrink_timer; add_timer(&tau_timer); diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 1da12f521cb7..b92ac8e711db 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -80,15 +80,12 @@ _GLOBAL(tm_abort) blr /* void tm_reclaim(struct thread_struct *thread, - * unsigned long orig_msr, * uint8_t cause) * * - Performs a full reclaim. This destroys outstanding * transactions and updates thread->regs.tm_ckpt_* with the * original checkpointed state. Note that thread->regs is * unchanged. - * - FP regs are written back to thread->transact_fpr before - * reclaiming. These are the transactional (current) versions. * * Purpose is to both abort transactions of, and preserve the state of, * a transactions at a context switch. We preserve/restore both sets of process @@ -99,9 +96,9 @@ _GLOBAL(tm_abort) * Call with IRQs off, stacks get all out of sync for some periods in here! */ _GLOBAL(tm_reclaim) - mfcr r6 + mfcr r5 mflr r0 - stw r6, 8(r1) + stw r5, 8(r1) std r0, 16(r1) std r2, STK_GOT(r1) stdu r1, -TM_FRAME_SIZE(r1) @@ -109,7 +106,6 @@ _GLOBAL(tm_reclaim) /* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. */ std r3, STK_PARAM(R3)(r1) - std r4, STK_PARAM(R4)(r1) SAVE_NVGPRS(r1) /* We need to setup MSR for VSX register save instructions. 
*/ @@ -139,8 +135,8 @@ _GLOBAL(tm_reclaim) std r1, PACAR1(r13) /* Clear MSR RI since we are about to change r1, EE is already off. */ - li r4, 0 - mtmsrd r4, 1 + li r5, 0 + mtmsrd r5, 1 /* * BE CAREFUL HERE: @@ -152,7 +148,7 @@ _GLOBAL(tm_reclaim) * to user register state. (FPRs, CCR etc. also!) * Use an sprg and a tm_scratch in the PACA to shuffle. */ - TRECLAIM(R5) /* Cause in r5 */ + TRECLAIM(R4) /* Cause in r4 */ /* ******************** GPRs ******************** */ /* Stash the checkpointed r13 away in the scratch SPR and get the real @@ -243,40 +239,30 @@ _GLOBAL(tm_reclaim) /* ******************** FPR/VR/VSRs ************ - * After reclaiming, capture the checkpointed FPRs/VRs /if used/. - * - * (If VSX used, FP and VMX are implied. Or, we don't need to look - * at MSR.VSX as copying FP regs if .FP, vector regs if .VMX covers it.) - * - * We're passed the thread's MSR as the second parameter + * After reclaiming, capture the checkpointed FPRs/VRs. * * We enabled VEC/FP/VSX in the msr above, so we can execute these * instructions! */ - ld r4, STK_PARAM(R4)(r1) /* Second parameter, MSR * */ mr r3, r12 - andis. r0, r4, MSR_VEC@h - beq dont_backup_vec + /* Altivec (VEC/VMX/VR)*/ addi r7, r3, THREAD_CKVRSTATE SAVE_32VRS(0, r6, r7) /* r6 scratch, r7 transact vr state */ mfvscr v0 li r6, VRSTATE_VSCR stvx v0, r7, r6 -dont_backup_vec: + + /* VRSAVE */ mfspr r0, SPRN_VRSAVE std r0, THREAD_CKVRSAVE(r3) - andi. r0, r4, MSR_FP - beq dont_backup_fp - + /* Floating Point (FP) */ addi r7, r3, THREAD_CKFPSTATE SAVE_32FPRS_VSRS(0, R6, R7) /* r6 scratch, r7 transact fp state */ - mffs fr0 stfd fr0,FPSTATE_FPSCR(r7) -dont_backup_fp: /* TM regs, incl TEXASR -- these live in thread_struct. Note they've * been updated by the treclaim, to explain to userland the failure @@ -344,22 +330,19 @@ _GLOBAL(__tm_recheckpoint) */ subi r7, r7, STACK_FRAME_OVERHEAD + /* We need to setup MSR for FP/VMX/VSX register save instructions. 
*/ mfmsr r6 - /* R4 = original MSR to indicate whether thread used FP/Vector etc. */ - - /* Enable FP/vec in MSR if necessary! */ - lis r5, MSR_VEC@h + mr r5, r6 ori r5, r5, MSR_FP - and. r5, r4, r5 - beq restore_gprs /* if neither, skip both */ - +#ifdef CONFIG_ALTIVEC + oris r5, r5, MSR_VEC@h +#endif #ifdef CONFIG_VSX BEGIN_FTR_SECTION - oris r5, r5, MSR_VSX@h + oris r5,r5, MSR_VSX@h END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif - or r5, r6, r5 /* Set MSR.FP+.VSX/.VEC */ - mtmsr r5 + mtmsrd r5 #ifdef CONFIG_ALTIVEC /* @@ -368,28 +351,20 @@ _GLOBAL(__tm_recheckpoint) * thread.fp_state[] version holds the 'live' (transactional) * and will be loaded subsequently by any FPUnavailable trap. */ - andis. r0, r4, MSR_VEC@h - beq dont_restore_vec - addi r8, r3, THREAD_CKVRSTATE li r5, VRSTATE_VSCR lvx v0, r8, r5 mtvscr v0 REST_32VRS(0, r5, r8) /* r5 scratch, r8 ptr */ -dont_restore_vec: ld r5, THREAD_CKVRSAVE(r3) mtspr SPRN_VRSAVE, r5 #endif - andi. r0, r4, MSR_FP - beq dont_restore_fp - addi r8, r3, THREAD_CKFPSTATE lfd fr0, FPSTATE_FPSCR(r8) MTFSF_L(fr0) REST_32FPRS_VSRS(0, R4, R8) -dont_restore_fp: mtmsr r6 /* FP/Vec off again! 
*/ restore_gprs: diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index b4e2b7165f79..3f3e81852422 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -110,9 +110,9 @@ ftrace_call: /* NIP has not been altered, skip over further checks */ beq 1f - /* Check if there is an active kprobe on us */ + /* Check if there is an active jprobe on us */ subi r3, r14, 4 - bl is_current_kprobe_addr + bl __is_active_jprobe nop /* diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 13c9dcdcba69..f3eb61be0d30 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -37,6 +37,7 @@ #include <linux/kdebug.h> #include <linux/ratelimit.h> #include <linux/context_tracking.h> +#include <linux/smp.h> #include <asm/emulated_ops.h> #include <asm/pgtable.h> @@ -699,6 +700,187 @@ void SMIException(struct pt_regs *regs) die("System Management Interrupt", regs, SIGABRT); } +#ifdef CONFIG_VSX +static void p9_hmi_special_emu(struct pt_regs *regs) +{ + unsigned int ra, rb, t, i, sel, instr, rc; + const void __user *addr; + u8 vbuf[16], *vdst; + unsigned long ea, msr, msr_mask; + bool swap; + + if (__get_user_inatomic(instr, (unsigned int __user *)regs->nip)) + return; + + /* + * lxvb16x opcode: 0x7c0006d8 + * lxvd2x opcode: 0x7c000698 + * lxvh8x opcode: 0x7c000658 + * lxvw4x opcode: 0x7c000618 + */ + if ((instr & 0xfc00073e) != 0x7c000618) { + pr_devel("HMI vec emu: not vector CI %i:%s[%d] nip=%016lx" + " instr=%08x\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr); + return; + } + + /* Grab vector registers into the task struct */ + msr = regs->msr; /* Grab msr before we flush the bits */ + flush_vsx_to_thread(current); + enable_kernel_altivec(); + + /* + * Is userspace running with a different endian (this is rare but + * not impossible) + */ + swap = (msr & MSR_LE) != (MSR_KERNEL & MSR_LE); + + /* Decode 
the instruction */ + ra = (instr >> 16) & 0x1f; + rb = (instr >> 11) & 0x1f; + t = (instr >> 21) & 0x1f; + if (instr & 1) + vdst = (u8 *)&current->thread.vr_state.vr[t]; + else + vdst = (u8 *)&current->thread.fp_state.fpr[t][0]; + + /* Grab the vector address */ + ea = regs->gpr[rb] + (ra ? regs->gpr[ra] : 0); + if (is_32bit_task()) + ea &= 0xfffffffful; + addr = (__force const void __user *)ea; + + /* Check it */ + if (!access_ok(VERIFY_READ, addr, 16)) { + pr_devel("HMI vec emu: bad access %i:%s[%d] nip=%016lx" + " instr=%08x addr=%016lx\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr, (unsigned long)addr); + return; + } + + /* Read the vector */ + rc = 0; + if ((unsigned long)addr & 0xfUL) + /* unaligned case */ + rc = __copy_from_user_inatomic(vbuf, addr, 16); + else + __get_user_atomic_128_aligned(vbuf, addr, rc); + if (rc) { + pr_devel("HMI vec emu: page fault %i:%s[%d] nip=%016lx" + " instr=%08x addr=%016lx\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr, (unsigned long)addr); + return; + } + + pr_devel("HMI vec emu: emulated vector CI %i:%s[%d] nip=%016lx" + " instr=%08x addr=%016lx\n", + smp_processor_id(), current->comm, current->pid, regs->nip, + instr, (unsigned long) addr); + + /* Grab instruction "selector" */ + sel = (instr >> 6) & 3; + + /* + * Check to make sure the facility is actually enabled. This + * could happen if we get a false positive hit. 
+ * + * lxvd2x/lxvw4x always check MSR VSX sel = 0,2 + * lxvh8x/lxvb16x check MSR VSX or VEC depending on VSR used sel = 1,3 + */ + msr_mask = MSR_VSX; + if ((sel & 1) && (instr & 1)) /* lxvh8x & lxvb16x + VSR >= 32 */ + msr_mask = MSR_VEC; + if (!(msr & msr_mask)) { + pr_devel("HMI vec emu: MSR fac clear %i:%s[%d] nip=%016lx" + " instr=%08x msr:%016lx\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr, msr); + return; + } + + /* Do logging here before we modify sel based on endian */ + switch (sel) { + case 0: /* lxvw4x */ + PPC_WARN_EMULATED(lxvw4x, regs); + break; + case 1: /* lxvh8x */ + PPC_WARN_EMULATED(lxvh8x, regs); + break; + case 2: /* lxvd2x */ + PPC_WARN_EMULATED(lxvd2x, regs); + break; + case 3: /* lxvb16x */ + PPC_WARN_EMULATED(lxvb16x, regs); + break; + } + +#ifdef __LITTLE_ENDIAN__ + /* + * An LE kernel stores the vector in the task struct as an LE + * byte array (effectively swapping both the components and + * the content of the components). Those instructions expect + * the components to remain in ascending address order, so we + * swap them back. + * + * If we are running a BE user space, the expectation is that + * of a simple memcpy, so forcing the emulation to look like + * a lxvb16x should do the trick. 
+ */ + if (swap) + sel = 3; + + switch (sel) { + case 0: /* lxvw4x */ + for (i = 0; i < 4; i++) + ((u32 *)vdst)[i] = ((u32 *)vbuf)[3-i]; + break; + case 1: /* lxvh8x */ + for (i = 0; i < 8; i++) + ((u16 *)vdst)[i] = ((u16 *)vbuf)[7-i]; + break; + case 2: /* lxvd2x */ + for (i = 0; i < 2; i++) + ((u64 *)vdst)[i] = ((u64 *)vbuf)[1-i]; + break; + case 3: /* lxvb16x */ + for (i = 0; i < 16; i++) + vdst[i] = vbuf[15-i]; + break; + } +#else /* __LITTLE_ENDIAN__ */ + /* On a big endian kernel, a BE userspace only needs a memcpy */ + if (!swap) + sel = 3; + + /* Otherwise, we need to swap the content of the components */ + switch (sel) { + case 0: /* lxvw4x */ + for (i = 0; i < 4; i++) + ((u32 *)vdst)[i] = cpu_to_le32(((u32 *)vbuf)[i]); + break; + case 1: /* lxvh8x */ + for (i = 0; i < 8; i++) + ((u16 *)vdst)[i] = cpu_to_le16(((u16 *)vbuf)[i]); + break; + case 2: /* lxvd2x */ + for (i = 0; i < 2; i++) + ((u64 *)vdst)[i] = cpu_to_le64(((u64 *)vbuf)[i]); + break; + case 3: /* lxvb16x */ + memcpy(vdst, vbuf, 16); + break; + } +#endif /* !__LITTLE_ENDIAN__ */ + + /* Go to next instruction */ + regs->nip += 4; +} +#endif /* CONFIG_VSX */ + void handle_hmi_exception(struct pt_regs *regs) { struct pt_regs *old_regs; @@ -706,6 +888,21 @@ void handle_hmi_exception(struct pt_regs *regs) old_regs = set_irq_regs(regs); irq_enter(); +#ifdef CONFIG_VSX + /* Real mode flagged P9 special emu is needed */ + if (local_paca->hmi_p9_special_emu) { + local_paca->hmi_p9_special_emu = 0; + + /* + * We don't want to take page faults while doing the + * emulation, we just replay the instruction if necessary. + */ + pagefault_disable(); + p9_hmi_special_emu(regs); + pagefault_enable(); + } +#endif /* CONFIG_VSX */ + if (ppc_md.handle_hmi_exception) ppc_md.handle_hmi_exception(regs); @@ -1140,13 +1337,8 @@ void program_check_exception(struct pt_regs *regs) * - A treclaim is attempted when non transactional. * - A tend is illegally attempted. * - writing a TM SPR when transactional. 
- */ - if (!user_mode(regs) && - report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) { - regs->nip += 4; - goto bail; - } - /* If usermode caused this, it's done something illegal and + * + * If usermode caused this, it's done something illegal and * gets a SIGILL slap on the wrist. We call it an illegal * operand to distinguish from the instruction just being bad * (e.g. executing a 'tend' on a CPU without TM!); it's an @@ -1487,7 +1679,7 @@ void fp_unavailable_tm(struct pt_regs *regs) /* Reclaim didn't save out any FPRs to transact_fprs. */ /* Enable FP for the task: */ - regs->msr |= (MSR_FP | current->thread.fpexc_mode); + current->thread.load_fp = 1; /* This loads and recheckpoints the FP registers from * thread.fpr[]. They will remain in registers after the @@ -1495,15 +1687,7 @@ void fp_unavailable_tm(struct pt_regs *regs) * If VMX is in use, the VRs now hold checkpointed values, * so we don't want to load the VRs from the thread_struct. */ - tm_recheckpoint(&current->thread, MSR_FP); - - /* If VMX is in use, get the transactional values back */ - if (regs->msr & MSR_VEC) { - msr_check_and_set(MSR_VEC); - load_vr_state(&current->thread.vr_state); - /* At this point all the VSX state is loaded, so enable it */ - regs->msr |= MSR_VSX; - } + tm_recheckpoint(&current->thread); } void altivec_unavailable_tm(struct pt_regs *regs) @@ -1516,21 +1700,13 @@ void altivec_unavailable_tm(struct pt_regs *regs) "MSR=%lx\n", regs->nip, regs->msr); tm_reclaim_current(TM_CAUSE_FAC_UNAV); - regs->msr |= MSR_VEC; - tm_recheckpoint(&current->thread, MSR_VEC); + current->thread.load_vec = 1; + tm_recheckpoint(&current->thread); current->thread.used_vr = 1; - - if (regs->msr & MSR_FP) { - msr_check_and_set(MSR_FP); - load_fp_state(&current->thread.fp_state); - regs->msr |= MSR_VSX; - } } void vsx_unavailable_tm(struct pt_regs *regs) { - unsigned long orig_msr = regs->msr; - /* See the comments in fp_unavailable_tm(). This works similarly, * though we're loading both FP and VEC registers in here. 
* @@ -1544,29 +1720,13 @@ void vsx_unavailable_tm(struct pt_regs *regs) current->thread.used_vsr = 1; - /* If FP and VMX are already loaded, we have all the state we need */ - if ((orig_msr & (MSR_FP | MSR_VEC)) == (MSR_FP | MSR_VEC)) { - regs->msr |= MSR_VSX; - return; - } - /* This reclaims FP and/or VR regs if they're already enabled */ tm_reclaim_current(TM_CAUSE_FAC_UNAV); - regs->msr |= MSR_VEC | MSR_FP | current->thread.fpexc_mode | - MSR_VSX; - - /* This loads & recheckpoints FP and VRs; but we have - * to be sure not to overwrite previously-valid state. - */ - tm_recheckpoint(&current->thread, regs->msr & ~orig_msr); - - msr_check_and_set(orig_msr & (MSR_FP | MSR_VEC)); + current->thread.load_vec = 1; + current->thread.load_fp = 1; - if (orig_msr & MSR_FP) - load_fp_state(&current->thread.fp_state); - if (orig_msr & MSR_VEC) - load_vr_state(&current->thread.vr_state); + tm_recheckpoint(&current->thread); } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ @@ -1924,6 +2084,10 @@ struct ppc_emulated ppc_emulated = { WARN_EMULATED_SETUP(mfdscr), WARN_EMULATED_SETUP(mtdscr), WARN_EMULATED_SETUP(lq_stq), + WARN_EMULATED_SETUP(lxvw4x), + WARN_EMULATED_SETUP(lxvh8x), + WARN_EMULATED_SETUP(lxvd2x), + WARN_EMULATED_SETUP(lxvb16x), #endif }; diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 1d89163d67f2..87da80ccced1 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -98,8 +98,7 @@ static void wd_lockup_ipi(struct pt_regs *regs) else dump_stack(); - if (hardlockup_panic) - nmi_panic(regs, "Hard LOCKUP"); + /* Do not panic from here because that can recurse into NMI IPI layer */ } static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb) @@ -135,15 +134,18 @@ static void watchdog_smp_panic(int cpu, u64 tb) pr_emerg("Watchdog CPU:%d detected Hard LOCKUP other CPUS:%*pbl\n", cpu, cpumask_pr_args(&wd_smp_cpus_pending)); - /* - * Try to trigger the stuck CPUs. 
- */ - for_each_cpu(c, &wd_smp_cpus_pending) { - if (c == cpu) - continue; - smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); + if (!sysctl_hardlockup_all_cpu_backtrace) { + /* + * Try to trigger the stuck CPUs, unless we are going to + * get a backtrace on all of them anyway. + */ + for_each_cpu(c, &wd_smp_cpus_pending) { + if (c == cpu) + continue; + smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); + } + smp_flush_nmi_ipi(1000000); } - smp_flush_nmi_ipi(1000000); /* Take the stuck CPUs out of the watch group */ set_cpumask_stuck(&wd_smp_cpus_pending, tb); @@ -275,9 +277,12 @@ void arch_touch_nmi_watchdog(void) { unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000; int cpu = smp_processor_id(); + u64 tb = get_tb(); - if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks) - watchdog_timer_interrupt(cpu); + if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) { + per_cpu(wd_timer_tb, cpu) = tb; + wd_smp_clear_cpu_pending(cpu, tb); + } } EXPORT_SYMBOL(arch_touch_nmi_watchdog); |