diff options
author | Ingo Molnar <mingo@kernel.org> | 2017-07-30 12:15:13 +0300 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2017-07-30 12:15:13 +0300 |
commit | f5db340f19f14a8df9dfd22d71fba1513e9f1f7e (patch) | |
tree | 131d3345bc987aee3c922624de816492e7f323a4 /arch/powerpc/platforms/powernv | |
parent | ee438ec8f33c5af0d4a4ffb935c5b9272e8c2680 (diff) | |
parent | 38115f2f8cec8087d558c062e779c443a01f87d6 (diff) | |
download | linux-f5db340f19f14a8df9dfd22d71fba1513e9f1f7e.tar.xz |
Merge branch 'perf/urgent' into perf/core, to pick up latest fixes and refresh the tree
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r-- | arch/powerpc/platforms/powernv/eeh-powernv.c | 16 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/idle.c | 198 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal-wrappers.S | 6 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal.c | 19 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci-ioda.c | 133 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci.c | 160 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci.h | 13 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/setup.c | 11 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/smp.c | 34 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/subcore.c | 3 |
10 files changed, 482 insertions, 111 deletions
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index d12ea7b9fd47..3f48f6df1cf3 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -48,6 +48,7 @@ static int pnv_eeh_init(void) { struct pci_controller *hose; struct pnv_phb *phb; + int max_diag_size = PNV_PCI_DIAG_BUF_SIZE; if (!firmware_has_feature(FW_FEATURE_OPAL)) { pr_warn("%s: OPAL is required !\n", @@ -69,6 +70,9 @@ static int pnv_eeh_init(void) if (phb->model == PNV_PHB_MODEL_P7IOC) eeh_add_flag(EEH_ENABLE_IO_FOR_LOG); + if (phb->diag_data_size > max_diag_size) + max_diag_size = phb->diag_data_size; + /* * PE#0 should be regarded as valid by EEH core * if it's not the reserved one. Currently, we @@ -82,6 +86,8 @@ static int pnv_eeh_init(void) break; } + eeh_set_pe_aux_size(max_diag_size); + return 0; } @@ -540,7 +546,7 @@ static void pnv_eeh_get_phb_diag(struct eeh_pe *pe) s64 rc; rc = opal_pci_get_phb_diag_data2(phb->opal_id, pe->data, - PNV_PCI_DIAG_BUF_SIZE); + phb->diag_data_size); if (rc != OPAL_SUCCESS) pr_warn("%s: Failure %lld getting PHB#%x diag-data\n", __func__, rc, pe->phb->global_number); @@ -1314,7 +1320,8 @@ static void pnv_eeh_dump_hub_diag_common(struct OpalIoP7IOCErrorData *data) static void pnv_eeh_get_and_dump_hub_diag(struct pci_controller *hose) { struct pnv_phb *phb = hose->private_data; - struct OpalIoP7IOCErrorData *data = &phb->diag.hub_diag; + struct OpalIoP7IOCErrorData *data = + (struct OpalIoP7IOCErrorData*)phb->diag_data; long rc; rc = opal_pci_get_hub_diag_data(phb->hub_id, data, sizeof(*data)); @@ -1549,10 +1556,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) /* Dump PHB diag-data */ rc = opal_pci_get_phb_diag_data2(phb->opal_id, - phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE); + phb->diag_data, phb->diag_data_size); if (rc == OPAL_SUCCESS) pnv_pci_dump_phb_diag_data(hose, - phb->diag.blob); + phb->diag_data); /* Try best to clear it */ opal_pci_eeh_freeze_clear(phb->opal_id, @@ -1795,7 +1802,6 @@ static int __init eeh_powernv_init(void) { int ret = -EINVAL; - eeh_set_pe_aux_size(PNV_PCI_DIAG_BUF_SIZE); ret = eeh_ops_register(&pnv_eeh_ops); if (!ret) pr_info("EEH: PowerNV platform initialized\n"); diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 445f30a2c5ef..2abee070373f 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -23,6 +23,7 @@ #include <asm/cpuidle.h> #include <asm/code-patching.h> #include <asm/smp.h> +#include <asm/runlatch.h> #include "powernv.h" #include "subcore.h" @@ -30,8 +31,33 @@ /* Power ISA 3.0 allows for stop states 0x0 - 0xF */ #define MAX_STOP_STATE 0xF +#define P9_STOP_SPR_MSR 2000 +#define P9_STOP_SPR_PSSCR 855 + static u32 supported_cpuidle_states; +/* + * The default stop state that will be used by ppc_md.power_save + * function on platforms that support stop instruction. + */ +static u64 pnv_default_stop_val; +static u64 pnv_default_stop_mask; +static bool default_stop_found; + +/* + * First deep stop state. Used to figure out when to save/restore + * hypervisor context. + */ +u64 pnv_first_deep_stop_state = MAX_STOP_STATE; + +/* + * psscr value and mask of the deepest stop idle state. + * Used when a cpu is offlined. + */ +static u64 pnv_deepest_stop_psscr_val; +static u64 pnv_deepest_stop_psscr_mask; +static bool deepest_stop_found; + static int pnv_save_sprs_for_deep_states(void) { int cpu; @@ -48,6 +74,8 @@ static int pnv_save_sprs_for_deep_states(void) uint64_t hid4_val = mfspr(SPRN_HID4); uint64_t hid5_val = mfspr(SPRN_HID5); uint64_t hmeer_val = mfspr(SPRN_HMEER); + uint64_t msr_val = MSR_IDLE; + uint64_t psscr_val = pnv_deepest_stop_psscr_val; for_each_possible_cpu(cpu) { uint64_t pir = get_hard_smp_processor_id(cpu); @@ -61,6 +89,18 @@ static int pnv_save_sprs_for_deep_states(void) if (rc != 0) return rc; + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + rc = opal_slw_set_reg(pir, P9_STOP_SPR_MSR, msr_val); + if (rc) + return rc; + + rc = opal_slw_set_reg(pir, + P9_STOP_SPR_PSSCR, psscr_val); + + if (rc) + return rc; + } + /* HIDs are per core registers */ if (cpu_thread_in_core(cpu) == 0) { @@ -72,17 +112,21 @@ static int pnv_save_sprs_for_deep_states(void) if (rc != 0) return rc; - rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val); - if (rc != 0) - return rc; + /* Only p8 needs to set extra HID regiters */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { - rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val); - if (rc != 0) - return rc; + rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val); + if (rc != 0) + return rc; - rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val); - if (rc != 0) - return rc; + rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val); + if (rc != 0) + return rc; + + rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val); + if (rc != 0) + return rc; + } } } @@ -96,15 +140,24 @@ static void pnv_alloc_idle_core_states(void) u32 *core_idle_state; /* - * core_idle_state - First 8 bits track the idle state of each thread - * of the core. The 8th bit is the lock bit. Initially all thread bits - * are set. They are cleared when the thread enters deep idle state - * like sleep and winkle. Initially the lock bit is cleared. - * The lock bit has 2 purposes - * a. While the first thread is restoring core state, it prevents - * other threads in the core from switching to process context. - * b. While the last thread in the core is saving the core state, it - * prevents a different thread from waking up. + * core_idle_state - The lower 8 bits track the idle state of + * each thread of the core. + * + * The most significant bit is the lock bit. + * + * Initially all the bits corresponding to threads_per_core + * are set. They are cleared when the thread enters deep idle + * state like sleep and winkle/stop. + * + * Initially the lock bit is cleared. The lock bit has 2 + * purposes: + * a. While the first thread in the core waking up from + * idle is restoring core state, it prevents other + * threads in the core from switching to process + * context. + * b. While the last thread in the core is saving the + * core state, it prevents a different thread from + * waking up. */ for (i = 0; i < nr_cores; i++) { int first_cpu = i * threads_per_core; @@ -112,7 +165,7 @@ static void pnv_alloc_idle_core_states(void) size_t paca_ptr_array_size; core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node); - *core_idle_state = PNV_CORE_IDLE_THREAD_BITS; + *core_idle_state = (1 << threads_per_core) - 1; paca_ptr_array_size = (threads_per_core * sizeof(struct paca_struct *)); @@ -231,56 +284,104 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600, show_fastsleep_workaround_applyonce, store_fastsleep_workaround_applyonce); -/* - * The default stop state that will be used by ppc_md.power_save - * function on platforms that support stop instruction. - */ -static u64 pnv_default_stop_val; -static u64 pnv_default_stop_mask; -static bool default_stop_found; +static unsigned long __power7_idle_type(unsigned long type) +{ + unsigned long srr1; -/* - * Used for ppc_md.power_save which needs a function with no parameters - */ -static void power9_idle(void) + if (!prep_irq_for_idle_irqsoff()) + return 0; + + __ppc64_runlatch_off(); + srr1 = power7_idle_insn(type); + __ppc64_runlatch_on(); + + fini_irq_for_idle_irqsoff(); + + return srr1; +} + +void power7_idle_type(unsigned long type) +{ + unsigned long srr1; + + srr1 = __power7_idle_type(type); + irq_set_pending_from_srr1(srr1); +} + +void power7_idle(void) { - power9_idle_stop(pnv_default_stop_val, pnv_default_stop_mask); + if (!powersave_nap) + return; + + power7_idle_type(PNV_THREAD_NAP); } -/* - * First deep stop state. Used to figure out when to save/restore - * hypervisor context. - */ -u64 pnv_first_deep_stop_state = MAX_STOP_STATE; +static unsigned long __power9_idle_type(unsigned long stop_psscr_val, + unsigned long stop_psscr_mask) +{ + unsigned long psscr; + unsigned long srr1; + + if (!prep_irq_for_idle_irqsoff()) + return 0; + + psscr = mfspr(SPRN_PSSCR); + psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; + + __ppc64_runlatch_off(); + srr1 = power9_idle_stop(psscr); + __ppc64_runlatch_on(); + + fini_irq_for_idle_irqsoff(); + + return srr1; +} + +void power9_idle_type(unsigned long stop_psscr_val, + unsigned long stop_psscr_mask) +{ + unsigned long srr1; + + srr1 = __power9_idle_type(stop_psscr_val, stop_psscr_mask); + irq_set_pending_from_srr1(srr1); +} /* - * psscr value and mask of the deepest stop idle state. - * Used when a cpu is offlined. + * Used for ppc_md.power_save which needs a function with no parameters */ -static u64 pnv_deepest_stop_psscr_val; -static u64 pnv_deepest_stop_psscr_mask; -static bool deepest_stop_found; +void power9_idle(void) +{ + power9_idle_type(pnv_default_stop_val, pnv_default_stop_mask); +} +#ifdef CONFIG_HOTPLUG_CPU /* * pnv_cpu_offline: A function that puts the CPU into the deepest * available platform idle state on a CPU-Offline. + * interrupts hard disabled and no lazy irq pending. */ unsigned long pnv_cpu_offline(unsigned int cpu) { unsigned long srr1; - u32 idle_states = pnv_get_supported_cpuidle_states(); + __ppc64_runlatch_off(); + if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) { - srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val, - pnv_deepest_stop_psscr_mask); + unsigned long psscr; + + psscr = mfspr(SPRN_PSSCR); + psscr = (psscr & ~pnv_deepest_stop_psscr_mask) | + pnv_deepest_stop_psscr_val; + srr1 = power9_idle_stop(psscr); + } else if (idle_states & OPAL_PM_WINKLE_ENABLED) { - srr1 = power7_winkle(); + srr1 = power7_idle_insn(PNV_THREAD_WINKLE); } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { - srr1 = power7_sleep(); + srr1 = power7_idle_insn(PNV_THREAD_SLEEP); } else if (idle_states & OPAL_PM_NAP_ENABLED) { - srr1 = power7_nap(1); + srr1 = power7_idle_insn(PNV_THREAD_NAP); } else { /* This is the fallback method. We emulate snooze */ while (!generic_check_cpu_restart(cpu)) { @@ -291,8 +392,11 @@ unsigned long pnv_cpu_offline(unsigned int cpu) HMT_medium(); } + __ppc64_runlatch_on(); + return srr1; } +#endif /* * Power ISA 3.0 idle initialization. diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index f620572f891f..4ca6c26a56d5 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -99,10 +99,10 @@ opal_return: lwz r4,8(r1); ld r5,PPC_LR_STKOFF(r1); ld r6,PACASAVEDMSR(r13); - mtspr SPRN_SRR0,r5; - mtspr SPRN_SRR1,r6; mtcr r4; - rfid + mtspr SPRN_HSRR0,r5; + mtspr SPRN_HSRR1,r6; + hrfid opal_real_call: mfcr r11 diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 59684b4af4d1..cad6b57ce494 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -59,6 +59,8 @@ static struct task_struct *kopald_tsk; void opal_configure_cores(void) { + u64 reinit_flags = 0; + /* Do the actual re-init, This will clobber all FPRs, VRs, etc... * * It will preserve non volatile GPRs and HSPRG0/1. It will @@ -66,11 +68,24 @@ void opal_configure_cores(void) * but it might clobber a bunch. */ #ifdef __BIG_ENDIAN__ - opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_BE); + reinit_flags |= OPAL_REINIT_CPUS_HILE_BE; #else - opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_LE); + reinit_flags |= OPAL_REINIT_CPUS_HILE_LE; #endif + /* + * POWER9 always support running hash: + * ie. Host hash supports hash guests + * Host radix supports hash/radix guests + */ + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) { + reinit_flags |= OPAL_REINIT_CPUS_MMU_HASH; + if (early_radix_enabled()) + reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX; + } + + opal_reinit_cpus(reinit_flags); + /* Restore some bits */ if (cur_cpu_spec->cpu_restore) cur_cpu_spec->cpu_restore(); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 283caf1070c9..437613588df1 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1718,6 +1718,100 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev */ } +static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe) +{ + unsigned short vendor = 0; + struct pci_dev *pdev; + + if (pe->device_count == 1) + return true; + + /* pe->pdev should be set if it's a single device, pe->pbus if not */ + if (!pe->pbus) + return true; + + list_for_each_entry(pdev, &pe->pbus->devices, bus_list) { + if (!vendor) { + vendor = pdev->vendor; + continue; + } + + if (pdev->vendor != vendor) + return false; + } + + return true; +} + +/* + * Reconfigure TVE#0 to be usable as 64-bit DMA space. + * + * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses. + * Devices can only access more than that if bit 59 of the PCI address is set + * by hardware, which indicates TVE#1 should be used instead of TVE#0. + * Many PCI devices are not capable of addressing that many bits, and as a + * result are limited to the 4GB of virtual memory made available to 32-bit + * devices in TVE#0. + * + * In order to work around this, reconfigure TVE#0 to be suitable for 64-bit + * devices by configuring the virtual memory past the first 4GB inaccessible + * by 64-bit DMAs. This should only be used by devices that want more than + * 4GB, and only on PEs that have no 32-bit devices. + * + * Currently this will only work on PHB3 (POWER8). + */ +static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe) +{ + u64 window_size, table_size, tce_count, addr; + struct page *table_pages; + u64 tce_order = 28; /* 256MB TCEs */ + __be64 *tces; + s64 rc; + + /* + * Window size needs to be a power of two, but needs to account for + * shifting memory by the 4GB offset required to skip 32bit space. + */ + window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32)); + tce_count = window_size >> tce_order; + table_size = tce_count << 3; + + if (table_size < PAGE_SIZE) + table_size = PAGE_SIZE; + + table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL, + get_order(table_size)); + if (!table_pages) + goto err; + + tces = page_address(table_pages); + if (!tces) + goto err; + + memset(tces, 0, table_size); + + for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) { + tces[(addr + (1ULL << 32)) >> tce_order] = + cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE); + } + + rc = opal_pci_map_pe_dma_window(pe->phb->opal_id, + pe->pe_number, + /* reconfigure window 0 */ + (pe->pe_number << 1) + 0, + 1, + __pa(tces), + table_size, + 1 << tce_order); + if (rc == OPAL_SUCCESS) { + pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n"); + return 0; + } +err: + pe_err(pe, "Error configuring 64-bit DMA bypass\n"); + return -EIO; +} + static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); @@ -1726,6 +1820,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) struct pnv_ioda_pe *pe; uint64_t top; bool bypass = false; + s64 rc; if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) return -ENODEV;; @@ -1740,8 +1835,27 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n"); set_dma_ops(&pdev->dev, &dma_direct_ops); } else { - dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); - set_dma_ops(&pdev->dev, &dma_iommu_ops); + /* + * If the device can't set the TCE bypass bit but still wants + * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to + * bypass the 32-bit region and be usable for 64-bit DMAs. + * The device needs to be able to address all of this space. + */ + if (dma_mask >> 32 && + dma_mask > (memory_hotplug_max() + (1ULL << 32)) && + pnv_pci_ioda_pe_single_vendor(pe) && + phb->model == PNV_PHB_MODEL_PHB3) { + /* Configure the bypass mode */ + rc = pnv_pci_ioda_dma_64bit_bypass(pe); + if (rc) + return rc; + /* 4GB offset bypasses 32-bit space */ + set_dma_offset(&pdev->dev, (1ULL << 32)); + set_dma_ops(&pdev->dev, &dma_direct_ops); + } else { + dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); + set_dma_ops(&pdev->dev, &dma_iommu_ops); + } } *pdev->dev.dma_mask = dma_mask; @@ -3123,13 +3237,13 @@ static int pnv_pci_diag_data_set(void *data, u64 val) phb = hose->private_data; /* Retrieve the diag data from firmware */ - ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob, - PNV_PCI_DIAG_BUF_SIZE); + ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data, + phb->diag_data_size); if (ret != OPAL_SUCCESS) return -EIO; /* Print the diag data to the kernel log */ - pnv_pci_dump_phb_diag_data(phb->hose, phb->diag.blob); + pnv_pci_dump_phb_diag_data(phb->hose, phb->diag_data); return 0; } @@ -3725,6 +3839,15 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, else phb->model = PNV_PHB_MODEL_UNKNOWN; + /* Initialize diagnostic data buffer */ + prop32 = of_get_property(np, "ibm,phb-diag-data-size", NULL); + if (prop32) + phb->diag_data_size = be32_to_cpup(prop32); + else + phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE; + + phb->diag_data = memblock_virt_alloc(phb->diag_data_size, 0); + /* Parse 32-bit and IO ranges (if any) */ pci_process_bridge_OF_ranges(hose, np, !hose->global_number); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 935ccb249a8a..7905d179d036 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -227,11 +227,39 @@ void pnv_teardown_msi_irqs(struct pci_dev *pdev) } #endif /* CONFIG_PCI_MSI */ +/* Nicely print the contents of the PE State Tables (PEST). */ +static void pnv_pci_dump_pest(__be64 pestA[], __be64 pestB[], int pest_size) +{ + __be64 prevA = ULONG_MAX, prevB = ULONG_MAX; + bool dup = false; + int i; + + for (i = 0; i < pest_size; i++) { + __be64 peA = be64_to_cpu(pestA[i]); + __be64 peB = be64_to_cpu(pestB[i]); + + if (peA != prevA || peB != prevB) { + if (dup) { + pr_info("PE[..%03x] A/B: as above\n", i-1); + dup = false; + } + prevA = peA; + prevB = peB; + if (peA & PNV_IODA_STOPPED_STATE || + peB & PNV_IODA_STOPPED_STATE) + pr_info("PE[%03x] A/B: %016llx %016llx\n", + i, peA, peB); + } else if (!dup && (peA & PNV_IODA_STOPPED_STATE || + peB & PNV_IODA_STOPPED_STATE)) { + dup = true; + } + } +} + static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose, struct OpalIoPhbErrorCommon *common) { struct OpalIoP7IOCPhbErrorData *data; - int i; data = (struct OpalIoP7IOCPhbErrorData *)common; pr_info("P7IOC PHB#%x Diag-data (Version: %d)\n", @@ -308,22 +336,13 @@ static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose, be64_to_cpu(data->dma1ErrorLog0), be64_to_cpu(data->dma1ErrorLog1)); - for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) { - if ((be64_to_cpu(data->pestA[i]) >> 63) == 0 && - (be64_to_cpu(data->pestB[i]) >> 63) == 0) - continue; - - pr_info("PE[%3d] A/B: %016llx %016llx\n", - i, be64_to_cpu(data->pestA[i]), - be64_to_cpu(data->pestB[i])); - } + pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_P7IOC_NUM_PEST_REGS); } static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose, struct OpalIoPhbErrorCommon *common) { struct OpalIoPhb3ErrorData *data; - int i; data = (struct OpalIoPhb3ErrorData*)common; pr_info("PHB3 PHB#%x Diag-data (Version: %d)\n", @@ -404,15 +423,109 @@ static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose, be64_to_cpu(data->dma1ErrorLog0), be64_to_cpu(data->dma1ErrorLog1)); - for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) { - if ((be64_to_cpu(data->pestA[i]) >> 63) == 0 && - (be64_to_cpu(data->pestB[i]) >> 63) == 0) - continue; + pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_PHB3_NUM_PEST_REGS); +} - pr_info("PE[%3d] A/B: %016llx %016llx\n", - i, be64_to_cpu(data->pestA[i]), - be64_to_cpu(data->pestB[i])); - } +static void pnv_pci_dump_phb4_diag_data(struct pci_controller *hose, + struct OpalIoPhbErrorCommon *common) +{ + struct OpalIoPhb4ErrorData *data; + + data = (struct OpalIoPhb4ErrorData*)common; + pr_info("PHB4 PHB#%d Diag-data (Version: %d)\n", + hose->global_number, be32_to_cpu(common->version)); + if (data->brdgCtl) + pr_info("brdgCtl: %08x\n", + be32_to_cpu(data->brdgCtl)); + if (data->deviceStatus || data->slotStatus || + data->linkStatus || data->devCmdStatus || + data->devSecStatus) + pr_info("RootSts: %08x %08x %08x %08x %08x\n", + be32_to_cpu(data->deviceStatus), + be32_to_cpu(data->slotStatus), + be32_to_cpu(data->linkStatus), + be32_to_cpu(data->devCmdStatus), + be32_to_cpu(data->devSecStatus)); + if (data->rootErrorStatus || data->uncorrErrorStatus || + data->corrErrorStatus) + pr_info("RootErrSts: %08x %08x %08x\n", + be32_to_cpu(data->rootErrorStatus), + be32_to_cpu(data->uncorrErrorStatus), + be32_to_cpu(data->corrErrorStatus)); + if (data->tlpHdr1 || data->tlpHdr2 || + data->tlpHdr3 || data->tlpHdr4) + pr_info("RootErrLog: %08x %08x %08x %08x\n", + be32_to_cpu(data->tlpHdr1), + be32_to_cpu(data->tlpHdr2), + be32_to_cpu(data->tlpHdr3), + be32_to_cpu(data->tlpHdr4)); + if (data->sourceId) + pr_info("sourceId: %08x\n", be32_to_cpu(data->sourceId)); + if (data->nFir) + pr_info("nFir: %016llx %016llx %016llx\n", + be64_to_cpu(data->nFir), + be64_to_cpu(data->nFirMask), + be64_to_cpu(data->nFirWOF)); + if (data->phbPlssr || data->phbCsr) + pr_info("PhbSts: %016llx %016llx\n", + be64_to_cpu(data->phbPlssr), + be64_to_cpu(data->phbCsr)); + if (data->lemFir) + pr_info("Lem: %016llx %016llx %016llx\n", + be64_to_cpu(data->lemFir), + be64_to_cpu(data->lemErrorMask), + be64_to_cpu(data->lemWOF)); + if (data->phbErrorStatus) + pr_info("PhbErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbErrorStatus), + be64_to_cpu(data->phbFirstErrorStatus), + be64_to_cpu(data->phbErrorLog0), + be64_to_cpu(data->phbErrorLog1)); + if (data->phbTxeErrorStatus) + pr_info("PhbTxeErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbTxeErrorStatus), + be64_to_cpu(data->phbTxeFirstErrorStatus), + be64_to_cpu(data->phbTxeErrorLog0), + be64_to_cpu(data->phbTxeErrorLog1)); + if (data->phbRxeArbErrorStatus) + pr_info("RxeArbErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbRxeArbErrorStatus), + be64_to_cpu(data->phbRxeArbFirstErrorStatus), + be64_to_cpu(data->phbRxeArbErrorLog0), + be64_to_cpu(data->phbRxeArbErrorLog1)); + if (data->phbRxeMrgErrorStatus) + pr_info("RxeMrgErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbRxeMrgErrorStatus), + be64_to_cpu(data->phbRxeMrgFirstErrorStatus), + be64_to_cpu(data->phbRxeMrgErrorLog0), + be64_to_cpu(data->phbRxeMrgErrorLog1)); + if (data->phbRxeTceErrorStatus) + pr_info("RxeTceErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbRxeTceErrorStatus), + be64_to_cpu(data->phbRxeTceFirstErrorStatus), + be64_to_cpu(data->phbRxeTceErrorLog0), + be64_to_cpu(data->phbRxeTceErrorLog1)); + + if (data->phbPblErrorStatus) + pr_info("PblErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbPblErrorStatus), + be64_to_cpu(data->phbPblFirstErrorStatus), + be64_to_cpu(data->phbPblErrorLog0), + be64_to_cpu(data->phbPblErrorLog1)); + if (data->phbPcieDlpErrorStatus) + pr_info("PcieDlp: %016llx %016llx %016llx\n", + be64_to_cpu(data->phbPcieDlpErrorLog1), + be64_to_cpu(data->phbPcieDlpErrorLog2), + be64_to_cpu(data->phbPcieDlpErrorStatus)); + if (data->phbRegbErrorStatus) + pr_info("RegbErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbRegbErrorStatus), + be64_to_cpu(data->phbRegbFirstErrorStatus), + be64_to_cpu(data->phbRegbErrorLog0), + be64_to_cpu(data->phbRegbErrorLog1)); + + + pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_PHB4_NUM_PEST_REGS); } void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, @@ -431,6 +544,9 @@ void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, case OPAL_PHB_ERROR_DATA_TYPE_PHB3: pnv_pci_dump_phb3_diag_data(hose, common); break; + case OPAL_PHB_ERROR_DATA_TYPE_PHB4: + pnv_pci_dump_phb4_diag_data(hose, common); + break; default: pr_warn("%s: Unrecognized ioType %d\n", __func__, be32_to_cpu(common->ioType)); @@ -445,8 +561,8 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no) spin_lock_irqsave(&phb->lock, flags); /* Fetch PHB diag-data */ - rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob, - PNV_PCI_DIAG_BUF_SIZE); + rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data, + phb->diag_data_size); has_diag = (rc == OPAL_SUCCESS); /* If PHB supports compound PE, to handle it */ @@ -474,7 +590,7 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no) * with the normal errors generated when probing empty slots */ if (has_diag && ret) - pnv_pci_dump_phb_diag_data(phb->hose, phb->diag.blob); + pnv_pci_dump_phb_diag_data(phb->hose, phb->diag_data); spin_unlock_irqrestore(&phb->lock, flags); } diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 18c8a2fa03b8..f16bc403ec03 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -33,6 +33,9 @@ enum pnv_phb_model { #define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */ #define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */ +/* Indicates operations are frozen for a PE: MMIO in PESTA & DMA in PESTB. */ +#define PNV_IODA_STOPPED_STATE 0x8000000000000000 + /* Data associated with a PE, including IOMMU tracking etc.. */ struct pnv_phb; struct pnv_ioda_pe { @@ -169,13 +172,9 @@ struct pnv_phb { unsigned int pe_rmap[0x10000]; } ioda; - /* PHB and hub status structure */ - union { - unsigned char blob[PNV_PCI_DIAG_BUF_SIZE]; - struct OpalIoP7IOCPhbErrorData p7ioc; - struct OpalIoPhb3ErrorData phb3; - struct OpalIoP7IOCErrorData hub_diag; - } diag; + /* PHB and hub diagnostics */ + unsigned int diag_data_size; + u8 *diag_data; /* Nvlink2 data */ struct npu { diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 2dc7e5fb86c3..897aa1400eb8 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -225,6 +225,8 @@ static void pnv_kexec_wait_secondaries_down(void) static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) { + u64 reinit_flags; + if (xive_enabled()) xive_kexec_teardown_cpu(secondary); else @@ -254,8 +256,15 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) * We might be running as little-endian - now that interrupts * are disabled, reset the HILE bit to big-endian so we don't * take interrupts in the wrong endian later + * + * We reinit to enable both radix and hash on P9 to ensure + * the mode used by the next kernel is always supported. */ - opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_BE); + reinit_flags = OPAL_REINIT_CPUS_HILE_BE; + if (cpu_has_feature(CPU_FTR_ARCH_300)) + reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX | + OPAL_REINIT_CPUS_MMU_HASH; + opal_reinit_cpus(reinit_flags); } } #endif /* CONFIG_KEXEC_CORE */ diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 4aff754b6f2c..40dae96f7e20 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -63,7 +63,8 @@ static int pnv_smp_kick_cpu(int nr) long rc; uint8_t status; - BUG_ON(nr < 0 || nr >= NR_CPUS); + if (nr < 0 || nr >= nr_cpu_ids) + return -EINVAL; /* * If we already started or OPAL is not supported, we just @@ -144,7 +145,14 @@ static void pnv_smp_cpu_kill_self(void) unsigned long srr1, wmask; /* Standard hot unplug procedure */ - local_irq_disable(); + /* + * This hard disables local interurpts, ensuring we have no lazy + * irqs pending. + */ + WARN_ON(irqs_disabled()); + hard_irq_disable(); + WARN_ON(lazy_irq_pending()); + idle_task_exit(); current->active_mm = NULL; /* for sanity */ cpu = smp_processor_id(); @@ -162,16 +170,6 @@ static void pnv_smp_cpu_kill_self(void) */ mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1); - /* - * Hard-disable interrupts, and then clear irq_happened flags - * that we can safely ignore while off-line, since they - * are for things for which we do no processing when off-line - * (or in the case of HMI, all the processing we need to do - * is done in lower-level real-mode code). - */ - hard_irq_disable(); - local_paca->irq_happened &= ~(PACA_IRQ_DEC | PACA_IRQ_HMI); - while (!generic_check_cpu_restart(cpu)) { /* * Clear IPI flag, since we don't handle IPIs while @@ -182,9 +180,9 @@ static void pnv_smp_cpu_kill_self(void) */ kvmppc_set_host_ipi(cpu, 0); - ppc64_runlatch_off(); srr1 = pnv_cpu_offline(cpu); - ppc64_runlatch_on(); + + WARN_ON(lazy_irq_pending()); /* * If the SRR1 value indicates that we woke up due to @@ -198,8 +196,7 @@ static void pnv_smp_cpu_kill_self(void) * contains 0. */ if (((srr1 & wmask) == SRR1_WAKEEE) || - ((srr1 & wmask) == SRR1_WAKEHVI) || - (local_paca->irq_happened & PACA_IRQ_EE)) { + ((srr1 & wmask) == SRR1_WAKEHVI)) { if (cpu_has_feature(CPU_FTR_ARCH_300)) { if (xive_enabled()) xive_flush_interrupt(); @@ -211,14 +208,15 @@ static void pnv_smp_cpu_kill_self(void) unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); asm volatile(PPC_MSGCLR(%0) : : "r" (msg)); } - local_paca->irq_happened &= ~(PACA_IRQ_EE | PACA_IRQ_DBELL); smp_mb(); if (cpu_core_split_required()) continue; if (srr1 && !generic_check_cpu_restart(cpu)) - DBG("CPU%d Unexpected exit while offline !\n", cpu); + DBG("CPU%d Unexpected exit while offline srr1=%lx!\n", + cpu, srr1); + } /* Re-enable decrementer interrupts */ diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c index 309876d699e9..596ae2e98040 100644 --- a/arch/powerpc/platforms/powernv/subcore.c +++ b/arch/powerpc/platforms/powernv/subcore.c @@ -18,6 +18,7 @@ #include <linux/stop_machine.h> #include <asm/cputhreads.h> +#include <asm/cpuidle.h> #include <asm/kvm_ppc.h> #include <asm/machdep.h> #include <asm/opal.h> @@ -182,7 +183,7 @@ static void unsplit_core(void) cpu = smp_processor_id(); if (cpu_thread_in_core(cpu) != 0) { while (mfspr(SPRN_HID0) & mask) - power7_nap(0); + power7_idle_insn(PNV_THREAD_NAP); per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT; return; |