summaryrefslogtreecommitdiff
path: root/arch/powerpc/platforms/powernv
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2017-07-30 12:15:13 +0300
committerIngo Molnar <mingo@kernel.org>2017-07-30 12:15:13 +0300
commitf5db340f19f14a8df9dfd22d71fba1513e9f1f7e (patch)
tree131d3345bc987aee3c922624de816492e7f323a4 /arch/powerpc/platforms/powernv
parentee438ec8f33c5af0d4a4ffb935c5b9272e8c2680 (diff)
parent38115f2f8cec8087d558c062e779c443a01f87d6 (diff)
downloadlinux-f5db340f19f14a8df9dfd22d71fba1513e9f1f7e.tar.xz
Merge branch 'perf/urgent' into perf/core, to pick up latest fixes and refresh the tree
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r--arch/powerpc/platforms/powernv/eeh-powernv.c16
-rw-r--r--arch/powerpc/platforms/powernv/idle.c198
-rw-r--r--arch/powerpc/platforms/powernv/opal-wrappers.S6
-rw-r--r--arch/powerpc/platforms/powernv/opal.c19
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c133
-rw-r--r--arch/powerpc/platforms/powernv/pci.c160
-rw-r--r--arch/powerpc/platforms/powernv/pci.h13
-rw-r--r--arch/powerpc/platforms/powernv/setup.c11
-rw-r--r--arch/powerpc/platforms/powernv/smp.c34
-rw-r--r--arch/powerpc/platforms/powernv/subcore.c3
10 files changed, 482 insertions, 111 deletions
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index d12ea7b9fd47..3f48f6df1cf3 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -48,6 +48,7 @@ static int pnv_eeh_init(void)
{
struct pci_controller *hose;
struct pnv_phb *phb;
+ int max_diag_size = PNV_PCI_DIAG_BUF_SIZE;
if (!firmware_has_feature(FW_FEATURE_OPAL)) {
pr_warn("%s: OPAL is required !\n",
@@ -69,6 +70,9 @@ static int pnv_eeh_init(void)
if (phb->model == PNV_PHB_MODEL_P7IOC)
eeh_add_flag(EEH_ENABLE_IO_FOR_LOG);
+ if (phb->diag_data_size > max_diag_size)
+ max_diag_size = phb->diag_data_size;
+
/*
* PE#0 should be regarded as valid by EEH core
* if it's not the reserved one. Currently, we
@@ -82,6 +86,8 @@ static int pnv_eeh_init(void)
break;
}
+ eeh_set_pe_aux_size(max_diag_size);
+
return 0;
}
@@ -540,7 +546,7 @@ static void pnv_eeh_get_phb_diag(struct eeh_pe *pe)
s64 rc;
rc = opal_pci_get_phb_diag_data2(phb->opal_id, pe->data,
- PNV_PCI_DIAG_BUF_SIZE);
+ phb->diag_data_size);
if (rc != OPAL_SUCCESS)
pr_warn("%s: Failure %lld getting PHB#%x diag-data\n",
__func__, rc, pe->phb->global_number);
@@ -1314,7 +1320,8 @@ static void pnv_eeh_dump_hub_diag_common(struct OpalIoP7IOCErrorData *data)
static void pnv_eeh_get_and_dump_hub_diag(struct pci_controller *hose)
{
struct pnv_phb *phb = hose->private_data;
- struct OpalIoP7IOCErrorData *data = &phb->diag.hub_diag;
+ struct OpalIoP7IOCErrorData *data =
+ (struct OpalIoP7IOCErrorData*)phb->diag_data;
long rc;
rc = opal_pci_get_hub_diag_data(phb->hub_id, data, sizeof(*data));
@@ -1549,10 +1556,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
/* Dump PHB diag-data */
rc = opal_pci_get_phb_diag_data2(phb->opal_id,
- phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE);
+ phb->diag_data, phb->diag_data_size);
if (rc == OPAL_SUCCESS)
pnv_pci_dump_phb_diag_data(hose,
- phb->diag.blob);
+ phb->diag_data);
/* Try best to clear it */
opal_pci_eeh_freeze_clear(phb->opal_id,
@@ -1795,7 +1802,6 @@ static int __init eeh_powernv_init(void)
{
int ret = -EINVAL;
- eeh_set_pe_aux_size(PNV_PCI_DIAG_BUF_SIZE);
ret = eeh_ops_register(&pnv_eeh_ops);
if (!ret)
pr_info("EEH: PowerNV platform initialized\n");
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 445f30a2c5ef..2abee070373f 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -23,6 +23,7 @@
#include <asm/cpuidle.h>
#include <asm/code-patching.h>
#include <asm/smp.h>
+#include <asm/runlatch.h>
#include "powernv.h"
#include "subcore.h"
@@ -30,8 +31,33 @@
/* Power ISA 3.0 allows for stop states 0x0 - 0xF */
#define MAX_STOP_STATE 0xF
+#define P9_STOP_SPR_MSR 2000
+#define P9_STOP_SPR_PSSCR 855
+
static u32 supported_cpuidle_states;
+/*
+ * The default stop state that will be used by ppc_md.power_save
+ * function on platforms that support stop instruction.
+ */
+static u64 pnv_default_stop_val;
+static u64 pnv_default_stop_mask;
+static bool default_stop_found;
+
+/*
+ * First deep stop state. Used to figure out when to save/restore
+ * hypervisor context.
+ */
+u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
+
+/*
+ * psscr value and mask of the deepest stop idle state.
+ * Used when a cpu is offlined.
+ */
+static u64 pnv_deepest_stop_psscr_val;
+static u64 pnv_deepest_stop_psscr_mask;
+static bool deepest_stop_found;
+
static int pnv_save_sprs_for_deep_states(void)
{
int cpu;
@@ -48,6 +74,8 @@ static int pnv_save_sprs_for_deep_states(void)
uint64_t hid4_val = mfspr(SPRN_HID4);
uint64_t hid5_val = mfspr(SPRN_HID5);
uint64_t hmeer_val = mfspr(SPRN_HMEER);
+ uint64_t msr_val = MSR_IDLE;
+ uint64_t psscr_val = pnv_deepest_stop_psscr_val;
for_each_possible_cpu(cpu) {
uint64_t pir = get_hard_smp_processor_id(cpu);
@@ -61,6 +89,18 @@ static int pnv_save_sprs_for_deep_states(void)
if (rc != 0)
return rc;
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ rc = opal_slw_set_reg(pir, P9_STOP_SPR_MSR, msr_val);
+ if (rc)
+ return rc;
+
+ rc = opal_slw_set_reg(pir,
+ P9_STOP_SPR_PSSCR, psscr_val);
+
+ if (rc)
+ return rc;
+ }
+
/* HIDs are per core registers */
if (cpu_thread_in_core(cpu) == 0) {
@@ -72,17 +112,21 @@ static int pnv_save_sprs_for_deep_states(void)
if (rc != 0)
return rc;
- rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val);
- if (rc != 0)
- return rc;
+ /* Only p8 needs to set extra HID regiters */
+ if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
- rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val);
- if (rc != 0)
- return rc;
+ rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val);
+ if (rc != 0)
+ return rc;
- rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val);
- if (rc != 0)
- return rc;
+ rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val);
+ if (rc != 0)
+ return rc;
+
+ rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val);
+ if (rc != 0)
+ return rc;
+ }
}
}
@@ -96,15 +140,24 @@ static void pnv_alloc_idle_core_states(void)
u32 *core_idle_state;
/*
- * core_idle_state - First 8 bits track the idle state of each thread
- * of the core. The 8th bit is the lock bit. Initially all thread bits
- * are set. They are cleared when the thread enters deep idle state
- * like sleep and winkle. Initially the lock bit is cleared.
- * The lock bit has 2 purposes
- * a. While the first thread is restoring core state, it prevents
- * other threads in the core from switching to process context.
- * b. While the last thread in the core is saving the core state, it
- * prevents a different thread from waking up.
+ * core_idle_state - The lower 8 bits track the idle state of
+ * each thread of the core.
+ *
+ * The most significant bit is the lock bit.
+ *
+ * Initially all the bits corresponding to threads_per_core
+ * are set. They are cleared when the thread enters deep idle
+ * state like sleep and winkle/stop.
+ *
+ * Initially the lock bit is cleared. The lock bit has 2
+ * purposes:
+ * a. While the first thread in the core waking up from
+ * idle is restoring core state, it prevents other
+ * threads in the core from switching to process
+ * context.
+ * b. While the last thread in the core is saving the
+ * core state, it prevents a different thread from
+ * waking up.
*/
for (i = 0; i < nr_cores; i++) {
int first_cpu = i * threads_per_core;
@@ -112,7 +165,7 @@ static void pnv_alloc_idle_core_states(void)
size_t paca_ptr_array_size;
core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
- *core_idle_state = PNV_CORE_IDLE_THREAD_BITS;
+ *core_idle_state = (1 << threads_per_core) - 1;
paca_ptr_array_size = (threads_per_core *
sizeof(struct paca_struct *));
@@ -231,56 +284,104 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600,
show_fastsleep_workaround_applyonce,
store_fastsleep_workaround_applyonce);
-/*
- * The default stop state that will be used by ppc_md.power_save
- * function on platforms that support stop instruction.
- */
-static u64 pnv_default_stop_val;
-static u64 pnv_default_stop_mask;
-static bool default_stop_found;
+static unsigned long __power7_idle_type(unsigned long type)
+{
+ unsigned long srr1;
-/*
- * Used for ppc_md.power_save which needs a function with no parameters
- */
-static void power9_idle(void)
+ if (!prep_irq_for_idle_irqsoff())
+ return 0;
+
+ __ppc64_runlatch_off();
+ srr1 = power7_idle_insn(type);
+ __ppc64_runlatch_on();
+
+ fini_irq_for_idle_irqsoff();
+
+ return srr1;
+}
+
+void power7_idle_type(unsigned long type)
+{
+ unsigned long srr1;
+
+ srr1 = __power7_idle_type(type);
+ irq_set_pending_from_srr1(srr1);
+}
+
+void power7_idle(void)
{
- power9_idle_stop(pnv_default_stop_val, pnv_default_stop_mask);
+ if (!powersave_nap)
+ return;
+
+ power7_idle_type(PNV_THREAD_NAP);
}
-/*
- * First deep stop state. Used to figure out when to save/restore
- * hypervisor context.
- */
-u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
+static unsigned long __power9_idle_type(unsigned long stop_psscr_val,
+ unsigned long stop_psscr_mask)
+{
+ unsigned long psscr;
+ unsigned long srr1;
+
+ if (!prep_irq_for_idle_irqsoff())
+ return 0;
+
+ psscr = mfspr(SPRN_PSSCR);
+ psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
+
+ __ppc64_runlatch_off();
+ srr1 = power9_idle_stop(psscr);
+ __ppc64_runlatch_on();
+
+ fini_irq_for_idle_irqsoff();
+
+ return srr1;
+}
+
+void power9_idle_type(unsigned long stop_psscr_val,
+ unsigned long stop_psscr_mask)
+{
+ unsigned long srr1;
+
+ srr1 = __power9_idle_type(stop_psscr_val, stop_psscr_mask);
+ irq_set_pending_from_srr1(srr1);
+}
/*
- * psscr value and mask of the deepest stop idle state.
- * Used when a cpu is offlined.
+ * Used for ppc_md.power_save which needs a function with no parameters
*/
-static u64 pnv_deepest_stop_psscr_val;
-static u64 pnv_deepest_stop_psscr_mask;
-static bool deepest_stop_found;
+void power9_idle(void)
+{
+ power9_idle_type(pnv_default_stop_val, pnv_default_stop_mask);
+}
+#ifdef CONFIG_HOTPLUG_CPU
/*
* pnv_cpu_offline: A function that puts the CPU into the deepest
* available platform idle state on a CPU-Offline.
+ * interrupts hard disabled and no lazy irq pending.
*/
unsigned long pnv_cpu_offline(unsigned int cpu)
{
unsigned long srr1;
-
u32 idle_states = pnv_get_supported_cpuidle_states();
+ __ppc64_runlatch_off();
+
if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) {
- srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val,
- pnv_deepest_stop_psscr_mask);
+ unsigned long psscr;
+
+ psscr = mfspr(SPRN_PSSCR);
+ psscr = (psscr & ~pnv_deepest_stop_psscr_mask) |
+ pnv_deepest_stop_psscr_val;
+ srr1 = power9_idle_stop(psscr);
+
} else if (idle_states & OPAL_PM_WINKLE_ENABLED) {
- srr1 = power7_winkle();
+ srr1 = power7_idle_insn(PNV_THREAD_WINKLE);
} else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
(idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
- srr1 = power7_sleep();
+ srr1 = power7_idle_insn(PNV_THREAD_SLEEP);
} else if (idle_states & OPAL_PM_NAP_ENABLED) {
- srr1 = power7_nap(1);
+ srr1 = power7_idle_insn(PNV_THREAD_NAP);
} else {
/* This is the fallback method. We emulate snooze */
while (!generic_check_cpu_restart(cpu)) {
@@ -291,8 +392,11 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
HMT_medium();
}
+ __ppc64_runlatch_on();
+
return srr1;
}
+#endif
/*
* Power ISA 3.0 idle initialization.
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index f620572f891f..4ca6c26a56d5 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -99,10 +99,10 @@ opal_return:
lwz r4,8(r1);
ld r5,PPC_LR_STKOFF(r1);
ld r6,PACASAVEDMSR(r13);
- mtspr SPRN_SRR0,r5;
- mtspr SPRN_SRR1,r6;
mtcr r4;
- rfid
+ mtspr SPRN_HSRR0,r5;
+ mtspr SPRN_HSRR1,r6;
+ hrfid
opal_real_call:
mfcr r11
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 59684b4af4d1..cad6b57ce494 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -59,6 +59,8 @@ static struct task_struct *kopald_tsk;
void opal_configure_cores(void)
{
+ u64 reinit_flags = 0;
+
/* Do the actual re-init, This will clobber all FPRs, VRs, etc...
*
* It will preserve non volatile GPRs and HSPRG0/1. It will
@@ -66,11 +68,24 @@ void opal_configure_cores(void)
* but it might clobber a bunch.
*/
#ifdef __BIG_ENDIAN__
- opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_BE);
+ reinit_flags |= OPAL_REINIT_CPUS_HILE_BE;
#else
- opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_LE);
+ reinit_flags |= OPAL_REINIT_CPUS_HILE_LE;
#endif
+ /*
+ * POWER9 always support running hash:
+ * ie. Host hash supports hash guests
+ * Host radix supports hash/radix guests
+ */
+ if (early_cpu_has_feature(CPU_FTR_ARCH_300)) {
+ reinit_flags |= OPAL_REINIT_CPUS_MMU_HASH;
+ if (early_radix_enabled())
+ reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX;
+ }
+
+ opal_reinit_cpus(reinit_flags);
+
/* Restore some bits */
if (cur_cpu_spec->cpu_restore)
cur_cpu_spec->cpu_restore();
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 283caf1070c9..437613588df1 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1718,6 +1718,100 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
*/
}
+static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe)
+{
+ unsigned short vendor = 0;
+ struct pci_dev *pdev;
+
+ if (pe->device_count == 1)
+ return true;
+
+ /* pe->pdev should be set if it's a single device, pe->pbus if not */
+ if (!pe->pbus)
+ return true;
+
+ list_for_each_entry(pdev, &pe->pbus->devices, bus_list) {
+ if (!vendor) {
+ vendor = pdev->vendor;
+ continue;
+ }
+
+ if (pdev->vendor != vendor)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Reconfigure TVE#0 to be usable as 64-bit DMA space.
+ *
+ * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses.
+ * Devices can only access more than that if bit 59 of the PCI address is set
+ * by hardware, which indicates TVE#1 should be used instead of TVE#0.
+ * Many PCI devices are not capable of addressing that many bits, and as a
+ * result are limited to the 4GB of virtual memory made available to 32-bit
+ * devices in TVE#0.
+ *
+ * In order to work around this, reconfigure TVE#0 to be suitable for 64-bit
+ * devices by configuring the virtual memory past the first 4GB inaccessible
+ * by 64-bit DMAs. This should only be used by devices that want more than
+ * 4GB, and only on PEs that have no 32-bit devices.
+ *
+ * Currently this will only work on PHB3 (POWER8).
+ */
+static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
+{
+ u64 window_size, table_size, tce_count, addr;
+ struct page *table_pages;
+ u64 tce_order = 28; /* 256MB TCEs */
+ __be64 *tces;
+ s64 rc;
+
+ /*
+ * Window size needs to be a power of two, but needs to account for
+ * shifting memory by the 4GB offset required to skip 32bit space.
+ */
+ window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
+ tce_count = window_size >> tce_order;
+ table_size = tce_count << 3;
+
+ if (table_size < PAGE_SIZE)
+ table_size = PAGE_SIZE;
+
+ table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
+ get_order(table_size));
+ if (!table_pages)
+ goto err;
+
+ tces = page_address(table_pages);
+ if (!tces)
+ goto err;
+
+ memset(tces, 0, table_size);
+
+ for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) {
+ tces[(addr + (1ULL << 32)) >> tce_order] =
+ cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
+ }
+
+ rc = opal_pci_map_pe_dma_window(pe->phb->opal_id,
+ pe->pe_number,
+ /* reconfigure window 0 */
+ (pe->pe_number << 1) + 0,
+ 1,
+ __pa(tces),
+ table_size,
+ 1 << tce_order);
+ if (rc == OPAL_SUCCESS) {
+ pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
+ return 0;
+ }
+err:
+ pe_err(pe, "Error configuring 64-bit DMA bypass\n");
+ return -EIO;
+}
+
static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
{
struct pci_controller *hose = pci_bus_to_host(pdev->bus);
@@ -1726,6 +1820,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
struct pnv_ioda_pe *pe;
uint64_t top;
bool bypass = false;
+ s64 rc;
if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
return -ENODEV;;
@@ -1740,8 +1835,27 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n");
set_dma_ops(&pdev->dev, &dma_direct_ops);
} else {
- dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
- set_dma_ops(&pdev->dev, &dma_iommu_ops);
+ /*
+ * If the device can't set the TCE bypass bit but still wants
+ * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
+ * bypass the 32-bit region and be usable for 64-bit DMAs.
+ * The device needs to be able to address all of this space.
+ */
+ if (dma_mask >> 32 &&
+ dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
+ pnv_pci_ioda_pe_single_vendor(pe) &&
+ phb->model == PNV_PHB_MODEL_PHB3) {
+ /* Configure the bypass mode */
+ rc = pnv_pci_ioda_dma_64bit_bypass(pe);
+ if (rc)
+ return rc;
+ /* 4GB offset bypasses 32-bit space */
+ set_dma_offset(&pdev->dev, (1ULL << 32));
+ set_dma_ops(&pdev->dev, &dma_direct_ops);
+ } else {
+ dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
+ set_dma_ops(&pdev->dev, &dma_iommu_ops);
+ }
}
*pdev->dev.dma_mask = dma_mask;
@@ -3123,13 +3237,13 @@ static int pnv_pci_diag_data_set(void *data, u64 val)
phb = hose->private_data;
/* Retrieve the diag data from firmware */
- ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
- PNV_PCI_DIAG_BUF_SIZE);
+ ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data,
+ phb->diag_data_size);
if (ret != OPAL_SUCCESS)
return -EIO;
/* Print the diag data to the kernel log */
- pnv_pci_dump_phb_diag_data(phb->hose, phb->diag.blob);
+ pnv_pci_dump_phb_diag_data(phb->hose, phb->diag_data);
return 0;
}
@@ -3725,6 +3839,15 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
else
phb->model = PNV_PHB_MODEL_UNKNOWN;
+ /* Initialize diagnostic data buffer */
+ prop32 = of_get_property(np, "ibm,phb-diag-data-size", NULL);
+ if (prop32)
+ phb->diag_data_size = be32_to_cpup(prop32);
+ else
+ phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE;
+
+ phb->diag_data = memblock_virt_alloc(phb->diag_data_size, 0);
+
/* Parse 32-bit and IO ranges (if any) */
pci_process_bridge_OF_ranges(hose, np, !hose->global_number);
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 935ccb249a8a..7905d179d036 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -227,11 +227,39 @@ void pnv_teardown_msi_irqs(struct pci_dev *pdev)
}
#endif /* CONFIG_PCI_MSI */
+/* Nicely print the contents of the PE State Tables (PEST). */
+static void pnv_pci_dump_pest(__be64 pestA[], __be64 pestB[], int pest_size)
+{
+ __be64 prevA = ULONG_MAX, prevB = ULONG_MAX;
+ bool dup = false;
+ int i;
+
+ for (i = 0; i < pest_size; i++) {
+ __be64 peA = be64_to_cpu(pestA[i]);
+ __be64 peB = be64_to_cpu(pestB[i]);
+
+ if (peA != prevA || peB != prevB) {
+ if (dup) {
+ pr_info("PE[..%03x] A/B: as above\n", i-1);
+ dup = false;
+ }
+ prevA = peA;
+ prevB = peB;
+ if (peA & PNV_IODA_STOPPED_STATE ||
+ peB & PNV_IODA_STOPPED_STATE)
+ pr_info("PE[%03x] A/B: %016llx %016llx\n",
+ i, peA, peB);
+ } else if (!dup && (peA & PNV_IODA_STOPPED_STATE ||
+ peB & PNV_IODA_STOPPED_STATE)) {
+ dup = true;
+ }
+ }
+}
+
static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose,
struct OpalIoPhbErrorCommon *common)
{
struct OpalIoP7IOCPhbErrorData *data;
- int i;
data = (struct OpalIoP7IOCPhbErrorData *)common;
pr_info("P7IOC PHB#%x Diag-data (Version: %d)\n",
@@ -308,22 +336,13 @@ static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose,
be64_to_cpu(data->dma1ErrorLog0),
be64_to_cpu(data->dma1ErrorLog1));
- for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) {
- if ((be64_to_cpu(data->pestA[i]) >> 63) == 0 &&
- (be64_to_cpu(data->pestB[i]) >> 63) == 0)
- continue;
-
- pr_info("PE[%3d] A/B: %016llx %016llx\n",
- i, be64_to_cpu(data->pestA[i]),
- be64_to_cpu(data->pestB[i]));
- }
+ pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_P7IOC_NUM_PEST_REGS);
}
static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose,
struct OpalIoPhbErrorCommon *common)
{
struct OpalIoPhb3ErrorData *data;
- int i;
data = (struct OpalIoPhb3ErrorData*)common;
pr_info("PHB3 PHB#%x Diag-data (Version: %d)\n",
@@ -404,15 +423,109 @@ static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose,
be64_to_cpu(data->dma1ErrorLog0),
be64_to_cpu(data->dma1ErrorLog1));
- for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) {
- if ((be64_to_cpu(data->pestA[i]) >> 63) == 0 &&
- (be64_to_cpu(data->pestB[i]) >> 63) == 0)
- continue;
+ pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_PHB3_NUM_PEST_REGS);
+}
- pr_info("PE[%3d] A/B: %016llx %016llx\n",
- i, be64_to_cpu(data->pestA[i]),
- be64_to_cpu(data->pestB[i]));
- }
+static void pnv_pci_dump_phb4_diag_data(struct pci_controller *hose,
+ struct OpalIoPhbErrorCommon *common)
+{
+ struct OpalIoPhb4ErrorData *data;
+
+ data = (struct OpalIoPhb4ErrorData*)common;
+ pr_info("PHB4 PHB#%d Diag-data (Version: %d)\n",
+ hose->global_number, be32_to_cpu(common->version));
+ if (data->brdgCtl)
+ pr_info("brdgCtl: %08x\n",
+ be32_to_cpu(data->brdgCtl));
+ if (data->deviceStatus || data->slotStatus ||
+ data->linkStatus || data->devCmdStatus ||
+ data->devSecStatus)
+ pr_info("RootSts: %08x %08x %08x %08x %08x\n",
+ be32_to_cpu(data->deviceStatus),
+ be32_to_cpu(data->slotStatus),
+ be32_to_cpu(data->linkStatus),
+ be32_to_cpu(data->devCmdStatus),
+ be32_to_cpu(data->devSecStatus));
+ if (data->rootErrorStatus || data->uncorrErrorStatus ||
+ data->corrErrorStatus)
+ pr_info("RootErrSts: %08x %08x %08x\n",
+ be32_to_cpu(data->rootErrorStatus),
+ be32_to_cpu(data->uncorrErrorStatus),
+ be32_to_cpu(data->corrErrorStatus));
+ if (data->tlpHdr1 || data->tlpHdr2 ||
+ data->tlpHdr3 || data->tlpHdr4)
+ pr_info("RootErrLog: %08x %08x %08x %08x\n",
+ be32_to_cpu(data->tlpHdr1),
+ be32_to_cpu(data->tlpHdr2),
+ be32_to_cpu(data->tlpHdr3),
+ be32_to_cpu(data->tlpHdr4));
+ if (data->sourceId)
+ pr_info("sourceId: %08x\n", be32_to_cpu(data->sourceId));
+ if (data->nFir)
+ pr_info("nFir: %016llx %016llx %016llx\n",
+ be64_to_cpu(data->nFir),
+ be64_to_cpu(data->nFirMask),
+ be64_to_cpu(data->nFirWOF));
+ if (data->phbPlssr || data->phbCsr)
+ pr_info("PhbSts: %016llx %016llx\n",
+ be64_to_cpu(data->phbPlssr),
+ be64_to_cpu(data->phbCsr));
+ if (data->lemFir)
+ pr_info("Lem: %016llx %016llx %016llx\n",
+ be64_to_cpu(data->lemFir),
+ be64_to_cpu(data->lemErrorMask),
+ be64_to_cpu(data->lemWOF));
+ if (data->phbErrorStatus)
+ pr_info("PhbErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbErrorStatus),
+ be64_to_cpu(data->phbFirstErrorStatus),
+ be64_to_cpu(data->phbErrorLog0),
+ be64_to_cpu(data->phbErrorLog1));
+ if (data->phbTxeErrorStatus)
+ pr_info("PhbTxeErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbTxeErrorStatus),
+ be64_to_cpu(data->phbTxeFirstErrorStatus),
+ be64_to_cpu(data->phbTxeErrorLog0),
+ be64_to_cpu(data->phbTxeErrorLog1));
+ if (data->phbRxeArbErrorStatus)
+ pr_info("RxeArbErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbRxeArbErrorStatus),
+ be64_to_cpu(data->phbRxeArbFirstErrorStatus),
+ be64_to_cpu(data->phbRxeArbErrorLog0),
+ be64_to_cpu(data->phbRxeArbErrorLog1));
+ if (data->phbRxeMrgErrorStatus)
+ pr_info("RxeMrgErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbRxeMrgErrorStatus),
+ be64_to_cpu(data->phbRxeMrgFirstErrorStatus),
+ be64_to_cpu(data->phbRxeMrgErrorLog0),
+ be64_to_cpu(data->phbRxeMrgErrorLog1));
+ if (data->phbRxeTceErrorStatus)
+ pr_info("RxeTceErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbRxeTceErrorStatus),
+ be64_to_cpu(data->phbRxeTceFirstErrorStatus),
+ be64_to_cpu(data->phbRxeTceErrorLog0),
+ be64_to_cpu(data->phbRxeTceErrorLog1));
+
+ if (data->phbPblErrorStatus)
+ pr_info("PblErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbPblErrorStatus),
+ be64_to_cpu(data->phbPblFirstErrorStatus),
+ be64_to_cpu(data->phbPblErrorLog0),
+ be64_to_cpu(data->phbPblErrorLog1));
+ if (data->phbPcieDlpErrorStatus)
+ pr_info("PcieDlp: %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbPcieDlpErrorLog1),
+ be64_to_cpu(data->phbPcieDlpErrorLog2),
+ be64_to_cpu(data->phbPcieDlpErrorStatus));
+ if (data->phbRegbErrorStatus)
+ pr_info("RegbErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbRegbErrorStatus),
+ be64_to_cpu(data->phbRegbFirstErrorStatus),
+ be64_to_cpu(data->phbRegbErrorLog0),
+ be64_to_cpu(data->phbRegbErrorLog1));
+
+
+ pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_PHB4_NUM_PEST_REGS);
}
void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
@@ -431,6 +544,9 @@ void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
case OPAL_PHB_ERROR_DATA_TYPE_PHB3:
pnv_pci_dump_phb3_diag_data(hose, common);
break;
+ case OPAL_PHB_ERROR_DATA_TYPE_PHB4:
+ pnv_pci_dump_phb4_diag_data(hose, common);
+ break;
default:
pr_warn("%s: Unrecognized ioType %d\n",
__func__, be32_to_cpu(common->ioType));
@@ -445,8 +561,8 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
spin_lock_irqsave(&phb->lock, flags);
/* Fetch PHB diag-data */
- rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
- PNV_PCI_DIAG_BUF_SIZE);
+ rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data,
+ phb->diag_data_size);
has_diag = (rc == OPAL_SUCCESS);
/* If PHB supports compound PE, to handle it */
@@ -474,7 +590,7 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
* with the normal errors generated when probing empty slots
*/
if (has_diag && ret)
- pnv_pci_dump_phb_diag_data(phb->hose, phb->diag.blob);
+ pnv_pci_dump_phb_diag_data(phb->hose, phb->diag_data);
spin_unlock_irqrestore(&phb->lock, flags);
}
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 18c8a2fa03b8..f16bc403ec03 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -33,6 +33,9 @@ enum pnv_phb_model {
#define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */
#define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */
+/* Indicates operations are frozen for a PE: MMIO in PESTA & DMA in PESTB. */
+#define PNV_IODA_STOPPED_STATE 0x8000000000000000
+
/* Data associated with a PE, including IOMMU tracking etc.. */
struct pnv_phb;
struct pnv_ioda_pe {
@@ -169,13 +172,9 @@ struct pnv_phb {
unsigned int pe_rmap[0x10000];
} ioda;
- /* PHB and hub status structure */
- union {
- unsigned char blob[PNV_PCI_DIAG_BUF_SIZE];
- struct OpalIoP7IOCPhbErrorData p7ioc;
- struct OpalIoPhb3ErrorData phb3;
- struct OpalIoP7IOCErrorData hub_diag;
- } diag;
+ /* PHB and hub diagnostics */
+ unsigned int diag_data_size;
+ u8 *diag_data;
/* Nvlink2 data */
struct npu {
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 2dc7e5fb86c3..897aa1400eb8 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -225,6 +225,8 @@ static void pnv_kexec_wait_secondaries_down(void)
static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
{
+ u64 reinit_flags;
+
if (xive_enabled())
xive_kexec_teardown_cpu(secondary);
else
@@ -254,8 +256,15 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
* We might be running as little-endian - now that interrupts
* are disabled, reset the HILE bit to big-endian so we don't
* take interrupts in the wrong endian later
+ *
+ * We reinit to enable both radix and hash on P9 to ensure
+ * the mode used by the next kernel is always supported.
*/
- opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_BE);
+ reinit_flags = OPAL_REINIT_CPUS_HILE_BE;
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX |
+ OPAL_REINIT_CPUS_MMU_HASH;
+ opal_reinit_cpus(reinit_flags);
}
}
#endif /* CONFIG_KEXEC_CORE */
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 4aff754b6f2c..40dae96f7e20 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -63,7 +63,8 @@ static int pnv_smp_kick_cpu(int nr)
long rc;
uint8_t status;
- BUG_ON(nr < 0 || nr >= NR_CPUS);
+ if (nr < 0 || nr >= nr_cpu_ids)
+ return -EINVAL;
/*
* If we already started or OPAL is not supported, we just
@@ -144,7 +145,14 @@ static void pnv_smp_cpu_kill_self(void)
unsigned long srr1, wmask;
/* Standard hot unplug procedure */
- local_irq_disable();
+ /*
+ * This hard disables local interurpts, ensuring we have no lazy
+ * irqs pending.
+ */
+ WARN_ON(irqs_disabled());
+ hard_irq_disable();
+ WARN_ON(lazy_irq_pending());
+
idle_task_exit();
current->active_mm = NULL; /* for sanity */
cpu = smp_processor_id();
@@ -162,16 +170,6 @@ static void pnv_smp_cpu_kill_self(void)
*/
mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1);
- /*
- * Hard-disable interrupts, and then clear irq_happened flags
- * that we can safely ignore while off-line, since they
- * are for things for which we do no processing when off-line
- * (or in the case of HMI, all the processing we need to do
- * is done in lower-level real-mode code).
- */
- hard_irq_disable();
- local_paca->irq_happened &= ~(PACA_IRQ_DEC | PACA_IRQ_HMI);
-
while (!generic_check_cpu_restart(cpu)) {
/*
* Clear IPI flag, since we don't handle IPIs while
@@ -182,9 +180,9 @@ static void pnv_smp_cpu_kill_self(void)
*/
kvmppc_set_host_ipi(cpu, 0);
- ppc64_runlatch_off();
srr1 = pnv_cpu_offline(cpu);
- ppc64_runlatch_on();
+
+ WARN_ON(lazy_irq_pending());
/*
* If the SRR1 value indicates that we woke up due to
@@ -198,8 +196,7 @@ static void pnv_smp_cpu_kill_self(void)
* contains 0.
*/
if (((srr1 & wmask) == SRR1_WAKEEE) ||
- ((srr1 & wmask) == SRR1_WAKEHVI) ||
- (local_paca->irq_happened & PACA_IRQ_EE)) {
+ ((srr1 & wmask) == SRR1_WAKEHVI)) {
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
if (xive_enabled())
xive_flush_interrupt();
@@ -211,14 +208,15 @@ static void pnv_smp_cpu_kill_self(void)
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
asm volatile(PPC_MSGCLR(%0) : : "r" (msg));
}
- local_paca->irq_happened &= ~(PACA_IRQ_EE | PACA_IRQ_DBELL);
smp_mb();
if (cpu_core_split_required())
continue;
if (srr1 && !generic_check_cpu_restart(cpu))
- DBG("CPU%d Unexpected exit while offline !\n", cpu);
+ DBG("CPU%d Unexpected exit while offline srr1=%lx!\n",
+ cpu, srr1);
+
}
/* Re-enable decrementer interrupts */
diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c
index 309876d699e9..596ae2e98040 100644
--- a/arch/powerpc/platforms/powernv/subcore.c
+++ b/arch/powerpc/platforms/powernv/subcore.c
@@ -18,6 +18,7 @@
#include <linux/stop_machine.h>
#include <asm/cputhreads.h>
+#include <asm/cpuidle.h>
#include <asm/kvm_ppc.h>
#include <asm/machdep.h>
#include <asm/opal.h>
@@ -182,7 +183,7 @@ static void unsplit_core(void)
cpu = smp_processor_id();
if (cpu_thread_in_core(cpu) != 0) {
while (mfspr(SPRN_HID0) & mask)
- power7_nap(0);
+ power7_idle_insn(PNV_THREAD_NAP);
per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT;
return;