diff options
Diffstat (limited to 'arch/powerpc/platforms')
44 files changed, 3056 insertions, 1062 deletions
diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig index b369ed4e3675..25ebe634a661 100644 --- a/arch/powerpc/platforms/44x/Kconfig +++ b/arch/powerpc/platforms/44x/Kconfig @@ -272,14 +272,6 @@ config PPC4xx_GPIO help Enable gpiolib support for ppc440 based boards -config PPC4xx_OCM - bool "PPC4xx On Chip Memory (OCM) support" - depends on 4xx - select PPC_LIB_RHEAP - help - Enable OCM support for PowerPC 4xx platforms with on chip memory, - OCM provides the fast place for memory access to improve performance. - # 44x specific CPU modules, selected based on the board above. config 440EP bool diff --git a/arch/powerpc/platforms/4xx/Makefile b/arch/powerpc/platforms/4xx/Makefile index f5ae27ca131b..d009d2e0b9e8 100644 --- a/arch/powerpc/platforms/4xx/Makefile +++ b/arch/powerpc/platforms/4xx/Makefile @@ -1,6 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y += uic.o machine_check.o -obj-$(CONFIG_PPC4xx_OCM) += ocm.o obj-$(CONFIG_4xx_SOC) += soc.o obj-$(CONFIG_PCI) += pci.o obj-$(CONFIG_PPC4xx_HSTA_MSI) += hsta_msi.o diff --git a/arch/powerpc/platforms/4xx/ocm.c b/arch/powerpc/platforms/4xx/ocm.c deleted file mode 100644 index ba3257406ced..000000000000 --- a/arch/powerpc/platforms/4xx/ocm.c +++ /dev/null @@ -1,390 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * PowerPC 4xx OCM memory allocation support - * - * (C) Copyright 2009, Applied Micro Circuits Corporation - * Victor Gallardo (vgallardo@amcc.com) - * - * See file CREDITS for list of people who contributed to this - * project. - */ - -#include <linux/kernel.h> -#include <linux/dma-mapping.h> -#include <linux/of.h> -#include <linux/of_address.h> -#include <asm/rheap.h> -#include <asm/ppc4xx_ocm.h> -#include <linux/slab.h> -#include <linux/debugfs.h> - -#define OCM_DISABLED 0 -#define OCM_ENABLED 1 - -struct ocm_block { - struct list_head list; - void __iomem *addr; - int size; - const char *owner; -}; - -/* non-cached or cached region */ -struct ocm_region { - phys_addr_t phys; - void __iomem *virt; - - int memtotal; - int memfree; - - rh_info_t *rh; - struct list_head list; -}; - -struct ocm_info { - int index; - int status; - int ready; - - phys_addr_t phys; - - int alignment; - int memtotal; - int cache_size; - - struct ocm_region nc; /* non-cached region */ - struct ocm_region c; /* cached region */ -}; - -static struct ocm_info *ocm_nodes; -static int ocm_count; - -static struct ocm_info *ocm_get_node(unsigned int index) -{ - if (index >= ocm_count) { - printk(KERN_ERR "PPC4XX OCM: invalid index"); - return NULL; - } - - return &ocm_nodes[index]; -} - -static int ocm_free_region(struct ocm_region *ocm_reg, const void *addr) -{ - struct ocm_block *blk, *tmp; - unsigned long offset; - - if (!ocm_reg->virt) - return 0; - - list_for_each_entry_safe(blk, tmp, &ocm_reg->list, list) { - if (blk->addr == addr) { - offset = addr - ocm_reg->virt; - ocm_reg->memfree += blk->size; - rh_free(ocm_reg->rh, offset); - list_del(&blk->list); - kfree(blk); - return 1; - } - } - - return 0; -} - -static void __init ocm_init_node(int count, struct device_node *node) -{ - struct ocm_info *ocm; - - const unsigned int *cell_index; - const unsigned int *cache_size; - int len; - - struct resource rsrc; - - ocm = ocm_get_node(count); - - cell_index = of_get_property(node, "cell-index", &len); - if (!cell_index) { - printk(KERN_ERR "PPC4XX OCM: missing cell-index property"); - return; - } - ocm->index = *cell_index; - - if (of_device_is_available(node)) - ocm->status = OCM_ENABLED; - - cache_size = of_get_property(node, "cached-region-size", &len); - if (cache_size) - ocm->cache_size = *cache_size; - - if (of_address_to_resource(node, 0, &rsrc)) { - printk(KERN_ERR "PPC4XX OCM%d: could not get resource address\n", - ocm->index); - return; - } - - ocm->phys = rsrc.start; - ocm->memtotal = (rsrc.end - rsrc.start + 1); - - printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (%s)\n", - ocm->index, ocm->memtotal, - (ocm->status == OCM_DISABLED) ? "disabled" : "enabled"); - - if (ocm->status == OCM_DISABLED) - return; - - /* request region */ - - if (!request_mem_region(ocm->phys, ocm->memtotal, "ppc4xx_ocm")) { - printk(KERN_ERR "PPC4XX OCM%d: could not request region\n", - ocm->index); - return; - } - - /* Configure non-cached and cached regions */ - - ocm->nc.phys = ocm->phys; - ocm->nc.memtotal = ocm->memtotal - ocm->cache_size; - ocm->nc.memfree = ocm->nc.memtotal; - - ocm->c.phys = ocm->phys + ocm->nc.memtotal; - ocm->c.memtotal = ocm->cache_size; - ocm->c.memfree = ocm->c.memtotal; - - if (ocm->nc.memtotal == 0) - ocm->nc.phys = 0; - - if (ocm->c.memtotal == 0) - ocm->c.phys = 0; - - printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (non-cached)\n", - ocm->index, ocm->nc.memtotal); - - printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (cached)\n", - ocm->index, ocm->c.memtotal); - - /* ioremap the non-cached region */ - if (ocm->nc.memtotal) { - ocm->nc.virt = __ioremap(ocm->nc.phys, ocm->nc.memtotal, - _PAGE_EXEC | pgprot_val(PAGE_KERNEL_NCG)); - - if (!ocm->nc.virt) { - printk(KERN_ERR - "PPC4XX OCM%d: failed to ioremap non-cached memory\n", - ocm->index); - ocm->nc.memfree = 0; - return; - } - } - - /* ioremap the cached region */ - - if (ocm->c.memtotal) { - ocm->c.virt = __ioremap(ocm->c.phys, ocm->c.memtotal, - _PAGE_EXEC | pgprot_val(PAGE_KERNEL)); - - if (!ocm->c.virt) { - printk(KERN_ERR - "PPC4XX OCM%d: failed to ioremap cached memory\n", - ocm->index); - ocm->c.memfree = 0; - return; - } - } - - /* Create Remote Heaps */ - - ocm->alignment = 4; /* default 4 byte alignment */ - - if (ocm->nc.virt) { - ocm->nc.rh = rh_create(ocm->alignment); - rh_attach_region(ocm->nc.rh, 0, ocm->nc.memtotal); - } - - if (ocm->c.virt) { - ocm->c.rh = rh_create(ocm->alignment); - rh_attach_region(ocm->c.rh, 0, ocm->c.memtotal); - } - - INIT_LIST_HEAD(&ocm->nc.list); - INIT_LIST_HEAD(&ocm->c.list); - - ocm->ready = 1; -} - -static int ocm_debugfs_show(struct seq_file *m, void *v) -{ - struct ocm_block *blk, *tmp; - unsigned int i; - - for (i = 0; i < ocm_count; i++) { - struct ocm_info *ocm = ocm_get_node(i); - - if (!ocm || !ocm->ready) - continue; - - seq_printf(m, "PPC4XX OCM : %d\n", ocm->index); - seq_printf(m, "PhysAddr : %pa\n", &(ocm->phys)); - seq_printf(m, "MemTotal : %d Bytes\n", ocm->memtotal); - seq_printf(m, "MemTotal(NC) : %d Bytes\n", ocm->nc.memtotal); - seq_printf(m, "MemTotal(C) : %d Bytes\n\n", ocm->c.memtotal); - - seq_printf(m, "NC.PhysAddr : %pa\n", &(ocm->nc.phys)); - seq_printf(m, "NC.VirtAddr : 0x%p\n", ocm->nc.virt); - seq_printf(m, "NC.MemTotal : %d Bytes\n", ocm->nc.memtotal); - seq_printf(m, "NC.MemFree : %d Bytes\n", ocm->nc.memfree); - - list_for_each_entry_safe(blk, tmp, &ocm->nc.list, list) { - seq_printf(m, "NC.MemUsed : %d Bytes (%s)\n", - blk->size, blk->owner); - } - - seq_printf(m, "\nC.PhysAddr : %pa\n", &(ocm->c.phys)); - seq_printf(m, "C.VirtAddr : 0x%p\n", ocm->c.virt); - seq_printf(m, "C.MemTotal : %d Bytes\n", ocm->c.memtotal); - seq_printf(m, "C.MemFree : %d Bytes\n", ocm->c.memfree); - - list_for_each_entry_safe(blk, tmp, &ocm->c.list, list) { - seq_printf(m, "C.MemUsed : %d Bytes (%s)\n", - blk->size, blk->owner); - } - - seq_putc(m, '\n'); - } - - return 0; -} - -static int ocm_debugfs_open(struct inode *inode, struct file *file) -{ - return single_open(file, ocm_debugfs_show, NULL); -} - -static const struct file_operations ocm_debugfs_fops = { - .open = ocm_debugfs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int ocm_debugfs_init(void) -{ - struct dentry *junk; - - junk = debugfs_create_dir("ppc4xx_ocm", 0); - if (!junk) { - printk(KERN_ALERT "debugfs ppc4xx ocm: failed to create dir\n"); - return -1; - } - - if (debugfs_create_file("info", 0644, junk, NULL, &ocm_debugfs_fops)) { - printk(KERN_ALERT "debugfs ppc4xx ocm: failed to create file\n"); - return -1; - } - - return 0; -} - -void *ppc4xx_ocm_alloc(phys_addr_t *phys, int size, int align, - int flags, const char *owner) -{ - void __iomem *addr = NULL; - unsigned long offset; - struct ocm_info *ocm; - struct ocm_region *ocm_reg; - struct ocm_block *ocm_blk; - int i; - - for (i = 0; i < ocm_count; i++) { - ocm = ocm_get_node(i); - - if (!ocm || !ocm->ready) - continue; - - if (flags == PPC4XX_OCM_NON_CACHED) - ocm_reg = &ocm->nc; - else - ocm_reg = &ocm->c; - - if (!ocm_reg->virt) - continue; - - if (align < ocm->alignment) - align = ocm->alignment; - - offset = rh_alloc_align(ocm_reg->rh, size, align, NULL); - - if (IS_ERR_VALUE(offset)) - continue; - - ocm_blk = kzalloc(sizeof(*ocm_blk), GFP_KERNEL); - if (!ocm_blk) { - rh_free(ocm_reg->rh, offset); - break; - } - - *phys = ocm_reg->phys + offset; - addr = ocm_reg->virt + offset; - size = ALIGN(size, align); - - ocm_blk->addr = addr; - ocm_blk->size = size; - ocm_blk->owner = owner; - list_add_tail(&ocm_blk->list, &ocm_reg->list); - - ocm_reg->memfree -= size; - - break; - } - - return addr; -} - -void ppc4xx_ocm_free(const void *addr) -{ - int i; - - if (!addr) - return; - - for (i = 0; i < ocm_count; i++) { - struct ocm_info *ocm = ocm_get_node(i); - - if (!ocm || !ocm->ready) - continue; - - if (ocm_free_region(&ocm->nc, addr) || - ocm_free_region(&ocm->c, addr)) - return; - } -} - -static int __init ppc4xx_ocm_init(void) -{ - struct device_node *np; - int count; - - count = 0; - for_each_compatible_node(np, NULL, "ibm,ocm") - count++; - - if (!count) - return 0; - - ocm_nodes = kzalloc((count * sizeof(struct ocm_info)), GFP_KERNEL); - if (!ocm_nodes) - return -ENOMEM; - - ocm_count = count; - count = 0; - - for_each_compatible_node(np, NULL, "ibm,ocm") { - ocm_init_node(count, np); - count++; - } - - ocm_debugfs_init(); - - return 0; -} - -arch_initcall(ppc4xx_ocm_init); diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index f3fb79fccc72..d82e3664ffdf 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -197,7 +197,8 @@ endmenu config PPC601_SYNC_FIX bool "Workarounds for PPC601 bugs" - depends on PPC_BOOK3S_32 && PPC_PMAC + depends on PPC_BOOK3S_601 && PPC_PMAC + default y help Some versions of the PPC601 (the first PowerPC chip) have bugs which mean that extra synchronization instructions are required near diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 56a7c814160d..12543e53fa96 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -6,6 +6,9 @@ config PPC64 This option selects whether a 32-bit or a 64-bit kernel will be built. +config PPC_BOOK3S_32 + bool + menu "Processor support" choice prompt "Processor Type" @@ -21,13 +24,20 @@ choice If unsure, select 52xx/6xx/7xx/74xx/82xx/83xx/86xx. -config PPC_BOOK3S_32 - bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx" +config PPC_BOOK3S_6xx + bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx except 601" + select PPC_BOOK3S_32 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select PPC_HAVE_KUEP select PPC_HAVE_KUAP +config PPC_BOOK3S_601 + bool "PowerPC 601" + select PPC_BOOK3S_32 + select PPC_FPU + select PPC_HAVE_KUAP + config PPC_85xx bool "Freescale 85xx" select E500 @@ -450,8 +460,10 @@ config NOT_COHERENT_CACHE depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || \ GAMECUBE_COMMON || AMIGAONE select ARCH_HAS_DMA_COHERENT_TO_PFN + select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_HAS_SYNC_DMA_FOR_CPU + select DMA_DIRECT_REMAP default n if PPC_47x default y diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 16dfee29aa41..ca9ffc1c8685 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -486,7 +486,7 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np, window->table.it_size = size >> window->table.it_page_shift; window->table.it_ops = &cell_iommu_ops; - iommu_init_table(&window->table, iommu->nid); + iommu_init_table(&window->table, iommu->nid, 0, 0); pr_debug("\tioid %d\n", window->ioid); pr_debug("\tblocksize %ld\n", window->table.it_blocksize); diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c index 77fee09104f8..b500a6e47e6b 100644 --- a/arch/powerpc/platforms/pasemi/iommu.c +++ b/arch/powerpc/platforms/pasemi/iommu.c @@ -146,7 +146,7 @@ static void iommu_table_iobmap_setup(void) */ iommu_table_iobmap.it_blocksize = 4; iommu_table_iobmap.it_ops = &iommu_table_iobmap_ops; - iommu_init_table(&iommu_table_iobmap, 0); + iommu_init_table(&iommu_table_iobmap, 0, 0, 0); pr_debug(" <- %s\n", __func__); } diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 850eee860cf2..938803eab0ad 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -12,7 +12,6 @@ config PPC_POWERNV select EPAPR_BOOT select PPC_INDIRECT_PIO select PPC_UDBG_16550 - select PPC_SCOM select ARCH_RANDOM select CPU_FREQ select PPC_DOORBELL @@ -47,3 +46,7 @@ config PPC_VAS VAS adapters are found in POWER9 based systems. If unsure, say N. + +config SCOM_DEBUGFS + bool "Expose SCOM controllers via debugfs" + depends on DEBUG_FS diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index da2e99efbd04..a3ac9646119d 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -4,15 +4,19 @@ obj-y += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o +obj-y += ultravisor.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o +obj-$(CONFIG_FA_DUMP) += opal-fadump.o +obj-$(CONFIG_PRESERVE_FA_DUMP) += opal-fadump.o +obj-$(CONFIG_OPAL_CORE) += opal-core.o obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o obj-$(CONFIG_CXL_BASE) += pci-cxl.o obj-$(CONFIG_EEH) += eeh-powernv.o -obj-$(CONFIG_PPC_SCOM) += opal-xscom.o obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o obj-$(CONFIG_OPAL_PRD) += opal-prd.o obj-$(CONFIG_PERF_EVENTS) += opal-imc.o obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o obj-$(CONFIG_OCXL_BASE) += ocxl.o +obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 620a986209f5..6bc24a47e9ef 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -34,6 +34,7 @@ #include "powernv.h" #include "pci.h" +#include "../../../../drivers/pci/pci.h" static int eeh_event_irq = -EINVAL; @@ -41,13 +42,10 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev) { struct pci_dn *pdn = pci_get_pdn(pdev); - if (!pdev->is_virtfn) + if (eeh_has_flag(EEH_FORCE_DISABLED)) return; - /* - * The following operations will fail if VF's sysfs files - * aren't created or its resources aren't finalized. - */ + dev_dbg(&pdev->dev, "EEH: Setting up device\n"); eeh_add_device_early(pdn); eeh_add_device_late(pdev); eeh_sysfs_add_device(pdev); @@ -199,6 +197,25 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10); #endif /* CONFIG_DEBUG_FS */ +void pnv_eeh_enable_phbs(void) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; + /* + * If EEH is enabled, we're going to rely on that. + * Otherwise, we restore to conventional mechanism + * to clear frozen PE during PCI config access. + */ + if (eeh_enabled()) + phb->flags |= PNV_PHB_FLAG_EEH; + else + phb->flags &= ~PNV_PHB_FLAG_EEH; + } +} + /** * pnv_eeh_post_init - EEH platform dependent post initialization * @@ -213,9 +230,7 @@ int pnv_eeh_post_init(void) struct pnv_phb *phb; int ret = 0; - /* Probe devices & build address cache */ - eeh_probe_devices(); - eeh_addr_cache_build(); + eeh_show_enabled(); /* Register OPAL event notifier */ eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); @@ -237,19 +252,11 @@ int pnv_eeh_post_init(void) if (!eeh_enabled()) disable_irq(eeh_event_irq); + pnv_eeh_enable_phbs(); + list_for_each_entry(hose, &hose_list, list_node) { phb = hose->private_data; - /* - * If EEH is enabled, we're going to rely on that. - * Otherwise, we restore to conventional mechanism - * to clear frozen PE during PCI config access. - */ - if (eeh_enabled()) - phb->flags |= PNV_PHB_FLAG_EEH; - else - phb->flags &= ~PNV_PHB_FLAG_EEH; - /* Create debugfs entries */ #ifdef CONFIG_DEBUG_FS if (phb->has_dbgfs || !phb->dbgfs) @@ -377,6 +384,8 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; + eeh_edev_dbg(edev, "Probing device\n"); + /* Initialize eeh device */ edev->class_code = pdn->class_code; edev->mode &= 0xFFFFFF00; @@ -402,9 +411,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) /* Create PE */ ret = eeh_add_to_parent_pe(edev); if (ret) { - pr_warn("%s: Can't add PCI dev %04x:%02x:%02x.%01x to parent PE (%x)\n", - __func__, hose->global_number, pdn->busno, - PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn), ret); + eeh_edev_warn(edev, "Failed to add device to PE (code %d)\n", ret); return NULL; } @@ -453,11 +460,17 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) * Enable EEH explicitly so that we will do EEH check * while accessing I/O stuff */ - eeh_add_flag(EEH_ENABLED); + if (!eeh_has_flag(EEH_ENABLED)) { + enable_irq(eeh_event_irq); + pnv_eeh_enable_phbs(); + eeh_add_flag(EEH_ENABLED); + } /* Save memory bars */ eeh_save_bars(edev); + eeh_edev_dbg(edev, "EEH enabled on device\n"); + return NULL; } @@ -837,7 +850,7 @@ static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int option) int aer = edev ? edev->aer_cap : 0; u32 ctrl; - pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n", + pr_debug("%s: Secondary Reset PCI bus %04x:%02x with option %d\n", __func__, pci_domain_nr(dev->bus), dev->bus->number, option); @@ -895,6 +908,10 @@ static int pnv_eeh_bridge_reset(struct pci_dev *pdev, int option) if (!dn || !of_get_property(dn, "ibm,reset-by-firmware", NULL)) return __pnv_eeh_bridge_reset(pdev, option); + pr_debug("%s: FW reset PCI bus %04x:%02x with option %d\n", + __func__, pci_domain_nr(pdev->bus), + pdev->bus->number, option); + switch (option) { case EEH_RESET_FUNDAMENTAL: scope = OPAL_RESET_PCI_FUNDAMENTAL; @@ -1113,17 +1130,37 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option) return -EIO; } + if (pci_is_root_bus(bus)) + return pnv_eeh_root_reset(hose, option); + /* - * If dealing with the root bus (or the bus underneath the - * root port), we reset the bus underneath the root port. + * For hot resets try use the generic PCI error recovery reset + * functions. These correctly handles the case where the secondary + * bus is behind a hotplug slot and it will use the slot provided + * reset methods to prevent spurious hotplug events during the reset. * - * The cxl driver depends on this behaviour for bi-modal card - * switching. + * Fundemental resets need to be handled internally to EEH since the + * PCI core doesn't really have a concept of a fundemental reset, + * mainly because there's no standard way to generate one. Only a + * few devices require an FRESET so it should be fine. */ - if (pci_is_root_bus(bus) || - pci_is_root_bus(bus->parent)) - return pnv_eeh_root_reset(hose, option); + if (option != EEH_RESET_FUNDAMENTAL) { + /* + * NB: Skiboot and pnv_eeh_bridge_reset() also no-op the + * de-assert step. It's like the OPAL reset API was + * poorly designed or something... + */ + if (option == EEH_RESET_DEACTIVATE) + return 0; + rc = pci_bus_error_reset(bus->self); + if (!rc) + return 0; + } + + /* otherwise, use the generic bridge reset. this might call into FW */ + if (pci_is_root_bus(bus->parent)) + return pnv_eeh_root_reset(hose, option); return pnv_eeh_bridge_reset(bus->self, option); } diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 09f49eed7fb8..78599bca66c2 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -675,7 +675,8 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) sprs.ptcr = mfspr(SPRN_PTCR); sprs.rpr = mfspr(SPRN_RPR); sprs.tscr = mfspr(SPRN_TSCR); - sprs.ldbar = mfspr(SPRN_LDBAR); + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) + sprs.ldbar = mfspr(SPRN_LDBAR); sprs_saved = true; @@ -789,7 +790,8 @@ core_woken: mtspr(SPRN_MMCR0, sprs.mmcr0); mtspr(SPRN_MMCR1, sprs.mmcr1); mtspr(SPRN_MMCR2, sprs.mmcr2); - mtspr(SPRN_LDBAR, sprs.ldbar); + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) + mtspr(SPRN_LDBAR, sprs.ldbar); mtspr(SPRN_SPRG3, local_paca->sprg_vdso); diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index c16249d251f1..b95b9e3c4c98 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -89,6 +89,7 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index) } EXPORT_SYMBOL(pnv_pci_get_npu_dev); +#ifdef CONFIG_IOMMU_API /* * Returns the PE assoicated with the PCI device of the given * NPU. Returns the linked pci device if pci_dev != NULL. @@ -192,106 +193,6 @@ static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num) return 0; } -/* - * Enables 32 bit DMA on NPU. - */ -static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe) -{ - struct pci_dev *gpdev; - struct pnv_ioda_pe *gpe; - int64_t rc; - - /* - * Find the assoicated PCI devices and get the dma window - * information from there. - */ - if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV)) - return; - - gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); - if (!gpe) - return; - - rc = pnv_npu_set_window(&npe->table_group, 0, - gpe->table_group.tables[0]); - - /* - * NVLink devices use the same TCE table configuration as - * their parent device so drivers shouldn't be doing DMA - * operations directly on these devices. - */ - set_dma_ops(&npe->pdev->dev, &dma_dummy_ops); -} - -/* - * Enables bypass mode on the NPU. The NPU only supports one - * window per link, so bypass needs to be explicitly enabled or - * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be - * active at the same time. - */ -static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe) -{ - struct pnv_phb *phb = npe->phb; - int64_t rc = 0; - phys_addr_t top = memblock_end_of_DRAM(); - - if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev) - return -EINVAL; - - rc = pnv_npu_unset_window(&npe->table_group, 0); - if (rc != OPAL_SUCCESS) - return rc; - - /* Enable the bypass window */ - - top = roundup_pow_of_two(top); - dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n", - npe->pe_number); - rc = opal_pci_map_pe_dma_window_real(phb->opal_id, - npe->pe_number, npe->pe_number, - 0 /* bypass base */, top); - - if (rc == OPAL_SUCCESS) - pnv_pci_ioda2_tce_invalidate_entire(phb, false); - - return rc; -} - -void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass) -{ - int i; - struct pnv_phb *phb; - struct pci_dn *pdn; - struct pnv_ioda_pe *npe; - struct pci_dev *npdev; - - for (i = 0; ; ++i) { - npdev = pnv_pci_get_npu_dev(gpdev, i); - - if (!npdev) - break; - - pdn = pci_get_pdn(npdev); - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return; - - phb = pci_bus_to_host(npdev->bus)->private_data; - - /* We only do bypass if it's enabled on the linked device */ - npe = &phb->ioda.pe_array[pdn->pe_number]; - - if (bypass) { - dev_info(&npdev->dev, - "Using 64-bit DMA iommu bypass\n"); - pnv_npu_dma_set_bypass(npe); - } else { - dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n"); - pnv_npu_dma_set_32(npe); - } - } -} - -#ifdef CONFIG_IOMMU_API /* Switch ownership from platform code to external user (e.g. VFIO) */ static void pnv_npu_take_ownership(struct iommu_table_group *table_group) { diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c index 29ca523c1c79..a2aa5e433ac8 100644 --- a/arch/powerpc/platforms/powernv/opal-call.c +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -257,7 +257,7 @@ OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO); OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE); OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK); OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK); -OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ); +OPAL_CALL(opal_xive_allocate_irq_raw, OPAL_XIVE_ALLOCATE_IRQ); OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ); OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); @@ -287,3 +287,6 @@ OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR); OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64); OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE); OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT); +OPAL_CALL(opal_mpipl_update, OPAL_MPIPL_UPDATE); +OPAL_CALL(opal_mpipl_register_tag, OPAL_MPIPL_REGISTER_TAG); +OPAL_CALL(opal_mpipl_query_tag, OPAL_MPIPL_QUERY_TAG); diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c new file mode 100644 index 000000000000..ed895d82c048 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -0,0 +1,636 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Interface for exporting the OPAL ELF core. + * Heavily inspired from fs/proc/vmcore.c + * + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#define pr_fmt(fmt) "opal core: " fmt + +#include <linux/memblock.h> +#include <linux/uaccess.h> +#include <linux/proc_fs.h> +#include <linux/elf.h> +#include <linux/elfcore.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/slab.h> +#include <linux/crash_core.h> +#include <linux/of.h> + +#include <asm/page.h> +#include <asm/opal.h> +#include <asm/fadump-internal.h> + +#include "opal-fadump.h" + +#define MAX_PT_LOAD_CNT 8 + +/* NT_AUXV note related info */ +#define AUXV_CNT 1 +#define AUXV_DESC_SZ (((2 * AUXV_CNT) + 1) * sizeof(Elf64_Off)) + +struct opalcore_config { + u32 num_cpus; + /* PIR value of crashing CPU */ + u32 crashing_cpu; + + /* CPU state data info from F/W */ + u64 cpu_state_destination_vaddr; + u64 cpu_state_data_size; + u64 cpu_state_entry_size; + + /* OPAL memory to be exported as PT_LOAD segments */ + u64 ptload_addr[MAX_PT_LOAD_CNT]; + u64 ptload_size[MAX_PT_LOAD_CNT]; + u64 ptload_cnt; + + /* Pointer to the first PT_LOAD in the ELF core file */ + Elf64_Phdr *ptload_phdr; + + /* Total size of opalcore file. */ + size_t opalcore_size; + + /* Buffer for all the ELF core headers and the PT_NOTE */ + size_t opalcorebuf_sz; + char *opalcorebuf; + + /* NT_AUXV buffer */ + char auxv_buf[AUXV_DESC_SZ]; +}; + +struct opalcore { + struct list_head list; + u64 paddr; + size_t size; + loff_t offset; +}; + +static LIST_HEAD(opalcore_list); +static struct opalcore_config *oc_conf; +static const struct opal_mpipl_fadump *opalc_metadata; +static const struct opal_mpipl_fadump *opalc_cpu_metadata; + +/* + * Set crashing CPU's signal to SIGUSR1. if the kernel is triggered + * by kernel, SIGTERM otherwise. + */ +bool kernel_initiated; + +static struct opalcore * __init get_new_element(void) +{ + return kzalloc(sizeof(struct opalcore), GFP_KERNEL); +} + +static inline int is_opalcore_usable(void) +{ + return (oc_conf && oc_conf->opalcorebuf != NULL) ? 1 : 0; +} + +static Elf64_Word *append_elf64_note(Elf64_Word *buf, char *name, + u32 type, void *data, + size_t data_len) +{ + Elf64_Nhdr *note = (Elf64_Nhdr *)buf; + Elf64_Word namesz = strlen(name) + 1; + + note->n_namesz = cpu_to_be32(namesz); + note->n_descsz = cpu_to_be32(data_len); + note->n_type = cpu_to_be32(type); + buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf64_Word)); + memcpy(buf, name, namesz); + buf += DIV_ROUND_UP(namesz, sizeof(Elf64_Word)); + memcpy(buf, data, data_len); + buf += DIV_ROUND_UP(data_len, sizeof(Elf64_Word)); + + return buf; +} + +static void fill_prstatus(struct elf_prstatus *prstatus, int pir, + struct pt_regs *regs) +{ + memset(prstatus, 0, sizeof(struct elf_prstatus)); + elf_core_copy_kernel_regs(&(prstatus->pr_reg), regs); + + /* + * Overload PID with PIR value. + * As a PIR value could also be '0', add an offset of '100' + * to every PIR to avoid misinterpretations in GDB. + */ + prstatus->pr_pid = cpu_to_be32(100 + pir); + prstatus->pr_ppid = cpu_to_be32(1); + + /* + * Indicate SIGUSR1 for crash initiated from kernel. + * SIGTERM otherwise. + */ + if (pir == oc_conf->crashing_cpu) { + short sig; + + sig = kernel_initiated ? SIGUSR1 : SIGTERM; + prstatus->pr_cursig = cpu_to_be16(sig); + } +} + +static Elf64_Word *auxv_to_elf64_notes(Elf64_Word *buf, + u64 opal_boot_entry) +{ + Elf64_Off *bufp = (Elf64_Off *)oc_conf->auxv_buf; + int idx = 0; + + memset(bufp, 0, AUXV_DESC_SZ); + + /* Entry point of OPAL */ + bufp[idx++] = cpu_to_be64(AT_ENTRY); + bufp[idx++] = cpu_to_be64(opal_boot_entry); + + /* end of vector */ + bufp[idx++] = cpu_to_be64(AT_NULL); + + buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_AUXV, + oc_conf->auxv_buf, AUXV_DESC_SZ); + return buf; +} + +/* + * Read from the ELF header and then the crash dump. + * Returns number of bytes read on success, -errno on failure. + */ +static ssize_t read_opalcore(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *to, + loff_t pos, size_t count) +{ + struct opalcore *m; + ssize_t tsz, avail; + loff_t tpos = pos; + + if (pos >= oc_conf->opalcore_size) + return 0; + + /* Adjust count if it goes beyond opalcore size */ + avail = oc_conf->opalcore_size - pos; + if (count > avail) + count = avail; + + if (count == 0) + return 0; + + /* Read ELF core header and/or PT_NOTE segment */ + if (tpos < oc_conf->opalcorebuf_sz) { + tsz = min_t(size_t, oc_conf->opalcorebuf_sz - tpos, count); + memcpy(to, oc_conf->opalcorebuf + tpos, tsz); + to += tsz; + tpos += tsz; + count -= tsz; + } + + list_for_each_entry(m, &opalcore_list, list) { + /* nothing more to read here */ + if (count == 0) + break; + + if (tpos < m->offset + m->size) { + void *addr; + + tsz = min_t(size_t, m->offset + m->size - tpos, count); + addr = (void *)(m->paddr + tpos - m->offset); + memcpy(to, __va(addr), tsz); + to += tsz; + tpos += tsz; + count -= tsz; + } + } + + return (tpos - pos); +} + +static struct bin_attribute opal_core_attr = { + .attr = {.name = "core", .mode = 0400}, + .read = read_opalcore +}; + +/* + * Read CPU state dump data and convert it into ELF notes. + * + * Each register entry is of 16 bytes, A numerical identifier along with + * a GPR/SPR flag in the first 8 bytes and the register value in the next + * 8 bytes. For more details refer to F/W documentation. + */ +static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf) +{ + u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize; + struct hdat_fadump_thread_hdr *thdr; + struct elf_prstatus prstatus; + Elf64_Word *first_cpu_note; + struct pt_regs regs; + char *bufp; + int i; + + size_per_thread = oc_conf->cpu_state_entry_size; + bufp = __va(oc_conf->cpu_state_destination_vaddr); + + /* + * Offset for register entries, entry size and registers count is + * duplicated in every thread header in keeping with HDAT format. + * Use these values from the first thread header. + */ + thdr = (struct hdat_fadump_thread_hdr *)bufp; + regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) + + be32_to_cpu(thdr->offset)); + reg_esize = be32_to_cpu(thdr->esize); + regs_cnt = be32_to_cpu(thdr->ecnt); + + pr_debug("--------CPU State Data------------\n"); + pr_debug("NumCpus : %u\n", oc_conf->num_cpus); + pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n", + regs_offset, reg_esize, regs_cnt); + + /* + * Skip past the first CPU note. Fill this note with the + * crashing CPU's prstatus. + */ + first_cpu_note = buf; + buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + + for (i = 0; i < oc_conf->num_cpus; i++, bufp += size_per_thread) { + thdr = (struct hdat_fadump_thread_hdr *)bufp; + thread_pir = be32_to_cpu(thdr->pir); + + pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n", + i, thread_pir, thdr->core_state); + + /* + * Register state data of MAX cores is provided by firmware, + * but some of this cores may not be active. So, while + * processing register state data, check core state and + * skip threads that belong to inactive cores. + */ + if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE) + continue; + + opal_fadump_read_regs((bufp + regs_offset), regs_cnt, + reg_esize, false, ®s); + + pr_debug("PIR 0x%x - R1 : 0x%llx, NIP : 0x%llx\n", thread_pir, + be64_to_cpu(regs.gpr[1]), be64_to_cpu(regs.nip)); + fill_prstatus(&prstatus, thread_pir, ®s); + + if (thread_pir != oc_conf->crashing_cpu) { + buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, + NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + } else { + /* + * Add crashing CPU as the first NT_PRSTATUS note for + * GDB to process the core file appropriately. + */ + append_elf64_note(first_cpu_note, CRASH_CORE_NOTE_NAME, + NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + } + } + + return buf; +} + +static int __init create_opalcore(void) +{ + u64 opal_boot_entry, opal_base_addr, paddr; + u32 hdr_size, cpu_notes_size, count; + struct device_node *dn; + struct opalcore *new; + loff_t opalcore_off; + struct page *page; + Elf64_Phdr *phdr; + Elf64_Ehdr *elf; + int i, ret; + char *bufp; + + /* Get size of header & CPU notes for OPAL core */ + hdr_size = (sizeof(Elf64_Ehdr) + + ((oc_conf->ptload_cnt + 1) * sizeof(Elf64_Phdr))); + cpu_notes_size = ((oc_conf->num_cpus * (CRASH_CORE_NOTE_HEAD_BYTES + + CRASH_CORE_NOTE_NAME_BYTES + + CRASH_CORE_NOTE_DESC_BYTES)) + + (CRASH_CORE_NOTE_HEAD_BYTES + + CRASH_CORE_NOTE_NAME_BYTES + AUXV_DESC_SZ)); + + /* Allocate buffer to setup OPAL core */ + oc_conf->opalcorebuf_sz = PAGE_ALIGN(hdr_size + cpu_notes_size); + oc_conf->opalcorebuf = alloc_pages_exact(oc_conf->opalcorebuf_sz, + GFP_KERNEL | __GFP_ZERO); + if (!oc_conf->opalcorebuf) { + pr_err("Not enough memory to setup OPAL core (size: %lu)\n", + oc_conf->opalcorebuf_sz); + oc_conf->opalcorebuf_sz = 0; + return -ENOMEM; + } + count = oc_conf->opalcorebuf_sz / PAGE_SIZE; + page = virt_to_page(oc_conf->opalcorebuf); + for (i = 0; i < count; i++) + mark_page_reserved(page + i); + + pr_debug("opalcorebuf = 0x%llx\n", (u64)oc_conf->opalcorebuf); + + /* Read OPAL related device-tree entries */ + dn = of_find_node_by_name(NULL, "ibm,opal"); + if (dn) { + ret = of_property_read_u64(dn, "opal-base-address", + &opal_base_addr); + pr_debug("opal-base-address: %llx\n", opal_base_addr); + ret |= of_property_read_u64(dn, "opal-boot-address", + &opal_boot_entry); + pr_debug("opal-boot-address: %llx\n", opal_boot_entry); + } + if (!dn || ret) + pr_warn("WARNING: Failed to read OPAL base & entry values\n"); + + /* Use count to keep track of the program headers */ + count = 0; + + bufp = oc_conf->opalcorebuf; + elf = (Elf64_Ehdr *)bufp; + bufp += sizeof(Elf64_Ehdr); + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELFDATA2MSB; + elf->e_ident[EI_VERSION] = EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); + elf->e_type = cpu_to_be16(ET_CORE); + elf->e_machine = cpu_to_be16(ELF_ARCH); + elf->e_version = cpu_to_be32(EV_CURRENT); + elf->e_entry = 0; + elf->e_phoff = cpu_to_be64(sizeof(Elf64_Ehdr)); + elf->e_shoff = 0; + elf->e_flags = 0; + + elf->e_ehsize = cpu_to_be16(sizeof(Elf64_Ehdr)); + elf->e_phentsize = cpu_to_be16(sizeof(Elf64_Phdr)); + elf->e_phnum = 0; + elf->e_shentsize = 0; + elf->e_shnum = 0; + elf->e_shstrndx = 0; + + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = cpu_to_be32(PT_NOTE); + phdr->p_flags = 0; + phdr->p_align = 0; + phdr->p_paddr = phdr->p_vaddr = 0; + phdr->p_offset = cpu_to_be64(hdr_size); + phdr->p_filesz = phdr->p_memsz = cpu_to_be64(cpu_notes_size); + count++; + + opalcore_off = oc_conf->opalcorebuf_sz; + oc_conf->ptload_phdr = (Elf64_Phdr *)bufp; + paddr = 0; + for (i = 0; i < oc_conf->ptload_cnt; i++) { + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = cpu_to_be32(PT_LOAD); + phdr->p_flags = cpu_to_be32(PF_R|PF_W|PF_X); + phdr->p_align = 0; + + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = oc_conf->ptload_addr[i]; + new->size = oc_conf->ptload_size[i]; + new->offset = opalcore_off; + list_add_tail(&new->list, &opalcore_list); + + phdr->p_paddr = cpu_to_be64(paddr); + phdr->p_vaddr = cpu_to_be64(opal_base_addr + paddr); + phdr->p_filesz = phdr->p_memsz = + cpu_to_be64(oc_conf->ptload_size[i]); + phdr->p_offset = cpu_to_be64(opalcore_off); + + count++; + opalcore_off += oc_conf->ptload_size[i]; + paddr += oc_conf->ptload_size[i]; + } + + elf->e_phnum = cpu_to_be16(count); + + bufp = (char *)opalcore_append_cpu_notes((Elf64_Word *)bufp); + bufp = (char *)auxv_to_elf64_notes((Elf64_Word *)bufp, opal_boot_entry); + + oc_conf->opalcore_size = opalcore_off; + return 0; +} + +static void opalcore_cleanup(void) +{ + if (oc_conf == NULL) + return; + + /* Remove OPAL core sysfs file */ + sysfs_remove_bin_file(opal_kobj, &opal_core_attr); + oc_conf->ptload_phdr = NULL; + oc_conf->ptload_cnt = 0; + + /* free the buffer used for setting up OPAL core */ + if (oc_conf->opalcorebuf) { + void *end = (void *)((u64)oc_conf->opalcorebuf + + oc_conf->opalcorebuf_sz); + + free_reserved_area(oc_conf->opalcorebuf, end, -1, NULL); + oc_conf->opalcorebuf = NULL; + oc_conf->opalcorebuf_sz = 0; + } + + kfree(oc_conf); + oc_conf = NULL; +} +__exitcall(opalcore_cleanup); + +static void __init opalcore_config_init(void) +{ + u32 idx, cpu_data_version; + struct device_node *np; + const __be32 *prop; + u64 addr = 0; + int i, ret; + + np = of_find_node_by_path("/ibm,opal/dump"); + if (np == NULL) + return; + + if (!of_device_is_compatible(np, "ibm,opal-dump")) { + pr_warn("Support missing for this f/w version!\n"); + return; + } + + /* Check if dump has been initiated on last reboot */ + prop = of_get_property(np, "mpipl-boot", NULL); + if (!prop) { + of_node_put(np); + return; + } + + /* Get OPAL metadata */ + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_OPAL, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get OPAL metadata (%d)\n", ret); + goto error_out; + } + + addr = be64_to_cpu(addr); + pr_debug("OPAL metadata addr: %llx\n", addr); + opalc_metadata = __va(addr); + + /* Get OPAL CPU metadata */ + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get OPAL CPU metadata (%d)\n", ret); + goto error_out; + } + + addr = be64_to_cpu(addr); + pr_debug("CPU metadata addr: %llx\n", addr); + opalc_cpu_metadata = __va(addr); + + /* Allocate memory for config buffer */ + oc_conf = kzalloc(sizeof(struct opalcore_config), GFP_KERNEL); + if (oc_conf == NULL) + goto error_out; + + /* Parse OPAL metadata */ + if (opalc_metadata->version != OPAL_MPIPL_VERSION) { + pr_warn("Supported OPAL metadata version: %u, found: %u!\n", + OPAL_MPIPL_VERSION, opalc_metadata->version); + pr_warn("WARNING: F/W using newer OPAL metadata format!!\n"); + } + + oc_conf->ptload_cnt = 0; + idx = be32_to_cpu(opalc_metadata->region_cnt); + if (idx > MAX_PT_LOAD_CNT) { + pr_warn("WARNING: OPAL regions count (%d) adjusted to limit (%d)", + MAX_PT_LOAD_CNT, idx); + idx = MAX_PT_LOAD_CNT; + } + for (i = 0; i < idx; i++) { + oc_conf->ptload_addr[oc_conf->ptload_cnt] = + be64_to_cpu(opalc_metadata->region[i].dest); + oc_conf->ptload_size[oc_conf->ptload_cnt++] = + be64_to_cpu(opalc_metadata->region[i].size); + } + oc_conf->ptload_cnt = i; + oc_conf->crashing_cpu = be32_to_cpu(opalc_metadata->crashing_pir); + + if (!oc_conf->ptload_cnt) { + pr_err("OPAL memory regions not found\n"); + goto error_out; + } + + /* Parse OPAL CPU metadata */ + cpu_data_version = be32_to_cpu(opalc_cpu_metadata->cpu_data_version); + if (cpu_data_version != HDAT_FADUMP_CPU_DATA_VER) { + pr_warn("Supported CPU data version: %u, found: %u!\n", + HDAT_FADUMP_CPU_DATA_VER, cpu_data_version); + pr_warn("WARNING: F/W using newer CPU state data format!!\n"); + } + + addr = be64_to_cpu(opalc_cpu_metadata->region[0].dest); + if (!addr) { + pr_err("CPU state data not found!\n"); + goto error_out; + } + oc_conf->cpu_state_destination_vaddr = (u64)__va(addr); + + oc_conf->cpu_state_data_size = + be64_to_cpu(opalc_cpu_metadata->region[0].size); + oc_conf->cpu_state_entry_size = + be32_to_cpu(opalc_cpu_metadata->cpu_data_size); + + if ((oc_conf->cpu_state_entry_size == 0) || + (oc_conf->cpu_state_entry_size > oc_conf->cpu_state_data_size)) { + pr_err("CPU state data is invalid.\n"); + goto error_out; + } + oc_conf->num_cpus = (oc_conf->cpu_state_data_size / + oc_conf->cpu_state_entry_size); + + of_node_put(np); + return; + +error_out: + pr_err("Could not export /sys/firmware/opal/core\n"); + opalcore_cleanup(); + of_node_put(np); +} + +static ssize_t fadump_release_opalcore_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int input = -1; + + if (kstrtoint(buf, 0, &input)) + return -EINVAL; + + if (input == 1) { + if (oc_conf == NULL) { + pr_err("'/sys/firmware/opal/core' file not accessible!\n"); + return -EPERM; + } + + /* + * Take away '/sys/firmware/opal/core' and release all memory + * used for exporting this file. + */ + opalcore_cleanup(); + } else + return -EINVAL; + + return count; +} + +static struct kobj_attribute opalcore_rel_attr = __ATTR(fadump_release_opalcore, + 0200, NULL, + fadump_release_opalcore_store); + +static int __init opalcore_init(void) +{ + int rc = -1; + + opalcore_config_init(); + + if (oc_conf == NULL) + return rc; + + create_opalcore(); + + /* + * If oc_conf->opalcorebuf= is set in the 2nd kernel, + * then capture the dump. + */ + if (!(is_opalcore_usable())) { + pr_err("Failed to export /sys/firmware/opal/core\n"); + opalcore_cleanup(); + return rc; + } + + /* Set OPAL core file size */ + opal_core_attr.size = oc_conf->opalcore_size; + + /* Export OPAL core sysfs file */ + rc = sysfs_create_bin_file(opal_kobj, &opal_core_attr); + if (rc != 0) { + pr_err("Failed to export /sys/firmware/opal/core\n"); + opalcore_cleanup(); + return rc; + } + + rc = sysfs_create_file(kernel_kobj, &opalcore_rel_attr.attr); + if (rc) { + pr_warn("unable to create sysfs file fadump_release_opalcore (%d)\n", + rc); + } + + return 0; +} +fs_initcall(opalcore_init); diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c new file mode 100644 index 000000000000..d361d37d975f --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -0,0 +1,716 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Firmware-Assisted Dump support on POWER platform (OPAL). + * + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#define pr_fmt(fmt) "opal fadump: " fmt + +#include <linux/string.h> +#include <linux/seq_file.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/libfdt.h> +#include <linux/mm.h> +#include <linux/crash_dump.h> + +#include <asm/page.h> +#include <asm/opal.h> +#include <asm/fadump-internal.h> + +#include "opal-fadump.h" + + +#ifdef CONFIG_PRESERVE_FA_DUMP +/* + * When dump is active but PRESERVE_FA_DUMP is enabled on the kernel, + * ensure crash data is preserved in hope that the subsequent memory + * preserving kernel boot is going to process this crash data. + */ +void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) +{ + const struct opal_fadump_mem_struct *opal_fdm_active; + const __be32 *prop; + unsigned long dn; + u64 addr = 0; + s64 ret; + + dn = of_get_flat_dt_subnode_by_name(node, "dump"); + if (dn == -FDT_ERR_NOTFOUND) + return; + + /* + * Check if dump has been initiated on last reboot. + */ + prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL); + if (!prop) + return; + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_debug("Could not get Kernel metadata (%lld)\n", ret); + return; + } + + /* + * Preserve memory only if kernel memory regions are registered + * with f/w for MPIPL. + */ + addr = be64_to_cpu(addr); + pr_debug("Kernel metadata addr: %llx\n", addr); + opal_fdm_active = (void *)addr; + if (opal_fdm_active->registered_regions == 0) + return; + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_BOOT_MEM, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get boot memory tag (%lld)\n", ret); + return; + } + + /* + * Memory below this address can be used for booting a + * capture kernel or petitboot kernel. Preserve everything + * above this address for processing crashdump. + */ + fadump_conf->boot_mem_top = be64_to_cpu(addr); + pr_debug("Preserve everything above %llx\n", fadump_conf->boot_mem_top); + + pr_info("Firmware-assisted dump is active.\n"); + fadump_conf->dump_active = 1; +} + +#else /* CONFIG_PRESERVE_FA_DUMP */ +static const struct opal_fadump_mem_struct *opal_fdm_active; +static const struct opal_mpipl_fadump *opal_cpu_metadata; +static struct opal_fadump_mem_struct *opal_fdm; + +#ifdef CONFIG_OPAL_CORE +extern bool kernel_initiated; +#endif + +static int opal_fadump_unregister(struct fw_dump *fadump_conf); + +static void opal_fadump_update_config(struct fw_dump *fadump_conf, + const struct opal_fadump_mem_struct *fdm) +{ + pr_debug("Boot memory regions count: %d\n", fdm->region_cnt); + + /* + * The destination address of the first boot memory region is the + * destination address of boot memory regions. + */ + fadump_conf->boot_mem_dest_addr = fdm->rgn[0].dest; + pr_debug("Destination address of boot memory regions: %#016llx\n", + fadump_conf->boot_mem_dest_addr); + + fadump_conf->fadumphdr_addr = fdm->fadumphdr_addr; +} + +/* + * This function is called in the capture kernel to get configuration details + * from metadata setup by the first kernel. + */ +static void opal_fadump_get_config(struct fw_dump *fadump_conf, + const struct opal_fadump_mem_struct *fdm) +{ + unsigned long base, size, last_end, hole_size; + int i; + + if (!fadump_conf->dump_active) + return; + + last_end = 0; + hole_size = 0; + fadump_conf->boot_memory_size = 0; + + pr_debug("Boot memory regions:\n"); + for (i = 0; i < fdm->region_cnt; i++) { + base = fdm->rgn[i].src; + size = fdm->rgn[i].size; + pr_debug("\t[%03d] base: 0x%lx, size: 0x%lx\n", i, base, size); + + fadump_conf->boot_mem_addr[i] = base; + fadump_conf->boot_mem_sz[i] = size; + fadump_conf->boot_memory_size += size; + hole_size += (base - last_end); + + last_end = base + size; + } + + /* + * Start address of reserve dump area (permanent reservation) for + * re-registering FADump after dump capture. + */ + fadump_conf->reserve_dump_area_start = fdm->rgn[0].dest; + + /* + * Rarely, but it can so happen that system crashes before all + * boot memory regions are registered for MPIPL. In such + * cases, warn that the vmcore may not be accurate and proceed + * anyway as that is the best bet considering free pages, cache + * pages, user pages, etc are usually filtered out. + * + * Hope the memory that could not be preserved only has pages + * that are usually filtered out while saving the vmcore. + */ + if (fdm->region_cnt > fdm->registered_regions) { + pr_warn("Not all memory regions were saved!!!\n"); + pr_warn(" Unsaved memory regions:\n"); + i = fdm->registered_regions; + while (i < fdm->region_cnt) { + pr_warn("\t[%03d] base: 0x%llx, size: 0x%llx\n", + i, fdm->rgn[i].src, fdm->rgn[i].size); + i++; + } + + pr_warn("If the unsaved regions only contain pages that are filtered out (eg. free/user pages), the vmcore should still be usable.\n"); + pr_warn("WARNING: If the unsaved regions contain kernel pages, the vmcore will be corrupted.\n"); + } + + fadump_conf->boot_mem_top = (fadump_conf->boot_memory_size + hole_size); + fadump_conf->boot_mem_regs_cnt = fdm->region_cnt; + opal_fadump_update_config(fadump_conf, fdm); +} + +/* Initialize kernel metadata */ +static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm) +{ + fdm->version = OPAL_FADUMP_VERSION; + fdm->region_cnt = 0; + fdm->registered_regions = 0; + fdm->fadumphdr_addr = 0; +} + +static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf) +{ + u64 addr = fadump_conf->reserve_dump_area_start; + int i; + + opal_fdm = __va(fadump_conf->kernel_metadata); + opal_fadump_init_metadata(opal_fdm); + + /* Boot memory regions */ + for (i = 0; i < fadump_conf->boot_mem_regs_cnt; i++) { + opal_fdm->rgn[i].src = fadump_conf->boot_mem_addr[i]; + opal_fdm->rgn[i].dest = addr; + opal_fdm->rgn[i].size = fadump_conf->boot_mem_sz[i]; + + opal_fdm->region_cnt++; + addr += fadump_conf->boot_mem_sz[i]; + } + + /* + * Kernel metadata is passed to f/w and retrieved in capture kerenl. + * So, use it to save fadump header address instead of calculating it. + */ + opal_fdm->fadumphdr_addr = (opal_fdm->rgn[0].dest + + fadump_conf->boot_memory_size); + + opal_fadump_update_config(fadump_conf, opal_fdm); + + return addr; +} + +static u64 opal_fadump_get_metadata_size(void) +{ + return PAGE_ALIGN(sizeof(struct opal_fadump_mem_struct)); +} + +static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf) +{ + int err = 0; + s64 ret; + + /* + * Use the last page(s) in FADump memory reservation for + * kernel metadata. + */ + fadump_conf->kernel_metadata = (fadump_conf->reserve_dump_area_start + + fadump_conf->reserve_dump_area_size - + opal_fadump_get_metadata_size()); + pr_info("Kernel metadata addr: %llx\n", fadump_conf->kernel_metadata); + + /* Initialize kernel metadata before registering the address with f/w */ + opal_fdm = __va(fadump_conf->kernel_metadata); + opal_fadump_init_metadata(opal_fdm); + + /* + * Register metadata address with f/w. Can be retrieved in + * the capture kernel. + */ + ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, + fadump_conf->kernel_metadata); + if (ret != OPAL_SUCCESS) { + pr_err("Failed to set kernel metadata tag!\n"); + err = -EPERM; + } + + /* + * Register boot memory top address with f/w. Should be retrieved + * by a kernel that intends to preserve crash'ed kernel's memory. + */ + ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_BOOT_MEM, + fadump_conf->boot_mem_top); + if (ret != OPAL_SUCCESS) { + pr_err("Failed to set boot memory tag!\n"); + err = -EPERM; + } + + return err; +} + +static u64 opal_fadump_get_bootmem_min(void) +{ + return OPAL_FADUMP_MIN_BOOT_MEM; +} + +static int opal_fadump_register(struct fw_dump *fadump_conf) +{ + s64 rc = OPAL_PARAMETER; + int i, err = -EIO; + + for (i = 0; i < opal_fdm->region_cnt; i++) { + rc = opal_mpipl_update(OPAL_MPIPL_ADD_RANGE, + opal_fdm->rgn[i].src, + opal_fdm->rgn[i].dest, + opal_fdm->rgn[i].size); + if (rc != OPAL_SUCCESS) + break; + + opal_fdm->registered_regions++; + } + + switch (rc) { + case OPAL_SUCCESS: + pr_info("Registration is successful!\n"); + fadump_conf->dump_registered = 1; + err = 0; + break; + case OPAL_RESOURCE: + /* If MAX regions limit in f/w is hit, warn and proceed. */ + pr_warn("%d regions could not be registered for MPIPL as MAX limit is reached!\n", + (opal_fdm->region_cnt - opal_fdm->registered_regions)); + fadump_conf->dump_registered = 1; + err = 0; + break; + case OPAL_PARAMETER: + pr_err("Failed to register. Parameter Error(%lld).\n", rc); + break; + case OPAL_HARDWARE: + pr_err("Support not available.\n"); + fadump_conf->fadump_supported = 0; + fadump_conf->fadump_enabled = 0; + break; + default: + pr_err("Failed to register. Unknown Error(%lld).\n", rc); + break; + } + + /* + * If some regions were registered before OPAL_MPIPL_ADD_RANGE + * OPAL call failed, unregister all regions. + */ + if ((err < 0) && (opal_fdm->registered_regions > 0)) + opal_fadump_unregister(fadump_conf); + + return err; +} + +static int opal_fadump_unregister(struct fw_dump *fadump_conf) +{ + s64 rc; + + rc = opal_mpipl_update(OPAL_MPIPL_REMOVE_ALL, 0, 0, 0); + if (rc) { + pr_err("Failed to un-register - unexpected Error(%lld).\n", rc); + return -EIO; + } + + opal_fdm->registered_regions = 0; + fadump_conf->dump_registered = 0; + return 0; +} + +static int opal_fadump_invalidate(struct fw_dump *fadump_conf) +{ + s64 rc; + + rc = opal_mpipl_update(OPAL_MPIPL_FREE_PRESERVED_MEMORY, 0, 0, 0); + if (rc) { + pr_err("Failed to invalidate - unexpected Error(%lld).\n", rc); + return -EIO; + } + + fadump_conf->dump_active = 0; + opal_fdm_active = NULL; + return 0; +} + +static void opal_fadump_cleanup(struct fw_dump *fadump_conf) +{ + s64 ret; + + ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, 0); + if (ret != OPAL_SUCCESS) + pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret); +} + +/* + * Verify if CPU state data is available. If available, do a bit of sanity + * checking before processing this data. + */ +static bool __init is_opal_fadump_cpu_data_valid(struct fw_dump *fadump_conf) +{ + if (!opal_cpu_metadata) + return false; + + fadump_conf->cpu_state_data_version = + be32_to_cpu(opal_cpu_metadata->cpu_data_version); + fadump_conf->cpu_state_entry_size = + be32_to_cpu(opal_cpu_metadata->cpu_data_size); + fadump_conf->cpu_state_dest_vaddr = + (u64)__va(be64_to_cpu(opal_cpu_metadata->region[0].dest)); + fadump_conf->cpu_state_data_size = + be64_to_cpu(opal_cpu_metadata->region[0].size); + + if (fadump_conf->cpu_state_data_version != HDAT_FADUMP_CPU_DATA_VER) { + pr_warn("Supported CPU state data version: %u, found: %d!\n", + HDAT_FADUMP_CPU_DATA_VER, + fadump_conf->cpu_state_data_version); + pr_warn("WARNING: F/W using newer CPU state data format!!\n"); + } + + if ((fadump_conf->cpu_state_dest_vaddr == 0) || + (fadump_conf->cpu_state_entry_size == 0) || + (fadump_conf->cpu_state_entry_size > + fadump_conf->cpu_state_data_size)) { + pr_err("CPU state data is invalid. Ignoring!\n"); + return false; + } + + return true; +} + +/* + * Convert CPU state data saved at the time of crash into ELF notes. + * + * While the crashing CPU's register data is saved by the kernel, CPU state + * data for all CPUs is saved by f/w. In CPU state data provided by f/w, + * each register entry is of 16 bytes, a numerical identifier along with + * a GPR/SPR flag in the first 8 bytes and the register value in the next + * 8 bytes. For more details refer to F/W documentation. If this data is + * missing or in unsupported format, append crashing CPU's register data + * saved by the kernel in the PT_NOTE, to have something to work with in + * the vmcore file. + */ +static int __init +opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf, + struct fadump_crash_info_header *fdh) +{ + u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize; + struct hdat_fadump_thread_hdr *thdr; + bool is_cpu_data_valid = false; + u32 num_cpus = 1, *note_buf; + struct pt_regs regs; + char *bufp; + int rc, i; + + if (is_opal_fadump_cpu_data_valid(fadump_conf)) { + size_per_thread = fadump_conf->cpu_state_entry_size; + num_cpus = (fadump_conf->cpu_state_data_size / size_per_thread); + bufp = __va(fadump_conf->cpu_state_dest_vaddr); + is_cpu_data_valid = true; + } + + rc = fadump_setup_cpu_notes_buf(num_cpus); + if (rc != 0) + return rc; + + note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr; + if (!is_cpu_data_valid) + goto out; + + /* + * Offset for register entries, entry size and registers count is + * duplicated in every thread header in keeping with HDAT format. + * Use these values from the first thread header. + */ + thdr = (struct hdat_fadump_thread_hdr *)bufp; + regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) + + be32_to_cpu(thdr->offset)); + reg_esize = be32_to_cpu(thdr->esize); + regs_cnt = be32_to_cpu(thdr->ecnt); + + pr_debug("--------CPU State Data------------\n"); + pr_debug("NumCpus : %u\n", num_cpus); + pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n", + regs_offset, reg_esize, regs_cnt); + + for (i = 0; i < num_cpus; i++, bufp += size_per_thread) { + thdr = (struct hdat_fadump_thread_hdr *)bufp; + + thread_pir = be32_to_cpu(thdr->pir); + pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n", + i, thread_pir, thdr->core_state); + + /* + * If this is kernel initiated crash, crashing_cpu would be set + * appropriately and register data of the crashing CPU saved by + * crashing kernel. Add this saved register data of crashing CPU + * to elf notes and populate the pt_regs for the remaining CPUs + * from register state data provided by firmware. + */ + if (fdh->crashing_cpu == thread_pir) { + note_buf = fadump_regs_to_elf_notes(note_buf, + &fdh->regs); + pr_debug("Crashing CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n", + fdh->crashing_cpu, fdh->regs.gpr[1], + fdh->regs.nip); + continue; + } + + /* + * Register state data of MAX cores is provided by firmware, + * but some of this cores may not be active. So, while + * processing register state data, check core state and + * skip threads that belong to inactive cores. + */ + if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE) + continue; + + opal_fadump_read_regs((bufp + regs_offset), regs_cnt, + reg_esize, true, ®s); + note_buf = fadump_regs_to_elf_notes(note_buf, ®s); + pr_debug("CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n", + thread_pir, regs.gpr[1], regs.nip); + } + +out: + /* + * CPU state data is invalid/unsupported. Try appending crashing CPU's + * register data, if it is saved by the kernel. + */ + if (fadump_conf->cpu_notes_buf_vaddr == (u64)note_buf) { + if (fdh->crashing_cpu == FADUMP_CPU_UNKNOWN) { + fadump_free_cpu_notes_buf(); + return -ENODEV; + } + + pr_warn("WARNING: appending only crashing CPU's register data\n"); + note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs)); + } + + final_note(note_buf); + + pr_debug("Updating elfcore header (%llx) with cpu notes\n", + fdh->elfcorehdr_addr); + fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr)); + return 0; +} + +static int __init opal_fadump_process(struct fw_dump *fadump_conf) +{ + struct fadump_crash_info_header *fdh; + int rc = -EINVAL; + + if (!opal_fdm_active || !fadump_conf->fadumphdr_addr) + return rc; + + /* Validate the fadump crash info header */ + fdh = __va(fadump_conf->fadumphdr_addr); + if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { + pr_err("Crash info header is not valid.\n"); + return rc; + } + +#ifdef CONFIG_OPAL_CORE + /* + * If this is a kernel initiated crash, crashing_cpu would be set + * appropriately and register data of the crashing CPU saved by + * crashing kernel. Add this saved register data of crashing CPU + * to elf notes and populate the pt_regs for the remaining CPUs + * from register state data provided by firmware. + */ + if (fdh->crashing_cpu != FADUMP_CPU_UNKNOWN) + kernel_initiated = true; +#endif + + rc = opal_fadump_build_cpu_notes(fadump_conf, fdh); + if (rc) + return rc; + + /* + * We are done validating dump info and elfcore header is now ready + * to be exported. set elfcorehdr_addr so that vmcore module will + * export the elfcore header through '/proc/vmcore'. + */ + elfcorehdr_addr = fdh->elfcorehdr_addr; + + return rc; +} + +static void opal_fadump_region_show(struct fw_dump *fadump_conf, + struct seq_file *m) +{ + const struct opal_fadump_mem_struct *fdm_ptr; + u64 dumped_bytes = 0; + int i; + + if (fadump_conf->dump_active) + fdm_ptr = opal_fdm_active; + else + fdm_ptr = opal_fdm; + + for (i = 0; i < fdm_ptr->region_cnt; i++) { + /* + * Only regions that are registered for MPIPL + * would have dump data. + */ + if ((fadump_conf->dump_active) && + (i < fdm_ptr->registered_regions)) + dumped_bytes = fdm_ptr->rgn[i].size; + + seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ", + fdm_ptr->rgn[i].src, fdm_ptr->rgn[i].dest); + seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n", + fdm_ptr->rgn[i].size, dumped_bytes); + } + + /* Dump is active. Show reserved area start address. */ + if (fadump_conf->dump_active) { + seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n", + fadump_conf->reserve_dump_area_start); + } +} + +static void opal_fadump_trigger(struct fadump_crash_info_header *fdh, + const char *msg) +{ + int rc; + + /* + * Unlike on pSeries platform, logical CPU number is not provided + * with architected register state data. So, store the crashing + * CPU's PIR instead to plug the appropriate register data for + * crashing CPU in the vmcore file. + */ + fdh->crashing_cpu = (u32)mfspr(SPRN_PIR); + + rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, msg); + if (rc == OPAL_UNSUPPORTED) { + pr_emerg("Reboot type %d not supported.\n", + OPAL_REBOOT_MPIPL); + } else if (rc == OPAL_HARDWARE) + pr_emerg("No backend support for MPIPL!\n"); +} + +static struct fadump_ops opal_fadump_ops = { + .fadump_init_mem_struct = opal_fadump_init_mem_struct, + .fadump_get_metadata_size = opal_fadump_get_metadata_size, + .fadump_setup_metadata = opal_fadump_setup_metadata, + .fadump_get_bootmem_min = opal_fadump_get_bootmem_min, + .fadump_register = opal_fadump_register, + .fadump_unregister = opal_fadump_unregister, + .fadump_invalidate = opal_fadump_invalidate, + .fadump_cleanup = opal_fadump_cleanup, + .fadump_process = opal_fadump_process, + .fadump_region_show = opal_fadump_region_show, + .fadump_trigger = opal_fadump_trigger, +}; + +void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) +{ + const __be32 *prop; + unsigned long dn; + u64 addr = 0; + int i, len; + s64 ret; + + /* + * Check if Firmware-Assisted Dump is supported. if yes, check + * if dump has been initiated on last reboot. + */ + dn = of_get_flat_dt_subnode_by_name(node, "dump"); + if (dn == -FDT_ERR_NOTFOUND) { + pr_debug("FADump support is missing!\n"); + return; + } + + if (!of_flat_dt_is_compatible(dn, "ibm,opal-dump")) { + pr_err("Support missing for this f/w version!\n"); + return; + } + + prop = of_get_flat_dt_prop(dn, "fw-load-area", &len); + if (prop) { + /* + * Each f/w load area is an (address,size) pair, + * 2 cells each, totalling 4 cells per range. + */ + for (i = 0; i < len / (sizeof(*prop) * 4); i++) { + u64 base, end; + + base = of_read_number(prop + (i * 4) + 0, 2); + end = base; + end += of_read_number(prop + (i * 4) + 2, 2); + if (end > OPAL_FADUMP_MIN_BOOT_MEM) { + pr_err("F/W load area: 0x%llx-0x%llx\n", + base, end); + pr_err("F/W version not supported!\n"); + return; + } + } + } + + fadump_conf->ops = &opal_fadump_ops; + fadump_conf->fadump_supported = 1; + + /* + * Firmware supports 32-bit field for size. Align it to PAGE_SIZE + * and request firmware to copy multiple kernel boot memory regions. + */ + fadump_conf->max_copy_size = _ALIGN_DOWN(U32_MAX, PAGE_SIZE); + + /* + * Check if dump has been initiated on last reboot. + */ + prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL); + if (!prop) + return; + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get Kernel metadata (%lld)\n", ret); + return; + } + + addr = be64_to_cpu(addr); + pr_debug("Kernel metadata addr: %llx\n", addr); + + opal_fdm_active = __va(addr); + if (opal_fdm_active->version != OPAL_FADUMP_VERSION) { + pr_warn("Supported kernel metadata version: %u, found: %d!\n", + OPAL_FADUMP_VERSION, opal_fdm_active->version); + pr_warn("WARNING: Kernel metadata format mismatch identified! Core file maybe corrupted..\n"); + } + + /* Kernel regions not registered with f/w for MPIPL */ + if (opal_fdm_active->registered_regions == 0) { + opal_fdm_active = NULL; + return; + } + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr); + if (addr) { + addr = be64_to_cpu(addr); + pr_debug("CPU metadata addr: %llx\n", addr); + opal_cpu_metadata = __va(addr); + } + + pr_info("Firmware-assisted dump is active.\n"); + fadump_conf->dump_active = 1; + opal_fadump_get_config(fadump_conf, opal_fdm_active); +} +#endif /* !CONFIG_PRESERVE_FA_DUMP */ diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h new file mode 100644 index 000000000000..f1e9ecf548c5 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-fadump.h @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Firmware-Assisted Dump support on POWER platform (OPAL). + * + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#ifndef _POWERNV_OPAL_FADUMP_H +#define _POWERNV_OPAL_FADUMP_H + +#include <asm/reg.h> + +/* + * With kernel & initrd loaded at 512MB (with 256MB size), enforce a minimum + * boot memory size of 768MB to ensure f/w loading kernel and initrd doesn't + * mess with crash'ed kernel's memory during MPIPL. + */ +#define OPAL_FADUMP_MIN_BOOT_MEM (0x30000000UL) + +/* + * OPAL FADump metadata structure format version + * + * OPAL FADump kernel metadata structure stores kernel metadata needed to + * register-for/process crash dump. Format version is used to keep a tab on + * the changes in the structure format. The changes, if any, to the format + * are expected to be minimal and backward compatible. + */ +#define OPAL_FADUMP_VERSION 0x1 + +/* + * OPAL FADump kernel metadata + * + * The address of this structure will be registered with f/w for retrieving + * and processing during crash dump. + */ +struct opal_fadump_mem_struct { + u8 version; + u8 reserved[3]; + u16 region_cnt; /* number of regions */ + u16 registered_regions; /* Regions registered for MPIPL */ + u64 fadumphdr_addr; + struct opal_mpipl_region rgn[FADUMP_MAX_MEM_REGS]; +} __packed; + +/* + * CPU state data + * + * CPU state data information is provided by f/w. The format for this data + * is defined in the HDAT spec. Version is used to keep a tab on the changes + * in this CPU state data format. Changes to this format are unlikely, but + * if there are any changes, please refer to latest HDAT specification. + */ +#define HDAT_FADUMP_CPU_DATA_VER 1 + +#define HDAT_FADUMP_CORE_INACTIVE (0x0F) + +/* HDAT thread header for register entries */ +struct hdat_fadump_thread_hdr { + __be32 pir; + /* 0x00 - 0x0F - The corresponding stop state of the core */ + u8 core_state; + u8 reserved[3]; + + __be32 offset; /* Offset to Register Entries array */ + __be32 ecnt; /* Number of entries */ + __be32 esize; /* Alloc size of each array entry in bytes */ + __be32 eactsz; /* Actual size of each array entry in bytes */ +} __packed; + +/* Register types populated by f/w */ +#define HDAT_FADUMP_REG_TYPE_GPR 0x01 +#define HDAT_FADUMP_REG_TYPE_SPR 0x02 + +/* ID numbers used by f/w while populating certain registers */ +#define HDAT_FADUMP_REG_ID_NIP 0x7D0 +#define HDAT_FADUMP_REG_ID_MSR 0x7D1 +#define HDAT_FADUMP_REG_ID_CCR 0x7D2 + +/* HDAT register entry. */ +struct hdat_fadump_reg_entry { + __be32 reg_type; + __be32 reg_num; + __be64 reg_val; +} __packed; + +static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs, + u32 reg_type, u32 reg_num, + u64 reg_val) +{ + if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) { + if (reg_num < 32) + regs->gpr[reg_num] = reg_val; + return; + } + + switch (reg_num) { + case SPRN_CTR: + regs->ctr = reg_val; + break; + case SPRN_LR: + regs->link = reg_val; + break; + case SPRN_XER: + regs->xer = reg_val; + break; + case SPRN_DAR: + regs->dar = reg_val; + break; + case SPRN_DSISR: + regs->dsisr = reg_val; + break; + case HDAT_FADUMP_REG_ID_NIP: + regs->nip = reg_val; + break; + case HDAT_FADUMP_REG_ID_MSR: + regs->msr = reg_val; + break; + case HDAT_FADUMP_REG_ID_CCR: + regs->ccr = reg_val; + break; + } +} + +static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt, + unsigned int reg_entry_size, + bool cpu_endian, + struct pt_regs *regs) +{ + struct hdat_fadump_reg_entry *reg_entry; + u64 val; + int i; + + memset(regs, 0, sizeof(struct pt_regs)); + + for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) { + reg_entry = (struct hdat_fadump_reg_entry *)bufp; + val = (cpu_endian ? be64_to_cpu(reg_entry->reg_val) : + reg_entry->reg_val); + opal_fadump_set_regval_regnum(regs, + be32_to_cpu(reg_entry->reg_type), + be32_to_cpu(reg_entry->reg_num), + val); + } +} + +#endif /* _POWERNV_OPAL_FADUMP_H */ diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 186109bdd41b..e04b20625cb9 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -53,9 +53,9 @@ static void export_imc_mode_and_cmd(struct device_node *node, struct imc_pmu *pmu_ptr) { static u64 loc, *imc_mode_addr, *imc_cmd_addr; - int chip = 0, nid; char mode[16], cmd[16]; u32 cb_offset; + struct imc_mem_info *ptr = pmu_ptr->mem_info; imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root); @@ -69,20 +69,20 @@ static void export_imc_mode_and_cmd(struct device_node *node, if (of_property_read_u32(node, "cb_offset", &cb_offset)) cb_offset = IMC_CNTL_BLK_OFFSET; - for_each_node(nid) { - loc = (u64)(pmu_ptr->mem_info[chip].vbase) + cb_offset; + while (ptr->vbase != NULL) { + loc = (u64)(ptr->vbase) + cb_offset; imc_mode_addr = (u64 *)(loc + IMC_CNTL_BLK_MODE_OFFSET); - sprintf(mode, "imc_mode_%d", nid); + sprintf(mode, "imc_mode_%d", (u32)(ptr->id)); if (!imc_debugfs_create_x64(mode, 0600, imc_debugfs_parent, imc_mode_addr)) goto err; imc_cmd_addr = (u64 *)(loc + IMC_CNTL_BLK_CMD_OFFSET); - sprintf(cmd, "imc_cmd_%d", nid); + sprintf(cmd, "imc_cmd_%d", (u32)(ptr->id)); if (!imc_debugfs_create_x64(cmd, 0600, imc_debugfs_parent, imc_cmd_addr)) goto err; - chip++; + ptr++; } return; diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c index dc51d03c6370..d26da19a611f 100644 --- a/arch/powerpc/platforms/powernv/opal-msglog.c +++ b/arch/powerpc/platforms/powernv/opal-msglog.c @@ -29,23 +29,23 @@ struct memcons { static struct memcons *opal_memcons = NULL; -ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) +ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count) { const char *conbuf; ssize_t ret; size_t first_read = 0; uint32_t out_pos, avail; - if (!opal_memcons) + if (!mc) return -ENODEV; - out_pos = be32_to_cpu(READ_ONCE(opal_memcons->out_pos)); + out_pos = be32_to_cpu(READ_ONCE(mc->out_pos)); /* Now we've read out_pos, put a barrier in before reading the new * data it points to in conbuf. */ smp_rmb(); - conbuf = phys_to_virt(be64_to_cpu(opal_memcons->obuf_phys)); + conbuf = phys_to_virt(be64_to_cpu(mc->obuf_phys)); /* When the buffer has wrapped, read from the out_pos marker to the end * of the buffer, and then read the remaining data as in the un-wrapped @@ -53,7 +53,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) if (out_pos & MEMCONS_OUT_POS_WRAP) { out_pos &= MEMCONS_OUT_POS_MASK; - avail = be32_to_cpu(opal_memcons->obuf_size) - out_pos; + avail = be32_to_cpu(mc->obuf_size) - out_pos; ret = memory_read_from_buffer(to, count, &pos, conbuf + out_pos, avail); @@ -71,7 +71,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) } /* Sanity check. The firmware should not do this to us. */ - if (out_pos > be32_to_cpu(opal_memcons->obuf_size)) { + if (out_pos > be32_to_cpu(mc->obuf_size)) { pr_err("OPAL: memory console corruption. Aborting read.\n"); return -EINVAL; } @@ -86,6 +86,11 @@ out: return ret; } +ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) +{ + return memcons_copy(opal_memcons, to, pos, count); +} + static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj, struct bin_attribute *bin_attr, char *to, loff_t pos, size_t count) @@ -98,32 +103,48 @@ static struct bin_attribute opal_msglog_attr = { .read = opal_msglog_read }; -void __init opal_msglog_init(void) +struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name) { u64 mcaddr; struct memcons *mc; - if (of_property_read_u64(opal_node, "ibm,opal-memcons", &mcaddr)) { - pr_warn("OPAL: Property ibm,opal-memcons not found, no message log\n"); - return; + if (of_property_read_u64(node, mc_prop_name, &mcaddr)) { + pr_warn("%s property not found, no message log\n", + mc_prop_name); + goto out_err; } mc = phys_to_virt(mcaddr); if (!mc) { - pr_warn("OPAL: memory console address is invalid\n"); - return; + pr_warn("memory console address is invalid\n"); + goto out_err; } if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) { - pr_warn("OPAL: memory console version is invalid\n"); - return; + pr_warn("memory console version is invalid\n"); + goto out_err; } - /* Report maximum size */ - opal_msglog_attr.size = be32_to_cpu(mc->ibuf_size) + - be32_to_cpu(mc->obuf_size); + return mc; + +out_err: + return NULL; +} + +u32 memcons_get_size(struct memcons *mc) +{ + return be32_to_cpu(mc->ibuf_size) + be32_to_cpu(mc->obuf_size); +} + +void __init opal_msglog_init(void) +{ + opal_memcons = memcons_init(opal_node, "ibm,opal-memcons"); + if (!opal_memcons) { + pr_warn("OPAL: memcons failed to load from ibm,opal-memcons\n"); + return; + } - opal_memcons = mc; + opal_msglog_attr.size = memcons_get_size(opal_memcons); } void __init opal_msglog_sysfs_init(void) diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c index e072bf157d62..45f4223a790f 100644 --- a/arch/powerpc/platforms/powernv/opal-prd.c +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -342,7 +342,7 @@ static int opal_prd_msg_notifier(struct notifier_block *nb, int msg_size, item_size; unsigned long flags; - if (msg_type != OPAL_MSG_PRD) + if (msg_type != OPAL_MSG_PRD && msg_type != OPAL_MSG_PRD2) return 0; /* Calculate total size of the message and item we need to store. The @@ -393,6 +393,12 @@ static int opal_prd_probe(struct platform_device *pdev) return rc; } + rc = opal_message_notifier_register(OPAL_MSG_PRD2, &opal_prd_event_nb); + if (rc) { + pr_err("Couldn't register PRD2 event notifier\n"); + return rc; + } + rc = misc_register(&opal_prd_dev); if (rc) { pr_err("failed to register miscdev\n"); diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index 66430eebe869..fd510d961b8c 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -1,7 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * PowerNV LPC bus handling. + * PowerNV SCOM bus debugfs interface * + * Copyright 2010 Benjamin Herrenschmidt, IBM Corp + * <benh@kernel.crashing.org> + * and David Gibson, IBM Corporation. * Copyright 2013 IBM Corp. */ @@ -10,62 +13,13 @@ #include <linux/bug.h> #include <linux/gfp.h> #include <linux/slab.h> +#include <linux/uaccess.h> #include <asm/machdep.h> #include <asm/firmware.h> #include <asm/opal.h> -#include <asm/scom.h> - -/* - * We could probably fit that inside the scom_map_t - * which is a void* after all but it's really too ugly - * so let's kmalloc it for now - */ -struct opal_scom_map { - uint32_t chip; - uint64_t addr; -}; - -static scom_map_t opal_scom_map(struct device_node *dev, u64 reg, u64 count) -{ - struct opal_scom_map *m; - const __be32 *gcid; - - if (!of_get_property(dev, "scom-controller", NULL)) { - pr_err("%s: device %pOF is not a SCOM controller\n", - __func__, dev); - return SCOM_MAP_INVALID; - } - gcid = of_get_property(dev, "ibm,chip-id", NULL); - if (!gcid) { - pr_err("%s: device %pOF has no ibm,chip-id\n", - __func__, dev); - return SCOM_MAP_INVALID; - } - m = kmalloc(sizeof(*m), GFP_KERNEL); - if (!m) - return NULL; - m->chip = be32_to_cpup(gcid); - m->addr = reg; - - return (scom_map_t)m; -} - -static void opal_scom_unmap(scom_map_t map) -{ - kfree(map); -} - -static int opal_xscom_err_xlate(int64_t rc) -{ - switch(rc) { - case 0: - return 0; - /* Add more translations if necessary */ - default: - return -EIO; - } -} +#include <asm/debugfs.h> +#include <asm/prom.h> static u64 opal_scom_unmangle(u64 addr) { @@ -98,39 +52,154 @@ static u64 opal_scom_unmangle(u64 addr) return addr; } -static int opal_scom_read(scom_map_t map, u64 reg, u64 *value) +static int opal_scom_read(uint32_t chip, uint64_t addr, u64 reg, u64 *value) { - struct opal_scom_map *m = map; int64_t rc; __be64 v; - reg = opal_scom_unmangle(m->addr + reg); - rc = opal_xscom_read(m->chip, reg, (__be64 *)__pa(&v)); + reg = opal_scom_unmangle(addr + reg); + rc = opal_xscom_read(chip, reg, (__be64 *)__pa(&v)); + if (rc) { + *value = 0xfffffffffffffffful; + return -EIO; + } *value = be64_to_cpu(v); - return opal_xscom_err_xlate(rc); + return 0; } -static int opal_scom_write(scom_map_t map, u64 reg, u64 value) +static int opal_scom_write(uint32_t chip, uint64_t addr, u64 reg, u64 value) { - struct opal_scom_map *m = map; int64_t rc; - reg = opal_scom_unmangle(m->addr + reg); - rc = opal_xscom_write(m->chip, reg, value); - return opal_xscom_err_xlate(rc); + reg = opal_scom_unmangle(addr + reg); + rc = opal_xscom_write(chip, reg, value); + if (rc) + return -EIO; + return 0; +} + +struct scom_debug_entry { + u32 chip; + struct debugfs_blob_wrapper path; + char name[16]; +}; + +static ssize_t scom_debug_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct scom_debug_entry *ent = filp->private_data; + u64 __user *ubuf64 = (u64 __user *)ubuf; + loff_t off = *ppos; + ssize_t done = 0; + u64 reg, reg_base, reg_cnt, val; + int rc; + + if (off < 0 || (off & 7) || (count & 7)) + return -EINVAL; + reg_base = off >> 3; + reg_cnt = count >> 3; + + for (reg = 0; reg < reg_cnt; reg++) { + rc = opal_scom_read(ent->chip, reg_base, reg, &val); + if (!rc) + rc = put_user(val, ubuf64); + if (rc) { + if (!done) + done = rc; + break; + } + ubuf64++; + *ppos += 8; + done += 8; + } + return done; +} + +static ssize_t scom_debug_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct scom_debug_entry *ent = filp->private_data; + u64 __user *ubuf64 = (u64 __user *)ubuf; + loff_t off = *ppos; + ssize_t done = 0; + u64 reg, reg_base, reg_cnt, val; + int rc; + + if (off < 0 || (off & 7) || (count & 7)) + return -EINVAL; + reg_base = off >> 3; + reg_cnt = count >> 3; + + for (reg = 0; reg < reg_cnt; reg++) { + rc = get_user(val, ubuf64); + if (!rc) + rc = opal_scom_write(ent->chip, reg_base, reg, val); + if (rc) { + if (!done) + done = rc; + break; + } + ubuf64++; + done += 8; + } + return done; } -static const struct scom_controller opal_scom_controller = { - .map = opal_scom_map, - .unmap = opal_scom_unmap, - .read = opal_scom_read, - .write = opal_scom_write +static const struct file_operations scom_debug_fops = { + .read = scom_debug_read, + .write = scom_debug_write, + .open = simple_open, + .llseek = default_llseek, }; -static int opal_xscom_init(void) +static int scom_debug_init_one(struct dentry *root, struct device_node *dn, + int chip) { - if (firmware_has_feature(FW_FEATURE_OPAL)) - scom_init(&opal_scom_controller); + struct scom_debug_entry *ent; + struct dentry *dir; + + ent = kzalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + + ent->chip = chip; + snprintf(ent->name, 16, "%08x", chip); + ent->path.data = (void *)kasprintf(GFP_KERNEL, "%pOF", dn); + ent->path.size = strlen((char *)ent->path.data); + + dir = debugfs_create_dir(ent->name, root); + if (!dir) { + kfree(ent->path.data); + kfree(ent); + return -1; + } + + debugfs_create_blob("devspec", 0400, dir, &ent->path); + debugfs_create_file("access", 0600, dir, ent, &scom_debug_fops); + return 0; } -machine_arch_initcall(powernv, opal_xscom_init); + +static int scom_debug_init(void) +{ + struct device_node *dn; + struct dentry *root; + int chip, rc; + + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return 0; + + root = debugfs_create_dir("scom", powerpc_debugfs_root); + if (!root) + return -1; + + rc = 0; + for_each_node_with_property(dn, "scom-controller") { + chip = of_get_ibm_chip_id(dn); + WARN_ON(chip == -1); + rc |= scom_debug_init_one(root, dn, chip); + } + + return rc; +} +device_initcall(scom_debug_init); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index aba443be7daa..38e90270280b 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -58,6 +58,8 @@ static DEFINE_SPINLOCK(opal_write_lock); static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX]; static uint32_t opal_heartbeat; static struct task_struct *kopald_tsk; +static struct opal_msg *opal_msg; +static u32 opal_msg_size __ro_after_init; void opal_configure_cores(void) { @@ -271,14 +273,9 @@ static void opal_message_do_notify(uint32_t msg_type, void *msg) static void opal_handle_message(void) { s64 ret; - /* - * TODO: pre-allocate a message buffer depending on opal-msg-size - * value in /proc/device-tree. - */ - static struct opal_msg msg; u32 type; - ret = opal_get_msg(__pa(&msg), sizeof(msg)); + ret = opal_get_msg(__pa(opal_msg), opal_msg_size); /* No opal message pending. */ if (ret == OPAL_RESOURCE) return; @@ -290,14 +287,14 @@ static void opal_handle_message(void) return; } - type = be32_to_cpu(msg.msg_type); + type = be32_to_cpu(opal_msg->msg_type); /* Sanity check */ if (type >= OPAL_MSG_TYPE_MAX) { pr_warn_once("%s: Unknown message type: %u\n", __func__, type); return; } - opal_message_do_notify(type, (void *)&msg); + opal_message_do_notify(type, (void *)opal_msg); } static irqreturn_t opal_message_notify(int irq, void *data) @@ -306,10 +303,24 @@ static irqreturn_t opal_message_notify(int irq, void *data) return IRQ_HANDLED; } -static int __init opal_message_init(void) +static int __init opal_message_init(struct device_node *opal_node) { int ret, i, irq; + ret = of_property_read_u32(opal_node, "opal-msg-size", &opal_msg_size); + if (ret) { + pr_notice("Failed to read opal-msg-size property\n"); + opal_msg_size = sizeof(struct opal_msg); + } + + opal_msg = kmalloc(opal_msg_size, GFP_KERNEL); + if (!opal_msg) { + opal_msg_size = sizeof(struct opal_msg); + /* Try to allocate fixed message size */ + opal_msg = kmalloc(opal_msg_size, GFP_KERNEL); + BUG_ON(opal_msg == NULL); + } + for (i = 0; i < OPAL_MSG_TYPE_MAX; i++) ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]); @@ -705,7 +716,10 @@ static ssize_t symbol_map_read(struct file *fp, struct kobject *kobj, bin_attr->size); } -static BIN_ATTR_RO(symbol_map, 0); +static struct bin_attribute symbol_map_attr = { + .attr = {.name = "symbol_map", .mode = 0400}, + .read = symbol_map_read +}; static void opal_export_symmap(void) { @@ -722,10 +736,10 @@ static void opal_export_symmap(void) return; /* Setup attributes */ - bin_attr_symbol_map.private = __va(be64_to_cpu(syms[0])); - bin_attr_symbol_map.size = be64_to_cpu(syms[1]); + symbol_map_attr.private = __va(be64_to_cpu(syms[0])); + symbol_map_attr.size = be64_to_cpu(syms[1]); - rc = sysfs_create_bin_file(opal_kobj, &bin_attr_symbol_map); + rc = sysfs_create_bin_file(opal_kobj, &symbol_map_attr); if (rc) pr_warn("Error %d creating OPAL symbols file\n", rc); } @@ -910,7 +924,7 @@ static int __init opal_init(void) } /* Initialise OPAL messaging system */ - opal_message_init(); + opal_message_init(opal_node); /* Initialise OPAL asynchronous completion interface */ opal_async_comp_init(); diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index e28f03e1eb5e..a0b9c0c23ed2 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -36,7 +36,8 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift) struct page *tce_mem = NULL; __be64 *addr; - tce_mem = alloc_pages_node(nid, GFP_KERNEL, shift - PAGE_SHIFT); + tce_mem = alloc_pages_node(nid, GFP_ATOMIC | __GFP_NOWARN, + shift - PAGE_SHIFT); if (!tce_mem) { pr_err("Failed to allocate a TCE memory, level shift=%d\n", shift); @@ -48,6 +49,9 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift) return addr; } +static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr, + unsigned long size, unsigned int levels); + static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc) { __be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base; @@ -57,9 +61,9 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc) while (level) { int n = (idx & mask) >> (level * shift); - unsigned long tce; + unsigned long oldtce, tce = be64_to_cpu(READ_ONCE(tmp[n])); - if (tmp[n] == 0) { + if (!tce) { __be64 *tmp2; if (!alloc) @@ -70,10 +74,15 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc) if (!tmp2) return NULL; - tmp[n] = cpu_to_be64(__pa(tmp2) | - TCE_PCI_READ | TCE_PCI_WRITE); + tce = __pa(tmp2) | TCE_PCI_READ | TCE_PCI_WRITE; + oldtce = be64_to_cpu(cmpxchg(&tmp[n], 0, + cpu_to_be64(tce))); + if (oldtce) { + pnv_pci_ioda2_table_do_free_pages(tmp2, + ilog2(tbl->it_level_size) + 3, 1); + tce = oldtce; + } } - tce = be64_to_cpu(tmp[n]); tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE)); idx &= ~mask; @@ -161,6 +170,9 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages) if (ptce) *ptce = cpu_to_be64(0); + else + /* Skip the rest of the level */ + i |= tbl->it_level_size - 1; } } @@ -260,7 +272,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, unsigned int table_shift = max_t(unsigned int, entries_shift + 3, PAGE_SHIFT); const unsigned long tce_table_size = 1UL << table_shift; - unsigned int tmplevels = levels; if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) return -EINVAL; @@ -268,9 +279,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, if (!is_power_of_2(window_size)) return -EINVAL; - if (alloc_userspace_copy && (window_size > (1ULL << 32))) - tmplevels = 1; - /* Adjust direct table size from window_size and levels */ entries_shift = (entries_shift + levels - 1) / levels; level_shift = entries_shift + 3; @@ -281,7 +289,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, /* Allocate TCE table */ addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, - tmplevels, tce_table_size, &offset, &total_allocated); + 1, tce_table_size, &offset, &total_allocated); /* addr==NULL means that the first level allocation failed */ if (!addr) @@ -292,18 +300,18 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, * we did not allocate as much as we wanted, * release partially allocated table. */ - if (tmplevels == levels && offset < tce_table_size) + if (levels == 1 && offset < tce_table_size) goto free_tces_exit; /* Allocate userspace view of the TCE table */ if (alloc_userspace_copy) { offset = 0; uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, - tmplevels, tce_table_size, &offset, + 1, tce_table_size, &offset, &total_allocated_uas); if (!uas) goto free_tces_exit; - if (tmplevels == levels && (offset < tce_table_size || + if (levels == 1 && (offset < tce_table_size || total_allocated_uas != total_allocated)) goto free_uas_exit; } @@ -318,7 +326,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n", window_size, tce_table_size, bus_offset, tbl->it_base, - tbl->it_userspace, tmplevels, levels); + tbl->it_userspace, 1, levels); return 0; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index d8080558d020..c28d0d9b7ee0 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1939,26 +1939,12 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index, } #ifdef CONFIG_IOMMU_API -static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) +/* Common for IODA1 and IODA2 */ +static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction, + bool realmode) { - long ret = pnv_tce_xchg(tbl, index, hpa, direction, true); - - if (!ret) - pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false); - - return ret; -} - -static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) -{ - long ret = pnv_tce_xchg(tbl, index, hpa, direction, false); - - if (!ret) - pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true); - - return ret; + return pnv_tce_xchg(tbl, index, hpa, direction, !realmode); } #endif @@ -1973,8 +1959,8 @@ static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, static struct iommu_table_ops pnv_ioda1_iommu_ops = { .set = pnv_ioda1_tce_build, #ifdef CONFIG_IOMMU_API - .exchange = pnv_ioda1_tce_xchg, - .exchange_rm = pnv_ioda1_tce_xchg_rm, + .xchg_no_kill = pnv_ioda_tce_xchg_no_kill, + .tce_kill = pnv_pci_p7ioc_tce_invalidate, .useraddrptr = pnv_tce_useraddrptr, #endif .clear = pnv_ioda1_tce_free, @@ -2103,30 +2089,6 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, return ret; } -#ifdef CONFIG_IOMMU_API -static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) -{ - long ret = pnv_tce_xchg(tbl, index, hpa, direction, true); - - if (!ret) - pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false); - - return ret; -} - -static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) -{ - long ret = pnv_tce_xchg(tbl, index, hpa, direction, false); - - if (!ret) - pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true); - - return ret; -} -#endif - static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, long npages) { @@ -2138,8 +2100,8 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, static struct iommu_table_ops pnv_ioda2_iommu_ops = { .set = pnv_ioda2_tce_build, #ifdef CONFIG_IOMMU_API - .exchange = pnv_ioda2_tce_xchg, - .exchange_rm = pnv_ioda2_tce_xchg_rm, + .xchg_no_kill = pnv_ioda_tce_xchg_no_kill, + .tce_kill = pnv_pci_ioda2_tce_invalidate, .useraddrptr = pnv_tce_useraddrptr, #endif .clear = pnv_ioda2_tce_free, @@ -2303,7 +2265,7 @@ found: tbl->it_ops = &pnv_ioda1_iommu_ops; pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift; pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift; - iommu_init_table(tbl, phb->hose->node); + iommu_init_table(tbl, phb->hose->node, 0, 0); if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) pnv_ioda_setup_bus_dma(pe, pe->pbus); @@ -2420,6 +2382,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) { struct iommu_table *tbl = NULL; long rc; + unsigned long res_start, res_end; /* * crashkernel= specifies the kdump kernel's maximum memory at @@ -2433,19 +2396,46 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) * DMA window can be larger than available memory, which will * cause errors later. */ - const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory); + const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1); + + /* + * We create the default window as big as we can. The constraint is + * the max order of allocation possible. The TCE table is likely to + * end up being multilevel and with on-demand allocation in place, + * the initial use is not going to be huge as the default window aims + * to support crippled devices (i.e. not fully 64bit DMAble) only. + */ + /* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */ + const u64 window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory); + /* Each TCE level cannot exceed maxblock so go multilevel if needed */ + unsigned long tces_order = ilog2(window_size >> PAGE_SHIFT); + unsigned long tcelevel_order = ilog2(maxblock >> 3); + unsigned int levels = tces_order / tcelevel_order; + + if (tces_order % tcelevel_order) + levels += 1; + /* + * We try to stick to default levels (which is >1 at the moment) in + * order to save memory by relying on on-demain TCE level allocation. + */ + levels = max_t(unsigned int, levels, POWERNV_IOMMU_DEFAULT_LEVELS); - rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, - IOMMU_PAGE_SHIFT_4K, - window_size, - POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl); + rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, PAGE_SHIFT, + window_size, levels, false, &tbl); if (rc) { pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc); return rc; } - iommu_init_table(tbl, pe->phb->hose->node); + /* We use top part of 32bit space for MMIO so exclude it from DMA */ + res_start = 0; + res_end = 0; + if (window_size > pe->phb->ioda.m32_pci_base) { + res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift; + res_end = min(window_size, SZ_4G) >> tbl->it_page_shift; + } + iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end); rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl); if (rc) { diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 6104418c9ad5..2825d004dece 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -54,7 +54,8 @@ int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id) break; } - if (!of_device_is_compatible(parent, "ibm,ioda2-phb")) { + if (!of_device_is_compatible(parent, "ibm,ioda2-phb") && + !of_device_is_compatible(parent, "ibm,ioda3-phb")) { of_node_put(parent); continue; } diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 469c24463247..f914f0b14e4e 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -219,7 +219,7 @@ extern struct iommu_table_group *pnv_npu_compound_attach( struct pnv_ioda_pe *pe); /* pci-ioda-tce.c */ -#define POWERNV_IOMMU_DEFAULT_LEVELS 1 +#define POWERNV_IOMMU_DEFAULT_LEVELS 2 #define POWERNV_IOMMU_MAX_LEVELS 5 extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index fd4a1c5a6369..1aa51c4fa904 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -30,4 +30,9 @@ extern void opal_event_shutdown(void); bool cpu_core_split_required(void); +struct memcons; +ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count); +u32 memcons_get_size(struct memcons *mc); +struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name); + #endif /* _POWERNV_H */ diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index a5e52f9eed3c..83498604d322 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -24,6 +24,7 @@ #include <linux/bug.h> #include <linux/pci.h> #include <linux/cpufreq.h> +#include <linux/memblock.h> #include <asm/machdep.h> #include <asm/firmware.h> @@ -166,6 +167,14 @@ static void __init pnv_init(void) else #endif add_preferred_console("hvc", 0, NULL); + + if (!radix_enabled()) { + int i; + + /* Allocate per cpu area to save old slb contents during MCE */ + for_each_possible_cpu(i) + paca_ptrs[i]->mce_faulty_slbs = memblock_alloc_node(mmu_slb_size, __alignof__(*paca_ptrs[i]->mce_faulty_slbs), cpu_to_node(i)); + } } static void __init pnv_init_IRQ(void) diff --git a/arch/powerpc/platforms/powernv/ultravisor.c b/arch/powerpc/platforms/powernv/ultravisor.c new file mode 100644 index 000000000000..e4a00ad06f9d --- /dev/null +++ b/arch/powerpc/platforms/powernv/ultravisor.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Ultravisor high level interfaces + * + * Copyright 2019, IBM Corporation. + * + */ +#include <linux/init.h> +#include <linux/printk.h> +#include <linux/of_fdt.h> +#include <linux/of.h> + +#include <asm/ultravisor.h> +#include <asm/firmware.h> +#include <asm/machdep.h> + +#include "powernv.h" + +static struct kobject *ultravisor_kobj; + +int __init early_init_dt_scan_ultravisor(unsigned long node, const char *uname, + int depth, void *data) +{ + if (!of_flat_dt_is_compatible(node, "ibm,ultravisor")) + return 0; + + powerpc_firmware_features |= FW_FEATURE_ULTRAVISOR; + pr_debug("Ultravisor detected!\n"); + return 1; +} + +static struct memcons *uv_memcons; + +static ssize_t uv_msglog_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *to, + loff_t pos, size_t count) +{ + return memcons_copy(uv_memcons, to, pos, count); +} + +static struct bin_attribute uv_msglog_attr = { + .attr = {.name = "msglog", .mode = 0400}, + .read = uv_msglog_read +}; + +static int __init uv_init(void) +{ + struct device_node *node; + + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) + return 0; + + node = of_find_compatible_node(NULL, NULL, "ibm,uv-firmware"); + if (!node) + return -ENODEV; + + uv_memcons = memcons_init(node, "memcons"); + if (!uv_memcons) + return -ENOENT; + + uv_msglog_attr.size = memcons_get_size(uv_memcons); + + ultravisor_kobj = kobject_create_and_add("ultravisor", firmware_kobj); + if (!ultravisor_kobj) + return -ENOMEM; + + return sysfs_create_bin_file(ultravisor_kobj, &uv_msglog_attr); +} +machine_subsys_initcall(powernv, uv_init); diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c index bdaeaecdc06b..1193c294b8d0 100644 --- a/arch/powerpc/platforms/ps3/spu.c +++ b/arch/powerpc/platforms/ps3/spu.c @@ -184,10 +184,7 @@ static void spu_unmap(struct spu *spu) * setup_areas - Map the spu regions into the address space. * * The current HV requires the spu shadow regs to be mapped with the - * PTE page protection bits set as read-only (PP=3). This implementation - * uses the low level __ioremap() to bypass the page protection settings - * inforced by ioremap_prot() to get the needed PTE bits set for the - * shadow regs. + * PTE page protection bits set as read-only. */ static int __init setup_areas(struct spu *spu) @@ -195,9 +192,8 @@ static int __init setup_areas(struct spu *spu) struct table {char* name; unsigned long addr; unsigned long size;}; unsigned long shadow_flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL_RO)); - spu_pdata(spu)->shadow = __ioremap(spu_pdata(spu)->shadow_addr, - sizeof(struct spe_shadow), - shadow_flags); + spu_pdata(spu)->shadow = ioremap_prot(spu_pdata(spu)->shadow_addr, + sizeof(struct spe_shadow), shadow_flags); if (!spu_pdata(spu)->shadow) { pr_debug("%s:%d: ioremap shadow failed\n", __func__, __LINE__); goto fail_ioremap; diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index f7b484f55553..9e35cddddf73 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -145,3 +145,17 @@ config PAPR_SCM tristate "Support for the PAPR Storage Class Memory interface" help Enable access to hypervisor provided storage class memory. + +config PPC_SVM + bool "Secure virtual machine (SVM) support for POWER" + depends on PPC_PSERIES + select SWIOTLB + select ARCH_HAS_MEM_ENCRYPT + select ARCH_HAS_FORCE_DMA_UNENCRYPTED + help + There are certain POWER platforms which support secure guests using + the Protected Execution Facility, with the help of an Ultravisor + executing below the hypervisor layer. This enables support for + those guests. + + If unsure, say "N". diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index ab3d59aeacca..a3c74a5cf20d 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -26,6 +26,8 @@ obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_IBMEBUS) += ibmebus.o obj-$(CONFIG_PAPR_SCM) += papr_scm.o obj-$(CONFIG_PPC_SPLPAR) += vphn.o +obj-$(CONFIG_PPC_SVM) += svm.o +obj-$(CONFIG_FA_DUMP) += rtas-fadump.o ifdef CONFIG_PPC_PSERIES obj-$(CONFIG_SUSPEND) += suspend.o diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 9edae1863e2f..893ba3f562c4 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -42,42 +42,44 @@ static int ibm_get_config_addr_info; static int ibm_get_config_addr_info2; static int ibm_configure_pe; -#ifdef CONFIG_PCI_IOV void pseries_pcibios_bus_add_device(struct pci_dev *pdev) { struct pci_dn *pdn = pci_get_pdn(pdev); - struct pci_dn *physfn_pdn; - struct eeh_dev *edev; - if (!pdev->is_virtfn) + if (eeh_has_flag(EEH_FORCE_DISABLED)) return; - pdn->device_id = pdev->device; - pdn->vendor_id = pdev->vendor; - pdn->class_code = pdev->class; - /* - * Last allow unfreeze return code used for retrieval - * by user space in eeh-sysfs to show the last command - * completion from platform. - */ - pdn->last_allow_rc = 0; - physfn_pdn = pci_get_pdn(pdev->physfn); - pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index]; - edev = pdn_to_eeh_dev(pdn); + dev_dbg(&pdev->dev, "EEH: Setting up device\n"); +#ifdef CONFIG_PCI_IOV + if (pdev->is_virtfn) { + struct pci_dn *physfn_pdn; - /* - * The following operations will fail if VF's sysfs files - * aren't created or its resources aren't finalized. - */ + pdn->device_id = pdev->device; + pdn->vendor_id = pdev->vendor; + pdn->class_code = pdev->class; + /* + * Last allow unfreeze return code used for retrieval + * by user space in eeh-sysfs to show the last command + * completion from platform. + */ + pdn->last_allow_rc = 0; + physfn_pdn = pci_get_pdn(pdev->physfn); + pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index]; + } +#endif eeh_add_device_early(pdn); eeh_add_device_late(pdev); - edev->pe_config_addr = (pdn->busno << 16) | (pdn->devfn << 8); - eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */ - eeh_add_to_parent_pe(edev); /* Add as VF PE type */ - eeh_sysfs_add_device(pdev); +#ifdef CONFIG_PCI_IOV + if (pdev->is_virtfn) { + struct eeh_dev *edev = pdn_to_eeh_dev(pdn); -} + edev->pe_config_addr = (pdn->busno << 16) | (pdn->devfn << 8); + eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */ + eeh_add_to_parent_pe(edev); /* Add as VF PE type */ + } #endif + eeh_sysfs_add_device(pdev); +} /* * Buffer for reporting slot-error-detail rtas calls. Its here @@ -144,10 +146,8 @@ static int pseries_eeh_init(void) /* Set EEH probe mode */ eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG); -#ifdef CONFIG_PCI_IOV /* Set EEH machine dependent code */ ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device; -#endif return 0; } @@ -251,6 +251,8 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; + eeh_edev_dbg(edev, "Probing device\n"); + /* * Update class code and mode of eeh device. We need * correctly reflects that current device is root port @@ -280,8 +282,11 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) pe.config_addr = (pdn->busno << 16) | (pdn->devfn << 8); /* Enable EEH on the device */ + eeh_edev_dbg(edev, "Enabling EEH on device\n"); ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE); - if (!ret) { + if (ret) { + eeh_edev_dbg(edev, "EEH failed to enable on device (code %d)\n", ret); + } else { /* Retrieve PE address */ edev->pe_config_addr = eeh_ops->get_pe_addr(&pe); pe.addr = edev->pe_config_addr; @@ -297,11 +302,6 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) if (enable) { eeh_add_flag(EEH_ENABLED); eeh_add_to_parent_pe(edev); - - pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n", - __func__, pdn->busno, PCI_SLOT(pdn->devfn), - PCI_FUNC(pdn->devfn), pe.phb->global_number, - pe.addr); } else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) && (pdn_to_eeh_dev(pdn->parent))->pe) { /* This device doesn't support EEH, but it may have an @@ -310,6 +310,8 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr; eeh_add_to_parent_pe(edev); } + eeh_edev_dbg(edev, "EEH is %s on device (code %d)\n", + (enable ? "enabled" : "unsupported"), ret); } /* Save memory bars */ diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 46d0d35b9ca4..8e700390f3d6 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -880,34 +880,44 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog) switch (hp_elog->action) { case PSERIES_HP_ELOG_ACTION_ADD: - if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) { + switch (hp_elog->id_type) { + case PSERIES_HP_ELOG_ID_DRC_COUNT: count = hp_elog->_drc_u.drc_count; rc = dlpar_memory_add_by_count(count); - } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) { + break; + case PSERIES_HP_ELOG_ID_DRC_INDEX: drc_index = hp_elog->_drc_u.drc_index; rc = dlpar_memory_add_by_index(drc_index); - } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) { + break; + case PSERIES_HP_ELOG_ID_DRC_IC: count = hp_elog->_drc_u.ic.count; drc_index = hp_elog->_drc_u.ic.index; rc = dlpar_memory_add_by_ic(count, drc_index); - } else { + break; + default: rc = -EINVAL; + break; } break; case PSERIES_HP_ELOG_ACTION_REMOVE: - if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) { + switch (hp_elog->id_type) { + case PSERIES_HP_ELOG_ID_DRC_COUNT: count = hp_elog->_drc_u.drc_count; rc = dlpar_memory_remove_by_count(count); - } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) { + break; + case PSERIES_HP_ELOG_ID_DRC_INDEX: drc_index = hp_elog->_drc_u.drc_index; rc = dlpar_memory_remove_by_index(drc_index); - } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) { + break; + case PSERIES_HP_ELOG_ID_DRC_IC: count = hp_elog->_drc_u.ic.count; drc_index = hp_elog->_drc_u.ic.index; rc = dlpar_memory_remove_by_ic(count, drc_index); - } else { + break; + default: rc = -EINVAL; + break; } break; diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 889dc2e44b89..6ba081dd61c9 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -36,6 +36,7 @@ #include <asm/udbg.h> #include <asm/mmzone.h> #include <asm/plpar_wrappers.h> +#include <asm/svm.h> #include "pseries.h" @@ -609,7 +610,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) iommu_table_setparms(pci->phb, dn, tbl); tbl->it_ops = &iommu_table_pseries_ops; - iommu_init_table(tbl, pci->phb->node); + iommu_init_table(tbl, pci->phb->node, 0, 0); /* Divide the rest (1.75GB) among the children */ pci->phb->dma_window_size = 0x80000000ul; @@ -621,7 +622,8 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) #ifdef CONFIG_IOMMU_API static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned - long *tce, enum dma_data_direction *direction) + long *tce, enum dma_data_direction *direction, + bool realmode) { long rc; unsigned long ioba = (unsigned long) index << tbl->it_page_shift; @@ -649,7 +651,7 @@ static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned struct iommu_table_ops iommu_table_lpar_multi_ops = { .set = tce_buildmulti_pSeriesLP, #ifdef CONFIG_IOMMU_API - .exchange = tce_exchange_pseries, + .xchg_no_kill = tce_exchange_pseries, #endif .clear = tce_freemulti_pSeriesLP, .get = tce_get_pSeriesLP @@ -690,7 +692,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) iommu_table_setparms_lpar(ppci->phb, pdn, tbl, ppci->table_group, dma_window); tbl->it_ops = &iommu_table_lpar_multi_ops; - iommu_init_table(tbl, ppci->phb->node); + iommu_init_table(tbl, ppci->phb->node, 0, 0); iommu_register_group(ppci->table_group, pci_domain_nr(bus), 0); pr_debug(" created table: %p\n", ppci->table_group); @@ -719,7 +721,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) tbl = PCI_DN(dn)->table_group->tables[0]; iommu_table_setparms(phb, dn, tbl); tbl->it_ops = &iommu_table_pseries_ops; - iommu_init_table(tbl, phb->node); + iommu_init_table(tbl, phb->node, 0, 0); set_iommu_table_base(&dev->dev, tbl); return; } @@ -1169,7 +1171,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) iommu_table_setparms_lpar(pci->phb, pdn, tbl, pci->table_group, dma_window); tbl->it_ops = &iommu_table_lpar_multi_ops; - iommu_init_table(tbl, pci->phb->node); + iommu_init_table(tbl, pci->phb->node, 0, 0); iommu_register_group(pci->table_group, pci_domain_nr(pci->phb->bus), 0); pr_debug(" created table: %p\n", pci->table_group); @@ -1318,7 +1320,15 @@ void iommu_init_early_pSeries(void) of_reconfig_notifier_register(&iommu_reconfig_nb); register_memory_notifier(&iommu_mem_nb); - set_pci_dma_ops(&dma_iommu_ops); + /* + * Secure guest memory is inacessible to devices so regular DMA isn't + * possible. + * + * In that case keep devices' dma_map_ops as NULL so that the generic + * DMA code path will use SWIOTLB to bounce buffers for DMA. + */ + if (!is_secure_guest()) + set_pci_dma_ops(&dma_iommu_ops); } static int __init disable_multitce(char *str) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 09bb878c21e0..36b846f6e74e 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -1413,7 +1413,10 @@ static int pseries_lpar_resize_hpt_commit(void *data) return 0; } -/* Must be called in user context */ +/* + * Must be called in process context. The caller must hold the + * cpus_lock. + */ static int pseries_lpar_resize_hpt(unsigned long shift) { struct hpt_resize_state state = { @@ -1467,7 +1470,8 @@ static int pseries_lpar_resize_hpt(unsigned long shift) t1 = ktime_get(); - rc = stop_machine(pseries_lpar_resize_hpt_commit, &state, NULL); + rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit, + &state, NULL); t2 = ktime_get(); @@ -1527,16 +1531,24 @@ void __init hpte_init_pseries(void) mmu_hash_ops.flush_hash_range = pSeries_lpar_flush_hash_range; mmu_hash_ops.hpte_clear_all = pseries_hpte_clear_all; mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate; - register_process_table = pseries_lpar_register_process_table; if (firmware_has_feature(FW_FEATURE_HPT_RESIZE)) mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt; + + /* + * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall + * to inform the hypervisor that we wish to use the HPT. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + pseries_lpar_register_process_table(0, 0, 0); } void radix_init_pseries(void) { pr_info("Using radix MMU under hypervisor\n"); - register_process_table = pseries_lpar_register_process_table; + + pseries_lpar_register_process_table(__pa(process_tb), + 0, PRTB_SIZE_SHIFT - 12); } #ifdef CONFIG_PPC_SMLPAR diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index fe812bebdf5e..b571285f6c14 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -9,6 +9,7 @@ #include <linux/cpu.h> #include <linux/kernel.h> #include <linux/kobject.h> +#include <linux/sched.h> #include <linux/smp.h> #include <linux/stat.h> #include <linux/completion.h> @@ -207,7 +208,11 @@ static int update_dt_node(__be32 phandle, s32 scope) prop_data += vd; } + + cond_resched(); } + + cond_resched(); } while (rtas_rc == 1); of_node_put(dn); @@ -310,8 +315,12 @@ int pseries_devicetree_update(s32 scope) add_dt_node(phandle, drc_index); break; } + + cond_resched(); } } + + cond_resched(); } while (rc == 1); kfree(rtas_buf); diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index 1eae1d09980c..722830978639 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -229,8 +229,7 @@ void __init pSeries_final_fixup(void) pSeries_request_regions(); - eeh_probe_devices(); - eeh_addr_cache_build(); + eeh_show_enabled(); #ifdef CONFIG_PCI_IOV ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable; diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index f16fdd0f71f7..3acdcc3bb908 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -76,6 +76,7 @@ struct pseries_mc_errorlog { #define MC_ERROR_TYPE_UE 0x00 #define MC_ERROR_TYPE_SLB 0x01 #define MC_ERROR_TYPE_ERAT 0x02 +#define MC_ERROR_TYPE_UNKNOWN 0x03 #define MC_ERROR_TYPE_TLB 0x04 #define MC_ERROR_TYPE_D_CACHE 0x05 #define MC_ERROR_TYPE_I_CACHE 0x07 @@ -87,6 +88,9 @@ struct pseries_mc_errorlog { #define MC_ERROR_UE_LOAD_STORE 3 #define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4 +#define UE_EFFECTIVE_ADDR_PROVIDED 0x40 +#define UE_LOGICAL_ADDR_PROVIDED 0x20 + #define MC_ERROR_SLB_PARITY 0 #define MC_ERROR_SLB_MULTIHIT 1 #define MC_ERROR_SLB_INDETERMINATE 2 @@ -113,27 +117,6 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) } } -static -inline u64 rtas_mc_get_effective_addr(const struct pseries_mc_errorlog *mlog) -{ - __be64 addr = 0; - - switch (mlog->error_type) { - case MC_ERROR_TYPE_UE: - if (mlog->sub_err_type & 0x40) - addr = mlog->effective_address; - break; - case MC_ERROR_TYPE_SLB: - case MC_ERROR_TYPE_ERAT: - case MC_ERROR_TYPE_TLB: - if (mlog->sub_err_type & 0x80) - addr = mlog->effective_address; - default: - break; - } - return be64_to_cpu(addr); -} - /* * Enable the hotplug interrupt late because processing them may touch other * devices or systems (e.g. hugepages) that have not been initialized at the @@ -511,160 +494,165 @@ int pSeries_system_reset_exception(struct pt_regs *regs) return 0; /* need to perform reset */ } -#define VAL_TO_STRING(ar, val) \ - (((val) < ARRAY_SIZE(ar)) ? ar[(val)] : "Unknown") -static void pseries_print_mce_info(struct pt_regs *regs, - struct rtas_error_log *errp) +static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) { - const char *level, *sevstr; + struct mce_error_info mce_err = { 0 }; + unsigned long eaddr = 0, paddr = 0; struct pseries_errorlog *pseries_log; struct pseries_mc_errorlog *mce_log; - u8 error_type, err_sub_type; - u64 addr; - u8 initiator = rtas_error_initiator(errp); int disposition = rtas_error_disposition(errp); + int initiator = rtas_error_initiator(errp); + int severity = rtas_error_severity(errp); + u8 error_type, err_sub_type; - static const char * const initiators[] = { - [0] = "Unknown", - [1] = "CPU", - [2] = "PCI", - [3] = "ISA", - [4] = "Memory", - [5] = "Power Mgmt", - }; - static const char * const mc_err_types[] = { - [0] = "UE", - [1] = "SLB", - [2] = "ERAT", - [3] = "Unknown", - [4] = "TLB", - [5] = "D-Cache", - [6] = "Unknown", - [7] = "I-Cache", - }; - static const char * const mc_ue_types[] = { - [0] = "Indeterminate", - [1] = "Instruction fetch", - [2] = "Page table walk ifetch", - [3] = "Load/Store", - [4] = "Page table walk Load/Store", - }; - - /* SLB sub errors valid values are 0x0, 0x1, 0x2 */ - static const char * const mc_slb_types[] = { - [0] = "Parity", - [1] = "Multihit", - [2] = "Indeterminate", - }; - - /* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */ - static const char * const mc_soft_types[] = { - [0] = "Unknown", - [1] = "Parity", - [2] = "Multihit", - [3] = "Indeterminate", - }; - - if (!rtas_error_extended(errp)) { - pr_err("Machine check interrupt: Missing extended error log\n"); - return; - } + if (initiator == RTAS_INITIATOR_UNKNOWN) + mce_err.initiator = MCE_INITIATOR_UNKNOWN; + else if (initiator == RTAS_INITIATOR_CPU) + mce_err.initiator = MCE_INITIATOR_CPU; + else if (initiator == RTAS_INITIATOR_PCI) + mce_err.initiator = MCE_INITIATOR_PCI; + else if (initiator == RTAS_INITIATOR_ISA) + mce_err.initiator = MCE_INITIATOR_ISA; + else if (initiator == RTAS_INITIATOR_MEMORY) + mce_err.initiator = MCE_INITIATOR_MEMORY; + else if (initiator == RTAS_INITIATOR_POWERMGM) + mce_err.initiator = MCE_INITIATOR_POWERMGM; + else + mce_err.initiator = MCE_INITIATOR_UNKNOWN; + + if (severity == RTAS_SEVERITY_NO_ERROR) + mce_err.severity = MCE_SEV_NO_ERROR; + else if (severity == RTAS_SEVERITY_EVENT) + mce_err.severity = MCE_SEV_WARNING; + else if (severity == RTAS_SEVERITY_WARNING) + mce_err.severity = MCE_SEV_WARNING; + else if (severity == RTAS_SEVERITY_ERROR_SYNC) + mce_err.severity = MCE_SEV_SEVERE; + else if (severity == RTAS_SEVERITY_ERROR) + mce_err.severity = MCE_SEV_SEVERE; + else if (severity == RTAS_SEVERITY_FATAL) + mce_err.severity = MCE_SEV_FATAL; + else + mce_err.severity = MCE_SEV_FATAL; + + if (severity <= RTAS_SEVERITY_ERROR_SYNC) + mce_err.sync_error = true; + else + mce_err.sync_error = false; + + mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; + mce_err.error_class = MCE_ECLASS_UNKNOWN; + + if (!rtas_error_extended(errp)) + goto out; pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); if (pseries_log == NULL) - return; + goto out; mce_log = (struct pseries_mc_errorlog *)pseries_log->data; - error_type = mce_log->error_type; err_sub_type = rtas_mc_error_sub_type(mce_log); - switch (rtas_error_severity(errp)) { - case RTAS_SEVERITY_NO_ERROR: - level = KERN_INFO; - sevstr = "Harmless"; - break; - case RTAS_SEVERITY_WARNING: - level = KERN_WARNING; - sevstr = ""; - break; - case RTAS_SEVERITY_ERROR: - case RTAS_SEVERITY_ERROR_SYNC: - level = KERN_ERR; - sevstr = "Severe"; - break; - case RTAS_SEVERITY_FATAL: - default: - level = KERN_ERR; - sevstr = "Fatal"; - break; - } + switch (mce_log->error_type) { + case MC_ERROR_TYPE_UE: + mce_err.error_type = MCE_ERROR_TYPE_UE; + switch (err_sub_type) { + case MC_ERROR_UE_IFETCH: + mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH; + break; + case MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH: + mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH; + break; + case MC_ERROR_UE_LOAD_STORE: + mce_err.u.ue_error_type = MCE_UE_ERROR_LOAD_STORE; + break; + case MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE: + mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE; + break; + case MC_ERROR_UE_INDETERMINATE: + default: + mce_err.u.ue_error_type = MCE_UE_ERROR_INDETERMINATE; + break; + } + if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) + eaddr = be64_to_cpu(mce_log->effective_address); -#ifdef CONFIG_PPC_BOOK3S_64 - /* Display faulty slb contents for SLB errors. */ - if (error_type == MC_ERROR_TYPE_SLB) - slb_dump_contents(local_paca->mce_faulty_slbs); -#endif + if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) { + paddr = be64_to_cpu(mce_log->logical_address); + } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) { + unsigned long pfn; - printk("%s%s Machine check interrupt [%s]\n", level, sevstr, - disposition == RTAS_DISP_FULLY_RECOVERED ? - "Recovered" : "Not recovered"); - if (user_mode(regs)) { - printk("%s NIP: [%016lx] PID: %d Comm: %s\n", level, - regs->nip, current->pid, current->comm); - } else { - printk("%s NIP [%016lx]: %pS\n", level, regs->nip, - (void *)regs->nip); - } - printk("%s Initiator: %s\n", level, - VAL_TO_STRING(initiators, initiator)); + pfn = addr_to_pfn(regs, eaddr); + if (pfn != ULONG_MAX) + paddr = pfn << PAGE_SHIFT; + } - switch (error_type) { - case MC_ERROR_TYPE_UE: - printk("%s Error type: %s [%s]\n", level, - VAL_TO_STRING(mc_err_types, error_type), - VAL_TO_STRING(mc_ue_types, err_sub_type)); break; case MC_ERROR_TYPE_SLB: - printk("%s Error type: %s [%s]\n", level, - VAL_TO_STRING(mc_err_types, error_type), - VAL_TO_STRING(mc_slb_types, err_sub_type)); + mce_err.error_type = MCE_ERROR_TYPE_SLB; + switch (err_sub_type) { + case MC_ERROR_SLB_PARITY: + mce_err.u.slb_error_type = MCE_SLB_ERROR_PARITY; + break; + case MC_ERROR_SLB_MULTIHIT: + mce_err.u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; + break; + case MC_ERROR_SLB_INDETERMINATE: + default: + mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE; + break; + } + if (mce_log->sub_err_type & 0x80) + eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_ERAT: + mce_err.error_type = MCE_ERROR_TYPE_ERAT; + switch (err_sub_type) { + case MC_ERROR_ERAT_PARITY: + mce_err.u.erat_error_type = MCE_ERAT_ERROR_PARITY; + break; + case MC_ERROR_ERAT_MULTIHIT: + mce_err.u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; + break; + case MC_ERROR_ERAT_INDETERMINATE: + default: + mce_err.u.erat_error_type = MCE_ERAT_ERROR_INDETERMINATE; + break; + } + if (mce_log->sub_err_type & 0x80) + eaddr = be64_to_cpu(mce_log->effective_address); + break; case MC_ERROR_TYPE_TLB: - printk("%s Error type: %s [%s]\n", level, - VAL_TO_STRING(mc_err_types, error_type), - VAL_TO_STRING(mc_soft_types, err_sub_type)); + mce_err.error_type = MCE_ERROR_TYPE_TLB; + switch (err_sub_type) { + case MC_ERROR_TLB_PARITY: + mce_err.u.tlb_error_type = MCE_TLB_ERROR_PARITY; + break; + case MC_ERROR_TLB_MULTIHIT: + mce_err.u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; + break; + case MC_ERROR_TLB_INDETERMINATE: + default: + mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE; + break; + } + if (mce_log->sub_err_type & 0x80) + eaddr = be64_to_cpu(mce_log->effective_address); + break; + case MC_ERROR_TYPE_D_CACHE: + mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; + case MC_ERROR_TYPE_I_CACHE: + mce_err.error_type = MCE_ERROR_TYPE_DCACHE; + break; + case MC_ERROR_TYPE_UNKNOWN: default: - printk("%s Error type: %s\n", level, - VAL_TO_STRING(mc_err_types, error_type)); + mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; break; } - addr = rtas_mc_get_effective_addr(mce_log); - if (addr) - printk("%s Effective address: %016llx\n", level, addr); -} - -static int mce_handle_error(struct rtas_error_log *errp) -{ - struct pseries_errorlog *pseries_log; - struct pseries_mc_errorlog *mce_log; - int disposition = rtas_error_disposition(errp); - u8 error_type; - - if (!rtas_error_extended(errp)) - goto out; - - pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); - if (pseries_log == NULL) - goto out; - - mce_log = (struct pseries_mc_errorlog *)pseries_log->data; - error_type = mce_log->error_type; - #ifdef CONFIG_PPC_BOOK3S_64 if (disposition == RTAS_DISP_NOT_RECOVERED) { switch (error_type) { @@ -682,98 +670,24 @@ static int mce_handle_error(struct rtas_error_log *errp) slb_save_contents(local_paca->mce_faulty_slbs); flush_and_reload_slb(); disposition = RTAS_DISP_FULLY_RECOVERED; - rtas_set_disposition_recovered(errp); break; default: break; } + } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { + /* Platform corrected itself but could be degraded */ + printk(KERN_ERR "MCE: limited recovery, system may " + "be degraded\n"); + disposition = RTAS_DISP_FULLY_RECOVERED; } #endif out: - return disposition; -} - -#ifdef CONFIG_MEMORY_FAILURE - -static DEFINE_PER_CPU(int, rtas_ue_count); -static DEFINE_PER_CPU(unsigned long, rtas_ue_paddr[MAX_MC_EVT]); + save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED, + &mce_err, regs->nip, eaddr, paddr); -#define UE_EFFECTIVE_ADDR_PROVIDED 0x40 -#define UE_LOGICAL_ADDR_PROVIDED 0x20 - - -static void pseries_hwpoison_work_fn(struct work_struct *work) -{ - unsigned long paddr; - int index; - - while (__this_cpu_read(rtas_ue_count) > 0) { - index = __this_cpu_read(rtas_ue_count) - 1; - paddr = __this_cpu_read(rtas_ue_paddr[index]); - memory_failure(paddr >> PAGE_SHIFT, 0); - __this_cpu_dec(rtas_ue_count); - } -} - -static DECLARE_WORK(hwpoison_work, pseries_hwpoison_work_fn); - -static void queue_ue_paddr(unsigned long paddr) -{ - int index; - - index = __this_cpu_inc_return(rtas_ue_count) - 1; - if (index >= MAX_MC_EVT) { - __this_cpu_dec(rtas_ue_count); - return; - } - this_cpu_write(rtas_ue_paddr[index], paddr); - schedule_work(&hwpoison_work); -} - -static void pseries_do_memory_failure(struct pt_regs *regs, - struct pseries_mc_errorlog *mce_log) -{ - unsigned long paddr; - - if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) { - paddr = be64_to_cpu(mce_log->logical_address); - } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) { - unsigned long pfn; - - pfn = addr_to_pfn(regs, - be64_to_cpu(mce_log->effective_address)); - if (pfn == ULONG_MAX) - return; - paddr = pfn << PAGE_SHIFT; - } else { - return; - } - queue_ue_paddr(paddr); -} - -static void pseries_process_ue(struct pt_regs *regs, - struct rtas_error_log *errp) -{ - struct pseries_errorlog *pseries_log; - struct pseries_mc_errorlog *mce_log; - - if (!rtas_error_extended(errp)) - return; - - pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); - if (!pseries_log) - return; - - mce_log = (struct pseries_mc_errorlog *)pseries_log->data; - - if (mce_log->error_type == MC_ERROR_TYPE_UE) - pseries_do_memory_failure(regs, mce_log); + return disposition; } -#else -static inline void pseries_process_ue(struct pt_regs *regs, - struct rtas_error_log *errp) { } -#endif /*CONFIG_MEMORY_FAILURE */ /* * Process MCE rtas errlog event. @@ -795,49 +709,51 @@ static void mce_process_errlog_event(struct irq_work *work) * Return 1 if corrected (or delivered a signal). * Return 0 if there is nothing we can do. */ -static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) +static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt) { int recovered = 0; - int disposition = rtas_error_disposition(err); - - pseries_print_mce_info(regs, err); if (!(regs->msr & MSR_RI)) { /* If MSR_RI isn't set, we cannot recover */ pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); recovered = 0; - - } else if (disposition == RTAS_DISP_FULLY_RECOVERED) { + } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) { /* Platform corrected itself */ recovered = 1; + } else if (evt->severity == MCE_SEV_FATAL) { + /* Fatal machine check */ + pr_err("Machine check interrupt is fatal\n"); + recovered = 0; + } - } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { - /* Platform corrected itself but could be degraded */ - printk(KERN_ERR "MCE: limited recovery, system may " - "be degraded\n"); - recovered = 1; - - } else if (user_mode(regs) && !is_global_init(current) && - rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) { - + if (!recovered && evt->sync_error) { /* - * If we received a synchronous error when in userspace - * kill the task. Firmware may report details of the fail - * asynchronously, so we can't rely on the target and type - * fields being valid here. + * Try to kill processes if we get a synchronous machine check + * (e.g., one caused by execution of this instruction). This + * will devolve into a panic if we try to kill init or are in + * an interrupt etc. + * + * TODO: Queue up this address for hwpoisioning later. + * TODO: This is not quite right for d-side machine + * checks ->nip is not necessarily the important + * address. */ - printk(KERN_ERR "MCE: uncorrectable error, killing task " - "%s:%d\n", current->comm, current->pid); - - _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); - recovered = 1; + if ((user_mode(regs))) { + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); + recovered = 1; + } else if (die_will_crash()) { + /* + * die() would kill the kernel, so better to go via + * the platform reboot code that will log the + * machine check. + */ + recovered = 0; + } else { + die("Machine check", regs, SIGBUS); + recovered = 1; + } } - pseries_process_ue(regs, err); - - /* Queue irq work to log this rtas event later. */ - irq_work_queue(&mce_errlog_process_work); - return recovered; } @@ -853,14 +769,21 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) */ int pSeries_machine_check_exception(struct pt_regs *regs) { - struct rtas_error_log *errp; + struct machine_check_event evt; - if (fwnmi_active) { - fwnmi_release_errinfo(); - errp = fwnmi_get_errlog(); - if (errp && recover_mce(regs, errp)) - return 1; + if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) + return 0; + + /* Print things out */ + if (evt.version != MCE_V1) { + pr_err("Machine Check Exception, Unknown event version %d !\n", + evt.version); + return 0; } + machine_check_print_event_info(&evt, user_mode(regs), false); + + if (recover_mce(regs, &evt)) + return 1; return 0; } @@ -877,7 +800,12 @@ long pseries_machine_check_realmode(struct pt_regs *regs) * to panic. Hence we will call it as soon as we go into * virtual mode. */ - disposition = mce_handle_error(errp); + disposition = mce_handle_error(regs, errp); + fwnmi_release_errinfo(); + + /* Queue irq work to log this rtas event later. */ + irq_work_queue(&mce_errlog_process_work); + if (disposition == RTAS_DISP_FULLY_RECOVERED) return 1; } diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c new file mode 100644 index 000000000000..70c3013fdd07 --- /dev/null +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Firmware-Assisted Dump support on POWERVM platform. + * + * Copyright 2011, Mahesh Salgaonkar, IBM Corporation. + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#define pr_fmt(fmt) "rtas fadump: " fmt + +#include <linux/string.h> +#include <linux/memblock.h> +#include <linux/delay.h> +#include <linux/seq_file.h> +#include <linux/crash_dump.h> + +#include <asm/page.h> +#include <asm/prom.h> +#include <asm/rtas.h> +#include <asm/fadump.h> +#include <asm/fadump-internal.h> + +#include "rtas-fadump.h" + +static struct rtas_fadump_mem_struct fdm; +static const struct rtas_fadump_mem_struct *fdm_active; + +static void rtas_fadump_update_config(struct fw_dump *fadump_conf, + const struct rtas_fadump_mem_struct *fdm) +{ + fadump_conf->boot_mem_dest_addr = + be64_to_cpu(fdm->rmr_region.destination_address); + + fadump_conf->fadumphdr_addr = (fadump_conf->boot_mem_dest_addr + + fadump_conf->boot_memory_size); +} + +/* + * This function is called in the capture kernel to get configuration details + * setup in the first kernel and passed to the f/w. + */ +static void rtas_fadump_get_config(struct fw_dump *fadump_conf, + const struct rtas_fadump_mem_struct *fdm) +{ + fadump_conf->boot_mem_addr[0] = + be64_to_cpu(fdm->rmr_region.source_address); + fadump_conf->boot_mem_sz[0] = be64_to_cpu(fdm->rmr_region.source_len); + fadump_conf->boot_memory_size = fadump_conf->boot_mem_sz[0]; + + fadump_conf->boot_mem_top = fadump_conf->boot_memory_size; + fadump_conf->boot_mem_regs_cnt = 1; + + /* + * Start address of reserve dump area (permanent reservation) for + * re-registering FADump after dump capture. + */ + fadump_conf->reserve_dump_area_start = + be64_to_cpu(fdm->cpu_state_data.destination_address); + + rtas_fadump_update_config(fadump_conf, fdm); +} + +static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf) +{ + u64 addr = fadump_conf->reserve_dump_area_start; + + memset(&fdm, 0, sizeof(struct rtas_fadump_mem_struct)); + addr = addr & PAGE_MASK; + + fdm.header.dump_format_version = cpu_to_be32(0x00000001); + fdm.header.dump_num_sections = cpu_to_be16(3); + fdm.header.dump_status_flag = 0; + fdm.header.offset_first_dump_section = + cpu_to_be32((u32)offsetof(struct rtas_fadump_mem_struct, + cpu_state_data)); + + /* + * Fields for disk dump option. + * We are not using disk dump option, hence set these fields to 0. + */ + fdm.header.dd_block_size = 0; + fdm.header.dd_block_offset = 0; + fdm.header.dd_num_blocks = 0; + fdm.header.dd_offset_disk_path = 0; + + /* set 0 to disable an automatic dump-reboot. */ + fdm.header.max_time_auto = 0; + + /* Kernel dump sections */ + /* cpu state data section. */ + fdm.cpu_state_data.request_flag = + cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.cpu_state_data.source_data_type = + cpu_to_be16(RTAS_FADUMP_CPU_STATE_DATA); + fdm.cpu_state_data.source_address = 0; + fdm.cpu_state_data.source_len = + cpu_to_be64(fadump_conf->cpu_state_data_size); + fdm.cpu_state_data.destination_address = cpu_to_be64(addr); + addr += fadump_conf->cpu_state_data_size; + + /* hpte region section */ + fdm.hpte_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.hpte_region.source_data_type = + cpu_to_be16(RTAS_FADUMP_HPTE_REGION); + fdm.hpte_region.source_address = 0; + fdm.hpte_region.source_len = + cpu_to_be64(fadump_conf->hpte_region_size); + fdm.hpte_region.destination_address = cpu_to_be64(addr); + addr += fadump_conf->hpte_region_size; + + /* RMA region section */ + fdm.rmr_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.rmr_region.source_data_type = + cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION); + fdm.rmr_region.source_address = cpu_to_be64(0); + fdm.rmr_region.source_len = cpu_to_be64(fadump_conf->boot_memory_size); + fdm.rmr_region.destination_address = cpu_to_be64(addr); + addr += fadump_conf->boot_memory_size; + + rtas_fadump_update_config(fadump_conf, &fdm); + + return addr; +} + +static u64 rtas_fadump_get_bootmem_min(void) +{ + return RTAS_FADUMP_MIN_BOOT_MEM; +} + +static int rtas_fadump_register(struct fw_dump *fadump_conf) +{ + unsigned int wait_time; + int rc, err = -EIO; + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1, + NULL, FADUMP_REGISTER, &fdm, + sizeof(struct rtas_fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + + } while (wait_time); + + switch (rc) { + case 0: + pr_info("Registration is successful!\n"); + fadump_conf->dump_registered = 1; + err = 0; + break; + case -1: + pr_err("Failed to register. Hardware Error(%d).\n", rc); + break; + case -3: + if (!is_fadump_boot_mem_contiguous()) + pr_err("Can't have holes in boot memory area.\n"); + else if (!is_fadump_reserved_mem_contiguous()) + pr_err("Can't have holes in reserved memory area.\n"); + + pr_err("Failed to register. Parameter Error(%d).\n", rc); + err = -EINVAL; + break; + case -9: + pr_err("Already registered!\n"); + fadump_conf->dump_registered = 1; + err = -EEXIST; + break; + default: + pr_err("Failed to register. Unknown Error(%d).\n", rc); + break; + } + + return err; +} + +static int rtas_fadump_unregister(struct fw_dump *fadump_conf) +{ + unsigned int wait_time; + int rc; + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1, + NULL, FADUMP_UNREGISTER, &fdm, + sizeof(struct rtas_fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + } while (wait_time); + + if (rc) { + pr_err("Failed to un-register - unexpected error(%d).\n", rc); + return -EIO; + } + + fadump_conf->dump_registered = 0; + return 0; +} + +static int rtas_fadump_invalidate(struct fw_dump *fadump_conf) +{ + unsigned int wait_time; + int rc; + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1, + NULL, FADUMP_INVALIDATE, fdm_active, + sizeof(struct rtas_fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + } while (wait_time); + + if (rc) { + pr_err("Failed to invalidate - unexpected error (%d).\n", rc); + return -EIO; + } + + fadump_conf->dump_active = 0; + fdm_active = NULL; + return 0; +} + +#define RTAS_FADUMP_GPR_MASK 0xffffff0000000000 +static inline int rtas_fadump_gpr_index(u64 id) +{ + char str[3]; + int i = -1; + + if ((id & RTAS_FADUMP_GPR_MASK) == fadump_str_to_u64("GPR")) { + /* get the digits at the end */ + id &= ~RTAS_FADUMP_GPR_MASK; + id >>= 24; + str[2] = '\0'; + str[1] = id & 0xff; + str[0] = (id >> 8) & 0xff; + if (kstrtoint(str, 10, &i)) + i = -EINVAL; + if (i > 31) + i = -1; + } + return i; +} + +void rtas_fadump_set_regval(struct pt_regs *regs, u64 reg_id, u64 reg_val) +{ + int i; + + i = rtas_fadump_gpr_index(reg_id); + if (i >= 0) + regs->gpr[i] = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("NIA")) + regs->nip = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("MSR")) + regs->msr = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("CTR")) + regs->ctr = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("LR")) + regs->link = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("XER")) + regs->xer = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("CR")) + regs->ccr = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("DAR")) + regs->dar = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("DSISR")) + regs->dsisr = (unsigned long)reg_val; +} + +static struct rtas_fadump_reg_entry* +rtas_fadump_read_regs(struct rtas_fadump_reg_entry *reg_entry, + struct pt_regs *regs) +{ + memset(regs, 0, sizeof(struct pt_regs)); + + while (be64_to_cpu(reg_entry->reg_id) != fadump_str_to_u64("CPUEND")) { + rtas_fadump_set_regval(regs, be64_to_cpu(reg_entry->reg_id), + be64_to_cpu(reg_entry->reg_value)); + reg_entry++; + } + reg_entry++; + return reg_entry; +} + +/* + * Read CPU state dump data and convert it into ELF notes. + * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be + * used to access the data to allow for additional fields to be added without + * affecting compatibility. Each list of registers for a CPU starts with + * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes, + * 8 Byte ASCII identifier and 8 Byte register value. The register entry + * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part + * of register value. For more details refer to PAPR document. + * + * Only for the crashing cpu we ignore the CPU dump data and get exact + * state from fadump crash info structure populated by first kernel at the + * time of crash. + */ +static int __init rtas_fadump_build_cpu_notes(struct fw_dump *fadump_conf) +{ + struct rtas_fadump_reg_save_area_header *reg_header; + struct fadump_crash_info_header *fdh = NULL; + struct rtas_fadump_reg_entry *reg_entry; + u32 num_cpus, *note_buf; + int i, rc = 0, cpu = 0; + struct pt_regs regs; + unsigned long addr; + void *vaddr; + + addr = be64_to_cpu(fdm_active->cpu_state_data.destination_address); + vaddr = __va(addr); + + reg_header = vaddr; + if (be64_to_cpu(reg_header->magic_number) != + fadump_str_to_u64("REGSAVE")) { + pr_err("Unable to read register save area.\n"); + return -ENOENT; + } + + pr_debug("--------CPU State Data------------\n"); + pr_debug("Magic Number: %llx\n", be64_to_cpu(reg_header->magic_number)); + pr_debug("NumCpuOffset: %x\n", be32_to_cpu(reg_header->num_cpu_offset)); + + vaddr += be32_to_cpu(reg_header->num_cpu_offset); + num_cpus = be32_to_cpu(*((__be32 *)(vaddr))); + pr_debug("NumCpus : %u\n", num_cpus); + vaddr += sizeof(u32); + reg_entry = (struct rtas_fadump_reg_entry *)vaddr; + + rc = fadump_setup_cpu_notes_buf(num_cpus); + if (rc != 0) + return rc; + + note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr; + + if (fadump_conf->fadumphdr_addr) + fdh = __va(fadump_conf->fadumphdr_addr); + + for (i = 0; i < num_cpus; i++) { + if (be64_to_cpu(reg_entry->reg_id) != + fadump_str_to_u64("CPUSTRT")) { + pr_err("Unable to read CPU state data\n"); + rc = -ENOENT; + goto error_out; + } + /* Lower 4 bytes of reg_value contains logical cpu id */ + cpu = (be64_to_cpu(reg_entry->reg_value) & + RTAS_FADUMP_CPU_ID_MASK); + if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) { + RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry); + continue; + } + pr_debug("Reading register data for cpu %d...\n", cpu); + if (fdh && fdh->crashing_cpu == cpu) { + regs = fdh->regs; + note_buf = fadump_regs_to_elf_notes(note_buf, ®s); + RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry); + } else { + reg_entry++; + reg_entry = rtas_fadump_read_regs(reg_entry, ®s); + note_buf = fadump_regs_to_elf_notes(note_buf, ®s); + } + } + final_note(note_buf); + + if (fdh) { + pr_debug("Updating elfcore header (%llx) with cpu notes\n", + fdh->elfcorehdr_addr); + fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr)); + } + return 0; + +error_out: + fadump_free_cpu_notes_buf(); + return rc; + +} + +/* + * Validate and process the dump data stored by firmware before exporting + * it through '/proc/vmcore'. + */ +static int __init rtas_fadump_process(struct fw_dump *fadump_conf) +{ + struct fadump_crash_info_header *fdh; + int rc = 0; + + if (!fdm_active || !fadump_conf->fadumphdr_addr) + return -EINVAL; + + /* Check if the dump data is valid. */ + if ((be16_to_cpu(fdm_active->header.dump_status_flag) == + RTAS_FADUMP_ERROR_FLAG) || + (fdm_active->cpu_state_data.error_flags != 0) || + (fdm_active->rmr_region.error_flags != 0)) { + pr_err("Dump taken by platform is not valid\n"); + return -EINVAL; + } + if ((fdm_active->rmr_region.bytes_dumped != + fdm_active->rmr_region.source_len) || + !fdm_active->cpu_state_data.bytes_dumped) { + pr_err("Dump taken by platform is incomplete\n"); + return -EINVAL; + } + + /* Validate the fadump crash info header */ + fdh = __va(fadump_conf->fadumphdr_addr); + if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { + pr_err("Crash info header is not valid.\n"); + return -EINVAL; + } + + rc = rtas_fadump_build_cpu_notes(fadump_conf); + if (rc) + return rc; + + /* + * We are done validating dump info and elfcore header is now ready + * to be exported. set elfcorehdr_addr so that vmcore module will + * export the elfcore header through '/proc/vmcore'. + */ + elfcorehdr_addr = fdh->elfcorehdr_addr; + + return 0; +} + +static void rtas_fadump_region_show(struct fw_dump *fadump_conf, + struct seq_file *m) +{ + const struct rtas_fadump_section *cpu_data_section; + const struct rtas_fadump_mem_struct *fdm_ptr; + + if (fdm_active) + fdm_ptr = fdm_active; + else + fdm_ptr = &fdm; + + cpu_data_section = &(fdm_ptr->cpu_state_data); + seq_printf(m, "CPU :[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", + be64_to_cpu(cpu_data_section->destination_address), + be64_to_cpu(cpu_data_section->destination_address) + + be64_to_cpu(cpu_data_section->source_len) - 1, + be64_to_cpu(cpu_data_section->source_len), + be64_to_cpu(cpu_data_section->bytes_dumped)); + + seq_printf(m, "HPTE:[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", + be64_to_cpu(fdm_ptr->hpte_region.destination_address), + be64_to_cpu(fdm_ptr->hpte_region.destination_address) + + be64_to_cpu(fdm_ptr->hpte_region.source_len) - 1, + be64_to_cpu(fdm_ptr->hpte_region.source_len), + be64_to_cpu(fdm_ptr->hpte_region.bytes_dumped)); + + seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ", + be64_to_cpu(fdm_ptr->rmr_region.source_address), + be64_to_cpu(fdm_ptr->rmr_region.destination_address)); + seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n", + be64_to_cpu(fdm_ptr->rmr_region.source_len), + be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped)); + + /* Dump is active. Show reserved area start address. */ + if (fdm_active) { + seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n", + fadump_conf->reserve_dump_area_start); + } +} + +static void rtas_fadump_trigger(struct fadump_crash_info_header *fdh, + const char *msg) +{ + /* Call ibm,os-term rtas call to trigger firmware assisted dump */ + rtas_os_term((char *)msg); +} + +static struct fadump_ops rtas_fadump_ops = { + .fadump_init_mem_struct = rtas_fadump_init_mem_struct, + .fadump_get_bootmem_min = rtas_fadump_get_bootmem_min, + .fadump_register = rtas_fadump_register, + .fadump_unregister = rtas_fadump_unregister, + .fadump_invalidate = rtas_fadump_invalidate, + .fadump_process = rtas_fadump_process, + .fadump_region_show = rtas_fadump_region_show, + .fadump_trigger = rtas_fadump_trigger, +}; + +void __init rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) +{ + int i, size, num_sections; + const __be32 *sections; + const __be32 *token; + + /* + * Check if Firmware Assisted dump is supported. if yes, check + * if dump has been initiated on last reboot. + */ + token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL); + if (!token) + return; + + fadump_conf->ibm_configure_kernel_dump = be32_to_cpu(*token); + fadump_conf->ops = &rtas_fadump_ops; + fadump_conf->fadump_supported = 1; + + /* Firmware supports 64-bit value for size, align it to pagesize. */ + fadump_conf->max_copy_size = _ALIGN_DOWN(U64_MAX, PAGE_SIZE); + + /* + * The 'ibm,kernel-dump' rtas node is present only if there is + * dump data waiting for us. + */ + fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL); + if (fdm_active) { + pr_info("Firmware-assisted dump is active.\n"); + fadump_conf->dump_active = 1; + rtas_fadump_get_config(fadump_conf, (void *)__pa(fdm_active)); + } + + /* Get the sizes required to store dump data for the firmware provided + * dump sections. + * For each dump section type supported, a 32bit cell which defines + * the ID of a supported section followed by two 32 bit cells which + * gives the size of the section in bytes. + */ + sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes", + &size); + + if (!sections) + return; + + num_sections = size / (3 * sizeof(u32)); + + for (i = 0; i < num_sections; i++, sections += 3) { + u32 type = (u32)of_read_number(sections, 1); + + switch (type) { + case RTAS_FADUMP_CPU_STATE_DATA: + fadump_conf->cpu_state_data_size = + of_read_ulong(§ions[1], 2); + break; + case RTAS_FADUMP_HPTE_REGION: + fadump_conf->hpte_region_size = + of_read_ulong(§ions[1], 2); + break; + } + } +} diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.h b/arch/powerpc/platforms/pseries/rtas-fadump.h new file mode 100644 index 000000000000..fd59bd7ca9c3 --- /dev/null +++ b/arch/powerpc/platforms/pseries/rtas-fadump.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Firmware-Assisted Dump support on POWERVM platform. + * + * Copyright 2011, Mahesh Salgaonkar, IBM Corporation. + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#ifndef _PSERIES_RTAS_FADUMP_H +#define _PSERIES_RTAS_FADUMP_H + +/* + * On some Power systems where RMO is 128MB, it still requires minimum of + * 256MB for kernel to boot successfully. When kdump infrastructure is + * configured to save vmcore over network, we run into OOM issue while + * loading modules related to network setup. Hence we need additional 64M + * of memory to avoid OOM issue. + */ +#define RTAS_FADUMP_MIN_BOOT_MEM ((0x1UL << 28) + (0x1UL << 26)) + +/* Firmware provided dump sections */ +#define RTAS_FADUMP_CPU_STATE_DATA 0x0001 +#define RTAS_FADUMP_HPTE_REGION 0x0002 +#define RTAS_FADUMP_REAL_MODE_REGION 0x0011 + +/* Dump request flag */ +#define RTAS_FADUMP_REQUEST_FLAG 0x00000001 + +/* Dump status flag */ +#define RTAS_FADUMP_ERROR_FLAG 0x2000 + +/* Kernel Dump section info */ +struct rtas_fadump_section { + __be32 request_flag; + __be16 source_data_type; + __be16 error_flags; + __be64 source_address; + __be64 source_len; + __be64 bytes_dumped; + __be64 destination_address; +}; + +/* ibm,configure-kernel-dump header. */ +struct rtas_fadump_section_header { + __be32 dump_format_version; + __be16 dump_num_sections; + __be16 dump_status_flag; + __be32 offset_first_dump_section; + + /* Fields for disk dump option. */ + __be32 dd_block_size; + __be64 dd_block_offset; + __be64 dd_num_blocks; + __be32 dd_offset_disk_path; + + /* Maximum time allowed to prevent an automatic dump-reboot. */ + __be32 max_time_auto; +}; + +/* + * Firmware Assisted dump memory structure. This structure is required for + * registering future kernel dump with power firmware through rtas call. + * + * No disk dump option. Hence disk dump path string section is not included. + */ +struct rtas_fadump_mem_struct { + struct rtas_fadump_section_header header; + + /* Kernel dump sections */ + struct rtas_fadump_section cpu_state_data; + struct rtas_fadump_section hpte_region; + + /* + * TODO: Extend multiple boot memory regions support in the kernel + * for this platform. + */ + struct rtas_fadump_section rmr_region; +}; + +/* + * The firmware-assisted dump format. + * + * The register save area is an area in the partition's memory used to preserve + * the register contents (CPU state data) for the active CPUs during a firmware + * assisted dump. The dump format contains register save area header followed + * by register entries. Each list of registers for a CPU starts with "CPUSTRT" + * and ends with "CPUEND". + */ + +/* Register save area header. */ +struct rtas_fadump_reg_save_area_header { + __be64 magic_number; + __be32 version; + __be32 num_cpu_offset; +}; + +/* Register entry. */ +struct rtas_fadump_reg_entry { + __be64 reg_id; + __be64 reg_value; +}; + +/* Utility macros */ +#define RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry) \ +({ \ + while (be64_to_cpu(reg_entry->reg_id) != \ + fadump_str_to_u64("CPUEND")) \ + reg_entry++; \ + reg_entry++; \ +}) + +#define RTAS_FADUMP_CPU_ID_MASK ((1UL << 32) - 1) + +#endif /* _PSERIES_RTAS_FADUMP_H */ diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index f5940cc71c37..f8adcd0e4589 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -69,6 +69,7 @@ #include <asm/security_features.h> #include <asm/asm-const.h> #include <asm/swiotlb.h> +#include <asm/svm.h> #include "pseries.h" #include "../../../../drivers/pci/pci.h" @@ -141,17 +142,19 @@ static void __init fwnmi_init(void) } #ifdef CONFIG_PPC_BOOK3S_64 - /* Allocate per cpu slb area to save old slb contents during MCE */ - size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus; - slb_ptr = memblock_alloc_try_nid_raw(size, sizeof(struct slb_entry), - MEMBLOCK_LOW_LIMIT, ppc64_rma_size, - NUMA_NO_NODE); - if (!slb_ptr) - panic("Failed to allocate %zu bytes below %pa for slb area\n", - size, &ppc64_rma_size); - - for_each_possible_cpu(i) - paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i); + if (!radix_enabled()) { + /* Allocate per cpu area to save old slb contents during MCE */ + size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus; + slb_ptr = memblock_alloc_try_nid_raw(size, + sizeof(struct slb_entry), MEMBLOCK_LOW_LIMIT, + ppc64_rma_size, NUMA_NO_NODE); + if (!slb_ptr) + panic("Failed to allocate %zu bytes below %pa for slb area\n", + size, &ppc64_rma_size); + + for_each_possible_cpu(i) + paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i); + } #endif } @@ -297,8 +300,10 @@ static inline int alloc_dispatch_logs(void) static int alloc_dispatch_log_kmem_cache(void) { + void (*ctor)(void *) = get_dtl_cache_ctor(); + dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES, - DISPATCH_LOG_BYTES, 0, NULL); + DISPATCH_LOG_BYTES, 0, ctor); if (!dtl_cache) { pr_warn("Failed to create dispatch trace log buffer cache\n"); pr_warn("Stolen time statistics will be unreliable\n"); @@ -316,6 +321,9 @@ static void pseries_lpar_idle(void) * low power mode by ceding processor to hypervisor */ + if (!prep_irq_for_idle()) + return; + /* Indicate to hypervisor that we are idle. */ get_lppaca()->idle = 1; diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 4b3ef8d9c63f..ad61e90032da 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -41,6 +41,7 @@ #include <asm/dbell.h> #include <asm/plpar_wrappers.h> #include <asm/code-patching.h> +#include <asm/svm.h> #include "pseries.h" #include "offline_states.h" @@ -221,7 +222,7 @@ static __init void pSeries_smp_probe_xics(void) { xics_smp_probe(); - if (cpu_has_feature(CPU_FTR_DBELL)) + if (cpu_has_feature(CPU_FTR_DBELL) && !is_secure_guest()) smp_ops->cause_ipi = smp_pseries_cause_ipi; else smp_ops->cause_ipi = icp_ops->cause_ipi; diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c new file mode 100644 index 000000000000..40c0637203d5 --- /dev/null +++ b/arch/powerpc/platforms/pseries/svm.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Secure VM platform + * + * Copyright 2018 IBM Corporation + * Author: Anshuman Khandual <khandual@linux.vnet.ibm.com> + */ + +#include <linux/mm.h> +#include <asm/machdep.h> +#include <asm/svm.h> +#include <asm/swiotlb.h> +#include <asm/ultravisor.h> + +static int __init init_svm(void) +{ + if (!is_secure_guest()) + return 0; + + /* Don't release the SWIOTLB buffer. */ + ppc_swiotlb_enable = 1; + + /* + * Since the guest memory is inaccessible to the host, devices always + * need to use the SWIOTLB buffer for DMA even if dma_capable() says + * otherwise. + */ + swiotlb_force = SWIOTLB_FORCE; + + /* Share the SWIOTLB buffer with the host. */ + swiotlb_update_mem_attributes(); + + return 0; +} +machine_early_initcall(pseries, init_svm); + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + if (!PAGE_ALIGNED(addr)) + return -EINVAL; + + uv_unshare_page(PHYS_PFN(__pa(addr)), numpages); + + return 0; +} + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + if (!PAGE_ALIGNED(addr)) + return -EINVAL; + + uv_share_page(PHYS_PFN(__pa(addr)), numpages); + + return 0; +} + +/* There's one dispatch log per CPU. */ +#define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE) + +static struct page *dtl_page_store[NR_DTL_PAGE]; +static long dtl_nr_pages; + +static bool is_dtl_page_shared(struct page *page) +{ + long i; + + for (i = 0; i < dtl_nr_pages; i++) + if (dtl_page_store[i] == page) + return true; + + return false; +} + +void dtl_cache_ctor(void *addr) +{ + unsigned long pfn = PHYS_PFN(__pa(addr)); + struct page *page = pfn_to_page(pfn); + + if (!is_dtl_page_shared(page)) { + dtl_page_store[dtl_nr_pages] = page; + dtl_nr_pages++; + WARN_ON(dtl_nr_pages >= NR_DTL_PAGE); + uv_share_page(pfn, 1); + } +} diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 3473eef7628c..79e2287991db 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -1193,7 +1193,7 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) else tbl->it_ops = &iommu_table_pseries_ops; - return iommu_init_table(tbl, -1); + return iommu_init_table(tbl, -1, 0, 0); } /** |