Diffstat (limited to 'arch/powerpc/platforms/powernv')
21 files changed, 1957 insertions, 317 deletions
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 850eee860cf2..938803eab0ad 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -12,7 +12,6 @@ config PPC_POWERNV select EPAPR_BOOT select PPC_INDIRECT_PIO select PPC_UDBG_16550 - select PPC_SCOM select ARCH_RANDOM select CPU_FREQ select PPC_DOORBELL @@ -47,3 +46,7 @@ config PPC_VAS VAS adapters are found in POWER9 based systems. If unsure, say N. + +config SCOM_DEBUGFS + bool "Expose SCOM controllers via debugfs" + depends on DEBUG_FS diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index da2e99efbd04..a3ac9646119d 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -4,15 +4,19 @@ obj-y += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o +obj-y += ultravisor.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o +obj-$(CONFIG_FA_DUMP) += opal-fadump.o +obj-$(CONFIG_PRESERVE_FA_DUMP) += opal-fadump.o +obj-$(CONFIG_OPAL_CORE) += opal-core.o obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o obj-$(CONFIG_CXL_BASE) += pci-cxl.o obj-$(CONFIG_EEH) += eeh-powernv.o -obj-$(CONFIG_PPC_SCOM) += opal-xscom.o obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o obj-$(CONFIG_OPAL_PRD) += opal-prd.o obj-$(CONFIG_PERF_EVENTS) += opal-imc.o obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o obj-$(CONFIG_OCXL_BASE) += ocxl.o +obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 620a986209f5..6bc24a47e9ef 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -34,6 +34,7 @@ #include "powernv.h" #include "pci.h" +#include "../../../../drivers/pci/pci.h" static int eeh_event_irq = -EINVAL; @@ -41,13 +42,10 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev) { struct pci_dn *pdn = pci_get_pdn(pdev); - if (!pdev->is_virtfn) + if (eeh_has_flag(EEH_FORCE_DISABLED)) return; - /* - * The following operations will fail if VF's sysfs files - * aren't created or its resources aren't finalized. - */ + dev_dbg(&pdev->dev, "EEH: Setting up device\n"); eeh_add_device_early(pdn); eeh_add_device_late(pdev); eeh_sysfs_add_device(pdev); @@ -199,6 +197,25 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10); #endif /* CONFIG_DEBUG_FS */ +void pnv_eeh_enable_phbs(void) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; + /* + * If EEH is enabled, we're going to rely on that. + * Otherwise, we restore to conventional mechanism + * to clear frozen PE during PCI config access. 
+		 */
+		if (eeh_enabled())
+			phb->flags |= PNV_PHB_FLAG_EEH;
+		else
+			phb->flags &= ~PNV_PHB_FLAG_EEH;
+	}
+}
+
 /**
  * pnv_eeh_post_init - EEH platform dependent post initialization
  *
@@ -213,9 +230,7 @@ int pnv_eeh_post_init(void)
 	struct pnv_phb *phb;
 	int ret = 0;
 
-	/* Probe devices & build address cache */
-	eeh_probe_devices();
-	eeh_addr_cache_build();
+	eeh_show_enabled();
 
 	/* Register OPAL event notifier */
 	eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR));
@@ -237,19 +252,11 @@ int pnv_eeh_post_init(void)
 	if (!eeh_enabled())
 		disable_irq(eeh_event_irq);
 
+	pnv_eeh_enable_phbs();
+
 	list_for_each_entry(hose, &hose_list, list_node) {
 		phb = hose->private_data;
 
-		/*
-		 * If EEH is enabled, we're going to rely on that.
-		 * Otherwise, we restore to conventional mechanism
-		 * to clear frozen PE during PCI config access.
-		 */
-		if (eeh_enabled())
-			phb->flags |= PNV_PHB_FLAG_EEH;
-		else
-			phb->flags &= ~PNV_PHB_FLAG_EEH;
-
 		/* Create debugfs entries */
 #ifdef CONFIG_DEBUG_FS
 		if (phb->has_dbgfs || !phb->dbgfs)
@@ -377,6 +384,8 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
 	if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
 		return NULL;
 
+	eeh_edev_dbg(edev, "Probing device\n");
+
 	/* Initialize eeh device */
 	edev->class_code = pdn->class_code;
 	edev->mode &= 0xFFFFFF00;
@@ -402,9 +411,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
 	/* Create PE */
 	ret = eeh_add_to_parent_pe(edev);
 	if (ret) {
-		pr_warn("%s: Can't add PCI dev %04x:%02x:%02x.%01x to parent PE (%x)\n",
-			__func__, hose->global_number, pdn->busno,
-			PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn), ret);
+		eeh_edev_warn(edev, "Failed to add device to PE (code %d)\n", ret);
 		return NULL;
 	}
 
@@ -453,11 +460,17 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
 	 * Enable EEH explicitly so that we will do EEH check
 	 * while accessing I/O stuff
 	 */
-	eeh_add_flag(EEH_ENABLED);
+	if (!eeh_has_flag(EEH_ENABLED)) {
+		enable_irq(eeh_event_irq);
+		pnv_eeh_enable_phbs();
+		eeh_add_flag(EEH_ENABLED);
+	}
 
 	/* Save memory bars */
 	eeh_save_bars(edev);
 
+	eeh_edev_dbg(edev, "EEH enabled on device\n");
+
 	return NULL;
 }
 
@@ -837,7 +850,7 @@ static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int option)
 	int aer = edev ? edev->aer_cap : 0;
 	u32 ctrl;
 
-	pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n",
+	pr_debug("%s: Secondary Reset PCI bus %04x:%02x with option %d\n",
 		 __func__, pci_domain_nr(dev->bus),
 		 dev->bus->number, option);
 
@@ -895,6 +908,10 @@ static int pnv_eeh_bridge_reset(struct pci_dev *pdev, int option)
 	if (!dn || !of_get_property(dn, "ibm,reset-by-firmware", NULL))
 		return __pnv_eeh_bridge_reset(pdev, option);
 
+	pr_debug("%s: FW reset PCI bus %04x:%02x with option %d\n",
+		 __func__, pci_domain_nr(pdev->bus),
+		 pdev->bus->number, option);
+
 	switch (option) {
 	case EEH_RESET_FUNDAMENTAL:
 		scope = OPAL_RESET_PCI_FUNDAMENTAL;
@@ -1113,17 +1130,37 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
 		return -EIO;
 	}
 
+	if (pci_is_root_bus(bus))
+		return pnv_eeh_root_reset(hose, option);
+
 	/*
-	 * If dealing with the root bus (or the bus underneath the
-	 * root port), we reset the bus underneath the root port.
+	 * For hot resets, try to use the generic PCI error recovery reset
+	 * functions. These correctly handle the case where the secondary
+	 * bus is behind a hotplug slot and will use the slot-provided
+	 * reset methods to prevent spurious hotplug events during the reset.
-	 *
-	 * The cxl driver depends on this behaviour for bi-modal card
-	 * switching.
+	 * Fundamental resets need to be handled internally to EEH since the
+	 * PCI core doesn't really have a concept of a fundamental reset,
+	 * mainly because there's no standard way to generate one. Only a
+	 * few devices require an FRESET so it should be fine.
 	 */
-	if (pci_is_root_bus(bus) ||
-	    pci_is_root_bus(bus->parent))
-		return pnv_eeh_root_reset(hose, option);
+	if (option != EEH_RESET_FUNDAMENTAL) {
+		/*
+		 * NB: Skiboot and pnv_eeh_bridge_reset() also no-op the
+		 * de-assert step. It's like the OPAL reset API was
+		 * poorly designed or something...
+		 */
+		if (option == EEH_RESET_DEACTIVATE)
+			return 0;
+		rc = pci_bus_error_reset(bus->self);
+		if (!rc)
+			return 0;
+	}
+
+	/* Otherwise, use the generic bridge reset. This might call into FW */
+	if (pci_is_root_bus(bus->parent))
+		return pnv_eeh_root_reset(hose, option);
 	return pnv_eeh_bridge_reset(bus->self, option);
 }
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 09f49eed7fb8..78599bca66c2 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -675,7 +675,8 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
 	sprs.ptcr = mfspr(SPRN_PTCR);
 	sprs.rpr = mfspr(SPRN_RPR);
 	sprs.tscr = mfspr(SPRN_TSCR);
-	sprs.ldbar = mfspr(SPRN_LDBAR);
+	if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+		sprs.ldbar = mfspr(SPRN_LDBAR);
 
 	sprs_saved = true;
 
@@ -789,7 +790,8 @@ core_woken:
 	mtspr(SPRN_MMCR0, sprs.mmcr0);
 	mtspr(SPRN_MMCR1, sprs.mmcr1);
 	mtspr(SPRN_MMCR2, sprs.mmcr2);
-	mtspr(SPRN_LDBAR, sprs.ldbar);
+	if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+		mtspr(SPRN_LDBAR, sprs.ldbar);
 
 	mtspr(SPRN_SPRG3, local_paca->sprg_vdso);
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index c16249d251f1..b95b9e3c4c98 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -89,6 +89,7 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
 }
 EXPORT_SYMBOL(pnv_pci_get_npu_dev);
 
+#ifdef CONFIG_IOMMU_API
 /*
  * Returns the PE assoicated with the PCI device of the given
  * NPU. Returns the linked pci device if pci_dev != NULL.
  */
@@ -192,106 +193,6 @@ static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num)
 	return 0;
 }
 
-/*
- * Enables 32 bit DMA on NPU.
- */
-static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
-{
-	struct pci_dev *gpdev;
-	struct pnv_ioda_pe *gpe;
-	int64_t rc;
-
-	/*
-	 * Find the assoicated PCI devices and get the dma window
-	 * information from there.
-	 */
-	if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
-		return;
-
-	gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
-	if (!gpe)
-		return;
-
-	rc = pnv_npu_set_window(&npe->table_group, 0,
-			gpe->table_group.tables[0]);
-
-	/*
-	 * NVLink devices use the same TCE table configuration as
-	 * their parent device so drivers shouldn't be doing DMA
-	 * operations directly on these devices.
-	 */
-	set_dma_ops(&npe->pdev->dev, &dma_dummy_ops);
-}
-
-/*
- * Enables bypass mode on the NPU. The NPU only supports one
- * window per link, so bypass needs to be explicitly enabled or
- * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be
- * active at the same time.
- */ -static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe) -{ - struct pnv_phb *phb = npe->phb; - int64_t rc = 0; - phys_addr_t top = memblock_end_of_DRAM(); - - if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev) - return -EINVAL; - - rc = pnv_npu_unset_window(&npe->table_group, 0); - if (rc != OPAL_SUCCESS) - return rc; - - /* Enable the bypass window */ - - top = roundup_pow_of_two(top); - dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n", - npe->pe_number); - rc = opal_pci_map_pe_dma_window_real(phb->opal_id, - npe->pe_number, npe->pe_number, - 0 /* bypass base */, top); - - if (rc == OPAL_SUCCESS) - pnv_pci_ioda2_tce_invalidate_entire(phb, false); - - return rc; -} - -void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass) -{ - int i; - struct pnv_phb *phb; - struct pci_dn *pdn; - struct pnv_ioda_pe *npe; - struct pci_dev *npdev; - - for (i = 0; ; ++i) { - npdev = pnv_pci_get_npu_dev(gpdev, i); - - if (!npdev) - break; - - pdn = pci_get_pdn(npdev); - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return; - - phb = pci_bus_to_host(npdev->bus)->private_data; - - /* We only do bypass if it's enabled on the linked device */ - npe = &phb->ioda.pe_array[pdn->pe_number]; - - if (bypass) { - dev_info(&npdev->dev, - "Using 64-bit DMA iommu bypass\n"); - pnv_npu_dma_set_bypass(npe); - } else { - dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n"); - pnv_npu_dma_set_32(npe); - } - } -} - -#ifdef CONFIG_IOMMU_API /* Switch ownership from platform code to external user (e.g. VFIO) */ static void pnv_npu_take_ownership(struct iommu_table_group *table_group) { diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c index 29ca523c1c79..a2aa5e433ac8 100644 --- a/arch/powerpc/platforms/powernv/opal-call.c +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -257,7 +257,7 @@ OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO); OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE); OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK); OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK); -OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ); +OPAL_CALL(opal_xive_allocate_irq_raw, OPAL_XIVE_ALLOCATE_IRQ); OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ); OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); @@ -287,3 +287,6 @@ OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR); OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64); OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE); OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT); +OPAL_CALL(opal_mpipl_update, OPAL_MPIPL_UPDATE); +OPAL_CALL(opal_mpipl_register_tag, OPAL_MPIPL_REGISTER_TAG); +OPAL_CALL(opal_mpipl_query_tag, OPAL_MPIPL_QUERY_TAG); diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c new file mode 100644 index 000000000000..ed895d82c048 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -0,0 +1,636 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Interface for exporting the OPAL ELF core. + * Heavily inspired from fs/proc/vmcore.c + * + * Copyright 2019, Hari Bathini, IBM Corporation. 
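[Review note] For orientation on the file added here: a consumer of the resulting /sys/firmware/opal/core needs nothing beyond standard POSIX I/O. A minimal illustrative sketch, not part of the patch; the path and the read-only 0400 mode come from opal_core_attr further down:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	/* Exported as a read-only binary sysfs attribute by opalcore_init() */
	int fd = open("/sys/firmware/opal/core", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* read_opalcore() serves the ELF/note headers first, then the
	 * PT_LOAD segments tracked in opalcore_list. */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}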
+ */ + +#define pr_fmt(fmt) "opal core: " fmt + +#include <linux/memblock.h> +#include <linux/uaccess.h> +#include <linux/proc_fs.h> +#include <linux/elf.h> +#include <linux/elfcore.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/slab.h> +#include <linux/crash_core.h> +#include <linux/of.h> + +#include <asm/page.h> +#include <asm/opal.h> +#include <asm/fadump-internal.h> + +#include "opal-fadump.h" + +#define MAX_PT_LOAD_CNT 8 + +/* NT_AUXV note related info */ +#define AUXV_CNT 1 +#define AUXV_DESC_SZ (((2 * AUXV_CNT) + 1) * sizeof(Elf64_Off)) + +struct opalcore_config { + u32 num_cpus; + /* PIR value of crashing CPU */ + u32 crashing_cpu; + + /* CPU state data info from F/W */ + u64 cpu_state_destination_vaddr; + u64 cpu_state_data_size; + u64 cpu_state_entry_size; + + /* OPAL memory to be exported as PT_LOAD segments */ + u64 ptload_addr[MAX_PT_LOAD_CNT]; + u64 ptload_size[MAX_PT_LOAD_CNT]; + u64 ptload_cnt; + + /* Pointer to the first PT_LOAD in the ELF core file */ + Elf64_Phdr *ptload_phdr; + + /* Total size of opalcore file. */ + size_t opalcore_size; + + /* Buffer for all the ELF core headers and the PT_NOTE */ + size_t opalcorebuf_sz; + char *opalcorebuf; + + /* NT_AUXV buffer */ + char auxv_buf[AUXV_DESC_SZ]; +}; + +struct opalcore { + struct list_head list; + u64 paddr; + size_t size; + loff_t offset; +}; + +static LIST_HEAD(opalcore_list); +static struct opalcore_config *oc_conf; +static const struct opal_mpipl_fadump *opalc_metadata; +static const struct opal_mpipl_fadump *opalc_cpu_metadata; + +/* + * Set crashing CPU's signal to SIGUSR1. if the kernel is triggered + * by kernel, SIGTERM otherwise. + */ +bool kernel_initiated; + +static struct opalcore * __init get_new_element(void) +{ + return kzalloc(sizeof(struct opalcore), GFP_KERNEL); +} + +static inline int is_opalcore_usable(void) +{ + return (oc_conf && oc_conf->opalcorebuf != NULL) ? 1 : 0; +} + +static Elf64_Word *append_elf64_note(Elf64_Word *buf, char *name, + u32 type, void *data, + size_t data_len) +{ + Elf64_Nhdr *note = (Elf64_Nhdr *)buf; + Elf64_Word namesz = strlen(name) + 1; + + note->n_namesz = cpu_to_be32(namesz); + note->n_descsz = cpu_to_be32(data_len); + note->n_type = cpu_to_be32(type); + buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf64_Word)); + memcpy(buf, name, namesz); + buf += DIV_ROUND_UP(namesz, sizeof(Elf64_Word)); + memcpy(buf, data, data_len); + buf += DIV_ROUND_UP(data_len, sizeof(Elf64_Word)); + + return buf; +} + +static void fill_prstatus(struct elf_prstatus *prstatus, int pir, + struct pt_regs *regs) +{ + memset(prstatus, 0, sizeof(struct elf_prstatus)); + elf_core_copy_kernel_regs(&(prstatus->pr_reg), regs); + + /* + * Overload PID with PIR value. + * As a PIR value could also be '0', add an offset of '100' + * to every PIR to avoid misinterpretations in GDB. + */ + prstatus->pr_pid = cpu_to_be32(100 + pir); + prstatus->pr_ppid = cpu_to_be32(1); + + /* + * Indicate SIGUSR1 for crash initiated from kernel. + * SIGTERM otherwise. + */ + if (pir == oc_conf->crashing_cpu) { + short sig; + + sig = kernel_initiated ? 
SIGUSR1 : SIGTERM; + prstatus->pr_cursig = cpu_to_be16(sig); + } +} + +static Elf64_Word *auxv_to_elf64_notes(Elf64_Word *buf, + u64 opal_boot_entry) +{ + Elf64_Off *bufp = (Elf64_Off *)oc_conf->auxv_buf; + int idx = 0; + + memset(bufp, 0, AUXV_DESC_SZ); + + /* Entry point of OPAL */ + bufp[idx++] = cpu_to_be64(AT_ENTRY); + bufp[idx++] = cpu_to_be64(opal_boot_entry); + + /* end of vector */ + bufp[idx++] = cpu_to_be64(AT_NULL); + + buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_AUXV, + oc_conf->auxv_buf, AUXV_DESC_SZ); + return buf; +} + +/* + * Read from the ELF header and then the crash dump. + * Returns number of bytes read on success, -errno on failure. + */ +static ssize_t read_opalcore(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *to, + loff_t pos, size_t count) +{ + struct opalcore *m; + ssize_t tsz, avail; + loff_t tpos = pos; + + if (pos >= oc_conf->opalcore_size) + return 0; + + /* Adjust count if it goes beyond opalcore size */ + avail = oc_conf->opalcore_size - pos; + if (count > avail) + count = avail; + + if (count == 0) + return 0; + + /* Read ELF core header and/or PT_NOTE segment */ + if (tpos < oc_conf->opalcorebuf_sz) { + tsz = min_t(size_t, oc_conf->opalcorebuf_sz - tpos, count); + memcpy(to, oc_conf->opalcorebuf + tpos, tsz); + to += tsz; + tpos += tsz; + count -= tsz; + } + + list_for_each_entry(m, &opalcore_list, list) { + /* nothing more to read here */ + if (count == 0) + break; + + if (tpos < m->offset + m->size) { + void *addr; + + tsz = min_t(size_t, m->offset + m->size - tpos, count); + addr = (void *)(m->paddr + tpos - m->offset); + memcpy(to, __va(addr), tsz); + to += tsz; + tpos += tsz; + count -= tsz; + } + } + + return (tpos - pos); +} + +static struct bin_attribute opal_core_attr = { + .attr = {.name = "core", .mode = 0400}, + .read = read_opalcore +}; + +/* + * Read CPU state dump data and convert it into ELF notes. + * + * Each register entry is of 16 bytes, A numerical identifier along with + * a GPR/SPR flag in the first 8 bytes and the register value in the next + * 8 bytes. For more details refer to F/W documentation. + */ +static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf) +{ + u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize; + struct hdat_fadump_thread_hdr *thdr; + struct elf_prstatus prstatus; + Elf64_Word *first_cpu_note; + struct pt_regs regs; + char *bufp; + int i; + + size_per_thread = oc_conf->cpu_state_entry_size; + bufp = __va(oc_conf->cpu_state_destination_vaddr); + + /* + * Offset for register entries, entry size and registers count is + * duplicated in every thread header in keeping with HDAT format. + * Use these values from the first thread header. + */ + thdr = (struct hdat_fadump_thread_hdr *)bufp; + regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) + + be32_to_cpu(thdr->offset)); + reg_esize = be32_to_cpu(thdr->esize); + regs_cnt = be32_to_cpu(thdr->ecnt); + + pr_debug("--------CPU State Data------------\n"); + pr_debug("NumCpus : %u\n", oc_conf->num_cpus); + pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n", + regs_offset, reg_esize, regs_cnt); + + /* + * Skip past the first CPU note. Fill this note with the + * crashing CPU's prstatus. 
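[Review note] The 16-byte register entry layout described in the comment above is the one opal-fadump.h (added later in this patch) models as struct hdat_fadump_reg_entry: a __be32 type, a __be32 identifier, then a __be64 value. A hypothetical helper, for illustration only, that walks one thread's entries the same way opalcore_append_cpu_notes() does:

/* Illustrative sketch: print each 16-byte register entry of one thread.
 * bufp points at the first entry; regs_cnt and reg_esize come from the
 * thread header, exactly as in the code below. */
static void dump_reg_entries(const char *bufp, u32 regs_cnt, u32 reg_esize)
{
	const struct hdat_fadump_reg_entry *ent;
	u32 i;

	for (i = 0; i < regs_cnt; i++, bufp += reg_esize) {
		ent = (const struct hdat_fadump_reg_entry *)bufp;
		/* First 8 bytes: GPR/SPR flag and numerical identifier;
		 * last 8 bytes: the register value, big-endian. */
		pr_debug("type %u, num %u, val 0x%llx\n",
			 be32_to_cpu(ent->reg_type),
			 be32_to_cpu(ent->reg_num),
			 be64_to_cpu(ent->reg_val));
	}
}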
+ */ + first_cpu_note = buf; + buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + + for (i = 0; i < oc_conf->num_cpus; i++, bufp += size_per_thread) { + thdr = (struct hdat_fadump_thread_hdr *)bufp; + thread_pir = be32_to_cpu(thdr->pir); + + pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n", + i, thread_pir, thdr->core_state); + + /* + * Register state data of MAX cores is provided by firmware, + * but some of this cores may not be active. So, while + * processing register state data, check core state and + * skip threads that belong to inactive cores. + */ + if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE) + continue; + + opal_fadump_read_regs((bufp + regs_offset), regs_cnt, + reg_esize, false, ®s); + + pr_debug("PIR 0x%x - R1 : 0x%llx, NIP : 0x%llx\n", thread_pir, + be64_to_cpu(regs.gpr[1]), be64_to_cpu(regs.nip)); + fill_prstatus(&prstatus, thread_pir, ®s); + + if (thread_pir != oc_conf->crashing_cpu) { + buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, + NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + } else { + /* + * Add crashing CPU as the first NT_PRSTATUS note for + * GDB to process the core file appropriately. + */ + append_elf64_note(first_cpu_note, CRASH_CORE_NOTE_NAME, + NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + } + } + + return buf; +} + +static int __init create_opalcore(void) +{ + u64 opal_boot_entry, opal_base_addr, paddr; + u32 hdr_size, cpu_notes_size, count; + struct device_node *dn; + struct opalcore *new; + loff_t opalcore_off; + struct page *page; + Elf64_Phdr *phdr; + Elf64_Ehdr *elf; + int i, ret; + char *bufp; + + /* Get size of header & CPU notes for OPAL core */ + hdr_size = (sizeof(Elf64_Ehdr) + + ((oc_conf->ptload_cnt + 1) * sizeof(Elf64_Phdr))); + cpu_notes_size = ((oc_conf->num_cpus * (CRASH_CORE_NOTE_HEAD_BYTES + + CRASH_CORE_NOTE_NAME_BYTES + + CRASH_CORE_NOTE_DESC_BYTES)) + + (CRASH_CORE_NOTE_HEAD_BYTES + + CRASH_CORE_NOTE_NAME_BYTES + AUXV_DESC_SZ)); + + /* Allocate buffer to setup OPAL core */ + oc_conf->opalcorebuf_sz = PAGE_ALIGN(hdr_size + cpu_notes_size); + oc_conf->opalcorebuf = alloc_pages_exact(oc_conf->opalcorebuf_sz, + GFP_KERNEL | __GFP_ZERO); + if (!oc_conf->opalcorebuf) { + pr_err("Not enough memory to setup OPAL core (size: %lu)\n", + oc_conf->opalcorebuf_sz); + oc_conf->opalcorebuf_sz = 0; + return -ENOMEM; + } + count = oc_conf->opalcorebuf_sz / PAGE_SIZE; + page = virt_to_page(oc_conf->opalcorebuf); + for (i = 0; i < count; i++) + mark_page_reserved(page + i); + + pr_debug("opalcorebuf = 0x%llx\n", (u64)oc_conf->opalcorebuf); + + /* Read OPAL related device-tree entries */ + dn = of_find_node_by_name(NULL, "ibm,opal"); + if (dn) { + ret = of_property_read_u64(dn, "opal-base-address", + &opal_base_addr); + pr_debug("opal-base-address: %llx\n", opal_base_addr); + ret |= of_property_read_u64(dn, "opal-boot-address", + &opal_boot_entry); + pr_debug("opal-boot-address: %llx\n", opal_boot_entry); + } + if (!dn || ret) + pr_warn("WARNING: Failed to read OPAL base & entry values\n"); + + /* Use count to keep track of the program headers */ + count = 0; + + bufp = oc_conf->opalcorebuf; + elf = (Elf64_Ehdr *)bufp; + bufp += sizeof(Elf64_Ehdr); + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELFDATA2MSB; + elf->e_ident[EI_VERSION] = EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); + elf->e_type = cpu_to_be16(ET_CORE); + elf->e_machine = cpu_to_be16(ELF_ARCH); + elf->e_version = 
cpu_to_be32(EV_CURRENT); + elf->e_entry = 0; + elf->e_phoff = cpu_to_be64(sizeof(Elf64_Ehdr)); + elf->e_shoff = 0; + elf->e_flags = 0; + + elf->e_ehsize = cpu_to_be16(sizeof(Elf64_Ehdr)); + elf->e_phentsize = cpu_to_be16(sizeof(Elf64_Phdr)); + elf->e_phnum = 0; + elf->e_shentsize = 0; + elf->e_shnum = 0; + elf->e_shstrndx = 0; + + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = cpu_to_be32(PT_NOTE); + phdr->p_flags = 0; + phdr->p_align = 0; + phdr->p_paddr = phdr->p_vaddr = 0; + phdr->p_offset = cpu_to_be64(hdr_size); + phdr->p_filesz = phdr->p_memsz = cpu_to_be64(cpu_notes_size); + count++; + + opalcore_off = oc_conf->opalcorebuf_sz; + oc_conf->ptload_phdr = (Elf64_Phdr *)bufp; + paddr = 0; + for (i = 0; i < oc_conf->ptload_cnt; i++) { + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = cpu_to_be32(PT_LOAD); + phdr->p_flags = cpu_to_be32(PF_R|PF_W|PF_X); + phdr->p_align = 0; + + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = oc_conf->ptload_addr[i]; + new->size = oc_conf->ptload_size[i]; + new->offset = opalcore_off; + list_add_tail(&new->list, &opalcore_list); + + phdr->p_paddr = cpu_to_be64(paddr); + phdr->p_vaddr = cpu_to_be64(opal_base_addr + paddr); + phdr->p_filesz = phdr->p_memsz = + cpu_to_be64(oc_conf->ptload_size[i]); + phdr->p_offset = cpu_to_be64(opalcore_off); + + count++; + opalcore_off += oc_conf->ptload_size[i]; + paddr += oc_conf->ptload_size[i]; + } + + elf->e_phnum = cpu_to_be16(count); + + bufp = (char *)opalcore_append_cpu_notes((Elf64_Word *)bufp); + bufp = (char *)auxv_to_elf64_notes((Elf64_Word *)bufp, opal_boot_entry); + + oc_conf->opalcore_size = opalcore_off; + return 0; +} + +static void opalcore_cleanup(void) +{ + if (oc_conf == NULL) + return; + + /* Remove OPAL core sysfs file */ + sysfs_remove_bin_file(opal_kobj, &opal_core_attr); + oc_conf->ptload_phdr = NULL; + oc_conf->ptload_cnt = 0; + + /* free the buffer used for setting up OPAL core */ + if (oc_conf->opalcorebuf) { + void *end = (void *)((u64)oc_conf->opalcorebuf + + oc_conf->opalcorebuf_sz); + + free_reserved_area(oc_conf->opalcorebuf, end, -1, NULL); + oc_conf->opalcorebuf = NULL; + oc_conf->opalcorebuf_sz = 0; + } + + kfree(oc_conf); + oc_conf = NULL; +} +__exitcall(opalcore_cleanup); + +static void __init opalcore_config_init(void) +{ + u32 idx, cpu_data_version; + struct device_node *np; + const __be32 *prop; + u64 addr = 0; + int i, ret; + + np = of_find_node_by_path("/ibm,opal/dump"); + if (np == NULL) + return; + + if (!of_device_is_compatible(np, "ibm,opal-dump")) { + pr_warn("Support missing for this f/w version!\n"); + return; + } + + /* Check if dump has been initiated on last reboot */ + prop = of_get_property(np, "mpipl-boot", NULL); + if (!prop) { + of_node_put(np); + return; + } + + /* Get OPAL metadata */ + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_OPAL, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get OPAL metadata (%d)\n", ret); + goto error_out; + } + + addr = be64_to_cpu(addr); + pr_debug("OPAL metadata addr: %llx\n", addr); + opalc_metadata = __va(addr); + + /* Get OPAL CPU metadata */ + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get OPAL CPU metadata (%d)\n", ret); + goto error_out; + } + + addr = be64_to_cpu(addr); + pr_debug("CPU metadata addr: %llx\n", addr); + opalc_cpu_metadata = __va(addr); + + /* Allocate memory for config buffer */ + oc_conf = kzalloc(sizeof(struct opalcore_config), GFP_KERNEL); + if 
(oc_conf == NULL) + goto error_out; + + /* Parse OPAL metadata */ + if (opalc_metadata->version != OPAL_MPIPL_VERSION) { + pr_warn("Supported OPAL metadata version: %u, found: %u!\n", + OPAL_MPIPL_VERSION, opalc_metadata->version); + pr_warn("WARNING: F/W using newer OPAL metadata format!!\n"); + } + + oc_conf->ptload_cnt = 0; + idx = be32_to_cpu(opalc_metadata->region_cnt); + if (idx > MAX_PT_LOAD_CNT) { + pr_warn("WARNING: OPAL regions count (%d) adjusted to limit (%d)", + MAX_PT_LOAD_CNT, idx); + idx = MAX_PT_LOAD_CNT; + } + for (i = 0; i < idx; i++) { + oc_conf->ptload_addr[oc_conf->ptload_cnt] = + be64_to_cpu(opalc_metadata->region[i].dest); + oc_conf->ptload_size[oc_conf->ptload_cnt++] = + be64_to_cpu(opalc_metadata->region[i].size); + } + oc_conf->ptload_cnt = i; + oc_conf->crashing_cpu = be32_to_cpu(opalc_metadata->crashing_pir); + + if (!oc_conf->ptload_cnt) { + pr_err("OPAL memory regions not found\n"); + goto error_out; + } + + /* Parse OPAL CPU metadata */ + cpu_data_version = be32_to_cpu(opalc_cpu_metadata->cpu_data_version); + if (cpu_data_version != HDAT_FADUMP_CPU_DATA_VER) { + pr_warn("Supported CPU data version: %u, found: %u!\n", + HDAT_FADUMP_CPU_DATA_VER, cpu_data_version); + pr_warn("WARNING: F/W using newer CPU state data format!!\n"); + } + + addr = be64_to_cpu(opalc_cpu_metadata->region[0].dest); + if (!addr) { + pr_err("CPU state data not found!\n"); + goto error_out; + } + oc_conf->cpu_state_destination_vaddr = (u64)__va(addr); + + oc_conf->cpu_state_data_size = + be64_to_cpu(opalc_cpu_metadata->region[0].size); + oc_conf->cpu_state_entry_size = + be32_to_cpu(opalc_cpu_metadata->cpu_data_size); + + if ((oc_conf->cpu_state_entry_size == 0) || + (oc_conf->cpu_state_entry_size > oc_conf->cpu_state_data_size)) { + pr_err("CPU state data is invalid.\n"); + goto error_out; + } + oc_conf->num_cpus = (oc_conf->cpu_state_data_size / + oc_conf->cpu_state_entry_size); + + of_node_put(np); + return; + +error_out: + pr_err("Could not export /sys/firmware/opal/core\n"); + opalcore_cleanup(); + of_node_put(np); +} + +static ssize_t fadump_release_opalcore_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int input = -1; + + if (kstrtoint(buf, 0, &input)) + return -EINVAL; + + if (input == 1) { + if (oc_conf == NULL) { + pr_err("'/sys/firmware/opal/core' file not accessible!\n"); + return -EPERM; + } + + /* + * Take away '/sys/firmware/opal/core' and release all memory + * used for exporting this file. + */ + opalcore_cleanup(); + } else + return -EINVAL; + + return count; +} + +static struct kobj_attribute opalcore_rel_attr = __ATTR(fadump_release_opalcore, + 0200, NULL, + fadump_release_opalcore_store); + +static int __init opalcore_init(void) +{ + int rc = -1; + + opalcore_config_init(); + + if (oc_conf == NULL) + return rc; + + create_opalcore(); + + /* + * If oc_conf->opalcorebuf= is set in the 2nd kernel, + * then capture the dump. 
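[Review note] The fadump_release_opalcore attribute defined above accepts only "1" and is registered on kernel_kobj (i.e. /sys/kernel) a few lines below. For illustration, the userspace side is a plain sysfs write; a sketch, not part of the patch:

	/* Release the memory backing /sys/firmware/opal/core */
	int fd = open("/sys/kernel/fadump_release_opalcore", O_WRONLY);

	if (fd >= 0) {
		write(fd, "1", 1);
		close(fd);
	}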
+ */ + if (!(is_opalcore_usable())) { + pr_err("Failed to export /sys/firmware/opal/core\n"); + opalcore_cleanup(); + return rc; + } + + /* Set OPAL core file size */ + opal_core_attr.size = oc_conf->opalcore_size; + + /* Export OPAL core sysfs file */ + rc = sysfs_create_bin_file(opal_kobj, &opal_core_attr); + if (rc != 0) { + pr_err("Failed to export /sys/firmware/opal/core\n"); + opalcore_cleanup(); + return rc; + } + + rc = sysfs_create_file(kernel_kobj, &opalcore_rel_attr.attr); + if (rc) { + pr_warn("unable to create sysfs file fadump_release_opalcore (%d)\n", + rc); + } + + return 0; +} +fs_initcall(opalcore_init); diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c new file mode 100644 index 000000000000..d361d37d975f --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -0,0 +1,716 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Firmware-Assisted Dump support on POWER platform (OPAL). + * + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#define pr_fmt(fmt) "opal fadump: " fmt + +#include <linux/string.h> +#include <linux/seq_file.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/libfdt.h> +#include <linux/mm.h> +#include <linux/crash_dump.h> + +#include <asm/page.h> +#include <asm/opal.h> +#include <asm/fadump-internal.h> + +#include "opal-fadump.h" + + +#ifdef CONFIG_PRESERVE_FA_DUMP +/* + * When dump is active but PRESERVE_FA_DUMP is enabled on the kernel, + * ensure crash data is preserved in hope that the subsequent memory + * preserving kernel boot is going to process this crash data. + */ +void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) +{ + const struct opal_fadump_mem_struct *opal_fdm_active; + const __be32 *prop; + unsigned long dn; + u64 addr = 0; + s64 ret; + + dn = of_get_flat_dt_subnode_by_name(node, "dump"); + if (dn == -FDT_ERR_NOTFOUND) + return; + + /* + * Check if dump has been initiated on last reboot. + */ + prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL); + if (!prop) + return; + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_debug("Could not get Kernel metadata (%lld)\n", ret); + return; + } + + /* + * Preserve memory only if kernel memory regions are registered + * with f/w for MPIPL. + */ + addr = be64_to_cpu(addr); + pr_debug("Kernel metadata addr: %llx\n", addr); + opal_fdm_active = (void *)addr; + if (opal_fdm_active->registered_regions == 0) + return; + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_BOOT_MEM, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get boot memory tag (%lld)\n", ret); + return; + } + + /* + * Memory below this address can be used for booting a + * capture kernel or petitboot kernel. Preserve everything + * above this address for processing crashdump. 
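[Review note] To put numbers on the preservation boundary: assuming the crashed kernel registered the minimum boot memory size, the OPAL_MPIPL_TAG_BOOT_MEM tag reads back as 0x30000000 (OPAL_FADUMP_MIN_BOOT_MEM in opal-fadump.h below), so boot_mem_top becomes 768 MB. The petitboot or capture kernel may then use [0, 768 MB) freely, while everything from 768 MB to the end of DRAM is left untouched until the crash data has been processed.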
+ */ + fadump_conf->boot_mem_top = be64_to_cpu(addr); + pr_debug("Preserve everything above %llx\n", fadump_conf->boot_mem_top); + + pr_info("Firmware-assisted dump is active.\n"); + fadump_conf->dump_active = 1; +} + +#else /* CONFIG_PRESERVE_FA_DUMP */ +static const struct opal_fadump_mem_struct *opal_fdm_active; +static const struct opal_mpipl_fadump *opal_cpu_metadata; +static struct opal_fadump_mem_struct *opal_fdm; + +#ifdef CONFIG_OPAL_CORE +extern bool kernel_initiated; +#endif + +static int opal_fadump_unregister(struct fw_dump *fadump_conf); + +static void opal_fadump_update_config(struct fw_dump *fadump_conf, + const struct opal_fadump_mem_struct *fdm) +{ + pr_debug("Boot memory regions count: %d\n", fdm->region_cnt); + + /* + * The destination address of the first boot memory region is the + * destination address of boot memory regions. + */ + fadump_conf->boot_mem_dest_addr = fdm->rgn[0].dest; + pr_debug("Destination address of boot memory regions: %#016llx\n", + fadump_conf->boot_mem_dest_addr); + + fadump_conf->fadumphdr_addr = fdm->fadumphdr_addr; +} + +/* + * This function is called in the capture kernel to get configuration details + * from metadata setup by the first kernel. + */ +static void opal_fadump_get_config(struct fw_dump *fadump_conf, + const struct opal_fadump_mem_struct *fdm) +{ + unsigned long base, size, last_end, hole_size; + int i; + + if (!fadump_conf->dump_active) + return; + + last_end = 0; + hole_size = 0; + fadump_conf->boot_memory_size = 0; + + pr_debug("Boot memory regions:\n"); + for (i = 0; i < fdm->region_cnt; i++) { + base = fdm->rgn[i].src; + size = fdm->rgn[i].size; + pr_debug("\t[%03d] base: 0x%lx, size: 0x%lx\n", i, base, size); + + fadump_conf->boot_mem_addr[i] = base; + fadump_conf->boot_mem_sz[i] = size; + fadump_conf->boot_memory_size += size; + hole_size += (base - last_end); + + last_end = base + size; + } + + /* + * Start address of reserve dump area (permanent reservation) for + * re-registering FADump after dump capture. + */ + fadump_conf->reserve_dump_area_start = fdm->rgn[0].dest; + + /* + * Rarely, but it can so happen that system crashes before all + * boot memory regions are registered for MPIPL. In such + * cases, warn that the vmcore may not be accurate and proceed + * anyway as that is the best bet considering free pages, cache + * pages, user pages, etc are usually filtered out. + * + * Hope the memory that could not be preserved only has pages + * that are usually filtered out while saving the vmcore. + */ + if (fdm->region_cnt > fdm->registered_regions) { + pr_warn("Not all memory regions were saved!!!\n"); + pr_warn(" Unsaved memory regions:\n"); + i = fdm->registered_regions; + while (i < fdm->region_cnt) { + pr_warn("\t[%03d] base: 0x%llx, size: 0x%llx\n", + i, fdm->rgn[i].src, fdm->rgn[i].size); + i++; + } + + pr_warn("If the unsaved regions only contain pages that are filtered out (eg. 
free/user pages), the vmcore should still be usable.\n"); + pr_warn("WARNING: If the unsaved regions contain kernel pages, the vmcore will be corrupted.\n"); + } + + fadump_conf->boot_mem_top = (fadump_conf->boot_memory_size + hole_size); + fadump_conf->boot_mem_regs_cnt = fdm->region_cnt; + opal_fadump_update_config(fadump_conf, fdm); +} + +/* Initialize kernel metadata */ +static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm) +{ + fdm->version = OPAL_FADUMP_VERSION; + fdm->region_cnt = 0; + fdm->registered_regions = 0; + fdm->fadumphdr_addr = 0; +} + +static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf) +{ + u64 addr = fadump_conf->reserve_dump_area_start; + int i; + + opal_fdm = __va(fadump_conf->kernel_metadata); + opal_fadump_init_metadata(opal_fdm); + + /* Boot memory regions */ + for (i = 0; i < fadump_conf->boot_mem_regs_cnt; i++) { + opal_fdm->rgn[i].src = fadump_conf->boot_mem_addr[i]; + opal_fdm->rgn[i].dest = addr; + opal_fdm->rgn[i].size = fadump_conf->boot_mem_sz[i]; + + opal_fdm->region_cnt++; + addr += fadump_conf->boot_mem_sz[i]; + } + + /* + * Kernel metadata is passed to f/w and retrieved in capture kerenl. + * So, use it to save fadump header address instead of calculating it. + */ + opal_fdm->fadumphdr_addr = (opal_fdm->rgn[0].dest + + fadump_conf->boot_memory_size); + + opal_fadump_update_config(fadump_conf, opal_fdm); + + return addr; +} + +static u64 opal_fadump_get_metadata_size(void) +{ + return PAGE_ALIGN(sizeof(struct opal_fadump_mem_struct)); +} + +static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf) +{ + int err = 0; + s64 ret; + + /* + * Use the last page(s) in FADump memory reservation for + * kernel metadata. + */ + fadump_conf->kernel_metadata = (fadump_conf->reserve_dump_area_start + + fadump_conf->reserve_dump_area_size - + opal_fadump_get_metadata_size()); + pr_info("Kernel metadata addr: %llx\n", fadump_conf->kernel_metadata); + + /* Initialize kernel metadata before registering the address with f/w */ + opal_fdm = __va(fadump_conf->kernel_metadata); + opal_fadump_init_metadata(opal_fdm); + + /* + * Register metadata address with f/w. Can be retrieved in + * the capture kernel. + */ + ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, + fadump_conf->kernel_metadata); + if (ret != OPAL_SUCCESS) { + pr_err("Failed to set kernel metadata tag!\n"); + err = -EPERM; + } + + /* + * Register boot memory top address with f/w. Should be retrieved + * by a kernel that intends to preserve crash'ed kernel's memory. + */ + ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_BOOT_MEM, + fadump_conf->boot_mem_top); + if (ret != OPAL_SUCCESS) { + pr_err("Failed to set boot memory tag!\n"); + err = -EPERM; + } + + return err; +} + +static u64 opal_fadump_get_bootmem_min(void) +{ + return OPAL_FADUMP_MIN_BOOT_MEM; +} + +static int opal_fadump_register(struct fw_dump *fadump_conf) +{ + s64 rc = OPAL_PARAMETER; + int i, err = -EIO; + + for (i = 0; i < opal_fdm->region_cnt; i++) { + rc = opal_mpipl_update(OPAL_MPIPL_ADD_RANGE, + opal_fdm->rgn[i].src, + opal_fdm->rgn[i].dest, + opal_fdm->rgn[i].size); + if (rc != OPAL_SUCCESS) + break; + + opal_fdm->registered_regions++; + } + + switch (rc) { + case OPAL_SUCCESS: + pr_info("Registration is successful!\n"); + fadump_conf->dump_registered = 1; + err = 0; + break; + case OPAL_RESOURCE: + /* If MAX regions limit in f/w is hit, warn and proceed. 
*/ + pr_warn("%d regions could not be registered for MPIPL as MAX limit is reached!\n", + (opal_fdm->region_cnt - opal_fdm->registered_regions)); + fadump_conf->dump_registered = 1; + err = 0; + break; + case OPAL_PARAMETER: + pr_err("Failed to register. Parameter Error(%lld).\n", rc); + break; + case OPAL_HARDWARE: + pr_err("Support not available.\n"); + fadump_conf->fadump_supported = 0; + fadump_conf->fadump_enabled = 0; + break; + default: + pr_err("Failed to register. Unknown Error(%lld).\n", rc); + break; + } + + /* + * If some regions were registered before OPAL_MPIPL_ADD_RANGE + * OPAL call failed, unregister all regions. + */ + if ((err < 0) && (opal_fdm->registered_regions > 0)) + opal_fadump_unregister(fadump_conf); + + return err; +} + +static int opal_fadump_unregister(struct fw_dump *fadump_conf) +{ + s64 rc; + + rc = opal_mpipl_update(OPAL_MPIPL_REMOVE_ALL, 0, 0, 0); + if (rc) { + pr_err("Failed to un-register - unexpected Error(%lld).\n", rc); + return -EIO; + } + + opal_fdm->registered_regions = 0; + fadump_conf->dump_registered = 0; + return 0; +} + +static int opal_fadump_invalidate(struct fw_dump *fadump_conf) +{ + s64 rc; + + rc = opal_mpipl_update(OPAL_MPIPL_FREE_PRESERVED_MEMORY, 0, 0, 0); + if (rc) { + pr_err("Failed to invalidate - unexpected Error(%lld).\n", rc); + return -EIO; + } + + fadump_conf->dump_active = 0; + opal_fdm_active = NULL; + return 0; +} + +static void opal_fadump_cleanup(struct fw_dump *fadump_conf) +{ + s64 ret; + + ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, 0); + if (ret != OPAL_SUCCESS) + pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret); +} + +/* + * Verify if CPU state data is available. If available, do a bit of sanity + * checking before processing this data. + */ +static bool __init is_opal_fadump_cpu_data_valid(struct fw_dump *fadump_conf) +{ + if (!opal_cpu_metadata) + return false; + + fadump_conf->cpu_state_data_version = + be32_to_cpu(opal_cpu_metadata->cpu_data_version); + fadump_conf->cpu_state_entry_size = + be32_to_cpu(opal_cpu_metadata->cpu_data_size); + fadump_conf->cpu_state_dest_vaddr = + (u64)__va(be64_to_cpu(opal_cpu_metadata->region[0].dest)); + fadump_conf->cpu_state_data_size = + be64_to_cpu(opal_cpu_metadata->region[0].size); + + if (fadump_conf->cpu_state_data_version != HDAT_FADUMP_CPU_DATA_VER) { + pr_warn("Supported CPU state data version: %u, found: %d!\n", + HDAT_FADUMP_CPU_DATA_VER, + fadump_conf->cpu_state_data_version); + pr_warn("WARNING: F/W using newer CPU state data format!!\n"); + } + + if ((fadump_conf->cpu_state_dest_vaddr == 0) || + (fadump_conf->cpu_state_entry_size == 0) || + (fadump_conf->cpu_state_entry_size > + fadump_conf->cpu_state_data_size)) { + pr_err("CPU state data is invalid. Ignoring!\n"); + return false; + } + + return true; +} + +/* + * Convert CPU state data saved at the time of crash into ELF notes. + * + * While the crashing CPU's register data is saved by the kernel, CPU state + * data for all CPUs is saved by f/w. In CPU state data provided by f/w, + * each register entry is of 16 bytes, a numerical identifier along with + * a GPR/SPR flag in the first 8 bytes and the register value in the next + * 8 bytes. For more details refer to F/W documentation. If this data is + * missing or in unsupported format, append crashing CPU's register data + * saved by the kernel in the PT_NOTE, to have something to work with in + * the vmcore file. 
+ */ +static int __init +opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf, + struct fadump_crash_info_header *fdh) +{ + u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize; + struct hdat_fadump_thread_hdr *thdr; + bool is_cpu_data_valid = false; + u32 num_cpus = 1, *note_buf; + struct pt_regs regs; + char *bufp; + int rc, i; + + if (is_opal_fadump_cpu_data_valid(fadump_conf)) { + size_per_thread = fadump_conf->cpu_state_entry_size; + num_cpus = (fadump_conf->cpu_state_data_size / size_per_thread); + bufp = __va(fadump_conf->cpu_state_dest_vaddr); + is_cpu_data_valid = true; + } + + rc = fadump_setup_cpu_notes_buf(num_cpus); + if (rc != 0) + return rc; + + note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr; + if (!is_cpu_data_valid) + goto out; + + /* + * Offset for register entries, entry size and registers count is + * duplicated in every thread header in keeping with HDAT format. + * Use these values from the first thread header. + */ + thdr = (struct hdat_fadump_thread_hdr *)bufp; + regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) + + be32_to_cpu(thdr->offset)); + reg_esize = be32_to_cpu(thdr->esize); + regs_cnt = be32_to_cpu(thdr->ecnt); + + pr_debug("--------CPU State Data------------\n"); + pr_debug("NumCpus : %u\n", num_cpus); + pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n", + regs_offset, reg_esize, regs_cnt); + + for (i = 0; i < num_cpus; i++, bufp += size_per_thread) { + thdr = (struct hdat_fadump_thread_hdr *)bufp; + + thread_pir = be32_to_cpu(thdr->pir); + pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n", + i, thread_pir, thdr->core_state); + + /* + * If this is kernel initiated crash, crashing_cpu would be set + * appropriately and register data of the crashing CPU saved by + * crashing kernel. Add this saved register data of crashing CPU + * to elf notes and populate the pt_regs for the remaining CPUs + * from register state data provided by firmware. + */ + if (fdh->crashing_cpu == thread_pir) { + note_buf = fadump_regs_to_elf_notes(note_buf, + &fdh->regs); + pr_debug("Crashing CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n", + fdh->crashing_cpu, fdh->regs.gpr[1], + fdh->regs.nip); + continue; + } + + /* + * Register state data of MAX cores is provided by firmware, + * but some of this cores may not be active. So, while + * processing register state data, check core state and + * skip threads that belong to inactive cores. + */ + if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE) + continue; + + opal_fadump_read_regs((bufp + regs_offset), regs_cnt, + reg_esize, true, ®s); + note_buf = fadump_regs_to_elf_notes(note_buf, ®s); + pr_debug("CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n", + thread_pir, regs.gpr[1], regs.nip); + } + +out: + /* + * CPU state data is invalid/unsupported. Try appending crashing CPU's + * register data, if it is saved by the kernel. 
+ */ + if (fadump_conf->cpu_notes_buf_vaddr == (u64)note_buf) { + if (fdh->crashing_cpu == FADUMP_CPU_UNKNOWN) { + fadump_free_cpu_notes_buf(); + return -ENODEV; + } + + pr_warn("WARNING: appending only crashing CPU's register data\n"); + note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs)); + } + + final_note(note_buf); + + pr_debug("Updating elfcore header (%llx) with cpu notes\n", + fdh->elfcorehdr_addr); + fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr)); + return 0; +} + +static int __init opal_fadump_process(struct fw_dump *fadump_conf) +{ + struct fadump_crash_info_header *fdh; + int rc = -EINVAL; + + if (!opal_fdm_active || !fadump_conf->fadumphdr_addr) + return rc; + + /* Validate the fadump crash info header */ + fdh = __va(fadump_conf->fadumphdr_addr); + if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { + pr_err("Crash info header is not valid.\n"); + return rc; + } + +#ifdef CONFIG_OPAL_CORE + /* + * If this is a kernel initiated crash, crashing_cpu would be set + * appropriately and register data of the crashing CPU saved by + * crashing kernel. Add this saved register data of crashing CPU + * to elf notes and populate the pt_regs for the remaining CPUs + * from register state data provided by firmware. + */ + if (fdh->crashing_cpu != FADUMP_CPU_UNKNOWN) + kernel_initiated = true; +#endif + + rc = opal_fadump_build_cpu_notes(fadump_conf, fdh); + if (rc) + return rc; + + /* + * We are done validating dump info and elfcore header is now ready + * to be exported. set elfcorehdr_addr so that vmcore module will + * export the elfcore header through '/proc/vmcore'. + */ + elfcorehdr_addr = fdh->elfcorehdr_addr; + + return rc; +} + +static void opal_fadump_region_show(struct fw_dump *fadump_conf, + struct seq_file *m) +{ + const struct opal_fadump_mem_struct *fdm_ptr; + u64 dumped_bytes = 0; + int i; + + if (fadump_conf->dump_active) + fdm_ptr = opal_fdm_active; + else + fdm_ptr = opal_fdm; + + for (i = 0; i < fdm_ptr->region_cnt; i++) { + /* + * Only regions that are registered for MPIPL + * would have dump data. + */ + if ((fadump_conf->dump_active) && + (i < fdm_ptr->registered_regions)) + dumped_bytes = fdm_ptr->rgn[i].size; + + seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ", + fdm_ptr->rgn[i].src, fdm_ptr->rgn[i].dest); + seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n", + fdm_ptr->rgn[i].size, dumped_bytes); + } + + /* Dump is active. Show reserved area start address. */ + if (fadump_conf->dump_active) { + seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n", + fadump_conf->reserve_dump_area_start); + } +} + +static void opal_fadump_trigger(struct fadump_crash_info_header *fdh, + const char *msg) +{ + int rc; + + /* + * Unlike on pSeries platform, logical CPU number is not provided + * with architected register state data. So, store the crashing + * CPU's PIR instead to plug the appropriate register data for + * crashing CPU in the vmcore file. 
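[Review note] On the capture side, the PIR stored below is exactly what opal_fadump_build_cpu_notes() above matches against each HDAT thread header; schematically, a fragment using only names from this patch:

	thread_pir = be32_to_cpu(thdr->pir);
	if (fdh->crashing_cpu == thread_pir) {
		/* Use the kernel-saved regs rather than the f/w-provided ones */
		note_buf = fadump_regs_to_elf_notes(note_buf, &fdh->regs);
	}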
+ */ + fdh->crashing_cpu = (u32)mfspr(SPRN_PIR); + + rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, msg); + if (rc == OPAL_UNSUPPORTED) { + pr_emerg("Reboot type %d not supported.\n", + OPAL_REBOOT_MPIPL); + } else if (rc == OPAL_HARDWARE) + pr_emerg("No backend support for MPIPL!\n"); +} + +static struct fadump_ops opal_fadump_ops = { + .fadump_init_mem_struct = opal_fadump_init_mem_struct, + .fadump_get_metadata_size = opal_fadump_get_metadata_size, + .fadump_setup_metadata = opal_fadump_setup_metadata, + .fadump_get_bootmem_min = opal_fadump_get_bootmem_min, + .fadump_register = opal_fadump_register, + .fadump_unregister = opal_fadump_unregister, + .fadump_invalidate = opal_fadump_invalidate, + .fadump_cleanup = opal_fadump_cleanup, + .fadump_process = opal_fadump_process, + .fadump_region_show = opal_fadump_region_show, + .fadump_trigger = opal_fadump_trigger, +}; + +void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) +{ + const __be32 *prop; + unsigned long dn; + u64 addr = 0; + int i, len; + s64 ret; + + /* + * Check if Firmware-Assisted Dump is supported. if yes, check + * if dump has been initiated on last reboot. + */ + dn = of_get_flat_dt_subnode_by_name(node, "dump"); + if (dn == -FDT_ERR_NOTFOUND) { + pr_debug("FADump support is missing!\n"); + return; + } + + if (!of_flat_dt_is_compatible(dn, "ibm,opal-dump")) { + pr_err("Support missing for this f/w version!\n"); + return; + } + + prop = of_get_flat_dt_prop(dn, "fw-load-area", &len); + if (prop) { + /* + * Each f/w load area is an (address,size) pair, + * 2 cells each, totalling 4 cells per range. + */ + for (i = 0; i < len / (sizeof(*prop) * 4); i++) { + u64 base, end; + + base = of_read_number(prop + (i * 4) + 0, 2); + end = base; + end += of_read_number(prop + (i * 4) + 2, 2); + if (end > OPAL_FADUMP_MIN_BOOT_MEM) { + pr_err("F/W load area: 0x%llx-0x%llx\n", + base, end); + pr_err("F/W version not supported!\n"); + return; + } + } + } + + fadump_conf->ops = &opal_fadump_ops; + fadump_conf->fadump_supported = 1; + + /* + * Firmware supports 32-bit field for size. Align it to PAGE_SIZE + * and request firmware to copy multiple kernel boot memory regions. + */ + fadump_conf->max_copy_size = _ALIGN_DOWN(U32_MAX, PAGE_SIZE); + + /* + * Check if dump has been initiated on last reboot. + */ + prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL); + if (!prop) + return; + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get Kernel metadata (%lld)\n", ret); + return; + } + + addr = be64_to_cpu(addr); + pr_debug("Kernel metadata addr: %llx\n", addr); + + opal_fdm_active = __va(addr); + if (opal_fdm_active->version != OPAL_FADUMP_VERSION) { + pr_warn("Supported kernel metadata version: %u, found: %d!\n", + OPAL_FADUMP_VERSION, opal_fdm_active->version); + pr_warn("WARNING: Kernel metadata format mismatch identified! 
Core file maybe corrupted..\n"); + } + + /* Kernel regions not registered with f/w for MPIPL */ + if (opal_fdm_active->registered_regions == 0) { + opal_fdm_active = NULL; + return; + } + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr); + if (addr) { + addr = be64_to_cpu(addr); + pr_debug("CPU metadata addr: %llx\n", addr); + opal_cpu_metadata = __va(addr); + } + + pr_info("Firmware-assisted dump is active.\n"); + fadump_conf->dump_active = 1; + opal_fadump_get_config(fadump_conf, opal_fdm_active); +} +#endif /* !CONFIG_PRESERVE_FA_DUMP */ diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h new file mode 100644 index 000000000000..f1e9ecf548c5 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-fadump.h @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Firmware-Assisted Dump support on POWER platform (OPAL). + * + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#ifndef _POWERNV_OPAL_FADUMP_H +#define _POWERNV_OPAL_FADUMP_H + +#include <asm/reg.h> + +/* + * With kernel & initrd loaded at 512MB (with 256MB size), enforce a minimum + * boot memory size of 768MB to ensure f/w loading kernel and initrd doesn't + * mess with crash'ed kernel's memory during MPIPL. + */ +#define OPAL_FADUMP_MIN_BOOT_MEM (0x30000000UL) + +/* + * OPAL FADump metadata structure format version + * + * OPAL FADump kernel metadata structure stores kernel metadata needed to + * register-for/process crash dump. Format version is used to keep a tab on + * the changes in the structure format. The changes, if any, to the format + * are expected to be minimal and backward compatible. + */ +#define OPAL_FADUMP_VERSION 0x1 + +/* + * OPAL FADump kernel metadata + * + * The address of this structure will be registered with f/w for retrieving + * and processing during crash dump. + */ +struct opal_fadump_mem_struct { + u8 version; + u8 reserved[3]; + u16 region_cnt; /* number of regions */ + u16 registered_regions; /* Regions registered for MPIPL */ + u64 fadumphdr_addr; + struct opal_mpipl_region rgn[FADUMP_MAX_MEM_REGS]; +} __packed; + +/* + * CPU state data + * + * CPU state data information is provided by f/w. The format for this data + * is defined in the HDAT spec. Version is used to keep a tab on the changes + * in this CPU state data format. Changes to this format are unlikely, but + * if there are any changes, please refer to latest HDAT specification. + */ +#define HDAT_FADUMP_CPU_DATA_VER 1 + +#define HDAT_FADUMP_CORE_INACTIVE (0x0F) + +/* HDAT thread header for register entries */ +struct hdat_fadump_thread_hdr { + __be32 pir; + /* 0x00 - 0x0F - The corresponding stop state of the core */ + u8 core_state; + u8 reserved[3]; + + __be32 offset; /* Offset to Register Entries array */ + __be32 ecnt; /* Number of entries */ + __be32 esize; /* Alloc size of each array entry in bytes */ + __be32 eactsz; /* Actual size of each array entry in bytes */ +} __packed; + +/* Register types populated by f/w */ +#define HDAT_FADUMP_REG_TYPE_GPR 0x01 +#define HDAT_FADUMP_REG_TYPE_SPR 0x02 + +/* ID numbers used by f/w while populating certain registers */ +#define HDAT_FADUMP_REG_ID_NIP 0x7D0 +#define HDAT_FADUMP_REG_ID_MSR 0x7D1 +#define HDAT_FADUMP_REG_ID_CCR 0x7D2 + +/* HDAT register entry. 
*/ +struct hdat_fadump_reg_entry { + __be32 reg_type; + __be32 reg_num; + __be64 reg_val; +} __packed; + +static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs, + u32 reg_type, u32 reg_num, + u64 reg_val) +{ + if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) { + if (reg_num < 32) + regs->gpr[reg_num] = reg_val; + return; + } + + switch (reg_num) { + case SPRN_CTR: + regs->ctr = reg_val; + break; + case SPRN_LR: + regs->link = reg_val; + break; + case SPRN_XER: + regs->xer = reg_val; + break; + case SPRN_DAR: + regs->dar = reg_val; + break; + case SPRN_DSISR: + regs->dsisr = reg_val; + break; + case HDAT_FADUMP_REG_ID_NIP: + regs->nip = reg_val; + break; + case HDAT_FADUMP_REG_ID_MSR: + regs->msr = reg_val; + break; + case HDAT_FADUMP_REG_ID_CCR: + regs->ccr = reg_val; + break; + } +} + +static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt, + unsigned int reg_entry_size, + bool cpu_endian, + struct pt_regs *regs) +{ + struct hdat_fadump_reg_entry *reg_entry; + u64 val; + int i; + + memset(regs, 0, sizeof(struct pt_regs)); + + for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) { + reg_entry = (struct hdat_fadump_reg_entry *)bufp; + val = (cpu_endian ? be64_to_cpu(reg_entry->reg_val) : + reg_entry->reg_val); + opal_fadump_set_regval_regnum(regs, + be32_to_cpu(reg_entry->reg_type), + be32_to_cpu(reg_entry->reg_num), + val); + } +} + +#endif /* _POWERNV_OPAL_FADUMP_H */ diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 186109bdd41b..e04b20625cb9 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -53,9 +53,9 @@ static void export_imc_mode_and_cmd(struct device_node *node, struct imc_pmu *pmu_ptr) { static u64 loc, *imc_mode_addr, *imc_cmd_addr; - int chip = 0, nid; char mode[16], cmd[16]; u32 cb_offset; + struct imc_mem_info *ptr = pmu_ptr->mem_info; imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root); @@ -69,20 +69,20 @@ static void export_imc_mode_and_cmd(struct device_node *node, if (of_property_read_u32(node, "cb_offset", &cb_offset)) cb_offset = IMC_CNTL_BLK_OFFSET; - for_each_node(nid) { - loc = (u64)(pmu_ptr->mem_info[chip].vbase) + cb_offset; + while (ptr->vbase != NULL) { + loc = (u64)(ptr->vbase) + cb_offset; imc_mode_addr = (u64 *)(loc + IMC_CNTL_BLK_MODE_OFFSET); - sprintf(mode, "imc_mode_%d", nid); + sprintf(mode, "imc_mode_%d", (u32)(ptr->id)); if (!imc_debugfs_create_x64(mode, 0600, imc_debugfs_parent, imc_mode_addr)) goto err; imc_cmd_addr = (u64 *)(loc + IMC_CNTL_BLK_CMD_OFFSET); - sprintf(cmd, "imc_cmd_%d", nid); + sprintf(cmd, "imc_cmd_%d", (u32)(ptr->id)); if (!imc_debugfs_create_x64(cmd, 0600, imc_debugfs_parent, imc_cmd_addr)) goto err; - chip++; + ptr++; } return; diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c index dc51d03c6370..d26da19a611f 100644 --- a/arch/powerpc/platforms/powernv/opal-msglog.c +++ b/arch/powerpc/platforms/powernv/opal-msglog.c @@ -29,23 +29,23 @@ struct memcons { static struct memcons *opal_memcons = NULL; -ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) +ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count) { const char *conbuf; ssize_t ret; size_t first_read = 0; uint32_t out_pos, avail; - if (!opal_memcons) + if (!mc) return -ENODEV; - out_pos = be32_to_cpu(READ_ONCE(opal_memcons->out_pos)); + out_pos = be32_to_cpu(READ_ONCE(mc->out_pos)); /* Now we've read out_pos, put a barrier in before 
 	smp_rmb();
 
-	conbuf = phys_to_virt(be64_to_cpu(opal_memcons->obuf_phys));
+	conbuf = phys_to_virt(be64_to_cpu(mc->obuf_phys));
 
 	/* When the buffer has wrapped, read from the out_pos marker to the end
 	 * of the buffer, and then read the remaining data as in the un-wrapped
@@ -53,7 +53,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
 	if (out_pos & MEMCONS_OUT_POS_WRAP) {
 
 		out_pos &= MEMCONS_OUT_POS_MASK;
-		avail = be32_to_cpu(opal_memcons->obuf_size) - out_pos;
+		avail = be32_to_cpu(mc->obuf_size) - out_pos;
 
 		ret = memory_read_from_buffer(to, count, &pos,
 				conbuf + out_pos, avail);
@@ -71,7 +71,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
 	}
 
 	/* Sanity check. The firmware should not do this to us. */
-	if (out_pos > be32_to_cpu(opal_memcons->obuf_size)) {
+	if (out_pos > be32_to_cpu(mc->obuf_size)) {
 		pr_err("OPAL: memory console corruption. Aborting read.\n");
 		return -EINVAL;
 	}
@@ -86,6 +86,11 @@ out:
 	return ret;
 }
 
+ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
+{
+	return memcons_copy(opal_memcons, to, pos, count);
+}
+
 static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj,
 				struct bin_attribute *bin_attr, char *to,
 				loff_t pos, size_t count)
@@ -98,32 +103,48 @@ static struct bin_attribute opal_msglog_attr = {
 	.read = opal_msglog_read
 };
 
-void __init opal_msglog_init(void)
+struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name)
 {
 	u64 mcaddr;
 	struct memcons *mc;
 
-	if (of_property_read_u64(opal_node, "ibm,opal-memcons", &mcaddr)) {
-		pr_warn("OPAL: Property ibm,opal-memcons not found, no message log\n");
-		return;
+	if (of_property_read_u64(node, mc_prop_name, &mcaddr)) {
+		pr_warn("%s property not found, no message log\n",
+			mc_prop_name);
+		goto out_err;
 	}
 
 	mc = phys_to_virt(mcaddr);
 	if (!mc) {
-		pr_warn("OPAL: memory console address is invalid\n");
-		return;
+		pr_warn("memory console address is invalid\n");
+		goto out_err;
 	}
 
 	if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) {
-		pr_warn("OPAL: memory console version is invalid\n");
-		return;
+		pr_warn("memory console version is invalid\n");
+		goto out_err;
 	}
 
-	/* Report maximum size */
-	opal_msglog_attr.size = be32_to_cpu(mc->ibuf_size) +
-		be32_to_cpu(mc->obuf_size);
+	return mc;
+
+out_err:
+	return NULL;
+}
+
+u32 memcons_get_size(struct memcons *mc)
+{
+	return be32_to_cpu(mc->ibuf_size) + be32_to_cpu(mc->obuf_size);
+}
+
+void __init opal_msglog_init(void)
+{
+	opal_memcons = memcons_init(opal_node, "ibm,opal-memcons");
+	if (!opal_memcons) {
+		pr_warn("OPAL: memcons failed to load from ibm,opal-memcons\n");
+		return;
+	}
 
-	opal_memcons = mc;
+	opal_msglog_attr.size = memcons_get_size(opal_memcons);
 }
 
 void __init opal_msglog_sysfs_init(void)
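The subtle part of memcons_copy() above is the wrap case: once MEMCONS_OUT_POS_WRAP is set, the oldest data sits between out_pos and the end of the output buffer and has to be copied before the bytes in front of out_pos. A minimal userspace model of just that ordering (toy buffer and constants; the kernel version additionally validates out_pos against obuf_size, as shown above):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define OUT_POS_WRAP 0x80000000u
    #define OUT_POS_MASK 0x00ffffffu

    /* Copy up to 'len' bytes of console history, oldest first. */
    static size_t ring_copy(const char *buf, uint32_t buf_size,
                            uint32_t out_pos_raw, char *to, size_t len)
    {
        uint32_t out_pos = out_pos_raw & OUT_POS_MASK;
        size_t copied = 0;

        if (out_pos_raw & OUT_POS_WRAP) {
            /* Wrapped: the tail of the buffer holds the oldest data. */
            size_t avail = buf_size - out_pos;
            size_t n = avail < len ? avail : len;

            memcpy(to, buf + out_pos, n);
            copied = n;
        }
        /* Then (or only, if never wrapped) the data before out_pos. */
        size_t n = out_pos < len - copied ? out_pos : len - copied;

        memcpy(to + copied, buf, n);
        return copied + n;
    }

    int main(void)
    {
        /* Writer wrapped and is now at offset 3: "ABCD" is oldest. */
        char buf[7] = { 'E', 'F', 'G', 'A', 'B', 'C', 'D' };
        char out[8] = { 0 };
        size_t n = ring_copy(buf, 7, OUT_POS_WRAP | 3, out, 7);

        printf("%zu: %s\n", n, out);   /* prints "7: ABCDEFG" */
        return 0;
    }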
diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c
index e072bf157d62..45f4223a790f 100644
--- a/arch/powerpc/platforms/powernv/opal-prd.c
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -342,7 +342,7 @@ static int opal_prd_msg_notifier(struct notifier_block *nb,
 	int msg_size, item_size;
 	unsigned long flags;
 
-	if (msg_type != OPAL_MSG_PRD)
+	if (msg_type != OPAL_MSG_PRD && msg_type != OPAL_MSG_PRD2)
 		return 0;
 
 	/* Calculate total size of the message and item we need to store. The
@@ -393,6 +393,12 @@ static int opal_prd_probe(struct platform_device *pdev)
 		return rc;
 	}
 
+	rc = opal_message_notifier_register(OPAL_MSG_PRD2, &opal_prd_event_nb);
+	if (rc) {
+		pr_err("Couldn't register PRD2 event notifier\n");
+		return rc;
+	}
+
 	rc = misc_register(&opal_prd_dev);
 	if (rc) {
 		pr_err("failed to register miscdev\n");
diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
index 66430eebe869..fd510d961b8c 100644
--- a/arch/powerpc/platforms/powernv/opal-xscom.c
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -1,7 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * PowerNV LPC bus handling.
+ * PowerNV SCOM bus debugfs interface
  *
+ * Copyright 2010 Benjamin Herrenschmidt, IBM Corp
+ *                <benh@kernel.crashing.org>
+ *     and David Gibson, IBM Corporation.
  * Copyright 2013 IBM Corp.
  */
 
@@ -10,62 +13,13 @@
 #include <linux/bug.h>
 #include <linux/gfp.h>
 #include <linux/slab.h>
+#include <linux/uaccess.h>
 
 #include <asm/machdep.h>
 #include <asm/firmware.h>
 #include <asm/opal.h>
-#include <asm/scom.h>
-
-/*
- * We could probably fit that inside the scom_map_t
- * which is a void* after all but it's really too ugly
- * so let's kmalloc it for now
- */
-struct opal_scom_map {
-	uint32_t chip;
-	uint64_t addr;
-};
-
-static scom_map_t opal_scom_map(struct device_node *dev, u64 reg, u64 count)
-{
-	struct opal_scom_map *m;
-	const __be32 *gcid;
-
-	if (!of_get_property(dev, "scom-controller", NULL)) {
-		pr_err("%s: device %pOF is not a SCOM controller\n",
-		       __func__, dev);
-		return SCOM_MAP_INVALID;
-	}
-	gcid = of_get_property(dev, "ibm,chip-id", NULL);
-	if (!gcid) {
-		pr_err("%s: device %pOF has no ibm,chip-id\n",
-		       __func__, dev);
-		return SCOM_MAP_INVALID;
-	}
-	m = kmalloc(sizeof(*m), GFP_KERNEL);
-	if (!m)
-		return NULL;
-	m->chip = be32_to_cpup(gcid);
-	m->addr = reg;
-
-	return (scom_map_t)m;
-}
-
-static void opal_scom_unmap(scom_map_t map)
-{
-	kfree(map);
-}
-
-static int opal_xscom_err_xlate(int64_t rc)
-{
-	switch(rc) {
-	case 0:
-		return 0;
-	/* Add more translations if necessary */
-	default:
-		return -EIO;
-	}
-}
+#include <asm/debugfs.h>
+#include <asm/prom.h>
 
 static u64 opal_scom_unmangle(u64 addr)
 {
@@ -98,39 +52,154 @@ static u64 opal_scom_unmangle(u64 addr)
 	return addr;
 }
 
-static int opal_scom_read(scom_map_t map, u64 reg, u64 *value)
+static int opal_scom_read(uint32_t chip, uint64_t addr, u64 reg, u64 *value)
 {
-	struct opal_scom_map *m = map;
 	int64_t rc;
 	__be64 v;
 
-	reg = opal_scom_unmangle(m->addr + reg);
-	rc = opal_xscom_read(m->chip, reg, (__be64 *)__pa(&v));
+	reg = opal_scom_unmangle(addr + reg);
+	rc = opal_xscom_read(chip, reg, (__be64 *)__pa(&v));
+	if (rc) {
+		*value = 0xfffffffffffffffful;
+		return -EIO;
+	}
 	*value = be64_to_cpu(v);
-	return opal_xscom_err_xlate(rc);
+	return 0;
 }
 
-static int opal_scom_write(scom_map_t map, u64 reg, u64 value)
+static int opal_scom_write(uint32_t chip, uint64_t addr, u64 reg, u64 value)
 {
-	struct opal_scom_map *m = map;
 	int64_t rc;
 
-	reg = opal_scom_unmangle(m->addr + reg);
-	rc = opal_xscom_write(m->chip, reg, value);
-	return opal_xscom_err_xlate(rc);
+	reg = opal_scom_unmangle(addr + reg);
+	rc = opal_xscom_write(chip, reg, value);
+	if (rc)
+		return -EIO;
+	return 0;
+}
+
+struct scom_debug_entry {
+	u32 chip;
+	struct debugfs_blob_wrapper path;
+	char name[16];
+};
+
+static ssize_t scom_debug_read(struct file *filp, char __user *ubuf,
+			       size_t count, loff_t *ppos)
+{
+	struct scom_debug_entry *ent = filp->private_data;
+	u64 __user *ubuf64 = (u64 __user *)ubuf;
+	loff_t off = *ppos;
+	ssize_t done = 0;
+	u64 reg, reg_base, reg_cnt, val;
+	int rc;
+
+	if (off < 0 || (off & 7) || (count & 7))
+		return -EINVAL;
+	reg_base = off >> 3;
+	reg_cnt = count >> 3;
+
+	for (reg = 0; reg < reg_cnt; reg++) {
+		rc = opal_scom_read(ent->chip, reg_base, reg, &val);
+		if (!rc)
+			rc = put_user(val, ubuf64);
+		if (rc) {
+			if (!done)
+				done = rc;
+			break;
+		}
+		ubuf64++;
+		*ppos += 8;
+		done += 8;
+	}
+	return done;
+}
+
+static ssize_t scom_debug_write(struct file *filp, const char __user *ubuf,
+				size_t count, loff_t *ppos)
+{
+	struct scom_debug_entry *ent = filp->private_data;
+	u64 __user *ubuf64 = (u64 __user *)ubuf;
+	loff_t off = *ppos;
+	ssize_t done = 0;
+	u64 reg, reg_base, reg_cnt, val;
+	int rc;
+
+	if (off < 0 || (off & 7) || (count & 7))
+		return -EINVAL;
+	reg_base = off >> 3;
+	reg_cnt = count >> 3;
+
+	for (reg = 0; reg < reg_cnt; reg++) {
+		rc = get_user(val, ubuf64);
+		if (!rc)
+			rc = opal_scom_write(ent->chip, reg_base, reg, val);
+		if (rc) {
+			if (!done)
+				done = rc;
+			break;
+		}
+		ubuf64++;
+		done += 8;
+	}
+	return done;
 }
 
-static const struct scom_controller opal_scom_controller = {
-	.map	= opal_scom_map,
-	.unmap	= opal_scom_unmap,
-	.read	= opal_scom_read,
-	.write	= opal_scom_write
+static const struct file_operations scom_debug_fops = {
+	.read =		scom_debug_read,
+	.write =	scom_debug_write,
+	.open =		simple_open,
+	.llseek =	default_llseek,
 };
 
-static int opal_xscom_init(void)
+static int scom_debug_init_one(struct dentry *root, struct device_node *dn,
+			       int chip)
 {
-	if (firmware_has_feature(FW_FEATURE_OPAL))
-		scom_init(&opal_scom_controller);
+	struct scom_debug_entry *ent;
+	struct dentry *dir;
+
+	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
+	if (!ent)
+		return -ENOMEM;
+
+	ent->chip = chip;
+	snprintf(ent->name, 16, "%08x", chip);
+	ent->path.data = (void *)kasprintf(GFP_KERNEL, "%pOF", dn);
+	ent->path.size = strlen((char *)ent->path.data);
+
+	dir = debugfs_create_dir(ent->name, root);
+	if (!dir) {
+		kfree(ent->path.data);
+		kfree(ent);
+		return -1;
+	}
+
+	debugfs_create_blob("devspec", 0400, dir, &ent->path);
+	debugfs_create_file("access", 0600, dir, ent, &scom_debug_fops);
 
 	return 0;
 }
-machine_arch_initcall(powernv, opal_xscom_init);
+
+static int scom_debug_init(void)
+{
+	struct device_node *dn;
+	struct dentry *root;
+	int chip, rc;
+
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
+		return 0;
+
+	root = debugfs_create_dir("scom", powerpc_debugfs_root);
+	if (!root)
+		return -1;
+
+	rc = 0;
+	for_each_node_with_property(dn, "scom-controller") {
+		chip = of_get_ibm_chip_id(dn);
+		WARN_ON(chip == -1);
+		rc |= scom_debug_init_one(root, dn, chip);
+	}
+
+	return rc;
+}
+device_initcall(scom_debug_init);
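One consequence of the file_operations wiring above: userspace addresses SCOM registers purely through the file offset, with register N occupying bytes [N*8, N*8+8) of the per-chip access file, and any offset or length that is not a multiple of 8 failing with EINVAL. A minimal reader built on that contract (the debugfs mount point, chip directory name, and register number are illustrative; the directory is named after the ibm,chip-id value, per scom_debug_init_one() above):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        /* One sub-directory per scom-controller node, named %08x. */
        const char *path =
            "/sys/kernel/debug/powerpc/scom/00000000/access";
        uint64_t reg = 0xf000f;   /* illustrative register number */
        uint64_t val;
        int fd = open(path, O_RDONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        /* Register 'reg' lives at byte offset reg * 8. */
        if (pread(fd, &val, sizeof(val), reg * 8) != sizeof(val)) {
            perror("pread");
            close(fd);
            return 1;
        }
        printf("scom 0x%llx = 0x%016llx\n",
               (unsigned long long)reg, (unsigned long long)val);
        close(fd);
        return 0;
    }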
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index aba443be7daa..38e90270280b 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -58,6 +58,8 @@ static DEFINE_SPINLOCK(opal_write_lock);
 static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX];
 static uint32_t opal_heartbeat;
 static struct task_struct *kopald_tsk;
+static struct opal_msg *opal_msg;
+static u32 opal_msg_size __ro_after_init;
 
 void opal_configure_cores(void)
 {
@@ -271,14 +273,9 @@ static void opal_message_do_notify(uint32_t msg_type, void *msg)
 static void opal_handle_message(void)
 {
 	s64 ret;
-	/*
-	 * TODO: pre-allocate a message buffer depending on opal-msg-size
-	 * value in /proc/device-tree.
-	 */
-	static struct opal_msg msg;
 	u32 type;
 
-	ret = opal_get_msg(__pa(&msg), sizeof(msg));
+	ret = opal_get_msg(__pa(opal_msg), opal_msg_size);
 	/* No opal message pending. */
 	if (ret == OPAL_RESOURCE)
 		return;
@@ -290,14 +287,14 @@ static void opal_handle_message(void)
 		return;
 	}
 
-	type = be32_to_cpu(msg.msg_type);
+	type = be32_to_cpu(opal_msg->msg_type);
 
 	/* Sanity check */
 	if (type >= OPAL_MSG_TYPE_MAX) {
 		pr_warn_once("%s: Unknown message type: %u\n", __func__, type);
 		return;
 	}
-	opal_message_do_notify(type, (void *)&msg);
+	opal_message_do_notify(type, (void *)opal_msg);
 }
 
 static irqreturn_t opal_message_notify(int irq, void *data)
@@ -306,10 +303,24 @@ static irqreturn_t opal_message_notify(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static int __init opal_message_init(void)
+static int __init opal_message_init(struct device_node *opal_node)
 {
 	int ret, i, irq;
 
+	ret = of_property_read_u32(opal_node, "opal-msg-size", &opal_msg_size);
+	if (ret) {
+		pr_notice("Failed to read opal-msg-size property\n");
+		opal_msg_size = sizeof(struct opal_msg);
+	}
+
+	opal_msg = kmalloc(opal_msg_size, GFP_KERNEL);
+	if (!opal_msg) {
+		opal_msg_size = sizeof(struct opal_msg);
+		/* Try to allocate fixed message size */
+		opal_msg = kmalloc(opal_msg_size, GFP_KERNEL);
+		BUG_ON(opal_msg == NULL);
+	}
+
 	for (i = 0; i < OPAL_MSG_TYPE_MAX; i++)
 		ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]);
 
@@ -705,7 +716,10 @@ static ssize_t symbol_map_read(struct file *fp, struct kobject *kobj,
 				       bin_attr->size);
 }
 
-static BIN_ATTR_RO(symbol_map, 0);
+static struct bin_attribute symbol_map_attr = {
+	.attr = {.name = "symbol_map", .mode = 0400},
+	.read = symbol_map_read
+};
 
 static void opal_export_symmap(void)
 {
@@ -722,10 +736,10 @@ static void opal_export_symmap(void)
 		return;
 
 	/* Setup attributes */
-	bin_attr_symbol_map.private = __va(be64_to_cpu(syms[0]));
-	bin_attr_symbol_map.size = be64_to_cpu(syms[1]);
+	symbol_map_attr.private = __va(be64_to_cpu(syms[0]));
+	symbol_map_attr.size = be64_to_cpu(syms[1]);
 
-	rc = sysfs_create_bin_file(opal_kobj, &bin_attr_symbol_map);
+	rc = sysfs_create_bin_file(opal_kobj, &symbol_map_attr);
 	if (rc)
 		pr_warn("Error %d creating OPAL symbols file\n", rc);
 }
@@ -910,7 +924,7 @@ static int __init opal_init(void)
 	}
 
 	/* Initialise OPAL messaging system */
-	opal_message_init();
+	opal_message_init(opal_node);
 
 	/* Initialise OPAL asynchronous completion interface */
 	opal_async_comp_init();
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index e28f03e1eb5e..a0b9c0c23ed2 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -36,7 +36,8 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
 	struct page *tce_mem = NULL;
 	__be64 *addr;
 
-	tce_mem = alloc_pages_node(nid, GFP_KERNEL, shift - PAGE_SHIFT);
+	tce_mem = alloc_pages_node(nid, GFP_ATOMIC | __GFP_NOWARN,
+			shift - PAGE_SHIFT);
 	if (!tce_mem) {
 		pr_err("Failed to allocate a TCE memory, level shift=%d\n",
 				shift);
@@ -48,6 +49,9 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
 	return addr;
 }
 
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+		unsigned long size, unsigned int levels);
+
 static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
 {
 	__be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base;
@@ -57,9 +61,9 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
 
 	while (level) {
 		int n = (idx & mask) >> (level * shift);
-		unsigned long tce;
+		unsigned long oldtce, tce = be64_to_cpu(READ_ONCE(tmp[n]));
 
-		if (tmp[n] == 0) {
+		if (!tce) {
 			__be64 *tmp2;
 
 			if (!alloc)
@@ -70,10 +74,15 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
 			if (!tmp2)
 				return NULL;
 
-			tmp[n] = cpu_to_be64(__pa(tmp2) |
-					TCE_PCI_READ | TCE_PCI_WRITE);
+			tce = __pa(tmp2) | TCE_PCI_READ | TCE_PCI_WRITE;
+			oldtce = be64_to_cpu(cmpxchg(&tmp[n], 0,
+					cpu_to_be64(tce)));
+			if (oldtce) {
+				pnv_pci_ioda2_table_do_free_pages(tmp2,
+					ilog2(tbl->it_level_size) + 3, 1);
+				tce = oldtce;
+			}
 		}
-		tce = be64_to_cpu(tmp[n]);
 
 		tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
 		idx &= ~mask;
@@ -161,6 +170,9 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 
 		if (ptce)
 			*ptce = cpu_to_be64(0);
+		else
+			/* Skip the rest of the level */
+			i |= tbl->it_level_size - 1;
 	}
 }
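The while loop in pnv_tce() above is an N-level radix walk: the index is sliced into one field per level, top level first, with the mask shifting down by 'shift' bits per iteration. The arithmetic is easy to check in isolation (fixed example geometry; no allocation, cmpxchg, or real-mode concerns here):

    #include <stdint.h>
    #include <stdio.h>

    /* Mirror the pnv_tce() index walk: 'indirect' indirect levels,
     * each level holding 1 << shift entries. */
    static void decompose(uint64_t idx, int indirect, int shift)
    {
        uint64_t mask = ((1ULL << shift) - 1) << (indirect * shift);
        int level = indirect;

        while (level) {
            unsigned int n = (idx & mask) >> (level * shift);

            printf("level %d -> entry %u\n", level, n);
            idx &= ~mask;
            mask >>= shift;
            level--;
        }
        printf("leaf    -> entry %llu\n", (unsigned long long)idx);
    }

    int main(void)
    {
        /* Two indirect levels of 512 entries (shift = 9):
         * prints entries 4 and 363, then leaf entry 135, because
         * 1234567 = 4*2^18 + 363*2^9 + 135. */
        decompose(1234567, 2, 9);
        return 0;
    }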
@@ -260,7 +272,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	unsigned int table_shift = max_t(unsigned int, entries_shift + 3,
 			PAGE_SHIFT);
 	const unsigned long tce_table_size = 1UL << table_shift;
-	unsigned int tmplevels = levels;
 
 	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
 		return -EINVAL;
@@ -268,9 +279,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	if (!is_power_of_2(window_size))
 		return -EINVAL;
 
-	if (alloc_userspace_copy && (window_size > (1ULL << 32)))
-		tmplevels = 1;
-
 	/* Adjust direct table size from window_size and levels */
 	entries_shift = (entries_shift + levels - 1) / levels;
 	level_shift = entries_shift + 3;
@@ -281,7 +289,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 
 	/* Allocate TCE table */
 	addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
-			tmplevels, tce_table_size, &offset, &total_allocated);
+			1, tce_table_size, &offset, &total_allocated);
 
 	/* addr==NULL means that the first level allocation failed */
 	if (!addr)
@@ -292,18 +300,18 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	 * we did not allocate as much as we wanted,
 	 * release partially allocated table.
 	 */
-	if (tmplevels == levels && offset < tce_table_size)
+	if (levels == 1 && offset < tce_table_size)
 		goto free_tces_exit;
 
 	/* Allocate userspace view of the TCE table */
 	if (alloc_userspace_copy) {
 		offset = 0;
 		uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
-				tmplevels, tce_table_size, &offset,
+				1, tce_table_size, &offset,
 				&total_allocated_uas);
 		if (!uas)
 			goto free_tces_exit;
-		if (tmplevels == levels && (offset < tce_table_size ||
+		if (levels == 1 && (offset < tce_table_size ||
 				total_allocated_uas != total_allocated))
 			goto free_uas_exit;
 	}
@@ -318,7 +326,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 
 	pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n",
 			window_size, tce_table_size, bus_offset, tbl->it_base,
-			tbl->it_userspace, tmplevels, levels);
+			tbl->it_userspace, 1, levels);
 
 	return 0;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index d8080558d020..c28d0d9b7ee0 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1939,26 +1939,12 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
 }
 
 #ifdef CONFIG_IOMMU_API
-static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
-		unsigned long *hpa, enum dma_data_direction *direction)
+/* Common for IODA1 and IODA2 */
+static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction,
+		bool realmode)
 {
-	long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
-
-	if (!ret)
-		pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false);
-
-	return ret;
-}
-
-static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
-		unsigned long *hpa, enum dma_data_direction *direction)
-{
-	long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
-
-	if (!ret)
-		pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
-
-	return ret;
+	return pnv_tce_xchg(tbl, index, hpa, direction, !realmode);
 }
 #endif
 
@@ -1973,8 +1959,8 @@ static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
 
 static struct iommu_table_ops pnv_ioda1_iommu_ops = {
 	.set = pnv_ioda1_tce_build,
 #ifdef CONFIG_IOMMU_API
-	.exchange = pnv_ioda1_tce_xchg,
-	.exchange_rm = pnv_ioda1_tce_xchg_rm,
+	.xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
+	.tce_kill = pnv_pci_p7ioc_tce_invalidate,
 	.useraddrptr = pnv_tce_useraddrptr,
 #endif
 	.clear = pnv_ioda1_tce_free,
@@ -2103,30 +2089,6 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
 	return ret;
 }
 
-#ifdef CONFIG_IOMMU_API
-static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
-		unsigned long *hpa, enum dma_data_direction *direction)
-{
-	long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
-
-	if (!ret)
-		pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
-
-	return ret;
-}
-
-static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
-		unsigned long *hpa, enum dma_data_direction *direction)
-{
-	long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
-
-	if (!ret)
-		pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);
-
-	return ret;
-}
-#endif
-
 static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
 		long npages)
 {
@@ -2138,8 +2100,8 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
 
 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
 	.set = pnv_ioda2_tce_build,
 #ifdef CONFIG_IOMMU_API
-	.exchange = pnv_ioda2_tce_xchg,
-	.exchange_rm = pnv_ioda2_tce_xchg_rm,
+	.xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
+	.tce_kill = pnv_pci_ioda2_tce_invalidate,
 	.useraddrptr = pnv_tce_useraddrptr,
 #endif
 	.clear = pnv_ioda2_tce_free,
@@ -2303,7 +2265,7 @@ found:
 	tbl->it_ops = &pnv_ioda1_iommu_ops;
 	pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
 	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
-	iommu_init_table(tbl, phb->hose->node);
+	iommu_init_table(tbl, phb->hose->node, 0, 0);
 
 	if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
@@ -2420,6 +2382,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 {
 	struct iommu_table *tbl = NULL;
 	long rc;
+	unsigned long res_start, res_end;
 
 	/*
 	 * crashkernel= specifies the kdump kernel's maximum memory at
@@ -2433,19 +2396,46 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 	 * DMA window can be larger than available memory, which will
 	 * cause errors later.
 	 */
-	const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory);
+	const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1);
+
+	/*
+	 * We create the default window as big as we can. The constraint is
+	 * the max order of allocation possible. The TCE table is likely to
+	 * end up being multilevel and with on-demand allocation in place,
+	 * the initial use is not going to be huge as the default window aims
+	 * to support crippled devices (i.e. not fully 64bit DMAble) only.
+	 */
+	/* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */
+	const u64 window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory);
+	/* Each TCE level cannot exceed maxblock so go multilevel if needed */
+	unsigned long tces_order = ilog2(window_size >> PAGE_SHIFT);
+	unsigned long tcelevel_order = ilog2(maxblock >> 3);
+	unsigned int levels = tces_order / tcelevel_order;
+
+	if (tces_order % tcelevel_order)
+		levels += 1;
+	/*
+	 * We try to stick to default levels (which is >1 at the moment) in
+	 * order to save memory by relying on on-demand TCE level allocation.
+	 */
+	levels = max_t(unsigned int, levels, POWERNV_IOMMU_DEFAULT_LEVELS);
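Everything feeding the window size and level count here is plain arithmetic, so it can be sanity-checked outside the kernel. This recomputation assumes one plausible ppc64 configuration (64K pages with an arch MAX_ORDER of 9, and an illustrative 64GB max_memory; adjust for the actual config): those inputs clamp the window to memory size and land on the default of two levels.

    #include <stdint.h>
    #include <stdio.h>

    static unsigned int ilog2_u64(uint64_t v)
    {
        unsigned int r = 0;

        while (v >>= 1)
            r++;
        return r;
    }

    int main(void)
    {
        /* Assumed config: 64K pages, MAX_ORDER = 9. */
        const unsigned int page_shift = 16;
        const uint64_t maxblock = 1ULL << (page_shift + 9 - 1);
        const uint64_t max_memory = 64ULL << 30;   /* illustrative */

        /* it_map needs 1 bit per IOMMU page, hence the factor of 8. */
        uint64_t window = (maxblock * 8) << page_shift;

        if (window > max_memory)
            window = max_memory;

        unsigned long tces_order = ilog2_u64(window >> page_shift);
        unsigned long tcelevel_order = ilog2_u64(maxblock >> 3);
        unsigned int levels = tces_order / tcelevel_order;

        if (tces_order % tcelevel_order)
            levels += 1;
        if (levels < 2)   /* POWERNV_IOMMU_DEFAULT_LEVELS */
            levels = 2;

        /* Prints: window = 0x1000000000, levels = 2 */
        printf("window = 0x%llx, levels = %u\n",
               (unsigned long long)window, levels);
        return 0;
    }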
 
-	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
-			IOMMU_PAGE_SHIFT_4K,
-			window_size,
-			POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl);
+	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, PAGE_SHIFT,
+			window_size, levels, false, &tbl);
 	if (rc) {
 		pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
 				rc);
 		return rc;
 	}
 
-	iommu_init_table(tbl, pe->phb->hose->node);
+	/* We use top part of 32bit space for MMIO so exclude it from DMA */
+	res_start = 0;
+	res_end = 0;
+	if (window_size > pe->phb->ioda.m32_pci_base) {
+		res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
+		res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
+	}
+	iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end);
 
 	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
 	if (rc) {
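The reservation above is what keeps the bigger default window safe: the TCEs covering the 32-bit MMIO hole are handed to iommu_init_table() as a reserved range, so the IOVA allocator never hands out addresses that collide with MMIO. A quick numeric check with illustrative values (2GB m32_pci_base, 64K IOMMU pages, and the 64GB window from the previous sketch):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Illustrative values only. */
        const uint64_t m32_pci_base = 2ULL << 30;
        const uint64_t sz_4g = 4ULL << 30;
        const unsigned int it_page_shift = 16;
        const uint64_t window_size = 64ULL << 30;
        uint64_t res_start = 0, res_end = 0;

        if (window_size > m32_pci_base) {
            res_start = m32_pci_base >> it_page_shift;
            res_end = (window_size < sz_4g ? window_size : sz_4g)
                    >> it_page_shift;
        }
        /* Prints: reserved TCEs 0x8000..0x10000 */
        printf("reserved TCEs 0x%llx..0x%llx\n",
               (unsigned long long)res_start,
               (unsigned long long)res_end);
        return 0;
    }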
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 6104418c9ad5..2825d004dece 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -54,7 +54,8 @@ int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id)
 			break;
 		}
 
-		if (!of_device_is_compatible(parent, "ibm,ioda2-phb")) {
+		if (!of_device_is_compatible(parent, "ibm,ioda2-phb") &&
+		    !of_device_is_compatible(parent, "ibm,ioda3-phb")) {
 			of_node_put(parent);
 			continue;
 		}
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 469c24463247..f914f0b14e4e 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -219,7 +219,7 @@ extern struct iommu_table_group *pnv_npu_compound_attach(
 		struct pnv_ioda_pe *pe);
 
 /* pci-ioda-tce.c */
-#define POWERNV_IOMMU_DEFAULT_LEVELS	1
+#define POWERNV_IOMMU_DEFAULT_LEVELS	2
 #define POWERNV_IOMMU_MAX_LEVELS	5
 
 extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index fd4a1c5a6369..1aa51c4fa904 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -30,4 +30,9 @@ extern void opal_event_shutdown(void);
 
 bool cpu_core_split_required(void);
 
+struct memcons;
+ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count);
+u32 memcons_get_size(struct memcons *mc);
+struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name);
+
 #endif /* _POWERNV_H */
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index a5e52f9eed3c..83498604d322 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -24,6 +24,7 @@
 #include <linux/bug.h>
 #include <linux/pci.h>
 #include <linux/cpufreq.h>
+#include <linux/memblock.h>
 
 #include <asm/machdep.h>
 #include <asm/firmware.h>
@@ -166,6 +167,14 @@ static void __init pnv_init(void)
 	else
 #endif
 		add_preferred_console("hvc", 0, NULL);
+
+	if (!radix_enabled()) {
+		int i;
+
+		/* Allocate per cpu area to save old slb contents during MCE */
+		for_each_possible_cpu(i)
+			paca_ptrs[i]->mce_faulty_slbs =
+					memblock_alloc_node(mmu_slb_size,
+						__alignof__(*paca_ptrs[i]->mce_faulty_slbs),
+						cpu_to_node(i));
+	}
 }
 
 static void __init pnv_init_IRQ(void)
diff --git a/arch/powerpc/platforms/powernv/ultravisor.c b/arch/powerpc/platforms/powernv/ultravisor.c
new file mode 100644
index 000000000000..e4a00ad06f9d
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/ultravisor.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ultravisor high level interfaces
+ *
+ * Copyright 2019, IBM Corporation.
+ *
+ */
+#include <linux/init.h>
+#include <linux/printk.h>
+#include <linux/of_fdt.h>
+#include <linux/of.h>
+
+#include <asm/ultravisor.h>
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+
+#include "powernv.h"
+
+static struct kobject *ultravisor_kobj;
+
+int __init early_init_dt_scan_ultravisor(unsigned long node, const char *uname,
+					 int depth, void *data)
+{
+	if (!of_flat_dt_is_compatible(node, "ibm,ultravisor"))
+		return 0;
+
+	powerpc_firmware_features |= FW_FEATURE_ULTRAVISOR;
+	pr_debug("Ultravisor detected!\n");
+	return 1;
+}
+
+static struct memcons *uv_memcons;
+
+static ssize_t uv_msglog_read(struct file *file, struct kobject *kobj,
+			      struct bin_attribute *bin_attr, char *to,
+			      loff_t pos, size_t count)
+{
+	return memcons_copy(uv_memcons, to, pos, count);
+}
+
+static struct bin_attribute uv_msglog_attr = {
+	.attr = {.name = "msglog", .mode = 0400},
+	.read = uv_msglog_read
+};
+
+static int __init uv_init(void)
+{
+	struct device_node *node;
+
+	if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+		return 0;
+
+	node = of_find_compatible_node(NULL, NULL, "ibm,uv-firmware");
+	if (!node)
+		return -ENODEV;
+
+	uv_memcons = memcons_init(node, "memcons");
+	if (!uv_memcons)
+		return -ENOENT;
+
+	uv_msglog_attr.size = memcons_get_size(uv_memcons);
+
+	ultravisor_kobj = kobject_create_and_add("ultravisor", firmware_kobj);
+	if (!ultravisor_kobj)
+		return -ENOMEM;
+
+	return sysfs_create_bin_file(ultravisor_kobj, &uv_msglog_attr);
+}
+machine_subsys_initcall(powernv, uv_init);
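Once uv_init() has run, the Ultravisor's in-memory console is a read-only binary attribute under firmware_kobj, i.e. /sys/firmware/ultravisor/msglog on a booted system, and its read handler funnels into the same memcons_copy() shared with the OPAL msglog. Reading it is ordinary file I/O:

    #include <stdio.h>

    int main(void)
    {
        /* Exposed by the bin_attribute registered in uv_init(). */
        FILE *f = fopen("/sys/firmware/ultravisor/msglog", "r");
        char buf[4096];
        size_t n;

        if (!f) {
            perror("fopen");
            return 1;
        }
        while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
            fwrite(buf, 1, n, stdout);
        fclose(f);
        return 0;
    }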