author		Linus Torvalds <torvalds@linux-foundation.org>	2019-09-20 21:48:06 +0300
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-09-20 21:48:06 +0300
commit		45824fc0da6e46cc5d563105e1eaaf3098a686f9 (patch)
tree		8e57c1f18104ed5f0d74d9eed9dc0365b3c137b8 /arch/powerpc/kernel
parent		8c2b418c3f95a488f5226870eee68574d323f0f8 (diff)
parent		d9101bfa6adc831bda8836c4d774820553c14942 (diff)
download	linux-45824fc0da6e46cc5d563105e1eaaf3098a686f9.tar.xz
Merge tag 'powerpc-5.4-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux
Pull powerpc updates from Michael Ellerman:
"This is a bit late, partly due to me travelling, and partly due to a
power outage knocking out some of my test systems *while* I was
travelling.
- Initial support for running on a system with an Ultravisor, which
is software that runs below the hypervisor and protects guests
against some attacks by the hypervisor.
- Support for building the kernel to run as a "Secure Virtual
Machine", ie. as a guest capable of running on a system with an
Ultravisor.
- Some changes to our DMA code on bare metal, to allow devices with
medium-sized DMA masks (> 32 && < 59 bits) to use more than 2GB of
DMA space (a driver-side sketch follows the quoted message).
- Support for firmware assisted crash dumps on bare metal (powernv).
- Two series fixing bugs in and refactoring our PCI EEH code.
- A large series refactoring our exception entry code to use gas
macros, both to make it more readable and also enable some future
optimisations.
As well as many cleanups and other minor features & fixups.
Thanks to: Adam Zerella, Alexey Kardashevskiy, Alistair Popple, Andrew
Donnellan, Aneesh Kumar K.V, Anju T Sudhakar, Anshuman Khandual,
Balbir Singh, Benjamin Herrenschmidt, Cédric Le Goater, Christophe
JAILLET, Christophe Leroy, Christopher M. Riedl, Christoph Hellwig,
Claudio Carvalho, Daniel Axtens, David Gibson, David Hildenbrand,
Desnes A. Nunes do Rosario, Ganesh Goudar, Gautham R. Shenoy, Greg
Kurz, Guerney Hunt, Gustavo Romero, Halil Pasic, Hari Bathini, Joakim
Tjernlund, Jonathan Neuschafer, Jordan Niethe, Leonardo Bras, Lianbo
Jiang, Madhavan Srinivasan, Mahesh Salgaonkar,
Masahiro Yamada, Maxiwell S. Garcia, Michael Anderson, Nathan
Chancellor, Nathan Lynch, Naveen N. Rao, Nicholas Piggin, Oliver
O'Halloran, Qian Cai, Ram Pai, Ravi Bangoria, Reza Arbab, Ryan Grimm,
Sam Bobroff, Santosh Sivaraj, Segher Boessenkool, Sukadev Bhattiprolu,
Thiago Bauermann, Thiago Jung Bauermann, Thomas Gleixner, Tom
Lendacky, Vasant Hegde"
* tag 'powerpc-5.4-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (264 commits)
powerpc/mm/mce: Keep irqs disabled during lockless page table walk
powerpc: Use ftrace_graph_ret_addr() when unwinding
powerpc/ftrace: Enable HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
ftrace: Look up the address of return_to_handler() using helpers
powerpc: dump kernel log before carrying out fadump or kdump
docs: powerpc: Add missing documentation reference
powerpc/xmon: Fix output of XIVE IPI
powerpc/xmon: Improve output of XIVE interrupts
powerpc/mm/radix: remove useless kernel messages
powerpc/fadump: support holes in kernel boot memory area
powerpc/fadump: remove RMA_START and RMA_END macros
powerpc/fadump: update documentation about option to release opalcore
powerpc/fadump: consider f/w load area
powerpc/opalcore: provide an option to invalidate /sys/firmware/opal/core file
powerpc/opalcore: export /sys/firmware/opal/core for analysing opal crashes
powerpc/fadump: update documentation about CONFIG_PRESERVE_FA_DUMP
powerpc/fadump: add support to preserve crash data on FADUMP disabled kernel
powerpc/fadump: improve how crashed kernel's memory is reserved
powerpc/fadump: consider reserved ranges while releasing memory
powerpc/fadump: make crash memory ranges array allocation generic
...
Diffstat (limited to 'arch/powerpc/kernel')
55 files changed, 2715 insertions(+), 2159 deletions(-)
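
One user-visible piece of the EEH rework below is a pair of error-injection
debugfs files added in eeh.c (eeh_dev_check and eeh_dev_break), both of which
accept a "<domain>:<bus>:<dev>.<fn>" string. A minimal user-space sketch,
assuming debugfs is mounted at the conventional /sys/kernel/debug (the file
names and input format come from the patch itself; the rest is illustrative):

	#include <stdio.h>

	int main(int argc, char **argv)
	{
		const char *path = "/sys/kernel/debug/powerpc/eeh_dev_break";
		FILE *f;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <domain>:<bus>:<dev>.<fn>\n", argv[0]);
			return 1;
		}

		f = fopen(path, "w");
		if (!f) {
			perror(path);
			return 1;
		}

		/* e.g. "0000:01:00.0" freezes the PE behind that function */
		fprintf(f, "%s\n", argv[1]);

		return fclose(f) ? 1 : 0;
	}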
diff --git a/arch/powerpc/kernel/.gitignore b/arch/powerpc/kernel/.gitignore
index c5f676c3c224..67ebd3003c05 100644
--- a/arch/powerpc/kernel/.gitignore
+++ b/arch/powerpc/kernel/.gitignore
@@ -1 +1,2 @@
+prom_init_check
 vmlinux.lds
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 56dfa7a2a6f2..a7ca8fe62368 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -52,7 +52,7 @@ obj-y				:= cputable.o ptrace.o syscalls.o \
 				   of_platform.o prom_parse.o
 obj-$(CONFIG_PPC64)		+= setup_64.o sys_ppc32.o \
 				   signal_64.o ptrace32.o \
-				   paca.o nvram_64.o firmware.o
+				   paca.o nvram_64.o firmware.o note.o
 obj-$(CONFIG_VDSO32)		+= vdso32/
 obj-$(CONFIG_PPC_WATCHDOG)	+= watchdog.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
@@ -78,7 +78,9 @@ obj-$(CONFIG_EEH)		+= eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \
 				  eeh_driver.o eeh_event.o eeh_sysfs.o
 obj-$(CONFIG_GENERIC_TBSYNC)	+= smp-tbsync.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
-obj-$(CONFIG_FA_DUMP)		+= fadump.o
+ifneq ($(CONFIG_FA_DUMP)$(CONFIG_PRESERVE_FA_DUMP),)
+obj-y				+= fadump.o
+endif
 ifdef CONFIG_PPC32
 obj-$(CONFIG_E500)		+= idle_e500.o
 endif
@@ -155,6 +157,9 @@ endif
 obj-$(CONFIG_EPAPR_PARAVIRT)	+= epapr_paravirt.o epapr_hcalls.o
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvm_emul.o
+ifneq ($(CONFIG_PPC_POWERNV)$(CONFIG_PPC_SVM),)
+obj-y				+= ucall.o
+endif
 
 # Disable GCOV, KCOV & sanitizers in odd or sensitive code
 GCOV_PROFILE_prom_init.o := n
@@ -184,15 +189,13 @@ extra-$(CONFIG_ALTIVEC)		+= vector.o
 extra-$(CONFIG_PPC64)		+= entry_64.o
 extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE)	+= prom_init.o
 
-ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE
-$(obj)/built-in.a: prom_init_check
+extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE)	+= prom_init_check
 
-quiet_cmd_prom_init_check = CALL    $<
-      cmd_prom_init_check = $(CONFIG_SHELL) $< "$(NM)" "$(obj)/prom_init.o"
+quiet_cmd_prom_init_check = PROMCHK $@
+      cmd_prom_init_check = $(CONFIG_SHELL) $< "$(NM)" $(obj)/prom_init.o; touch $@
 
-PHONY += prom_init_check
-prom_init_check: $(src)/prom_init_check.sh $(obj)/prom_init.o
-	$(call cmd,prom_init_check)
-endif
+$(obj)/prom_init_check: $(src)/prom_init_check.sh $(obj)/prom_init.o FORCE
+	$(call if_changed,prom_init_check)
+targets += prom_init_check
 
 clean-files := vmlinux.lds
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 4ccb6b3a7fbd..484f54dab247 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -506,6 +506,7 @@ int main(void)
 	OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v);
 	OFFSET(KVM_RADIX, kvm, arch.radix);
 	OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled);
+	OFFSET(KVM_SECURE_GUEST, kvm, arch.secure_guest);
 	OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr);
 	OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar);
 	OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index bfe5f4a2886b..e745abc5457a 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -569,7 +569,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 #endif	/* CONFIG_PPC_BOOK3S_64 */
 
 #ifdef CONFIG_PPC32
-#ifdef CONFIG_PPC_BOOK3S_32
+#ifdef CONFIG_PPC_BOOK3S_601
 	{	/* 601 */
 		.pvr_mask		= 0xffff0000,
 		.pvr_value		= 0x00010000,
@@ -583,6 +583,8 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.machine_check		= machine_check_generic,
 		.platform		= "ppc601",
 	},
+#endif /* CONFIG_PPC_BOOK3S_601 */
+#ifdef CONFIG_PPC_BOOK3S_6xx
 	{	/* 603 */
 		.pvr_mask		= 0xffff0000,
 		.pvr_value		= 0x00030000,
@@ -1212,7 +1214,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.machine_check		= machine_check_generic,
 		.platform		= "ppc603",
 	},
-#endif /* CONFIG_PPC_BOOK3S_32 */
+#endif /* CONFIG_PPC_BOOK3S_6xx */
 #ifdef CONFIG_PPC_8xx
 	{	/* 8xx */
 		.pvr_mask		= 0xffff0000,
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index 2f5a53874f6d..e486d1d78de2 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -122,18 +122,17 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask)
 {
 	struct iommu_table *tbl = get_iommu_table_base(dev);
 
-	if (!tbl) {
-		dev_info(dev, "Warning: IOMMU dma not supported: mask 0x%08llx"
-			", table unavailable\n", mask);
-		return 0;
-	}
-
 	if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) {
 		dev->archdata.iommu_bypass = true;
 		dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
 		return 1;
 	}
 
+	if (!tbl) {
+		dev_err(dev, "Warning: IOMMU dma not supported: mask 0x%08llx, table unavailable\n", mask);
+		return 0;
+	}
+
 	if (tbl->it_offset > (mask >> tbl->it_page_shift)) {
 		dev_info(dev, "Warning: IOMMU offset too big for device mask\n");
 		dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n",
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index c0e4b73191f3..0a91dee51245 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -150,6 +150,16 @@ static int __init eeh_setup(char *str)
 }
 __setup("eeh=", eeh_setup);
 
+void eeh_show_enabled(void)
+{
+	if (eeh_has_flag(EEH_FORCE_DISABLED))
+		pr_info("EEH: Recovery disabled by kernel parameter.\n");
+	else if (eeh_has_flag(EEH_ENABLED))
+		pr_info("EEH: Capable adapter found: recovery enabled.\n");
+	else
+		pr_info("EEH: No capable adapters found: recovery disabled.\n");
+}
+
 /*
  * This routine captures assorted PCI configuration space data
  * for the indicated PCI device, and puts them into a buffer
@@ -410,11 +420,9 @@ static int eeh_phb_check_failure(struct eeh_pe *pe)
 	eeh_pe_mark_isolated(phb_pe);
 	eeh_serialize_unlock(flags);
 
-	pr_err("EEH: PHB#%x failure detected, location: %s\n",
+	pr_debug("EEH: PHB#%x failure detected, location: %s\n",
 		phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe));
-	dump_stack();
 	eeh_send_failure_event(phb_pe);
-
 	return 1;
 out:
 	eeh_serialize_unlock(flags);
@@ -441,7 +449,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 	unsigned long flags;
 	struct device_node *dn;
 	struct pci_dev *dev;
-	struct eeh_pe *pe, *parent_pe, *phb_pe;
+	struct eeh_pe *pe, *parent_pe;
 	int rc = 0;
 	const char *location = NULL;
 
@@ -460,8 +468,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 	/* Access to IO BARs might get this far and still not want checking. */
 	if (!pe) {
 		eeh_stats.ignored_check++;
-		pr_debug("EEH: Ignored check for %s\n",
-			eeh_pci_name(dev));
+		eeh_edev_dbg(edev, "Ignored check\n");
 		return 0;
 	}
 
@@ -501,12 +508,11 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 		if (dn)
 			location = of_get_property(dn, "ibm,loc-code", NULL);
-		printk(KERN_ERR "EEH: %d reads ignored for recovering device at "
-			"location=%s driver=%s pci addr=%s\n",
+		eeh_edev_err(edev, "%d reads ignored for recovering device at location=%s driver=%s\n",
 			pe->check_count,
 			location ? location : "unknown",
-			eeh_driver_name(dev), eeh_pci_name(dev));
-		printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n",
+			eeh_driver_name(dev));
+		eeh_edev_err(edev, "Might be infinite loop in %s driver\n",
 			eeh_driver_name(dev));
 		dump_stack();
 	}
@@ -573,13 +579,8 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 	 * a stack trace will help the device-driver authors figure
 	 * out what happened.  So print that out.
 	 */
-	phb_pe = eeh_phb_pe_get(pe->phb);
-	pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
-	       pe->phb->global_number, pe->addr);
-	pr_err("EEH: PE location: %s, PHB location: %s\n",
-	       eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
-	dump_stack();
-
+	pr_debug("EEH: %s: Frozen PHB#%x-PE#%x detected\n",
+		__func__, pe->phb->global_number, pe->addr);
 	eeh_send_failure_event(pe);
 
 	return 1;
@@ -697,7 +698,7 @@ int eeh_pci_enable(struct eeh_pe *pe, int function)
 	return rc;
 }
 
-static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev,
+static void eeh_disable_and_save_dev_state(struct eeh_dev *edev,
 					    void *userdata)
 {
 	struct pci_dev *pdev = eeh_dev_to_pci_dev(edev);
@@ -708,7 +709,7 @@ static void eeh_disable_and_save_dev_state(struct eeh_dev *edev,
 	 * state for the specified device
 	 */
 	if (!pdev || pdev == dev)
-		return NULL;
+		return;
 
 	/* Ensure we have D0 power state */
 	pci_set_power_state(pdev, PCI_D0);
@@ -721,18 +722,16 @@ static void eeh_disable_and_save_dev_state(struct eeh_dev *edev,
 	 * interrupt from the device
 	 */
 	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
-
-	return NULL;
 }
 
-static void *eeh_restore_dev_state(struct eeh_dev *edev, void *userdata)
+static void eeh_restore_dev_state(struct eeh_dev *edev, void *userdata)
 {
 	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
 	struct pci_dev *pdev = eeh_dev_to_pci_dev(edev);
 	struct pci_dev *dev = userdata;
 
 	if (!pdev)
-		return NULL;
+		return;
 
 	/* Apply customization from firmware */
 	if (pdn && eeh_ops->restore_config)
@@ -741,8 +740,6 @@ static void eeh_restore_dev_state(struct eeh_dev *edev, void *userdata)
 	/* The caller should restore state for the specified device */
 	if (pdev != dev)
 		pci_restore_state(pdev);
-
-	return NULL;
 }
 
 int eeh_restore_vf_config(struct pci_dn *pdn)
@@ -868,7 +865,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
  * the indicated device and its children so that the bunch of the
  * devices could be reset properly.
  */
-static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
+static void eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
 {
 	struct pci_dev *dev;
 	unsigned int *freset = (unsigned int *)flag;
@@ -876,8 +873,6 @@ static void eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
 	dev = eeh_dev_to_pci_dev(edev);
 	if (dev)
 		*freset |= dev->needs_freset;
-
-	return NULL;
 }
 
 static void eeh_pe_refreeze_passed(struct eeh_pe *root)
@@ -1063,23 +1058,6 @@ static struct notifier_block eeh_reboot_nb = {
 	.notifier_call = eeh_reboot_notifier,
 };
 
-void eeh_probe_devices(void)
-{
-	struct pci_controller *hose, *tmp;
-	struct pci_dn *pdn;
-
-	/* Enable EEH for all adapters */
-	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
-		pdn = hose->pci_data;
-		traverse_pci_dn(pdn, eeh_ops->probe, NULL);
-	}
-	if (eeh_enabled())
-		pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
-	else
-		pr_info("EEH: No capable adapters found\n");
-
-}
-
 /**
  * eeh_init - EEH initialization
  *
@@ -1120,6 +1098,8 @@ static int eeh_init(void)
 	list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
 		eeh_dev_phb_init_dynamic(hose);
 
+	eeh_addr_cache_init();
+
 	/* Initialize EEH event */
 	return eeh_event_init();
 }
@@ -1190,15 +1170,14 @@ void eeh_add_device_late(struct pci_dev *dev)
 	struct pci_dn *pdn;
 	struct eeh_dev *edev;
 
-	if (!dev || !eeh_enabled())
+	if (!dev)
 		return;
 
-	pr_debug("EEH: Adding device %s\n", pci_name(dev));
-
 	pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
 	edev = pdn_to_eeh_dev(pdn);
+	eeh_edev_dbg(edev, "Adding device\n");
 	if (edev->pdev == dev) {
-		pr_debug("EEH: Already referenced !\n");
+		eeh_edev_dbg(edev, "Device already referenced!\n");
 		return;
 	}
 
@@ -1246,6 +1225,8 @@ void eeh_add_device_tree_late(struct pci_bus *bus)
 {
 	struct pci_dev *dev;
 
+	if (eeh_has_flag(EEH_FORCE_DISABLED))
+		return;
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		eeh_add_device_late(dev);
 		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
@@ -1299,10 +1280,10 @@ void eeh_remove_device(struct pci_dev *dev)
 	edev = pci_dev_to_eeh_dev(dev);
 
 	/* Unregister the device with the EEH/PCI address search system */
-	pr_debug("EEH: Removing device %s\n", pci_name(dev));
+	dev_dbg(&dev->dev, "EEH: Removing device\n");
 
 	if (!edev || !edev->pdev || !edev->pe) {
-		pr_debug("EEH: Not referenced !\n");
+		dev_dbg(&dev->dev, "EEH: Device not referenced!\n");
 		return;
 	}
 
@@ -1890,6 +1871,198 @@ static const struct file_operations eeh_force_recover_fops = {
 	.llseek = no_llseek,
 	.write	= eeh_force_recover_write,
 };
+
+static ssize_t eeh_debugfs_dev_usage(struct file *filp,
+				char __user *user_buf,
+				size_t count, loff_t *ppos)
+{
+	static const char usage[] = "input format: <domain>:<bus>:<dev>.<fn>\n";
+
+	return simple_read_from_buffer(user_buf, count, ppos,
+				       usage, sizeof(usage) - 1);
+}
+
+static ssize_t eeh_dev_check_write(struct file *filp,
+				const char __user *user_buf,
+				size_t count, loff_t *ppos)
+{
+	uint32_t domain, bus, dev, fn;
+	struct pci_dev *pdev;
+	struct eeh_dev *edev;
+	char buf[20];
+	int ret;
+
+	memset(buf, 0, sizeof(buf));
+	ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count);
+	if (!ret)
+		return -EFAULT;
+
+	ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn);
+	if (ret != 4) {
+		pr_err("%s: expected 4 args, got %d\n", __func__, ret);
+		return -EINVAL;
+	}
+
+	pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn);
+	if (!pdev)
+		return -ENODEV;
+
+	edev = pci_dev_to_eeh_dev(pdev);
+	if (!edev) {
+		pci_err(pdev, "No eeh_dev for this device!\n");
+		pci_dev_put(pdev);
+		return -ENODEV;
+	}
+
+	ret = eeh_dev_check_failure(edev);
+	pci_info(pdev, "eeh_dev_check_failure(%04x:%02x:%02x.%01x) = %d\n",
+			domain, bus, dev, fn, ret);
+
+	pci_dev_put(pdev);
+
+	return count;
+}
+
+static const struct file_operations eeh_dev_check_fops = {
+	.open = simple_open,
+	.llseek = no_llseek,
+	.write = eeh_dev_check_write,
+	.read = eeh_debugfs_dev_usage,
+};
+
+static int eeh_debugfs_break_device(struct pci_dev *pdev)
+{
+	struct resource *bar = NULL;
+	void __iomem *mapped;
+	u16 old, bit;
+	int i, pos;
+
+	/* Do we have an MMIO BAR to disable? */
+	for (i = 0; i <= PCI_STD_RESOURCE_END; i++) {
+		struct resource *r = &pdev->resource[i];
+
+		if (!r->flags || !r->start)
+			continue;
+		if (r->flags & IORESOURCE_IO)
+			continue;
+		if (r->flags & IORESOURCE_UNSET)
+			continue;
+
+		bar = r;
+		break;
+	}
+
+	if (!bar) {
+		pci_err(pdev, "Unable to find Memory BAR to cause EEH with\n");
+		return -ENXIO;
+	}
+
+	pci_err(pdev, "Going to break: %pR\n", bar);
+
+	if (pdev->is_virtfn) {
+#ifndef CONFIG_PCI_IOV
+		return -ENXIO;
+#else
+		/*
+		 * VFs don't have a per-function COMMAND register, so the best
+		 * we can do is clear the Memory Space Enable bit in the PF's
+		 * SRIOV control reg.
+		 *
+		 * Unfortunately, this requires that we have a PF (i.e. doesn't
+		 * work for a passed-through VF) and it has the potential side
+		 * effect of also causing an EEH on every other VF under the
+		 * PF. Oh well.
+		 */
+		pdev = pdev->physfn;
+		if (!pdev)
+			return -ENXIO; /* passed through VFs have no PF */
+
+		pos  = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
+		pos += PCI_SRIOV_CTRL;
+		bit  = PCI_SRIOV_CTRL_MSE;
+#endif /* !CONFIG_PCI_IOV */
+	} else {
+		bit = PCI_COMMAND_MEMORY;
+		pos = PCI_COMMAND;
+	}
+
+	/*
+	 * Process here is:
+	 *
+	 * 1. Disable Memory space.
+	 *
+	 * 2. Perform an MMIO to the device. This should result in an error
+	 *    (CA / UR) being raised by the device which results in an EEH
+	 *    PE freeze. Using the in_8() accessor skips the EEH detection
+	 *    hook, so the EEH detection machinery won't be triggered here.
+	 *    This is to match the usual behaviour of EEH where the HW will
+	 *    asynchronously freeze a PE and it's up to the kernel to notice
+	 *    and deal with it.
+	 *
+	 * 3. Turn Memory space back on. This is more important for VFs
+	 *    since recovery will probably fail if we don't. For normal
+	 *    devices the COMMAND register is reset as a part of
+	 *    re-initialising the device.
+	 *
+	 * Breaking stuff is the point so who cares if it's racy ;)
+	 */
+	pci_read_config_word(pdev, pos, &old);
+
+	mapped = ioremap(bar->start, PAGE_SIZE);
+	if (!mapped) {
+		pci_err(pdev, "Unable to map MMIO BAR %pR\n", bar);
+		return -ENXIO;
+	}
+
+	pci_write_config_word(pdev, pos, old & ~bit);
+	in_8(mapped);
+	pci_write_config_word(pdev, pos, old);
+
+	iounmap(mapped);
+
+	return 0;
+}
+
+static ssize_t eeh_dev_break_write(struct file *filp,
+				const char __user *user_buf,
+				size_t count, loff_t *ppos)
+{
+	uint32_t domain, bus, dev, fn;
+	struct pci_dev *pdev;
+	char buf[20];
+	int ret;
+
+	memset(buf, 0, sizeof(buf));
+	ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count);
+	if (!ret)
+		return -EFAULT;
+
+	ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn);
+	if (ret != 4) {
+		pr_err("%s: expected 4 args, got %d\n", __func__, ret);
+		return -EINVAL;
+	}
+
+	pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn);
+	if (!pdev)
+		return -ENODEV;
+
+	ret = eeh_debugfs_break_device(pdev);
+	pci_dev_put(pdev);
+
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+
+static const struct file_operations eeh_dev_break_fops = {
+	.open = simple_open,
+	.llseek = no_llseek,
+	.write = eeh_dev_break_write,
+	.read = eeh_debugfs_dev_usage,
+};
+
 #endif
 
 static int __init eeh_init_proc(void)
@@ -1905,6 +2078,12 @@ static int __init eeh_init_proc(void)
 		debugfs_create_bool("eeh_disable_recovery", 0600,
 				powerpc_debugfs_root,
 				&eeh_debugfs_no_recover);
+		debugfs_create_file_unsafe("eeh_dev_check", 0600,
+				powerpc_debugfs_root, NULL,
+				&eeh_dev_check_fops);
+		debugfs_create_file_unsafe("eeh_dev_break", 0600,
+				powerpc_debugfs_root, NULL,
+				&eeh_dev_break_fops);
 		debugfs_create_file_unsafe("eeh_force_recover", 0600,
 				powerpc_debugfs_root, NULL,
 				&eeh_force_recover_fops);
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index 05ffd32b3416..cf11277ebd02 100644
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -148,8 +148,8 @@ eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo,
 	piar->pcidev = dev;
 	piar->flags = flags;
 
-	pr_debug("PIAR: insert range=[%pap:%pap] dev=%s\n",
-		 &alo, &ahi, pci_name(dev));
+	eeh_edev_dbg(piar->edev, "PIAR: insert range=[%pap:%pap]\n",
+		 &alo, &ahi);
 
 	rb_link_node(&piar->rb_node, parent, p);
 	rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
@@ -229,8 +229,8 @@ restart:
 		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
 
 		if (piar->pcidev == dev) {
-			pr_debug("PIAR: remove range=[%pap:%pap] dev=%s\n",
-				 &piar->addr_lo, &piar->addr_hi, pci_name(dev));
+			eeh_edev_dbg(piar->edev, "PIAR: remove range=[%pap:%pap]\n",
+				 &piar->addr_lo, &piar->addr_hi);
 			rb_erase(n, &pci_io_addr_cache_root.rb_root);
 			kfree(piar);
 			goto restart;
@@ -258,37 +258,14 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev)
 }
 
 /**
- * eeh_addr_cache_build - Build a cache of I/O addresses
+ * eeh_addr_cache_init - Initialize a cache of I/O addresses
  *
- * Build a cache of pci i/o addresses.  This cache will be used to
+ * Initialize a cache of pci i/o addresses.  This cache will be used to
  * find the pci device that corresponds to a given address.
- * This routine scans all pci busses to build the cache.
- * Must be run late in boot process, after the pci controllers
- * have been scanned for devices (after all device resources are known).
 */
-void eeh_addr_cache_build(void)
+void eeh_addr_cache_init(void)
 {
-	struct pci_dn *pdn;
-	struct eeh_dev *edev;
-	struct pci_dev *dev = NULL;
-
 	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
-
-	for_each_pci_dev(dev) {
-		pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
-		if (!pdn)
-			continue;
-
-		edev = pdn_to_eeh_dev(pdn);
-		if (!edev)
-			continue;
-
-		dev->dev.archdata.edev = edev;
-		edev->pdev = dev;
-
-		eeh_addr_cache_insert_dev(dev);
-		eeh_sysfs_add_device(dev);
-	}
 }
 
 static int eeh_addr_cache_show(struct seq_file *s, void *v)
diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
index c4317c452d98..7370185c7a05 100644
--- a/arch/powerpc/kernel/eeh_dev.c
+++ b/arch/powerpc/kernel/eeh_dev.c
@@ -47,6 +47,8 @@ struct eeh_dev *eeh_dev_init(struct pci_dn *pdn)
 	/* Associate EEH device with OF node */
 	pdn->edev = edev;
 	edev->pdn = pdn;
+	edev->bdfn = (pdn->busno << 8) | pdn->devfn;
+	edev->controller = pdn->phb;
 
 	return edev;
 }
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 89623962c727..d9279d0ee9f5 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -27,6 +27,7 @@
 #include <linux/irq.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/pci_hotplug.h>
 #include <asm/eeh.h>
 #include <asm/eeh_event.h>
 #include <asm/ppc-pci.h>
@@ -81,23 +82,6 @@ static const char *pci_ers_result_name(enum pci_ers_result result)
 	}
 };
 
-static __printf(2, 3) void eeh_edev_info(const struct eeh_dev *edev,
-					 const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-
-	va_start(args, fmt);
-
-	vaf.fmt = fmt;
-	vaf.va = &args;
-
-	printk(KERN_INFO "EEH: PE#%x (PCI %s): %pV\n", edev->pe_config_addr,
-	       edev->pdev ? dev_name(&edev->pdev->dev) : "none", &vaf);
-
-	va_end(args);
-}
-
 static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old,
 						enum pci_ers_result new)
 {
@@ -113,8 +97,16 @@ static bool eeh_dev_removed(struct eeh_dev *edev)
 
 static bool eeh_edev_actionable(struct eeh_dev *edev)
 {
-	return (edev->pdev && !eeh_dev_removed(edev) &&
-		!eeh_pe_passed(edev->pe));
+	if (!edev->pdev)
+		return false;
+	if (edev->pdev->error_state == pci_channel_io_perm_failure)
+		return false;
+	if (eeh_dev_removed(edev))
+		return false;
+	if (eeh_pe_passed(edev->pe))
+		return false;
+
+	return true;
 }
 
 /**
@@ -214,12 +206,12 @@ static void eeh_enable_irq(struct eeh_dev *edev)
 	}
 }
 
-static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
+static void eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
 {
 	struct pci_dev *pdev;
 
 	if (!edev)
-		return NULL;
+		return;
 
 	/*
 	 * We cannot access the config space on some adapters.
@@ -229,14 +221,13 @@ static void eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
 	 * device is created.
	 */
 	if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED))
-		return NULL;
+		return;
 
 	pdev = eeh_dev_to_pci_dev(edev);
 	if (!pdev)
-		return NULL;
+		return;
 
 	pci_save_state(pdev);
-	return NULL;
 }
 
 static void eeh_set_channel_state(struct eeh_pe *root, enum pci_channel_state s)
@@ -274,20 +265,27 @@ static void eeh_set_irq_state(struct eeh_pe *root, bool enable)
 }
 
 typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *,
+					     struct pci_dev *,
 					     struct pci_driver *);
 static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
 			       enum pci_ers_result *result)
 {
+	struct pci_dev *pdev;
 	struct pci_driver *driver;
 	enum pci_ers_result new_result;
 
-	if (!edev->pdev) {
+	pci_lock_rescan_remove();
+	pdev = edev->pdev;
+	if (pdev)
+		get_device(&pdev->dev);
+	pci_unlock_rescan_remove();
+	if (!pdev) {
 		eeh_edev_info(edev, "no device");
 		return;
 	}
-	device_lock(&edev->pdev->dev);
+	device_lock(&pdev->dev);
 	if (eeh_edev_actionable(edev)) {
-		driver = eeh_pcid_get(edev->pdev);
+		driver = eeh_pcid_get(pdev);
 
 		if (!driver)
 			eeh_edev_info(edev, "no driver");
@@ -296,7 +294,7 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
 		else if (edev->mode & EEH_DEV_NO_HANDLER)
 			eeh_edev_info(edev, "driver bound too late");
 		else {
-			new_result = fn(edev, driver);
+			new_result = fn(edev, pdev, driver);
 			eeh_edev_info(edev, "%s driver reports: '%s'",
 				      driver->name,
 				      pci_ers_result_name(new_result));
@@ -305,12 +303,15 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
 						       new_result);
 		}
 		if (driver)
-			eeh_pcid_put(edev->pdev);
+			eeh_pcid_put(pdev);
 	} else {
-		eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!edev->pdev,
+		eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!pdev,
 			      !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe));
 	}
-	device_unlock(&edev->pdev->dev);
+	device_unlock(&pdev->dev);
+	if (edev->pdev != pdev)
+		eeh_edev_warn(edev, "Device changed during processing!\n");
+	put_device(&pdev->dev);
 }
 
 static void eeh_pe_report(const char *name, struct eeh_pe *root,
@@ -337,20 +338,20 @@ static void eeh_pe_report(const char *name, struct eeh_pe *root,
 * Report an EEH error to each device driver.
 */
 static enum pci_ers_result eeh_report_error(struct eeh_dev *edev,
+					    struct pci_dev *pdev,
 					    struct pci_driver *driver)
 {
 	enum pci_ers_result rc;
-	struct pci_dev *dev = edev->pdev;
 
 	if (!driver->err_handler->error_detected)
 		return PCI_ERS_RESULT_NONE;
 
 	eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)",
 		      driver->name);
-	rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
+	rc = driver->err_handler->error_detected(pdev, pci_channel_io_frozen);
 
 	edev->in_error = true;
-	pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
+	pci_uevent_ers(pdev, PCI_ERS_RESULT_NONE);
 	return rc;
 }
 
@@ -363,12 +364,13 @@ static enum pci_ers_result eeh_report_error(struct eeh_dev *edev,
 * are now enabled.
 */
 static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev,
+						   struct pci_dev *pdev,
 						   struct pci_driver *driver)
 {
 	if (!driver->err_handler->mmio_enabled)
 		return PCI_ERS_RESULT_NONE;
 	eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name);
-	return driver->err_handler->mmio_enabled(edev->pdev);
+	return driver->err_handler->mmio_enabled(pdev);
 }
 
 /**
@@ -382,20 +384,21 @@ static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev,
 * driver can work again while the device is recovered.
 */
 static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev,
+					    struct pci_dev *pdev,
 					    struct pci_driver *driver)
 {
 	if (!driver->err_handler->slot_reset || !edev->in_error)
 		return PCI_ERS_RESULT_NONE;
 	eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name);
-	return driver->err_handler->slot_reset(edev->pdev);
+	return driver->err_handler->slot_reset(pdev);
 }
 
-static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
+static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
 {
 	struct pci_dev *pdev;
 
 	if (!edev)
-		return NULL;
+		return;
 
 	/*
	 * The content in the config space isn't saved because
@@ -407,15 +410,14 @@ static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
 		if (list_is_last(&edev->entry, &edev->pe->edevs))
 			eeh_pe_restore_bars(edev->pe);
 
-		return NULL;
+		return;
 	}
 
 	pdev = eeh_dev_to_pci_dev(edev);
 	if (!pdev)
-		return NULL;
+		return;
 
 	pci_restore_state(pdev);
-	return NULL;
 }
 
 /**
@@ -428,13 +430,14 @@ static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
 * to make the recovered device work again.
 */
 static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev,
+					     struct pci_dev *pdev,
 					     struct pci_driver *driver)
 {
 	if (!driver->err_handler->resume || !edev->in_error)
 		return PCI_ERS_RESULT_NONE;
 
 	eeh_edev_info(edev, "Invoking %s->resume()", driver->name);
-	driver->err_handler->resume(edev->pdev);
+	driver->err_handler->resume(pdev);
 
 	pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED);
 #ifdef CONFIG_PCI_IOV
@@ -453,6 +456,7 @@ static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev,
 * dead, and that no further recovery attempts will be made on it.
 */
 static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev,
+					      struct pci_dev *pdev,
 					      struct pci_driver *driver)
 {
 	enum pci_ers_result rc;
@@ -462,10 +466,10 @@ static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev,
 
 	eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)",
 		      driver->name);
-	rc = driver->err_handler->error_detected(edev->pdev,
+	rc = driver->err_handler->error_detected(pdev,
 						 pci_channel_io_perm_failure);
 
-	pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_DISCONNECT);
+	pci_uevent_ers(pdev, PCI_ERS_RESULT_DISCONNECT);
 	return rc;
 }
 
@@ -473,12 +477,9 @@ static void *eeh_add_virt_device(struct eeh_dev *edev)
 {
 	struct pci_driver *driver;
 	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
-	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
 
 	if (!(edev->physfn)) {
-		pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n",
-			__func__, pdn->phb->global_number, pdn->busno,
-			PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+		eeh_edev_warn(edev, "Not for VF\n");
 		return NULL;
 	}
 
@@ -492,12 +493,12 @@ static void *eeh_add_virt_device(struct eeh_dev *edev)
 	}
 
 #ifdef CONFIG_PCI_IOV
-	pci_iov_add_virtfn(edev->physfn, pdn->vf_index);
+	pci_iov_add_virtfn(edev->physfn, eeh_dev_to_pdn(edev)->vf_index);
 #endif
 	return NULL;
 }
 
-static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
+static void eeh_rmv_device(struct eeh_dev *edev, void *userdata)
 {
 	struct pci_driver *driver;
 	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
@@ -512,7 +513,7 @@ static void eeh_rmv_device(struct eeh_dev *edev, void *userdata)
 	 */
 	if (!eeh_edev_actionable(edev) ||
 	    (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE))
-		return NULL;
+		return;
 
 	if (rmv_data) {
 		driver = eeh_pcid_get(dev);
 		if (driver && driver->err_handler &&
 		    driver->err_handler->error_detected &&
 		    driver->err_handler->slot_reset) {
 			eeh_pcid_put(dev);
-			return NULL;
+			return;
 		}
 		eeh_pcid_put(dev);
 	}
@@ -554,8 +555,6 @@ static void eeh_rmv_device(struct eeh_dev *edev, void *userdata)
 		pci_stop_and_remove_bus_device(dev);
 		pci_unlock_rescan_remove();
 	}
-
-	return NULL;
 }
 
 static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata)
@@ -744,6 +743,99 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
 */
 #define MAX_WAIT_FOR_RECOVERY 300
 
+
+/* Walks the PE tree after processing an event to remove any stale PEs.
+ *
+ * NB: This needs to be recursive to ensure the leaf PEs get removed
+ * before their parents do. Although it would be possible to do this
+ * iteratively, we don't, since recursion is easier to read and we need
+ * to guarantee the leaf nodes are handled first.
+ */
+static void eeh_pe_cleanup(struct eeh_pe *pe)
+{
+	struct eeh_pe *child_pe, *tmp;
+
+	list_for_each_entry_safe(child_pe, tmp, &pe->child_list, child)
+		eeh_pe_cleanup(child_pe);
+
+	if (pe->state & EEH_PE_KEEP)
+		return;
+
+	if (!(pe->state & EEH_PE_INVALID))
+		return;
+
+	if (list_empty(&pe->edevs) && list_empty(&pe->child_list)) {
+		list_del(&pe->child);
+		kfree(pe);
+	}
+}
+
+/**
+ * eeh_slot_presence_check - Check if a device is still present in a slot
+ * @pdev: pci_dev to check
+ *
+ * This function may return a false positive if we can't determine the slot's
+ * presence state. This might happen for PCIe slots if the PE containing
+ * the upstream bridge is also frozen, or the bridge is part of the same PE
+ * as the device.
+ *
+ * This shouldn't happen often, but you might see it if you hotplug a PCIe
+ * switch.
+ */
+static bool eeh_slot_presence_check(struct pci_dev *pdev)
+{
+	const struct hotplug_slot_ops *ops;
+	struct pci_slot *slot;
+	u8 state;
+	int rc;
+
+	if (!pdev)
+		return false;
+
+	if (pdev->error_state == pci_channel_io_perm_failure)
+		return false;
+
+	slot = pdev->slot;
+	if (!slot || !slot->hotplug)
+		return true;
+
+	ops = slot->hotplug->ops;
+	if (!ops || !ops->get_adapter_status)
+		return true;
+
+	/* set the attention indicator while we've got the slot ops */
+	if (ops->set_attention_status)
+		ops->set_attention_status(slot->hotplug, 1);
+
+	rc = ops->get_adapter_status(slot->hotplug, &state);
+	if (rc)
+		return true;
+
+	return !!state;
+}
+
+static void eeh_clear_slot_attention(struct pci_dev *pdev)
+{
+	const struct hotplug_slot_ops *ops;
+	struct pci_slot *slot;
+
+	if (!pdev)
+		return;
+
+	if (pdev->error_state == pci_channel_io_perm_failure)
+		return;
+
+	slot = pdev->slot;
+	if (!slot || !slot->hotplug)
+		return;
+
+	ops = slot->hotplug->ops;
+	if (!ops || !ops->set_attention_status)
+		return;
+
+	ops->set_attention_status(slot->hotplug, 0);
+}
+
 /**
 * eeh_handle_normal_event - Handle EEH events on a specific PE
 * @pe: EEH PE - which should not be used after we return, as it may
@@ -774,6 +866,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
 	struct eeh_rmv_data rmv_data =
 		{LIST_HEAD_INIT(rmv_data.removed_vf_list), 0};
+	int devices = 0;
 
 	bus = eeh_pe_bus_get(pe);
 	if (!bus) {
@@ -782,7 +875,59 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 		return;
 	}
 
-	eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
+	/*
+	 * When devices are hot-removed we might get an EEH due to
+	 * a driver attempting to touch the MMIO space of a removed
+	 * device. In this case we don't have a device to recover
+	 * so suppress the event if we can't find any present devices.
+	 *
+	 * The hotplug driver should take care of tearing down the
+	 * device itself.
+	 */
+	eeh_for_each_pe(pe, tmp_pe)
+		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
+			if (eeh_slot_presence_check(edev->pdev))
+				devices++;
+
+	if (!devices) {
+		pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n",
+			pe->phb->global_number, pe->addr);
+		goto out; /* nothing to recover */
+	}
+
+	/* Log the event */
+	if (pe->type & EEH_PE_PHB) {
+		pr_err("EEH: PHB#%x failure detected, location: %s\n",
+			pe->phb->global_number, eeh_pe_loc_get(pe));
+	} else {
+		struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb);
+
+		pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
+		       pe->phb->global_number, pe->addr);
+		pr_err("EEH: PE location: %s, PHB location: %s\n",
+		       eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
+	}
+
+#ifdef CONFIG_STACKTRACE
+	/*
+	 * Print the saved stack trace now that we've verified there's
+	 * something to recover.
+	 */
+	if (pe->trace_entries) {
+		void **ptrs = (void **) pe->stack_trace;
+		int i;
+
+		pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
+		       pe->phb->global_number, pe->addr);
+
+		/* FIXME: Use the same format as dump_stack() */
+		pr_err("EEH: Call Trace:\n");
+		for (i = 0; i < pe->trace_entries; i++)
+			pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]);
+
+		pe->trace_entries = 0;
+	}
+#endif /* CONFIG_STACKTRACE */
 
 	eeh_pe_update_time_stamp(pe);
 	pe->freeze_count++;
@@ -793,6 +938,10 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 		result = PCI_ERS_RESULT_DISCONNECT;
 	}
 
+	eeh_for_each_pe(pe, tmp_pe)
+		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
+			edev->mode &= ~EEH_DEV_NO_HANDLER;
+
 	/* Walk the various device drivers attached to this slot through
 	 * a reset sequence, giving each an opportunity to do what it needs
 	 * to accomplish the reset.  Each child gets a report of the
@@ -969,6 +1118,19 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 			return;
 		}
 	}
+
+out:
+	/*
+	 * Clean up any PEs without devices. While marked as EEH_PE_RECOVERING
+	 * we don't want to modify the PE tree structure so we do it here.
+	 */
+	eeh_pe_cleanup(pe);
+
+	/* clear the slot attention LED for all recovered devices */
+	eeh_for_each_pe(pe, tmp_pe)
+		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
+			eeh_clear_slot_attention(edev->pdev);
+
 	eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
 }
 
@@ -981,7 +1143,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 */
 void eeh_handle_special_event(void)
 {
-	struct eeh_pe *pe, *phb_pe;
+	struct eeh_pe *pe, *phb_pe, *tmp_pe;
+	struct eeh_dev *edev, *tmp_edev;
 	struct pci_bus *bus;
 	struct pci_controller *hose;
 	unsigned long flags;
@@ -1040,6 +1203,7 @@ void eeh_handle_special_event(void)
 		 */
 		if (rc == EEH_NEXT_ERR_FROZEN_PE ||
 		    rc == EEH_NEXT_ERR_FENCED_PHB) {
+			eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
 			eeh_handle_normal_event(pe);
 		} else {
 			pci_lock_rescan_remove();
@@ -1050,6 +1214,10 @@ void eeh_handle_special_event(void)
 				    (phb_pe->state & EEH_PE_RECOVERING))
 					continue;
 
+				eeh_for_each_pe(pe, tmp_pe)
+					eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev)
+						edev->mode &= ~EEH_DEV_NO_HANDLER;
+
 				/* Notify all devices to be down */
 				eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
 				eeh_set_channel_state(pe, pci_channel_io_perm_failure);
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
index 64cfbe41174b..a7a8dc182efb 100644
--- a/arch/powerpc/kernel/eeh_event.c
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -40,7 +40,6 @@ static int eeh_event_handler(void * dummy)
 {
 	unsigned long flags;
 	struct eeh_event *event;
-	struct eeh_pe *pe;
 
 	while (!kthread_should_stop()) {
 		if (wait_for_completion_interruptible(&eeh_eventlist_event))
@@ -59,19 +58,10 @@ static int eeh_event_handler(void * dummy)
 			continue;
 
 		/* We might have event without binding PE */
-		pe = event->pe;
-		if (pe) {
-			if (pe->type & EEH_PE_PHB)
-				pr_info("EEH: Detected error on PHB#%x\n",
-					 pe->phb->global_number);
-			else
-				pr_info("EEH: Detected PCI bus error on "
-					"PHB#%x-PE#%x\n",
-					pe->phb->global_number, pe->addr);
-			eeh_handle_normal_event(pe);
-		} else {
+		if (event->pe)
+			eeh_handle_normal_event(event->pe);
+		else
 			eeh_handle_special_event();
-		}
 
 		kfree(event);
 	}
@@ -121,6 +111,24 @@ int __eeh_send_failure_event(struct eeh_pe *pe)
 	}
 	event->pe = pe;
 
+	/*
+	 * Mark the PE as recovering before inserting it in the queue.
+	 * This prevents the PE from being free()ed by a hotplug driver
+	 * while the PE is sitting in the event queue.
+	 */
+	if (pe) {
+#ifdef CONFIG_STACKTRACE
+		/*
+		 * Save the current stack trace so we can dump it from the
+		 * event handler thread.
+		 */
+		pe->trace_entries = stack_trace_save(pe->stack_trace,
+					 ARRAY_SIZE(pe->stack_trace), 0);
+#endif /* CONFIG_STACKTRACE */
+
+		eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
+	}
+
 	/* We may or may not be called in an interrupt context */
 	spin_lock_irqsave(&eeh_eventlist_lock, flags);
 	list_add(&event->list, &eeh_eventlist);
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 854cef7b18f4..177852e39a25 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -231,29 +231,22 @@ void *eeh_pe_traverse(struct eeh_pe *root,
 * The function is used to traverse the devices of the specified
 * PE and its child PEs.
 */
-void *eeh_pe_dev_traverse(struct eeh_pe *root,
+void eeh_pe_dev_traverse(struct eeh_pe *root,
 			  eeh_edev_traverse_func fn, void *flag)
 {
 	struct eeh_pe *pe;
 	struct eeh_dev *edev, *tmp;
-	void *ret;
 
 	if (!root) {
 		pr_warn("%s: Invalid PE %p\n",
 			__func__, root);
-		return NULL;
+		return;
 	}
 
 	/* Traverse root PE */
-	eeh_for_each_pe(root, pe) {
-		eeh_pe_for_each_dev(pe, edev, tmp) {
-			ret = fn(edev, flag);
-			if (ret)
-				return ret;
-		}
-	}
-
-	return NULL;
+	eeh_for_each_pe(root, pe)
+		eeh_pe_for_each_dev(pe, edev, tmp)
+			fn(edev, flag);
 }
 
 /**
@@ -379,8 +372,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 
 	/* Check if the PE number is valid */
 	if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) {
-		pr_err("%s: Invalid PE#0 for edev 0x%x on PHB#%x\n",
-		       __func__, config_addr, pdn->phb->global_number);
+		eeh_edev_err(edev, "PE#0 is invalid for this PHB!\n");
 		return -EINVAL;
 	}
 
@@ -391,42 +383,34 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 	 * components.
	 */
 	pe = eeh_pe_get(pdn->phb, edev->pe_config_addr, config_addr);
-	if (pe && !(pe->type & EEH_PE_INVALID)) {
-		/* Mark the PE as type of PCI bus */
-		pe->type = EEH_PE_BUS;
-		edev->pe = pe;
-
-		/* Put the edev to PE */
-		list_add_tail(&edev->entry, &pe->edevs);
-		pr_debug("EEH: Add %04x:%02x:%02x.%01x to Bus PE#%x\n",
-			 pdn->phb->global_number,
-			 pdn->busno,
-			 PCI_SLOT(pdn->devfn),
-			 PCI_FUNC(pdn->devfn),
-			 pe->addr);
-		return 0;
-	} else if (pe && (pe->type & EEH_PE_INVALID)) {
-		list_add_tail(&edev->entry, &pe->edevs);
-		edev->pe = pe;
-		/*
-		 * We're running to here because of PCI hotplug caused by
-		 * EEH recovery. We need clear EEH_PE_INVALID until the top.
-		 */
-		parent = pe;
-		while (parent) {
-			if (!(parent->type & EEH_PE_INVALID))
-				break;
-			parent->type &= ~EEH_PE_INVALID;
-			parent = parent->parent;
-		}
+	if (pe) {
+		if (pe->type & EEH_PE_INVALID) {
+			list_add_tail(&edev->entry, &pe->edevs);
+			edev->pe = pe;
+			/*
+			 * We're running to here because of PCI hotplug caused by
+			 * EEH recovery. We need clear EEH_PE_INVALID until the top.
+			 */
+			parent = pe;
+			while (parent) {
+				if (!(parent->type & EEH_PE_INVALID))
+					break;
+				parent->type &= ~EEH_PE_INVALID;
+				parent = parent->parent;
+			}
+
+			eeh_edev_dbg(edev,
+				     "Added to device PE (parent: PE#%x)\n",
+				     pe->parent->addr);
+		} else {
+			/* Mark the PE as type of PCI bus */
+			pe->type = EEH_PE_BUS;
+			edev->pe = pe;
 
-		pr_debug("EEH: Add %04x:%02x:%02x.%01x to Device "
-			 "PE#%x, Parent PE#%x\n",
-			 pdn->phb->global_number,
-			 pdn->busno,
-			 PCI_SLOT(pdn->devfn),
-			 PCI_FUNC(pdn->devfn),
-			 pe->addr, pe->parent->addr);
+			/* Put the edev to PE */
+			list_add_tail(&edev->entry, &pe->edevs);
+			eeh_edev_dbg(edev, "Added to bus PE\n");
+		}
 		return 0;
 	}
 
@@ -468,13 +452,8 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 	list_add_tail(&pe->child, &parent->child_list);
 	list_add_tail(&edev->entry, &pe->edevs);
 	edev->pe = pe;
-	pr_debug("EEH: Add %04x:%02x:%02x.%01x to "
-		 "Device PE#%x, Parent PE#%x\n",
-		 pdn->phb->global_number,
-		 pdn->busno,
-		 PCI_SLOT(pdn->devfn),
-		 PCI_FUNC(pdn->devfn),
-		 pe->addr, pe->parent->addr);
+	eeh_edev_dbg(edev, "Added to device PE (parent: PE#%x)\n",
+		     pe->parent->addr);
 
 	return 0;
 }
@@ -491,16 +470,12 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
 {
 	struct eeh_pe *pe, *parent, *child;
+	bool keep, recover;
 	int cnt;
-	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
 
 	pe = eeh_dev_to_pe(edev);
 	if (!pe) {
-		pr_debug("%s: No PE found for device %04x:%02x:%02x.%01x\n",
-			 __func__,  pdn->phb->global_number,
-			 pdn->busno,
-			 PCI_SLOT(pdn->devfn),
-			 PCI_FUNC(pdn->devfn));
+		eeh_edev_dbg(edev, "No PE found for device.\n");
 		return -EEXIST;
 	}
 
@@ -516,10 +491,21 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
 	 */
 	while (1) {
 		parent = pe->parent;
+
+		/* PHB PEs should never be removed */
 		if (pe->type & EEH_PE_PHB)
 			break;
 
-		if (!(pe->state & EEH_PE_KEEP)) {
+		/*
+		 * XXX: KEEP is set while resetting a PE. I don't think it's
+		 *      ever set without RECOVERING also being set. I could
+		 *      be wrong though so catch that with a WARN.
+		 */
+		keep = !!(pe->state & EEH_PE_KEEP);
+		recover = !!(pe->state & EEH_PE_RECOVERING);
+		WARN_ON(keep && !recover);
+
+		if (!keep && !recover) {
 			if (list_empty(&pe->edevs) &&
 			    list_empty(&pe->child_list)) {
 				list_del(&pe->child);
@@ -528,6 +514,15 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
 				break;
 			}
 		} else {
+			/*
+			 * Mark the PE as invalid. At the end of the recovery
+			 * process any invalid PEs will be garbage collected.
+			 *
+			 * We need to delay the free()ing of them since we can
+			 * remove edev's while traversing the PE tree which
+			 * might trigger the removal of a PE and we can't
+			 * deal with that (yet).
+			 */
 			if (list_empty(&pe->edevs)) {
 				cnt = 0;
 				list_for_each_entry(child, &pe->child_list, child) {
@@ -623,13 +618,11 @@ void eeh_pe_mark_isolated(struct eeh_pe *root)
 }
 EXPORT_SYMBOL_GPL(eeh_pe_mark_isolated);
 
-static void *__eeh_pe_dev_mode_mark(struct eeh_dev *edev, void *flag)
+static void __eeh_pe_dev_mode_mark(struct eeh_dev *edev, void *flag)
 {
 	int mode = *((int *)flag);
 
 	edev->mode |= mode;
-
-	return NULL;
 }
 
 /**
@@ -717,17 +710,13 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
 	if (!(edev->mode & (EEH_DEV_ROOT_PORT | EEH_DEV_DS_PORT)))
 		return;
 
-	pr_debug("%s: Check PCIe link for %04x:%02x:%02x.%01x ...\n",
-		 __func__, pdn->phb->global_number,
-		 pdn->busno,
-		 PCI_SLOT(pdn->devfn),
-		 PCI_FUNC(pdn->devfn));
+	eeh_edev_dbg(edev, "Checking PCIe link...\n");
 
 	/* Check slot status */
 	cap = edev->pcie_cap;
 	eeh_ops->read_config(pdn, cap + PCI_EXP_SLTSTA, 2, &val);
 	if (!(val & PCI_EXP_SLTSTA_PDS)) {
-		pr_debug("  No card in the slot (0x%04x) !\n", val);
+		eeh_edev_dbg(edev, "No card in the slot (0x%04x)!\n", val);
 		return;
 	}
 
@@ -736,7 +725,7 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
 	if (val & PCI_EXP_SLTCAP_PCP) {
 		eeh_ops->read_config(pdn, cap + PCI_EXP_SLTCTL, 2, &val);
 		if (val & PCI_EXP_SLTCTL_PCC) {
-			pr_debug("  In power-off state, power it on ...\n");
+			eeh_edev_dbg(edev, "In power-off state, power it on ...\n");
 			val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC);
 			val |= (0x0100 & PCI_EXP_SLTCTL_PIC);
 			eeh_ops->write_config(pdn, cap + PCI_EXP_SLTCTL, 2, val);
@@ -752,7 +741,7 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
 	/* Check link */
 	eeh_ops->read_config(pdn, cap + PCI_EXP_LNKCAP, 4, &val);
 	if (!(val & PCI_EXP_LNKCAP_DLLLARC)) {
-		pr_debug("  No link reporting capability (0x%08x) \n", val);
+		eeh_edev_dbg(edev, "No link reporting capability (0x%08x)\n", val);
 		msleep(1000);
 		return;
 	}
@@ -769,10 +758,10 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
 	}
 
 	if (val & PCI_EXP_LNKSTA_DLLLA)
-		pr_debug("  Link up (%s)\n",
+		eeh_edev_dbg(edev, "Link up (%s)\n",
 			 (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? "2.5GB" : "5GB");
 	else
-		pr_debug("  Link not ready (0x%04x)\n", val);
+		eeh_edev_dbg(edev, "Link not ready (0x%04x)\n", val);
 }
 
 #define BYTE_SWAP(OFF)	(8*((OFF)/4)+3-(OFF))
@@ -852,7 +841,7 @@ static void eeh_restore_device_bars(struct eeh_dev *edev)
 * the expansion ROM base address, the latency timer, and etc.
 * from the saved values in the device node.
 */
-static void *eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag)
+static void eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag)
 {
 	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
 
@@ -864,8 +853,6 @@ static void eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag)
 
 	if (eeh_ops->restore_config && pdn)
 		eeh_ops->restore_config(pdn);
-
-	return NULL;
 }
 
 /**
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 54fab22c9a43..d60908ea37fb 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -230,7 +230,7 @@ transfer_to_handler_cont:
 	 */
 	lis	r12,reenable_mmu@h
 	ori	r12,r12,reenable_mmu@l
-	LOAD_MSR_KERNEL(r0, MSR_KERNEL)
+	LOAD_REG_IMMEDIATE(r0, MSR_KERNEL)
 	mtspr	SPRN_SRR0,r12
 	mtspr	SPRN_SRR1,r0
 	SYNC
@@ -304,7 +304,7 @@ stack_ovf:
 	addi	r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
 	lis	r9,StackOverflow@ha
 	addi	r9,r9,StackOverflow@l
-	LOAD_MSR_KERNEL(r10,MSR_KERNEL)
+	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL)
 #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
 	mtspr	SPRN_NRI, r0
 #endif
@@ -324,7 +324,7 @@ trace_syscall_entry_irq_off:
 	bl	trace_hardirqs_on
 
 	/* Now enable for real */
-	LOAD_MSR_KERNEL(r10, MSR_KERNEL | MSR_EE)
+	LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE)
 	mtmsr	r10
 
 	REST_GPR(0, r1)
@@ -394,7 +394,7 @@ ret_from_syscall:
 #endif
 	mr	r6,r3
 	/* disable interrupts so current_thread_info()->flags can't change */
-	LOAD_MSR_KERNEL(r10,MSR_KERNEL)	/* doesn't include MSR_EE */
+	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL)	/* doesn't include MSR_EE */
 	/* Note: We don't bother telling lockdep about it */
 	SYNC
 	MTMSRD(r10)
@@ -777,11 +777,19 @@ fast_exception_return:
 1:	lis	r3,exc_exit_restart_end@ha
 	addi	r3,r3,exc_exit_restart_end@l
 	cmplw	r12,r3
+#ifdef CONFIG_PPC_BOOK3S_601
+	bge	2b
+#else
 	bge	3f
+#endif
 	lis	r4,exc_exit_restart@ha
 	addi	r4,r4,exc_exit_restart@l
 	cmplw	r12,r4
+#ifdef CONFIG_PPC_BOOK3S_601
+	blt	2b
+#else
 	blt	3f
+#endif
 	lis	r3,fee_restarts@ha
 	tophys(r3,r3)
 	lwz	r5,fee_restarts@l(r3)
@@ -800,9 +808,6 @@ fee_restarts:
 	/* aargh, we don't know which trap this is */
 	/* but the 601 doesn't implement the RI bit, so assume it's OK */
 3:
-BEGIN_FTR_SECTION
-	b	2b
-END_FTR_SECTION_IFSET(CPU_FTR_601)
 	li	r10,-1
 	stw	r10,_TRAP(r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
@@ -824,7 +829,7 @@ ret_from_except:
 	 * can't change between when we test it and when we return
 	 * from the interrupt. */
 	/* Note: We don't bother telling lockdep about it */
-	LOAD_MSR_KERNEL(r10,MSR_KERNEL)
+	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL)
 	SYNC			/* Some chip revs have problems here... */
 	MTMSRD(r10)		/* disable interrupts */
@@ -991,7 +996,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
 	 * can restart the exception exit path at the label
 	 * exc_exit_restart below.  -- paulus
 	 */
-	LOAD_MSR_KERNEL(r10,MSR_KERNEL & ~MSR_RI)
+	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL & ~MSR_RI)
 	SYNC
 	MTMSRD(r10)		/* clear the RI bit */
 	.globl exc_exit_restart
@@ -1066,7 +1071,7 @@ exc_exit_restart_end:
 	REST_NVGPRS(r1);					\
 	lwz	r3,_MSR(r1);					\
 	andi.	r3,r3,MSR_PR;					\
-	LOAD_MSR_KERNEL(r10,MSR_KERNEL);			\
+	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL);			\
 	bne	user_exc_return;				\
 	lwz	r0,GPR0(r1);					\
 	lwz	r2,GPR2(r1);					\
@@ -1236,7 +1241,7 @@ recheck:
 	 * neither. Those disable/enable cycles used to peek at
	 * TI_FLAGS aren't advertised.
	 */
-	LOAD_MSR_KERNEL(r10,MSR_KERNEL)
+	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL)
 	SYNC
 	MTMSRD(r10)		/* disable interrupts */
 	lwz	r9,TI_FLAGS(r2)
@@ -1270,11 +1275,19 @@ nonrecoverable:
 	lis	r10,exc_exit_restart_end@ha
 	addi	r10,r10,exc_exit_restart_end@l
 	cmplw	r12,r10
+#ifdef CONFIG_PPC_BOOK3S_601
+	bgelr
+#else
 	bge	3f
+#endif
 	lis	r11,exc_exit_restart@ha
 	addi	r11,r11,exc_exit_restart@l
 	cmplw	r12,r11
+#ifdef CONFIG_PPC_BOOK3S_601
+	bltlr
+#else
 	blt	3f
+#endif
 	lis	r10,ee_restarts@ha
 	lwz	r12,ee_restarts@l(r10)
 	addi	r12,r12,1
@@ -1283,9 +1296,6 @@ nonrecoverable:
 	blr
 3:	/* OK, we can't recover, kill this process */
 	/* but the 601 doesn't implement the RI bit, so assume it's OK */
-BEGIN_FTR_SECTION
-	blr
-END_FTR_SECTION_IFSET(CPU_FTR_601)
 	lwz	r3,_TRAP(r1)
 	andi.	r0,r3,1
 	beq	5f
@@ -1329,7 +1339,7 @@ _GLOBAL(enter_rtas)
 	lwz	r4,RTASBASE(r4)
 	mfmsr	r9
 	stw	r9,8(r1)
-	LOAD_MSR_KERNEL(r0,MSR_KERNEL)
+	LOAD_REG_IMMEDIATE(r0,MSR_KERNEL)
 	SYNC			/* disable interrupts so SRR0/1 */
 	MTMSRD(r0)		/* don't get trashed */
 	li	r9,MSR_KERNEL & ~(MSR_IR|MSR_DR)
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 0a0b5310f54a..6467bdab8d40 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -69,24 +69,20 @@ BEGIN_FTR_SECTION
 	bne	.Ltabort_syscall
 END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
-	andi.	r10,r12,MSR_PR
 	mr	r10,r1
-	addi	r1,r1,-INT_FRAME_SIZE
-	beq-	1f
 	ld	r1,PACAKSAVE(r13)
-1:	std	r10,0(r1)
+	std	r10,0(r1)
 	std	r11,_NIP(r1)
 	std	r12,_MSR(r1)
 	std	r0,GPR0(r1)
 	std	r10,GPR1(r1)
-	beq	2f			/* if from kernel mode */
 #ifdef CONFIG_PPC_FSL_BOOK3E
 START_BTB_FLUSH_SECTION
 	BTB_FLUSH(r10)
 END_BTB_FLUSH_SECTION
 #endif
 	ACCOUNT_CPU_USER_ENTRY(r13, r10, r11)
-2:	std	r2,GPR2(r1)
+	std	r2,GPR2(r1)
 	std	r3,GPR3(r1)
 	mfcr	r2
 	std	r4,GPR4(r1)
@@ -122,14 +118,13 @@ END_BTB_FLUSH_SECTION
 
 #if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR)
 BEGIN_FW_FTR_SECTION
-	beq	33f
-	/* if from user, see if there are any DTL entries to process */
+	/* see if there are any DTL entries to process */
 	ld	r10,PACALPPACAPTR(r13)	/* get ptr to VPA */
 	ld	r11,PACA_DTL_RIDX(r13)	/* get log read index */
 	addi	r10,r10,LPPACA_DTLIDX
 	LDX_BE	r10,0,r10		/* get log write index */
-	cmpd	cr1,r11,r10
-	beq+	cr1,33f
+	cmpd	r11,r10
+	beq+	33f
 	bl	accumulate_stolen_time
 	REST_GPR(0,r1)
 	REST_4GPRS(3,r1)
@@ -203,6 +198,7 @@ system_call:			/* label this so stack traces look sane */
 	mtctr   r12
 	bctrl			/* Call handler */
 
+	/* syscall_exit can exit to kernel mode, via ret_from_kernel_thread */
 .Lsyscall_exit:
 	std	r3,RESULT(r1)
 
@@ -216,11 +212,6 @@ system_call:			/* label this so stack traces look sane */
 	ld	r12, PACA_THREAD_INFO(r13)
 
 	ld	r8,_MSR(r1)
-#ifdef CONFIG_PPC_BOOK3S
-	/* No MSR:RI on BookE */
-	andi.	r10,r8,MSR_RI
-	beq-	.Lunrecov_restore
-#endif
 
 /*
 * This is a few instructions into the actual syscall exit path (which actually
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index 1cfb3da4a84a..829950b96d29 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -750,12 +750,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	ld	r15,PACATOC(r13)
 	ld	r14,interrupt_base_book3e@got(r15)
 	ld	r15,__end_interrupts@got(r15)
-#else
-	LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
-	LOAD_REG_IMMEDIATE(r15,__end_interrupts)
-#endif
 	cmpld	cr0,r10,r14
 	cmpld	cr1,r10,r15
+#else
+	LOAD_REG_IMMEDIATE_SYM(r14, r15, interrupt_base_book3e)
+	cmpld	cr0, r10, r14
+	LOAD_REG_IMMEDIATE_SYM(r14, r15, __end_interrupts)
+	cmpld	cr1, r10, r14
+#endif
 	blt+	cr0,1f
 	bge+	cr1,1f
 
@@ -820,12 +822,14 @@ kernel_dbg_exc:
 	ld	r15,PACATOC(r13)
 	ld	r14,interrupt_base_book3e@got(r15)
 	ld	r15,__end_interrupts@got(r15)
-#else
-	LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
-	LOAD_REG_IMMEDIATE(r15,__end_interrupts)
-#endif
 	cmpld	cr0,r10,r14
 	cmpld	cr1,r10,r15
+#else
+	LOAD_REG_IMMEDIATE_SYM(r14, r15, interrupt_base_book3e)
+	cmpld	cr0, r10, r14
+	LOAD_REG_IMMEDIATE_SYM(r14, r15,__end_interrupts)
+	cmpld	cr1, r10, r14
+#endif
 	blt+	cr0,1f
 	bge+	cr1,1f
 
@@ -1449,7 +1453,7 @@ a2_tlbinit_code_start:
 a2_tlbinit_after_linear_map:
 
 	/* Now we branch the new virtual address mapped by this entry */
-	LOAD_REG_IMMEDIATE(r3,1f)
+	LOAD_REG_IMMEDIATE_SYM(r3, r5, 1f)
 	mtctr	r3
 	bctr
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 6ba3cc2ef8ab..d0018dd17e0a 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -44,6 +44,58 @@
 #endif
 
 /*
+ * Following are fixed section helper macros.
+ *
+ * EXC_REAL_BEGIN/END - real, unrelocated exception vectors
+ * EXC_VIRT_BEGIN/END - virt (AIL), unrelocated exception vectors
+ * TRAMP_REAL_BEGIN - real, unrelocated helpers (virt may call these)
+ * TRAMP_VIRT_BEGIN - virt, unreloc helpers (in practice, real can use)
+ * TRAMP_KVM_BEGIN - KVM handlers, these are put into real, unrelocated
+ * EXC_COMMON - After switching to virtual, relocated mode.
+ */
+
+#define EXC_REAL_BEGIN(name, start, size)			\
+	FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##name, start, size)
+
+#define EXC_REAL_END(name, start, size)				\
+	FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##name, start, size)
+
+#define EXC_VIRT_BEGIN(name, start, size)			\
+	FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size)
+
+#define EXC_VIRT_END(name, start, size)				\
+	FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size)
+
+#define EXC_COMMON_BEGIN(name)					\
+	USE_TEXT_SECTION();					\
+	.balign IFETCH_ALIGN_BYTES;				\
+	.global name;						\
+	_ASM_NOKPROBE_SYMBOL(name);				\
+	DEFINE_FIXED_SYMBOL(name);				\
+name:
+
+#define TRAMP_REAL_BEGIN(name)					\
+	FIXED_SECTION_ENTRY_BEGIN(real_trampolines, name)
+
+#define TRAMP_VIRT_BEGIN(name)					\
+	FIXED_SECTION_ENTRY_BEGIN(virt_trampolines, name)
+
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#define TRAMP_KVM_BEGIN(name)					\
+	TRAMP_VIRT_BEGIN(name)
+#else
+#define TRAMP_KVM_BEGIN(name)
+#endif
+
+#define EXC_REAL_NONE(start, size)				\
+	FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##unused, start, size); \
+	FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##unused, start, size)
+
+#define EXC_VIRT_NONE(start, size)				\
+	FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size); \
+	FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size)
+
+/*
 * We're short on space and time in the exception prolog, so we can't
 * use the normal LOAD_REG_IMMEDIATE macro to load the address of label.
 * Instead we get the base of the kernel from paca->kernelbase and or in the low
@@ -68,6 +120,7 @@
 	addis	reg,reg,(ABS_ADDR(label))@h
 
 /* Exception register prefixes */
+#define EXC_HV_OR_STD	2 /* depends on HVMODE */
 #define EXC_HV		1
 #define EXC_STD		0
 
@@ -127,126 +180,6 @@ BEGIN_FTR_SECTION_NESTED(943)						\
 	std	ra,offset(r13);						\
 END_FTR_SECTION_NESTED(ftr,ftr,943)
 
-.macro EXCEPTION_PROLOG_0 area
-	SET_SCRATCH0(r13)			/* save r13 */
-	GET_PACA(r13)
-	std	r9,\area\()+EX_R9(r13)		/* save r9 */
-	OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR)
-	HMT_MEDIUM
-	std	r10,\area\()+EX_R10(r13)	/* save r10 - r12 */
-	OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR)
-.endm
-
-.macro EXCEPTION_PROLOG_1 hsrr, area, kvm, vec, dar, dsisr, bitmask
-	OPT_SAVE_REG_TO_PACA(\area\()+EX_PPR, r9, CPU_FTR_HAS_PPR)
-	OPT_SAVE_REG_TO_PACA(\area\()+EX_CFAR, r10, CPU_FTR_CFAR)
-	INTERRUPT_TO_KERNEL
-	SAVE_CTR(r10, \area\())
-	mfcr	r9
-	.if \kvm
-		KVMTEST \hsrr \vec
-	.endif
-	.if \bitmask
-		lbz	r10,PACAIRQSOFTMASK(r13)
-		andi.	r10,r10,\bitmask
-		/* Associate vector numbers with bits in paca->irq_happened */
-		.if \vec == 0x500 || \vec == 0xea0
-		li	r10,PACA_IRQ_EE
-		.elseif \vec == 0x900
-		li	r10,PACA_IRQ_DEC
-		.elseif \vec == 0xa00 || \vec == 0xe80
-		li	r10,PACA_IRQ_DBELL
-		.elseif \vec == 0xe60
-		li	r10,PACA_IRQ_HMI
-		.elseif \vec == 0xf00
-		li	r10,PACA_IRQ_PMI
-		.else
-		.abort "Bad maskable vector"
-		.endif
-
-		.if \hsrr
-		bne	masked_Hinterrupt
-		.else
-		bne	masked_interrupt
-		.endif
-	.endif
-
-	std	r11,\area\()+EX_R11(r13)
-	std	r12,\area\()+EX_R12(r13)
-
-	/*
-	 * DAR/DSISR, SCRATCH0 must be read before setting MSR[RI],
-	 * because a d-side MCE will clobber those registers so is
-	 * not recoverable if they are live.
-	 */
-	GET_SCRATCH0(r10)
-	std	r10,\area\()+EX_R13(r13)
-	.if \dar
-	mfspr	r10,SPRN_DAR
-	std	r10,\area\()+EX_DAR(r13)
-	.endif
-	.if \dsisr
-	mfspr	r10,SPRN_DSISR
-	stw	r10,\area\()+EX_DSISR(r13)
-	.endif
-.endm
-
-.macro EXCEPTION_PROLOG_2_REAL label, hsrr, set_ri
-	ld	r10,PACAKMSR(r13)	/* get MSR value for kernel */
-	.if ! \set_ri
-	xori	r10,r10,MSR_RI		/* Clear MSR_RI */
-	.endif
-	.if \hsrr
-	mfspr	r11,SPRN_HSRR0		/* save HSRR0 */
-	mfspr	r12,SPRN_HSRR1		/* and HSRR1 */
-	mtspr	SPRN_HSRR1,r10
-	.else
-	mfspr	r11,SPRN_SRR0		/* save SRR0 */
-	mfspr	r12,SPRN_SRR1		/* and SRR1 */
-	mtspr	SPRN_SRR1,r10
-	.endif
-	LOAD_HANDLER(r10, \label\())
-	.if \hsrr
-	mtspr	SPRN_HSRR0,r10
-	HRFI_TO_KERNEL
-	.else
-	mtspr	SPRN_SRR0,r10
-	RFI_TO_KERNEL
-	.endif
-	b	.	/* prevent speculative execution */
-.endm
-
-.macro EXCEPTION_PROLOG_2_VIRT label, hsrr
-#ifdef CONFIG_RELOCATABLE
-	.if \hsrr
-	mfspr	r11,SPRN_HSRR0	/* save HSRR0 */
-	.else
-	mfspr	r11,SPRN_SRR0	/* save SRR0 */
-	.endif
-	LOAD_HANDLER(r12, \label\())
-	mtctr	r12
-	.if \hsrr
-	mfspr	r12,SPRN_HSRR1	/* and HSRR1 */
-	.else
-	mfspr	r12,SPRN_SRR1	/* and HSRR1 */
-	.endif
-	li	r10,MSR_RI
-	mtmsrd	r10,1		/* Set RI (EE=0) */
-	bctr
-#else
-	.if \hsrr
-	mfspr	r11,SPRN_HSRR0	/* save HSRR0 */
-	mfspr	r12,SPRN_HSRR1	/* and HSRR1 */
-	.else
-	mfspr	r11,SPRN_SRR0	/* save SRR0 */
-	mfspr	r12,SPRN_SRR1	/* and SRR1 */
-	.endif
-	li	r10,MSR_RI
-	mtmsrd	r10,1		/* Set RI (EE=0) */
-	b	\label
-#endif
-.endm
-
 /*
 * Branch to label using its 0xC000 address. This results in instruction
 * address suitable for MSR[IR]=0 or 1, which allows relocation to be turned
@@ -260,6 +193,11 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	mtctr	reg;							\
 	bctr
 
+.macro INT_KVM_HANDLER name, vec, hsrr, area, skip
+	TRAMP_KVM_BEGIN(\name\()_kvm)
+	KVM_HANDLER \vec, \hsrr, \area, \skip
+.endm
+
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 /*
@@ -272,17 +210,13 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 #define kvmppc_interrupt kvmppc_interrupt_pr
 #endif
 
-.macro KVMTEST hsrr, n
+.macro KVMTEST name, hsrr, n
 	lbz	r10,HSTATE_IN_GUEST(r13)
 	cmpwi	r10,0
-	.if \hsrr
-	bne	do_kvm_H\n
-	.else
-	bne	do_kvm_\n
-	.endif
+	bne	\name\()_kvm
 .endm
 
-.macro KVM_HANDLER area, hsrr, n, skip
+.macro KVM_HANDLER vec, hsrr, area, skip
 	.if \skip
 	cmpwi	r10,KVM_GUEST_MODE_SKIP
 	beq	89f
@@ -301,10 +235,16 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
 	std	r12,HSTATE_SCRATCH0(r13)
 	sldi	r12,r9,32
 	/* HSRR variants have the 0x2 bit added to their trap number */
-	.if \hsrr
-	ori	r12,r12,(\n + 0x2)
+	.if \hsrr == EXC_HV_OR_STD
+	BEGIN_FTR_SECTION
+	ori	r12,r12,(\vec + 0x2)
+	FTR_SECTION_ELSE
+	ori	r12,r12,(\vec)
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+	.elseif \hsrr
+	ori	r12,r12,(\vec + 0x2)
 	.else
-	ori	r12,r12,(\n)
+	ori	r12,r12,(\vec)
 	.endif
 
 #ifdef CONFIG_RELOCATABLE
@@ -329,7 +269,13 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
 89:	mtocrf	0x80,r9
 	ld	r9,\area+EX_R9(r13)
 	ld	r10,\area+EX_R10(r13)
-	.if \hsrr
+	.if \hsrr == EXC_HV_OR_STD
+	BEGIN_FTR_SECTION
+	b	kvmppc_skip_Hinterrupt
+	FTR_SECTION_ELSE
+	b	kvmppc_skip_interrupt
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+	.elseif \hsrr
 	b	kvmppc_skip_Hinterrupt
 	.else
 	b	kvmppc_skip_interrupt
@@ -338,88 +284,328 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
 .endm
 
 #else
-.macro KVMTEST hsrr, n
+.macro KVMTEST name, hsrr, n
+.endm
+.macro KVM_HANDLER name, vec, hsrr, area, skip
 .endm
-.macro KVM_HANDLER area, hsrr, n, skip
-.endm
 #endif
 
+.macro INT_SAVE_SRR_AND_JUMP label,
hsrr, set_ri + ld r10,PACAKMSR(r13) /* get MSR value for kernel */ + .if ! \set_ri + xori r10,r10,MSR_RI /* Clear MSR_RI */ + .endif + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + mtspr SPRN_HSRR1,r10 + FTR_SECTION_ELSE + mfspr r11,SPRN_SRR0 /* save SRR0 */ + mfspr r12,SPRN_SRR1 /* and SRR1 */ + mtspr SPRN_SRR1,r10 + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + mtspr SPRN_HSRR1,r10 + .else + mfspr r11,SPRN_SRR0 /* save SRR0 */ + mfspr r12,SPRN_SRR1 /* and SRR1 */ + mtspr SPRN_SRR1,r10 + .endif + LOAD_HANDLER(r10, \label\()) + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mtspr SPRN_HSRR0,r10 + HRFI_TO_KERNEL + FTR_SECTION_ELSE + mtspr SPRN_SRR0,r10 + RFI_TO_KERNEL + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + mtspr SPRN_HSRR0,r10 + HRFI_TO_KERNEL + .else + mtspr SPRN_SRR0,r10 + RFI_TO_KERNEL + .endif + b . /* prevent speculative execution */ .endm + +/* INT_SAVE_SRR_AND_JUMP works for real or virt, this is faster but virt only */ +.macro INT_VIRT_SAVE_SRR_AND_JUMP label, hsrr +#ifdef CONFIG_RELOCATABLE + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + FTR_SECTION_ELSE + mfspr r11,SPRN_SRR0 /* save SRR0 */ + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + .else + mfspr r11,SPRN_SRR0 /* save SRR0 */ + .endif + LOAD_HANDLER(r12, \label\()) + mtctr r12 + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + FTR_SECTION_ELSE + mfspr r12,SPRN_SRR1 /* and HSRR1 */ + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + .else + mfspr r12,SPRN_SRR1 /* and HSRR1 */ + .endif + li r10,MSR_RI + mtmsrd r10,1 /* Set RI (EE=0) */ + bctr +#else + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + FTR_SECTION_ELSE + mfspr r11,SPRN_SRR0 /* save SRR0 */ + mfspr r12,SPRN_SRR1 /* and SRR1 */ + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + .else + mfspr r11,SPRN_SRR0 /* save SRR0 */ + mfspr r12,SPRN_SRR1 /* and SRR1 */ + .endif + li r10,MSR_RI + mtmsrd r10,1 /* Set RI (EE=0) */ + b \label #endif +.endm -#define EXCEPTION_PROLOG_COMMON_1() \ - std r9,_CCR(r1); /* save CR in stackframe */ \ - std r11,_NIP(r1); /* save SRR0 in stackframe */ \ - std r12,_MSR(r1); /* save SRR1 in stackframe */ \ - std r10,0(r1); /* make stack chain pointer */ \ - std r0,GPR0(r1); /* save r0 in stackframe */ \ - std r10,GPR1(r1); /* save r1 in stackframe */ \ - -/* Save original regs values from save area to stack frame. 
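
Both SRR-save macros take hsrr as EXC_STD (0), EXC_HV (1), or the new EXC_HV_OR_STD (2): the first two cases are resolved while assembling via `.if`, and only EXC_HV_OR_STD pays for a feature section, emitting both the SRR and HSRR sequences so the boot-time patcher keeps whichever matches CPU_FTR_HVMODE. A reduced sketch of the three-way dispatch shape, with the feature-section plumbing elided and only the architected SPR numbers assumed (SRR0 is SPR 26, HSRR0 is SPR 314):

	.set	EXC_STD, 0
	.set	EXC_HV, 1
	.set	EXC_HV_OR_STD, 2

	.macro	get_return_pc dest, hsrr
	.if \hsrr == EXC_HV_OR_STD
	/* the real macro wraps both reads in BEGIN_FTR_SECTION /
	 * FTR_SECTION_ELSE so only one survives boot patching */
	mfspr	\dest, 314		/* SPRN_HSRR0 */
	.elseif \hsrr
	mfspr	\dest, 314		/* SPRN_HSRR0 */
	.else
	mfspr	\dest, 26		/* SPRN_SRR0 */
	.endif
	.endm

	get_return_pc r11, EXC_HV_OR_STD
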
*/ -#define EXCEPTION_PROLOG_COMMON_2(area) \ - ld r9,area+EX_R9(r13); /* move r9, r10 to stackframe */ \ - ld r10,area+EX_R10(r13); \ - std r9,GPR9(r1); \ - std r10,GPR10(r1); \ - ld r9,area+EX_R11(r13); /* move r11 - r13 to stackframe */ \ - ld r10,area+EX_R12(r13); \ - ld r11,area+EX_R13(r13); \ - std r9,GPR11(r1); \ - std r10,GPR12(r1); \ - std r11,GPR13(r1); \ -BEGIN_FTR_SECTION_NESTED(66); \ - ld r10,area+EX_CFAR(r13); \ - std r10,ORIG_GPR3(r1); \ -END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \ - GET_CTR(r10, area); \ - std r10,_CTR(r1); - -#define EXCEPTION_PROLOG_COMMON_3(trap) \ - std r2,GPR2(r1); /* save r2 in stackframe */ \ - SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ - SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \ - mflr r9; /* Get LR, later save to stack */ \ - ld r2,PACATOC(r13); /* get kernel TOC into r2 */ \ - std r9,_LINK(r1); \ - lbz r10,PACAIRQSOFTMASK(r13); \ - mfspr r11,SPRN_XER; /* save XER in stackframe */ \ - std r10,SOFTE(r1); \ - std r11,_XER(r1); \ - li r9,(trap)+1; \ - std r9,_TRAP(r1); /* set trap number */ \ - li r10,0; \ - ld r11,exception_marker@toc(r2); \ - std r10,RESULT(r1); /* clear regs->result */ \ - std r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ +/* + * This is the BOOK3S interrupt entry code macro. + * + * This can result in one of several things happening: + * - Branch to the _common handler, relocated, in virtual mode. + * These are normal interrupts (synchronous and asynchronous) handled by + * the kernel. + * - Branch to KVM, relocated but real mode interrupts remain in real mode. + * These occur when HSTATE_IN_GUEST is set. The interrupt may be caused by + * / intended for host or guest kernel, but KVM must always be involved + * because the machine state is set for guest execution. + * - Branch to the masked handler, unrelocated. + * These occur when maskable asynchronous interrupts are taken with the + * irq_soft_mask set. + * - Branch to an "early" handler in real mode but relocated. + * This is done if early=1. MCE and HMI use these to handle errors in real + * mode. + * - Fall through and continue executing in real, unrelocated mode. + * This is done if early=2. + */ +.macro INT_HANDLER name, vec, ool=0, early=0, virt=0, hsrr=0, area=PACA_EXGEN, ri=1, dar=0, dsisr=0, bitmask=0, kvm=0 + SET_SCRATCH0(r13) /* save r13 */ + GET_PACA(r13) + std r9,\area\()+EX_R9(r13) /* save r9 */ + OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR) + HMT_MEDIUM + std r10,\area\()+EX_R10(r13) /* save r10 - r12 */ + OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR) + .if \ool + .if !\virt + b tramp_real_\name + .pushsection .text + TRAMP_REAL_BEGIN(tramp_real_\name) + .else + b tramp_virt_\name + .pushsection .text + TRAMP_VIRT_BEGIN(tramp_virt_\name) + .endif + .endif + + OPT_SAVE_REG_TO_PACA(\area\()+EX_PPR, r9, CPU_FTR_HAS_PPR) + OPT_SAVE_REG_TO_PACA(\area\()+EX_CFAR, r10, CPU_FTR_CFAR) + INTERRUPT_TO_KERNEL + SAVE_CTR(r10, \area\()) + mfcr r9 + .if \kvm + KVMTEST \name \hsrr \vec + .endif + .if \bitmask + lbz r10,PACAIRQSOFTMASK(r13) + andi. 
r10,r10,\bitmask + /* Associate vector numbers with bits in paca->irq_happened */ + .if \vec == 0x500 || \vec == 0xea0 + li r10,PACA_IRQ_EE + .elseif \vec == 0x900 + li r10,PACA_IRQ_DEC + .elseif \vec == 0xa00 || \vec == 0xe80 + li r10,PACA_IRQ_DBELL + .elseif \vec == 0xe60 + li r10,PACA_IRQ_HMI + .elseif \vec == 0xf00 + li r10,PACA_IRQ_PMI + .else + .abort "Bad maskable vector" + .endif + + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + bne masked_Hinterrupt + FTR_SECTION_ELSE + bne masked_interrupt + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + bne masked_Hinterrupt + .else + bne masked_interrupt + .endif + .endif + + std r11,\area\()+EX_R11(r13) + std r12,\area\()+EX_R12(r13) + + /* + * DAR/DSISR, SCRATCH0 must be read before setting MSR[RI], + * because a d-side MCE will clobber those registers so is + * not recoverable if they are live. + */ + GET_SCRATCH0(r10) + std r10,\area\()+EX_R13(r13) + .if \dar + .if \hsrr + mfspr r10,SPRN_HDAR + .else + mfspr r10,SPRN_DAR + .endif + std r10,\area\()+EX_DAR(r13) + .endif + .if \dsisr + .if \hsrr + mfspr r10,SPRN_HDSISR + .else + mfspr r10,SPRN_DSISR + .endif + stw r10,\area\()+EX_DSISR(r13) + .endif + + .if \early == 2 + /* nothing more */ + .elseif \early + mfctr r10 /* save ctr, even for !RELOCATABLE */ + BRANCH_TO_C000(r11, \name\()_early_common) + .elseif !\virt + INT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr, \ri + .else + INT_VIRT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr + .endif + .if \ool + .popsection + .endif +.endm /* * On entry r13 points to the paca, r9-r13 are saved in the paca, * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and * SRR1, and relocation is on. + * + * If stack=0, then the stack is already set in r1, and r1 is saved in r10. + * PPR save and CPU accounting is not done for the !stack case (XXX why not?) */ -#define EXCEPTION_COMMON(area, trap) \ - andi. r10,r12,MSR_PR; /* See if coming from user */ \ - mr r10,r1; /* Save r1 */ \ - subi r1,r1,INT_FRAME_SIZE; /* alloc frame on kernel stack */ \ - beq- 1f; \ - ld r1,PACAKSAVE(r13); /* kernel stack to use */ \ -1: tdgei r1,-INT_FRAME_SIZE; /* trap if r1 is in userspace */ \ - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0; \ -3: EXCEPTION_PROLOG_COMMON_1(); \ - kuap_save_amr_and_lock r9, r10, cr1, cr0; \ - beq 4f; /* if from kernel mode */ \ - ACCOUNT_CPU_USER_ENTRY(r13, r9, r10); \ - SAVE_PPR(area, r9); \ -4: EXCEPTION_PROLOG_COMMON_2(area); \ - EXCEPTION_PROLOG_COMMON_3(trap); \ +.macro INT_COMMON vec, area, stack, kaup, reconcile, dar, dsisr + .if \stack + andi. r10,r12,MSR_PR /* See if coming from user */ + mr r10,r1 /* Save r1 */ + subi r1,r1,INT_FRAME_SIZE /* alloc frame on kernel stack */ + beq- 100f + ld r1,PACAKSAVE(r13) /* kernel stack to use */ +100: tdgei r1,-INT_FRAME_SIZE /* trap if r1 is in userspace */ + EMIT_BUG_ENTRY 100b,__FILE__,__LINE__,0 + .endif + + std r9,_CCR(r1) /* save CR in stackframe */ + std r11,_NIP(r1) /* save SRR0 in stackframe */ + std r12,_MSR(r1) /* save SRR1 in stackframe */ + std r10,0(r1) /* make stack chain pointer */ + std r0,GPR0(r1) /* save r0 in stackframe */ + std r10,GPR1(r1) /* save r1 in stackframe */ + + .if \stack + .if \kaup + kuap_save_amr_and_lock r9, r10, cr1, cr0 + .endif + beq 101f /* if from kernel mode */ + ACCOUNT_CPU_USER_ENTRY(r13, r9, r10) + SAVE_PPR(\area, r9) +101: + .else + .if \kaup + kuap_save_amr_and_lock r9, r10, cr1 + .endif + .endif + + /* Save original regs values from save area to stack frame. 
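
The maskable path above resolves the architected vector number to a paca->irq_happened bit entirely at assembly time, so a vector outside the known set fails the build through `.abort` rather than masking the wrong interrupt source at run time. A self-contained sketch of the idiom; the bit encodings are stand-ins, not the kernel's PACA_IRQ_* values:

	.macro	irq_happened_bit vec
	.if \vec == 0x500 || \vec == 0xea0
	li	r10, 0x04		/* stand-in for PACA_IRQ_EE */
	.elseif \vec == 0x900
	li	r10, 0x08		/* stand-in for PACA_IRQ_DEC */
	.elseif \vec == 0xf00
	li	r10, 0x20		/* stand-in for PACA_IRQ_PMI */
	.else
	.abort "Bad maskable vector"
	.endif
	.endm

	irq_happened_bit 0x900		/* assembles cleanly */
	/* irq_happened_bit 0x600 would stop the assembler */
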
*/ + ld r9,\area+EX_R9(r13) /* move r9, r10 to stackframe */ + ld r10,\area+EX_R10(r13) + std r9,GPR9(r1) + std r10,GPR10(r1) + ld r9,\area+EX_R11(r13) /* move r11 - r13 to stackframe */ + ld r10,\area+EX_R12(r13) + ld r11,\area+EX_R13(r13) + std r9,GPR11(r1) + std r10,GPR12(r1) + std r11,GPR13(r1) + .if \dar + .if \dar == 2 + ld r10,_NIP(r1) + .else + ld r10,\area+EX_DAR(r13) + .endif + std r10,_DAR(r1) + .endif + .if \dsisr + .if \dsisr == 2 + ld r10,_MSR(r1) + lis r11,DSISR_SRR1_MATCH_64S@h + and r10,r10,r11 + .else + lwz r10,\area+EX_DSISR(r13) + .endif + std r10,_DSISR(r1) + .endif +BEGIN_FTR_SECTION_NESTED(66) + ld r10,\area+EX_CFAR(r13) + std r10,ORIG_GPR3(r1) +END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66) + GET_CTR(r10, \area) + std r10,_CTR(r1) + std r2,GPR2(r1) /* save r2 in stackframe */ + SAVE_4GPRS(3, r1) /* save r3 - r6 in stackframe */ + SAVE_2GPRS(7, r1) /* save r7, r8 in stackframe */ + mflr r9 /* Get LR, later save to stack */ + ld r2,PACATOC(r13) /* get kernel TOC into r2 */ + std r9,_LINK(r1) + lbz r10,PACAIRQSOFTMASK(r13) + mfspr r11,SPRN_XER /* save XER in stackframe */ + std r10,SOFTE(r1) + std r11,_XER(r1) + li r9,(\vec)+1 + std r9,_TRAP(r1) /* set trap number */ + li r10,0 + ld r11,exception_marker@toc(r2) + std r10,RESULT(r1) /* clear regs->result */ + std r11,STACK_FRAME_OVERHEAD-16(r1) /* mark the frame */ + + .if \stack ACCOUNT_STOLEN_TIME + .endif -/* - * Exception where stack is already set in r1, r1 is saved in r10. - * PPR save and CPU accounting is not done (for some reason). - */ -#define EXCEPTION_COMMON_STACK(area, trap) \ - EXCEPTION_PROLOG_COMMON_1(); \ - kuap_save_amr_and_lock r9, r10, cr1; \ - EXCEPTION_PROLOG_COMMON_2(area); \ - EXCEPTION_PROLOG_COMMON_3(trap) + .if \reconcile + RECONCILE_IRQ_STATE(r10, r11) + .endif +.endm /* * Restore all registers including H/SRR0/1 saved in a stack frame of a @@ -428,6 +614,9 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \ .macro EXCEPTION_RESTORE_REGS hsrr /* Move original SRR0 and SRR1 into the respective regs */ ld r9,_MSR(r1) + .if \hsrr == EXC_HV_OR_STD + .error "EXC_HV_OR_STD Not implemented for EXCEPTION_RESTORE_REGS" + .endif .if \hsrr mtspr SPRN_HSRR1,r9 .else @@ -481,219 +670,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) #define FINISH_NAP #endif -/* - * Following are the BOOK3S exception handler helper macros. - * Handlers come in a number of types, and each type has a number of varieties. - * - * EXC_REAL_* - real, unrelocated exception vectors - * EXC_VIRT_* - virt (AIL), unrelocated exception vectors - * TRAMP_REAL_* - real, unrelocated helpers (virt can call these) - * TRAMP_VIRT_* - virt, unreloc helpers (in practice, real can use) - * TRAMP_KVM - KVM handlers that get put into real, unrelocated - * EXC_COMMON - virt, relocated common handlers - * - * The EXC handlers are given a name, and branch to name_common, or the - * appropriate KVM or masking function. 
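
INT_COMMON's dar and dsisr arguments are tri-state: 0 skips the field, 1 copies the value the prolog stashed in the paca save area, and 2 synthesizes it from the saved registers (NIP as the fault address, MSR masked with DSISR_SRR1_MATCH_64S), which is how instruction_access below gets uniform _DAR/_DSISR fields without hardware DAR/DSISR. A reduced sketch of just the dar handling; the frame and save-area offsets are placeholders, not the real _NIP/_DAR/EX_DAR values:

	.macro	save_dar dar, area
	.if \dar
	.if \dar == 2
	ld	r10, 176(r1)		/* placeholder for _NIP(r1) */
	.else
	ld	r10, \area+16(r13)	/* placeholder for EX_DAR */
	.endif
	std	r10, 168(r1)		/* placeholder for _DAR(r1) */
	.endif
	.endm

	save_dar 2, 0			/* _DAR := saved NIP */
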
Vector handler varieties are as - follows: - * - * EXC_{REAL|VIRT}_BEGIN/END - used to open-code the exception - * - * EXC_{REAL|VIRT} - standard exception - * - * EXC_{REAL|VIRT}_suffix - * where _suffix is: - * - _MASKABLE - maskable exception - * - _OOL - out of line with trampoline to common handler - * - _HV - HV exception - * - * There can be combinations, e.g., EXC_VIRT_OOL_MASKABLE_HV - * - * KVM handlers come in the following varieties: - * TRAMP_KVM - * TRAMP_KVM_SKIP - * TRAMP_KVM_HV - * TRAMP_KVM_HV_SKIP - * - * COMMON handlers come in the following varieties: - * EXC_COMMON_BEGIN/END - used to open-code the handler - * EXC_COMMON - * EXC_COMMON_ASYNC - * - * TRAMP_REAL and TRAMP_VIRT can be used with BEGIN/END. KVM - * and OOL handlers are implemented as types of TRAMP and TRAMP_VIRT handlers. - */ - -#define __EXC_REAL(name, start, size, area) \ - EXC_REAL_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 area ; \ - EXCEPTION_PROLOG_1 EXC_STD, area, 1, start, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 ; \ - EXC_REAL_END(name, start, size) - -#define EXC_REAL(name, start, size) \ - __EXC_REAL(name, start, size, PACA_EXGEN) - -#define __EXC_VIRT(name, start, size, realvec, area) \ - EXC_VIRT_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 area ; \ - EXCEPTION_PROLOG_1 EXC_STD, area, 0, realvec, 0, 0, 0; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD ; \ - EXC_VIRT_END(name, start, size) - -#define EXC_VIRT(name, start, size, realvec) \ - __EXC_VIRT(name, start, size, realvec, PACA_EXGEN) - -#define EXC_REAL_MASKABLE(name, start, size, bitmask) \ - EXC_REAL_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 PACA_EXGEN ; \ - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, start, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 ; \ - EXC_REAL_END(name, start, size) - -#define EXC_VIRT_MASKABLE(name, start, size, realvec, bitmask) \ - EXC_VIRT_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 PACA_EXGEN ; \ - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, realvec, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD ; \ - EXC_VIRT_END(name, start, size) - -#define EXC_REAL_HV(name, start, size) \ - EXC_REAL_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 PACA_EXGEN; \ - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, start, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_HV, 1 ; \ - EXC_REAL_END(name, start, size) - -#define EXC_VIRT_HV(name, start, size, realvec) \ - EXC_VIRT_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 PACA_EXGEN; \ - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, realvec, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_HV ; \ - EXC_VIRT_END(name, start, size) - -#define __EXC_REAL_OOL(name, start, size) \ - EXC_REAL_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 PACA_EXGEN ; \ - b tramp_real_##name ; \ - EXC_REAL_END(name, start, size) - -#define __TRAMP_REAL_OOL(name, vec) \ - TRAMP_REAL_BEGIN(tramp_real_##name); \ - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, vec, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 - -#define EXC_REAL_OOL(name, start, size) \ - __EXC_REAL_OOL(name, start, size); \ - __TRAMP_REAL_OOL(name, start) - -#define __EXC_REAL_OOL_MASKABLE(name, start, size) \ - __EXC_REAL_OOL(name, start, size) - -#define __TRAMP_REAL_OOL_MASKABLE(name, vec, bitmask) \ - TRAMP_REAL_BEGIN(tramp_real_##name); \ - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, vec, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 - -#define EXC_REAL_OOL_MASKABLE(name, start, size, 
bitmask) \ - __EXC_REAL_OOL_MASKABLE(name, start, size); \ - __TRAMP_REAL_OOL_MASKABLE(name, start, bitmask) - -#define __EXC_REAL_OOL_HV(name, start, size) \ - __EXC_REAL_OOL(name, start, size) - -#define __TRAMP_REAL_OOL_HV(name, vec) \ - TRAMP_REAL_BEGIN(tramp_real_##name); \ - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, vec, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_HV, 1 - -#define EXC_REAL_OOL_HV(name, start, size) \ - __EXC_REAL_OOL_HV(name, start, size); \ - __TRAMP_REAL_OOL_HV(name, start) - -#define __EXC_REAL_OOL_MASKABLE_HV(name, start, size) \ - __EXC_REAL_OOL(name, start, size) - -#define __TRAMP_REAL_OOL_MASKABLE_HV(name, vec, bitmask) \ - TRAMP_REAL_BEGIN(tramp_real_##name); \ - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, vec, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_HV, 1 - -#define EXC_REAL_OOL_MASKABLE_HV(name, start, size, bitmask) \ - __EXC_REAL_OOL_MASKABLE_HV(name, start, size); \ - __TRAMP_REAL_OOL_MASKABLE_HV(name, start, bitmask) - -#define __EXC_VIRT_OOL(name, start, size) \ - EXC_VIRT_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 PACA_EXGEN ; \ - b tramp_virt_##name; \ - EXC_VIRT_END(name, start, size) - -#define __TRAMP_VIRT_OOL(name, realvec) \ - TRAMP_VIRT_BEGIN(tramp_virt_##name); \ - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, vec, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD - -#define EXC_VIRT_OOL(name, start, size, realvec) \ - __EXC_VIRT_OOL(name, start, size); \ - __TRAMP_VIRT_OOL(name, realvec) - -#define __EXC_VIRT_OOL_MASKABLE(name, start, size) \ - __EXC_VIRT_OOL(name, start, size) - -#define __TRAMP_VIRT_OOL_MASKABLE(name, realvec, bitmask) \ - TRAMP_VIRT_BEGIN(tramp_virt_##name); \ - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, realvec, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 - -#define EXC_VIRT_OOL_MASKABLE(name, start, size, realvec, bitmask) \ - __EXC_VIRT_OOL_MASKABLE(name, start, size); \ - __TRAMP_VIRT_OOL_MASKABLE(name, realvec, bitmask) - -#define __EXC_VIRT_OOL_HV(name, start, size) \ - __EXC_VIRT_OOL(name, start, size) - -#define __TRAMP_VIRT_OOL_HV(name, realvec) \ - TRAMP_VIRT_BEGIN(tramp_virt_##name); \ - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, realvec, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_HV - -#define EXC_VIRT_OOL_HV(name, start, size, realvec) \ - __EXC_VIRT_OOL_HV(name, start, size); \ - __TRAMP_VIRT_OOL_HV(name, realvec) - -#define __EXC_VIRT_OOL_MASKABLE_HV(name, start, size) \ - __EXC_VIRT_OOL(name, start, size) - -#define __TRAMP_VIRT_OOL_MASKABLE_HV(name, realvec, bitmask) \ - TRAMP_VIRT_BEGIN(tramp_virt_##name); \ - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, realvec, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_HV - -#define EXC_VIRT_OOL_MASKABLE_HV(name, start, size, realvec, bitmask) \ - __EXC_VIRT_OOL_MASKABLE_HV(name, start, size); \ - __TRAMP_VIRT_OOL_MASKABLE_HV(name, realvec, bitmask) - -#define TRAMP_KVM(area, n) \ - TRAMP_KVM_BEGIN(do_kvm_##n); \ - KVM_HANDLER area, EXC_STD, n, 0 - -#define TRAMP_KVM_SKIP(area, n) \ - TRAMP_KVM_BEGIN(do_kvm_##n); \ - KVM_HANDLER area, EXC_STD, n, 1 - -#define TRAMP_KVM_HV(area, n) \ - TRAMP_KVM_BEGIN(do_kvm_H##n); \ - KVM_HANDLER area, EXC_HV, n, 0 - -#define TRAMP_KVM_HV_SKIP(area, n) \ - TRAMP_KVM_BEGIN(do_kvm_H##n); \ - KVM_HANDLER area, EXC_HV, n, 1 - #define EXC_COMMON(name, realvec, hdlr) \ EXC_COMMON_BEGIN(name); \ - EXCEPTION_COMMON(PACA_EXGEN, realvec); \ + INT_COMMON realvec, PACA_EXGEN, 1, 1, 1, 0, 0 ; \ bl save_nvgprs; \ - RECONCILE_IRQ_STATE(r10, r11); 
\ addi r3,r1,STACK_FRAME_OVERHEAD; \ bl hdlr; \ b ret_from_except @@ -704,9 +684,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) */ #define EXC_COMMON_ASYNC(name, realvec, hdlr) \ EXC_COMMON_BEGIN(name); \ - EXCEPTION_COMMON(PACA_EXGEN, realvec); \ + INT_COMMON realvec, PACA_EXGEN, 1, 1, 1, 0, 0 ; \ FINISH_NAP; \ - RECONCILE_IRQ_STATE(r10, r11); \ RUNLATCH_ON; \ addi r3,r1,STACK_FRAME_OVERHEAD; \ bl hdlr; \ @@ -836,9 +815,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #endif - EXCEPTION_PROLOG_0 PACA_EXNMI - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXNMI, 1, 0x100, 0, 0, 0 - EXCEPTION_PROLOG_2_REAL system_reset_common, EXC_STD, 0 + INT_HANDLER system_reset, 0x100, area=PACA_EXNMI, ri=0, kvm=1 /* * MSR_RI is not enabled, because PACA_EXNMI and nmi stack is * being used, so a nested NMI exception would corrupt it. @@ -850,9 +827,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) * be dangerous anyway. */ EXC_REAL_END(system_reset, 0x100, 0x100) - EXC_VIRT_NONE(0x4100, 0x100) -TRAMP_KVM(PACA_EXNMI, 0x100) +INT_KVM_HANDLER system_reset 0x100, EXC_STD, PACA_EXNMI, 0 #ifdef CONFIG_PPC_P7_NAP TRAMP_REAL_BEGIN(system_reset_idle_wake) @@ -868,9 +844,7 @@ TRAMP_REAL_BEGIN(system_reset_idle_wake) */ TRAMP_REAL_BEGIN(system_reset_fwnmi) /* See comment at system_reset exception, don't turn on RI */ - EXCEPTION_PROLOG_0 PACA_EXNMI - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXNMI, 0, 0x100, 0, 0, 0 - EXCEPTION_PROLOG_2_REAL system_reset_common, EXC_STD, 0 + INT_HANDLER system_reset, 0x100, area=PACA_EXNMI, ri=0 #endif /* CONFIG_PPC_PSERIES */ @@ -890,7 +864,7 @@ EXC_COMMON_BEGIN(system_reset_common) mr r10,r1 ld r1,PACA_NMI_EMERG_SP(r13) subi r1,r1,INT_FRAME_SIZE - EXCEPTION_COMMON_STACK(PACA_EXNMI, 0x100) + INT_COMMON 0x100, PACA_EXNMI, 0, 1, 0, 0, 0 bl save_nvgprs /* * Set IRQS_ALL_DISABLED unconditionally so arch_irqs_disabled does @@ -933,26 +907,39 @@ EXC_COMMON_BEGIN(system_reset_common) EXC_REAL_BEGIN(machine_check, 0x200, 0x100) - /* This is moved out of line as it can be patched by FW, but - * some code path might still want to branch into the original - * vector + INT_HANDLER machine_check, 0x200, early=1, area=PACA_EXMC, dar=1, dsisr=1 + /* + * MSR_RI is not enabled, because PACA_EXMC is being used, so a + * nested machine check corrupts it. machine_check_common enables + * MSR_RI. */ - EXCEPTION_PROLOG_0 PACA_EXMC -BEGIN_FTR_SECTION - b machine_check_common_early -FTR_SECTION_ELSE - b machine_check_pSeries_0 -ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) EXC_REAL_END(machine_check, 0x200, 0x100) EXC_VIRT_NONE(0x4200, 0x100) -TRAMP_REAL_BEGIN(machine_check_common_early) - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 0, 0, 0 + +#ifdef CONFIG_PPC_PSERIES +TRAMP_REAL_BEGIN(machine_check_fwnmi) + /* See comment at machine_check exception, don't turn on RI */ + INT_HANDLER machine_check, 0x200, early=1, area=PACA_EXMC, dar=1, dsisr=1 +#endif + +INT_KVM_HANDLER machine_check 0x200, EXC_STD, PACA_EXMC, 1 + +#define MACHINE_CHECK_HANDLER_WINDUP \ + /* Clear MSR_RI before setting SRR0 and SRR1. */\ + li r9,0; \ + mtmsrd r9,1; /* Clear MSR_RI */ \ + /* Decrement paca->in_mce now RI is clear. 
*/ \ + lhz r12,PACA_IN_MCE(r13); \ + subi r12,r12,1; \ + sth r12,PACA_IN_MCE(r13); \ + EXCEPTION_RESTORE_REGS EXC_STD + +EXC_COMMON_BEGIN(machine_check_early_common) + mtctr r10 /* Restore ctr */ + mfspr r11,SPRN_SRR0 + mfspr r12,SPRN_SRR1 + /* - * Register contents: - * R13 = PACA - * R9 = CR - * Original R9 to R13 is saved on PACA_EXMC - * * Switch to mc_emergency stack and handle re-entrancy (we limit * the nested MCE upto level 4 to avoid stack overflow). * Save MCE registers srr1, srr0, dar and dsisr and then set ME=1 @@ -973,103 +960,127 @@ TRAMP_REAL_BEGIN(machine_check_common_early) * the machine check is handled then the idle wakeup code is called * to restore state. */ - mr r11,r1 /* Save r1 */ lhz r10,PACA_IN_MCE(r13) cmpwi r10,0 /* Are we in nested machine check */ - bne 0f /* Yes, we are. */ - /* First machine check entry */ - ld r1,PACAMCEMERGSP(r13) /* Use MC emergency stack */ -0: subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ + cmpwi cr1,r10,MAX_MCE_DEPTH /* Are we at maximum nesting */ addi r10,r10,1 /* increment paca->in_mce */ sth r10,PACA_IN_MCE(r13) - /* Limit nested MCE to level 4 to avoid stack overflow */ - cmpwi r10,MAX_MCE_DEPTH - bgt 2f /* Check if we hit limit of 4 */ - std r11,GPR1(r1) /* Save r1 on the stack. */ - std r11,0(r1) /* make stack chain pointer */ - mfspr r11,SPRN_SRR0 /* Save SRR0 */ - std r11,_NIP(r1) - mfspr r11,SPRN_SRR1 /* Save SRR1 */ - std r11,_MSR(r1) - mfspr r11,SPRN_DAR /* Save DAR */ - std r11,_DAR(r1) - mfspr r11,SPRN_DSISR /* Save DSISR */ - std r11,_DSISR(r1) - std r9,_CCR(r1) /* Save CR in stackframe */ + + mr r10,r1 /* Save r1 */ + bne 1f + /* First machine check entry */ + ld r1,PACAMCEMERGSP(r13) /* Use MC emergency stack */ +1: /* Limit nested MCE to level 4 to avoid stack overflow */ + bgt cr1,unrecoverable_mce /* Check if we hit limit of 4 */ + subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ + /* We don't touch AMR here, we never go to virtual mode */ - /* Save r9 through r13 from EXMC save area to stack frame. */ - EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) - mfmsr r11 /* get MSR value */ + INT_COMMON 0x200, PACA_EXMC, 0, 0, 0, 1, 1 + BEGIN_FTR_SECTION - ori r11,r11,MSR_ME /* turn on ME bit */ + bl enable_machine_check END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) - ori r11,r11,MSR_RI /* turn on RI bit */ - LOAD_HANDLER(r12, machine_check_handle_early) -1: mtspr SPRN_SRR0,r12 - mtspr SPRN_SRR1,r11 - RFI_TO_KERNEL - b . /* prevent speculative execution */ -2: - /* Stack overflow. Stay on emergency stack and panic. - * Keep the ME bit off while panic-ing, so that if we hit - * another machine check we checkstop. - */ - addi r1,r1,INT_FRAME_SIZE /* go back to previous stack frame */ - ld r11,PACAKMSR(r13) - LOAD_HANDLER(r12, unrecover_mce) - li r10,MSR_ME - andc r11,r11,r10 /* Turn off MSR_ME */ - b 1b - b . /* prevent speculative execution */ + li r10,MSR_RI + mtmsrd r10,1 + + bl save_nvgprs + addi r3,r1,STACK_FRAME_OVERHEAD + bl machine_check_early + std r3,RESULT(r1) /* Save result */ + ld r12,_MSR(r1) -TRAMP_REAL_BEGIN(machine_check_pSeries) - .globl machine_check_fwnmi -machine_check_fwnmi: - EXCEPTION_PROLOG_0 PACA_EXMC +#ifdef CONFIG_PPC_P7_NAP + /* + * Check if thread was in power saving mode. We come here when any + * of the following is true: + * a. thread wasn't in power saving mode + * b. thread was in power saving mode with no state loss, + * supervisor state loss or hypervisor state loss. + * + * Go back to nap/sleep/winkle mode again if (b) is true. 
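
The rewritten entry compares paca->in_mce against MAX_MCE_DEPTH into cr1 before bumping the counter and selecting a stack, and only branches on cr1 afterwards; using a non-default condition register field lets the intervening set-up run without redoing the test. A minimal sketch of the deferred-compare idiom, with a placeholder paca offset and the depth limit of 4 taken from the comment above:

	lhz	r10, 40(r13)	/* placeholder for PACA_IN_MCE(r13) */
	cmpwi	cr1, r10, 4	/* test nesting depth now */
	addi	r10, r10, 1
	sth	r10, 40(r13)	/* bump the count ... */
	mr	r10, r1		/* ... more set-up, cr1 untouched */
	bgt	cr1, 99f	/* branch on the earlier compare */
	nop
99:	blr
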
+ */ BEGIN_FTR_SECTION - b machine_check_common_early -END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) -machine_check_pSeries_0: - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0 + rlwinm. r11,r12,47-31,30,31 + bne machine_check_idle_common +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) +#endif + +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER /* - * MSR_RI is not enabled, because PACA_EXMC is being used, so a - * nested machine check corrupts it. machine_check_common enables - * MSR_RI. + * Check if we are coming from guest. If yes, then run the normal + * exception handler which will take the + * machine_check_kvm->kvmppc_interrupt branch to deliver the MC event + * to guest. */ - EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0 + lbz r11,HSTATE_IN_GUEST(r13) + cmpwi r11,0 /* Check if coming from guest */ + bne mce_deliver /* continue if we are. */ +#endif -TRAMP_KVM_SKIP(PACA_EXMC, 0x200) + /* + * Check if we are coming from userspace. If yes, then run the normal + * exception handler which will deliver the MC event to this kernel. + */ + andi. r11,r12,MSR_PR /* See if coming from user. */ + bne mce_deliver /* continue in V mode if we are. */ + + /* + * At this point we are coming from kernel context. + * Queue up the MCE event and return from the interrupt. + * But before that, check if this is an un-recoverable exception. + * If yes, then stay on emergency stack and panic. + */ + andi. r11,r12,MSR_RI + beq unrecoverable_mce + + /* + * Check if we have successfully handled/recovered from error, if not + * then stay on emergency stack and panic. + */ + ld r3,RESULT(r1) /* Load result */ + cmpdi r3,0 /* see if we handled MCE successfully */ + beq unrecoverable_mce /* if !handled then panic */ + + /* + * Return from MC interrupt. + * Queue up the MCE event so that we can log it later, while + * returning from kernel or opal call. + */ + bl machine_check_queue_event + MACHINE_CHECK_HANDLER_WINDUP + RFI_TO_KERNEL + +mce_deliver: + /* + * This is a host user or guest MCE. Restore all registers, then + * run the "late" handler. For host user, this will run the + * machine_check_exception handler in virtual mode like a normal + * interrupt handler. For guest, this will trigger the KVM test + * and branch to the KVM interrupt similarly to other interrupts. + */ +BEGIN_FTR_SECTION + ld r10,ORIG_GPR3(r1) + mtspr SPRN_CFAR,r10 +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) + MACHINE_CHECK_HANDLER_WINDUP + /* See comment at machine_check exception, don't turn on RI */ + INT_HANDLER machine_check, 0x200, area=PACA_EXMC, ri=0, dar=1, dsisr=1, kvm=1 EXC_COMMON_BEGIN(machine_check_common) /* * Machine check is different because we use a different * save area: PACA_EXMC instead of PACA_EXGEN. */ - EXCEPTION_COMMON(PACA_EXMC, 0x200) + INT_COMMON 0x200, PACA_EXMC, 1, 1, 1, 1, 1 FINISH_NAP - RECONCILE_IRQ_STATE(r10, r11) - ld r3,PACA_EXMC+EX_DAR(r13) - lwz r4,PACA_EXMC+EX_DSISR(r13) /* Enable MSR_RI when finished with PACA_EXMC */ li r10,MSR_RI mtmsrd r10,1 - std r3,_DAR(r1) - std r4,_DSISR(r1) bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl machine_check_exception b ret_from_except -#define MACHINE_CHECK_HANDLER_WINDUP \ - /* Clear MSR_RI before setting SRR0 and SRR1. */\ - li r9,0; \ - mtmsrd r9,1; /* Clear MSR_RI */ \ - /* Decrement paca->in_mce now RI is clear. */ \ - lhz r12,PACA_IN_MCE(r13); \ - subi r12,r12,1; \ - sth r12,PACA_IN_MCE(r13); \ - EXCEPTION_RESTORE_REGS EXC_STD - #ifdef CONFIG_PPC_P7_NAP /* * This is an idle wakeup. 
Low level machine check has already been @@ -1101,72 +1112,8 @@ EXC_COMMON_BEGIN(machine_check_idle_common) bltlr cr1 /* no state loss, return to idle caller */ b idle_return_gpr_loss #endif - /* - * Handle machine check early in real mode. We come here with - * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack. - */ -EXC_COMMON_BEGIN(machine_check_handle_early) - std r0,GPR0(r1) /* Save r0 */ - EXCEPTION_PROLOG_COMMON_3(0x200) - bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - bl machine_check_early - std r3,RESULT(r1) /* Save result */ - ld r12,_MSR(r1) -BEGIN_FTR_SECTION - b 4f -END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) -#ifdef CONFIG_PPC_P7_NAP - /* - * Check if thread was in power saving mode. We come here when any - * of the following is true: - * a. thread wasn't in power saving mode - * b. thread was in power saving mode with no state loss, - * supervisor state loss or hypervisor state loss. - * - * Go back to nap/sleep/winkle mode again if (b) is true. - */ -BEGIN_FTR_SECTION - rlwinm. r11,r12,47-31,30,31 - bne machine_check_idle_common -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) -#endif - - /* - * Check if we are coming from hypervisor userspace. If yes then we - * continue in host kernel in V mode to deliver the MC event. - */ - rldicl. r11,r12,4,63 /* See if MC hit while in HV mode. */ - beq 5f -4: andi. r11,r12,MSR_PR /* See if coming from user. */ - bne 9f /* continue in V mode if we are. */ - -5: -#ifdef CONFIG_KVM_BOOK3S_64_HANDLER -BEGIN_FTR_SECTION - /* - * We are coming from kernel context. Check if we are coming from - * guest. if yes, then we can continue. We will fall through - * do_kvm_200->kvmppc_interrupt to deliver the MC event to guest. - */ - lbz r11,HSTATE_IN_GUEST(r13) - cmpwi r11,0 /* Check if coming from guest */ - bne 9f /* continue if we are. */ -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) -#endif - /* - * At this point we are not sure about what context we come from. - * Queue up the MCE event and return from the interrupt. - * But before that, check if this is an un-recoverable exception. - * If yes, then stay on emergency stack and panic. - */ - andi. r11,r12,MSR_RI - bne 2f -1: mfspr r11,SPRN_SRR0 - LOAD_HANDLER(r10,unrecover_mce) - mtspr SPRN_SRR0,r10 - ld r10,PACAKMSR(r13) +EXC_COMMON_BEGIN(unrecoverable_mce) /* * We are going down. But there are chances that we might get hit by * another MCE during panic path and we may run into unstable state @@ -1174,84 +1121,36 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) * when another MCE is hit during panic path, system will checkstop * and hypervisor will get restarted cleanly by SP. */ - li r3,MSR_ME - andc r10,r10,r3 /* Turn off MSR_ME */ - mtspr SPRN_SRR1,r10 - RFI_TO_KERNEL - b . -2: - /* - * Check if we have successfully handled/recovered from error, if not - * then stay on emergency stack and panic. - */ - ld r3,RESULT(r1) /* Load result */ - cmpdi r3,0 /* see if we handled MCE successfully */ - - beq 1b /* if !handled then panic */ BEGIN_FTR_SECTION - /* - * Return from MC interrupt. - * Queue up the MCE event so that we can log it later, while - * returning from kernel or opal call. - */ - bl machine_check_queue_event - MACHINE_CHECK_HANDLER_WINDUP - RFI_TO_USER_OR_KERNEL -FTR_SECTION_ELSE - /* - * pSeries: Return from MC interrupt. Before that stay on emergency - * stack and call machine_check_exception to log the MCE event. 
- */ - LOAD_HANDLER(r10,mce_return) - mtspr SPRN_SRR0,r10 + li r10,0 /* clear MSR_RI */ + mtmsrd r10,1 + bl disable_machine_check +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) ld r10,PACAKMSR(r13) - mtspr SPRN_SRR1,r10 - RFI_TO_KERNEL - b . -ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) -9: - /* Deliver the machine check to host kernel in V mode. */ - MACHINE_CHECK_HANDLER_WINDUP - EXCEPTION_PROLOG_0 PACA_EXMC - b machine_check_pSeries_0 + li r3,MSR_ME + andc r10,r10,r3 + mtmsrd r10 -EXC_COMMON_BEGIN(unrecover_mce) /* Invoke machine_check_exception to print MCE event and panic. */ addi r3,r1,STACK_FRAME_OVERHEAD bl machine_check_exception + /* - * We will not reach here. Even if we did, there is no way out. Call - * unrecoverable_exception and die. + * We will not reach here. Even if we did, there is no way out. + * Call unrecoverable_exception and die. */ -1: addi r3,r1,STACK_FRAME_OVERHEAD - bl unrecoverable_exception - b 1b - -EXC_COMMON_BEGIN(mce_return) - /* Invoke machine_check_exception to print MCE event and return. */ addi r3,r1,STACK_FRAME_OVERHEAD - bl machine_check_exception - MACHINE_CHECK_HANDLER_WINDUP - RFI_TO_KERNEL + bl unrecoverable_exception b . + EXC_REAL_BEGIN(data_access, 0x300, 0x80) - EXCEPTION_PROLOG_0 PACA_EXGEN - b tramp_real_data_access + INT_HANDLER data_access, 0x300, ool=1, dar=1, dsisr=1, kvm=1 EXC_REAL_END(data_access, 0x300, 0x80) - -TRAMP_REAL_BEGIN(tramp_real_data_access) - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x300, 1, 1, 0 - EXCEPTION_PROLOG_2_REAL data_access_common, EXC_STD, 1 - EXC_VIRT_BEGIN(data_access, 0x4300, 0x80) - EXCEPTION_PROLOG_0 PACA_EXGEN - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, 0x300, 1, 1, 0 -EXCEPTION_PROLOG_2_VIRT data_access_common, EXC_STD + INT_HANDLER data_access, 0x300, virt=1, dar=1, dsisr=1 EXC_VIRT_END(data_access, 0x4300, 0x80) - -TRAMP_KVM_SKIP(PACA_EXGEN, 0x300) - +INT_KVM_HANDLER data_access, 0x300, EXC_STD, PACA_EXGEN, 1 EXC_COMMON_BEGIN(data_access_common) /* * Here r13 points to the paca, r9 contains the saved CR, @@ -1259,15 +1158,12 @@ EXC_COMMON_BEGIN(data_access_common) * r9 - r13 are saved in paca->exgen. 
* EX_DAR and EX_DSISR have saved DAR/DSISR */ - EXCEPTION_COMMON(PACA_EXGEN, 0x300) - RECONCILE_IRQ_STATE(r10, r11) - ld r12,_MSR(r1) - ld r3,PACA_EXGEN+EX_DAR(r13) - lwz r4,PACA_EXGEN+EX_DSISR(r13) - li r5,0x300 - std r3,_DAR(r1) - std r4,_DSISR(r1) + INT_COMMON 0x300, PACA_EXGEN, 1, 1, 1, 1, 1 + ld r4,_DAR(r1) + ld r5,_DSISR(r1) BEGIN_MMU_FTR_SECTION + ld r6,_MSR(r1) + li r3,0x300 b do_hash_page /* Try to handle as hpte fault */ MMU_FTR_SECTION_ELSE b handle_page_fault @@ -1275,26 +1171,15 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) - EXCEPTION_PROLOG_0 PACA_EXSLB - b tramp_real_data_access_slb + INT_HANDLER data_access_slb, 0x380, ool=1, area=PACA_EXSLB, dar=1, kvm=1 EXC_REAL_END(data_access_slb, 0x380, 0x80) - -TRAMP_REAL_BEGIN(tramp_real_data_access_slb) - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXSLB, 1, 0x380, 1, 0, 0 - EXCEPTION_PROLOG_2_REAL data_access_slb_common, EXC_STD, 1 - EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) - EXCEPTION_PROLOG_0 PACA_EXSLB - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXSLB, 0, 0x380, 1, 0, 0 - EXCEPTION_PROLOG_2_VIRT data_access_slb_common, EXC_STD + INT_HANDLER data_access_slb, 0x380, virt=1, area=PACA_EXSLB, dar=1 EXC_VIRT_END(data_access_slb, 0x4380, 0x80) - -TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) - +INT_KVM_HANDLER data_access_slb, 0x380, EXC_STD, PACA_EXSLB, 1 EXC_COMMON_BEGIN(data_access_slb_common) - EXCEPTION_COMMON(PACA_EXSLB, 0x380) - ld r4,PACA_EXSLB+EX_DAR(r13) - std r4,_DAR(r1) + INT_COMMON 0x380, PACA_EXSLB, 1, 1, 0, 1, 0 + ld r4,_DAR(r1) addi r3,r1,STACK_FRAME_OVERHEAD BEGIN_MMU_FTR_SECTION /* HPT case, do SLB fault */ @@ -1317,33 +1202,36 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) b ret_from_except -EXC_REAL(instruction_access, 0x400, 0x80) -EXC_VIRT(instruction_access, 0x4400, 0x80, 0x400) -TRAMP_KVM(PACA_EXGEN, 0x400) - +EXC_REAL_BEGIN(instruction_access, 0x400, 0x80) + INT_HANDLER instruction_access, 0x400, kvm=1 +EXC_REAL_END(instruction_access, 0x400, 0x80) +EXC_VIRT_BEGIN(instruction_access, 0x4400, 0x80) + INT_HANDLER instruction_access, 0x400, virt=1 +EXC_VIRT_END(instruction_access, 0x4400, 0x80) +INT_KVM_HANDLER instruction_access, 0x400, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(instruction_access_common) - EXCEPTION_COMMON(PACA_EXGEN, 0x400) - RECONCILE_IRQ_STATE(r10, r11) - ld r12,_MSR(r1) - ld r3,_NIP(r1) - andis. 
r4,r12,DSISR_SRR1_MATCH_64S@h - li r5,0x400 - std r3,_DAR(r1) - std r4,_DSISR(r1) + INT_COMMON 0x400, PACA_EXGEN, 1, 1, 1, 2, 2 + ld r4,_DAR(r1) + ld r5,_DSISR(r1) BEGIN_MMU_FTR_SECTION + ld r6,_MSR(r1) + li r3,0x400 b do_hash_page /* Try to handle as hpte fault */ MMU_FTR_SECTION_ELSE b handle_page_fault ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) -__EXC_REAL(instruction_access_slb, 0x480, 0x80, PACA_EXSLB) -__EXC_VIRT(instruction_access_slb, 0x4480, 0x80, 0x480, PACA_EXSLB) -TRAMP_KVM(PACA_EXSLB, 0x480) - +EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) + INT_HANDLER instruction_access_slb, 0x480, area=PACA_EXSLB, kvm=1 +EXC_REAL_END(instruction_access_slb, 0x480, 0x80) +EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) + INT_HANDLER instruction_access_slb, 0x480, virt=1, area=PACA_EXSLB +EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) +INT_KVM_HANDLER instruction_access_slb, 0x480, EXC_STD, PACA_EXSLB, 0 EXC_COMMON_BEGIN(instruction_access_slb_common) - EXCEPTION_COMMON(PACA_EXSLB, 0x480) - ld r4,_NIP(r1) + INT_COMMON 0x480, PACA_EXSLB, 1, 1, 0, 2, 0 + ld r4,_DAR(r1) addi r3,r1,STACK_FRAME_OVERHEAD BEGIN_MMU_FTR_SECTION /* HPT case, do SLB fault */ @@ -1359,69 +1247,44 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) std r3,RESULT(r1) bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) - ld r4,_NIP(r1) + ld r4,_DAR(r1) ld r5,RESULT(r1) addi r3,r1,STACK_FRAME_OVERHEAD bl do_bad_slb_fault b ret_from_except - EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100) - EXCEPTION_PROLOG_0 PACA_EXGEN -BEGIN_FTR_SECTION - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED - EXCEPTION_PROLOG_2_REAL hardware_interrupt_common, EXC_HV, 1 -FTR_SECTION_ELSE - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED - EXCEPTION_PROLOG_2_REAL hardware_interrupt_common, EXC_STD, 1 -ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + INT_HANDLER hardware_interrupt, 0x500, hsrr=EXC_HV_OR_STD, bitmask=IRQS_DISABLED, kvm=1 EXC_REAL_END(hardware_interrupt, 0x500, 0x100) - EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100) - EXCEPTION_PROLOG_0 PACA_EXGEN -BEGIN_FTR_SECTION - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED - EXCEPTION_PROLOG_2_VIRT hardware_interrupt_common, EXC_HV -FTR_SECTION_ELSE - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED - EXCEPTION_PROLOG_2_VIRT hardware_interrupt_common, EXC_STD -ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) + INT_HANDLER hardware_interrupt, 0x500, virt=1, hsrr=EXC_HV_OR_STD, bitmask=IRQS_DISABLED, kvm=1 EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100) - -TRAMP_KVM(PACA_EXGEN, 0x500) -TRAMP_KVM_HV(PACA_EXGEN, 0x500) +INT_KVM_HANDLER hardware_interrupt, 0x500, EXC_HV_OR_STD, PACA_EXGEN, 0 EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ) EXC_REAL_BEGIN(alignment, 0x600, 0x100) - EXCEPTION_PROLOG_0 PACA_EXGEN - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x600, 1, 1, 0 - EXCEPTION_PROLOG_2_REAL alignment_common, EXC_STD, 1 + INT_HANDLER alignment, 0x600, dar=1, dsisr=1, kvm=1 EXC_REAL_END(alignment, 0x600, 0x100) - EXC_VIRT_BEGIN(alignment, 0x4600, 0x100) - EXCEPTION_PROLOG_0 PACA_EXGEN - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, 0x600, 1, 1, 0 - EXCEPTION_PROLOG_2_VIRT alignment_common, EXC_STD + INT_HANDLER alignment, 0x600, virt=1, dar=1, dsisr=1 EXC_VIRT_END(alignment, 0x4600, 0x100) - -TRAMP_KVM(PACA_EXGEN, 0x600) +INT_KVM_HANDLER alignment, 0x600, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(alignment_common) - EXCEPTION_COMMON(PACA_EXGEN, 0x600) - ld 
r3,PACA_EXGEN+EX_DAR(r13) - lwz r4,PACA_EXGEN+EX_DSISR(r13) - std r3,_DAR(r1) - std r4,_DSISR(r1) + INT_COMMON 0x600, PACA_EXGEN, 1, 1, 1, 1, 1 bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD bl alignment_exception b ret_from_except -EXC_REAL(program_check, 0x700, 0x100) -EXC_VIRT(program_check, 0x4700, 0x100, 0x700) -TRAMP_KVM(PACA_EXGEN, 0x700) +EXC_REAL_BEGIN(program_check, 0x700, 0x100) + INT_HANDLER program_check, 0x700, kvm=1 +EXC_REAL_END(program_check, 0x700, 0x100) +EXC_VIRT_BEGIN(program_check, 0x4700, 0x100) + INT_HANDLER program_check, 0x700, virt=1 +EXC_VIRT_END(program_check, 0x4700, 0x100) +INT_KVM_HANDLER program_check, 0x700, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(program_check_common) /* * It's possible to receive a TM Bad Thing type program check with @@ -1447,27 +1310,33 @@ EXC_COMMON_BEGIN(program_check_common) mr r10,r1 /* Save r1 */ ld r1,PACAEMERGSP(r13) /* Use emergency stack */ subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ - b 3f /* Jump into the macro !! */ + INT_COMMON 0x700, PACA_EXGEN, 0, 1, 1, 0, 0 + b 3f 2: - EXCEPTION_COMMON(PACA_EXGEN, 0x700) + INT_COMMON 0x700, PACA_EXGEN, 1, 1, 1, 0, 0 +3: bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD bl program_check_exception b ret_from_except -EXC_REAL(fp_unavailable, 0x800, 0x100) -EXC_VIRT(fp_unavailable, 0x4800, 0x100, 0x800) -TRAMP_KVM(PACA_EXGEN, 0x800) +EXC_REAL_BEGIN(fp_unavailable, 0x800, 0x100) + INT_HANDLER fp_unavailable, 0x800, kvm=1 +EXC_REAL_END(fp_unavailable, 0x800, 0x100) +EXC_VIRT_BEGIN(fp_unavailable, 0x4800, 0x100) + INT_HANDLER fp_unavailable, 0x800, virt=1 +EXC_VIRT_END(fp_unavailable, 0x4800, 0x100) +INT_KVM_HANDLER fp_unavailable, 0x800, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(fp_unavailable_common) - EXCEPTION_COMMON(PACA_EXGEN, 0x800) + INT_COMMON 0x800, PACA_EXGEN, 1, 1, 0, 0, 0 bne 1f /* if from user, just load it up */ bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD bl kernel_fp_unavailable_exception - BUG_OPCODE +0: trap + EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0 1: #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION @@ -1490,21 +1359,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif -EXC_REAL_OOL_MASKABLE(decrementer, 0x900, 0x80, IRQS_DISABLED) -EXC_VIRT_MASKABLE(decrementer, 0x4900, 0x80, 0x900, IRQS_DISABLED) -TRAMP_KVM(PACA_EXGEN, 0x900) +EXC_REAL_BEGIN(decrementer, 0x900, 0x80) + INT_HANDLER decrementer, 0x900, ool=1, bitmask=IRQS_DISABLED, kvm=1 +EXC_REAL_END(decrementer, 0x900, 0x80) +EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80) + INT_HANDLER decrementer, 0x900, virt=1, bitmask=IRQS_DISABLED +EXC_VIRT_END(decrementer, 0x4900, 0x80) +INT_KVM_HANDLER decrementer, 0x900, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt) -EXC_REAL_HV(hdecrementer, 0x980, 0x80) -EXC_VIRT_HV(hdecrementer, 0x4980, 0x80, 0x980) -TRAMP_KVM_HV(PACA_EXGEN, 0x980) +EXC_REAL_BEGIN(hdecrementer, 0x980, 0x80) + INT_HANDLER hdecrementer, 0x980, hsrr=EXC_HV, kvm=1 +EXC_REAL_END(hdecrementer, 0x980, 0x80) +EXC_VIRT_BEGIN(hdecrementer, 0x4980, 0x80) + INT_HANDLER hdecrementer, 0x980, virt=1, hsrr=EXC_HV, kvm=1 +EXC_VIRT_END(hdecrementer, 0x4980, 0x80) +INT_KVM_HANDLER hdecrementer, 0x980, EXC_HV, PACA_EXGEN, 0 EXC_COMMON(hdecrementer_common, 0x980, hdec_interrupt) -EXC_REAL_MASKABLE(doorbell_super, 0xa00, 0x100, IRQS_DISABLED) -EXC_VIRT_MASKABLE(doorbell_super, 0x4a00, 0x100, 0xa00, IRQS_DISABLED) -TRAMP_KVM(PACA_EXGEN, 0xa00) +EXC_REAL_BEGIN(doorbell_super, 0xa00, 0x100) + INT_HANDLER 
doorbell_super, 0xa00, bitmask=IRQS_DISABLED, kvm=1 +EXC_REAL_END(doorbell_super, 0xa00, 0x100) +EXC_VIRT_BEGIN(doorbell_super, 0x4a00, 0x100) + INT_HANDLER doorbell_super, 0xa00, virt=1, bitmask=IRQS_DISABLED +EXC_VIRT_END(doorbell_super, 0x4a00, 0x100) +INT_KVM_HANDLER doorbell_super, 0xa00, EXC_STD, PACA_EXGEN, 0 #ifdef CONFIG_PPC_DOORBELL EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, doorbell_exception) #else @@ -1512,17 +1393,13 @@ EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, unknown_exception) #endif -EXC_REAL(trap_0b, 0xb00, 0x100) -EXC_VIRT(trap_0b, 0x4b00, 0x100, 0xb00) -TRAMP_KVM(PACA_EXGEN, 0xb00) -EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) +EXC_REAL_NONE(0xb00, 0x100) +EXC_VIRT_NONE(0x4b00, 0x100) /* * system call / hypercall (0xc00, 0x4c00) * * The system call exception is invoked with "sc 0" and does not alter HV bit. - * There is support for kernel code to invoke system calls but there are no - * in-tree users. * * The hypercall is invoked with "sc 1" and sets HV=1. * @@ -1567,7 +1444,7 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) GET_PACA(r13) std r10,PACA_EXGEN+EX_R10(r13) INTERRUPT_TO_KERNEL - KVMTEST EXC_STD 0xc00 /* uses r10, branch to do_kvm_0xc00_system_call */ + KVMTEST system_call EXC_STD 0xc00 /* uses r10, branch to system_call_kvm */ mfctr r9 #else mr r9,r13 @@ -1621,7 +1498,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) EXC_REAL_BEGIN(system_call, 0xc00, 0x100) SYSTEM_CALL 0 EXC_REAL_END(system_call, 0xc00, 0x100) - EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100) SYSTEM_CALL 1 EXC_VIRT_END(system_call, 0x4c00, 0x100) @@ -1634,7 +1510,7 @@ EXC_VIRT_END(system_call, 0x4c00, 0x100) * ctr = orig r13 * orig r10 saved in PACA */ -TRAMP_KVM_BEGIN(do_kvm_0xc00) +TRAMP_KVM_BEGIN(system_call_kvm) /* * Save the PPR (on systems that support it) before changing to * HMT_MEDIUM. 
That allows the KVM code to save that value into the @@ -1647,32 +1523,33 @@ TRAMP_KVM_BEGIN(do_kvm_0xc00) SET_SCRATCH0(r10) std r9,PACA_EXGEN+EX_R9(r13) mfcr r9 - KVM_HANDLER PACA_EXGEN, EXC_STD, 0xc00, 0 + KVM_HANDLER 0xc00, EXC_STD, PACA_EXGEN, 0 #endif -EXC_REAL(single_step, 0xd00, 0x100) -EXC_VIRT(single_step, 0x4d00, 0x100, 0xd00) -TRAMP_KVM(PACA_EXGEN, 0xd00) +EXC_REAL_BEGIN(single_step, 0xd00, 0x100) + INT_HANDLER single_step, 0xd00, kvm=1 +EXC_REAL_END(single_step, 0xd00, 0x100) +EXC_VIRT_BEGIN(single_step, 0x4d00, 0x100) + INT_HANDLER single_step, 0xd00, virt=1 +EXC_VIRT_END(single_step, 0x4d00, 0x100) +INT_KVM_HANDLER single_step, 0xd00, EXC_STD, PACA_EXGEN, 0 EXC_COMMON(single_step_common, 0xd00, single_step_exception) -EXC_REAL_OOL_HV(h_data_storage, 0xe00, 0x20) -EXC_VIRT_OOL_HV(h_data_storage, 0x4e00, 0x20, 0xe00) -TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00) + +EXC_REAL_BEGIN(h_data_storage, 0xe00, 0x20) + INT_HANDLER h_data_storage, 0xe00, ool=1, hsrr=EXC_HV, dar=1, dsisr=1, kvm=1 +EXC_REAL_END(h_data_storage, 0xe00, 0x20) +EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20) + INT_HANDLER h_data_storage, 0xe00, ool=1, virt=1, hsrr=EXC_HV, dar=1, dsisr=1, kvm=1 +EXC_VIRT_END(h_data_storage, 0x4e00, 0x20) +INT_KVM_HANDLER h_data_storage, 0xe00, EXC_HV, PACA_EXGEN, 1 EXC_COMMON_BEGIN(h_data_storage_common) - mfspr r10,SPRN_HDAR - std r10,PACA_EXGEN+EX_DAR(r13) - mfspr r10,SPRN_HDSISR - stw r10,PACA_EXGEN+EX_DSISR(r13) - EXCEPTION_COMMON(PACA_EXGEN, 0xe00) + INT_COMMON 0xe00, PACA_EXGEN, 1, 1, 1, 1, 1 bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD BEGIN_MMU_FTR_SECTION - ld r4,PACA_EXGEN+EX_DAR(r13) - lwz r5,PACA_EXGEN+EX_DSISR(r13) - std r4,_DAR(r1) - std r5,_DSISR(r1) + ld r4,_DAR(r1) li r5,SIGSEGV bl bad_page_fault MMU_FTR_SECTION_ELSE @@ -1681,15 +1558,23 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_TYPE_RADIX) b ret_from_except -EXC_REAL_OOL_HV(h_instr_storage, 0xe20, 0x20) -EXC_VIRT_OOL_HV(h_instr_storage, 0x4e20, 0x20, 0xe20) -TRAMP_KVM_HV(PACA_EXGEN, 0xe20) +EXC_REAL_BEGIN(h_instr_storage, 0xe20, 0x20) + INT_HANDLER h_instr_storage, 0xe20, ool=1, hsrr=EXC_HV, kvm=1 +EXC_REAL_END(h_instr_storage, 0xe20, 0x20) +EXC_VIRT_BEGIN(h_instr_storage, 0x4e20, 0x20) + INT_HANDLER h_instr_storage, 0xe20, ool=1, virt=1, hsrr=EXC_HV, kvm=1 +EXC_VIRT_END(h_instr_storage, 0x4e20, 0x20) +INT_KVM_HANDLER h_instr_storage, 0xe20, EXC_HV, PACA_EXGEN, 0 EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception) -EXC_REAL_OOL_HV(emulation_assist, 0xe40, 0x20) -EXC_VIRT_OOL_HV(emulation_assist, 0x4e40, 0x20, 0xe40) -TRAMP_KVM_HV(PACA_EXGEN, 0xe40) +EXC_REAL_BEGIN(emulation_assist, 0xe40, 0x20) + INT_HANDLER emulation_assist, 0xe40, ool=1, hsrr=EXC_HV, kvm=1 +EXC_REAL_END(emulation_assist, 0xe40, 0x20) +EXC_VIRT_BEGIN(emulation_assist, 0x4e40, 0x20) + INT_HANDLER emulation_assist, 0xe40, ool=1, virt=1, hsrr=EXC_HV, kvm=1 +EXC_VIRT_END(emulation_assist, 0x4e40, 0x20) +INT_KVM_HANDLER emulation_assist, 0xe40, EXC_HV, PACA_EXGEN, 0 EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt) @@ -1699,16 +1584,10 @@ EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt) * mode. 
*/ EXC_REAL_BEGIN(hmi_exception, 0xe60, 0x20) - EXCEPTION_PROLOG_0 PACA_EXGEN - b hmi_exception_early + INT_HANDLER hmi_exception, 0xe60, ool=1, early=1, hsrr=EXC_HV, ri=0, kvm=1 EXC_REAL_END(hmi_exception, 0xe60, 0x20) EXC_VIRT_NONE(0x4e60, 0x20) -TRAMP_KVM_HV(PACA_EXGEN, 0xe60) -TRAMP_REAL_BEGIN(hmi_exception_early) - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0xe60, 0, 0, 0 - mfctr r10 /* save ctr, even for !RELOCATABLE */ - BRANCH_TO_C000(r11, hmi_exception_early_common) - +INT_KVM_HANDLER hmi_exception, 0xe60, EXC_HV, PACA_EXGEN, 0 EXC_COMMON_BEGIN(hmi_exception_early_common) mtctr r10 /* Restore ctr */ mfspr r11,SPRN_HSRR0 /* Save HSRR0 */ @@ -1716,10 +1595,10 @@ EXC_COMMON_BEGIN(hmi_exception_early_common) mr r10,r1 /* Save r1 */ ld r1,PACAEMERGSP(r13) /* Use emergency stack for realmode */ subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ - EXCEPTION_PROLOG_COMMON_1() + /* We don't touch AMR here, we never go to virtual mode */ - EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN) - EXCEPTION_PROLOG_COMMON_3(0xe60) + INT_COMMON 0xe60, PACA_EXGEN, 0, 0, 0, 0, 0 + addi r3,r1,STACK_FRAME_OVERHEAD bl hmi_exception_realmode cmpdi cr0,r3,0 @@ -1734,23 +1613,25 @@ EXC_COMMON_BEGIN(hmi_exception_early_common) * firmware. */ EXCEPTION_RESTORE_REGS EXC_HV - EXCEPTION_PROLOG_0 PACA_EXGEN - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0xe60, 0, 0, IRQS_DISABLED - EXCEPTION_PROLOG_2_REAL hmi_exception_common, EXC_HV, 1 + INT_HANDLER hmi_exception, 0xe60, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 EXC_COMMON_BEGIN(hmi_exception_common) - EXCEPTION_COMMON(PACA_EXGEN, 0xe60) + INT_COMMON 0xe60, PACA_EXGEN, 1, 1, 1, 0, 0 FINISH_NAP - bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) RUNLATCH_ON + bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl handle_hmi_exception b ret_from_except -EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0x20, IRQS_DISABLED) -EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x20, 0xe80, IRQS_DISABLED) -TRAMP_KVM_HV(PACA_EXGEN, 0xe80) + +EXC_REAL_BEGIN(h_doorbell, 0xe80, 0x20) + INT_HANDLER h_doorbell, 0xe80, ool=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 +EXC_REAL_END(h_doorbell, 0xe80, 0x20) +EXC_VIRT_BEGIN(h_doorbell, 0x4e80, 0x20) + INT_HANDLER h_doorbell, 0xe80, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 +EXC_VIRT_END(h_doorbell, 0x4e80, 0x20) +INT_KVM_HANDLER h_doorbell, 0xe80, EXC_HV, PACA_EXGEN, 0 #ifdef CONFIG_PPC_DOORBELL EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, doorbell_exception) #else @@ -1758,9 +1639,13 @@ EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, unknown_exception) #endif -EXC_REAL_OOL_MASKABLE_HV(h_virt_irq, 0xea0, 0x20, IRQS_DISABLED) -EXC_VIRT_OOL_MASKABLE_HV(h_virt_irq, 0x4ea0, 0x20, 0xea0, IRQS_DISABLED) -TRAMP_KVM_HV(PACA_EXGEN, 0xea0) +EXC_REAL_BEGIN(h_virt_irq, 0xea0, 0x20) + INT_HANDLER h_virt_irq, 0xea0, ool=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 +EXC_REAL_END(h_virt_irq, 0xea0, 0x20) +EXC_VIRT_BEGIN(h_virt_irq, 0x4ea0, 0x20) + INT_HANDLER h_virt_irq, 0xea0, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 +EXC_VIRT_END(h_virt_irq, 0x4ea0, 0x20) +INT_KVM_HANDLER h_virt_irq, 0xea0, EXC_HV, PACA_EXGEN, 0 EXC_COMMON_ASYNC(h_virt_irq_common, 0xea0, do_IRQ) @@ -1770,17 +1655,25 @@ EXC_REAL_NONE(0xee0, 0x20) EXC_VIRT_NONE(0x4ee0, 0x20) -EXC_REAL_OOL_MASKABLE(performance_monitor, 0xf00, 0x20, IRQS_PMI_DISABLED) -EXC_VIRT_OOL_MASKABLE(performance_monitor, 0x4f00, 0x20, 0xf00, IRQS_PMI_DISABLED) -TRAMP_KVM(PACA_EXGEN, 0xf00) +EXC_REAL_BEGIN(performance_monitor, 0xf00, 0x20) + INT_HANDLER performance_monitor, 0xf00, ool=1, bitmask=IRQS_PMI_DISABLED, 
kvm=1 +EXC_REAL_END(performance_monitor, 0xf00, 0x20) +EXC_VIRT_BEGIN(performance_monitor, 0x4f00, 0x20) + INT_HANDLER performance_monitor, 0xf00, ool=1, virt=1, bitmask=IRQS_PMI_DISABLED +EXC_VIRT_END(performance_monitor, 0x4f00, 0x20) +INT_KVM_HANDLER performance_monitor, 0xf00, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_ASYNC(performance_monitor_common, 0xf00, performance_monitor_exception) -EXC_REAL_OOL(altivec_unavailable, 0xf20, 0x20) -EXC_VIRT_OOL(altivec_unavailable, 0x4f20, 0x20, 0xf20) -TRAMP_KVM(PACA_EXGEN, 0xf20) +EXC_REAL_BEGIN(altivec_unavailable, 0xf20, 0x20) + INT_HANDLER altivec_unavailable, 0xf20, ool=1, kvm=1 +EXC_REAL_END(altivec_unavailable, 0xf20, 0x20) +EXC_VIRT_BEGIN(altivec_unavailable, 0x4f20, 0x20) + INT_HANDLER altivec_unavailable, 0xf20, ool=1, virt=1 +EXC_VIRT_END(altivec_unavailable, 0x4f20, 0x20) +INT_KVM_HANDLER altivec_unavailable, 0xf20, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(altivec_unavailable_common) - EXCEPTION_COMMON(PACA_EXGEN, 0xf20) + INT_COMMON 0xf20, PACA_EXGEN, 1, 1, 0, 0, 0 #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION beq 1f @@ -1813,11 +1706,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) b ret_from_except -EXC_REAL_OOL(vsx_unavailable, 0xf40, 0x20) -EXC_VIRT_OOL(vsx_unavailable, 0x4f40, 0x20, 0xf40) -TRAMP_KVM(PACA_EXGEN, 0xf40) +EXC_REAL_BEGIN(vsx_unavailable, 0xf40, 0x20) + INT_HANDLER vsx_unavailable, 0xf40, ool=1, kvm=1 +EXC_REAL_END(vsx_unavailable, 0xf40, 0x20) +EXC_VIRT_BEGIN(vsx_unavailable, 0x4f40, 0x20) + INT_HANDLER vsx_unavailable, 0xf40, ool=1, virt=1 +EXC_VIRT_END(vsx_unavailable, 0x4f40, 0x20) +INT_KVM_HANDLER vsx_unavailable, 0xf40, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(vsx_unavailable_common) - EXCEPTION_COMMON(PACA_EXGEN, 0xf40) + INT_COMMON 0xf40, PACA_EXGEN, 1, 1, 0, 0, 0 #ifdef CONFIG_VSX BEGIN_FTR_SECTION beq 1f @@ -1849,15 +1746,23 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) b ret_from_except -EXC_REAL_OOL(facility_unavailable, 0xf60, 0x20) -EXC_VIRT_OOL(facility_unavailable, 0x4f60, 0x20, 0xf60) -TRAMP_KVM(PACA_EXGEN, 0xf60) +EXC_REAL_BEGIN(facility_unavailable, 0xf60, 0x20) + INT_HANDLER facility_unavailable, 0xf60, ool=1, kvm=1 +EXC_REAL_END(facility_unavailable, 0xf60, 0x20) +EXC_VIRT_BEGIN(facility_unavailable, 0x4f60, 0x20) + INT_HANDLER facility_unavailable, 0xf60, ool=1, virt=1 +EXC_VIRT_END(facility_unavailable, 0x4f60, 0x20) +INT_KVM_HANDLER facility_unavailable, 0xf60, EXC_STD, PACA_EXGEN, 0 EXC_COMMON(facility_unavailable_common, 0xf60, facility_unavailable_exception) -EXC_REAL_OOL_HV(h_facility_unavailable, 0xf80, 0x20) -EXC_VIRT_OOL_HV(h_facility_unavailable, 0x4f80, 0x20, 0xf80) -TRAMP_KVM_HV(PACA_EXGEN, 0xf80) +EXC_REAL_BEGIN(h_facility_unavailable, 0xf80, 0x20) + INT_HANDLER h_facility_unavailable, 0xf80, ool=1, hsrr=EXC_HV, kvm=1 +EXC_REAL_END(h_facility_unavailable, 0xf80, 0x20) +EXC_VIRT_BEGIN(h_facility_unavailable, 0x4f80, 0x20) + INT_HANDLER h_facility_unavailable, 0xf80, ool=1, virt=1, hsrr=EXC_HV, kvm=1 +EXC_VIRT_END(h_facility_unavailable, 0x4f80, 0x20) +INT_KVM_HANDLER h_facility_unavailable, 0xf80, EXC_HV, PACA_EXGEN, 0 EXC_COMMON(h_facility_unavailable_common, 0xf80, facility_unavailable_exception) @@ -1874,9 +1779,11 @@ EXC_REAL_NONE(0x1100, 0x100) EXC_VIRT_NONE(0x5100, 0x100) #ifdef CONFIG_CBE_RAS -EXC_REAL_HV(cbe_system_error, 0x1200, 0x100) +EXC_REAL_BEGIN(cbe_system_error, 0x1200, 0x100) + INT_HANDLER cbe_system_error, 0x1200, ool=1, hsrr=EXC_HV, kvm=1 +EXC_REAL_END(cbe_system_error, 0x1200, 0x100) EXC_VIRT_NONE(0x5200, 0x100) -TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1200) +INT_KVM_HANDLER cbe_system_error, 
0x1200, EXC_HV, PACA_EXGEN, 1 EXC_COMMON(cbe_system_error_common, 0x1200, cbe_system_error_exception) #else /* CONFIG_CBE_RAS */ EXC_REAL_NONE(0x1200, 0x100) @@ -1884,37 +1791,43 @@ EXC_VIRT_NONE(0x5200, 0x100) #endif -EXC_REAL(instruction_breakpoint, 0x1300, 0x100) -EXC_VIRT(instruction_breakpoint, 0x5300, 0x100, 0x1300) -TRAMP_KVM_SKIP(PACA_EXGEN, 0x1300) +EXC_REAL_BEGIN(instruction_breakpoint, 0x1300, 0x100) + INT_HANDLER instruction_breakpoint, 0x1300, kvm=1 +EXC_REAL_END(instruction_breakpoint, 0x1300, 0x100) +EXC_VIRT_BEGIN(instruction_breakpoint, 0x5300, 0x100) + INT_HANDLER instruction_breakpoint, 0x1300, virt=1 +EXC_VIRT_END(instruction_breakpoint, 0x5300, 0x100) +INT_KVM_HANDLER instruction_breakpoint, 0x1300, EXC_STD, PACA_EXGEN, 1 EXC_COMMON(instruction_breakpoint_common, 0x1300, instruction_breakpoint_exception) + EXC_REAL_NONE(0x1400, 0x100) EXC_VIRT_NONE(0x5400, 0x100) EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x100) - EXCEPTION_PROLOG_0 PACA_EXGEN - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 0, 0x1500, 0, 0, 0 - + INT_HANDLER denorm_exception_hv, 0x1500, early=2, hsrr=EXC_HV #ifdef CONFIG_PPC_DENORMALISATION mfspr r10,SPRN_HSRR1 andis. r10,r10,(HSRR1_DENORM)@h /* denorm? */ bne+ denorm_assist #endif - - KVMTEST EXC_HV 0x1500 - EXCEPTION_PROLOG_2_REAL denorm_common, EXC_HV, 1 + KVMTEST denorm_exception_hv, EXC_HV 0x1500 + INT_SAVE_SRR_AND_JUMP denorm_common, EXC_HV, 1 EXC_REAL_END(denorm_exception_hv, 0x1500, 0x100) #ifdef CONFIG_PPC_DENORMALISATION EXC_VIRT_BEGIN(denorm_exception, 0x5500, 0x100) - b exc_real_0x1500_denorm_exception_hv + INT_HANDLER denorm_exception, 0x1500, 0, 2, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 0 + mfspr r10,SPRN_HSRR1 + andis. r10,r10,(HSRR1_DENORM)@h /* denorm? */ + bne+ denorm_assist + INT_VIRT_SAVE_SRR_AND_JUMP denorm_common, EXC_HV EXC_VIRT_END(denorm_exception, 0x5500, 0x100) #else EXC_VIRT_NONE(0x5500, 0x100) #endif -TRAMP_KVM_HV(PACA_EXGEN, 0x1500) +INT_KVM_HANDLER denorm_exception_hv, 0x1500, EXC_HV, PACA_EXGEN, 0 #ifdef CONFIG_PPC_DENORMALISATION TRAMP_REAL_BEGIN(denorm_assist) @@ -1989,9 +1902,11 @@ EXC_COMMON(denorm_common, 0x1500, unknown_exception) #ifdef CONFIG_CBE_RAS -EXC_REAL_HV(cbe_maintenance, 0x1600, 0x100) +EXC_REAL_BEGIN(cbe_maintenance, 0x1600, 0x100) + INT_HANDLER cbe_maintenance, 0x1600, ool=1, hsrr=EXC_HV, kvm=1 +EXC_REAL_END(cbe_maintenance, 0x1600, 0x100) EXC_VIRT_NONE(0x5600, 0x100) -TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1600) +INT_KVM_HANDLER cbe_maintenance, 0x1600, EXC_HV, PACA_EXGEN, 1 EXC_COMMON(cbe_maintenance_common, 0x1600, cbe_maintenance_exception) #else /* CONFIG_CBE_RAS */ EXC_REAL_NONE(0x1600, 0x100) @@ -1999,9 +1914,13 @@ EXC_VIRT_NONE(0x5600, 0x100) #endif -EXC_REAL(altivec_assist, 0x1700, 0x100) -EXC_VIRT(altivec_assist, 0x5700, 0x100, 0x1700) -TRAMP_KVM(PACA_EXGEN, 0x1700) +EXC_REAL_BEGIN(altivec_assist, 0x1700, 0x100) + INT_HANDLER altivec_assist, 0x1700, kvm=1 +EXC_REAL_END(altivec_assist, 0x1700, 0x100) +EXC_VIRT_BEGIN(altivec_assist, 0x5700, 0x100) + INT_HANDLER altivec_assist, 0x1700, virt=1 +EXC_VIRT_END(altivec_assist, 0x5700, 0x100) +INT_KVM_HANDLER altivec_assist, 0x1700, EXC_STD, PACA_EXGEN, 0 #ifdef CONFIG_ALTIVEC EXC_COMMON(altivec_assist_common, 0x1700, altivec_assist_exception) #else @@ -2010,15 +1929,18 @@ EXC_COMMON(altivec_assist_common, 0x1700, unknown_exception) #ifdef CONFIG_CBE_RAS -EXC_REAL_HV(cbe_thermal, 0x1800, 0x100) +EXC_REAL_BEGIN(cbe_thermal, 0x1800, 0x100) + INT_HANDLER cbe_thermal, 0x1800, ool=1, hsrr=EXC_HV, kvm=1 +EXC_REAL_END(cbe_thermal, 0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) 
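The pattern in the surrounding hunks repeats across the whole file: each fixed-shape macro pair (EXC_REAL_HV plus TRAMP_KVM_HV_SKIP, EXC_REAL_OOL_MASKABLE_HV plus TRAMP_KVM_HV, and so on) collapses into the two parameterised gas macros INT_HANDLER and INT_KVM_HANDLER, whose keyword arguments select what used to be a distinct macro variant. A rough C model of the knobs involved (illustrative shorthand for this note only; the real macros emit assembly and no such struct exists in the kernel):

struct int_handler_opts {
	unsigned int vec;	/* vector number, e.g. 0x1800 */
	int ool;		/* 1: out-of-line, body placed in a trampoline */
	int virt;		/* 1: relocation-on (virtual mode) entry point */
	int hsrr;		/* EXC_HV: use HSRR0/1 rather than SRR0/1 */
	int kvm;		/* 1: emit a KVM guest test for this vector */
	unsigned int bitmask;	/* soft-mask class, e.g. IRQS_DISABLED */
};

The trailing argument of INT_KVM_HANDLER carries the old _SKIP distinction, so TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1800) becomes INT_KVM_HANDLER cbe_thermal, 0x1800, EXC_HV, PACA_EXGEN, 1, while the plain TRAMP_KVM_HV variants pass 0.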
-TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1800) +INT_KVM_HANDLER cbe_thermal, 0x1800, EXC_HV, PACA_EXGEN, 1 EXC_COMMON(cbe_thermal_common, 0x1800, cbe_thermal_exception) #else /* CONFIG_CBE_RAS */ EXC_REAL_NONE(0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) #endif + #ifdef CONFIG_PPC_WATCHDOG #define MASKED_DEC_HANDLER_LABEL 3f @@ -2028,7 +1950,7 @@ EXC_VIRT_NONE(0x5800, 0x100) std r12,PACA_EXGEN+EX_R12(r13); \ GET_SCRATCH0(r10); \ std r10,PACA_EXGEN+EX_R13(r13); \ - EXCEPTION_PROLOG_2_REAL soft_nmi_common, _H, 1 + INT_SAVE_SRR_AND_JUMP soft_nmi_common, _H, 1 /* * Branch to soft_nmi_interrupt using the emergency stack. The emergency @@ -2043,9 +1965,8 @@ EXC_COMMON_BEGIN(soft_nmi_common) mr r10,r1 ld r1,PACAEMERGSP(r13) subi r1,r1,INT_FRAME_SIZE - EXCEPTION_COMMON_STACK(PACA_EXGEN, 0x900) + INT_COMMON 0x900, PACA_EXGEN, 0, 1, 1, 0, 0 bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD bl soft_nmi_interrupt b ret_from_except @@ -2302,6 +2223,35 @@ CLOSE_FIXED_SECTION(virt_trampolines); USE_TEXT_SECTION() +/* MSR[RI] should be clear because this uses SRR[01] */ +enable_machine_check: + mflr r0 + bcl 20,31,$+4 +0: mflr r3 + addi r3,r3,(1f - 0b) + mtspr SPRN_SRR0,r3 + mfmsr r3 + ori r3,r3,MSR_ME + mtspr SPRN_SRR1,r3 + RFI_TO_KERNEL +1: mtlr r0 + blr + +/* MSR[RI] should be clear because this uses SRR[01] */ +disable_machine_check: + mflr r0 + bcl 20,31,$+4 +0: mflr r3 + addi r3,r3,(1f - 0b) + mtspr SPRN_SRR0,r3 + mfmsr r3 + li r4,MSR_ME + andc r3,r3,r4 + mtspr SPRN_SRR1,r3 + RFI_TO_KERNEL +1: mtlr r0 + blr + /* * Hash table stuff */ @@ -2310,7 +2260,7 @@ do_hash_page: #ifdef CONFIG_PPC_BOOK3S_64 lis r0,(DSISR_BAD_FAULT_64S | DSISR_DABRMATCH | DSISR_KEYFAULT)@h ori r0,r0,DSISR_BAD_FAULT_64S@l - and. r0,r4,r0 /* weird error? */ + and. r0,r5,r0 /* weird error? */ bne- handle_page_fault /* if not, try to insert a HPTE */ ld r11, PACA_THREAD_INFO(r13) lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ @@ -2318,15 +2268,13 @@ do_hash_page: bne 77f /* then don't call hash_page now */ /* - * r3 contains the faulting address - * r4 msr - * r5 contains the trap number - * r6 contains dsisr + * r3 contains the trap number + * r4 contains the faulting address + * r5 contains dsisr + * r6 msr * * at return r3 = 0 for success, 1 for page fault, negative for error */ - mr r4,r12 - ld r6,_DSISR(r1) bl __hash_page /* build HPTE if possible */ cmpdi r3,0 /* see if __hash_page succeeded */ @@ -2336,16 +2284,15 @@ do_hash_page: /* Error */ blt- 13f - /* Reload DSISR into r4 for the DABR check below */ - ld r4,_DSISR(r1) + /* Reload DAR/DSISR into r4/r5 for the DABR check below */ + ld r4,_DAR(r1) + ld r5,_DSISR(r1) #endif /* CONFIG_PPC_BOOK3S_64 */ /* Here we have a page fault that hash_page can't handle. */ handle_page_fault: -11: andis. r0,r4,DSISR_DABRMATCH@h +11: andis. r0,r5,DSISR_DABRMATCH@h bne- handle_dabr_fault - ld r4,_DAR(r1) - ld r5,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD bl do_page_fault cmpdi r3,0 @@ -2353,7 +2300,7 @@ handle_page_fault: bl save_nvgprs mr r5,r3 addi r3,r1,STACK_FRAME_OVERHEAD - lwz r4,_DAR(r1) + ld r4,_DAR(r1) bl bad_page_fault b ret_from_except @@ -2392,7 +2339,6 @@ handle_dabr_fault: * the access, or panic if there isn't a handler. 
*/ 77: bl save_nvgprs - mr r4,r3 addi r3,r1,STACK_FRAME_OVERHEAD li r5,SIGSEGV bl bad_page_fault b ret_from_except diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 4eab97292cc2..ed59855430b9 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -28,24 +28,22 @@ #include <asm/debugfs.h> #include <asm/page.h> #include <asm/prom.h> -#include <asm/rtas.h> #include <asm/fadump.h> +#include <asm/fadump-internal.h> #include <asm/setup.h> static struct fw_dump fw_dump; -static struct fadump_mem_struct fdm; -static const struct fadump_mem_struct *fdm_active; -#ifdef CONFIG_CMA -static struct cma *fadump_cma; -#endif +static void __init fadump_reserve_crash_area(u64 base); + +#ifndef CONFIG_PRESERVE_FA_DUMP static DEFINE_MUTEX(fadump_mutex); -struct fad_crash_memory_ranges *crash_memory_ranges; -int crash_memory_ranges_size; -int crash_mem_ranges; -int max_crash_mem_ranges; +struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 }; +struct fadump_mrange_info reserved_mrange_info = { "reserved", NULL, 0, 0, 0 }; #ifdef CONFIG_CMA +static struct cma *fadump_cma; + /* * fadump_cma_init() - Initialize CMA area from a fadump reserved memory * @@ -107,84 +105,45 @@ static int __init fadump_cma_init(void) { return 1; } #endif /* CONFIG_CMA */ /* Scan the Firmware Assisted dump configuration details. */ -int __init early_init_dt_scan_fw_dump(unsigned long node, - const char *uname, int depth, void *data) +int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname, + int depth, void *data) { - const __be32 *sections; - int i, num_sections; - int size; - const __be32 *token; - - if (depth != 1 || strcmp(uname, "rtas") != 0) + if (depth != 1) return 0; - /* - * Check if Firmware Assisted dump is supported. if yes, check - * if dump has been initiated on last reboot. - */ - token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL); - if (!token) + if (strcmp(uname, "rtas") == 0) { + rtas_fadump_dt_scan(&fw_dump, node); return 1; + } - fw_dump.fadump_supported = 1; - fw_dump.ibm_configure_kernel_dump = be32_to_cpu(*token); - - /* - * The 'ibm,kernel-dump' rtas node is present only if there is - * dump data waiting for us. - */ - fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL); - if (fdm_active) - fw_dump.dump_active = 1; - - /* Get the sizes required to store dump data for the firmware provided - * dump sections. - * For each dump section type supported, a 32bit cell which defines - * the ID of a supported section followed by two 32 bit cells which - * gives teh size of the section in bytes. - */ - sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes", - &size); - - if (!sections) + if (strcmp(uname, "ibm,opal") == 0) { + opal_fadump_dt_scan(&fw_dump, node); return 1; - - num_sections = size / (3 * sizeof(u32)); - - for (i = 0; i < num_sections; i++, sections += 3) { - u32 type = (u32)of_read_number(sections, 1); - - switch (type) { - case FADUMP_CPU_STATE_DATA: - fw_dump.cpu_state_data_size = - of_read_ulong(&sections[1], 2); - break; - case FADUMP_HPTE_REGION: - fw_dump.hpte_region_size = - of_read_ulong(&sections[1], 2); - break; - } } - return 1; + return 0; } /* * If fadump is registered, check if the memory provided * falls within boot memory area and reserved memory area.
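The rework below reduces this to a plain interval-overlap test against the reserved dump area, plus a fallback comparison against boot_mem_top, which now bounds all boot memory regions rather than the old single RMA-based chunk. The overlap test in isolation, as a minimal sketch (the helper name is hypothetical):

#include <linux/types.h>

/* Sketch: does [addr, addr + size) intersect the window [d_start, d_end]? */
static int fadump_overlaps(u64 addr, unsigned long size, u64 d_start, u64 d_end)
{
	return (addr + size) > d_start && addr <= d_end;
}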
*/ -int is_fadump_memory_area(u64 addr, ulong size) +int is_fadump_memory_area(u64 addr, unsigned long size) { - u64 d_start = fw_dump.reserve_dump_area_start; - u64 d_end = d_start + fw_dump.reserve_dump_area_size; + u64 d_start, d_end; if (!fw_dump.dump_registered) return 0; + if (!size) + return 0; + + d_start = fw_dump.reserve_dump_area_start; + d_end = d_start + fw_dump.reserve_dump_area_size; if (((addr + size) > d_start) && (addr <= d_end)) return 1; - return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size; + return (addr <= fw_dump.boot_mem_top); } int should_fadump_crash(void) @@ -200,31 +159,29 @@ int is_fadump_active(void) } /* - * Returns 1, if there are no holes in boot memory area, - * 0 otherwise. + * Returns true, if there are no holes in memory area between d_start to d_end, + * false otherwise. */ -static int is_boot_memory_area_contiguous(void) +static bool is_fadump_mem_area_contiguous(u64 d_start, u64 d_end) { struct memblock_region *reg; - unsigned long tstart, tend; - unsigned long start_pfn = PHYS_PFN(RMA_START); - unsigned long end_pfn = PHYS_PFN(RMA_START + fw_dump.boot_memory_size); - unsigned int ret = 0; + bool ret = false; + u64 start, end; for_each_memblock(memory, reg) { - tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); - tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); - if (tstart < tend) { - /* Memory hole from start_pfn to tstart */ - if (tstart > start_pfn) + start = max_t(u64, d_start, reg->base); + end = min_t(u64, d_end, (reg->base + reg->size)); + if (d_start < end) { + /* Memory hole from d_start to start */ + if (start > d_start) break; - if (tend == end_pfn) { - ret = 1; + if (end == d_end) { + ret = true; break; } - start_pfn = tend + 1; + d_start = end + 1; } } @@ -232,37 +189,45 @@ static int is_boot_memory_area_contiguous(void) } /* - * Returns true, if there are no holes in reserved memory area, + * Returns true, if there are no holes in boot memory area, * false otherwise. */ -static bool is_reserved_memory_area_contiguous(void) +bool is_fadump_boot_mem_contiguous(void) { - struct memblock_region *reg; - unsigned long start, end; - unsigned long d_start = fw_dump.reserve_dump_area_start; - unsigned long d_end = d_start + fw_dump.reserve_dump_area_size; - - for_each_memblock(memory, reg) { - start = max(d_start, (unsigned long)reg->base); - end = min(d_end, (unsigned long)(reg->base + reg->size)); - if (d_start < end) { - /* Memory hole from d_start to start */ - if (start > d_start) - break; + unsigned long d_start, d_end; + bool ret = false; + int i; - if (end == d_end) - return true; + for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) { + d_start = fw_dump.boot_mem_addr[i]; + d_end = d_start + fw_dump.boot_mem_sz[i]; - d_start = end + 1; - } + ret = is_fadump_mem_area_contiguous(d_start, d_end); + if (!ret) + break; } - return false; + return ret; +} + +/* + * Returns true, if there are no holes in reserved memory area, + * false otherwise. + */ +bool is_fadump_reserved_mem_contiguous(void) +{ + u64 d_start, d_end; + + d_start = fw_dump.reserve_dump_area_start; + d_end = d_start + fw_dump.reserve_dump_area_size; + return is_fadump_mem_area_contiguous(d_start, d_end); } /* Print firmware assisted dump configurations for debugging purpose. */ static void fadump_show_config(void) { + int i; + pr_debug("Support for firmware-assisted dump (fadump): %s\n", (fw_dump.fadump_supported ? 
"present" : "no support")); @@ -276,62 +241,13 @@ static void fadump_show_config(void) pr_debug("Dump section sizes:\n"); pr_debug(" CPU state data size: %lx\n", fw_dump.cpu_state_data_size); pr_debug(" HPTE region size : %lx\n", fw_dump.hpte_region_size); - pr_debug("Boot memory size : %lx\n", fw_dump.boot_memory_size); -} - -static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, - unsigned long addr) -{ - if (!fdm) - return 0; - - memset(fdm, 0, sizeof(struct fadump_mem_struct)); - addr = addr & PAGE_MASK; - - fdm->header.dump_format_version = cpu_to_be32(0x00000001); - fdm->header.dump_num_sections = cpu_to_be16(3); - fdm->header.dump_status_flag = 0; - fdm->header.offset_first_dump_section = - cpu_to_be32((u32)offsetof(struct fadump_mem_struct, cpu_state_data)); - - /* - * Fields for disk dump option. - * We are not using disk dump option, hence set these fields to 0. - */ - fdm->header.dd_block_size = 0; - fdm->header.dd_block_offset = 0; - fdm->header.dd_num_blocks = 0; - fdm->header.dd_offset_disk_path = 0; - - /* set 0 to disable an automatic dump-reboot. */ - fdm->header.max_time_auto = 0; - - /* Kernel dump sections */ - /* cpu state data section. */ - fdm->cpu_state_data.request_flag = cpu_to_be32(FADUMP_REQUEST_FLAG); - fdm->cpu_state_data.source_data_type = cpu_to_be16(FADUMP_CPU_STATE_DATA); - fdm->cpu_state_data.source_address = 0; - fdm->cpu_state_data.source_len = cpu_to_be64(fw_dump.cpu_state_data_size); - fdm->cpu_state_data.destination_address = cpu_to_be64(addr); - addr += fw_dump.cpu_state_data_size; - - /* hpte region section */ - fdm->hpte_region.request_flag = cpu_to_be32(FADUMP_REQUEST_FLAG); - fdm->hpte_region.source_data_type = cpu_to_be16(FADUMP_HPTE_REGION); - fdm->hpte_region.source_address = 0; - fdm->hpte_region.source_len = cpu_to_be64(fw_dump.hpte_region_size); - fdm->hpte_region.destination_address = cpu_to_be64(addr); - addr += fw_dump.hpte_region_size; - - /* RMA region section */ - fdm->rmr_region.request_flag = cpu_to_be32(FADUMP_REQUEST_FLAG); - fdm->rmr_region.source_data_type = cpu_to_be16(FADUMP_REAL_MODE_REGION); - fdm->rmr_region.source_address = cpu_to_be64(RMA_START); - fdm->rmr_region.source_len = cpu_to_be64(fw_dump.boot_memory_size); - fdm->rmr_region.destination_address = cpu_to_be64(addr); - addr += fw_dump.boot_memory_size; - - return addr; + pr_debug(" Boot memory size : %lx\n", fw_dump.boot_memory_size); + pr_debug(" Boot memory top : %llx\n", fw_dump.boot_mem_top); + pr_debug("Boot memory regions cnt: %llx\n", fw_dump.boot_mem_regs_cnt); + for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) { + pr_debug("[%03d] base = %llx, size = %llx\n", i, + fw_dump.boot_mem_addr[i], fw_dump.boot_mem_sz[i]); + } } /** @@ -349,10 +265,10 @@ static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, * that is required for a kernel to boot successfully. * */ -static inline unsigned long fadump_calculate_reserve_size(void) +static inline u64 fadump_calculate_reserve_size(void) { + u64 base, size, bootmem_min; int ret; - unsigned long long base, size; if (fw_dump.reserve_bootvar) pr_warn("'fadump_reserve_mem=' parameter is deprecated in favor of 'crashkernel=' parameter.\n"); @@ -402,7 +318,8 @@ static inline unsigned long fadump_calculate_reserve_size(void) if (memory_limit && size > memory_limit) size = memory_limit; - return (size > MIN_BOOT_MEM ? size : MIN_BOOT_MEM); + bootmem_min = fw_dump.ops->fadump_get_bootmem_min(); + return (size > bootmem_min ? 
size : bootmem_min); } /* @@ -423,57 +340,136 @@ static unsigned long get_fadump_area_size(void) size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2); size = PAGE_ALIGN(size); + + /* This is to hold kernel metadata on platforms that support it */ + size += (fw_dump.ops->fadump_get_metadata_size ? + fw_dump.ops->fadump_get_metadata_size() : 0); return size; } -static void __init fadump_reserve_crash_area(unsigned long base, - unsigned long size) +static int __init add_boot_mem_region(unsigned long rstart, + unsigned long rsize) +{ + int i = fw_dump.boot_mem_regs_cnt++; + + if (fw_dump.boot_mem_regs_cnt > FADUMP_MAX_MEM_REGS) { + fw_dump.boot_mem_regs_cnt = FADUMP_MAX_MEM_REGS; + return 0; + } + + pr_debug("Added boot memory range[%d] [%#016lx-%#016lx)\n", + i, rstart, (rstart + rsize)); + fw_dump.boot_mem_addr[i] = rstart; + fw_dump.boot_mem_sz[i] = rsize; + return 1; +} + +/* + * Firmware usually has a hard limit on the data it can copy per region. + * Honour that by splitting a memory range into multiple regions. + */ +static int __init add_boot_mem_regions(unsigned long mstart, + unsigned long msize) { + unsigned long rstart, rsize, max_size; + int ret = 1; + + rstart = mstart; + max_size = fw_dump.max_copy_size ? fw_dump.max_copy_size : msize; + while (msize) { + if (msize > max_size) + rsize = max_size; + else + rsize = msize; + + ret = add_boot_mem_region(rstart, rsize); + if (!ret) + break; + + msize -= rsize; + rstart += rsize; + } + + return ret; +} + +static int __init fadump_get_boot_mem_regions(void) +{ + unsigned long base, size, cur_size, hole_size, last_end; + unsigned long mem_size = fw_dump.boot_memory_size; struct memblock_region *reg; - unsigned long mstart, mend, msize; + int ret = 1; + + fw_dump.boot_mem_regs_cnt = 0; + last_end = 0; + hole_size = 0; + cur_size = 0; for_each_memblock(memory, reg) { - mstart = max_t(unsigned long, base, reg->base); - mend = reg->base + reg->size; - mend = min(base + size, mend); - - if (mstart < mend) { - msize = mend - mstart; - memblock_reserve(mstart, msize); - pr_info("Reserved %ldMB of memory at %#016lx for saving crash dump\n", - (msize >> 20), mstart); + base = reg->base; + size = reg->size; + hole_size += (base - last_end); + + if ((cur_size + size) >= mem_size) { + size = (mem_size - cur_size); + ret = add_boot_mem_regions(base, size); + break; } + + mem_size -= size; + cur_size += size; + ret = add_boot_mem_regions(base, size); + if (!ret) + break; + + last_end = base + size; } + fw_dump.boot_mem_top = PAGE_ALIGN(fw_dump.boot_memory_size + hole_size); + + return ret; } int __init fadump_reserve_mem(void) { - unsigned long base, size, memory_boundary; + u64 base, size, mem_boundary, bootmem_min, align = PAGE_SIZE; + bool is_memblock_bottom_up = memblock_bottom_up(); + int ret = 1; if (!fw_dump.fadump_enabled) return 0; if (!fw_dump.fadump_supported) { - printk(KERN_INFO "Firmware-assisted dump is not supported on" - " this hardware\n"); - fw_dump.fadump_enabled = 0; - return 0; + pr_info("Firmware-Assisted Dump is not supported on this hardware\n"); + goto error_out; } + /* * Initialize boot memory size * If dump is active then we have already calculated the size during * first kernel. 
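add_boot_mem_regions() above honours the firmware's per-region copy limit by chopping one logical range into several registered regions. A self-contained sketch of the same splitting policy (add_region() is a hypothetical consumer standing in for add_boot_mem_region()):

static void add_region(unsigned long start, unsigned long size); /* hypothetical */

/* Sketch: split [mstart, mstart + msize) into chunks of at most max_size. */
static void split_range(unsigned long mstart, unsigned long msize,
			unsigned long max_size)
{
	while (msize) {
		/* e.g. 5GB with a 2GB max_copy_size yields 2GB + 2GB + 1GB */
		unsigned long rsize = (msize > max_size) ? max_size : msize;

		add_region(mstart, rsize);
		mstart += rsize;
		msize -= rsize;
	}
}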
*/ - if (fdm_active) - fw_dump.boot_memory_size = be64_to_cpu(fdm_active->rmr_region.source_len); - else { - fw_dump.boot_memory_size = fadump_calculate_reserve_size(); + if (!fw_dump.dump_active) { + fw_dump.boot_memory_size = + PAGE_ALIGN(fadump_calculate_reserve_size()); #ifdef CONFIG_CMA - if (!fw_dump.nocma) + if (!fw_dump.nocma) { + align = FADUMP_CMA_ALIGNMENT; fw_dump.boot_memory_size = - ALIGN(fw_dump.boot_memory_size, - FADUMP_CMA_ALIGNMENT); + ALIGN(fw_dump.boot_memory_size, align); + } #endif + + bootmem_min = fw_dump.ops->fadump_get_bootmem_min(); + if (fw_dump.boot_memory_size < bootmem_min) { + pr_err("Can't enable fadump with boot memory size (0x%lx) less than 0x%llx\n", + fw_dump.boot_memory_size, bootmem_min); + goto error_out; + } + + if (!fadump_get_boot_mem_regions()) { + pr_err("Too many holes in boot memory area to enable fadump\n"); + goto error_out; + } } /* @@ -493,10 +489,13 @@ int __init fadump_reserve_mem(void) " dump, now %#016llx\n", memory_limit); } if (memory_limit) - memory_boundary = memory_limit; + mem_boundary = memory_limit; else - memory_boundary = memblock_end_of_DRAM(); + mem_boundary = memblock_end_of_DRAM(); + base = fw_dump.boot_mem_top; + size = get_fadump_area_size(); + fw_dump.reserve_dump_area_size = size; if (fw_dump.dump_active) { pr_info("Firmware-assisted dump is active.\n"); @@ -510,58 +509,55 @@ int __init fadump_reserve_mem(void) #endif /* * If last boot has crashed then reserve all the memory - * above boot_memory_size so that we don't touch it until + * above boot memory size so that we don't touch it until * dump is written to disk by userspace tool. This memory - * will be released for general use once the dump is saved. + * can be released for general use by invalidating fadump. */ - base = fw_dump.boot_memory_size; - size = memory_boundary - base; - fadump_reserve_crash_area(base, size); - - fw_dump.fadumphdr_addr = - be64_to_cpu(fdm_active->rmr_region.destination_address) + - be64_to_cpu(fdm_active->rmr_region.source_len); - pr_debug("fadumphdr_addr = %pa\n", &fw_dump.fadumphdr_addr); - fw_dump.reserve_dump_area_start = base; - fw_dump.reserve_dump_area_size = size; - } else { - size = get_fadump_area_size(); + fadump_reserve_crash_area(base); + pr_debug("fadumphdr_addr = %#016lx\n", fw_dump.fadumphdr_addr); + pr_debug("Reserve dump area start address: 0x%lx\n", + fw_dump.reserve_dump_area_start); + } else { /* * Reserve memory at an offset closer to bottom of the RAM to - * minimize the impact of memory hot-remove operation. We can't - * use memblock_find_in_range() here since it doesn't allocate - * from bottom to top. + * minimize the impact of memory hot-remove operation. */ - for (base = fw_dump.boot_memory_size; - base <= (memory_boundary - size); - base += size) { - if (memblock_is_region_memory(base, size) && - !memblock_is_region_reserved(base, size)) - break; + memblock_set_bottom_up(true); + base = memblock_find_in_range(base, mem_boundary, size, align); + + /* Restore the previous allocation mode */ + memblock_set_bottom_up(is_memblock_bottom_up); + + if (!base) { + pr_err("Failed to find memory chunk for reservation!\n"); + goto error_out; } - if ((base > (memory_boundary - size)) || - memblock_reserve(base, size)) { - pr_err("Failed to reserve memory\n"); - return 0; + fw_dump.reserve_dump_area_start = base; + + /* + * Calculate the kernel metadata address and register it with + * f/w if the platform supports. 
+ */ + if (fw_dump.ops->fadump_setup_metadata && + (fw_dump.ops->fadump_setup_metadata(&fw_dump) < 0)) + goto error_out; + + if (memblock_reserve(base, size)) { + pr_err("Failed to reserve memory!\n"); + goto error_out; } - pr_info("Reserved %ldMB of memory at %ldMB for firmware-" - "assisted dump (System RAM: %ldMB)\n", - (unsigned long)(size >> 20), - (unsigned long)(base >> 20), - (unsigned long)(memblock_phys_mem_size() >> 20)); + pr_info("Reserved %lldMB of memory at %#016llx (System RAM: %lldMB)\n", + (size >> 20), base, (memblock_phys_mem_size() >> 20)); - fw_dump.reserve_dump_area_start = base; - fw_dump.reserve_dump_area_size = size; - return fadump_cma_init(); + ret = fadump_cma_init(); } - return 1; -} -unsigned long __init arch_reserved_kernel_pages(void) -{ - return memblock_reserved_size() / PAGE_SIZE; + return ret; +error_out: + fw_dump.fadump_enabled = 0; + return 0; } /* Look for fadump= cmdline option. */ @@ -596,61 +592,6 @@ static int __init early_fadump_reserve_mem(char *p) } early_param("fadump_reserve_mem", early_fadump_reserve_mem); -static int register_fw_dump(struct fadump_mem_struct *fdm) -{ - int rc, err; - unsigned int wait_time; - - pr_debug("Registering for firmware-assisted kernel dump...\n"); - - /* TODO: Add upper time limit for the delay */ - do { - rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, - FADUMP_REGISTER, fdm, - sizeof(struct fadump_mem_struct)); - - wait_time = rtas_busy_delay_time(rc); - if (wait_time) - mdelay(wait_time); - - } while (wait_time); - - err = -EIO; - switch (rc) { - default: - pr_err("Failed to register. Unknown Error(%d).\n", rc); - break; - case -1: - printk(KERN_ERR "Failed to register firmware-assisted kernel" - " dump. Hardware Error(%d).\n", rc); - break; - case -3: - if (!is_boot_memory_area_contiguous()) - pr_err("Can't have holes in boot memory area while registering fadump\n"); - else if (!is_reserved_memory_area_contiguous()) - pr_err("Can't have holes in reserved memory area while" - " registering fadump\n"); - - printk(KERN_ERR "Failed to register firmware-assisted kernel" - " dump. 
Parameter Error(%d).\n", rc); - err = -EINVAL; - break; - case -9: - printk(KERN_ERR "firmware-assisted kernel dump is already " - " registered."); - fw_dump.dump_registered = 1; - err = -EEXIST; - break; - case 0: - printk(KERN_INFO "firmware-assisted kernel dump registration" - " is successful\n"); - fw_dump.dump_registered = 1; - err = 0; - break; - } - return err; -} - void crash_fadump(struct pt_regs *regs, const char *str) { struct fadump_crash_info_header *fdh = NULL; @@ -693,71 +634,10 @@ void crash_fadump(struct pt_regs *regs, const char *str) fdh->online_mask = *cpu_online_mask; - /* Call ibm,os-term rtas call to trigger firmware assisted dump */ - rtas_os_term((char *)str); -} - -#define GPR_MASK 0xffffff0000000000 -static inline int fadump_gpr_index(u64 id) -{ - int i = -1; - char str[3]; - - if ((id & GPR_MASK) == REG_ID("GPR")) { - /* get the digits at the end */ - id &= ~GPR_MASK; - id >>= 24; - str[2] = '\0'; - str[1] = id & 0xff; - str[0] = (id >> 8) & 0xff; - sscanf(str, "%d", &i); - if (i > 31) - i = -1; - } - return i; -} - -static inline void fadump_set_regval(struct pt_regs *regs, u64 reg_id, - u64 reg_val) -{ - int i; - - i = fadump_gpr_index(reg_id); - if (i >= 0) - regs->gpr[i] = (unsigned long)reg_val; - else if (reg_id == REG_ID("NIA")) - regs->nip = (unsigned long)reg_val; - else if (reg_id == REG_ID("MSR")) - regs->msr = (unsigned long)reg_val; - else if (reg_id == REG_ID("CTR")) - regs->ctr = (unsigned long)reg_val; - else if (reg_id == REG_ID("LR")) - regs->link = (unsigned long)reg_val; - else if (reg_id == REG_ID("XER")) - regs->xer = (unsigned long)reg_val; - else if (reg_id == REG_ID("CR")) - regs->ccr = (unsigned long)reg_val; - else if (reg_id == REG_ID("DAR")) - regs->dar = (unsigned long)reg_val; - else if (reg_id == REG_ID("DSISR")) - regs->dsisr = (unsigned long)reg_val; -} - -static struct fadump_reg_entry* -fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs) -{ - memset(regs, 0, sizeof(struct pt_regs)); - - while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND")) { - fadump_set_regval(regs, be64_to_cpu(reg_entry->reg_id), - be64_to_cpu(reg_entry->reg_value)); - reg_entry++; - } - reg_entry++; - return reg_entry; + fw_dump.ops->fadump_trigger(fdh, str); } -static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) +u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) { struct elf_prstatus prstatus; @@ -772,7 +652,7 @@ static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) return buf; } -static void fadump_update_elfcore_header(char *bufp) +void fadump_update_elfcore_header(char *bufp) { struct elfhdr *elf; struct elf_phdr *phdr; @@ -784,7 +664,7 @@ static void fadump_update_elfcore_header(char *bufp) phdr = (struct elf_phdr *)bufp; if (phdr->p_type == PT_NOTE) { - phdr->p_paddr = fw_dump.cpu_notes_buf; + phdr->p_paddr = __pa(fw_dump.cpu_notes_buf_vaddr); phdr->p_offset = phdr->p_paddr; phdr->p_filesz = fw_dump.cpu_notes_buf_size; phdr->p_memsz = fw_dump.cpu_notes_buf_size; @@ -792,228 +672,100 @@ static void fadump_update_elfcore_header(char *bufp) return; } -static void *fadump_cpu_notes_buf_alloc(unsigned long size) +static void *fadump_alloc_buffer(unsigned long size) { - void *vaddr; + unsigned long count, i; struct page *page; - unsigned long order, count, i; + void *vaddr; - order = get_order(size); - vaddr = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, order); + vaddr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); if (!vaddr) return NULL; - count = 1 << order; + count = 
PAGE_ALIGN(size) / PAGE_SIZE; page = virt_to_page(vaddr); for (i = 0; i < count; i++) - SetPageReserved(page + i); + mark_page_reserved(page + i); return vaddr; } -static void fadump_cpu_notes_buf_free(unsigned long vaddr, unsigned long size) +static void fadump_free_buffer(unsigned long vaddr, unsigned long size) { - struct page *page; - unsigned long order, count, i; - - order = get_order(size); - count = 1 << order; - page = virt_to_page(vaddr); - for (i = 0; i < count; i++) - ClearPageReserved(page + i); - __free_pages(page, order); + free_reserved_area((void *)vaddr, (void *)(vaddr + size), -1, NULL); } -/* - * Read CPU state dump data and convert it into ELF notes. - * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be - * used to access the data to allow for additional fields to be added without - * affecting compatibility. Each list of registers for a CPU starts with - * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes, - * 8 Byte ASCII identifier and 8 Byte register value. The register entry - * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part - * of register value. For more details refer to PAPR document. - * - * Only for the crashing cpu we ignore the CPU dump data and get exact - * state from fadump crash info structure populated by first kernel at the - * time of crash. - */ -static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm) +s32 fadump_setup_cpu_notes_buf(u32 num_cpus) { - struct fadump_reg_save_area_header *reg_header; - struct fadump_reg_entry *reg_entry; - struct fadump_crash_info_header *fdh = NULL; - void *vaddr; - unsigned long addr; - u32 num_cpus, *note_buf; - struct pt_regs regs; - int i, rc = 0, cpu = 0; - - if (!fdm->cpu_state_data.bytes_dumped) - return -EINVAL; - - addr = be64_to_cpu(fdm->cpu_state_data.destination_address); - vaddr = __va(addr); - - reg_header = vaddr; - if (be64_to_cpu(reg_header->magic_number) != REGSAVE_AREA_MAGIC) { - printk(KERN_ERR "Unable to read register save area.\n"); - return -ENOENT; - } - pr_debug("--------CPU State Data------------\n"); - pr_debug("Magic Number: %llx\n", be64_to_cpu(reg_header->magic_number)); - pr_debug("NumCpuOffset: %x\n", be32_to_cpu(reg_header->num_cpu_offset)); - - vaddr += be32_to_cpu(reg_header->num_cpu_offset); - num_cpus = be32_to_cpu(*((__be32 *)(vaddr))); - pr_debug("NumCpus : %u\n", num_cpus); - vaddr += sizeof(u32); - reg_entry = (struct fadump_reg_entry *)vaddr; - /* Allocate buffer to hold cpu crash notes. 
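fadump_alloc_buffer() above also changes allocator: __get_free_pages() rounds the request up to a power-of-two number of pages, while alloc_pages_exact() hands back the rounding excess, which matters because the notes buffer is page-aligned but rarely a power-of-two size. A hedged sketch of the pairing used here (mark_page_reserved() also adjusts the managed-page count, unlike the bare SetPageReserved() it replaces):

#include <linux/gfp.h>
#include <linux/mm.h>

/* Sketch: exact-size, page-reserved buffer as in fadump_alloc_buffer(). */
static void *alloc_reserved_exact(unsigned long size)
{
	unsigned long i, count = PAGE_ALIGN(size) / PAGE_SIZE;
	void *vaddr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);

	if (!vaddr)
		return NULL;
	for (i = 0; i < count; i++)
		mark_page_reserved(virt_to_page(vaddr) + i);
	return vaddr;
}

The matching teardown is free_reserved_area(), which clears the reserved bit and returns the pages to the allocator in one call, as fadump_free_buffer() does.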
*/ fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t); fw_dump.cpu_notes_buf_size = PAGE_ALIGN(fw_dump.cpu_notes_buf_size); - note_buf = fadump_cpu_notes_buf_alloc(fw_dump.cpu_notes_buf_size); - if (!note_buf) { - printk(KERN_ERR "Failed to allocate 0x%lx bytes for " - "cpu notes buffer\n", fw_dump.cpu_notes_buf_size); + fw_dump.cpu_notes_buf_vaddr = + (unsigned long)fadump_alloc_buffer(fw_dump.cpu_notes_buf_size); + if (!fw_dump.cpu_notes_buf_vaddr) { + pr_err("Failed to allocate %ld bytes for CPU notes buffer\n", + fw_dump.cpu_notes_buf_size); return -ENOMEM; } - fw_dump.cpu_notes_buf = __pa(note_buf); - - pr_debug("Allocated buffer for cpu notes of size %ld at %p\n", - (num_cpus * sizeof(note_buf_t)), note_buf); - if (fw_dump.fadumphdr_addr) - fdh = __va(fw_dump.fadumphdr_addr); - - for (i = 0; i < num_cpus; i++) { - if (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUSTRT")) { - printk(KERN_ERR "Unable to read CPU state data\n"); - rc = -ENOENT; - goto error_out; - } - /* Lower 4 bytes of reg_value contains logical cpu id */ - cpu = be64_to_cpu(reg_entry->reg_value) & FADUMP_CPU_ID_MASK; - if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) { - SKIP_TO_NEXT_CPU(reg_entry); - continue; - } - pr_debug("Reading register data for cpu %d...\n", cpu); - if (fdh && fdh->crashing_cpu == cpu) { - regs = fdh->regs; - note_buf = fadump_regs_to_elf_notes(note_buf, &regs); - SKIP_TO_NEXT_CPU(reg_entry); - } else { - reg_entry++; - reg_entry = fadump_read_registers(reg_entry, &regs); - note_buf = fadump_regs_to_elf_notes(note_buf, &regs); - } - } - final_note(note_buf); - - if (fdh) { - pr_debug("Updating elfcore header (%llx) with cpu notes\n", - fdh->elfcorehdr_addr); - fadump_update_elfcore_header((char *)__va(fdh->elfcorehdr_addr)); - } + pr_debug("Allocated buffer for cpu notes of size %ld at 0x%lx\n", + fw_dump.cpu_notes_buf_size, + fw_dump.cpu_notes_buf_vaddr); return 0; - -error_out: - fadump_cpu_notes_buf_free((unsigned long)__va(fw_dump.cpu_notes_buf), - fw_dump.cpu_notes_buf_size); - fw_dump.cpu_notes_buf = 0; - fw_dump.cpu_notes_buf_size = 0; - return rc; - } -/* - * Validate and process the dump data stored by firmware before exporting - * it through '/proc/vmcore'. - */ -static int __init process_fadump(const struct fadump_mem_struct *fdm_active) +void fadump_free_cpu_notes_buf(void) { - struct fadump_crash_info_header *fdh; - int rc = 0; - - if (!fdm_active || !fw_dump.fadumphdr_addr) - return -EINVAL; - - /* Check if the dump data is valid. */ - if ((be16_to_cpu(fdm_active->header.dump_status_flag) == FADUMP_ERROR_FLAG) || - (fdm_active->cpu_state_data.error_flags != 0) || - (fdm_active->rmr_region.error_flags != 0)) { - printk(KERN_ERR "Dump taken by platform is not valid\n"); - return -EINVAL; - } - if ((fdm_active->rmr_region.bytes_dumped != - fdm_active->rmr_region.source_len) || - !fdm_active->cpu_state_data.bytes_dumped) { - printk(KERN_ERR "Dump taken by platform is incomplete\n"); - return -EINVAL; - } - - /* Validate the fadump crash info header */ - fdh = __va(fw_dump.fadumphdr_addr); - if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { - printk(KERN_ERR "Crash info header is not valid.\n"); - return -EINVAL; - } - - rc = fadump_build_cpu_notes(fdm_active); - if (rc) - return rc; - - /* - * We are done validating dump info and elfcore header is now ready - * to be exported. set elfcorehdr_addr so that vmcore module will - * export the elfcore header through '/proc/vmcore'.
- */ - elfcorehdr_addr = fdh->elfcorehdr_addr; + if (!fw_dump.cpu_notes_buf_vaddr) + return; - return 0; + fadump_free_buffer(fw_dump.cpu_notes_buf_vaddr, + fw_dump.cpu_notes_buf_size); + fw_dump.cpu_notes_buf_vaddr = 0; + fw_dump.cpu_notes_buf_size = 0; } -static void free_crash_memory_ranges(void) +static void fadump_free_mem_ranges(struct fadump_mrange_info *mrange_info) { - kfree(crash_memory_ranges); - crash_memory_ranges = NULL; - crash_memory_ranges_size = 0; - max_crash_mem_ranges = 0; + kfree(mrange_info->mem_ranges); + mrange_info->mem_ranges = NULL; + mrange_info->mem_ranges_sz = 0; + mrange_info->max_mem_ranges = 0; } /* - * Allocate or reallocate crash memory ranges array in incremental units + * Allocate or reallocate mem_ranges array in incremental units * of PAGE_SIZE. */ -static int allocate_crash_memory_ranges(void) +static int fadump_alloc_mem_ranges(struct fadump_mrange_info *mrange_info) { - struct fad_crash_memory_ranges *new_array; + struct fadump_memory_range *new_array; u64 new_size; - new_size = crash_memory_ranges_size + PAGE_SIZE; - pr_debug("Allocating %llu bytes of memory for crash memory ranges\n", - new_size); + new_size = mrange_info->mem_ranges_sz + PAGE_SIZE; + pr_debug("Allocating %llu bytes of memory for %s memory ranges\n", + new_size, mrange_info->name); - new_array = krealloc(crash_memory_ranges, new_size, GFP_KERNEL); + new_array = krealloc(mrange_info->mem_ranges, new_size, GFP_KERNEL); if (new_array == NULL) { - pr_err("Insufficient memory for setting up crash memory ranges\n"); - free_crash_memory_ranges(); + pr_err("Insufficient memory for setting up %s memory ranges\n", + mrange_info->name); + fadump_free_mem_ranges(mrange_info); return -ENOMEM; } - crash_memory_ranges = new_array; - crash_memory_ranges_size = new_size; - max_crash_mem_ranges = (new_size / - sizeof(struct fad_crash_memory_ranges)); + mrange_info->mem_ranges = new_array; + mrange_info->mem_ranges_sz = new_size; + mrange_info->max_mem_ranges = (new_size / + sizeof(struct fadump_memory_range)); return 0; } -static inline int fadump_add_crash_memory(unsigned long long base, - unsigned long long end) +static inline int fadump_add_mem_range(struct fadump_mrange_info *mrange_info, + u64 base, u64 end) { - u64 start, size; + struct fadump_memory_range *mem_ranges = mrange_info->mem_ranges; bool is_adjacent = false; + u64 start, size; if (base == end) return 0; @@ -1022,38 +774,41 @@ static inline int fadump_add_crash_memory(unsigned long long base, * Fold adjacent memory ranges to bring down the memory ranges/ * PT_LOAD segments count. 
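The fold implemented just below only ever inspects the tail entry: callers feed ranges in ascending order, so a new range that begins exactly where the previous one ended extends that entry instead of consuming a slot. The same condition as a self-contained sketch (struct range_entry and the ascending-order contract are assumptions of this note):

#include <linux/types.h>

struct range_entry { u64 base, size; };

/* Sketch: append [base, end) to an ascending range list, folding the tail. */
static void add_range(struct range_entry *r, int *cnt, u64 base, u64 end)
{
	if (*cnt && (r[*cnt - 1].base + r[*cnt - 1].size == base)) {
		r[*cnt - 1].size += end - base;	/* fold into the tail */
		return;
	}
	r[*cnt].base = base;			/* start a new entry */
	r[*cnt].size = end - base;
	(*cnt)++;
}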
*/ - if (crash_mem_ranges) { - start = crash_memory_ranges[crash_mem_ranges - 1].base; - size = crash_memory_ranges[crash_mem_ranges - 1].size; + if (mrange_info->mem_range_cnt) { + start = mem_ranges[mrange_info->mem_range_cnt - 1].base; + size = mem_ranges[mrange_info->mem_range_cnt - 1].size; if ((start + size) == base) is_adjacent = true; } if (!is_adjacent) { /* resize the array on reaching the limit */ - if (crash_mem_ranges == max_crash_mem_ranges) { + if (mrange_info->mem_range_cnt == mrange_info->max_mem_ranges) { int ret; - ret = allocate_crash_memory_ranges(); + ret = fadump_alloc_mem_ranges(mrange_info); if (ret) return ret; + + /* Update to the new resized array */ + mem_ranges = mrange_info->mem_ranges; } start = base; - crash_memory_ranges[crash_mem_ranges].base = start; - crash_mem_ranges++; + mem_ranges[mrange_info->mem_range_cnt].base = start; + mrange_info->mem_range_cnt++; } - crash_memory_ranges[crash_mem_ranges - 1].size = (end - start); - pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n", - (crash_mem_ranges - 1), start, end - 1, (end - start)); + mem_ranges[mrange_info->mem_range_cnt - 1].size = (end - start); + pr_debug("%s_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n", + mrange_info->name, (mrange_info->mem_range_cnt - 1), + start, end - 1, (end - start)); return 0; } -static int fadump_exclude_reserved_area(unsigned long long start, - unsigned long long end) +static int fadump_exclude_reserved_area(u64 start, u64 end) { - unsigned long long ra_start, ra_end; + u64 ra_start, ra_end; int ret = 0; ra_start = fw_dump.reserve_dump_area_start; @@ -1061,18 +816,22 @@ static int fadump_exclude_reserved_area(unsigned long long start, if ((ra_start < end) && (ra_end > start)) { if ((start < ra_start) && (end > ra_end)) { - ret = fadump_add_crash_memory(start, ra_start); + ret = fadump_add_mem_range(&crash_mrange_info, + start, ra_start); if (ret) return ret; - ret = fadump_add_crash_memory(ra_end, end); + ret = fadump_add_mem_range(&crash_mrange_info, + ra_end, end); } else if (start < ra_start) { - ret = fadump_add_crash_memory(start, ra_start); + ret = fadump_add_mem_range(&crash_mrange_info, + start, ra_start); } else if (ra_end < end) { - ret = fadump_add_crash_memory(ra_end, end); + ret = fadump_add_mem_range(&crash_mrange_info, + ra_end, end); } } else - ret = fadump_add_crash_memory(start, end); + ret = fadump_add_mem_range(&crash_mrange_info, start, end); return ret; } @@ -1117,36 +876,36 @@ static int fadump_init_elfcore_header(char *bufp) static int fadump_setup_crash_memory_ranges(void) { struct memblock_region *reg; - unsigned long long start, end; - int ret; + u64 start, end; + int i, ret; pr_debug("Setup crash memory ranges.\n"); - crash_mem_ranges = 0; + crash_mrange_info.mem_range_cnt = 0; /* - * add the first memory chunk (RMA_START through boot_memory_size) as - * a separate memory chunk. The reason is, at the time crash firmware - * will move the content of this memory chunk to different location - * specified during fadump registration. We need to create a separate - * program header for this chunk with the correct offset. + * Boot memory region(s) registered with firmware are moved to + * different location at the time of crash. Create separate program + * header(s) for this memory chunk(s) with the correct offset. 
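fadump_exclude_reserved_area() above carves the reserved dump window out of each candidate range before recording it, which can split one range into two. The case analysis as a compact sketch (emit() is a hypothetical sink standing in for fadump_add_mem_range()):

static void emit(u64 start, u64 end); /* hypothetical */

/* Sketch: record [start, end) minus the window [ra_start, ra_end). */
static void exclude_window(u64 start, u64 end, u64 ra_start, u64 ra_end)
{
	if (ra_start >= end || ra_end <= start) {
		emit(start, end);	/* no overlap: keep the whole range */
		return;
	}
	if (start < ra_start)
		emit(start, ra_start);	/* piece below the window */
	if (ra_end < end)
		emit(ra_end, end);	/* piece above the window */
	/* a range fully inside the window is dropped */
}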
*/ - ret = fadump_add_crash_memory(RMA_START, fw_dump.boot_memory_size); - if (ret) - return ret; + for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) { + start = fw_dump.boot_mem_addr[i]; + end = start + fw_dump.boot_mem_sz[i]; + ret = fadump_add_mem_range(&crash_mrange_info, start, end); + if (ret) + return ret; + } for_each_memblock(memory, reg) { - start = (unsigned long long)reg->base; - end = start + (unsigned long long)reg->size; + start = (u64)reg->base; + end = start + (u64)reg->size; /* - * skip the first memory chunk that is already added (RMA_START - * through boot_memory_size). This logic needs a relook if and - * when RMA_START changes to a non-zero value. + * skip the memory chunk that is already added + * (0 through boot_memory_top). */ - BUILD_BUG_ON(RMA_START != 0); - if (start < fw_dump.boot_memory_size) { - if (end > fw_dump.boot_memory_size) - start = fw_dump.boot_memory_size; + if (start < fw_dump.boot_mem_top) { + if (end > fw_dump.boot_mem_top) + start = fw_dump.boot_mem_top; else continue; } @@ -1167,17 +926,35 @@ static int fadump_setup_crash_memory_ranges(void) */ static inline unsigned long fadump_relocate(unsigned long paddr) { - if (paddr > RMA_START && paddr < fw_dump.boot_memory_size) - return be64_to_cpu(fdm.rmr_region.destination_address) + paddr; - else - return paddr; + unsigned long raddr, rstart, rend, rlast, hole_size; + int i; + + hole_size = 0; + rlast = 0; + raddr = paddr; + for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) { + rstart = fw_dump.boot_mem_addr[i]; + rend = rstart + fw_dump.boot_mem_sz[i]; + hole_size += (rstart - rlast); + + if (paddr >= rstart && paddr < rend) { + raddr += fw_dump.boot_mem_dest_addr - hole_size; + break; + } + + rlast = rend; + } + + pr_debug("vmcoreinfo: paddr = 0x%lx, raddr = 0x%lx\n", paddr, raddr); + return raddr; } static int fadump_create_elfcore_headers(char *bufp) { - struct elfhdr *elf; + unsigned long long raddr, offset; struct elf_phdr *phdr; - int i; + struct elfhdr *elf; + int i, j; fadump_init_elfcore_header(bufp); elf = (struct elfhdr *)bufp; @@ -1220,12 +997,14 @@ static int fadump_create_elfcore_headers(char *bufp) (elf->e_phnum)++; /* setup PT_LOAD sections. */ - - for (i = 0; i < crash_mem_ranges; i++) { - unsigned long long mbase, msize; - mbase = crash_memory_ranges[i].base; - msize = crash_memory_ranges[i].size; - + j = 0; + offset = 0; + raddr = fw_dump.boot_mem_addr[0]; + for (i = 0; i < crash_mrange_info.mem_range_cnt; i++) { + u64 mbase, msize; + + mbase = crash_mrange_info.mem_ranges[i].base; + msize = crash_mrange_info.mem_ranges[i].size; if (!msize) continue; @@ -1235,13 +1014,17 @@ static int fadump_create_elfcore_headers(char *bufp) phdr->p_flags = PF_R|PF_W|PF_X; phdr->p_offset = mbase; - if (mbase == RMA_START) { + if (mbase == raddr) { /* - * The entire RMA region will be moved by firmware - * to the specified destination_address. Hence set - * the correct offset. + * The entire real memory region will be moved by + * firmware to the specified destination_address. + * Hence set the correct offset. 
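fadump_relocate() above translates a physical address inside one of the (possibly discontiguous) boot memory regions to its position in the firmware's contiguous destination copy; in effect raddr = paddr + boot_mem_dest_addr - holes_before(region), where holes_before() is this note's name for the accumulated hole_size. A worked example with assumed values: for regions [0, 1G) and [2G, 3G) with boot_mem_dest_addr = D, an address 2G + X sees hole_size = 1G when the loop reaches the second region, so raddr = (2G + X) + D - 1G = D + 1G + X, placing the second region immediately after the first in the copy.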
*/ - phdr->p_offset = be64_to_cpu(fdm.rmr_region.destination_address); + phdr->p_offset = fw_dump.boot_mem_dest_addr + offset; + if (j < (fw_dump.boot_mem_regs_cnt - 1)) { + offset += fw_dump.boot_mem_sz[j]; + raddr = fw_dump.boot_mem_addr[++j]; + } } phdr->p_paddr = mbase; @@ -1263,7 +1046,6 @@ static unsigned long init_fadump_header(unsigned long addr) if (!addr) return 0; - fw_dump.fadumphdr_addr = addr; fdh = __va(addr); addr += sizeof(struct fadump_crash_info_header); @@ -1271,7 +1053,7 @@ static unsigned long init_fadump_header(unsigned long addr) fdh->magic_number = FADUMP_CRASH_INFO_MAGIC; fdh->elfcorehdr_addr = addr; /* We will set the crashing cpu id in crash_fadump() during crash. */ - fdh->crashing_cpu = CPU_UNKNOWN; + fdh->crashing_cpu = FADUMP_CPU_UNKNOWN; return addr; } @@ -1293,7 +1075,8 @@ static int register_fadump(void) if (ret) return ret; - addr = be64_to_cpu(fdm.rmr_region.destination_address) + be64_to_cpu(fdm.rmr_region.source_len); + addr = fw_dump.fadumphdr_addr; + /* Initialize fadump crash info header. */ addr = init_fadump_header(addr); vaddr = __va(addr); @@ -1302,74 +1085,27 @@ static int register_fadump(void) fadump_create_elfcore_headers(vaddr); /* register the future kernel dump with firmware. */ - return register_fw_dump(&fdm); -} - -static int fadump_unregister_dump(struct fadump_mem_struct *fdm) -{ - int rc = 0; - unsigned int wait_time; - - pr_debug("Un-register firmware-assisted dump\n"); - - /* TODO: Add upper time limit for the delay */ - do { - rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, - FADUMP_UNREGISTER, fdm, - sizeof(struct fadump_mem_struct)); - - wait_time = rtas_busy_delay_time(rc); - if (wait_time) - mdelay(wait_time); - } while (wait_time); - - if (rc) { - printk(KERN_ERR "Failed to un-register firmware-assisted dump." - " unexpected error(%d).\n", rc); - return rc; - } - fw_dump.dump_registered = 0; - return 0; -} - -static int fadump_invalidate_dump(const struct fadump_mem_struct *fdm) -{ - int rc = 0; - unsigned int wait_time; - - pr_debug("Invalidating firmware-assisted dump registration\n"); - - /* TODO: Add upper time limit for the delay */ - do { - rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, - FADUMP_INVALIDATE, fdm, - sizeof(struct fadump_mem_struct)); - - wait_time = rtas_busy_delay_time(rc); - if (wait_time) - mdelay(wait_time); - } while (wait_time); - - if (rc) { - pr_err("Failed to invalidate firmware-assisted dump registration. Unexpected error (%d).\n", rc); - return rc; - } - fw_dump.dump_active = 0; - fdm_active = NULL; - return 0; + pr_debug("Registering for firmware-assisted kernel dump...\n"); + return fw_dump.ops->fadump_register(&fw_dump); } void fadump_cleanup(void) { + if (!fw_dump.fadump_supported) + return; + /* Invalidate the registration only if dump is active. */ if (fw_dump.dump_active) { - /* pass the same memory dump structure provided by platform */ - fadump_invalidate_dump(fdm_active); + pr_debug("Invalidating firmware-assisted dump registration\n"); + fw_dump.ops->fadump_invalidate(&fw_dump); } else if (fw_dump.dump_registered) { /* Un-register Firmware-assisted dump if it was registered. 
*/ - fadump_unregister_dump(&fdm); - free_crash_memory_ranges(); + fw_dump.ops->fadump_unregister(&fw_dump); + fadump_free_mem_ranges(&crash_mrange_info); } + + if (fw_dump.ops->fadump_cleanup) + fw_dump.ops->fadump_cleanup(&fw_dump); } static void fadump_free_reserved_memory(unsigned long start_pfn, @@ -1394,90 +1130,197 @@ static void fadump_free_reserved_memory(unsigned long start_pfn, /* * Skip memory holes and free memory that was actually reserved. */ -static void fadump_release_reserved_area(unsigned long start, unsigned long end) +static void fadump_release_reserved_area(u64 start, u64 end) { + u64 tstart, tend, spfn, epfn; struct memblock_region *reg; - unsigned long tstart, tend; - unsigned long start_pfn = PHYS_PFN(start); - unsigned long end_pfn = PHYS_PFN(end); + spfn = PHYS_PFN(start); + epfn = PHYS_PFN(end); for_each_memblock(memory, reg) { - tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); - tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); + tstart = max_t(u64, spfn, memblock_region_memory_base_pfn(reg)); + tend = min_t(u64, epfn, memblock_region_memory_end_pfn(reg)); if (tstart < tend) { fadump_free_reserved_memory(tstart, tend); - if (tend == end_pfn) + if (tend == epfn) break; - start_pfn = tend + 1; + spfn = tend; } } } /* - * Release the memory that was reserved in early boot to preserve the memory - * contents. The released memory will be available for general use. + * Sort the mem ranges in-place and merge adjacent ranges + * to minimize the memory ranges count. */ -static void fadump_release_memory(unsigned long begin, unsigned long end) +static void sort_and_merge_mem_ranges(struct fadump_mrange_info *mrange_info) { - unsigned long ra_start, ra_end; + struct fadump_memory_range *mem_ranges; + struct fadump_memory_range tmp_range; + u64 base, size; + int i, j, idx; + + if (!reserved_mrange_info.mem_range_cnt) + return; + + /* Sort the memory ranges */ + mem_ranges = mrange_info->mem_ranges; + for (i = 0; i < mrange_info->mem_range_cnt; i++) { + idx = i; + for (j = (i + 1); j < mrange_info->mem_range_cnt; j++) { + if (mem_ranges[idx].base > mem_ranges[j].base) + idx = j; + } + if (idx != i) { + tmp_range = mem_ranges[idx]; + mem_ranges[idx] = mem_ranges[i]; + mem_ranges[i] = tmp_range; + } + } + + /* Merge adjacent reserved ranges */ + idx = 0; + for (i = 1; i < mrange_info->mem_range_cnt; i++) { + base = mem_ranges[i-1].base; + size = mem_ranges[i-1].size; + if (mem_ranges[i].base == (base + size)) + mem_ranges[idx].size += mem_ranges[i].size; + else { + idx++; + if (i == idx) + continue; + + mem_ranges[idx] = mem_ranges[i]; + } + } + mrange_info->mem_range_cnt = idx + 1; +} + +/* + * Scan reserved-ranges to consider them while reserving/releasing + * memory for FADump. + */ +static inline int fadump_scan_reserved_mem_ranges(void) +{ + struct device_node *root; + const __be32 *prop; + int len, ret = -1; + unsigned long i; + + root = of_find_node_by_path("/"); + if (!root) + return ret; + + prop = of_get_property(root, "reserved-ranges", &len); + if (!prop) + return ret; + + /* + * Each reserved range is an (address,size) pair, 2 cells each, + * totalling 4 cells per range. 
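Each cell is a 32-bit big-endian quantity, so in the loop below of_read_number(prop + i * 4, 2) assembles the 64-bit base from two cells and of_read_number(prop + i * 4 + 2, 2) the 64-bit size; a property of len = 32 bytes therefore yields len / (sizeof(*prop) * 4) = 2 ranges. An assumed-value illustration:

/* Example (assumed values, not from any real device tree):
 *   reserved-ranges = <0x0 0x40000000 0x0 0x10000000>;
 * decodes to one range: base = 0x40000000, size = 0x10000000 (256MB).
 */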
+ */ + for (i = 0; i < len / (sizeof(*prop) * 4); i++) { + u64 base, size; + + base = of_read_number(prop + (i * 4) + 0, 2); + size = of_read_number(prop + (i * 4) + 2, 2); + + if (size) { + ret = fadump_add_mem_range(&reserved_mrange_info, + base, base + size); + if (ret < 0) { + pr_warn("some reserved ranges are ignored!\n"); + break; + } + } + } + + return ret; +} + +/* + * Release the memory that was reserved during early boot to preserve the + * crash'ed kernel's memory contents except reserved dump area (permanent + * reservation) and reserved ranges used by F/W. The released memory will + * be available for general use. + */ +static void fadump_release_memory(u64 begin, u64 end) +{ + u64 ra_start, ra_end, tstart; + int i, ret; + + fadump_scan_reserved_mem_ranges(); ra_start = fw_dump.reserve_dump_area_start; ra_end = ra_start + fw_dump.reserve_dump_area_size; /* - * exclude the dump reserve area. Will reuse it for next - * fadump registration. + * Add reserved dump area to reserved ranges list + * and exclude all these ranges while releasing memory. */ - if (begin < ra_end && end > ra_start) { - if (begin < ra_start) - fadump_release_reserved_area(begin, ra_start); - if (end > ra_end) - fadump_release_reserved_area(ra_end, end); - } else - fadump_release_reserved_area(begin, end); + ret = fadump_add_mem_range(&reserved_mrange_info, ra_start, ra_end); + if (ret != 0) { + /* + * Not enough memory to setup reserved ranges but the system is + * running shortage of memory. So, release all the memory except + * Reserved dump area (reused for next fadump registration). + */ + if (begin < ra_end && end > ra_start) { + if (begin < ra_start) + fadump_release_reserved_area(begin, ra_start); + if (end > ra_end) + fadump_release_reserved_area(ra_end, end); + } else + fadump_release_reserved_area(begin, end); + + return; + } + + /* Get the reserved ranges list in order first. */ + sort_and_merge_mem_ranges(&reserved_mrange_info); + + /* Exclude reserved ranges and release remaining memory */ + tstart = begin; + for (i = 0; i < reserved_mrange_info.mem_range_cnt; i++) { + ra_start = reserved_mrange_info.mem_ranges[i].base; + ra_end = ra_start + reserved_mrange_info.mem_ranges[i].size; + + if (tstart >= ra_end) + continue; + + if (tstart < ra_start) + fadump_release_reserved_area(tstart, ra_start); + tstart = ra_end; + } + + if (tstart < end) + fadump_release_reserved_area(tstart, end); } static void fadump_invalidate_release_mem(void) { - unsigned long reserved_area_start, reserved_area_end; - unsigned long destination_address; - mutex_lock(&fadump_mutex); if (!fw_dump.dump_active) { mutex_unlock(&fadump_mutex); return; } - destination_address = be64_to_cpu(fdm_active->cpu_state_data.destination_address); fadump_cleanup(); mutex_unlock(&fadump_mutex); + fadump_release_memory(fw_dump.boot_mem_top, memblock_end_of_DRAM()); + fadump_free_cpu_notes_buf(); + /* - * Save the current reserved memory bounds we will require them - * later for releasing the memory for general use. - */ - reserved_area_start = fw_dump.reserve_dump_area_start; - reserved_area_end = reserved_area_start + - fw_dump.reserve_dump_area_size; - /* - * Setup reserve_dump_area_start and its size so that we can - * reuse this reserved memory for Re-registration. + * Setup kernel metadata and initialize the kernel dump + * memory structure for FADump re-registration. 
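The release path earlier in this hunk is a sorted-interval sweep: once the reserved ranges are sorted and merged, everything between consecutive reserved ranges is released, plus any tail after the last one. A compact sketch, reusing struct range_entry from the earlier sketch (emit_release() is a hypothetical stand-in for fadump_release_reserved_area()):

static void emit_release(u64 start, u64 end); /* hypothetical */

/* Sketch: release [begin, end) minus sorted, merged reserved ranges. */
static void release_outside(u64 begin, u64 end,
			    const struct range_entry *res, int cnt)
{
	u64 tstart = begin;
	int i;

	for (i = 0; i < cnt; i++) {
		u64 rs = res[i].base, re = rs + res[i].size;

		if (tstart >= re)
			continue;	/* reserved range below the cursor */
		if (tstart < rs)
			emit_release(tstart, rs);
		tstart = re;		/* hop over the reserved range */
	}
	if (tstart < end)
		emit_release(tstart, end);
}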
*/ - fw_dump.reserve_dump_area_start = destination_address; - fw_dump.reserve_dump_area_size = get_fadump_area_size(); - - fadump_release_memory(reserved_area_start, reserved_area_end); - if (fw_dump.cpu_notes_buf) { - fadump_cpu_notes_buf_free( - (unsigned long)__va(fw_dump.cpu_notes_buf), - fw_dump.cpu_notes_buf_size); - fw_dump.cpu_notes_buf = 0; - fw_dump.cpu_notes_buf_size = 0; - } - /* Initialize the kernel dump memory structure for FAD registration. */ - init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); + if (fw_dump.ops->fadump_setup_metadata && + (fw_dump.ops->fadump_setup_metadata(&fw_dump) < 0)) + pr_warn("Failed to setup kernel metadata!\n"); + fw_dump.ops->fadump_init_mem_struct(&fw_dump); } static ssize_t fadump_release_memory_store(struct kobject *kobj, @@ -1528,7 +1371,7 @@ static ssize_t fadump_register_store(struct kobject *kobj, int ret = 0; int input = -1; - if (!fw_dump.fadump_enabled || fdm_active) + if (!fw_dump.fadump_enabled || fw_dump.dump_active) return -EPERM; if (kstrtoint(buf, 0, &input)) @@ -1541,13 +1384,15 @@ static ssize_t fadump_register_store(struct kobject *kobj, if (fw_dump.dump_registered == 0) { goto unlock_out; } + /* Un-register Firmware-assisted dump */ - fadump_unregister_dump(&fdm); + pr_debug("Un-register firmware-assisted dump\n"); + fw_dump.ops->fadump_unregister(&fw_dump); break; case 1: if (fw_dump.dump_registered == 1) { /* Un-register Firmware-assisted dump */ - fadump_unregister_dump(&fdm); + fw_dump.ops->fadump_unregister(&fw_dump); } /* Register Firmware-assisted dump */ ret = register_fadump(); @@ -1564,62 +1409,12 @@ unlock_out: static int fadump_region_show(struct seq_file *m, void *private) { - const struct fadump_mem_struct *fdm_ptr; - if (!fw_dump.fadump_enabled) return 0; mutex_lock(&fadump_mutex); - if (fdm_active) - fdm_ptr = fdm_active; - else { - mutex_unlock(&fadump_mutex); - fdm_ptr = &fdm; - } - - seq_printf(m, - "CPU : [%#016llx-%#016llx] %#llx bytes, " - "Dumped: %#llx\n", - be64_to_cpu(fdm_ptr->cpu_state_data.destination_address), - be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) + - be64_to_cpu(fdm_ptr->cpu_state_data.source_len) - 1, - be64_to_cpu(fdm_ptr->cpu_state_data.source_len), - be64_to_cpu(fdm_ptr->cpu_state_data.bytes_dumped)); - seq_printf(m, - "HPTE: [%#016llx-%#016llx] %#llx bytes, " - "Dumped: %#llx\n", - be64_to_cpu(fdm_ptr->hpte_region.destination_address), - be64_to_cpu(fdm_ptr->hpte_region.destination_address) + - be64_to_cpu(fdm_ptr->hpte_region.source_len) - 1, - be64_to_cpu(fdm_ptr->hpte_region.source_len), - be64_to_cpu(fdm_ptr->hpte_region.bytes_dumped)); - seq_printf(m, - "DUMP: [%#016llx-%#016llx] %#llx bytes, " - "Dumped: %#llx\n", - be64_to_cpu(fdm_ptr->rmr_region.destination_address), - be64_to_cpu(fdm_ptr->rmr_region.destination_address) + - be64_to_cpu(fdm_ptr->rmr_region.source_len) - 1, - be64_to_cpu(fdm_ptr->rmr_region.source_len), - be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped)); - - if (!fdm_active || - (fw_dump.reserve_dump_area_start == - be64_to_cpu(fdm_ptr->cpu_state_data.destination_address))) - goto out; - - /* Dump is active. Show reserved memory region. 
*/ - seq_printf(m, - " : [%#016llx-%#016llx] %#llx bytes, " - "Dumped: %#llx\n", - (unsigned long long)fw_dump.reserve_dump_area_start, - be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) - 1, - be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) - - fw_dump.reserve_dump_area_start, - be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) - - fw_dump.reserve_dump_area_start); -out: - if (fdm_active) - mutex_unlock(&fadump_mutex); + fw_dump.ops->fadump_region_show(&fw_dump, m); + mutex_unlock(&fadump_mutex); return 0; } @@ -1690,14 +1485,77 @@ int __init setup_fadump(void) * if dump process fails then invalidate the registration * and release memory before proceeding for re-registration. */ - if (process_fadump(fdm_active) < 0) + if (fw_dump.ops->fadump_process(&fw_dump) < 0) fadump_invalidate_release_mem(); } /* Initialize the kernel dump memory structure for FAD registration. */ else if (fw_dump.reserve_dump_area_size) - init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); + fw_dump.ops->fadump_init_mem_struct(&fw_dump); + fadump_init_files(); return 1; } subsys_initcall(setup_fadump); +#else /* !CONFIG_PRESERVE_FA_DUMP */ + +/* Scan the Firmware Assisted dump configuration details. */ +int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname, + int depth, void *data) +{ + if ((depth != 1) || (strcmp(uname, "ibm,opal") != 0)) + return 0; + + opal_fadump_dt_scan(&fw_dump, node); + return 1; +} + +/* + * When dump is active but PRESERVE_FA_DUMP is enabled on the kernel, + * preserve crash data. The subsequent memory preserving kernel boot + * is likely to process this crash data. + */ +int __init fadump_reserve_mem(void) +{ + if (fw_dump.dump_active) { + /* + * If last boot has crashed then reserve all the memory + * above boot memory to preserve crash data. 
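/*
 * fadump_reserve_crash_area() (shown just below) clips each memory region
 * against the preserve-from address before reserving it. A hedged sketch of
 * that clipping step with a stand-in reserve() callback; the kernel walks
 * memblock regions instead:
 */
typedef unsigned long long u64;

static void clip_and_reserve(u64 mstart, u64 msize, u64 base,
			     void (*reserve)(u64 start, u64 size))
{
	if (mstart + msize < base)	/* region lies wholly below the cut */
		return;
	if (mstart < base) {		/* region straddles the cut-off */
		msize -= base - mstart;
		mstart = base;		/* keep only the tail above base */
	}
	reserve(mstart, msize);
}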
+ */ + pr_info("Preserving crash data for processing in next boot.\n"); + fadump_reserve_crash_area(fw_dump.boot_mem_top); + } else + pr_debug("FADump-aware kernel..\n"); + + return 1; +} +#endif /* CONFIG_PRESERVE_FA_DUMP */ + +/* Preserve everything above the base address */ +static void __init fadump_reserve_crash_area(u64 base) +{ + struct memblock_region *reg; + u64 mstart, msize; + + for_each_memblock(memory, reg) { + mstart = reg->base; + msize = reg->size; + + if ((mstart + msize) < base) + continue; + + if (mstart < base) { + msize -= (base - mstart); + mstart = base; + } + + pr_info("Reserving %lluMB of memory at %#016llx for preserving crash data", + (msize >> 20), mstart); + memblock_reserve(mstart, msize); + } +} + +unsigned long __init arch_reserved_kernel_pages(void) +{ + return memblock_reserved_size() / PAGE_SIZE; +} diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index f255e22184b4..4a24f8f026c7 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -34,7 +34,16 @@ #include "head_32.h" -/* 601 only have IBAT; cr0.eq is set on 601 when using this macro */ +/* 601 only have IBAT */ +#ifdef CONFIG_PPC_BOOK3S_601 +#define LOAD_BAT(n, reg, RA, RB) \ + li RA,0; \ + mtspr SPRN_IBAT##n##U,RA; \ + lwz RA,(n*16)+0(reg); \ + lwz RB,(n*16)+4(reg); \ + mtspr SPRN_IBAT##n##U,RA; \ + mtspr SPRN_IBAT##n##L,RB +#else #define LOAD_BAT(n, reg, RA, RB) \ /* see the comment for clear_bats() -- Cort */ \ li RA,0; \ @@ -44,12 +53,11 @@ lwz RB,(n*16)+4(reg); \ mtspr SPRN_IBAT##n##U,RA; \ mtspr SPRN_IBAT##n##L,RB; \ - beq 1f; \ lwz RA,(n*16)+8(reg); \ lwz RB,(n*16)+12(reg); \ mtspr SPRN_DBAT##n##U,RA; \ - mtspr SPRN_DBAT##n##L,RB; \ -1: + mtspr SPRN_DBAT##n##L,RB +#endif __HEAD .stabs "arch/powerpc/kernel/",N_SO,0,0,0f @@ -557,9 +565,9 @@ DataStoreTLBMiss: cmplw 0,r1,r3 mfspr r2, SPRN_SPRG_PGDIR #ifdef CONFIG_SWAP - li r1, _PAGE_RW | _PAGE_PRESENT | _PAGE_ACCESSED + li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED #else - li r1, _PAGE_RW | _PAGE_PRESENT + li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT #endif bge- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ @@ -820,9 +828,6 @@ load_up_mmu: /* Load the BAT registers with the values set up by MMU_init. MMU_init takes care of whether we're on a 601 or not. */ - mfpvr r3 - srwi r3,r3,16 - cmpwi r3,1 lis r3,BATS@ha addi r3,r3,BATS@l tophys(r3,r3) @@ -897,9 +902,11 @@ start_here: bl machine_init bl __save_cpu_setup bl MMU_init +#ifdef CONFIG_KASAN BEGIN_MMU_FTR_SECTION bl MMU_init_hw_patch END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) +#endif /* * Go back to running unmapped so we can load up new values @@ -996,11 +1003,8 @@ EXPORT_SYMBOL(switch_mmu_context) */ clear_bats: li r10,0 - mfspr r9,SPRN_PVR - rlwinm r9,r9,16,16,31 /* r9 = 1 for 601, 4 for 604 */ - cmpwi r9, 1 - beq 1f +#ifndef CONFIG_PPC_BOOK3S_601 mtspr SPRN_DBAT0U,r10 mtspr SPRN_DBAT0L,r10 mtspr SPRN_DBAT1U,r10 @@ -1009,7 +1013,7 @@ clear_bats: mtspr SPRN_DBAT2L,r10 mtspr SPRN_DBAT3U,r10 mtspr SPRN_DBAT3L,r10 -1: +#endif mtspr SPRN_IBAT0U,r10 mtspr SPRN_IBAT0L,r10 mtspr SPRN_IBAT1U,r10 @@ -1104,10 +1108,7 @@ mmu_off: */ initial_bats: lis r11,PAGE_OFFSET@h - mfspr r9,SPRN_PVR - rlwinm r9,r9,16,16,31 /* r9 = 1 for 601, 4 for 604 */ - cmpwi 0,r9,1 - bne 4f +#ifdef CONFIG_PPC_BOOK3S_601 ori r11,r11,4 /* set up BAT registers for 601 */ li r8,0x7f /* valid, block length = 8MB */ mtspr SPRN_IBAT0U,r11 /* N.B. 
601 has valid bit in */ @@ -1120,10 +1121,8 @@ initial_bats: addis r8,r8,0x800000@h mtspr SPRN_IBAT2U,r11 mtspr SPRN_IBAT2L,r8 - isync - blr - -4: tophys(r8,r11) +#else + tophys(r8,r11) #ifdef CONFIG_SMP ori r8,r8,0x12 /* R/W access, M=1 */ #else @@ -1135,10 +1134,10 @@ initial_bats: mtspr SPRN_DBAT0U,r11 /* bit in upper BAT register */ mtspr SPRN_IBAT0L,r8 mtspr SPRN_IBAT0U,r11 +#endif isync blr - #ifdef CONFIG_BOOTX_TEXT setup_disp_bat: /* @@ -1153,15 +1152,13 @@ setup_disp_bat: beqlr lwz r11,0(r8) lwz r8,4(r8) - mfspr r9,SPRN_PVR - rlwinm r9,r9,16,16,31 /* r9 = 1 for 601, 4 for 604 */ - cmpwi 0,r9,1 - beq 1f +#ifndef CONFIG_PPC_BOOK3S_601 mtspr SPRN_DBAT3L,r8 mtspr SPRN_DBAT3U,r11 - blr -1: mtspr SPRN_IBAT3L,r8 +#else + mtspr SPRN_IBAT3L,r8 mtspr SPRN_IBAT3U,r11 +#endif blr #endif /* CONFIG_BOOTX_TEXT */ diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 4a692553651f..8abc7783dbe5 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -5,19 +5,6 @@ #include <asm/ptrace.h> /* for STACK_FRAME_REGS_MARKER */ /* - * MSR_KERNEL is > 0x8000 on 4xx/Book-E since it include MSR_CE. - */ -.macro __LOAD_MSR_KERNEL r, x -.if \x >= 0x8000 - lis \r, (\x)@h - ori \r, \r, (\x)@l -.else - li \r, (\x) -.endif -.endm -#define LOAD_MSR_KERNEL(r, x) __LOAD_MSR_KERNEL r, x - -/* * Exception entry code. This code runs with address translation * turned off, i.e. using physical addresses. * We assume sprg3 has the physical address of the current @@ -92,7 +79,7 @@ #ifdef CONFIG_40x rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ #else - LOAD_MSR_KERNEL(r10, MSR_KERNEL & ~(MSR_IR|MSR_DR)) /* can take exceptions */ + LOAD_REG_IMMEDIATE(r10, MSR_KERNEL & ~(MSR_IR|MSR_DR)) /* can take exceptions */ MTMSRD(r10) /* (except for mach check in rtas) */ #endif lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ @@ -140,10 +127,10 @@ * otherwise we might risk taking an interrupt before we tell lockdep * they are enabled. */ - LOAD_MSR_KERNEL(r10, MSR_KERNEL) + LOAD_REG_IMMEDIATE(r10, MSR_KERNEL) rlwimi r10, r9, 0, MSR_EE #else - LOAD_MSR_KERNEL(r10, MSR_KERNEL | MSR_EE) + LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE) #endif #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) mtspr SPRN_NRI, r0 @@ -187,7 +174,7 @@ label: #define EXC_XFER_TEMPLATE(hdlr, trap, msr, tfer, ret) \ li r10,trap; \ stw r10,_TRAP(r11); \ - LOAD_MSR_KERNEL(r10, msr); \ + LOAD_REG_IMMEDIATE(r10, msr); \ bl tfer; \ .long hdlr; \ .long ret diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 91d297e696dd..ad79fddb974d 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -182,7 +182,8 @@ __secondary_hold: isync bctr #else - BUG_OPCODE +0: trap + EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0 #endif CLOSE_FIXED_SECTION(first_256B) @@ -635,7 +636,7 @@ __after_prom_start: sub r5,r5,r11 #else /* just copy interrupts */ - LOAD_REG_IMMEDIATE(r5, FIXED_SYMBOL_ABS_ADDR(__end_interrupts)) + LOAD_REG_IMMEDIATE_SYM(r5, r11, FIXED_SYMBOL_ABS_ADDR(__end_interrupts)) #endif b 5f 3: @@ -998,7 +999,8 @@ start_here_common: bl start_kernel /* Not reached */ - BUG_OPCODE + trap + EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0 /* * We put a few things here that have to be page-aligned. 
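/*
 * The head_64.S hunks above replace the bare BUG_OPCODE with an explicit
 * trap plus EMIT_BUG_ENTRY, so the trap site lands in the kernel's bug
 * table and the trap handler can report file and line instead of dying on
 * an unrecognised illegal instruction. Roughly, each table entry records
 * the following (layout simplified from include/asm-generic/bug.h; the
 * real packing varies with config options):
 */
struct bug_entry_sketch {
	unsigned long bug_addr;	/* address of the trap instruction */
	const char *file;	/* __FILE__ captured at assembly time */
	unsigned short line;	/* __LINE__ captured at assembly time */
	unsigned short flags;	/* 0 here: a fatal BUG, not a WARN */
};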
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 5ab9178c2347..19f583e18402 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -15,6 +15,7 @@ */ #include <linux/init.h> +#include <linux/magic.h> #include <asm/processor.h> #include <asm/page.h> #include <asm/mmu.h> @@ -574,8 +575,6 @@ InstructionBreakpoint: * by decoding the registers used by the dcbx instruction and adding them. * DAR is set to the calculated address. */ - /* define if you don't want to use self modifying code */ -#define NO_SELF_MODIFYING_CODE FixupDAR:/* Entry point for dcbx workaround. */ mtspr SPRN_M_TW, r10 /* fetch instruction from memory. */ @@ -639,27 +638,6 @@ FixupDAR:/* Entry point for dcbx workaround. */ rlwinm r10, r10,0,7,5 /* Clear store bit for buggy dcbst insn */ mtspr SPRN_DSISR, r10 142: /* continue, it was a dcbx, dcbi instruction. */ -#ifndef NO_SELF_MODIFYING_CODE - andis. r10,r11,0x1f /* test if reg RA is r0 */ - li r10,modified_instr@l - dcbtst r0,r10 /* touch for store */ - rlwinm r11,r11,0,0,20 /* Zero lower 10 bits */ - oris r11,r11,640 /* Transform instr. to a "add r10,RA,RB" */ - ori r11,r11,532 - stw r11,0(r10) /* store add/and instruction */ - dcbf 0,r10 /* flush new instr. to memory. */ - icbi 0,r10 /* invalidate instr. cache line */ - mfspr r11, SPRN_SPRG_SCRATCH1 /* restore r11 */ - mfspr r10, SPRN_SPRG_SCRATCH0 /* restore r10 */ - isync /* Wait until new instr is loaded from memory */ -modified_instr: - .space 4 /* this is where the add instr. is stored */ - bne+ 143f - subf r10,r0,r10 /* r10=r10-r0, only if reg RA is r0 */ -143: mtdar r10 /* store faulting EA in DAR */ - mfspr r10,SPRN_M_TW - b DARFixed /* Go back to normal TLB handling */ -#else mfctr r10 mtdar r10 /* save ctr reg in DAR */ rlwinm r10, r11, 24, 24, 28 /* offset into jump table for reg RB */ @@ -723,7 +701,6 @@ modified_instr: add r10, r10, r11 /* add it */ mfctr r11 /* restore r11 */ b 151b -#endif /* * This is where the main kernel code starts. @@ -741,6 +718,9 @@ start_here: /* stack */ lis r1,init_thread_union@ha addi r1,r1,init_thread_union@l + lis r0, STACK_END_MAGIC@h + ori r0, r0, STACK_END_MAGIC@l + stw r0, 0(r1) li r0,0 stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index c8d1fa2e9d53..1007ec36b4cb 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -195,18 +195,63 @@ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs) tsk->thread.last_hit_ubp = NULL; } +static bool is_larx_stcx_instr(struct pt_regs *regs, unsigned int instr) +{ + int ret, type; + struct instruction_op op; + + ret = analyse_instr(&op, regs, instr); + type = GETTYPE(op.type); + return (!ret && (type == LARX || type == STCX)); +} + /* * Handle debug exception notifications. */ +static bool stepping_handler(struct pt_regs *regs, struct perf_event *bp, + unsigned long addr) +{ + unsigned int instr = 0; + + if (__get_user_inatomic(instr, (unsigned int *)regs->nip)) + goto fail; + + if (is_larx_stcx_instr(regs, instr)) { + printk_ratelimited("Breakpoint hit on instruction that can't be emulated." 
+ " Breakpoint at 0x%lx will be disabled.\n", addr); + goto disable; + } + + /* Do not emulate user-space instructions, instead single-step them */ + if (user_mode(regs)) { + current->thread.last_hit_ubp = bp; + regs->msr |= MSR_SE; + return false; + } + + if (!emulate_step(regs, instr)) + goto fail; + + return true; + +fail: + /* + * We've failed in reliably handling the hw-breakpoint. Unregister + * it and throw a warning message to let the user know about it. + */ + WARN(1, "Unable to handle hardware breakpoint. Breakpoint at " + "0x%lx will be disabled.", addr); + +disable: + perf_event_disable_inatomic(bp); + return false; +} + int hw_breakpoint_handler(struct die_args *args) { int rc = NOTIFY_STOP; struct perf_event *bp; struct pt_regs *regs = args->regs; -#ifndef CONFIG_PPC_8xx - int stepped = 1; - unsigned int instr; -#endif struct arch_hw_breakpoint *info; unsigned long dar = regs->dar; @@ -251,32 +296,10 @@ int hw_breakpoint_handler(struct die_args *args) (dar - bp->attr.bp_addr < bp->attr.bp_len))) info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; -#ifndef CONFIG_PPC_8xx - /* Do not emulate user-space instructions, instead single-step them */ - if (user_mode(regs)) { - current->thread.last_hit_ubp = bp; - regs->msr |= MSR_SE; + if (!IS_ENABLED(CONFIG_PPC_8xx) && !stepping_handler(regs, bp, info->address)) goto out; - } - - stepped = 0; - instr = 0; - if (!__get_user_inatomic(instr, (unsigned int *) regs->nip)) - stepped = emulate_step(regs, instr); /* - * emulate_step() could not execute it. We've failed in reliably - * handling the hw-breakpoint. Unregister it and throw a warning - * message to let the user know about it. - */ - if (!stepped) { - WARN(1, "Unable to handle hardware breakpoint. Breakpoint at " - "0x%lx will be disabled.", info->address); - perf_event_disable_inatomic(bp); - goto out; - } -#endif - /* * As a policy, the callback is invoked in a 'trigger-after-execute' * fashion */ diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index fbd2d0007c52..0276bc8c8969 100644 --- a/arch/powerpc/kernel/io-workarounds.c +++ b/arch/powerpc/kernel/io-workarounds.c @@ -149,8 +149,8 @@ static const struct ppc_pci_io iowa_pci_io = { }; #ifdef CONFIG_PPC_INDIRECT_MMIO -static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size, - pgprot_t prot, void *caller) +void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size, + pgprot_t prot, void *caller) { struct iowa_bus *bus; void __iomem *res = __ioremap_caller(addr, size, prot, caller); @@ -163,20 +163,17 @@ static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size, } return res; } -#else /* CONFIG_PPC_INDIRECT_MMIO */ -#define iowa_ioremap NULL #endif /* !CONFIG_PPC_INDIRECT_MMIO */ +bool io_workaround_inited; + /* Enable IO workaround */ static void io_workaround_init(void) { - static int io_workaround_inited; - if (io_workaround_inited) return; ppc_pci_io = iowa_pci_io; - ppc_md.ioremap = iowa_ioremap; - io_workaround_inited = 1; + io_workaround_inited = true; } /* Register new bus to support workaround */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 0a67ce9f827e..9704f3f76e63 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -633,11 +633,54 @@ static void iommu_table_clear(struct iommu_table *tbl) #endif } +static void iommu_table_reserve_pages(struct iommu_table *tbl, + unsigned long res_start, unsigned long res_end) +{ + int i; + + WARN_ON_ONCE(res_end < res_start); + /* + * Reserve page 0 so it will not 
be used for any mappings. + * This avoids buggy drivers that consider page 0 to be invalid + * to crash the machine or even lose data. + */ + if (tbl->it_offset == 0) + set_bit(0, tbl->it_map); + + tbl->it_reserved_start = res_start; + tbl->it_reserved_end = res_end; + + /* Check if res_start..res_end isn't empty and overlaps the table */ + if (res_start && res_end && + (tbl->it_offset + tbl->it_size < res_start || + res_end < tbl->it_offset)) + return; + + for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) + set_bit(i - tbl->it_offset, tbl->it_map); +} + +static void iommu_table_release_pages(struct iommu_table *tbl) +{ + int i; + + /* + * In case we have reserved the first bit, we should not emit + * the warning below. + */ + if (tbl->it_offset == 0) + clear_bit(0, tbl->it_map); + + for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) + clear_bit(i - tbl->it_offset, tbl->it_map); +} + /* * Build a iommu_table structure. This contains a bit map which * is used to manage allocation of the tce space. */ -struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) +struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, + unsigned long res_start, unsigned long res_end) { unsigned long sz; static int welcomed = 0; @@ -656,13 +699,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) tbl->it_map = page_address(page); memset(tbl->it_map, 0, sz); - /* - * Reserve page 0 so it will not be used for any mappings. - * This avoids buggy drivers that consider page 0 to be invalid - * to crash the machine or even lose data. - */ - if (tbl->it_offset == 0) - set_bit(0, tbl->it_map); + iommu_table_reserve_pages(tbl, res_start, res_end); /* We only split the IOMMU table if we have 1GB or more of space */ if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024)) @@ -714,12 +751,7 @@ static void iommu_table_free(struct kref *kref) return; } - /* - * In case we have reserved the first bit, we should not emit - * the warning below. 
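/*
 * A condensed sketch of the bitmap bookkeeping iommu_table_reserve_pages()
 * does above: every TCE entry in [res_start, res_end) is marked allocated,
 * indexed relative to the table's starting offset. set_bit() is the kernel
 * primitive; the wrapper and parameter types are pared down for clarity.
 */
static void reserve_range_in_map(unsigned long *map, unsigned long it_offset,
				 unsigned long res_start,
				 unsigned long res_end)
{
	unsigned long i;

	for (i = res_start; i < res_end; i++)
		set_bit(i - it_offset, map);	/* entry i is now off-limits */
}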
- */ - if (tbl->it_offset == 0) - clear_bit(0, tbl->it_map); + iommu_table_release_pages(tbl); /* verify that table contains no entries */ if (!bitmap_empty(tbl->it_map, tbl->it_size)) @@ -981,29 +1013,32 @@ int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa) } EXPORT_SYMBOL_GPL(iommu_tce_check_gpa); -long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl, +extern long iommu_tce_xchg_no_kill(struct mm_struct *mm, + struct iommu_table *tbl, unsigned long entry, unsigned long *hpa, enum dma_data_direction *direction) { long ret; unsigned long size = 0; - ret = tbl->it_ops->exchange(tbl, entry, hpa, direction); - + ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, false); if (!ret && ((*direction == DMA_FROM_DEVICE) || (*direction == DMA_BIDIRECTIONAL)) && !mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift, &size)) SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT)); - /* if (unlikely(ret)) - pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n", - __func__, hwaddr, entry << tbl->it_page_shift, - hwaddr, ret); */ - return ret; } -EXPORT_SYMBOL_GPL(iommu_tce_xchg); +EXPORT_SYMBOL_GPL(iommu_tce_xchg_no_kill); + +void iommu_tce_kill(struct iommu_table *tbl, + unsigned long entry, unsigned long pages) +{ + if (tbl->it_ops->tce_kill) + tbl->it_ops->tce_kill(tbl, entry, pages, false); +} +EXPORT_SYMBOL_GPL(iommu_tce_kill); int iommu_take_ownership(struct iommu_table *tbl) { @@ -1017,22 +1052,21 @@ int iommu_take_ownership(struct iommu_table *tbl) * requires exchange() callback defined so if it is not * implemented, we disallow taking ownership over the table. */ - if (!tbl->it_ops->exchange) + if (!tbl->it_ops->xchg_no_kill) return -EINVAL; spin_lock_irqsave(&tbl->large_pool.lock, flags); for (i = 0; i < tbl->nr_pools; i++) spin_lock(&tbl->pools[i].lock); - if (tbl->it_offset == 0) - clear_bit(0, tbl->it_map); + iommu_table_release_pages(tbl); if (!bitmap_empty(tbl->it_map, tbl->it_size)) { pr_err("iommu_tce: it_map is not empty"); ret = -EBUSY; - /* Restore bit#0 set by iommu_init_table() */ - if (tbl->it_offset == 0) - set_bit(0, tbl->it_map); + /* Undo iommu_table_release_pages, i.e. 
restore bit#0, etc */ + iommu_table_reserve_pages(tbl, tbl->it_reserved_start, + tbl->it_reserved_end); } else { memset(tbl->it_map, 0xff, sz); } @@ -1055,9 +1089,8 @@ void iommu_release_ownership(struct iommu_table *tbl) memset(tbl->it_map, 0, sz); - /* Restore bit#0 set by iommu_init_table() */ - if (tbl->it_offset == 0) - set_bit(0, tbl->it_map); + iommu_table_reserve_pages(tbl, tbl->it_reserved_start, + tbl->it_reserved_end); for (i = 0; i < tbl->nr_pools; i++) spin_unlock(&tbl->pools[i].lock); diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index b7b3a5e4e224..617eba82531c 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -64,16 +64,17 @@ #define KVM_INST_MTSRIN 0x7c0001e4 static bool kvm_patching_worked = true; -char kvm_tmp[1024 * 1024]; +extern char kvm_tmp[]; +extern char kvm_tmp_end[]; static int kvm_tmp_index; -static inline void kvm_patch_ins(u32 *inst, u32 new_inst) +static void __init kvm_patch_ins(u32 *inst, u32 new_inst) { *inst = new_inst; flush_icache_range((ulong)inst, (ulong)inst + 4); } -static void kvm_patch_ins_ll(u32 *inst, long addr, u32 rt) +static void __init kvm_patch_ins_ll(u32 *inst, long addr, u32 rt) { #ifdef CONFIG_64BIT kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc)); @@ -82,7 +83,7 @@ static void kvm_patch_ins_ll(u32 *inst, long addr, u32 rt) #endif } -static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt) +static void __init kvm_patch_ins_ld(u32 *inst, long addr, u32 rt) { #ifdef CONFIG_64BIT kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc)); @@ -91,12 +92,12 @@ static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt) #endif } -static void kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt) +static void __init kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt) { kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000ffff)); } -static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt) +static void __init kvm_patch_ins_std(u32 *inst, long addr, u32 rt) { #ifdef CONFIG_64BIT kvm_patch_ins(inst, KVM_INST_STD | rt | (addr & 0x0000fffc)); @@ -105,17 +106,17 @@ static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt) #endif } -static void kvm_patch_ins_stw(u32 *inst, long addr, u32 rt) +static void __init kvm_patch_ins_stw(u32 *inst, long addr, u32 rt) { kvm_patch_ins(inst, KVM_INST_STW | rt | (addr & 0x0000fffc)); } -static void kvm_patch_ins_nop(u32 *inst) +static void __init kvm_patch_ins_nop(u32 *inst) { kvm_patch_ins(inst, KVM_INST_NOP); } -static void kvm_patch_ins_b(u32 *inst, int addr) +static void __init kvm_patch_ins_b(u32 *inst, int addr) { #if defined(CONFIG_RELOCATABLE) && defined(CONFIG_PPC_BOOK3S) /* On relocatable kernels interrupts handlers and our code @@ -128,11 +129,11 @@ static void kvm_patch_ins_b(u32 *inst, int addr) kvm_patch_ins(inst, KVM_INST_B | (addr & KVM_INST_B_MASK)); } -static u32 *kvm_alloc(int len) +static u32 * __init kvm_alloc(int len) { u32 *p; - if ((kvm_tmp_index + len) > ARRAY_SIZE(kvm_tmp)) { + if ((kvm_tmp_index + len) > (kvm_tmp_end - kvm_tmp)) { printk(KERN_ERR "KVM: No more space (%d + %d)\n", kvm_tmp_index, len); kvm_patching_worked = false; @@ -151,7 +152,7 @@ extern u32 kvm_emulate_mtmsrd_orig_ins_offs; extern u32 kvm_emulate_mtmsrd_len; extern u32 kvm_emulate_mtmsrd[]; -static void kvm_patch_ins_mtmsrd(u32 *inst, u32 rt) +static void __init kvm_patch_ins_mtmsrd(u32 *inst, u32 rt) { u32 *p; int distance_start; @@ -204,7 +205,7 @@ extern u32 kvm_emulate_mtmsr_orig_ins_offs; extern u32 kvm_emulate_mtmsr_len; extern u32 kvm_emulate_mtmsr[]; -static 
void kvm_patch_ins_mtmsr(u32 *inst, u32 rt) +static void __init kvm_patch_ins_mtmsr(u32 *inst, u32 rt) { u32 *p; int distance_start; @@ -265,7 +266,7 @@ extern u32 kvm_emulate_wrtee_orig_ins_offs; extern u32 kvm_emulate_wrtee_len; extern u32 kvm_emulate_wrtee[]; -static void kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one) +static void __init kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one) { u32 *p; int distance_start; @@ -322,7 +323,7 @@ extern u32 kvm_emulate_wrteei_0_branch_offs; extern u32 kvm_emulate_wrteei_0_len; extern u32 kvm_emulate_wrteei_0[]; -static void kvm_patch_ins_wrteei_0(u32 *inst) +static void __init kvm_patch_ins_wrteei_0(u32 *inst) { u32 *p; int distance_start; @@ -363,7 +364,7 @@ extern u32 kvm_emulate_mtsrin_orig_ins_offs; extern u32 kvm_emulate_mtsrin_len; extern u32 kvm_emulate_mtsrin[]; -static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb) +static void __init kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb) { u32 *p; int distance_start; @@ -399,7 +400,7 @@ static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb) #endif -static void kvm_map_magic_page(void *data) +static void __init kvm_map_magic_page(void *data) { u32 *features = data; @@ -414,7 +415,7 @@ static void kvm_map_magic_page(void *data) *features = out[0]; } -static void kvm_check_ins(u32 *inst, u32 features) +static void __init kvm_check_ins(u32 *inst, u32 features) { u32 _inst = *inst; u32 inst_no_rt = _inst & ~KVM_MASK_RT; @@ -658,7 +659,7 @@ static void kvm_check_ins(u32 *inst, u32 features) extern u32 kvm_template_start[]; extern u32 kvm_template_end[]; -static void kvm_use_magic_page(void) +static void __init kvm_use_magic_page(void) { u32 *p; u32 *start, *end; @@ -699,25 +700,13 @@ static void kvm_use_magic_page(void) kvm_patching_worked ? "worked" : "failed"); } -static __init void kvm_free_tmp(void) -{ - /* - * Inform kmemleak about the hole in the .bss section since the - * corresponding pages will be unmapped with DEBUG_PAGEALLOC=y. 
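/*
 * With the scratch buffer now carved out between the kvm_tmp and
 * kvm_tmp_end labels in kvm_emul.S (added further below), kvm_alloc()
 * above reduces to a bump allocator over a linker-delimited region. A
 * minimal sketch of that pattern; symbol names mirror the patch, error
 * reporting is elided:
 */
extern char kvm_tmp[];		/* start label of the scratch area */
extern char kvm_tmp_end[];	/* label emitted right after it */
static int kvm_tmp_index;

static unsigned int *alloc_from_kvm_tmp(int len)
{
	unsigned int *p;

	if (kvm_tmp_index + len > kvm_tmp_end - kvm_tmp)
		return 0;	/* region exhausted; patching must bail out */

	p = (unsigned int *)&kvm_tmp[kvm_tmp_index];
	kvm_tmp_index += len;
	return p;
}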
- */ - kmemleak_free_part(&kvm_tmp[kvm_tmp_index], - ARRAY_SIZE(kvm_tmp) - kvm_tmp_index); - free_reserved_area(&kvm_tmp[kvm_tmp_index], - &kvm_tmp[ARRAY_SIZE(kvm_tmp)], -1, NULL); -} - static int __init kvm_guest_init(void) { if (!kvm_para_available()) - goto free_tmp; + return 0; if (!epapr_paravirt_enabled) - goto free_tmp; + return 0; if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE)) kvm_use_magic_page(); @@ -727,9 +716,6 @@ static int __init kvm_guest_init(void) powersave_nap = 1; #endif -free_tmp: - kvm_free_tmp(); - return 0; } diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S index eb2568f583ae..7af6f8b50c5d 100644 --- a/arch/powerpc/kernel/kvm_emul.S +++ b/arch/powerpc/kernel/kvm_emul.S @@ -192,6 +192,8 @@ kvm_emulate_mtmsr_orig_ins_offs: kvm_emulate_mtmsr_len: .long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4 +#ifdef CONFIG_BOOKE + /* also used for wrteei 1 */ .global kvm_emulate_wrtee kvm_emulate_wrtee: @@ -285,6 +287,10 @@ kvm_emulate_wrteei_0_branch_offs: kvm_emulate_wrteei_0_len: .long (kvm_emulate_wrteei_0_end - kvm_emulate_wrteei_0) / 4 +#endif /* CONFIG_BOOKE */ + +#ifdef CONFIG_PPC_BOOK3S_32 + .global kvm_emulate_mtsrin kvm_emulate_mtsrin: @@ -334,5 +340,15 @@ kvm_emulate_mtsrin_orig_ins_offs: kvm_emulate_mtsrin_len: .long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4 +#endif /* CONFIG_PPC_BOOK3S_32 */ + + .balign 4 + .global kvm_tmp +kvm_tmp: + .space (64 * 1024) + +.global kvm_tmp_end +kvm_tmp_end: + .global kvm_template_end kvm_template_end: diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 18481b0e2788..04a7cba58eff 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -29,6 +29,8 @@ #include <asm/smp.h> #include <asm/hw_breakpoint.h> #include <asm/asm-prototypes.h> +#include <asm/svm.h> +#include <asm/ultravisor.h> int default_machine_kexec_prepare(struct kimage *image) { @@ -327,6 +329,13 @@ void default_machine_kexec(struct kimage *image) #ifdef CONFIG_PPC_PSERIES kexec_paca.lppaca_ptr = NULL; #endif + + if (is_secure_guest() && !(image->preserve_context || + image->type == KEXEC_TYPE_CRASH)) { + uv_unshare_all_pages(); + printk("kexec: Unshared all shared pages.\n"); + } + paca_ptrs[kexec_paca.paca_index] = &kexec_paca; setup_paca(&kexec_paca); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index b18df633eae9..34c1001e9e8b 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -33,13 +33,18 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_ue_event_queue); static void machine_check_process_queued_event(struct irq_work *work); -void machine_check_ue_event(struct machine_check_event *evt); +static void machine_check_ue_irq_work(struct irq_work *work); +static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); static struct irq_work mce_event_process_work = { .func = machine_check_process_queued_event, }; +static struct irq_work mce_ue_event_irq_work = { + .func = machine_check_ue_irq_work, +}; + DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static void mce_set_error_info(struct machine_check_event *mce, @@ -144,6 +149,7 @@ void save_mce_event(struct pt_regs *regs, long handled, if (phys_addr != ULONG_MAX) { mce->u.ue_error.physical_address_provided = true; mce->u.ue_error.physical_address = phys_addr; + mce->u.ue_error.ignore_event = mce_err->ignore_event; machine_check_ue_event(mce); } } @@ 
-199,11 +205,15 @@ void release_mce_event(void) get_mce_event(NULL, true); } +static void machine_check_ue_irq_work(struct irq_work *work) +{ + schedule_work(&mce_ue_event_work); +} /* * Queue up the MCE event which then can be handled later. */ -void machine_check_ue_event(struct machine_check_event *evt) +static void machine_check_ue_event(struct machine_check_event *evt) { int index; @@ -216,7 +226,7 @@ void machine_check_ue_event(struct machine_check_event *evt) memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt)); /* Queue work to process this event later. */ - schedule_work(&mce_ue_event_work); + irq_work_queue(&mce_ue_event_irq_work); } /* @@ -257,8 +267,17 @@ static void machine_process_ue_event(struct work_struct *work) /* * This should probably queued elsewhere, but * oh! well + * + * Don't report this machine check because the caller has a + * asked us to ignore the event, it has a fixup handler which + * will do the appropriate error handling and reporting. */ if (evt->error_type == MCE_ERROR_TYPE_UE) { + if (evt->u.ue_error.ignore_event) { + __this_cpu_dec(mce_ue_count); + continue; + } + if (evt->u.ue_error.physical_address_provided) { unsigned long pfn; @@ -292,6 +311,12 @@ static void machine_check_process_queued_event(struct irq_work *work) while (__this_cpu_read(mce_queue_count) > 0) { index = __this_cpu_read(mce_queue_count) - 1; evt = this_cpu_ptr(&mce_event_queue[index]); + + if (evt->error_type == MCE_ERROR_TYPE_UE && + evt->u.ue_error.ignore_event) { + __this_cpu_dec(mce_queue_count); + continue; + } machine_check_print_event_info(evt, false, false); __this_cpu_dec(mce_queue_count); } @@ -300,7 +325,7 @@ static void machine_check_process_queued_event(struct irq_work *work) void machine_check_print_event_info(struct machine_check_event *evt, bool user_mode, bool in_guest) { - const char *level, *sevstr, *subtype, *err_type; + const char *level, *sevstr, *subtype, *err_type, *initiator; uint64_t ea = 0, pa = 0; int n = 0; char dar_str[50]; @@ -385,6 +410,28 @@ void machine_check_print_event_info(struct machine_check_event *evt, break; } + switch(evt->initiator) { + case MCE_INITIATOR_CPU: + initiator = "CPU"; + break; + case MCE_INITIATOR_PCI: + initiator = "PCI"; + break; + case MCE_INITIATOR_ISA: + initiator = "ISA"; + break; + case MCE_INITIATOR_MEMORY: + initiator = "Memory"; + break; + case MCE_INITIATOR_POWERMGM: + initiator = "Power Management"; + break; + case MCE_INITIATOR_UNKNOWN: + default: + initiator = "Unknown"; + break; + } + switch (evt->error_type) { case MCE_ERROR_TYPE_UE: err_type = "UE"; @@ -451,6 +498,14 @@ void machine_check_print_event_info(struct machine_check_event *evt, if (evt->u.link_error.effective_address_provided) ea = evt->u.link_error.effective_address; break; + case MCE_ERROR_TYPE_DCACHE: + err_type = "D-Cache"; + subtype = "Unknown"; + break; + case MCE_ERROR_TYPE_ICACHE: + err_type = "I-Cache"; + subtype = "Unknown"; + break; default: case MCE_ERROR_TYPE_UNKNOWN: err_type = "Unknown"; @@ -483,9 +538,17 @@ void machine_check_print_event_info(struct machine_check_event *evt, level, evt->cpu, evt->srr0, (void *)evt->srr0, pa_str); } + printk("%sMCE: CPU%d: Initiator %s\n", level, evt->cpu, initiator); + subtype = evt->error_class < ARRAY_SIZE(mc_error_class) ? mc_error_class[evt->error_class] : "Unknown"; printk("%sMCE: CPU%d: %s\n", level, evt->cpu, subtype); + +#ifdef CONFIG_PPC_BOOK3S_64 + /* Display faulty slb contents for SLB errors. 
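/*
 * Both queue-draining paths above skip UE events whose ignore_event flag
 * is set; the flag is set at save time when an exception-table fixup has
 * already handled the faulting access (see mce_handle_ue_error() further
 * down). A condensed sketch of the filter; the struct is pared down to the
 * two fields actually tested:
 */
struct mce_evt_sketch {
	int error_type;		/* e.g. MCE_ERROR_TYPE_UE */
	int ignore_event;	/* fixup handler owns the reporting */
};

static int mce_should_report(const struct mce_evt_sketch *evt, int ue_type)
{
	return !(evt->error_type == ue_type && evt->ignore_event);
}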
*/ + if (evt->error_type == MCE_ERROR_TYPE_SLB) + slb_dump_contents(local_paca->mce_faulty_slbs); +#endif } EXPORT_SYMBOL_GPL(machine_check_print_event_info); diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index a814d2dfb5b0..1cbf7f1a4e3d 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -11,6 +11,7 @@ #include <linux/types.h> #include <linux/ptrace.h> +#include <linux/extable.h> #include <asm/mmu.h> #include <asm/mce.h> #include <asm/machdep.h> @@ -18,6 +19,7 @@ #include <asm/pte-walk.h> #include <asm/sstep.h> #include <asm/exception-64s.h> +#include <asm/extable.h> /* * Convert an address related to an mm to a PFN. NOTE: we are in real @@ -26,7 +28,8 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) { pte_t *ptep; - unsigned long flags; + unsigned int shift; + unsigned long pfn, flags; struct mm_struct *mm; if (user_mode(regs)) @@ -35,14 +38,23 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) mm = &init_mm; local_irq_save(flags); - if (mm == current->mm) - ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL); - else - ptep = find_init_mm_pte(addr, NULL); + ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift); + + if (!ptep || pte_special(*ptep)) { + pfn = ULONG_MAX; + goto out; + } + + if (shift <= PAGE_SHIFT) + pfn = pte_pfn(*ptep); + else { + unsigned long rpnmask = (1ul << shift) - PAGE_SIZE; + pfn = pte_pfn(__pte(pte_val(*ptep) | (addr & rpnmask))); + } + +out: local_irq_restore(flags); - if (!ptep || pte_special(*ptep)) - return ULONG_MAX; - return pte_pfn(*ptep); + return pfn; } /* flush SLBs and reload */ @@ -344,7 +356,7 @@ static const struct mce_derror_table mce_p9_derror_table[] = { MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0, false, 0, 0, 0, 0, 0 } }; -static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr, +static int mce_find_instr_ea_and_phys(struct pt_regs *regs, uint64_t *addr, uint64_t *phys_addr) { /* @@ -397,6 +409,8 @@ static int mce_handle_ierror(struct pt_regs *regs, /* attempt to correct the error */ switch (table[i].error_type) { case MCE_ERROR_TYPE_SLB: + if (local_paca->in_mce == 1) + slb_save_contents(local_paca->mce_faulty_slbs); handled = mce_flush(MCE_FLUSH_SLB); break; case MCE_ERROR_TYPE_ERAT: @@ -482,6 +496,8 @@ static int mce_handle_derror(struct pt_regs *regs, /* attempt to correct the error */ switch (table[i].error_type) { case MCE_ERROR_TYPE_SLB: + if (local_paca->in_mce == 1) + slb_save_contents(local_paca->mce_faulty_slbs); if (mce_flush(MCE_FLUSH_SLB)) handled = 1; break; @@ -541,7 +557,8 @@ static int mce_handle_derror(struct pt_regs *regs, * kernel/exception-64s.h */ if (get_paca()->in_mce < MAX_MCE_DEPTH) - mce_find_instr_ea_and_pfn(regs, addr, phys_addr); + mce_find_instr_ea_and_phys(regs, addr, + phys_addr); } found = 1; } @@ -558,9 +575,18 @@ static int mce_handle_derror(struct pt_regs *regs, return 0; } -static long mce_handle_ue_error(struct pt_regs *regs) +static long mce_handle_ue_error(struct pt_regs *regs, + struct mce_error_info *mce_err) { long handled = 0; + const struct exception_table_entry *entry; + + entry = search_kernel_exception_table(regs->nip); + if (entry) { + mce_err->ignore_event = true; + regs->nip = extable_fixup(entry); + return 1; + } /* * On specific SCOM read via MMIO we may get a machine check @@ -593,7 +619,7 @@ static long mce_handle_error(struct pt_regs *regs, &phys_addr); if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE) - handled = mce_handle_ue_error(regs); + handled = 
mce_handle_ue_error(regs, &mce_err); save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr); diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index fe4bd321730e..82df4b09e79f 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -292,22 +292,20 @@ _GLOBAL(flush_instruction_cache) iccci 0,r3 #endif #elif defined(CONFIG_FSL_BOOKE) -BEGIN_FTR_SECTION +#ifdef CONFIG_E200 mfspr r3,SPRN_L1CSR0 ori r3,r3,L1CSR0_CFI|L1CSR0_CLFC /* msync; isync recommended here */ mtspr SPRN_L1CSR0,r3 isync blr -END_FTR_SECTION_IFSET(CPU_FTR_UNIFIED_ID_CACHE) +#endif mfspr r3,SPRN_L1CSR1 ori r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR mtspr SPRN_L1CSR1,r3 +#elif defined(CONFIG_PPC_BOOK3S_601) + blr /* for 601, do nothing */ #else - mfspr r3,SPRN_PVR - rlwinm r3,r3,16,16,31 - cmpwi 0,r3,1 - beqlr /* for 601, do nothing */ /* 603/604 processor - use invalidate-all bit in HID0 */ mfspr r3,SPRN_HID0 ori r3,r3,HID0_ICFI @@ -326,10 +324,10 @@ EXPORT_SYMBOL(flush_instruction_cache) * flush_icache_range(unsigned long start, unsigned long stop) */ _GLOBAL(flush_icache_range) -BEGIN_FTR_SECTION +#if defined(CONFIG_PPC_BOOK3S_601) || defined(CONFIG_E200) PURGE_PREFETCHED_INS - blr /* for 601, do nothing */ -END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) + blr /* for 601 and e200, do nothing */ +#else rlwinm r3,r3,0,0,31 - L1_CACHE_SHIFT subf r4,r3,r4 addi r4,r4,L1_CACHE_BYTES - 1 @@ -355,6 +353,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) sync /* additional sync needed on g4 */ isync blr +#endif _ASM_NOKPROBE_SYMBOL(flush_icache_range) EXPORT_SYMBOL(flush_icache_range) @@ -362,15 +361,15 @@ EXPORT_SYMBOL(flush_icache_range) * Flush a particular page from the data cache to RAM. * Note: this is necessary because the instruction cache does *not* * snoop from the data cache. - * This is a no-op on the 601 which has a unified cache. + * This is a no-op on the 601 and e200 which have a unified cache. 
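/*
 * A hedged C rendering of the split-cache sequence the assembly above
 * implements for non-601/e200 parts: write each dirty data-cache line back
 * (dcbst), then invalidate the matching instruction-cache line (icbi),
 * with barriers between the phases. The line-size constant and the exact
 * barrier placement are simplified relative to misc_32.S.
 */
#define L1_LINE 32UL	/* illustrative; the kernel uses L1_CACHE_BYTES */

static void flush_icache_range_sketch(unsigned long start, unsigned long stop)
{
	unsigned long addr;

	start &= ~(L1_LINE - 1);	/* round down to a line boundary */
	for (addr = start; addr < stop; addr += L1_LINE)
		asm volatile("dcbst 0,%0" : : "r"(addr) : "memory");
	asm volatile("sync");		/* order write-backs before icbi */
	for (addr = start; addr < stop; addr += L1_LINE)
		asm volatile("icbi 0,%0" : : "r"(addr) : "memory");
	asm volatile("sync; isync");	/* make sure fetch sees new code */
}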
* * void __flush_dcache_icache(void *page) */ _GLOBAL(__flush_dcache_icache) -BEGIN_FTR_SECTION +#if defined(CONFIG_PPC_BOOK3S_601) || defined(CONFIG_E200) PURGE_PREFETCHED_INS blr -END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) +#else rlwinm r3,r3,0,0,31-PAGE_SHIFT /* Get page base address */ li r4,PAGE_SIZE/L1_CACHE_BYTES /* Number of lines in a page */ mtctr r4 @@ -398,6 +397,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x) sync isync blr +#endif #ifndef CONFIG_BOOKE /* @@ -409,10 +409,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x) * void __flush_dcache_icache_phys(unsigned long physaddr) */ _GLOBAL(__flush_dcache_icache_phys) -BEGIN_FTR_SECTION +#if defined(CONFIG_PPC_BOOK3S_601) || defined(CONFIG_E200) PURGE_PREFETCHED_INS - blr /* for 601, do nothing */ -END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) + blr /* for 601 and e200, do nothing */ +#else mfmsr r10 rlwinm r0,r10,0,28,26 /* clear DR */ mtmsr r0 @@ -433,6 +433,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) mtmsr r10 /* restore DR */ isync blr +#endif #endif /* CONFIG_BOOKE */ /* @@ -452,7 +453,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) stwu r9,16(r3) _GLOBAL(copy_page) + rlwinm r5, r3, 0, L1_CACHE_BYTES - 1 addi r3,r3,-4 + +0: twnei r5, 0 /* WARN if r3 is not cache aligned */ + EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING + addi r4,r4,-4 li r5,4 diff --git a/arch/powerpc/kernel/note.S b/arch/powerpc/kernel/note.S new file mode 100644 index 000000000000..bcdad15395dd --- /dev/null +++ b/arch/powerpc/kernel/note.S @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * PowerPC ELF notes. + * + * Copyright 2019, IBM Corporation + */ + +#include <linux/elfnote.h> +#include <asm/elfnote.h> + +/* + * Ultravisor-capable bit (PowerNV only). + * + * Bit 0 indicates that the powerpc kernel binary knows how to run in an + * ultravisor-enabled system. + * + * In an ultravisor-enabled system, some machine resources are now controlled + * by the ultravisor. If the kernel is not ultravisor-capable, but it ends up + * being run on a machine with ultravisor, the kernel will probably crash + * trying to access ultravisor resources. For instance, it may crash in early + * boot trying to set the partition table entry 0. + * + * In an ultravisor-enabled system, a bootloader could warn the user or prevent + * the kernel from being run if the PowerPC ultravisor capability doesn't exist + * or the Ultravisor-capable bit is not set. + */ +#ifdef CONFIG_PPC_POWERNV +#define PPCCAP_ULTRAVISOR_BIT (1 << 0) +#else +#define PPCCAP_ULTRAVISOR_BIT 0 +#endif + +/* + * Add the PowerPC Capabilities in the binary ELF note. It is a bitmap that + * can be used to advertise kernel capabilities to userland. 
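/*
 * The ELFNOTE() emitted just below produces a standard ELF note a
 * bootloader can locate without running the kernel: the note name is
 * "PowerPC" (namesz 8, including the NUL), its type is
 * PPC_ELFNOTE_CAPABILITIES, and its 4-byte descriptor holds the bitmap. A
 * hedged consumer-side sketch, assuming the descriptor has already been
 * located and read from the note section:
 */
#define PPCCAP_ULTRAVISOR_BIT_SKETCH (1u << 0)	/* mirrors bit 0 below */

static int kernel_is_ultravisor_capable(unsigned int desc)
{
	return (desc & PPCCAP_ULTRAVISOR_BIT_SKETCH) != 0;
}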
+ */ +#define PPC_CAPABILITIES_BITMAP (PPCCAP_ULTRAVISOR_BIT) + +ELFNOTE(PowerPC, PPC_ELFNOTE_CAPABILITIES, + .long PPC_CAPABILITIES_BITMAP) diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index e3ad8aa4730d..949eceb254d8 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -14,6 +14,8 @@ #include <asm/sections.h> #include <asm/pgtable.h> #include <asm/kexec.h> +#include <asm/svm.h> +#include <asm/ultravisor.h> #include "setup.h" @@ -52,6 +54,43 @@ static void *__init alloc_paca_data(unsigned long size, unsigned long align, #ifdef CONFIG_PPC_PSERIES +#define LPPACA_SIZE 0x400 + +static void *__init alloc_shared_lppaca(unsigned long size, unsigned long align, + unsigned long limit, int cpu) +{ + size_t shared_lppaca_total_size = PAGE_ALIGN(nr_cpu_ids * LPPACA_SIZE); + static unsigned long shared_lppaca_size; + static void *shared_lppaca; + void *ptr; + + if (!shared_lppaca) { + memblock_set_bottom_up(true); + + shared_lppaca = + memblock_alloc_try_nid(shared_lppaca_total_size, + PAGE_SIZE, MEMBLOCK_LOW_LIMIT, + limit, NUMA_NO_NODE); + if (!shared_lppaca) + panic("cannot allocate shared data"); + + memblock_set_bottom_up(false); + uv_share_page(PHYS_PFN(__pa(shared_lppaca)), + shared_lppaca_total_size >> PAGE_SHIFT); + } + + ptr = shared_lppaca + shared_lppaca_size; + shared_lppaca_size += size; + + /* + * This is very early in boot, so no harm done if the kernel crashes at + * this point. + */ + BUG_ON(shared_lppaca_size >= shared_lppaca_total_size); + + return ptr; +} + /* * See asm/lppaca.h for more detail. * @@ -65,7 +104,7 @@ static inline void init_lppaca(struct lppaca *lppaca) *lppaca = (struct lppaca) { .desc = cpu_to_be32(0xd397d781), /* "LpPa" */ - .size = cpu_to_be16(0x400), + .size = cpu_to_be16(LPPACA_SIZE), .fpregs_in_use = 1, .slb_count = cpu_to_be16(64), .vmxregs_in_use = 0, @@ -75,19 +114,22 @@ static inline void init_lppaca(struct lppaca *lppaca) static struct lppaca * __init new_lppaca(int cpu, unsigned long limit) { struct lppaca *lp; - size_t size = 0x400; - BUILD_BUG_ON(size < sizeof(struct lppaca)); + BUILD_BUG_ON(sizeof(struct lppaca) > LPPACA_SIZE); if (early_cpu_has_feature(CPU_FTR_HVMODE)) return NULL; - lp = alloc_paca_data(size, 0x400, limit, cpu); + if (is_secure_guest()) + lp = alloc_shared_lppaca(LPPACA_SIZE, 0x400, limit, cpu); + else + lp = alloc_paca_data(LPPACA_SIZE, 0x400, limit, cpu); + init_lppaca(lp); return lp; } -#endif /* CONFIG_PPC_BOOK3S */ +#endif /* CONFIG_PPC_PSERIES */ #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index f627e15bb43c..1c448cf25506 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1379,10 +1379,6 @@ void __init pcibios_resource_survey(void) pr_debug("PCI: Assigning unassigned resources...\n"); pci_assign_unassigned_resources(); } - - /* Call machine dependent fixup */ - if (ppc_md.pcibios_fixup) - ppc_md.pcibios_fixup(); } /* This is used by the PCI hotplug driver to allocate resource diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c index 0b0cf8168b47..fc62c4bc47b1 100644 --- a/arch/powerpc/kernel/pci-hotplug.c +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -55,11 +55,18 @@ EXPORT_SYMBOL_GPL(pci_find_bus_by_node); void pcibios_release_device(struct pci_dev *dev) { struct pci_controller *phb = pci_bus_to_host(dev->bus); + struct pci_dn *pdn = pci_get_pdn(dev); eeh_remove_device(dev); if (phb->controller_ops.release_device) 
phb->controller_ops.release_device(dev); + + /* free()ing the pci_dn has been deferred to us, do it now */ + if (pdn && (pdn->flags & PCI_DN_FLAG_DEAD)) { + pci_dbg(dev, "freeing dead pdn\n"); + kfree(pdn); + } } /** diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index 50942a1d1a5f..b49e1060a3bf 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -263,6 +263,10 @@ static int __init pcibios_init(void) /* Call common code to handle resource allocation */ pcibios_resource_survey(); + /* Call machine dependent fixup */ + if (ppc_md.pcibios_fixup) + ppc_md.pcibios_fixup(); + /* Call machine dependent post-init code */ if (ppc_md.pcibios_after_init) ppc_md.pcibios_after_init(); diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index b7030b1189d0..f83d1f69b1dd 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -54,14 +54,20 @@ static int __init pcibios_init(void) pci_add_flags(PCI_ENABLE_PROC_DOMAINS | PCI_COMPAT_DOMAIN_0); /* Scan all of the recorded PCI controllers. */ - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) pcibios_scan_phb(hose); - pci_bus_add_devices(hose->bus); - } /* Call common code to handle resource allocation */ pcibios_resource_survey(); + /* Add devices. */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) + pci_bus_add_devices(hose->bus); + + /* Call machine dependent fixup */ + if (ppc_md.pcibios_fixup) + ppc_md.pcibios_fixup(); + printk(KERN_DEBUG "PCI: Probing PCI hardware done\n"); return 0; diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index c4c8c237a106..9524009ca1ae 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -323,6 +323,7 @@ void pci_remove_device_node_info(struct device_node *dn) { struct pci_dn *pdn = dn ? PCI_DN(dn) : NULL; struct device_node *parent; + struct pci_dev *pdev; #ifdef CONFIG_EEH struct eeh_dev *edev = pdn_to_eeh_dev(pdn); @@ -336,12 +337,28 @@ void pci_remove_device_node_info(struct device_node *dn) WARN_ON(!list_empty(&pdn->child_list)); list_del(&pdn->list); + /* Drop the parent pci_dn's ref to our backing dt node */ parent = of_get_parent(dn); if (parent) of_node_put(parent); - dn->data = NULL; - kfree(pdn); + /* + * At this point we *might* still have a pci_dev that was + * instantiated from this pci_dn. So defer free()ing it until + * the pci_dev's release function is called. + */ + pdev = pci_get_domain_bus_and_slot(pdn->phb->global_number, + pdn->busno, pdn->devfn); + if (pdev) { + /* NB: pdev has a ref to dn */ + pci_dbg(pdev, "marked pdn (from %pOF) as dead\n", dn); + pdn->flags |= PCI_DN_FLAG_DEAD; + } else { + dn->data = NULL; + kfree(pdn); + } + + pci_dev_put(pdev); } EXPORT_SYMBOL_GPL(pci_remove_device_node_info); diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c index 409c6c1beabf..f91d7e94872e 100644 --- a/arch/powerpc/kernel/pci_of_scan.c +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -34,31 +34,75 @@ static u32 get_int_prop(struct device_node *np, const char *name, u32 def) * pci_parse_of_flags - Parse the flags cell of a device tree PCI address * @addr0: value of 1st cell of a device tree PCI address. 
* @bridge: Set this flag if the address is from a bridge 'ranges' property + * + * PCI Bus Binding to IEEE Std 1275-1994 + * + * Bit# 33222222 22221111 11111100 00000000 + * 10987654 32109876 54321098 76543210 + * phys.hi cell: npt000ss bbbbbbbb dddddfff rrrrrrrr + * phys.mid cell: hhhhhhhh hhhhhhhh hhhhhhhh hhhhhhhh + * phys.lo cell: llllllll llllllll llllllll llllllll + * + * where: + * n is 0 if the address is relocatable, 1 otherwise + * p is 1 if the addressable region is "prefetchable", 0 otherwise + * t is 1 if the address is aliased (for non-relocatable I/O), + * below 1 MB (for Memory),or below 64 KB (for relocatable I/O). + * ss is the space code, denoting the address space: + * 00 denotes Configuration Space + * 01 denotes I/O Space + * 10 denotes 32-bit-address Memory Space + * 11 denotes 64-bit-address Memory Space + * bbbbbbbb is the 8-bit Bus Number + * ddddd is the 5-bit Device Number + * fff is the 3-bit Function Number + * rrrrrrrr is the 8-bit Register Number */ +#define OF_PCI_ADDR0_SPACE(ss) (((ss)&3)<<24) +#define OF_PCI_ADDR0_SPACE_CFG OF_PCI_ADDR0_SPACE(0) +#define OF_PCI_ADDR0_SPACE_IO OF_PCI_ADDR0_SPACE(1) +#define OF_PCI_ADDR0_SPACE_MMIO32 OF_PCI_ADDR0_SPACE(2) +#define OF_PCI_ADDR0_SPACE_MMIO64 OF_PCI_ADDR0_SPACE(3) +#define OF_PCI_ADDR0_SPACE_MASK OF_PCI_ADDR0_SPACE(3) +#define OF_PCI_ADDR0_RELOC (1UL<<31) +#define OF_PCI_ADDR0_PREFETCH (1UL<<30) +#define OF_PCI_ADDR0_ALIAS (1UL<<29) +#define OF_PCI_ADDR0_BUS 0x00FF0000UL +#define OF_PCI_ADDR0_DEV 0x0000F800UL +#define OF_PCI_ADDR0_FN 0x00000700UL +#define OF_PCI_ADDR0_BARREG 0x000000FFUL + unsigned int pci_parse_of_flags(u32 addr0, int bridge) { - unsigned int flags = 0; + unsigned int flags = 0, as = addr0 & OF_PCI_ADDR0_SPACE_MASK; - if (addr0 & 0x02000000) { + if (as == OF_PCI_ADDR0_SPACE_MMIO32 || as == OF_PCI_ADDR0_SPACE_MMIO64) { flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY; - flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64; - if (flags & PCI_BASE_ADDRESS_MEM_TYPE_64) - flags |= IORESOURCE_MEM_64; - flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M; - if (addr0 & 0x40000000) - flags |= IORESOURCE_PREFETCH - | PCI_BASE_ADDRESS_MEM_PREFETCH; + + if (as == OF_PCI_ADDR0_SPACE_MMIO64) + flags |= PCI_BASE_ADDRESS_MEM_TYPE_64 | IORESOURCE_MEM_64; + + if (addr0 & OF_PCI_ADDR0_ALIAS) + flags |= PCI_BASE_ADDRESS_MEM_TYPE_1M; + + if (addr0 & OF_PCI_ADDR0_PREFETCH) + flags |= IORESOURCE_PREFETCH | + PCI_BASE_ADDRESS_MEM_PREFETCH; + /* Note: We don't know whether the ROM has been left enabled * by the firmware or not. 
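/*
 * A worked example of the decoding above, using the masks just defined.
 * Take addr0 = 0x02010010 (an illustrative value, not from the patch):
 *
 *   ss  = (addr0 >> 24) & 3 = 2              -> 32-bit memory space
 *   n   = 0, p = 0, t = 0                    -> relocatable, not
 *                                               prefetchable, not aliased
 *   bus = (addr0 & OF_PCI_ADDR0_BUS) >> 16   = 0x01
 *   dev = 0, fn = 0                          (ddddd/fff fields)
 *   reg = addr0 & OF_PCI_ADDR0_BARREG        = 0x10 -> BAR0
 *
 * pci_parse_of_flags() therefore returns
 * IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY | IORESOURCE_SIZEALIGN:
 * no 64-bit or prefetch bits, and no IORESOURCE_READONLY since the
 * register is not PCI_ROM_ADDRESS (0x30).
 */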
We mark it as disabled (ie, we do * not set the IORESOURCE_ROM_ENABLE flag) for now rather than * do a config space read, it will be force-enabled if needed */ - if (!bridge && (addr0 & 0xff) == 0x30) + if (!bridge && (addr0 & OF_PCI_ADDR0_BARREG) == PCI_ROM_ADDRESS) flags |= IORESOURCE_READONLY; - } else if (addr0 & 0x01000000) + + } else if (as == OF_PCI_ADDR0_SPACE_IO) flags = IORESOURCE_IO | PCI_BASE_ADDRESS_SPACE_IO; + if (flags) flags |= IORESOURCE_SIZEALIGN; + return flags; } diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 7a84c9f1778e..639ceae7da9d 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1587,8 +1587,9 @@ static void setup_ksp_vsid(struct task_struct *p, unsigned long sp) /* * Copy architecture-specific thread state */ -int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long kthread_arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long usp, + unsigned long kthread_arg, struct task_struct *p, + unsigned long tls) { struct pt_regs *childregs, *kregs; extern void ret_from_fork(void); @@ -1629,10 +1630,10 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, if (clone_flags & CLONE_SETTLS) { #ifdef CONFIG_PPC64 if (!is_32bit_task()) - childregs->gpr[13] = childregs->gpr[6]; + childregs->gpr[13] = tls; else #endif - childregs->gpr[2] = childregs->gpr[6]; + childregs->gpr[2] = tls; } f = ret_from_fork; @@ -2033,10 +2034,8 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) int count = 0; int firstframe = 1; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - struct ftrace_ret_stack *ret_stack; - extern void return_to_handler(void); - unsigned long rth = (unsigned long)return_to_handler; - int curr_frame = 0; + unsigned long ret_addr; + int ftrace_idx = 0; #endif if (tsk == NULL) @@ -2065,15 +2064,10 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) if (!firstframe || ip != lr) { printk("["REG"] ["REG"] %pS", sp, ip, (void *)ip); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - if ((ip == rth) && curr_frame >= 0) { - ret_stack = ftrace_graph_get_ret_stack(current, - curr_frame++); - if (ret_stack) - pr_cont(" (%pS)", - (void *)ret_stack->ret); - else - curr_frame = -1; - } + ret_addr = ftrace_graph_ret_addr(current, + &ftrace_idx, ip, stack); + if (ret_addr != ip) + pr_cont(" (%pS)", (void *)ret_addr); #endif if (firstframe) pr_cont(" (unreliable)"); diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 7159e791a70d..6620f37abe73 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -55,6 +55,7 @@ #include <asm/firmware.h> #include <asm/dt_cpu_ftrs.h> #include <asm/drmem.h> +#include <asm/ultravisor.h> #include <mm/mmu_decl.h> @@ -702,9 +703,12 @@ void __init early_init_devtree(void *params) #ifdef CONFIG_PPC_POWERNV /* Some machines might need OPAL info for debugging, grab it now. 
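/*
 * The device-tree scans just below follow the flat-device-tree walker
 * convention used throughout early boot: of_scan_flat_dt() invokes the
 * callback for every node and stops as soon as it returns nonzero. A
 * minimal sketch of such a callback ("vendor,example" is a placeholder
 * node name, not from the patch):
 */
static int __init scan_for_example_node(unsigned long node, const char *uname,
					int depth, void *data)
{
	if (depth != 1 || strcmp(uname, "vendor,example") != 0)
		return 0;	/* not ours, keep walking */

	/* ...read properties of the match via of_get_flat_dt_prop()... */
	return 1;		/* found it, stop the walk */
}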
*/ of_scan_flat_dt(early_init_dt_scan_opal, NULL); + + /* Scan tree for ultravisor feature */ + of_scan_flat_dt(early_init_dt_scan_ultravisor, NULL); #endif -#ifdef CONFIG_FA_DUMP +#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP) /* scan tree to see if dump is active during last boot */ of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL); #endif @@ -731,7 +735,7 @@ void __init early_init_devtree(void *params) if (PHYSICAL_START > MEMORY_START) memblock_reserve(MEMORY_START, 0x8000); reserve_kdump_trampoline(); -#ifdef CONFIG_FA_DUMP +#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP) /* * If we fail to reserve memory for firmware-assisted dump then * fallback to kexec based kdump. diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 514707ef6779..a4e7762dd286 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -40,6 +40,7 @@ #include <asm/sections.h> #include <asm/machdep.h> #include <asm/asm-prototypes.h> +#include <asm/ultravisor-api.h> #include <linux/linux_logo.h> @@ -94,7 +95,7 @@ static int of_workarounds __prombss; #define PROM_BUG() do { \ prom_printf("kernel BUG at %s line 0x%x!\n", \ __FILE__, __LINE__); \ - __asm__ __volatile__(".long " BUG_ILLEGAL_INSTR); \ + __builtin_trap(); \ } while (0) #ifdef DEBUG_PROM @@ -171,6 +172,10 @@ static bool __prombss prom_radix_disable; static bool __prombss prom_xive_disable; #endif +#ifdef CONFIG_PPC_SVM +static bool __prombss prom_svm_enable; +#endif + struct platform_support { bool hash_mmu; bool radix_mmu; @@ -812,6 +817,17 @@ static void __init early_cmdline_parse(void) prom_debug("XIVE disabled from cmdline\n"); } #endif /* CONFIG_PPC_PSERIES */ + +#ifdef CONFIG_PPC_SVM + opt = prom_strstr(prom_cmd_line, "svm="); + if (opt) { + bool val; + + opt += sizeof("svm=") - 1; + if (!prom_strtobool(opt, &val)) + prom_svm_enable = val; + } +#endif /* CONFIG_PPC_SVM */ } #ifdef CONFIG_PPC_PSERIES @@ -1712,6 +1728,43 @@ static void __init prom_close_stdin(void) } } +#ifdef CONFIG_PPC_SVM +static int prom_rtas_hcall(uint64_t args) +{ + register uint64_t arg1 asm("r3") = H_RTAS; + register uint64_t arg2 asm("r4") = args; + + asm volatile("sc 1\n" : "=r" (arg1) : + "r" (arg1), + "r" (arg2) :); + return arg1; +} + +static struct rtas_args __prombss os_term_args; + +static void __init prom_rtas_os_term(char *str) +{ + phandle rtas_node; + __be32 val; + u32 token; + + prom_debug("%s: start...\n", __func__); + rtas_node = call_prom("finddevice", 1, 1, ADDR("/rtas")); + prom_debug("rtas_node: %x\n", rtas_node); + if (!PHANDLE_VALID(rtas_node)) + return; + + val = 0; + prom_getprop(rtas_node, "ibm,os-term", &val, sizeof(val)); + token = be32_to_cpu(val); + prom_debug("ibm,os-term: %x\n", token); + if (token == 0) + prom_panic("Could not get token for ibm,os-term\n"); + os_term_args.token = cpu_to_be32(token); + prom_rtas_hcall((uint64_t)&os_term_args); +} +#endif /* CONFIG_PPC_SVM */ + /* * Allocate room for and instantiate RTAS */ @@ -3168,6 +3221,46 @@ static void unreloc_toc(void) #endif #endif +#ifdef CONFIG_PPC_SVM +/* + * Perform the Enter Secure Mode ultracall. + */ +static int enter_secure_mode(unsigned long kbase, unsigned long fdt) +{ + register unsigned long r3 asm("r3") = UV_ESM; + register unsigned long r4 asm("r4") = kbase; + register unsigned long r5 asm("r5") = fdt; + + asm volatile("sc 2" : "+r"(r3) : "r"(r4), "r"(r5)); + + return r3; +} + +/* + * Call the Ultravisor to transfer us to secure memory if we have an ESM blob. 
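/*
 * The two wrappers above share one convention: the call number goes in r3,
 * arguments follow in r4 onwards, and the syscall level selects the
 * target ("sc 1" traps to the hypervisor, "sc 2" to the ultravisor). A
 * hedged, generic one-argument ultracall sketch; the name and shape are
 * illustrative, not part of the patch:
 */
static unsigned long ucall1(unsigned long nr, unsigned long arg)
{
	register unsigned long r3 asm("r3") = nr;
	register unsigned long r4 asm("r4") = arg;

	asm volatile("sc 2"	/* level 2: enter the ultravisor */
		     : "+r" (r3)
		     : "r" (r4)
		     : "memory");
	return r3;		/* status is returned in r3 */
}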
+ */ +static void setup_secure_guest(unsigned long kbase, unsigned long fdt) +{ + int ret; + + if (!prom_svm_enable) + return; + + /* Switch to secure mode. */ + prom_printf("Switching to secure mode.\n"); + + ret = enter_secure_mode(kbase, fdt); + if (ret != U_SUCCESS) { + prom_printf("Returned %d from switching to secure mode.\n", ret); + prom_rtas_os_term("Switch to secure mode failed.\n"); + } +} +#else +static void setup_secure_guest(unsigned long kbase, unsigned long fdt) +{ +} +#endif /* CONFIG_PPC_SVM */ + /* * We enter here early on, when the Open Firmware prom is still * handling exceptions and the MMU hash table for us. @@ -3366,6 +3459,9 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, unreloc_toc(); #endif + /* Move to secure memory if we're supposed to be secure guests. */ + setup_secure_guest(kbase, hdr); + __start(hdr, kbase, 0, 0, 0, 0, 0); return 0; diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 5faf0a64c92b..c5fa251b8950 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -16,6 +16,7 @@ #include <linux/capability.h> #include <linux/delay.h> #include <linux/cpu.h> +#include <linux/sched.h> #include <linux/smp.h> #include <linux/completion.h> #include <linux/cpumask.h> @@ -871,15 +872,17 @@ static int rtas_cpu_state_change_mask(enum rtas_cpu_state state, return 0; for_each_cpu(cpu, cpus) { + struct device *dev = get_cpu_device(cpu); + switch (state) { case DOWN: - cpuret = cpu_down(cpu); + cpuret = device_offline(dev); break; case UP: - cpuret = cpu_up(cpu); + cpuret = device_online(dev); break; } - if (cpuret) { + if (cpuret < 0) { pr_debug("%s: cpu_%s for cpu#%d returned %d.\n", __func__, ((state == UP) ? "up" : "down"), @@ -896,6 +899,7 @@ static int rtas_cpu_state_change_mask(enum rtas_cpu_state state, cpumask_clear_cpu(cpu, cpus); } } + cond_resched(); } return ret; @@ -922,13 +926,11 @@ int rtas_online_cpus_mask(cpumask_var_t cpus) return ret; } -EXPORT_SYMBOL(rtas_online_cpus_mask); int rtas_offline_cpus_mask(cpumask_var_t cpus) { return rtas_cpu_state_change_mask(DOWN, cpus); } -EXPORT_SYMBOL(rtas_offline_cpus_mask); int rtas_ibm_suspend_me(u64 handle) { @@ -968,6 +970,8 @@ int rtas_ibm_suspend_me(u64 handle) data.token = rtas_token("ibm,suspend-me"); data.complete = &done; + lock_device_hotplug(); + /* All present CPUs must be online */ cpumask_andnot(offline_mask, cpu_present_mask, cpu_online_mask); cpuret = rtas_online_cpus_mask(offline_mask); @@ -1006,6 +1010,7 @@ out_hotplug_enable: __func__); out: + unlock_device_hotplug(); free_cpumask_var(offline_mask); return atomic_read(&data.error); } diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index e1c9cf079503..7cfcb294b11c 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -28,7 +28,7 @@ static enum count_cache_flush_type count_cache_flush_type = COUNT_CACHE_FLUSH_NO bool barrier_nospec_enabled; static bool no_nospec; static bool btb_flush_enabled; -#ifdef CONFIG_PPC_FSL_BOOK3E +#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64) static bool no_spectrev2; #endif @@ -114,7 +114,7 @@ static __init int security_feature_debugfs_init(void) device_initcall(security_feature_debugfs_init); #endif /* CONFIG_DEBUG_FS */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64) static int __init handle_nospectre_v2(char *p) { no_spectrev2 = true; @@ -122,6 +122,9 @@ static int __init handle_nospectre_v2(char *p) return 0; } 
diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
index e1c9cf079503..7cfcb294b11c 100644
--- a/arch/powerpc/kernel/security.c
+++ b/arch/powerpc/kernel/security.c
@@ -28,7 +28,7 @@ static enum count_cache_flush_type count_cache_flush_type = COUNT_CACHE_FLUSH_NO
 bool barrier_nospec_enabled;
 static bool no_nospec;
 static bool btb_flush_enabled;
-#ifdef CONFIG_PPC_FSL_BOOK3E
+#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64)
 static bool no_spectrev2;
 #endif
 
@@ -114,7 +114,7 @@ static __init int security_feature_debugfs_init(void)
 device_initcall(security_feature_debugfs_init);
 #endif /* CONFIG_DEBUG_FS */
 
-#ifdef CONFIG_PPC_FSL_BOOK3E
+#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64)
 static int __init handle_nospectre_v2(char *p)
 {
 	no_spectrev2 = true;
@@ -122,6 +122,9 @@ static int __init handle_nospectre_v2(char *p)
 	return 0;
 }
 early_param("nospectre_v2", handle_nospectre_v2);
+#endif /* CONFIG_PPC_FSL_BOOK3E || CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
 void setup_spectre_v2(void)
 {
 	if (no_spectrev2 || cpu_mitigations_off())
@@ -399,7 +402,17 @@ static void toggle_count_cache_flush(bool enable)
 
 void setup_count_cache_flush(void)
 {
-	toggle_count_cache_flush(true);
+	bool enable = true;
+
+	if (no_spectrev2 || cpu_mitigations_off()) {
+		if (security_ftr_enabled(SEC_FTR_BCCTRL_SERIALISED) ||
+		    security_ftr_enabled(SEC_FTR_COUNT_CACHE_DISABLED))
+			pr_warn("Spectre v2 mitigations not under software control, can't disable\n");
+
+		enable = false;
+	}
+
+	toggle_count_cache_flush(enable);
 }
 
 #ifdef CONFIG_DEBUG_FS
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 5e6543aba1b3..25aaa3903000 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -800,9 +800,15 @@ static __init void print_system_info(void)
 	pr_info("mmu_features      = 0x%08x\n", cur_cpu_spec->mmu_features);
 #ifdef CONFIG_PPC64
 	pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features);
+#ifdef CONFIG_PPC_BOOK3S
+	pr_info("vmalloc start     = 0x%lx\n", KERN_VIRT_START);
+	pr_info("IO start          = 0x%lx\n", KERN_IO_START);
+	pr_info("vmemmap start     = 0x%lx\n", (unsigned long)vmemmap);
+#endif
 #endif
 
-	print_system_hash_info();
+	if (!early_radix_enabled())
+		print_system_hash_info();
 
 	if (PHYSICAL_START > 0)
 		pr_info("physical_start    = 0x%llx\n",
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 94517e4a2723..a7541edf0cdb 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -206,6 +206,6 @@ __init void initialize_cache_info(void)
 	dcache_bsize = cur_cpu_spec->dcache_bsize;
 	icache_bsize = cur_cpu_spec->icache_bsize;
 	ucache_bsize = 0;
-	if (cpu_has_feature(CPU_FTR_UNIFIED_ID_CACHE))
+	if (IS_ENABLED(CONFIG_PPC_BOOK3S_601) || IS_ENABLED(CONFIG_E200))
 		ucache_bsize = icache_bsize = dcache_bsize;
 }
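/*
 * A side note on the setup_32.c hunk above, sketched rather than quoted
 * from the patch: CPU_FTR_UNIFIED_ID_CACHE was a runtime feature bit, but
 * the CPUs with unified caches (601 and e200) are selected at build time,
 * so IS_ENABLED() can fold the test to a constant. IS_ENABLED(CONFIG_FOO)
 * expands to 1 when CONFIG_FOO is y or m and to 0 otherwise, letting the
 * compiler discard the dead branch entirely:
 */
#include <linux/kconfig.h>

static inline bool cache_is_unified(void)
{
	/* Constant-folded at compile time; no CPU feature check needed. */
	return IS_ENABLED(CONFIG_PPC_BOOK3S_601) || IS_ENABLED(CONFIG_E200);
}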
diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c
index 1e2276963f6d..e2a46cfed5fd 100644
--- a/arch/powerpc/kernel/stacktrace.c
+++ b/arch/powerpc/kernel/stacktrace.c
@@ -182,7 +182,7 @@ static int __save_stack_trace_tsk_reliable(struct task_struct *tsk,
 		 * FIXME: IMHO these tests do not belong in
 		 * arch-dependent code, they are generic.
 		 */
-		ip = ftrace_graph_ret_addr(tsk, &graph_idx, ip, NULL);
+		ip = ftrace_graph_ret_addr(tsk, &graph_idx, ip, stack);
 #ifdef CONFIG_KPROBES
 		/*
 		 * Mark stacktraces with kretprobed functions on them
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index e2147d7c9e72..80a676da11cb 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -19,6 +19,7 @@
 #include <asm/smp.h>
 #include <asm/pmc.h>
 #include <asm/firmware.h>
+#include <asm/svm.h>
 
 #include "cacheinfo.h"
 #include "setup.h"
@@ -715,6 +716,23 @@ static struct device_attribute pa6t_attrs[] = {
 #endif /* HAS_PPC_PMC_PA6T */
 #endif /* HAS_PPC_PMC_CLASSIC */
 
+#ifdef CONFIG_PPC_SVM
+static ssize_t show_svm(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", is_secure_guest());
+}
+static DEVICE_ATTR(svm, 0444, show_svm, NULL);
+
+static void create_svm_file(void)
+{
+	device_create_file(cpu_subsys.dev_root, &dev_attr_svm);
+}
+#else
+static void create_svm_file(void)
+{
+}
+#endif /* CONFIG_PPC_SVM */
+
 static int register_cpu_online(unsigned int cpu)
 {
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
@@ -1058,6 +1076,8 @@ static int __init topology_init(void)
 	sysfs_create_dscr_default();
 #endif /* CONFIG_PPC64 */
 
+	create_svm_file();
+
 	return 0;
 }
 subsys_initcall(topology_init);
diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
index be1ca98fce5c..7ea0ca044b65 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -944,7 +944,8 @@ int ftrace_disable_ftrace_graph_caller(void)
  * Hook the return address and push it in the stack of return addrs
  * in current thread info. Return the address we want to divert to.
  */
-unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip)
+unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip,
+				    unsigned long sp)
 {
 	unsigned long return_hooker;
 
@@ -956,7 +957,7 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip)
 
 	return_hooker = ppc_function_entry(return_to_handler);
 
-	if (!function_graph_enter(parent, ip, 0, NULL))
+	if (!function_graph_enter(parent, ip, 0, (unsigned long *)sp))
 		parent = return_hooker;
 out:
 	return parent;
diff --git a/arch/powerpc/kernel/trace/ftrace_32.S b/arch/powerpc/kernel/trace/ftrace_32.S
index 183f608efb81..e023ae59c429 100644
--- a/arch/powerpc/kernel/trace/ftrace_32.S
+++ b/arch/powerpc/kernel/trace/ftrace_32.S
@@ -50,6 +50,7 @@ _GLOBAL(ftrace_stub)
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 _GLOBAL(ftrace_graph_caller)
+	addi	r5, r1, 48
 	/* load r4 with local address */
 	lwz	r4, 44(r1)
 	subi	r4, r4, MCOUNT_INSN_SIZE
diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
index 74acbf16a666..f9fd5f743eba 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
+++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
@@ -294,6 +294,7 @@ _GLOBAL(ftrace_graph_caller)
 	std	r2, 24(r1)
 	ld	r2, PACATOC(r13)	/* get kernel TOC in r2 */
 
+	addi	r5, r1, 112
 	mfctr	r4		/* ftrace_caller has moved local addr here */
 	std	r4, 40(r1)
 	mflr	r3		/* ftrace_caller has restored LR from stack */
diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.S b/arch/powerpc/kernel/trace/ftrace_64_pg.S
index e41a7d13c99c..6708e24db0ab 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_pg.S
+++ b/arch/powerpc/kernel/trace/ftrace_64_pg.S
@@ -41,6 +41,7 @@ _GLOBAL(ftrace_stub)
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 _GLOBAL(ftrace_graph_caller)
+	addi	r5, r1, 112
 	/* load r4 with local address */
 	ld	r4, 128(r1)
 	subi	r4, r4, MCOUNT_INSN_SIZE
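/*
 * What the ftrace hunks above cooperate on, sketched for context: with
 * HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, the graph tracer records the stack
 * location (retp) where it replaced a return address with
 * return_to_handler. An unwinder can then hand ftrace_graph_ret_addr()
 * that same location and reliably get the original return address back,
 * even with several graph-traced frames of the same function on the
 * stack. The new r5 argument in the *.S stubs is exactly that stack
 * pointer, threaded through prepare_ftrace_return() into
 * function_graph_enter(). An unwinder step then looks roughly like this
 * (unwind_frame() is an illustrative name, not a kernel API):
 */
#include <linux/ftrace.h>
#include <linux/sched.h>

static void unwind_frame(struct task_struct *tsk, unsigned long ip,
			 unsigned long *stack)
{
	int graph_idx = 0;

	/* Map return_to_handler back to the real caller, if patched. */
	ip = ftrace_graph_ret_addr(tsk, &graph_idx, ip, stack);
	/* ... record ip, then walk to the next frame ... */
}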
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 11caa0291254..82f43535e686 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -472,6 +472,7 @@ void system_reset_exception(struct pt_regs *regs)
 	if (debugger(regs))
 		goto out;
 
+	kmsg_dump(KMSG_DUMP_OOPS);
 	/*
 	 * A system reset is a request to dump, so we always send
 	 * it through the crashdump code (if fadump or kdump are
diff --git a/arch/powerpc/kernel/ucall.S b/arch/powerpc/kernel/ucall.S
new file mode 100644
index 000000000000..07296bc39166
--- /dev/null
+++ b/arch/powerpc/kernel/ucall.S
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Generic code to perform an ultravisor call.
+ *
+ * Copyright 2019, IBM Corporation.
+ *
+ */
+#include <asm/ppc_asm.h>
+#include <asm/export.h>
+
+_GLOBAL(ucall_norets)
+EXPORT_SYMBOL_GPL(ucall_norets)
+	sc	2	/* Invoke the ultravisor */
+	blr		/* Return r3 = status */
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index d60598113a9f..eae9ddaecbcf 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -94,28 +94,6 @@ static struct vdso_patch_def vdso_patches[] = {
 		CPU_FTR_COHERENT_ICACHE, CPU_FTR_COHERENT_ICACHE,
 		"__kernel_sync_dicache", "__kernel_sync_dicache_p5"
 	},
-#ifdef CONFIG_PPC32
-	{
-		CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
-		"__kernel_gettimeofday", NULL
-	},
-	{
-		CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
-		"__kernel_clock_gettime", NULL
-	},
-	{
-		CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
-		"__kernel_clock_getres", NULL
-	},
-	{
-		CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
-		"__kernel_get_tbfreq", NULL
-	},
-	{
-		CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
-		"__kernel_time", NULL
-	},
-#endif
 };
 
 /*
diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S
index 6984125b9fc0..6c7401bd284e 100644
--- a/arch/powerpc/kernel/vdso32/datapage.S
+++ b/arch/powerpc/kernel/vdso32/datapage.S
@@ -70,6 +70,7 @@ V_FUNCTION_END(__kernel_get_syscall_map)
  *
  * returns the timebase frequency in HZ
  */
+#ifndef CONFIG_PPC_BOOK3S_601
 V_FUNCTION_BEGIN(__kernel_get_tbfreq)
   .cfi_startproc
 	mflr	r12
@@ -82,3 +83,4 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq)
 	blr
   .cfi_endproc
 V_FUNCTION_END(__kernel_get_tbfreq)
+#endif
diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S
index 099a6db14e67..00c025ba4a92 100644
--- a/arch/powerpc/kernel/vdso32/vdso32.lds.S
+++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S
@@ -144,10 +144,13 @@ VERSION
 		__kernel_datapage_offset;
 
 		__kernel_get_syscall_map;
+#ifndef CONFIG_PPC_BOOK3S_601
 		__kernel_gettimeofday;
 		__kernel_clock_gettime;
 		__kernel_clock_getres;
+		__kernel_time;
 		__kernel_get_tbfreq;
+#endif
 		__kernel_sync_dicache;
 		__kernel_sync_dicache_p5;
 		__kernel_sigtramp32;
@@ -155,7 +158,6 @@ VERSION
 #ifdef CONFIG_PPC64
 		__kernel_getcpu;
 #endif
-		__kernel_time;
 
 	local: *;
 	};
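/*
 * Usage sketch for the new ucall.S above, not taken from the patch:
 * ucall_norets() follows the same convention as the hcall entry points --
 * opcode in r3, arguments in the following registers, status back in r3 --
 * so C code can declare it variadic and wrap individual ultracalls in
 * small inline helpers, along the lines of the wrappers that live in
 * asm/ultravisor.h (the exact declaration site and wrapper set here are
 * illustrative):
 */
#include <asm/ultravisor-api.h>

long ucall_norets(unsigned long opcode, ...);

/* Share a range of guest pages with the hypervisor via the ultravisor. */
static inline long uv_share_page(u64 pfn, u64 npages)
{
	return ucall_norets(UV_SHARE_PAGE, pfn, npages);
}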